[
  {
    "path": ".gitignore",
    "content": "*.pyc\n"
  },
  {
    "path": "BoWV.py",
    "content": "from data_handler import get_data\nimport argparse\nimport sys\nimport numpy as np\nimport pdb\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score, cross_val_predict\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.utils import shuffle\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.model_selection import KFold\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.utils import shuffle\nimport codecs\nimport operator\nimport gensim, sklearn\nfrom collections import defaultdict\nfrom batch_gen import batch_gen\nfrom my_tokenizer import glove_tokenize\nfrom nltk.tokenize import TweetTokenizer\n\n\n### Preparing the text data\ntexts = []  # list of text samples\nlabels_index = {}  # dictionary mapping label name to numeric id\nlabels = []  # list of label ids\n\n\n# logistic, gradient_boosting, random_forest, svm_linear, svm_rbf\nGLOVE_MODEL_FILE = None\nEMBEDDING_DIM = None\nMODEL_TYPE = None\nCLASS_WEIGHT = None\nN_ESTIMATORS = None\nLOSS_FUN = None\nKERNEL = None\nTOKENIZER = None\n\nSEED=42\nMAX_NB_WORDS = None\nNO_OF_FOLDS=10\n\n\n# vocab generation\nvocab, reverse_vocab = {}, {}\nfreq = defaultdict(int)\ntweets = {}\n\nword2vec_model = None\n\n\ndef select_tweets_whose_embedding_exists():\n    # selects the tweets as in mean_glove_embedding method\n    # Processing\n    tweets = get_data()\n    X, Y = [], []\n    tweet_return = []\n    for tweet in tweets:\n        _emb = 0\n        words = TOKENIZER(tweet['text'].lower())\n        for w in words:\n            if w in word2vec_model:  # Check if embeeding there in GLove model\n                _emb+=1\n        if _emb:   # Not a blank tweet\n            tweet_return.append(tweet)\n    print 'Tweets selected:', len(tweet_return)\n    return tweet_return\n\n\ndef gen_data():\n    y_map = {\n            'none': 0,\n            'racism': 1,\n            'sexism': 2\n            }\n\n    X, y = [], []\n    for tweet in tweets:\n        words = glove_tokenize(tweet['text'].lower())\n        emb = np.zeros(EMBEDDING_DIM)\n        for word in words:\n            try:\n                emb += word2vec_model[word]\n            except:\n                pass\n        emb /= len(words)\n        X.append(emb)\n        y.append(y_map[tweet['label']])\n    return X, y\n\n    \ndef get_model(m_type=None):\n    if not m_type:\n        print \"ERROR: Please specify a model type!\"\n        return None\n    if m_type == 'logistic':\n        logreg = LogisticRegression()\n    elif m_type == \"gradient_boosting\":\n        logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)\n    elif m_type == \"random_forest\":\n        logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)\n    elif m_type == \"svm\":\n        logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)\n    elif m_type == \"svm_linear\":\n        logreg = LinearSVC(loss=LOSS_FUN, class_weight=CLASS_WEIGHT)\n    else:\n        print \"ERROR: Please specify a correct model\"\n        return None\n\n    return 
logreg\n\n\ndef classification_model(X, Y, model_type=None):\n    X, Y = shuffle(X, Y, random_state=SEED)\n    print \"Model Type:\", model_type\n\n    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)\n    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')\n    print \"Precision(avg): %0.3f (+/- %0.3f)\" % (scores1.mean(), scores1.std() * 2)\n\n    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')\n    print \"Recall(avg): %0.3f (+/- %0.3f)\" % (scores2.mean(), scores2.std() * 2)\n    \n    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')\n    print \"F1-score(avg): %0.3f (+/- %0.3f)\" % (scores3.mean(), scores3.std() * 2)\n\n\nif __name__ == \"__main__\":\n\n    parser = argparse.ArgumentParser(description='BagOfWords model for twitter Hate speech detection')\n    parser.add_argument('-m', '--model', choices=['logistic', 'gradient_boosting', 'random_forest', 'svm', 'svm_linear'], required=True)\n    parser.add_argument('-f', '--embeddingfile', required=True)\n    parser.add_argument('-d', '--dimension', required=True)\n    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)\n    parser.add_argument('-s', '--seed', default=SEED)\n    parser.add_argument('--folds', default=NO_OF_FOLDS)\n    parser.add_argument('--estimators', default=N_ESTIMATORS)\n    parser.add_argument('--loss', default=LOSS_FUN)\n    parser.add_argument('--kernel', default=KERNEL)\n    parser.add_argument('--class_weight')\n\n\n    args = parser.parse_args()\n    MODEL_TYPE = args.model\n    GLOVE_MODEL_FILE = args.embeddingfile\n    EMBEDDING_DIM = int(args.dimension)\n    SEED = int(args.seed)\n    NO_OF_FOLDS = int(args.folds)\n    CLASS_WEIGHT = args.class_weight\n    N_ESTIMATORS = int(args.estimators)\n    LOSS_FUN = args.loss\n    KERNEL = args.kernel\n    if args.tokenizer == \"glove\":\n        TOKENIZER = glove_tokenize\n    elif args.tokenizer == \"nltk\":\n        TOKENIZER = TweetTokenizer().tokenize\n\n    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)\n    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)\n    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)\n\n    #filter_vocab(20000)\n\n    tweets = select_tweets_whose_embedding_exists()\n    X, Y = gen_data()\n\n    classification_model(X, Y, MODEL_TYPE)\n"
  },
  {
    "path": "README.md",
    "content": "# Hate Speech Detection on Twitter\n\nImplementation of our paper titled - \"Deep Learning for Hate Speech Detection\" (to appear in WWW'17 proceedings). \n\n## Dataset\n\nDataset can be downloaded from [https://github.com/zeerakw/hatespeech](https://github.com/zeerakw/hatespeech). Contains tweet id's and corresponding annotations. \n\nTweets are labelled as either Racist, Sexist or Neither Racist or Sexist. \n\nUse your favourite tweet crawler and download the data and place the tweets in the folder 'tweet_data'.\n\n\n## Requirements\n* Keras \n* Tensorflow or Theano (we experimented with theano)\n* Gensim\n* xgboost\n* NLTK\n* Sklearn\n* Numpy\n\n## Instructions to run\n\nBefore running the model, make sure you have setup the input dataset in a folder named `tweet_data`.   \nTo run a model for training, use the following instructions mentioned below. Use appropriate parameter settings to test the variations of the models.\n\n\n### This script contains code for runnning NN_model + GDBT. \n\nSteps to run NN_model + GDBT\n * Run NN_model first (CNN/LSTM/Fast_text). It will create a model file\n * Change the name of the file at line 50 pointing to the model file\n * Run nn_classifier file as per instructions below\n\npython nn_classifier.py <GradientBoosting(xgboost) or Random Forest> \n\n\n- BagOfWords models - **BoWV.py[does not supports XGBOOST, supports sklearn's GBDT]**\n```\nusage: BoWV.py [-h] -m [Deprecated]\n               {logistic,gradient_boosting,random_forest,svm,svm_linear} -f\n               EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk} [-s SEED]\n               [--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]\n               [--kernel KERNEL] [--class_weight CLASS_WEIGHT]\n\nBagOfWords model for twitter Hate speech detection\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -m {logistic,gradient_boosting,random_forest,svm,svm_linear}, --model {logistic,gradient_boosting,random_forest,svm,svm_linear}\n  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE\n  -d DIMENSION, --dimension DIMENSION\n  --tokenizer {glove,nltk}\n  -s SEED, --seed SEED\n  --folds FOLDS\n  --estimators ESTIMATORS\n  --loss LOSS\n  --kernel KERNEL\n  --class_weight CLASS_WEIGHT\n```\n\n- TF-IDF based models - **tfidf.py**\n```\nusage: tfidf.py [-h] -m\n                {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}\n                --max_ngram MAX_NGRAM --tokenizer {glove,nltk} [-s SEED]\n                [--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]\n                [--kernel KERNEL] [--class_weight CLASS_WEIGHT]\n                [--use-inverse-doc-freq]\n\nTF-IDF model for twitter Hate speech detection\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -m {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}, --model {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}\n  --max_ngram MAX_NGRAM\n  --tokenizer {glove,nltk}\n  -s SEED, --seed SEED\n  --folds FOLDS\n  --estimators ESTIMATORS\n  --loss LOSS\n  --kernel KERNEL\n  --class_weight CLASS_WEIGHT\n  --use-inverse-doc-freq\n```\n\n- LSTM(RNN) based methods - **lstm.py**\n```\nusage: lstm.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}\n               --loss LOSS --optimizer OPTIMIZER --epochs EPOCHS --batch-size\n               BATCH_SIZE [-s SEED] [--folds FOLDS] [--kernel KERNEL]\n               [--class_weight CLASS_WEIGHT] 
--initialize-weights\n               {random,glove} [--learn-embeddings] [--scale-loss-function]\n\nLSTM based models for twitter Hate speech detection\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE\n  -d DIMENSION, --dimension DIMENSION\n  --tokenizer {glove,nltk}\n  --loss LOSS\n  --optimizer OPTIMIZER\n  --epochs EPOCHS\n  --batch-size BATCH_SIZE\n  -s SEED, --seed SEED\n  --folds FOLDS\n  --kernel KERNEL\n  --class_weight CLASS_WEIGHT\n  --initialize-weights {random,glove}\n  --learn-embeddings\n  --scale-loss-function\n```\n\n- CNN based models - **cnn.py**\n```\nusage: cnn.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}\n              --loss LOSS --optimizer OPTIMIZER --epochs EPOCHS --batch-size\n              BATCH_SIZE [-s SEED] [--folds FOLDS]\n              [--class_weight CLASS_WEIGHT] --initialize-weights\n              {random,glove} [--learn-embeddings] [--scale-loss-function]\n\nCNN based models for twitter Hate speech detection\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE\n  -d DIMENSION, --dimension DIMENSION\n  --tokenizer {glove,nltk}\n  --loss LOSS\n  --optimizer OPTIMIZER\n  --epochs EPOCHS\n  --batch-size BATCH_SIZE\n  -s SEED, --seed SEED\n  --folds FOLDS\n  --class_weight CLASS_WEIGHT\n  --initialize-weights {random,glove}\n  --learn-embeddings\n  --scale-loss-function\n```\n\n\n\n## Examples:\n```\npython BoWV.py --model logistic --seed 42 -f glove.twitter.27b.25d.txt -d 25 --folds 10 --tokenizer glove  \npython tfidf.py -m tfidf_svm_linear --max_ngram 3 --tokenizer glove --loss squared_hinge\npython lstm.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer glove --loss categorical_crossentropy --optimizer adam --initialize-weights random --learn-embeddings --epochs 10 --batch-size 512\npython cnn.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer nltk --loss categorical_crossentropy --optimizer adam --epochs 10 --batch-size 128 --initialize-weights random --scale-loss-function\n\n```\n"
  },
  {
    "path": "batch_gen.py",
    "content": "import numpy as np\nimport random\nimport pdb\nimport math\n\ndef batch_gen(X, batch_size):\n    n_batches = X.shape[0]/float(batch_size)\n    n_batches = int(math.ceil(n_batches))\n    end = int(X.shape[0]/float(batch_size)) * batch_size\n    n = 0\n    for i in xrange(0,n_batches):\n        if i < n_batches - 1: \n            batch = X[i*batch_size:(i+1) * batch_size, :]\n            yield batch\n        \n        else:\n            batch = X[end: , :]\n            n += X[end:, :].shape[0]\n            yield batch\n        \n\n\nif __name__ == \"__main__\":\n    X = np.random.rand(123, 32)\n    for batch in batch_gen(X, 21):\n        print batch.shape\n"
  },
  {
    "path": "cnn.py",
    "content": "from data_handler import get_data\nimport argparse\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.layers import Embedding, Input, LSTM\nfrom keras.models import Sequential, Model\nfrom keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D\nimport numpy as np\nimport pdb\nfrom nltk import tokenize\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom gensim.parsing.preprocessing import STOPWORDS\nfrom sklearn.model_selection import KFold\nfrom keras.utils import np_utils\nfrom string import punctuation\nimport codecs\nimport operator\nimport gensim, sklearn\nfrom collections import defaultdict\nfrom batch_gen import batch_gen\nimport sys\n\nfrom nltk import tokenize as tokenize_nltk\nfrom my_tokenizer import glove_tokenize\n\n\n### Preparing the text data\ntexts = []  # list of text samples\nlabels_index = {}  # dictionary mapping label name to numeric id\nlabels = []  # list of label ids\n\n# vocab generation\nvocab, reverse_vocab = {}, {}\nfreq = defaultdict(int)\ntweets = {}\n\n\n\nEMBEDDING_DIM = None\nGLOVE_MODEL_FILE = None\nNO_OF_CLASSES=3\n\nSEED = 42\nNO_OF_FOLDS = 10\nCLASS_WEIGHT = None\nLOSS_FUN = None\nOPTIMIZER = None\nTOKENIZER = None\nINITIALIZE_WEIGHTS_WITH = None\nLEARN_EMBEDDINGS = None\nEPOCHS = 10\nBATCH_SIZE = 128\nSCALE_LOSS_FUN = None\n\n\nword2vec_model = None\n\n\n\ndef get_embedding(word):\n    #return\n    try:\n        return word2vec_model[word]\n    except Exception, e:\n        print 'Encoding not found: %s' %(word)\n        return np.zeros(EMBEDDING_DIM)\n\ndef get_embedding_weights():\n    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))\n    n = 0\n    for k, v in vocab.iteritems():\n        try:\n            embedding[v] = word2vec_model[k]\n        except:\n            n += 1\n            pass\n    print \"%d embedding missed\"%n\n    #pdb.set_trace()\n    return embedding\n\n\ndef select_tweets():\n    # selects the tweets as in mean_glove_embedding method\n    # Processing\n    tweets = get_data()\n    X, Y = [], []\n    tweet_return = []\n    for tweet in tweets:\n        _emb = 0\n        words = TOKENIZER(tweet['text'].lower())\n        for w in words:\n            if w in word2vec_model:  # Check if embeeding there in GLove model\n                _emb+=1\n        if _emb:   # Not a blank tweet\n            tweet_return.append(tweet)\n    print 'Tweets selected:', len(tweet_return)\n    return tweet_return\n\n\ndef gen_vocab():\n    # Processing\n    vocab_index = 1\n    for tweet in tweets:\n        text = TOKENIZER(tweet['text'].lower())\n        text = ''.join([c for c in text if c not in punctuation])\n        words = text.split()\n        words = [word for word in words if word not in STOPWORDS]\n\n        for word in words:\n            if word not in vocab:\n                vocab[word] = vocab_index\n                reverse_vocab[vocab_index] = word       # generate reverse vocab as well\n                vocab_index += 1\n            freq[word] += 1\n    vocab['UNK'] = len(vocab) + 1\n    reverse_vocab[len(vocab)] = 'UNK'\n\n\ndef filter_vocab(k):\n    global freq, vocab\n    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1))\n    tokens = freq_sorted[:k]\n    vocab = dict(zip(tokens, range(1, len(tokens) + 1)))\n    vocab['UNK'] 
= len(vocab) + 1\n\n\ndef gen_sequence():\n    y_map = {\n            'none': 0,\n            'racism': 1,\n            'sexism': 2\n            }\n\n    X, y = [], []\n    for tweet in tweets:\n        text = TOKENIZER(tweet['text'].lower())\n        text = ''.join([c for c in text if c not in punctuation])\n        words = text.split()\n        words = [word for word in words if word not in STOPWORDS]\n        seq, _emb = [], []\n        for word in words:\n            seq.append(vocab.get(word, vocab['UNK']))\n        X.append(seq)\n        y.append(y_map[tweet['label']])\n    return X, y\n\n\ndef shuffle_weights(model):\n    weights = model.get_weights()\n    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]\n    model.set_weights(weights)\n\n\ndef cnn_model(sequence_length, embedding_dim):\n    model_variation = 'CNN-rand'  #  CNN-rand | CNN-non-static | CNN-static\n    print('Model variation is %s' % model_variation)\n\n    # Model Hyperparameters\n    n_classes = NO_OF_CLASSES\n    embedding_dim = EMBEDDING_DIM\n    filter_sizes = (3, 4, 5)\n    num_filters = 100\n    dropout_prob = (0.25, 0.5)\n    hidden_dims = 100\n\n    # Training parameters\n    # Word2Vec parameters, see train_word2vec\n    #min_word_count = 1  # Minimum word count\n    #context = 10        # Context window size\n\n    graph_in = Input(shape=(sequence_length, embedding_dim))\n    convs = []\n    for fsz in filter_sizes:\n        conv = Convolution1D(nb_filter=num_filters,\n                             filter_length=fsz,\n                             border_mode='valid',\n                             activation='relu')(graph_in)\n                             #,subsample_length=1)(graph_in)\n        pool = GlobalMaxPooling1D()(conv)\n        #flatten = Flatten()(pool)\n        convs.append(pool)\n\n    if len(filter_sizes)>1:\n        out = Merge(mode='concat')(convs)\n    else:\n        out = convs[0]\n\n    graph = Model(input=graph_in, output=out)\n\n    # main sequential model\n    model = Sequential()\n    #if not model_variation=='CNN-rand':\n    model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))\n    model.add(Dropout(dropout_prob[0]))#, input_shape=(sequence_length, embedding_dim)))\n    model.add(graph)\n    model.add(Dropout(dropout_prob[1]))\n    model.add(Activation('relu'))\n    model.add(Dense(n_classes))\n    model.add(Activation('softmax'))\n    model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])\n    print model.summary()\n    return model\n\n\ndef train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):\n    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)\n    print cv_object\n    p, r, f1 = 0., 0., 0.\n    p1, r1, f11 = 0., 0., 0.\n    sentence_len = X.shape[1]\n    for train_index, test_index in cv_object.split(X):\n        if INITIALIZE_WEIGHTS_WITH == \"glove\":\n            model.layers[0].set_weights([weights])\n        elif INITIALIZE_WEIGHTS_WITH == \"random\":\n            shuffle_weights(model)\n        else:\n            print \"ERROR!\"\n            return\n\n        X_train, y_train = X[train_index], y[train_index]\n        X_test, y_test = X[test_index], y[test_index]\n        y_train = y_train.reshape((len(y_train), 1))\n        X_temp = np.hstack((X_train, y_train))\n        for epoch in xrange(epochs):\n            for X_batch in batch_gen(X_temp, batch_size):\n                x = X_batch[:, :sentence_len]\n                y_temp = 
X_batch[:, sentence_len]\n\n                class_weights = None\n                if SCALE_LOSS_FUN:\n                    class_weights = {}\n                    class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))\n                    class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))\n                    class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))\n\n                try:\n                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)\n                except Exception as e:\n                    print e\n                    print y_temp\n                print x.shape, y.shape\n                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)\n                print loss, acc\n        y_pred = model.predict_on_batch(X_test)\n        y_pred = np.argmax(y_pred, axis=1)\n        print classification_report(y_test, y_pred)\n        print precision_recall_fscore_support(y_test, y_pred)\n        print y_pred\n        p += precision_score(y_test, y_pred, average='weighted')\n        p1 += precision_score(y_test, y_pred, average='micro')\n        r += recall_score(y_test, y_pred, average='weighted')\n        r1 += recall_score(y_test, y_pred, average='micro')\n        f1 += f1_score(y_test, y_pred, average='weighted')\n        f11 += f1_score(y_test, y_pred, average='micro')\n\n    print \"macro results are\"\n    print \"average precision is %f\" %(p/NO_OF_FOLDS)\n    print \"average recall is %f\" %(r/NO_OF_FOLDS)\n    print \"average f1 is %f\" %(f1/NO_OF_FOLDS)\n\n    print \"micro results are\"\n    print \"average precision is %f\" %(p1/NO_OF_FOLDS)\n    print \"average recall is %f\" %(r1/NO_OF_FOLDS)\n    print \"average f1 is %f\" %(f11/NO_OF_FOLDS)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(description='CNN based models for twitter Hate speech detection')\n    parser.add_argument('-f', '--embeddingfile', required=True)\n    parser.add_argument('-d', '--dimension', required=True)\n    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)\n    parser.add_argument('--loss', default=LOSS_FUN, required=True)\n    parser.add_argument('--optimizer', default=OPTIMIZER, required=True)\n    parser.add_argument('--epochs', default=EPOCHS, required=True)\n    parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)\n    parser.add_argument('-s', '--seed', default=SEED)\n    parser.add_argument('--folds', default=NO_OF_FOLDS)\n    parser.add_argument('--class_weight')\n    parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)\n    parser.add_argument('--learn-embeddings', action='store_true', default=False)\n    parser.add_argument('--scale-loss-function', action='store_true', default=False)\n    args = parser.parse_args()\n\n    GLOVE_MODEL_FILE = args.embeddingfile\n    EMBEDDING_DIM = int(args.dimension)\n    SEED = int(args.seed)\n    NO_OF_FOLDS = int(args.folds)\n    CLASS_WEIGHT = args.class_weight\n    LOSS_FUN = args.loss\n    OPTIMIZER = args.optimizer\n    if args.tokenizer == \"glove\":\n        TOKENIZER = glove_tokenize\n    elif args.tokenizer == \"nltk\":\n        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize\n    INITIALIZE_WEIGHTS_WITH = args.initialize_weights\n    LEARN_EMBEDDINGS = args.learn_embeddings\n    EPOCHS = int(args.epochs)\n    BATCH_SIZE = int(args.batch_size)\n    SCALE_LOSS_FUN = args.scale_loss_function\n\n\n\n    print 
'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)\n    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)\n    print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))\n\n    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)\n    np.random.seed(SEED)\n\n\n    Tweets = select_tweets()\n    tweets = Tweets\n    gen_vocab()\n    #filter_vocab(20000)\n    X, y = gen_sequence()\n    #Y = y.reshape((len(y), 1))\n    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))\n    print \"max seq length is %d\"%(MAX_SEQUENCE_LENGTH)\n    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)\n    y = np.array(y)\n    data, y = sklearn.utils.shuffle(data, y)\n    W = get_embedding_weights()\n    model = cnn_model(data.shape[1], EMBEDDING_DIM)\n    train_CNN(data, y, EMBEDDING_DIM, model, W)\n\n    pdb.set_trace()\n\n\n"
  },
  {
    "path": "data_handler.py",
    "content": "import json\nimport pdb\nimport codecs\nimport pdb\n\ndef get_data():\n    tweets = []\n    files = ['racism.json', 'neither.json', 'sexism.json']\n    for file in files:\n        with codecs.open('./tweet_data/' + file, 'r', encoding='utf-8') as f:\n            data = f.readlines()\n        for line in data:\n            tweet_full = json.loads(line)\n            tweets.append({\n                'id': tweet_full['id'],\n                'text': tweet_full['text'].lower(),\n                'label': tweet_full['Annotation'],\n                'name': tweet_full['user']['name'].split()[0]\n                })\n\n    #pdb.set_trace()\n    return tweets\n\n\nif __name__==\"__main__\":\n    tweets = get_data()\n    males, females = {}, {}\n    with open('./tweet_data/males.txt') as f:\n        males = set([w.strip() for w in f.readlines()])\n    with open('./tweet_data/females.txt') as f:\n        females = set([w.strip() for w in f.readlines()])\n\n    males_c, females_c, not_found = 0, 0, 0\n    for t in tweets:\n        if t['name'] in males:\n            males_c += 1\n        elif t['name'] in females:\n            females_c += 1\n        else:\n            not_found += 1\n    print males_c, females_c, not_found\n    pdb.set_trace()\n"
  },
  {
    "path": "fast_text.py",
    "content": "from data_handler import get_data\nfrom keras.preprocessing.text import Tokenizer\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.layers import Embedding, Input, LSTM\nfrom keras.models import Sequential, Model\nfrom keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D\nimport numpy as np\nfrom preprocess_twitter import tokenize as tokenizer_g\nimport pdb\nfrom nltk import tokenize\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom gensim.parsing.preprocessing import STOPWORDS\nfrom sklearn.model_selection import KFold\nfrom keras.utils import np_utils\nimport codecs\nimport operator\nimport gensim, sklearn\nfrom collections import defaultdict\nfrom batch_gen import batch_gen\nfrom string import punctuation\nfrom get_similar_words import get_similar_words\nimport sys\n\n### Preparing the text data\ntexts = []  # list of text samples\nlabels_index = {}  # dictionary mapping label name to numeric id\nlabels = []  # list of label ids\nlabel_map = {\n        'none': 0,\n        'racism': 1,\n        'sexism': 2\n    }\ntweet_data = get_data()\nfor tweet in tweet_data:\n    texts.append(tweet['text'])\n    labels.append(label_map[tweet['label']])\nprint('Found %s texts. (samples)' % len(texts))\n\nEMBEDDING_DIM = int(sys.argv[1])\nnp.random.seed(42)\n# Load the orginal glove file\n# SHASHANK files\n#GLOVE_MODEL_FILE=\"/home/shashank/DL_NLP/glove-twitter\" + str(EMBEDDING_DIM) + \"-w2v\"\n\n\n# PINKESH files\nGLOVE_MODEL_FILE=\"/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.\" + str(EMBEDDING_DIM) + \"d.txt\"\nNO_OF_CLASSES=3\n\nMAX_NB_WORDS = None\nVALIDATION_SPLIT = 0.2\nword2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)\n\n\n# vocab generation\nMyTokenizer = tokenize.casual.TweetTokenizer(strip_handles=True, reduce_len=True)\nvocab, reverse_vocab = {}, {}\nfreq = defaultdict(int)\ntweets = {}\n\n\ndef get_embedding(word):\n    #return\n    try:\n        return word2vec_model[word]\n    except Exception, e:\n        print 'Encoding not found: %s' %(word)\n        return np.zeros(EMBEDDING_DIM)\n\ndef get_embedding_weights():\n    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))\n    n = 0\n    for k, v in vocab.iteritems():\n    \ttry:\n    \t\tembedding[v] = word2vec_model[k]\n    \texcept:\n    \t\tn += 1\n    \t\tpass\n    print \"%d embedding missed\"%n\n    #pdb.set_trace()\n    return embedding\n\n\ndef select_tweets():\n    # selects the tweets as in mean_glove_embedding method\n    # Processing\n    tweets = get_data()\n    X, Y = [], []\n    tweet_return = []\n    for tweet in tweets:\n        _emb = 0\n        words = Tokenize(tweet['text']).split()\n        for w in words:\n            if w in word2vec_model:  # Check if embeeding there in GLove model\n                _emb+=1\n        if _emb:   # Not a blank tweet\n            tweet_return.append(tweet)\n    print 'Tweets selected:', len(tweet_return)\n    #pdb.set_trace()\n    return tweet_return\n\n\ndef gen_vocab():\n    # Processing\n    vocab_index = 1\n    for tweet in tweets:\n        text = Tokenize(tweet['text'])\n        text = ''.join([c for c in text if c not in punctuation])\n        words = text.split()\n        words = [word for word in words if 
word not in STOPWORDS]\n\n        for word in words:\n            if word not in vocab:\n                vocab[word] = vocab_index\n                reverse_vocab[vocab_index] = word       # generate reverse vocab as well\n                vocab_index += 1\n            freq[word] += 1\n    vocab['UNK'] = len(vocab) + 1\n    reverse_vocab[len(vocab)] = 'UNK'\n    #pdb.set_trace()\n\n\ndef filter_vocab(k):\n    global freq, vocab\n    #pdb.set_trace()\n    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1))\n    tokens = freq_sorted[:k]\n    vocab = dict(zip(tokens, range(1, len(tokens) + 1)))\n    vocab['UNK'] = len(vocab) + 1\n\n\ndef gen_sequence():\n    y_map = {\n            'none': 0,\n            'racism': 1,\n            'sexism': 2\n            }\n\n    X, y = [], []\n    for tweet in tweets:\n        text = Tokenize(tweet['text'])\n        text = ''.join([c for c in text if c not in punctuation])\n        words = text.split()\n        words = [word for word in words if word not in STOPWORDS]\n        seq, _emb = [], []\n        for word in words:\n            seq.append(vocab.get(word, vocab['UNK']))\n        X.append(seq)\n        y.append(y_map[tweet['label']])\n    return X, y\n\n\ndef Tokenize(tweet):\n    #return MyTokenizer.tokenize(tweet)\n    #pdb.set_trace()\n    return tokenizer_g(tweet)\n\n\ndef shuffle_weights(model):\n    weights = model.get_weights()\n    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]\n    model.set_weights(weights)\n\n\ndef fast_text_model(sequence_length):\n    model = Sequential()\n    model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length))\n    #model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length, trainable=False))\n    model.add(Dropout(0.5))\n    model.add(GlobalAveragePooling1D())\n    model.add(Dense(3, activation='softmax'))\n    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])\n    print model.summary()\n    return model\n\ndef train_fasttext(X, y, model, inp_dim,embedding_weights, epochs=10, batch_size=128):\n    cv_object = KFold(n_splits=10, shuffle=True, random_state=42)\n    print cv_object\n    p, r, f1 = 0., 0., 0.\n    p1, r1, f11 = 0., 0., 0.\n    sentence_len = X.shape[1]\n    lookup_table = np.zeros_like(model.layers[0].get_weights()[0])\n    for train_index, test_index in cv_object.split(X):\n        shuffle_weights(model)\n        #pdb.set_trace()\n        #model.layers[0].set_weights([embedding_weights])\n        X_train, y_train = X[train_index], y[train_index]\n        X_test, y_test = X[test_index], y[test_index]\n        y_train = y_train.reshape((len(y_train), 1))\n        X_temp = np.hstack((X_train, y_train))\n        for epoch in xrange(epochs):\n            for X_batch in batch_gen(X_temp, batch_size):\n                x = X_batch[:, :sentence_len]\n                y_temp = X_batch[:, sentence_len]\n\t\tclass_weights = {}\n\t\tclass_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))\n\t\tclass_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))\n\t\tclass_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))\n                try:\n                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)\n                except Exception as e:\n                    print e\n                #print x.shape, y.shape\n                loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights)\n                print loss, acc\n        
#pdb.set_trace()\n        lookup_table += model.layers[0].get_weights()[0]\n        y_pred = model.predict_on_batch(X_test)\n        y_pred = np.argmax(y_pred, axis=1)\n        print classification_report(y_test, y_pred)\n        print precision_recall_fscore_support(y_test, y_pred)\n        print y_pred\n        p += precision_score(y_test, y_pred, average='weighted')\n        p1 += precision_score(y_test, y_pred, average='micro')\n        r += recall_score(y_test, y_pred, average='weighted')\n        r1 += recall_score(y_test, y_pred, average='micro')\n        f1 += f1_score(y_test, y_pred, average='weighted')\n        f11 += f1_score(y_test, y_pred, average='micro')\n\n    print \"macro results are\"\n    print \"average precision is %f\" %(p/10)\n    print \"average recall is %f\" %(r/10)\n    print \"average f1 is %f\" %(f1/10)\n\n    print \"micro results are\"\n    print \"average precision is %f\" %(p1/10)\n    print \"average recall is %f\" %(r1/10)\n    print \"average f1 is %f\" %(f11/10)\n    return lookup_table/float(10)\n\n\ndef check_semantic_sim(embedding_table, word):\n    reverse_vocab = {v:k for k,v in vocab.iteritems()}\n    sim_word_idx = get_similar_words(embedding_table, embedding_table[vocab[word]], 25)\n    sim_words = map(lambda x:reverse_vocab[x[1]], sim_word_idx)\n    print sim_words\n\ndef tryWord(embedding_table):\n    while True:\n        print \"enter word\"\n        word = raw_input()\n        if word == \"pdb\":\n            pdb.set_trace()\n        elif word == 'exit':\n            return\n        else:\n            check_semantic_sim(embedding_table, word)\n\n\nif __name__ == \"__main__\":\n\n    Tweets = select_tweets()\n    tweets = Tweets\n    gen_vocab()\n    X, y = gen_sequence()\n    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))\n    print \"max seq length is %d\"%(MAX_SEQUENCE_LENGTH)\n    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)\n    y = np.array(y)\n    W = get_embedding_weights()\n    data, y = sklearn.utils.shuffle(data, y)\n    model = fast_text_model(data.shape[1])\n    _ = train_fasttext(data, y, model, EMBEDDING_DIM, W)\n    table = model.layers[0].get_weights()[0]\n    #check_semantic_sim(table)\n    tryWord(table)\n    pdb.set_trace()\n\n\n"
  },
  {
    "path": "get_similar_words.py",
    "content": "from sklearn.metrics.pairwise import cosine_similarity\nimport numpy as np\nimport pdb\n\ndef get_similar_words(X, vec, K=1):\n    # X: (n_samples, n_features)\n    # vec: (1, n_features)\n    # returns: K top most similar words with score values and their indexes\n    scores = cosine_similarity(X, vec)\n    scores = sorted([(val, index) for index, val in enumerate(scores.reshape((1,scores.shape[0]))[0])], reverse=True)    \n    scores = scores[1:K]\n    return scores\n\n"
  },
  {
    "path": "lstm.py",
    "content": "from data_handler import get_data\nimport argparse\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.layers import Embedding, Input, LSTM\nfrom keras.models import Sequential, Model\nfrom keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D\nimport numpy as np\nimport pdb\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom gensim.parsing.preprocessing import STOPWORDS\nfrom sklearn.model_selection import KFold\nfrom keras.utils import np_utils\nimport codecs\nimport operator\nimport gensim, sklearn\nfrom string import punctuation\nfrom collections import defaultdict\nfrom batch_gen import batch_gen\nimport sys\n\nfrom nltk import tokenize as tokenize_nltk\nfrom my_tokenizer import glove_tokenize\n\n\n\n### Preparing the text data\ntexts = []  # list of text samples\nlabels_index = {}  # dictionary mapping label name to numeric id\nlabels = []  # list of label ids\n\n# vocab generation\nvocab, reverse_vocab = {}, {}\nfreq = defaultdict(int)\ntweets = {}\n\n\n\nEMBEDDING_DIM = None\nGLOVE_MODEL_FILE = None\nSEED = 42\nNO_OF_FOLDS = 10\nCLASS_WEIGHT = None\nLOSS_FUN = None\nOPTIMIZER = None\nKERNEL = None\nTOKENIZER = None\nMAX_SEQUENCE_LENGTH = None\nINITIALIZE_WEIGHTS_WITH = None\nLEARN_EMBEDDINGS = None\nEPOCHS = 10\nBATCH_SIZE = 512\nSCALE_LOSS_FUN = None\n\nword2vec_model = None\n\n\n\ndef get_embedding(word):\n    #return\n    try:\n        return word2vec_model[word]\n    except Exception, e:\n        print 'Encoding not found: %s' %(word)\n        return np.zeros(EMBEDDING_DIM)\n\ndef get_embedding_weights():\n    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))\n    n = 0\n    for k, v in vocab.iteritems():\n        try:\n            embedding[v] = word2vec_model[k]\n        except:\n            n += 1\n            pass\n    print \"%d embedding missed\"%n\n    return embedding\n\n\ndef select_tweets():\n    # selects the tweets as in mean_glove_embedding method\n    # Processing\n    tweets = get_data()\n    X, Y = [], []\n    tweet_return = []\n    for tweet in tweets:\n        _emb = 0\n        words = TOKENIZER(tweet['text'].lower())\n        for w in words:\n            if w in word2vec_model:  # Check if embeeding there in GLove model\n                _emb+=1\n        if _emb:   # Not a blank tweet\n            tweet_return.append(tweet)\n    print 'Tweets selected:', len(tweet_return)\n    #pdb.set_trace()\n    return tweet_return\n\n\ndef gen_vocab():\n    # Processing\n    vocab_index = 1\n    for tweet in tweets:\n        text = TOKENIZER(tweet['text'].lower())\n        text = ''.join([c for c in text if c not in punctuation])\n        words = text.split()\n        words = [word for word in words if word not in STOPWORDS]\n\n        for word in words:\n            if word not in vocab:\n                vocab[word] = vocab_index\n                reverse_vocab[vocab_index] = word       # generate reverse vocab as well\n                vocab_index += 1\n            freq[word] += 1\n    vocab['UNK'] = len(vocab) + 1\n    reverse_vocab[len(vocab)] = 'UNK'\n\n\ndef filter_vocab(k):\n    global freq, vocab\n    pdb.set_trace()\n    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1))\n    tokens = freq_sorted[:k]\n    vocab = dict(zip(tokens, range(1, len(tokens) + 
1)))\n    vocab['UNK'] = len(vocab) + 1\n\n\ndef gen_sequence():\n    y_map = {\n            'none': 0,\n            'racism': 1,\n            'sexism': 2\n            }\n\n    X, y = [], []\n    for tweet in tweets:\n        text = TOKENIZER(tweet['text'].lower())\n        text = ''.join([c for c in text if c not in punctuation])\n        words = text.split()\n        words = [word for word in words if word not in STOPWORDS]\n        seq, _emb = [], []\n        for word in words:\n            seq.append(vocab.get(word, vocab['UNK']))\n        X.append(seq)\n        y.append(y_map[tweet['label']])\n    return X, y\n\n\ndef shuffle_weights(model):\n    weights = model.get_weights()\n    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]\n    model.set_weights(weights)\n\ndef lstm_model(sequence_length, embedding_dim):\n    model_variation = 'LSTM'\n    print('Model variation is %s' % model_variation)\n    model = Sequential()\n    model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))\n    model.add(Dropout(0.25))#, input_shape=(sequence_length, embedding_dim)))\n    model.add(LSTM(50))\n    model.add(Dropout(0.5))\n    model.add(Dense(3))\n    model.add(Activation('softmax'))\n    model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])\n    print model.summary()\n    return model\n\n\ndef train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):\n    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)\n    print cv_object\n    p, r, f1 = 0., 0., 0.\n    p1, r1, f11 = 0., 0., 0.\n    sentence_len = X.shape[1]\n    for train_index, test_index in cv_object.split(X):\n        if INITIALIZE_WEIGHTS_WITH == \"glove\":\n            model.layers[0].set_weights([weights])\n        elif INITIALIZE_WEIGHTS_WITH == \"random\":\n            shuffle_weights(model)\n        else:\n            print \"ERROR!\"\n            return\n        X_train, y_train = X[train_index], y[train_index]\n        X_test, y_test = X[test_index], y[test_index]\n        y_train = y_train.reshape((len(y_train), 1))\n        X_temp = np.hstack((X_train, y_train))\n        for epoch in xrange(epochs):\n            for X_batch in batch_gen(X_temp, batch_size):\n                x = X_batch[:, :sentence_len]\n                y_temp = X_batch[:, sentence_len]\n\n                class_weights = None\n                if SCALE_LOSS_FUN:\n                    class_weights = {}\n                    class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))\n                    class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))\n                    class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))\n\n                try:\n                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)\n                except Exception as e:\n                    print e\n                    print y_temp\n                print x.shape, y.shape\n                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)\n                print loss, acc\n\n        y_pred = model.predict_on_batch(X_test)\n        y_pred = np.argmax(y_pred, axis=1)\n        print classification_report(y_test, y_pred)\n        print precision_recall_fscore_support(y_test, y_pred)\n        print y_pred\n        p += precision_score(y_test, y_pred, average='weighted')\n        p1 += precision_score(y_test, y_pred, average='micro')\n        r += recall_score(y_test, 
y_pred, average='weighted')\n        r1 += recall_score(y_test, y_pred, average='micro')\n        f1 += f1_score(y_test, y_pred, average='weighted')\n        f11 += f1_score(y_test, y_pred, average='micro')\n\n\n    print \"macro results are\"\n    print \"average precision is %f\" %(p/NO_OF_FOLDS)\n    print \"average recall is %f\" %(r/NO_OF_FOLDS)\n    print \"average f1 is %f\" %(f1/NO_OF_FOLDS)\n\n    print \"micro results are\"\n    print \"average precision is %f\" %(p1/NO_OF_FOLDS)\n    print \"average recall is %f\" %(r1/NO_OF_FOLDS)\n    print \"average f1 is %f\" %(f11/NO_OF_FOLDS)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(description='LSTM based models for twitter Hate speech detection')\n    parser.add_argument('-f', '--embeddingfile', required=True)\n    parser.add_argument('-d', '--dimension', required=True)\n    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)\n    parser.add_argument('--loss', default=LOSS_FUN, required=True)\n    parser.add_argument('--optimizer', default=OPTIMIZER, required=True)\n    parser.add_argument('--epochs', default=EPOCHS, required=True)\n    parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)\n    parser.add_argument('-s', '--seed', default=SEED)\n    parser.add_argument('--folds', default=NO_OF_FOLDS)\n    parser.add_argument('--kernel', default=KERNEL)\n    parser.add_argument('--class_weight')\n    parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)\n    parser.add_argument('--learn-embeddings', action='store_true', default=False)\n    parser.add_argument('--scale-loss-function', action='store_true', default=False)\n\n\n    args = parser.parse_args()\n    GLOVE_MODEL_FILE = args.embeddingfile\n    EMBEDDING_DIM = int(args.dimension)\n    SEED = int(args.seed)\n    NO_OF_FOLDS = int(args.folds)\n    CLASS_WEIGHT = args.class_weight\n    LOSS_FUN = args.loss\n    OPTIMIZER = args.optimizer\n    KERNEL = args.kernel\n    if args.tokenizer == \"glove\":\n        TOKENIZER = glove_tokenize\n    elif args.tokenizer == \"nltk\":\n        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize\n    INITIALIZE_WEIGHTS_WITH = args.initialize_weights    \n    LEARN_EMBEDDINGS = args.learn_embeddings\n    EPOCHS = int(args.epochs)\n    BATCH_SIZE = int(args.batch_size)\n    SCALE_LOSS_FUN = args.scale_loss_function\n\n\n\n    np.random.seed(SEED)\n    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)\n    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)\n    print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))\n\n    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)\n\n    tweets = select_tweets()\n    gen_vocab()\n    #filter_vocab(20000)\n    X, y = gen_sequence()\n    #Y = y.reshape((len(y), 1))\n    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))\n    print \"max seq length is %d\"%(MAX_SEQUENCE_LENGTH)\n\n    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)\n    y = np.array(y)\n    data, y = sklearn.utils.shuffle(data, y)\n    W = get_embedding_weights()\n\n    model = lstm_model(data.shape[1], EMBEDDING_DIM)\n    #model = lstm_model(data.shape[1], 25, get_embedding_weights())\n    train_LSTM(data, y, model, EMBEDDING_DIM, W)\n\n    pdb.set_trace()\n"
  },
  {
    "path": "my_tokenizer.py",
    "content": "from string import punctuation\nfrom preprocess_twitter import tokenize as tokenizer_g\nfrom gensim.parsing.preprocessing import STOPWORDS\n\n\ndef glove_tokenize(text):\n    text = tokenizer_g(text)\n    text = ''.join([c for c in text if c not in punctuation])\n    words = text.split()\n    words = [word for word in words if word not in STOPWORDS]\n    return words\n\n"
  },
  {
    "path": "nn_classifier.py",
    "content": "from data_handler import get_data\nimport sys\nimport numpy as np\nimport pdb, json\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score, cross_val_predict\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nimport pdb\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.utils import shuffle\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.model_selection import KFold\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.utils import shuffle\nimport codecs\nimport operator\nimport gensim, sklearn\nfrom collections import defaultdict\nfrom batch_gen import batch_gen\nfrom my_tokenizer import glove_tokenize\nimport xgboost as xgb\n\n### Preparing the text data\ntexts = []  # list of text samples\nlabels_index = {}  # dictionary mapping label name to numeric id\nlabels = []  # list of label ids\nlabel_map = {\n        'none': 0,\n        'racism': 1,\n        'sexism': 2\n    }\ntweet_data = get_data()\nfor tweet in tweet_data:\n    texts.append(tweet['text'].lower())\n    labels.append(label_map[tweet['label']])\nprint('Found %s texts. (samples)' % len(texts))\n\n\n# logistic, gradient_boosting, random_forest, svm, tfidf_svm_linear, tfidf_svm_rbf\nmodel_count = 2\nword_embed_size = 200\nGLOVE_MODEL_FILE = str(sys.argv[1])\nEMBEDDING_DIM = int(sys.argv[2])\nMODEL_TYPE=sys.argv[3]\nprint 'Embedding Dimension: %d' %(EMBEDDING_DIM)\nprint 'GloVe Embedding: %s' %(GLOVE_MODEL_FILE)\n\nword2vec_model1 = np.load('fast_text.npy')\nword2vec_model1 = word2vec_model1.reshape((word2vec_model1.shape[1], word2vec_model1.shape[2]))\nf_vocab = open('vocab_fast_text', 'r')\nvocab = json.load(f_vocab)\nword2vec_model = {}\nfor k,v in vocab.iteritems():\nword2vec_model[k] = word2vec_model1[int(v)]\ndel word2vec_model1\n\n\nSEED=42\nMAX_NB_WORDS = None\nVALIDATION_SPLIT = 0.2\n\n\n# vocab generation\nvocab, reverse_vocab = {}, {}\nfreq = defaultdict(int)\ntweets = {}\n\n\ndef select_tweets_whose_embedding_exists():\n    # selects the tweets as in mean_glove_embedding method\n    # Processing\n    tweets = get_data()\n    X, Y = [], []\n    tweet_return = []\n    for tweet in tweets:\n        _emb = 0\n        words = glove_tokenize(tweet['text'])\n        for w in words:\n            if w in word2vec_model:  # Check if embeeding there in GLove model\n                _emb+=1\n        if _emb:   # Not a blank tweet\n            tweet_return.append(tweet)\n    print 'Tweets selected:', len(tweet_return)\n    #pdb.set_trace()\n    return tweet_return\n\n\ndef gen_data():\n    y_map = {\n            'none': 0,\n            'racism': 1,\n            'sexism': 2\n            }\n\n    X, y = [], []\n    for tweet in tweets:\n        words = glove_tokenize(tweet['text'])\n        emb = np.zeros(word_embed_size)\n        for word in words:\n            try:\n                emb += word2vec_model[word]\n            except:\n                pass\n        emb /= len(words)\n        X.append(emb)\n        y.append(y_map[tweet['label']])\n    X = np.array(X)\n    y = np.array(y)\n    return X, y\n\n    \ndef get_model(m_type=None):\n    if not m_type:\n       
 print 'ERROR: Please provide a valid method name'\n        return None\n\n    if m_type == 'logistic':\n        logreg = LogisticRegression()\n    elif m_type == \"gradient_boosting\":\n        #logreg = GradientBoostingClassifier(n_estimators=10)\n        logreg = xgb.XGBClassifier(nthread=-1)\n    elif m_type == \"random_forest\":\n        logreg = RandomForestClassifier(n_estimators=100, n_jobs=-1)\n    elif m_type == \"svm_rbf\":\n        logreg = SVC(class_weight=\"balanced\", kernel='rbf')\n    elif m_type == \"svm_linear\":\n        logreg = LinearSVC(class_weight=\"balanced\")\n    else:\n        print \"ERROR: Please specify a correst model\"\n        return None\n\n    return logreg\n\n\ndef classification_model(X, Y, model_type=\"logistic\"):\n    NO_OF_FOLDS=10\n    X, Y = shuffle(X, Y, random_state=SEED)\n    print \"Model Type:\", model_type\n\n    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)\n    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')\n    print \"Precision(avg): %0.3f (+/- %0.3f)\" % (scores1.mean(), scores1.std() * 2)\n\n    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')\n    print \"Recall(avg): %0.3f (+/- %0.3f)\" % (scores2.mean(), scores2.std() * 2)\n    \n    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')\n    print \"F1-score(avg): %0.3f (+/- %0.3f)\" % (scores3.mean(), scores3.std() * 2)\n\n    pdb.set_trace()\n\n\n\nif __name__ == \"__main__\":\n\n    #filter_vocab(20000)\n\n    tweets = select_tweets_whose_embedding_exists()\n    X, Y = gen_data()\n\n    classification_model(X, Y, MODEL_TYPE)\n    pdb.set_trace()\n\n\n"
  },
  {
    "path": "plot_graph_TSNE.py",
    "content": "import gensim\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport json\nfrom sklearn.manifold import TSNE\nimport pdb\nimport codecs\n\n\nwords = ['mohammed', 'murderer', 'pedophile', 'religion', 'terrorism', 'islamic', 'muslim']\n\ndef load_initial_emb():\n    initial_emb = gensim.models.Word2Vec.load_word2vec_format(\"/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.200d.txt\")\n    return initial_emb\n\ndef load_final_emb():\n    reverse_vocab = codecs.open3(\"reverse_vocab.json\", 'r', encoding=\"utf-8\").readlines()\n    reverse_vocab = json.loads(\"\".join(reverse_vocab))\n    reverse_vocab['0'] = \"<UNK>\"\n\n    final_emb = {}\n    for i, emb in enumerate(np.load(\"embedding.npy\")):\n        final_emb[reverse_vocab[str(i)].encode(\"utf-8\")] = emb\n    return final_emb\n\ndef get_transform(initial_emb, final_emb):\n    vec = []\n    for w in words:\n        vec.append(initial_emb[w])\n    for w in words:\n        vec.append(final_emb[w])\n    \n    X = np.array(vec)\n    print X.shape\n    \n    model = TSNE(n_components=2, random_state=0)\n    out = model.fit_transform(X)\n    \n    print out\n    print \"Will plot now!\"\n    return out\n\n\n\n# Initial are original\n# Next are final\ndef plot(out):\n    A = out[:7,:]\n    B = out[7:,:]\n    area=150\n    padding=0.0001\n    xmin, xmax = min(out[:, 0]), max(out[:, 0])\n    ymin, ymax = min(out[:, 1]), max(out[:, 1])\n    \n    fig, ax = plt.subplots()\n    \n    for (color, label, data) in [('red', 'GloVe', A), ('green', 'FastText+GloVe+Dyn', B)]:\n        ax.scatter(data[:,0], data[:,1], c=color, s=area, label=label,\n                                      alpha=0.3, edgecolors='none')\n        for (row, word) in zip(data, words):\n            ax.annotate(word, xy=(row[0], row[1]), xytext=(row[0], row[1]),)\n    \n    plt.axis([xmin-padding,xmax+padding,ymin-padding,ymax+padding])\n    plt.legend()\n    plt.grid(True)\n    \n    plt.show()\n\n\nif __name__==\"__main__\":\n    ini = load_initial_emb()\n    fin = load_final_emb()\n    out = get_transform(ini, fin)\n    plot(out)\n"
  },
  {
    "path": "preprocess_twitter.py",
    "content": "\"\"\"\npreprocess-twitter.py\n\npython preprocess-twitter.py \"Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)\"\n\nScript for preprocessing tweets by Romain Paulus\nwith small modifications by Jeffrey Pennington\nwith translation to Python by Motoki Wu\n\nTranslation of Ruby script to create features for GloVe vectors for Twitter data.\nhttp://nlp.stanford.edu/projects/glove/preprocess-twitter.rb\n\"\"\"\n\nimport sys\nimport re\n\nFLAGS = re.MULTILINE | re.DOTALL\n\ndef hashtag(text):\n    text = text.group()\n    hashtag_body = text[1:]\n    if hashtag_body.isupper():\n        result = u\"<hashtag> {} <allcaps>\".format(hashtag_body)\n    else:\n        result = \" \".join([\"<hashtag>\"] + re.split(ur\"(?=[A-Z])\", hashtag_body, flags=FLAGS))\n    return result\n\ndef allcaps(text):\n    text = text.group()\n    return text.lower() + \" <allcaps>\"\n\n\ndef tokenize(text):\n    # Different regex parts for smiley faces\n    eyes = r\"[8:=;]\"\n    nose = r\"['`\\-]?\"\n\n    # function so code less repetitive\n    def re_sub(pattern, repl):\n        return re.sub(pattern, repl, text, flags=FLAGS)\n\n    text = re_sub(r\"https?:\\/\\/\\S+\\b|www\\.(\\w+\\.)+\\S*\", \"<url>\")\n    text = re_sub(r\"/\",\" / \")\n    text = re_sub(r\"@\\w+\", \"<user>\")\n    text = re_sub(r\"{}{}[)dD]+|[)dD]+{}{}\".format(eyes, nose, nose, eyes), \"<smile>\")\n    text = re_sub(r\"{}{}p+\".format(eyes, nose), \"<lolface>\")\n    text = re_sub(r\"{}{}\\(+|\\)+{}{}\".format(eyes, nose, nose, eyes), \"<sadface>\")\n    text = re_sub(r\"{}{}[\\/|l*]\".format(eyes, nose), \"<neutralface>\")\n    text = re_sub(r\"<3\",\"<heart>\")\n    text = re_sub(r\"[-+]?[.\\d]*[\\d]+[:,.\\d]*\", \"<number>\")\n    text = re_sub(r\"#\\S+\", hashtag)\n    text = re_sub(r\"([!?.]){2,}\", r\"\\1 <repeat>\")\n    text = re_sub(r\"\\b(\\S*?)(.)\\2{2,}\\b\", r\"\\1\\2 <elong>\")\n\n    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.\n    # text = re_sub(r\"([^a-z0-9()<>'`\\-]){2,}\", allcaps)\n    text = re_sub(r\"([A-Z]){2,}\", allcaps)\n\n    return text.lower()\n\n\nif __name__ == '__main__':\n    _, text = sys.argv\n    if text == \"test\":\n        text = u\"I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!\"\n    tokens = tokenize(text)\n    print tokens\n"
  },
  {
    "path": "tfidf.py",
    "content": "from data_handler import get_data\nimport argparse\nimport sys\nimport numpy as np\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.model_selection import cross_val_score, cross_val_predict\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nimport pdb\nfrom sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support\nfrom sklearn.utils import shuffle\nfrom sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier\nfrom sklearn.svm import SVC, LinearSVC\nfrom sklearn.model_selection import KFold\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.utils import shuffle\nimport codecs\nimport operator\nimport gensim, sklearn\nfrom collections import defaultdict\nfrom batch_gen import batch_gen\nfrom my_tokenizer import glove_tokenize\nfrom nltk.tokenize import TweetTokenizer\n\n\n### Preparing the text data\ntexts = []  # list of text samples\nlabels_index = {}  # dictionary mapping label name to numeric id\nlabels = []  # list of label ids\n\n# vocab generation\nvocab, reverse_vocab = {}, {}\nfreq = defaultdict(int)\ntweets = {}\n\n\n# tfidf_logistic, tfidf_gradient_boosting, tfidf_random_forest, tfidf_svm_linear, tfidf_svm_rbf\nMODEL_TYPE=None\nMAX_NGRAM_LENGTH=None\nNO_OF_FOLDS=10\nCLASS_WEIGHT = None\nN_ESTIMATORS = None\nLOSS_FUN = None\nKERNEL = None\nMAX_NGRAM_LENGTH = None\nSEED=42\nTOKENIZER=None\n\n\ndef gen_data():\n    label_map = {\n            'none': 0,\n            'racism': 1,\n            'sexism': 2\n        }\n    tweet_data = get_data()\n    for tweet in tweet_data:\n        texts.append(tweet['text'].lower())\n        labels.append(label_map[tweet['label']])\n    print('Found %s texts. 
(samples)' % len(texts))\n\n\n\ndef get_model(m_type=None):\n    if not m_type:\n        print 'Please specify a model type'\n        return None\n\n    if m_type == \"tfidf_svm\":\n        logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)\n    elif m_type == \"tfidf_svm_linear\":\n        logreg = LinearSVC(C=0.01, loss=LOSS_FUN, class_weight=CLASS_WEIGHT)\n    elif m_type == 'tfidf_logistic':\n        logreg = LogisticRegression()\n    elif m_type == \"tfidf_gradient_boosting\":\n        logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)\n    elif m_type == \"tfidf_random_forest\":\n        logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)\n    else:\n        print \"ERROR: Please specify a correct model\"\n        return None\n\n    return logreg\n\n\ndef classification_model(X, Y, model_type=None):\n    X, Y = shuffle(X, Y, random_state=SEED)\n    print \"Model Type:\", model_type\n\n    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)\n    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')\n    print \"Precision(avg): %0.3f (+/- %0.3f)\" % (scores1.mean(), scores1.std() * 2)\n\n    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')\n    print \"Recall(avg): %0.3f (+/- %0.3f)\" % (scores2.mean(), scores2.std() * 2)\n\n    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')\n    print \"F1-score(avg): %0.3f (+/- %0.3f)\" % (scores3.mean(), scores3.std() * 2)\n\n\n\nif __name__ == \"__main__\":\n\n\n    parser = argparse.ArgumentParser(description='TF-IDF model for twitter Hate speech detection')\n    parser.add_argument('-m', '--model', choices=['tfidf_svm', 'tfidf_svm_linear', 'tfidf_logistic', 'tfidf_gradient_boosting', 'tfidf_random_forest'], required=True)\n    parser.add_argument('--max_ngram', required=True)\n    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)\n    parser.add_argument('-s', '--seed', default=SEED)\n    parser.add_argument('--folds', default=NO_OF_FOLDS)\n    parser.add_argument('--estimators', default=N_ESTIMATORS)\n    parser.add_argument('--loss', default=LOSS_FUN)\n    parser.add_argument('--kernel', default=KERNEL)\n    parser.add_argument('--class_weight')\n    parser.add_argument('--use-inverse-doc-freq', action='store_true')\n\n    args = parser.parse_args()\n\n    MODEL_TYPE = args.model\n    SEED = int(args.seed)\n    NO_OF_FOLDS = int(args.folds)\n    CLASS_WEIGHT = args.class_weight\n    N_ESTIMATORS = int(args.estimators) if args.estimators else args.estimators\n    LOSS_FUN = args.loss\n    KERNEL = args.kernel\n    MAX_NGRAM_LENGTH = int(args.max_ngram)\n    USE_IDF = args.use_inverse_doc_freq\n\n    if args.tokenizer == \"glove\":\n        TOKENIZER = glove_tokenize\n    elif args.tokenizer == \"nltk\":\n        TOKENIZER = TweetTokenizer().tokenize\n\n    print 'Max-ngram-length: %d' %(MAX_NGRAM_LENGTH)\n    #filter_vocab(20000)\n\n    # For TFIDF-SVC or any other variant we only need the raw texts and labels loaded by gen_data();\n    # the vocabulary filtering above is not required for the TF-IDF models\n    gen_data()\n    tfidf_transformer = TfidfVectorizer(use_idf=USE_IDF, analyzer=\"word\", tokenizer=TOKENIZER, ngram_range=(1, MAX_NGRAM_LENGTH))\n    #tfidf_transformer = TfidfVectorizer(use_idf=True, ngram_range=(1, MAX_NGRAM_LENGTH))\n    X_train_tfidf = tfidf_transformer.fit_transform(texts)\n    X = X_train_tfidf\n    Y = labels\n\n    classification_model(X, Y, MODEL_TYPE)\n"
  }
]