Repository: pinkeshbadjatiya/twitter-hatespeech
Branch: master
Commit: 3b834311953b
Files: 14
Total size: 57.6 KB

Directory structure:
gitextract_19n6zuu9/
├── .gitignore
├── BoWV.py
├── README.md
├── batch_gen.py
├── cnn.py
├── data_handler.py
├── fast_text.py
├── get_similar_words.py
├── lstm.py
├── my_tokenizer.py
├── nn_classifier.py
├── plot_graph_TSNE.py
├── preprocess_twitter.py
└── tfidf.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pyc


================================================
FILE: BoWV.py
================================================
from data_handler import get_data
import argparse
import sys
import numpy as np
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
from nltk.tokenize import TweetTokenizer


### Preparing the text data
texts = []          # list of text samples
labels_index = {}   # dictionary mapping label name to numeric id
labels = []         # list of label ids

# logistic, gradient_boosting, random_forest, svm_linear, svm_rbf
GLOVE_MODEL_FILE = None
EMBEDDING_DIM = None
MODEL_TYPE = None
CLASS_WEIGHT = None
N_ESTIMATORS = None
LOSS_FUN = None
KERNEL = None
TOKENIZER = None
SEED = 42
MAX_NB_WORDS = None
NO_OF_FOLDS = 10

# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}

word2vec_model = None


def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if embedding is there in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    return tweet_return


def gen_data():
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'].lower())
        emb = np.zeros(EMBEDDING_DIM)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return X, y
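# Illustrative note (not part of the original file): gen_data() above builds the
# "bag of word vectors" feature by averaging the GloVe vectors of a tweet's tokens.
# With a toy dict-style model the same computation looks like this; `toy_model` and
# `toy_tweet` are hypothetical names used only for this sketch.
#
#   toy_model = {'hello': np.array([1.0, 0.0]), 'world': np.array([0.0, 1.0])}
#   toy_tweet = ['hello', 'world', 'unseen-token']
#   emb = np.zeros(2)
#   for w in toy_tweet:
#       try:
#           emb += toy_model[w]       # out-of-vocabulary words are simply skipped
#       except KeyError:
#           pass
#   emb /= len(toy_tweet)             # mean vector, i.e. one row of X in gen_data()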
def get_model(m_type=None):
    if not m_type:
        print "ERROR: Please specify a model type!"
        return None
    if m_type == 'logistic':
        logreg = LogisticRegression()
    elif m_type == "gradient_boosting":
        logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)
    elif m_type == "random_forest":
        logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)
    elif m_type == "svm":
        logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)
    elif m_type == "svm_linear":
        logreg = LinearSVC(loss=LOSS_FUN, class_weight=CLASS_WEIGHT)
    else:
        print "ERROR: Please specify a correct model"
        return None
    return logreg


def classification_model(X, Y, model_type=None):
    X, Y = shuffle(X, Y, random_state=SEED)
    print "Model Type:", model_type

    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)

    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)

    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='BagOfWords model for twitter Hate speech detection')
    parser.add_argument('-m', '--model', choices=['logistic', 'gradient_boosting', 'random_forest', 'svm', 'svm_linear'], required=True)
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--estimators', default=N_ESTIMATORS)
    parser.add_argument('--loss', default=LOSS_FUN)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')

    args = parser.parse_args()
    MODEL_TYPE = args.model
    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    # --estimators is optional (defaults to None); guard the int() conversion so the
    # script does not crash when the flag is omitted.
    N_ESTIMATORS = int(args.estimators) if args.estimators else None
    LOSS_FUN = args.loss
    KERNEL = args.kernel
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = TweetTokenizer().tokenize

    print 'GLOVE embedding: %s' % (GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' % (EMBEDDING_DIM)

    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

    #filter_vocab(20000)
    tweets = select_tweets_whose_embedding_exists()
    X, Y = gen_data()

    classification_model(X, Y, MODEL_TYPE)


================================================
FILE: README.md
================================================
# Hate Speech Detection on Twitter

Implementation of our paper titled - "Deep Learning for Hate Speech Detection" (to appear in the WWW'17 proceedings).

## Dataset

The dataset can be downloaded from [https://github.com/zeerakw/hatespeech](https://github.com/zeerakw/hatespeech). It contains tweet IDs and the corresponding annotations; each tweet is labelled as racist, sexist or neither.

Use your favourite tweet crawler to download the annotated tweets and place them in a folder named `tweet_data` (a sketch of the expected layout is shown below).
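The loader in `data_handler.py` reads three JSON-lines files from `tweet_data/` — `racism.json`, `neither.json` and `sexism.json` — where each line is one tweet object carrying at least the fields `id`, `text`, `Annotation` and `user.name`. A minimal sketch of dumping crawled tweets into that layout (here `raw_tweets` is a placeholder for whatever your crawler returned, grouped by label, and is not part of this repository):

```python
import json
import codecs

# Hypothetical input: crawled tweets grouped by annotation label.
raw_tweets = {'racism': [], 'sexism': [], 'none': []}
label_to_file = {'racism': 'racism.json', 'sexism': 'sexism.json', 'none': 'neither.json'}

for label, fname in label_to_file.items():
    with codecs.open('./tweet_data/' + fname, 'w', encoding='utf-8') as f:
        for tw in raw_tweets[label]:
            # Only the fields actually read by data_handler.get_data() are required.
            record = {
                'id': tw['id'],
                'text': tw['text'],
                'Annotation': label,
                'user': {'name': tw['user']['name']},
            }
            f.write(json.dumps(record) + u'\n')
```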
## Requirements
* Keras
* Tensorflow or Theano (we experimented with Theano)
* Gensim
* xgboost
* NLTK
* Sklearn
* Numpy

## Instructions to run
Before running the models, make sure the input dataset has been set up in the folder named `tweet_data`. To train a model, use the instructions below; vary the parameter settings to test the different variations of each model.

### NN_model + GBDT
`nn_classifier.py` contains the code for running the NN_model + GBDT combination. Steps to run it:
* Run an NN_model first (CNN/LSTM/fast_text). It will create a model file.
* Change the file name at line 50 of `nn_classifier.py` so that it points to that model file.
* Run `nn_classifier.py` (`python nn_classifier.py`; see the example command sequence below for the full arguments).
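A possible end-to-end sequence, assuming the learned embedding table has been saved as `fast_text.npy` and the vocabulary as `vocab_fast_text` (the two files `nn_classifier.py` loads), and that a 200-dimensional GloVe file is available locally; the file names below are placeholders, not files shipped with the repository:

```
python fast_text.py 200
# save the learned embedding table as fast_text.npy and the vocab as vocab_fast_text
python nn_classifier.py GENSIM.glove.twitter.27B.200d.txt 200 gradient_boosting
```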
- BagOfWords models - **BoWV.py** [does not support XGBoost; uses sklearn's GBDT]
```
usage: BoWV.py [-h] -m {logistic,gradient_boosting,random_forest,svm,svm_linear}
               -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
               [-s SEED] [--folds FOLDS] [--estimators ESTIMATORS]
               [--loss LOSS] [--kernel KERNEL] [--class_weight CLASS_WEIGHT]

BagOfWords model for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -m {logistic,gradient_boosting,random_forest,svm,svm_linear}, --model {logistic,gradient_boosting,random_forest,svm,svm_linear}
  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
  -d DIMENSION, --dimension DIMENSION
  --tokenizer {glove,nltk}
  -s SEED, --seed SEED
  --folds FOLDS
  --estimators ESTIMATORS
  --loss LOSS
  --kernel KERNEL
  --class_weight CLASS_WEIGHT
```

- TF-IDF based models - **tfidf.py**
```
usage: tfidf.py [-h] -m {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}
                --max_ngram MAX_NGRAM --tokenizer {glove,nltk} [-s SEED]
                [--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]
                [--kernel KERNEL] [--class_weight CLASS_WEIGHT]
                [--use-inverse-doc-freq]

TF-IDF model for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -m {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}, --model {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}
  --max_ngram MAX_NGRAM
  --tokenizer {glove,nltk}
  -s SEED, --seed SEED
  --folds FOLDS
  --estimators ESTIMATORS
  --loss LOSS
  --kernel KERNEL
  --class_weight CLASS_WEIGHT
  --use-inverse-doc-freq
```

- LSTM(RNN) based methods - **lstm.py**
```
usage: lstm.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
               --loss LOSS --optimizer OPTIMIZER --epochs EPOCHS
               --batch-size BATCH_SIZE [-s SEED] [--folds FOLDS]
               [--kernel KERNEL] [--class_weight CLASS_WEIGHT]
               --initialize-weights {random,glove} [--learn-embeddings]
               [--scale-loss-function]

LSTM based models for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
  -d DIMENSION, --dimension DIMENSION
  --tokenizer {glove,nltk}
  --loss LOSS
  --optimizer OPTIMIZER
  --epochs EPOCHS
  --batch-size BATCH_SIZE
  -s SEED, --seed SEED
  --folds FOLDS
  --kernel KERNEL
  --class_weight CLASS_WEIGHT
  --initialize-weights {random,glove}
  --learn-embeddings
  --scale-loss-function
```

- CNN based models - **cnn.py**
```
usage: cnn.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
              --loss LOSS --optimizer OPTIMIZER --epochs EPOCHS
              --batch-size BATCH_SIZE [-s SEED] [--folds FOLDS]
              [--class_weight CLASS_WEIGHT] --initialize-weights {random,glove}
              [--learn-embeddings] [--scale-loss-function]

CNN based models for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
  -d DIMENSION, --dimension DIMENSION
  --tokenizer {glove,nltk}
  --loss LOSS
  --optimizer OPTIMIZER
  --epochs EPOCHS
  --batch-size BATCH_SIZE
  -s SEED, --seed SEED
  --folds FOLDS
  --class_weight CLASS_WEIGHT
  --initialize-weights {random,glove}
  --learn-embeddings
  --scale-loss-function
```

## Examples:
```
python BoWV.py --model logistic -f glove.twitter.27b.25d.txt -d 25 --seed 42 --folds 10 --tokenizer glove
python tfidf.py -m tfidf_svm_linear --max_ngram 3 --tokenizer glove --loss squared_hinge
python lstm.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer glove --loss categorical_crossentropy --optimizer adam --initialize-weights random --learn-embeddings --epochs 10 --batch-size 512
python cnn.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer nltk --loss categorical_crossentropy --optimizer adam --epochs 10 --batch-size 128 --initialize-weights random --scale-loss-function
```


================================================
FILE: batch_gen.py
================================================
import numpy as np
import random
import pdb
import math


def batch_gen(X, batch_size):
    n_batches = X.shape[0] / float(batch_size)
    n_batches = int(math.ceil(n_batches))
    end = int(X.shape[0] / float(batch_size)) * batch_size
    n = 0
    for i in xrange(0, n_batches):
        if i < n_batches - 1:
            batch = X[i * batch_size:(i + 1) * batch_size, :]
            yield batch
        else:
            batch = X[end:, :]
            n += X[end:, :].shape[0]
            yield batch


if __name__ == "__main__":
    X = np.random.rand(123, 32)
    for batch in batch_gen(X, 21):
        print batch.shape


================================================
FILE: cnn.py
================================================
from data_handler import get_data
import argparse
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pdb
from nltk import tokenize
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
from string import punctuation
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
import sys
from nltk import tokenize as tokenize_nltk
from my_tokenizer import glove_tokenize


### Preparing the text data
texts = []          # list of text samples
labels_index = {}   # dictionary mapping label name to numeric id
labels = []         # list of label ids

# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}

EMBEDDING_DIM = None
GLOVE_MODEL_FILE = None
NO_OF_CLASSES = 3
SEED = 42
NO_OF_FOLDS = 10
CLASS_WEIGHT = None
LOSS_FUN = None
OPTIMIZER = None
TOKENIZER = None
INITIALIZE_WEIGHTS_WITH = None
LEARN_EMBEDDINGS = None
EPOCHS = 10
BATCH_SIZE = 128
SCALE_LOSS_FUN = None

word2vec_model = None


def get_embedding(word):
    #return
    try:
        return word2vec_model[word]
    except Exception, e:
        print 'Encoding not found: %s' % (word)
        return np.zeros(EMBEDDING_DIM)


def get_embedding_weights():
    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
    n = 0
    for k, v in
vocab.iteritems(): try: embedding[v] = word2vec_model[k] except: n += 1 pass print "%d embedding missed"%n #pdb.set_trace() return embedding def select_tweets(): # selects the tweets as in mean_glove_embedding method # Processing tweets = get_data() X, Y = [], [] tweet_return = [] for tweet in tweets: _emb = 0 words = TOKENIZER(tweet['text'].lower()) for w in words: if w in word2vec_model: # Check if embeeding there in GLove model _emb+=1 if _emb: # Not a blank tweet tweet_return.append(tweet) print 'Tweets selected:', len(tweet_return) return tweet_return def gen_vocab(): # Processing vocab_index = 1 for tweet in tweets: text = TOKENIZER(tweet['text'].lower()) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] for word in words: if word not in vocab: vocab[word] = vocab_index reverse_vocab[vocab_index] = word # generate reverse vocab as well vocab_index += 1 freq[word] += 1 vocab['UNK'] = len(vocab) + 1 reverse_vocab[len(vocab)] = 'UNK' def filter_vocab(k): global freq, vocab freq_sorted = sorted(freq.items(), key=operator.itemgetter(1)) tokens = freq_sorted[:k] vocab = dict(zip(tokens, range(1, len(tokens) + 1))) vocab['UNK'] = len(vocab) + 1 def gen_sequence(): y_map = { 'none': 0, 'racism': 1, 'sexism': 2 } X, y = [], [] for tweet in tweets: text = TOKENIZER(tweet['text'].lower()) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] seq, _emb = [], [] for word in words: seq.append(vocab.get(word, vocab['UNK'])) X.append(seq) y.append(y_map[tweet['label']]) return X, y def shuffle_weights(model): weights = model.get_weights() weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights] model.set_weights(weights) def cnn_model(sequence_length, embedding_dim): model_variation = 'CNN-rand' # CNN-rand | CNN-non-static | CNN-static print('Model variation is %s' % model_variation) # Model Hyperparameters n_classes = NO_OF_CLASSES embedding_dim = EMBEDDING_DIM filter_sizes = (3, 4, 5) num_filters = 100 dropout_prob = (0.25, 0.5) hidden_dims = 100 # Training parameters # Word2Vec parameters, see train_word2vec #min_word_count = 1 # Minimum word count #context = 10 # Context window size graph_in = Input(shape=(sequence_length, embedding_dim)) convs = [] for fsz in filter_sizes: conv = Convolution1D(nb_filter=num_filters, filter_length=fsz, border_mode='valid', activation='relu')(graph_in) #,subsample_length=1)(graph_in) pool = GlobalMaxPooling1D()(conv) #flatten = Flatten()(pool) convs.append(pool) if len(filter_sizes)>1: out = Merge(mode='concat')(convs) else: out = convs[0] graph = Model(input=graph_in, output=out) # main sequential model model = Sequential() #if not model_variation=='CNN-rand': model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS)) model.add(Dropout(dropout_prob[0]))#, input_shape=(sequence_length, embedding_dim))) model.add(graph) model.add(Dropout(dropout_prob[1])) model.add(Activation('relu')) model.add(Dense(n_classes)) model.add(Activation('softmax')) model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy']) print model.summary() return model def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=BATCH_SIZE): cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42) print cv_object p, r, f1 = 0., 0., 0. p1, r1, f11 = 0., 0., 0. 
sentence_len = X.shape[1] for train_index, test_index in cv_object.split(X): if INITIALIZE_WEIGHTS_WITH == "glove": model.layers[0].set_weights([weights]) elif INITIALIZE_WEIGHTS_WITH == "random": shuffle_weights(model) else: print "ERROR!" return X_train, y_train = X[train_index], y[train_index] X_test, y_test = X[test_index], y[test_index] y_train = y_train.reshape((len(y_train), 1)) X_temp = np.hstack((X_train, y_train)) for epoch in xrange(epochs): for X_batch in batch_gen(X_temp, batch_size): x = X_batch[:, :sentence_len] y_temp = X_batch[:, sentence_len] class_weights = None if SCALE_LOSS_FUN: class_weights = {} class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp)) class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp)) class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp)) try: y_temp = np_utils.to_categorical(y_temp, nb_classes=3) except Exception as e: print e print y_temp print x.shape, y.shape loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights) print loss, acc y_pred = model.predict_on_batch(X_test) y_pred = np.argmax(y_pred, axis=1) print classification_report(y_test, y_pred) print precision_recall_fscore_support(y_test, y_pred) print y_pred p += precision_score(y_test, y_pred, average='weighted') p1 += precision_score(y_test, y_pred, average='micro') r += recall_score(y_test, y_pred, average='weighted') r1 += recall_score(y_test, y_pred, average='micro') f1 += f1_score(y_test, y_pred, average='weighted') f11 += f1_score(y_test, y_pred, average='micro') print "macro results are" print "average precision is %f" %(p/NO_OF_FOLDS) print "average recall is %f" %(r/NO_OF_FOLDS) print "average f1 is %f" %(f1/NO_OF_FOLDS) print "micro results are" print "average precision is %f" %(p1/NO_OF_FOLDS) print "average recall is %f" %(r1/NO_OF_FOLDS) print "average f1 is %f" %(f11/NO_OF_FOLDS) if __name__ == "__main__": parser = argparse.ArgumentParser(description='CNN based models for twitter Hate speech detection') parser.add_argument('-f', '--embeddingfile', required=True) parser.add_argument('-d', '--dimension', required=True) parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True) parser.add_argument('--loss', default=LOSS_FUN, required=True) parser.add_argument('--optimizer', default=OPTIMIZER, required=True) parser.add_argument('--epochs', default=EPOCHS, required=True) parser.add_argument('--batch-size', default=BATCH_SIZE, required=True) parser.add_argument('-s', '--seed', default=SEED) parser.add_argument('--folds', default=NO_OF_FOLDS) parser.add_argument('--class_weight') parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True) parser.add_argument('--learn-embeddings', action='store_true', default=False) parser.add_argument('--scale-loss-function', action='store_true', default=False) args = parser.parse_args() GLOVE_MODEL_FILE = args.embeddingfile EMBEDDING_DIM = int(args.dimension) SEED = int(args.seed) NO_OF_FOLDS = int(args.folds) CLASS_WEIGHT = args.class_weight LOSS_FUN = args.loss OPTIMIZER = args.optimizer if args.tokenizer == "glove": TOKENIZER = glove_tokenize elif args.tokenizer == "nltk": TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize INITIALIZE_WEIGHTS_WITH = args.initialize_weights LEARN_EMBEDDINGS = args.learn_embeddings EPOCHS = int(args.epochs) BATCH_SIZE = int(args.batch_size) SCALE_LOSS_FUN = args.scale_loss_function print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE) print 'Embedding Dimension: %d' 
%(EMBEDDING_DIM) print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS)) word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE) np.random.seed(SEED) Tweets = select_tweets() tweets = Tweets gen_vocab() #filter_vocab(20000) X, y = gen_sequence() #Y = y.reshape((len(y), 1)) MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X)) print "max seq length is %d"%(MAX_SEQUENCE_LENGTH) data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH) y = np.array(y) data, y = sklearn.utils.shuffle(data, y) W = get_embedding_weights() model = cnn_model(data.shape[1], EMBEDDING_DIM) train_CNN(data, y, EMBEDDING_DIM, model, W) pdb.set_trace() ================================================ FILE: data_handler.py ================================================ import json import pdb import codecs import pdb def get_data(): tweets = [] files = ['racism.json', 'neither.json', 'sexism.json'] for file in files: with codecs.open('./tweet_data/' + file, 'r', encoding='utf-8') as f: data = f.readlines() for line in data: tweet_full = json.loads(line) tweets.append({ 'id': tweet_full['id'], 'text': tweet_full['text'].lower(), 'label': tweet_full['Annotation'], 'name': tweet_full['user']['name'].split()[0] }) #pdb.set_trace() return tweets if __name__=="__main__": tweets = get_data() males, females = {}, {} with open('./tweet_data/males.txt') as f: males = set([w.strip() for w in f.readlines()]) with open('./tweet_data/females.txt') as f: females = set([w.strip() for w in f.readlines()]) males_c, females_c, not_found = 0, 0, 0 for t in tweets: if t['name'] in males: males_c += 1 elif t['name'] in females: females_c += 1 else: not_found += 1 print males_c, females_c, not_found pdb.set_trace() ================================================ FILE: fast_text.py ================================================ from data_handler import get_data from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from keras.layers import Embedding, Input, LSTM from keras.models import Sequential, Model from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D import numpy as np from preprocess_twitter import tokenize as tokenizer_g import pdb from nltk import tokenize from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from gensim.parsing.preprocessing import STOPWORDS from sklearn.model_selection import KFold from keras.utils import np_utils import codecs import operator import gensim, sklearn from collections import defaultdict from batch_gen import batch_gen from string import punctuation from get_similar_words import get_similar_words import sys ### Preparing the text data texts = [] # list of text samples labels_index = {} # dictionary mapping label name to numeric id labels = [] # list of label ids label_map = { 'none': 0, 'racism': 1, 'sexism': 2 } tweet_data = get_data() for tweet in tweet_data: texts.append(tweet['text']) labels.append(label_map[tweet['label']]) print('Found %s texts. 
(samples)' % len(texts)) EMBEDDING_DIM = int(sys.argv[1]) np.random.seed(42) # Load the orginal glove file # SHASHANK files #GLOVE_MODEL_FILE="/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v" # PINKESH files GLOVE_MODEL_FILE="/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B." + str(EMBEDDING_DIM) + "d.txt" NO_OF_CLASSES=3 MAX_NB_WORDS = None VALIDATION_SPLIT = 0.2 word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE) # vocab generation MyTokenizer = tokenize.casual.TweetTokenizer(strip_handles=True, reduce_len=True) vocab, reverse_vocab = {}, {} freq = defaultdict(int) tweets = {} def get_embedding(word): #return try: return word2vec_model[word] except Exception, e: print 'Encoding not found: %s' %(word) return np.zeros(EMBEDDING_DIM) def get_embedding_weights(): embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM)) n = 0 for k, v in vocab.iteritems(): try: embedding[v] = word2vec_model[k] except: n += 1 pass print "%d embedding missed"%n #pdb.set_trace() return embedding def select_tweets(): # selects the tweets as in mean_glove_embedding method # Processing tweets = get_data() X, Y = [], [] tweet_return = [] for tweet in tweets: _emb = 0 words = Tokenize(tweet['text']).split() for w in words: if w in word2vec_model: # Check if embeeding there in GLove model _emb+=1 if _emb: # Not a blank tweet tweet_return.append(tweet) print 'Tweets selected:', len(tweet_return) #pdb.set_trace() return tweet_return def gen_vocab(): # Processing vocab_index = 1 for tweet in tweets: text = Tokenize(tweet['text']) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] for word in words: if word not in vocab: vocab[word] = vocab_index reverse_vocab[vocab_index] = word # generate reverse vocab as well vocab_index += 1 freq[word] += 1 vocab['UNK'] = len(vocab) + 1 reverse_vocab[len(vocab)] = 'UNK' #pdb.set_trace() def filter_vocab(k): global freq, vocab #pdb.set_trace() freq_sorted = sorted(freq.items(), key=operator.itemgetter(1)) tokens = freq_sorted[:k] vocab = dict(zip(tokens, range(1, len(tokens) + 1))) vocab['UNK'] = len(vocab) + 1 def gen_sequence(): y_map = { 'none': 0, 'racism': 1, 'sexism': 2 } X, y = [], [] for tweet in tweets: text = Tokenize(tweet['text']) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] seq, _emb = [], [] for word in words: seq.append(vocab.get(word, vocab['UNK'])) X.append(seq) y.append(y_map[tweet['label']]) return X, y def Tokenize(tweet): #return MyTokenizer.tokenize(tweet) #pdb.set_trace() return tokenizer_g(tweet) def shuffle_weights(model): weights = model.get_weights() weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights] model.set_weights(weights) def fast_text_model(sequence_length): model = Sequential() model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length)) #model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length, trainable=False)) model.add(Dropout(0.5)) model.add(GlobalAveragePooling1D()) model.add(Dense(3, activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) print model.summary() return model def train_fasttext(X, y, model, inp_dim,embedding_weights, epochs=10, batch_size=128): cv_object = KFold(n_splits=10, shuffle=True, random_state=42) print cv_object p, r, f1 = 0., 0., 0. p1, r1, f11 = 0., 0., 0. 
sentence_len = X.shape[1] lookup_table = np.zeros_like(model.layers[0].get_weights()[0]) for train_index, test_index in cv_object.split(X): shuffle_weights(model) #pdb.set_trace() #model.layers[0].set_weights([embedding_weights]) X_train, y_train = X[train_index], y[train_index] X_test, y_test = X[test_index], y[test_index] y_train = y_train.reshape((len(y_train), 1)) X_temp = np.hstack((X_train, y_train)) for epoch in xrange(epochs): for X_batch in batch_gen(X_temp, batch_size): x = X_batch[:, :sentence_len] y_temp = X_batch[:, sentence_len] class_weights = {} class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp)) class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp)) class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp)) try: y_temp = np_utils.to_categorical(y_temp, nb_classes=3) except Exception as e: print e #print x.shape, y.shape loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights) print loss, acc #pdb.set_trace() lookup_table += model.layers[0].get_weights()[0] y_pred = model.predict_on_batch(X_test) y_pred = np.argmax(y_pred, axis=1) print classification_report(y_test, y_pred) print precision_recall_fscore_support(y_test, y_pred) print y_pred p += precision_score(y_test, y_pred, average='weighted') p1 += precision_score(y_test, y_pred, average='micro') r += recall_score(y_test, y_pred, average='weighted') r1 += recall_score(y_test, y_pred, average='micro') f1 += f1_score(y_test, y_pred, average='weighted') f11 += f1_score(y_test, y_pred, average='micro') print "macro results are" print "average precision is %f" %(p/10) print "average recall is %f" %(r/10) print "average f1 is %f" %(f1/10) print "micro results are" print "average precision is %f" %(p1/10) print "average recall is %f" %(r1/10) print "average f1 is %f" %(f11/10) return lookup_table/float(10) def check_semantic_sim(embedding_table, word): reverse_vocab = {v:k for k,v in vocab.iteritems()} sim_word_idx = get_similar_words(embedding_table, embedding_table[vocab[word]], 25) sim_words = map(lambda x:reverse_vocab[x[1]], sim_word_idx) print sim_words def tryWord(embedding_table): while True: print "enter word" word = raw_input() if word == "pdb": pdb.set_trace() elif word == 'exit': return else: check_semantic_sim(embedding_table, word) if __name__ == "__main__": Tweets = select_tweets() tweets = Tweets gen_vocab() X, y = gen_sequence() MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X)) print "max seq length is %d"%(MAX_SEQUENCE_LENGTH) data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH) y = np.array(y) W = get_embedding_weights() data, y = sklearn.utils.shuffle(data, y) model = fast_text_model(data.shape[1]) _ = train_fasttext(data, y, model, EMBEDDING_DIM, W) table = model.layers[0].get_weights()[0] #check_semantic_sim(table) tryWord(table) pdb.set_trace() ================================================ FILE: get_similar_words.py ================================================ from sklearn.metrics.pairwise import cosine_similarity import numpy as np import pdb def get_similar_words(X, vec, K=1): # X: (n_samples, n_features) # vec: (1, n_features) # returns: K top most similar words with score values and their indexes scores = cosine_similarity(X, vec) scores = sorted([(val, index) for index, val in enumerate(scores.reshape((1,scores.shape[0]))[0])], reverse=True) scores = scores[1:K] return scores ================================================ FILE: lstm.py ================================================ from data_handler import get_data import 
argparse from keras.preprocessing.sequence import pad_sequences from keras.layers import Embedding, Input, LSTM from keras.models import Sequential, Model from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D import numpy as np import pdb from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from gensim.parsing.preprocessing import STOPWORDS from sklearn.model_selection import KFold from keras.utils import np_utils import codecs import operator import gensim, sklearn from string import punctuation from collections import defaultdict from batch_gen import batch_gen import sys from nltk import tokenize as tokenize_nltk from my_tokenizer import glove_tokenize ### Preparing the text data texts = [] # list of text samples labels_index = {} # dictionary mapping label name to numeric id labels = [] # list of label ids # vocab generation vocab, reverse_vocab = {}, {} freq = defaultdict(int) tweets = {} EMBEDDING_DIM = None GLOVE_MODEL_FILE = None SEED = 42 NO_OF_FOLDS = 10 CLASS_WEIGHT = None LOSS_FUN = None OPTIMIZER = None KERNEL = None TOKENIZER = None MAX_SEQUENCE_LENGTH = None INITIALIZE_WEIGHTS_WITH = None LEARN_EMBEDDINGS = None EPOCHS = 10 BATCH_SIZE = 512 SCALE_LOSS_FUN = None word2vec_model = None def get_embedding(word): #return try: return word2vec_model[word] except Exception, e: print 'Encoding not found: %s' %(word) return np.zeros(EMBEDDING_DIM) def get_embedding_weights(): embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM)) n = 0 for k, v in vocab.iteritems(): try: embedding[v] = word2vec_model[k] except: n += 1 pass print "%d embedding missed"%n return embedding def select_tweets(): # selects the tweets as in mean_glove_embedding method # Processing tweets = get_data() X, Y = [], [] tweet_return = [] for tweet in tweets: _emb = 0 words = TOKENIZER(tweet['text'].lower()) for w in words: if w in word2vec_model: # Check if embeeding there in GLove model _emb+=1 if _emb: # Not a blank tweet tweet_return.append(tweet) print 'Tweets selected:', len(tweet_return) #pdb.set_trace() return tweet_return def gen_vocab(): # Processing vocab_index = 1 for tweet in tweets: text = TOKENIZER(tweet['text'].lower()) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] for word in words: if word not in vocab: vocab[word] = vocab_index reverse_vocab[vocab_index] = word # generate reverse vocab as well vocab_index += 1 freq[word] += 1 vocab['UNK'] = len(vocab) + 1 reverse_vocab[len(vocab)] = 'UNK' def filter_vocab(k): global freq, vocab pdb.set_trace() freq_sorted = sorted(freq.items(), key=operator.itemgetter(1)) tokens = freq_sorted[:k] vocab = dict(zip(tokens, range(1, len(tokens) + 1))) vocab['UNK'] = len(vocab) + 1 def gen_sequence(): y_map = { 'none': 0, 'racism': 1, 'sexism': 2 } X, y = [], [] for tweet in tweets: text = TOKENIZER(tweet['text'].lower()) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] seq, _emb = [], [] for word in words: seq.append(vocab.get(word, vocab['UNK'])) X.append(seq) y.append(y_map[tweet['label']]) return X, y def shuffle_weights(model): weights = model.get_weights() weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights] 
model.set_weights(weights) def lstm_model(sequence_length, embedding_dim): model_variation = 'LSTM' print('Model variation is %s' % model_variation) model = Sequential() model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS)) model.add(Dropout(0.25))#, input_shape=(sequence_length, embedding_dim))) model.add(LSTM(50)) model.add(Dropout(0.5)) model.add(Dense(3)) model.add(Activation('softmax')) model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy']) print model.summary() return model def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE): cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42) print cv_object p, r, f1 = 0., 0., 0. p1, r1, f11 = 0., 0., 0. sentence_len = X.shape[1] for train_index, test_index in cv_object.split(X): if INITIALIZE_WEIGHTS_WITH == "glove": model.layers[0].set_weights([weights]) elif INITIALIZE_WEIGHTS_WITH == "random": shuffle_weights(model) else: print "ERROR!" return X_train, y_train = X[train_index], y[train_index] X_test, y_test = X[test_index], y[test_index] y_train = y_train.reshape((len(y_train), 1)) X_temp = np.hstack((X_train, y_train)) for epoch in xrange(epochs): for X_batch in batch_gen(X_temp, batch_size): x = X_batch[:, :sentence_len] y_temp = X_batch[:, sentence_len] class_weights = None if SCALE_LOSS_FUN: class_weights = {} class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp)) class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp)) class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp)) try: y_temp = np_utils.to_categorical(y_temp, nb_classes=3) except Exception as e: print e print y_temp print x.shape, y.shape loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights) print loss, acc y_pred = model.predict_on_batch(X_test) y_pred = np.argmax(y_pred, axis=1) print classification_report(y_test, y_pred) print precision_recall_fscore_support(y_test, y_pred) print y_pred p += precision_score(y_test, y_pred, average='weighted') p1 += precision_score(y_test, y_pred, average='micro') r += recall_score(y_test, y_pred, average='weighted') r1 += recall_score(y_test, y_pred, average='micro') f1 += f1_score(y_test, y_pred, average='weighted') f11 += f1_score(y_test, y_pred, average='micro') print "macro results are" print "average precision is %f" %(p/NO_OF_FOLDS) print "average recall is %f" %(r/NO_OF_FOLDS) print "average f1 is %f" %(f1/NO_OF_FOLDS) print "micro results are" print "average precision is %f" %(p1/NO_OF_FOLDS) print "average recall is %f" %(r1/NO_OF_FOLDS) print "average f1 is %f" %(f11/NO_OF_FOLDS) if __name__ == "__main__": parser = argparse.ArgumentParser(description='LSTM based models for twitter Hate speech detection') parser.add_argument('-f', '--embeddingfile', required=True) parser.add_argument('-d', '--dimension', required=True) parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True) parser.add_argument('--loss', default=LOSS_FUN, required=True) parser.add_argument('--optimizer', default=OPTIMIZER, required=True) parser.add_argument('--epochs', default=EPOCHS, required=True) parser.add_argument('--batch-size', default=BATCH_SIZE, required=True) parser.add_argument('-s', '--seed', default=SEED) parser.add_argument('--folds', default=NO_OF_FOLDS) parser.add_argument('--kernel', default=KERNEL) parser.add_argument('--class_weight') parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True) 
parser.add_argument('--learn-embeddings', action='store_true', default=False) parser.add_argument('--scale-loss-function', action='store_true', default=False) args = parser.parse_args() GLOVE_MODEL_FILE = args.embeddingfile EMBEDDING_DIM = int(args.dimension) SEED = int(args.seed) NO_OF_FOLDS = int(args.folds) CLASS_WEIGHT = args.class_weight LOSS_FUN = args.loss OPTIMIZER = args.optimizer KERNEL = args.kernel if args.tokenizer == "glove": TOKENIZER = glove_tokenize elif args.tokenizer == "nltk": TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize INITIALIZE_WEIGHTS_WITH = args.initialize_weights LEARN_EMBEDDINGS = args.learn_embeddings EPOCHS = int(args.epochs) BATCH_SIZE = int(args.batch_size) SCALE_LOSS_FUN = args.scale_loss_function np.random.seed(SEED) print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE) print 'Embedding Dimension: %d' %(EMBEDDING_DIM) print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS)) word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE) tweets = select_tweets() gen_vocab() #filter_vocab(20000) X, y = gen_sequence() #Y = y.reshape((len(y), 1)) MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X)) print "max seq length is %d"%(MAX_SEQUENCE_LENGTH) data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH) y = np.array(y) data, y = sklearn.utils.shuffle(data, y) W = get_embedding_weights() model = lstm_model(data.shape[1], EMBEDDING_DIM) #model = lstm_model(data.shape[1], 25, get_embedding_weights()) train_LSTM(data, y, model, EMBEDDING_DIM, W) pdb.set_trace() ================================================ FILE: my_tokenizer.py ================================================ from string import punctuation from preprocess_twitter import tokenize as tokenizer_g from gensim.parsing.preprocessing import STOPWORDS def glove_tokenize(text): text = tokenizer_g(text) text = ''.join([c for c in text if c not in punctuation]) words = text.split() words = [word for word in words if word not in STOPWORDS] return words ================================================ FILE: nn_classifier.py ================================================ from data_handler import get_data import sys import numpy as np import pdb, json from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.model_selection import cross_val_score, cross_val_predict from sklearn.feature_extraction.text import TfidfVectorizer import pdb from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support from sklearn.utils import shuffle from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.svm import SVC, LinearSVC from sklearn.model_selection import KFold from sklearn.linear_model import LogisticRegression from sklearn.utils import shuffle import codecs import operator import gensim, sklearn from collections import defaultdict from batch_gen import batch_gen from my_tokenizer import glove_tokenize import xgboost as xgb ### Preparing the text data texts = [] # list of text samples labels_index = {} # dictionary mapping label name to numeric id labels = [] # list of label ids label_map = { 'none': 0, 'racism': 1, 'sexism': 2 } tweet_data = get_data() for tweet in tweet_data: texts.append(tweet['text'].lower()) 
labels.append(label_map[tweet['label']]) print('Found %s texts. (samples)' % len(texts)) # logistic, gradient_boosting, random_forest, svm, tfidf_svm_linear, tfidf_svm_rbf model_count = 2 word_embed_size = 200 GLOVE_MODEL_FILE = str(sys.argv[1]) EMBEDDING_DIM = int(sys.argv[2]) MODEL_TYPE=sys.argv[3] print 'Embedding Dimension: %d' %(EMBEDDING_DIM) print 'GloVe Embedding: %s' %(GLOVE_MODEL_FILE) word2vec_model1 = np.load('fast_text.npy') word2vec_model1 = word2vec_model1.reshape((word2vec_model1.shape[1], word2vec_model1.shape[2])) f_vocab = open('vocab_fast_text', 'r') vocab = json.load(f_vocab) word2vec_model = {} for k,v in vocab.iteritems(): word2vec_model[k] = word2vec_model1[int(v)] del word2vec_model1 SEED=42 MAX_NB_WORDS = None VALIDATION_SPLIT = 0.2 # vocab generation vocab, reverse_vocab = {}, {} freq = defaultdict(int) tweets = {} def select_tweets_whose_embedding_exists(): # selects the tweets as in mean_glove_embedding method # Processing tweets = get_data() X, Y = [], [] tweet_return = [] for tweet in tweets: _emb = 0 words = glove_tokenize(tweet['text']) for w in words: if w in word2vec_model: # Check if embeeding there in GLove model _emb+=1 if _emb: # Not a blank tweet tweet_return.append(tweet) print 'Tweets selected:', len(tweet_return) #pdb.set_trace() return tweet_return def gen_data(): y_map = { 'none': 0, 'racism': 1, 'sexism': 2 } X, y = [], [] for tweet in tweets: words = glove_tokenize(tweet['text']) emb = np.zeros(word_embed_size) for word in words: try: emb += word2vec_model[word] except: pass emb /= len(words) X.append(emb) y.append(y_map[tweet['label']]) X = np.array(X) y = np.array(y) return X, y def get_model(m_type=None): if not m_type: print 'ERROR: Please provide a valid method name' return None if m_type == 'logistic': logreg = LogisticRegression() elif m_type == "gradient_boosting": #logreg = GradientBoostingClassifier(n_estimators=10) logreg = xgb.XGBClassifier(nthread=-1) elif m_type == "random_forest": logreg = RandomForestClassifier(n_estimators=100, n_jobs=-1) elif m_type == "svm_rbf": logreg = SVC(class_weight="balanced", kernel='rbf') elif m_type == "svm_linear": logreg = LinearSVC(class_weight="balanced") else: print "ERROR: Please specify a correst model" return None return logreg def classification_model(X, Y, model_type="logistic"): NO_OF_FOLDS=10 X, Y = shuffle(X, Y, random_state=SEED) print "Model Type:", model_type #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS) scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted') print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2) scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted') print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2) scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted') print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2) pdb.set_trace() if __name__ == "__main__": #filter_vocab(20000) tweets = select_tweets_whose_embedding_exists() X, Y = gen_data() classification_model(X, Y, MODEL_TYPE) pdb.set_trace() ================================================ FILE: plot_graph_TSNE.py ================================================ import gensim import numpy as np import matplotlib.pyplot as plt import json from sklearn.manifold import TSNE import pdb import codecs words = ['mohammed', 'murderer', 'pedophile', 'religion', 'terrorism', 'islamic', 'muslim'] def 
load_initial_emb(): initial_emb = gensim.models.Word2Vec.load_word2vec_format("/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.200d.txt") return initial_emb def load_final_emb(): reverse_vocab = codecs.open3("reverse_vocab.json", 'r', encoding="utf-8").readlines() reverse_vocab = json.loads("".join(reverse_vocab)) reverse_vocab['0'] = "" final_emb = {} for i, emb in enumerate(np.load("embedding.npy")): final_emb[reverse_vocab[str(i)].encode("utf-8")] = emb return final_emb def get_transform(initial_emb, final_emb): vec = [] for w in words: vec.append(initial_emb[w]) for w in words: vec.append(final_emb[w]) X = np.array(vec) print X.shape model = TSNE(n_components=2, random_state=0) out = model.fit_transform(X) print out print "Will plot now!" return out # Initial are original # Next are final def plot(out): A = out[:7,:] B = out[7:,:] area=150 padding=0.0001 xmin, xmax = min(out[:, 0]), max(out[:, 0]) ymin, ymax = min(out[:, 1]), max(out[:, 1]) fig, ax = plt.subplots() for (color, label, data) in [('red', 'GloVe', A), ('green', 'FastText+GloVe+Dyn', B)]: ax.scatter(data[:,0], data[:,1], c=color, s=area, label=label, alpha=0.3, edgecolors='none') for (row, word) in zip(data, words): ax.annotate(word, xy=(row[0], row[1]), xytext=(row[0], row[1]),) plt.axis([xmin-padding,xmax+padding,ymin-padding,ymax+padding]) plt.legend() plt.grid(True) plt.show() if __name__=="__main__": ini = load_initial_emb() fin = load_final_emb() out = get_transform(ini, fin) plot(out) ================================================ FILE: preprocess_twitter.py ================================================ """ preprocess-twitter.py python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)" Script for preprocessing tweets by Romain Paulus with small modifications by Jeffrey Pennington with translation to Python by Motoki Wu Translation of Ruby script to create features for GloVe vectors for Twitter data. http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb """ import sys import re FLAGS = re.MULTILINE | re.DOTALL def hashtag(text): text = text.group() hashtag_body = text[1:] if hashtag_body.isupper(): result = u" {} ".format(hashtag_body) else: result = " ".join([""] + re.split(ur"(?=[A-Z])", hashtag_body, flags=FLAGS)) return result def allcaps(text): text = text.group() return text.lower() + " " def tokenize(text): # Different regex parts for smiley faces eyes = r"[8:=;]" nose = r"['`\-]?" # function so code less repetitive def re_sub(pattern, repl): return re.sub(pattern, repl, text, flags=FLAGS) text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "") text = re_sub(r"/"," / ") text = re_sub(r"@\w+", "") text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "") text = re_sub(r"{}{}p+".format(eyes, nose), "") text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "") text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "") text = re_sub(r"<3","") text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "") text = re_sub(r"#\S+", hashtag) text = re_sub(r"([!?.]){2,}", r"\1 ") text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 ") ## -- I just don't understand why the Ruby script adds to everything so I limited the selection. # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps) text = re_sub(r"([A-Z]){2,}", allcaps) return text.lower() if __name__ == '__main__': _, text = sys.argv if text == "test": text = u"I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!" 
tokens = tokenize(text) print tokens ================================================ FILE: tfidf.py ================================================ from data_handler import get_data import argparse import sys import numpy as np from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.model_selection import cross_val_score, cross_val_predict from sklearn.feature_extraction.text import TfidfVectorizer import pdb from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support from sklearn.utils import shuffle from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.svm import SVC, LinearSVC from sklearn.model_selection import KFold from sklearn.linear_model import LogisticRegression from sklearn.utils import shuffle import codecs import operator import gensim, sklearn from collections import defaultdict from batch_gen import batch_gen from my_tokenizer import glove_tokenize from nltk.tokenize import TweetTokenizer ### Preparing the text data texts = [] # list of text samples labels_index = {} # dictionary mapping label name to numeric id labels = [] # list of label ids # vocab generation vocab, reverse_vocab = {}, {} freq = defaultdict(int) tweets = {} # tfidf_logistic, tfidf_gradient_boosting, tfidf_random_forest, tfidf_svm_linear, tfidf_svm_rbf MODEL_TYPE=None MAX_NGRAM_LENGTH=None NO_OF_FOLDS=10 CLASS_WEIGHT = None N_ESTIMATORS = None LOSS_FUN = None KERNEL = None MAX_NGRAM_LENGTH = None SEED=42 TOKENIZER=None def gen_data(): label_map = { 'none': 0, 'racism': 1, 'sexism': 2 } tweet_data = get_data() for tweet in tweet_data: texts.append(tweet['text'].lower()) labels.append(label_map[tweet['label']]) print('Found %s texts. 
(samples)' % len(texts)) def get_model(m_type=None): if not m_type: print 'Please specify a model type' return None if m_type == "tfidf_svm": logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL) elif m_type == "tfidf_svm_linear": logreg = LinearSVC(C=0.01, loss=LOSS_FUN, class_weight=CLASS_WEIGHT) elif m_type == 'tfidf_logistic': logreg = LogisticRegression() elif m_type == "tfidf_gradient_boosting": logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS) elif m_type == "tfidf_random_forest": logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS) print "ERROR: Please specify a correct model" return None return logreg def classification_model(X, Y, model_type=None): X, Y = shuffle(X, Y, random_state=SEED) print "Model Type:", model_type #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS) scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted') print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2) scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted') print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2) scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted') print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2) if __name__ == "__main__": parser = argparse.ArgumentParser(description='TF-IDF model for twitter Hate speech detection') parser.add_argument('-m', '--model', choices=['tfidf_svm', 'tfidf_svm_linear', 'tfidf_logistic', 'tfidf_gradient_boosting', 'tfidf_random_forest'], required=True) parser.add_argument('--max_ngram', required=True) parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True) parser.add_argument('-s', '--seed', default=SEED) parser.add_argument('--folds', default=NO_OF_FOLDS) parser.add_argument('--estimators', default=N_ESTIMATORS) parser.add_argument('--loss', default=LOSS_FUN) parser.add_argument('--kernel', default=KERNEL) parser.add_argument('--class_weight') parser.add_argument('--use-inverse-doc-freq', action='store_true') args = parser.parse_args() MODEL_TYPE = args.model SEED = int(args.seed) NO_OF_FOLDS = int(args.folds) CLASS_WEIGHT = args.class_weight N_ESTIMATORS = int(args.estimators) if args.estimators else args.estimators LOSS_FUN = args.loss KERNEL = args.kernel MAX_NGRAM_LENGTH = int(args.max_ngram) USE_IDF = args.use_inverse_doc_freq if args.tokenizer == "glove": TOKENIZER = glove_tokenize elif args.tokenizer == "nltk": TOKENIZER = TweetTokenizer().tokenize print 'Max-ngram-length: %d' %(MAX_NGRAM_LENGTH) #filter_vocab(20000) # For TFIDF-SVC or any other varient # We do not need to run the above code for TFIDF # It does not use the filtered data using gen_data() gen_data() tfidf_transformer = TfidfVectorizer(use_idf=USE_IDF, analyzer="word", tokenizer=TOKENIZER, ngram_range=(1, MAX_NGRAM_LENGTH)) #tfidf_transformer = TfidfVectorizer(use_idf=True, ngram_range=(1, MAX_NGRAM_LENGTH)) X_train_tfidf = tfidf_transformer.fit_transform(texts) X = X_train_tfidf Y = labels classification_model(X, Y, MODEL_TYPE)
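# Illustrative sketch (not part of the original script): the feature pipeline above
# amounts to fitting a word n-gram TfidfVectorizer with the chosen tokenizer over the
# raw tweet texts, e.g. for the glove tokenizer and --max_ngram 3:
#
#   vec = TfidfVectorizer(use_idf=True, analyzer="word",
#                         tokenizer=glove_tokenize, ngram_range=(1, 3))
#   X = vec.fit_transform(texts)   # sparse (n_tweets x n_word_ngrams) feature matrix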