Repository: pinkeshbadjatiya/twitter-hatespeech
Branch: master
Commit: 3b834311953b
Files: 14
Total size: 57.6 KB
Directory structure:
gitextract_19n6zuu9/
├── .gitignore
├── BoWV.py
├── README.md
├── batch_gen.py
├── cnn.py
├── data_handler.py
├── fast_text.py
├── get_similar_words.py
├── lstm.py
├── my_tokenizer.py
├── nn_classifier.py
├── plot_graph_TSNE.py
├── preprocess_twitter.py
└── tfidf.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.pyc
================================================
FILE: BoWV.py
================================================
from data_handler import get_data
import argparse
import sys
import numpy as np
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
from nltk.tokenize import TweetTokenizer
### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Model choices: logistic, gradient_boosting, random_forest, svm_linear, svm_rbf
# All of the following module-level settings are filled in from the
# command-line arguments inside the __main__ block below.
GLOVE_MODEL_FILE = None  # path to the GloVe embedding file (word2vec text format)
EMBEDDING_DIM = None     # dimensionality of the word embeddings
MODEL_TYPE = None        # which sklearn classifier to train
CLASS_WEIGHT = None      # sklearn class_weight parameter
N_ESTIMATORS = None      # number of estimators for the ensemble models
LOSS_FUN = None          # loss for GradientBoosting / LinearSVC
KERNEL = None            # kernel for SVC
TOKENIZER = None         # tokenizer callable chosen via --tokenizer
SEED = 42                # RNG seed used when shuffling the data
MAX_NB_WORDS = None      # unused here; kept for parity with the NN scripts
NO_OF_FOLDS = 10         # number of cross-validation folds

# vocab generation
vocab, reverse_vocab = {}, {}  # word -> index / index -> word (unused in this script)
freq = defaultdict(int)        # word frequency counts
tweets = {}                    # replaced by the selected tweet list in __main__
word2vec_model = None          # gensim embedding model, loaded in __main__
def select_tweets_whose_embedding_exists():
# selects the tweets as in mean_glove_embedding method
# Processing
tweets = get_data()
X, Y = [], []
tweet_return = []
for tweet in tweets:
_emb = 0
words = TOKENIZER(tweet['text'].lower())
for w in words:
if w in word2vec_model: # Check if embeeding there in GLove model
_emb+=1
if _emb: # Not a blank tweet
tweet_return.append(tweet)
print 'Tweets selected:', len(tweet_return)
return tweet_return
def gen_data():
    """Build mean-embedding features: one averaged GloVe vector per tweet.

    Returns (X, y) where X is a list of EMBEDDING_DIM-sized vectors (the mean
    of the embeddings of the tweet's tokens) and y the numeric class labels.
    """
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'].lower())
        emb = np.zeros(EMBEDDING_DIM)
        for word in words:
            # Explicit membership test replaces the original bare
            # `except: pass`, which silently swallowed every exception.
            if word in word2vec_model:
                emb += word2vec_model[word]
        # Guard against an empty tokenization (the original raised
        # ZeroDivisionError here); such a tweet yields a zero vector.
        emb /= max(len(words), 1)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return X, y
def get_model(m_type=None):
if not m_type:
print "ERROR: Please specify a model type!"
return None
if m_type == 'logistic':
logreg = LogisticRegression()
elif m_type == "gradient_boosting":
logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)
elif m_type == "random_forest":
logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)
elif m_type == "svm":
logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)
elif m_type == "svm_linear":
logreg = LinearSVC(loss=LOSS_FUN, class_weight=CLASS_WEIGHT)
else:
print "ERROR: Please specify a correct model"
return None
return logreg
def classification_model(X, Y, model_type=None):
X, Y = shuffle(X, Y, random_state=SEED)
print "Model Type:", model_type
#predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)
scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='BagOfWords model for twitter Hate speech detection')
    parser.add_argument('-m', '--model', choices=['logistic', 'gradient_boosting', 'random_forest', 'svm', 'svm_linear'], required=True)
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--estimators', default=N_ESTIMATORS)
    parser.add_argument('--loss', default=LOSS_FUN)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')
    args = parser.parse_args()

    # Copy the parsed CLI arguments into the module-level configuration globals.
    MODEL_TYPE = args.model
    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    # NOTE(review): --estimators defaults to None, so int(args.estimators)
    # raises TypeError unless it is always passed — confirm intended usage.
    N_ESTIMATORS = int(args.estimators)
    LOSS_FUN = args.loss
    KERNEL = args.kernel

    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = TweetTokenizer().tokenize

    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)

    # NOTE(review): load_word2vec_format moved to gensim.models.KeyedVectors in
    # gensim >= 1.0 — this call only works on older gensim versions.
    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)
    #filter_vocab(20000)

    tweets = select_tweets_whose_embedding_exists()
    X, Y = gen_data()
    classification_model(X, Y, MODEL_TYPE)
================================================
FILE: README.md
================================================
# Hate Speech Detection on Twitter
Implementation of our paper titled - "Deep Learning for Hate Speech Detection" (to appear in WWW'17 proceedings).
## Dataset
Dataset can be downloaded from [https://github.com/zeerakw/hatespeech](https://github.com/zeerakw/hatespeech). Contains tweet id's and corresponding annotations.
Tweets are labelled as either Racist, Sexist, or Neither (neither racist nor sexist).
Use your favourite tweet crawler and download the data and place the tweets in the folder 'tweet_data'.
## Requirements
* Keras
* Tensorflow or Theano (we experimented with theano)
* Gensim
* xgboost
* NLTK
* Sklearn
* Numpy
## Instructions to run
Before running the model, make sure you have setup the input dataset in a folder named `tweet_data`.
To run a model for training, use the following instructions mentioned below. Use appropriate parameter settings to test the variations of the models.
### This script contains code for running NN_model + GBDT.
Steps to run NN_model + GBDT
* Run NN_model first (CNN/LSTM/Fast_text). It will create a model file
* Change the name of the file at line 50 pointing to the model file
* Run nn_classifier file as per instructions below
python nn_classifier.py <GradientBoosting(xgboost) or Random Forest>
- BagOfWords models - **BoWV.py [does not support XGBoost; supports sklearn's GBDT]**
```
usage: BoWV.py [-h] -m [Deprecated]
{logistic,gradient_boosting,random_forest,svm,svm_linear} -f
EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk} [-s SEED]
[--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]
[--kernel KERNEL] [--class_weight CLASS_WEIGHT]
BagOfWords model for twitter Hate speech detection
optional arguments:
-h, --help show this help message and exit
-m {logistic,gradient_boosting,random_forest,svm,svm_linear}, --model {logistic,gradient_boosting,random_forest,svm,svm_linear}
-f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
-d DIMENSION, --dimension DIMENSION
--tokenizer {glove,nltk}
-s SEED, --seed SEED
--folds FOLDS
--estimators ESTIMATORS
--loss LOSS
--kernel KERNEL
--class_weight CLASS_WEIGHT
```
- TF-IDF based models - **tfidf.py**
```
usage: tfidf.py [-h] -m
{tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}
--max_ngram MAX_NGRAM --tokenizer {glove,nltk} [-s SEED]
[--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]
[--kernel KERNEL] [--class_weight CLASS_WEIGHT]
[--use-inverse-doc-freq]
TF-IDF model for twitter Hate speech detection
optional arguments:
-h, --help show this help message and exit
-m {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}, --model {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}
--max_ngram MAX_NGRAM
--tokenizer {glove,nltk}
-s SEED, --seed SEED
--folds FOLDS
--estimators ESTIMATORS
--loss LOSS
--kernel KERNEL
--class_weight CLASS_WEIGHT
--use-inverse-doc-freq
```
- LSTM(RNN) based methods - **lstm.py**
```
usage: lstm.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
--loss LOSS --optimizer OPTIMIZER --epochs EPOCHS --batch-size
BATCH_SIZE [-s SEED] [--folds FOLDS] [--kernel KERNEL]
[--class_weight CLASS_WEIGHT] --initialize-weights
{random,glove} [--learn-embeddings] [--scale-loss-function]
LSTM based models for twitter Hate speech detection
optional arguments:
-h, --help show this help message and exit
-f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
-d DIMENSION, --dimension DIMENSION
--tokenizer {glove,nltk}
--loss LOSS
--optimizer OPTIMIZER
--epochs EPOCHS
--batch-size BATCH_SIZE
-s SEED, --seed SEED
--folds FOLDS
--kernel KERNEL
--class_weight CLASS_WEIGHT
--initialize-weights {random,glove}
--learn-embeddings
--scale-loss-function
```
- CNN based models - **cnn.py**
```
usage: cnn.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
--loss LOSS --optimizer OPTIMIZER --epochs EPOCHS --batch-size
BATCH_SIZE [-s SEED] [--folds FOLDS]
[--class_weight CLASS_WEIGHT] --initialize-weights
{random,glove} [--learn-embeddings] [--scale-loss-function]
CNN based models for twitter Hate speech detection
optional arguments:
-h, --help show this help message and exit
-f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
-d DIMENSION, --dimension DIMENSION
--tokenizer {glove,nltk}
--loss LOSS
--optimizer OPTIMIZER
--epochs EPOCHS
--batch-size BATCH_SIZE
-s SEED, --seed SEED
--folds FOLDS
--class_weight CLASS_WEIGHT
--initialize-weights {random,glove}
--learn-embeddings
--scale-loss-function
```
## Examples:
```
python BoWV.py --model logistic --seed 42 -f glove.twitter.27b.25d.txt -d 25 --seed 42 --folds 10 --tokenizer glove
python tfidf.py -m tfidf_svm_linear --max_ngram 3 --tokenizer glove --loss squared_hinge
python lstm.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer glove --loss categorical_crossentropy --optimizer adam --initialize-weights random --learn-embeddings --epochs 10 --batch-size 512
python cnn.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer nltk --loss categorical_crossentropy --optimizer adam --epochs 10 --batch-size 128 --initialize-weights random --scale-loss-function
```
================================================
FILE: batch_gen.py
================================================
import numpy as np
import random
import pdb
import math
def batch_gen(X, batch_size):
    """Yield consecutive row-batches of the 2-D array X.

    Every batch has exactly `batch_size` rows except possibly the last one,
    which holds the remainder. Fixes the original behaviour where an X whose
    row count was evenly divisible by `batch_size` had its final batch
    replaced by an empty slice (those rows were silently dropped).
    """
    n_batches = int(math.ceil(X.shape[0] / float(batch_size)))
    for i in range(n_batches):
        # NumPy clips the upper slice bound, so the final slice is simply the
        # remaining rows — no special-casing of the last batch needed.
        yield X[i * batch_size:(i + 1) * batch_size, :]
if __name__ == "__main__":
    # Smoke test: 123 rows with batch size 21 should produce five 21-row
    # batches followed by one 18-row remainder batch.
    X = np.random.rand(123, 32)
    for batch in batch_gen(X, 21):
        print batch.shape
================================================
FILE: cnn.py
================================================
from data_handler import get_data
import argparse
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pdb
from nltk import tokenize
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
from string import punctuation
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
import sys
from nltk import tokenize as tokenize_nltk
from my_tokenizer import glove_tokenize
### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# vocab generation
vocab, reverse_vocab = {}, {}  # word -> index / index -> word, filled by gen_vocab()
freq = defaultdict(int)        # word frequencies, filled by gen_vocab()
tweets = {}                    # replaced by the selected tweet list in __main__

# The following settings are filled in from command-line arguments in __main__.
EMBEDDING_DIM = None
GLOVE_MODEL_FILE = None
NO_OF_CLASSES=3
SEED = 42
NO_OF_FOLDS = 10
CLASS_WEIGHT = None
LOSS_FUN = None
OPTIMIZER = None
TOKENIZER = None
INITIALIZE_WEIGHTS_WITH = None  # 'glove' or 'random'
LEARN_EMBEDDINGS = None         # whether the embedding layer is trainable
EPOCHS = 10
BATCH_SIZE = 128
SCALE_LOSS_FUN = None
word2vec_model = None  # gensim embedding model, loaded in __main__
def get_embedding(word):
#return
try:
return word2vec_model[word]
except Exception, e:
print 'Encoding not found: %s' %(word)
return np.zeros(EMBEDDING_DIM)
def get_embedding_weights():
embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
n = 0
for k, v in vocab.iteritems():
try:
embedding[v] = word2vec_model[k]
except:
n += 1
pass
print "%d embedding missed"%n
#pdb.set_trace()
return embedding
def select_tweets():
# selects the tweets as in mean_glove_embedding method
# Processing
tweets = get_data()
X, Y = [], []
tweet_return = []
for tweet in tweets:
_emb = 0
words = TOKENIZER(tweet['text'].lower())
for w in words:
if w in word2vec_model: # Check if embeeding there in GLove model
_emb+=1
if _emb: # Not a blank tweet
tweet_return.append(tweet)
print 'Tweets selected:', len(tweet_return)
return tweet_return
def gen_vocab():
    """Populate the module-level vocab / reverse_vocab / freq from `tweets`.

    Word indices start at 1; index 0 is left free for sequence padding.
    """
    vocab_index = 1
    for tweet in tweets:
        text = TOKENIZER(tweet['text'].lower())
        # NOTE(review): if TOKENIZER returns a list of tokens (the nltk path),
        # this join concatenates the tokens with no separator before
        # splitting, collapsing the tweet into a single "word". It behaves as
        # intended only when TOKENIZER returns a string — TODO confirm
        # glove_tokenize's return type.
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
def filter_vocab(k):
    """Shrink the global vocab to the k most frequent words (plus 'UNK').

    Fixes two defects in the original: the ascending frequency sort kept the
    k RAREST words, and the (word, count) tuples themselves — rather than the
    words — became the dictionary keys.
    """
    global freq, vocab
    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word for word, count in freq_sorted[:k]]
    vocab = dict(zip(top_words, range(1, len(top_words) + 1)))
    vocab['UNK'] = len(vocab) + 1
def gen_sequence():
    """Convert each tweet in `tweets` into a list of vocab indices.

    Returns (X, y): X is a list of variable-length index sequences (unknown
    words map to vocab['UNK']), y the numeric class labels.
    """
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        text = TOKENIZER(tweet['text'].lower())
        # NOTE(review): same caveat as gen_vocab — this join is only correct
        # if TOKENIZER returns a string, not a token list. TODO confirm.
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        seq, _emb = [], []  # _emb is unused
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y
def shuffle_weights(model):
    """Randomly permute every weight tensor of `model` in place.

    Acts as a cheap re-initialisation that preserves each layer's original
    weight distribution (same values, shuffled positions).
    """
    permuted = []
    for w in model.get_weights():
        permuted.append(np.random.permutation(w.flat).reshape(w.shape))
    model.set_weights(permuted)
def cnn_model(sequence_length, embedding_dim):
    """Build the CNN classifier: embedding -> parallel 1-D convolutions of
    widths 3/4/5 -> global max-pool -> concat -> relu -> softmax(3 classes).

    Written against the Keras 1.x API (Merge, nb_filter, border_mode).
    """
    model_variation = 'CNN-rand'  # CNN-rand | CNN-non-static | CNN-static
    print('Model variation is %s' % model_variation)

    # Model Hyperparameters
    n_classes = NO_OF_CLASSES
    embedding_dim = EMBEDDING_DIM  # NOTE: overrides the embedding_dim argument
    filter_sizes = (3, 4, 5)
    num_filters = 100
    dropout_prob = (0.25, 0.5)
    hidden_dims = 100  # unused

    # Training parameters
    # Word2Vec parameters, see train_word2vec
    #min_word_count = 1 # Minimum word count
    #context = 10 # Context window size

    # Convolution branch: one Conv1D + global max-pool per filter width.
    graph_in = Input(shape=(sequence_length, embedding_dim))
    convs = []
    for fsz in filter_sizes:
        conv = Convolution1D(nb_filter=num_filters,
                             filter_length=fsz,
                             border_mode='valid',
                             activation='relu')(graph_in)
                             #,subsample_length=1)(graph_in)
        pool = GlobalMaxPooling1D()(conv)
        #flatten = Flatten()(pool)
        convs.append(pool)

    if len(filter_sizes)>1:
        out = Merge(mode='concat')(convs)
    else:
        out = convs[0]
    graph = Model(input=graph_in, output=out)

    # main sequential model
    model = Sequential()
    #if not model_variation=='CNN-rand':
    model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))
    model.add(Dropout(dropout_prob[0]))#, input_shape=(sequence_length, embedding_dim)))
    model.add(graph)
    model.add(Dropout(dropout_prob[1]))
    model.add(Activation('relu'))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))
    model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])
    print model.summary()
    return model
def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
    """K-fold cross-validate the CNN, re-initialising weights each fold, and
    print weighted and micro precision / recall / F1 averaged over the folds.

    X: padded index sequences of shape (n_samples, sentence_len).
    y: integer labels (0/1/2).
    weights: pretrained embedding matrix, used when --initialize-weights glove.
    """
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print cv_object
    p, r, f1 = 0., 0., 0.      # weighted-average accumulators
    p1, r1, f11 = 0., 0., 0.   # micro-average accumulators
    sentence_len = X.shape[1]
    for train_index, test_index in cv_object.split(X):
        # Re-initialise the model so the folds are independent.
        if INITIALIZE_WEIGHTS_WITH == "glove":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print "ERROR!"
            return
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        # Append the labels as a final column so batching keeps rows and
        # labels aligned through batch_gen.
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in xrange(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                class_weights = None
                if SCALE_LOSS_FUN:
                    # NOTE(review): these are class *frequencies*, so the more
                    # common classes receive the LARGER weight — verify this
                    # scaling is intended (class weighting is usually inverted).
                    class_weights = {}
                    class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
                    class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
                    class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))

                try:
                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
                except Exception as e:
                    # debug output when one-hot conversion fails
                    print e
                    print y_temp
                    print x.shape, y.shape
                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)
                print loss, acc
        # Evaluate the held-out fold.
        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print classification_report(y_test, y_pred)
        print precision_recall_fscore_support(y_test, y_pred)
        print y_pred
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')

    print "macro results are"
    print "average precision is %f" %(p/NO_OF_FOLDS)
    print "average recall is %f" %(r/NO_OF_FOLDS)
    print "average f1 is %f" %(f1/NO_OF_FOLDS)

    print "micro results are"
    print "average precision is %f" %(p1/NO_OF_FOLDS)
    print "average recall is %f" %(r1/NO_OF_FOLDS)
    print "average f1 is %f" %(f11/NO_OF_FOLDS)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='CNN based models for twitter Hate speech detection')
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('--loss', default=LOSS_FUN, required=True)
    parser.add_argument('--optimizer', default=OPTIMIZER, required=True)
    parser.add_argument('--epochs', default=EPOCHS, required=True)
    parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--class_weight')
    parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)
    parser.add_argument('--learn-embeddings', action='store_true', default=False)
    parser.add_argument('--scale-loss-function', action='store_true', default=False)

    args = parser.parse_args()
    # Copy the parsed CLI arguments into the module-level configuration globals.
    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    LOSS_FUN = args.loss
    OPTIMIZER = args.optimizer
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize
    INITIALIZE_WEIGHTS_WITH = args.initialize_weights
    LEARN_EMBEDDINGS = args.learn_embeddings
    EPOCHS = int(args.epochs)
    BATCH_SIZE = int(args.batch_size)
    SCALE_LOSS_FUN = args.scale_loss_function

    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
    print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))

    # NOTE(review): load_word2vec_format moved to gensim.models.KeyedVectors in
    # gensim >= 1.0 — this call only works on older gensim versions.
    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

    np.random.seed(SEED)
    Tweets = select_tweets()
    tweets = Tweets  # gen_vocab/gen_sequence read the module-level `tweets`
    gen_vocab()
    #filter_vocab(20000)
    X, y = gen_sequence()
    #Y = y.reshape((len(y), 1))
    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
    print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)

    # Zero-pad all sequences to the same length (index 0 is the pad value).
    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    data, y = sklearn.utils.shuffle(data, y)
    W = get_embedding_weights()

    model = cnn_model(data.shape[1], EMBEDDING_DIM)
    train_CNN(data, y, EMBEDDING_DIM, model, W)

    pdb.set_trace()
================================================
FILE: data_handler.py
================================================
import json
import pdb
import codecs
import pdb
def get_data():
    """Load the annotated tweets from ./tweet_data/{racism,neither,sexism}.json.

    Each line of every file is one JSON-encoded tweet. Returns a list of
    dicts with keys: 'id', 'text' (lower-cased), 'label' (the annotation) and
    'name' (the first whitespace-separated token of the author's name).
    """
    tweets = []
    for filename in ['racism.json', 'neither.json', 'sexism.json']:
        with codecs.open('./tweet_data/' + filename, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                parsed = json.loads(line)
                tweets.append({
                    'id': parsed['id'],
                    'text': parsed['text'].lower(),
                    'label': parsed['Annotation'],
                    'name': parsed['user']['name'].split()[0]
                })
    #pdb.set_trace()
    return tweets
if __name__=="__main__":
    # Quick gender breakdown of tweet authors based on first-name word lists.
    tweets = get_data()
    males, females = {}, {}
    with open('./tweet_data/males.txt') as f:
        males = set([w.strip() for w in f.readlines()])
    with open('./tweet_data/females.txt') as f:
        females = set([w.strip() for w in f.readlines()])
    males_c, females_c, not_found = 0, 0, 0
    for t in tweets:
        if t['name'] in males:
            males_c += 1
        elif t['name'] in females:
            females_c += 1
        else:
            not_found += 1
    print males_c, females_c, not_found
    pdb.set_trace()
================================================
FILE: fast_text.py
================================================
from data_handler import get_data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
import numpy as np
from preprocess_twitter import tokenize as tokenizer_g
import pdb
from nltk import tokenize
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from string import punctuation
from get_similar_words import get_similar_words
import sys
### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
label_map = {
    'none': 0,
    'racism': 1,
    'sexism': 2
}
# Load the full annotated corpus once, at import time.
tweet_data = get_data()
for tweet in tweet_data:
    texts.append(tweet['text'])
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

# Embedding dimensionality is the single positional CLI argument.
EMBEDDING_DIM = int(sys.argv[1])
np.random.seed(42)

# Load the orginal glove file
# SHASHANK files
#GLOVE_MODEL_FILE="/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v"
# PINKESH files
# NOTE(review): hard-coded user-specific path — adjust for your machine.
GLOVE_MODEL_FILE="/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B." + str(EMBEDDING_DIM) + "d.txt"

NO_OF_CLASSES=3
MAX_NB_WORDS = None     # unused here
VALIDATION_SPLIT = 0.2  # unused here; cross-validation is used instead
# NOTE(review): load_word2vec_format moved to gensim.models.KeyedVectors in
# gensim >= 1.0 — this call only works on older gensim versions.
word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

# vocab generation
MyTokenizer = tokenize.casual.TweetTokenizer(strip_handles=True, reduce_len=True)
vocab, reverse_vocab = {}, {}  # word <-> index maps, filled by gen_vocab()
freq = defaultdict(int)        # word frequencies, filled by gen_vocab()
tweets = {}                    # replaced with the selected tweet list in __main__
def get_embedding(word):
#return
try:
return word2vec_model[word]
except Exception, e:
print 'Encoding not found: %s' %(word)
return np.zeros(EMBEDDING_DIM)
def get_embedding_weights():
embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
n = 0
for k, v in vocab.iteritems():
try:
embedding[v] = word2vec_model[k]
except:
n += 1
pass
print "%d embedding missed"%n
#pdb.set_trace()
return embedding
def select_tweets():
# selects the tweets as in mean_glove_embedding method
# Processing
tweets = get_data()
X, Y = [], []
tweet_return = []
for tweet in tweets:
_emb = 0
words = Tokenize(tweet['text']).split()
for w in words:
if w in word2vec_model: # Check if embeeding there in GLove model
_emb+=1
if _emb: # Not a blank tweet
tweet_return.append(tweet)
print 'Tweets selected:', len(tweet_return)
#pdb.set_trace()
return tweet_return
def gen_vocab():
    """Populate the module-level vocab / reverse_vocab / freq from `tweets`.

    Word indices start at 1; index 0 is left free for sequence padding.
    """
    vocab_index = 1
    for tweet in tweets:
        # assumes Tokenize() returns a normalized string (the glove
        # preprocessor), so the join below strips punctuation characters
        # before splitting on whitespace — TODO confirm.
        text = Tokenize(tweet['text'])
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
    #pdb.set_trace()
def filter_vocab(k):
    """Shrink the global vocab to the k most frequent words (plus 'UNK').

    Fixes two defects in the original: the ascending frequency sort kept the
    k RAREST words, and the (word, count) tuples themselves — rather than the
    words — became the dictionary keys.
    """
    global freq, vocab
    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word for word, count in freq_sorted[:k]]
    vocab = dict(zip(top_words, range(1, len(top_words) + 1)))
    vocab['UNK'] = len(vocab) + 1
def gen_sequence():
    """Convert each tweet in `tweets` into a list of vocab indices.

    Returns (X, y): X is a list of variable-length index sequences (unknown
    words map to vocab['UNK']), y the numeric class labels.
    """
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        # assumes Tokenize() returns a string; see gen_vocab — TODO confirm.
        text = Tokenize(tweet['text'])
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        seq, _emb = [], []  # _emb is unused
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y
def Tokenize(tweet):
    """Normalize a raw tweet with the GloVe twitter preprocessor.

    Delegates to preprocess_twitter.tokenize (imported as tokenizer_g); the
    alternative NLTK tokenizer is kept commented out below.
    """
    #return MyTokenizer.tokenize(tweet)
    #pdb.set_trace()
    return tokenizer_g(tweet)
def shuffle_weights(model):
    """Randomly permute every weight tensor of `model` in place.

    Acts as a cheap re-initialisation that preserves each layer's original
    weight distribution (same values, shuffled positions).
    """
    permuted = []
    for w in model.get_weights():
        permuted.append(np.random.permutation(w.flat).reshape(w.shape))
    model.set_weights(permuted)
def fast_text_model(sequence_length):
    """Build the FastText-style classifier:
    embedding -> dropout -> global average pool -> softmax over 3 classes."""
    model = Sequential()
    model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length))
    #model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length, trainable=False))
    model.add(Dropout(0.5))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    print model.summary()
    return model
def train_fasttext(X, y, model, inp_dim, embedding_weights, epochs=10, batch_size=128):
    """10-fold cross-validate the fasttext model and print weighted/micro
    precision, recall and F1.

    Returns the embedding layer's weight table averaged over the 10 folds.
    """
    cv_object = KFold(n_splits=10, shuffle=True, random_state=42)
    print cv_object
    p, r, f1 = 0., 0., 0.      # weighted-average accumulators
    p1, r1, f11 = 0., 0., 0.   # micro-average accumulators
    sentence_len = X.shape[1]
    # Running sum of the embedding table; one addend per fold.
    lookup_table = np.zeros_like(model.layers[0].get_weights()[0])
    for train_index, test_index in cv_object.split(X):
        shuffle_weights(model)  # fresh random re-initialisation each fold
        #pdb.set_trace()
        #model.layers[0].set_weights([embedding_weights])
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        # Append the labels as a final column so batching keeps rows and
        # labels aligned through batch_gen.
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in xrange(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]
                # Class frequencies (computed but intentionally not passed to
                # train_on_batch — see the commented argument below).
                class_weights = {}
                class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
                class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
                class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))
                try:
                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
                except Exception as e:
                    print e
                    #print x.shape, y.shape
                loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights)
                print loss, acc
        #pdb.set_trace()
        lookup_table += model.layers[0].get_weights()[0]
        # Evaluate the held-out fold.
        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print classification_report(y_test, y_pred)
        print precision_recall_fscore_support(y_test, y_pred)
        print y_pred
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')

    print "macro results are"
    print "average precision is %f" %(p/10)
    print "average recall is %f" %(r/10)
    print "average f1 is %f" %(f1/10)

    print "micro results are"
    print "average precision is %f" %(p1/10)
    print "average recall is %f" %(r1/10)
    print "average f1 is %f" %(f11/10)

    return lookup_table/float(10)
def check_semantic_sim(embedding_table, word):
    """Print the vocabulary words whose learned embeddings are most similar
    (cosine) to that of `word`."""
    reverse_vocab = {v:k for k,v in vocab.iteritems()}
    sim_word_idx = get_similar_words(embedding_table, embedding_table[vocab[word]], 25)
    sim_words = map(lambda x:reverse_vocab[x[1]], sim_word_idx)
    print sim_words
def tryWord(embedding_table):
    """Interactive loop: read words from stdin and print their nearest
    neighbours in the learned embedding space.

    'exit' quits; 'pdb' drops into the debugger.
    """
    while True:
        print "enter word"
        word = raw_input()
        if word == "pdb":
            pdb.set_trace()
        elif word == 'exit':
            return
        else:
            check_semantic_sim(embedding_table, word)
if __name__ == "__main__":
    Tweets = select_tweets()
    tweets = Tweets  # gen_vocab/gen_sequence read the module-level `tweets`
    gen_vocab()
    X, y = gen_sequence()
    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
    print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)

    # Zero-pad all sequences to the same length (index 0 is the pad value).
    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    W = get_embedding_weights()
    data, y = sklearn.utils.shuffle(data, y)
    model = fast_text_model(data.shape[1])
    _ = train_fasttext(data, y, model, EMBEDDING_DIM, W)

    # Inspect the learned embedding table interactively.
    table = model.layers[0].get_weights()[0]
    #check_semantic_sim(table)
    tryWord(table)
    pdb.set_trace()
================================================
FILE: get_similar_words.py
================================================
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pdb
def get_similar_words(X, vec, K=1):
    """Return (score, row_index) pairs for the rows of X most similar to `vec`.

    X: (n_samples, n_features); vec: (1, n_features).
    The single best match — normally `vec` itself, with cosine similarity
    1.0 — is skipped, so at most K-1 pairs are returned, in descending
    similarity order.
    """
    scores = cosine_similarity(X, vec).ravel()
    ranked = sorted(((score, idx) for idx, score in enumerate(scores)), reverse=True)
    return ranked[1:K]
================================================
FILE: lstm.py
================================================
from data_handler import get_data
import argparse
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
import codecs
import operator
import gensim, sklearn
from string import punctuation
from collections import defaultdict
from batch_gen import batch_gen
import sys
from nltk import tokenize as tokenize_nltk
from my_tokenizer import glove_tokenize
### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# vocab generation
vocab, reverse_vocab = {}, {}  # word -> id and id -> word maps (filled by gen_vocab)
freq = defaultdict(int)  # word -> corpus frequency (filled by gen_vocab)
tweets = {}  # set in __main__ by select_tweets()

# Run configuration; the None values below are overwritten from the
# command-line arguments parsed in the __main__ block.
EMBEDDING_DIM = None  # dimensionality of the GloVe vectors (-d)
GLOVE_MODEL_FILE = None  # path to the word2vec-format GloVe file (-f)
SEED = 42  # RNG seed for reproducibility
NO_OF_FOLDS = 10  # K in K-fold cross-validation
CLASS_WEIGHT = None
LOSS_FUN = None  # Keras loss name (--loss)
OPTIMIZER = None  # Keras optimizer name (--optimizer)
KERNEL = None
TOKENIZER = None  # callable: raw text -> list of tokens (--tokenizer)
MAX_SEQUENCE_LENGTH = None  # computed after tokenisation in __main__
INITIALIZE_WEIGHTS_WITH = None  # 'glove' or 'random' (--initialize-weights)
LEARN_EMBEDDINGS = None  # whether the Embedding layer is trainable
EPOCHS = 10
BATCH_SIZE = 512
SCALE_LOSS_FUN = None  # if truthy, per-batch class weights are passed to train_on_batch
word2vec_model = None  # gensim model loaded in __main__
def get_embedding(word):
#return
try:
return word2vec_model[word]
except Exception, e:
print 'Encoding not found: %s' %(word)
return np.zeros(EMBEDDING_DIM)
def get_embedding_weights():
embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
n = 0
for k, v in vocab.iteritems():
try:
embedding[v] = word2vec_model[k]
except:
n += 1
pass
print "%d embedding missed"%n
return embedding
def select_tweets():
# selects the tweets as in mean_glove_embedding method
# Processing
tweets = get_data()
X, Y = [], []
tweet_return = []
for tweet in tweets:
_emb = 0
words = TOKENIZER(tweet['text'].lower())
for w in words:
if w in word2vec_model: # Check if embeeding there in GLove model
_emb+=1
if _emb: # Not a blank tweet
tweet_return.append(tweet)
print 'Tweets selected:', len(tweet_return)
#pdb.set_trace()
return tweet_return
def gen_vocab():
    """Populate the global `vocab`, `reverse_vocab` and `freq` tables from `tweets`.

    Ids start at 1 (0 is reserved for padding); a final 'UNK' entry maps
    unseen words to the last id, with `reverse_vocab` kept in sync.
    """
    vocab_index = 1
    for tweet in tweets:
        text = TOKENIZER(tweet['text'].lower())
        # BUG FIX: TOKENIZER returns a *list* of tokens, so joining with ''
        # glued adjacent words together ("hello","world" -> "helloworld") and
        # the following split() yielded one merged token per tweet. Joining
        # with a space lets split() recover the individual words.
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
def filter_vocab(k):
    """Rebuild the global `vocab`, keeping only the k most frequent words (ids 1..k, plus 'UNK').

    Fixes vs. the original:
    - removed a stray pdb.set_trace() debugging breakpoint;
    - sort is now descending so the *most* frequent words are kept (the
      ascending sort kept the k rarest);
    - vocab keys are the words themselves, not (word, count) tuples.
    """
    global freq, vocab
    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    top_words = [word for word, count in freq_sorted[:k]]
    vocab = dict(zip(top_words, range(1, len(top_words) + 1)))
    vocab['UNK'] = len(vocab) + 1
def gen_sequence():
    """Convert each tweet in `tweets` into a list of vocab ids; return (X, y).

    y uses the project-wide mapping none=0, racism=1, sexism=2; words missing
    from `vocab` fall back to the 'UNK' id.
    """
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        text = TOKENIZER(tweet['text'].lower())
        # BUG FIX: TOKENIZER returns a *list* of tokens; ''.join concatenated
        # the words with no separator so split() produced a single merged
        # token. Join with a space to preserve word boundaries.
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        seq = []
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y
def shuffle_weights(model):
    """Randomly permute every weight tensor of `model` in place (shapes preserved).

    Used to re-randomise a compiled Keras model between cross-validation folds.
    """
    permuted = [np.random.permutation(w.flat).reshape(w.shape)
                for w in model.get_weights()]
    model.set_weights(permuted)
def lstm_model(sequence_length, embedding_dim):
model_variation = 'LSTM'
print('Model variation is %s' % model_variation)
model = Sequential()
model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))
model.add(Dropout(0.25))#, input_shape=(sequence_length, embedding_dim)))
model.add(LSTM(50))
model.add(Dropout(0.5))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])
print model.summary()
return model
def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
    """K-fold cross-validated training loop; prints per-fold and averaged metrics.

    X: (n_samples, sentence_len) padded id matrix; y: (n_samples,) int labels.
    `weights` is the GloVe embedding table used when INITIALIZE_WEIGHTS_WITH
    == "glove". `inp_dim` is accepted but not read in this body.
    """
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print cv_object
    # Accumulators: weighted-average (p, r, f1) and micro-average (p1, r1, f11).
    p, r, f1 = 0., 0., 0.
    p1, r1, f11 = 0., 0., 0.
    sentence_len = X.shape[1]
    for train_index, test_index in cv_object.split(X):
        # Re-initialise the model's embedding/weights for each fold so the
        # folds do not leak training from one another.
        if INITIALIZE_WEIGHTS_WITH == "glove":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print "ERROR!"
            return
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        # Append labels as an extra column so batch_gen shuffles/batches the
        # data and its labels together.
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in xrange(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]  # the label column
                class_weights = None
                if SCALE_LOSS_FUN:
                    # NOTE(review): these weights are the class *frequencies*
                    # within the batch, so the majority class receives the
                    # largest weight -- confirm inverse weighting was not the
                    # intent.
                    class_weights = {}
                    class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
                    class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
                    class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))
                try:
                    # nb_classes is the Keras 1.x argument name (later renamed num_classes).
                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
                except Exception as e:
                    print e
                    print y_temp
                # NOTE(review): prints y.shape (full label array), not y_temp.shape.
                print x.shape, y.shape
                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)
                print loss, acc
        # Evaluate this fold on its held-out split and accumulate the metrics.
        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print classification_report(y_test, y_pred)
        print precision_recall_fscore_support(y_test, y_pred)
        print y_pred
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')
    # NOTE(review): labelled "macro" but computed with average='weighted'.
    print "macro results are"
    print "average precision is %f" %(p/NO_OF_FOLDS)
    print "average recall is %f" %(r/NO_OF_FOLDS)
    print "average f1 is %f" %(f1/NO_OF_FOLDS)
    print "micro results are"
    print "average precision is %f" %(p1/NO_OF_FOLDS)
    print "average recall is %f" %(r1/NO_OF_FOLDS)
    print "average f1 is %f" %(f11/NO_OF_FOLDS)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LSTM based models for twitter Hate speech detection')
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('--loss', default=LOSS_FUN, required=True)
    parser.add_argument('--optimizer', default=OPTIMIZER, required=True)
    parser.add_argument('--epochs', default=EPOCHS, required=True)
    parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')
    parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)
    parser.add_argument('--learn-embeddings', action='store_true', default=False)
    parser.add_argument('--scale-loss-function', action='store_true', default=False)

    args = parser.parse_args()
    # Copy CLI values into the module-level configuration globals.
    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    LOSS_FUN = args.loss
    OPTIMIZER = args.optimizer
    KERNEL = args.kernel
    # Choose the tokenizer callable used by gen_vocab/gen_sequence.
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize
    INITIALIZE_WEIGHTS_WITH = args.initialize_weights
    LEARN_EMBEDDINGS = args.learn_embeddings
    EPOCHS = int(args.epochs)
    BATCH_SIZE = int(args.batch_size)
    SCALE_LOSS_FUN = args.scale_loss_function

    np.random.seed(SEED)
    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
    print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))

    # NOTE(review): load_word2vec_format on Word2Vec is the old gensim API;
    # newer gensim moved it to KeyedVectors -- confirm the pinned version.
    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

    # Pipeline: filter tweets -> build vocab -> id sequences -> pad -> train.
    tweets = select_tweets()
    gen_vocab()
    #filter_vocab(20000)
    X, y = gen_sequence()
    #Y = y.reshape((len(y), 1))
    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
    print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)

    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    data, y = sklearn.utils.shuffle(data, y)
    W = get_embedding_weights()

    model = lstm_model(data.shape[1], EMBEDDING_DIM)
    #model = lstm_model(data.shape[1], 25, get_embedding_weights())
    train_LSTM(data, y, model, EMBEDDING_DIM, W)
    # NOTE(review): leftover debugging breakpoint at end of run.
    pdb.set_trace()
================================================
FILE: my_tokenizer.py
================================================
from string import punctuation
from preprocess_twitter import tokenize as tokenizer_g
from gensim.parsing.preprocessing import STOPWORDS
def glove_tokenize(text):
    """Normalise a tweet with the GloVe preprocessor, strip punctuation chars,
    drop stopwords, and return the remaining tokens as a list."""
    normalised = tokenizer_g(text)
    depunctuated = ''.join(ch for ch in normalised if ch not in punctuation)
    return [w for w in depunctuated.split() if w not in STOPWORDS]
================================================
FILE: nn_classifier.py
================================================
from data_handler import get_data
import sys
import numpy as np
import pdb, json
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
import xgboost as xgb
### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Project-wide label -> class-id mapping.
label_map = {
    'none': 0,
    'racism': 1,
    'sexism': 2
}
tweet_data = get_data()
for tweet in tweet_data:
    texts.append(tweet['text'].lower())
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

# logistic, gradient_boosting, random_forest, svm, tfidf_svm_linear, tfidf_svm_rbf
model_count = 2
word_embed_size = 200  # dimensionality of the fast_text vectors loaded below
GLOVE_MODEL_FILE = str(sys.argv[1])
EMBEDDING_DIM = int(sys.argv[2])
MODEL_TYPE=sys.argv[3]
print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
print 'GloVe Embedding: %s' %(GLOVE_MODEL_FILE)

# Load the embedding table trained by fast_text.py plus its vocab, then
# rebuild a word -> vector dict from the two.
word2vec_model1 = np.load('fast_text.npy')
word2vec_model1 = word2vec_model1.reshape((word2vec_model1.shape[1], word2vec_model1.shape[2]))
f_vocab = open('vocab_fast_text', 'r')
vocab = json.load(f_vocab)
word2vec_model = {}
for k,v in vocab.iteritems():
    word2vec_model[k] = word2vec_model1[int(v)]
# Free the dense table; the dict above now holds the vectors.
del word2vec_model1

SEED=42
MAX_NB_WORDS = None
VALIDATION_SPLIT = 0.2

# vocab generation
# NOTE: this rebinding discards the fast_text vocab loaded above (the dict
# word2vec_model already captured what it needed).
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}
def select_tweets_whose_embedding_exists():
# selects the tweets as in mean_glove_embedding method
# Processing
tweets = get_data()
X, Y = [], []
tweet_return = []
for tweet in tweets:
_emb = 0
words = glove_tokenize(tweet['text'])
for w in words:
if w in word2vec_model: # Check if embeeding there in GLove model
_emb+=1
if _emb: # Not a blank tweet
tweet_return.append(tweet)
print 'Tweets selected:', len(tweet_return)
#pdb.set_trace()
return tweet_return
def gen_data():
    """Return (X, y): one mean embedding vector per tweet and integer labels.

    X[i] is the average of the fast_text vectors of the tweet's tokens
    (out-of-vocabulary tokens contribute nothing); y uses none=0, racism=1,
    sexism=2.
    """
    y_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                # Narrowed from a bare `except`: only an OOV dict lookup is
                # expected to fail here; other errors should surface.
                pass
        # Guard: an empty token list would otherwise raise ZeroDivisionError.
        if words:
            emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return np.array(X), np.array(y)
def get_model(m_type=None):
if not m_type:
print 'ERROR: Please provide a valid method name'
return None
if m_type == 'logistic':
logreg = LogisticRegression()
elif m_type == "gradient_boosting":
#logreg = GradientBoostingClassifier(n_estimators=10)
logreg = xgb.XGBClassifier(nthread=-1)
elif m_type == "random_forest":
logreg = RandomForestClassifier(n_estimators=100, n_jobs=-1)
elif m_type == "svm_rbf":
logreg = SVC(class_weight="balanced", kernel='rbf')
elif m_type == "svm_linear":
logreg = LinearSVC(class_weight="balanced")
else:
print "ERROR: Please specify a correst model"
return None
return logreg
def classification_model(X, Y, model_type="logistic"):
NO_OF_FOLDS=10
X, Y = shuffle(X, Y, random_state=SEED)
print "Model Type:", model_type
#predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)
scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)
pdb.set_trace()
if __name__ == "__main__":
    #filter_vocab(20000)
    # Pipeline: filter tweets -> mean-embedding features -> cross-validated classifier.
    tweets = select_tweets_whose_embedding_exists()
    X, Y = gen_data()
    classification_model(X, Y, MODEL_TYPE)
    # NOTE(review): leftover debugging breakpoint -- drops into pdb after the run.
    pdb.set_trace()
================================================
FILE: plot_graph_TSNE.py
================================================
import gensim
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.manifold import TSNE
import pdb
import codecs
# Probe words whose embeddings are compared before/after fine-tuning.
words = ['mohammed', 'murderer', 'pedophile', 'religion', 'terrorism', 'islamic', 'muslim']

def load_initial_emb(path="/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.200d.txt"):
    """Load the pre-trained GloVe-twitter embeddings (word2vec text format).

    `path` was previously a hard-coded absolute path; it is now a parameter
    with the old value as default, so existing callers are unchanged.
    """
    initial_emb = gensim.models.Word2Vec.load_word2vec_format(path)
    return initial_emb
def load_final_emb(vocab_path="reverse_vocab.json", emb_path="embedding.npy"):
    """Load the fine-tuned embedding matrix and key it by utf-8-encoded word.

    Fixes a crash: the original called `codecs.open3`, which does not exist;
    the correct function is `codecs.open`. The two input paths are now
    parameters, with the old hard-coded filenames as defaults.
    """
    lines = codecs.open(vocab_path, 'r', encoding="utf-8").readlines()
    reverse_vocab = json.loads("".join(lines))
    reverse_vocab['0'] = "<UNK>"  # row 0 of the matrix is the padding/unknown slot
    final_emb = {}
    for i, emb in enumerate(np.load(emb_path)):
        final_emb[reverse_vocab[str(i)].encode("utf-8")] = emb
    return final_emb
def get_transform(initial_emb, final_emb):
vec = []
for w in words:
vec.append(initial_emb[w])
for w in words:
vec.append(final_emb[w])
X = np.array(vec)
print X.shape
model = TSNE(n_components=2, random_state=0)
out = model.fit_transform(X)
print out
print "Will plot now!"
return out
# Initial are original
# Next are final
def plot(out):
    """Scatter the 2-D t-SNE points (first 7 rows = GloVe, the rest = fine-tuned)
    and annotate each point with its probe word."""
    glove_pts = out[:7, :]
    tuned_pts = out[7:, :]
    area = 150
    padding = 0.0001
    xmin, xmax = min(out[:, 0]), max(out[:, 0])
    ymin, ymax = min(out[:, 1]), max(out[:, 1])
    fig, ax = plt.subplots()
    series = [('red', 'GloVe', glove_pts),
              ('green', 'FastText+GloVe+Dyn', tuned_pts)]
    for (color, label, data) in series:
        ax.scatter(data[:, 0], data[:, 1], c=color, s=area, label=label,
                   alpha=0.3, edgecolors='none')
        for (row, word) in zip(data, words):
            ax.annotate(word, xy=(row[0], row[1]), xytext=(row[0], row[1]),)
    plt.axis([xmin - padding, xmax + padding, ymin - padding, ymax + padding])
    plt.legend()
    plt.grid(True)
    plt.show()
if __name__=="__main__":
    # Load both embedding spaces, project them jointly with t-SNE, and plot.
    ini = load_initial_emb()
    fin = load_final_emb()
    out = get_transform(ini, fin)
    plot(out)
================================================
FILE: preprocess_twitter.py
================================================
"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""
import sys
import re
# Patterns operate across lines, and '.' matches newlines too.
FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    """re.sub callback: expand '#Foo' into GloVe-style '<hashtag>' tokens.

    An all-caps body becomes '<hashtag> BODY <allcaps>'; otherwise the body
    is split at CamelCase boundaries. Fix: the split pattern was a
    Python-2-only `ur""` literal; a plain raw string matches identically and
    is also valid on Python 3.
    """
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = u"<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result
def allcaps(text):
    """re.sub callback: lowercase an ALL-CAPS match and append the '<allcaps>' marker."""
    matched = text.group()
    return "{} <allcaps>".format(matched.lower())
def tokenize(text):
    """Apply the GloVe-twitter preprocessing substitutions and lowercase the result.

    The substitutions are order-sensitive (URLs before user mentions, numbers
    before hashtags, etc.), so they are applied as an ordered pipeline.
    """
    # Different regex parts for smiley faces.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"
    substitutions = [
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"/", " / "),
        (r"@\w+", "<user>"),
        (r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>"),
        (r"{}{}p+".format(eyes, nose), "<lolface>"),
        (r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>"),
        (r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>"),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
        # (r"([^a-z0-9()<>'`\-]){2,}", allcaps),
        (r"([A-Z]){2,}", allcaps),
    ]
    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text, flags=FLAGS)
    return text.lower()
if __name__ == '__main__':
    # CLI: python preprocess_twitter.py "<tweet text>"; the literal argument
    # "test" runs a built-in demonstration string instead.
    _, text = sys.argv
    if text == "test":
        text = u"I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    print tokens
================================================
FILE: tfidf.py
================================================
from data_handler import get_data
import argparse
import sys
import numpy as np
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
from nltk.tokenize import TweetTokenizer
### Preparing the text data
texts = []  # list of text samples (filled by gen_data)
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids (filled by gen_data)

# vocab generation
# Unused by the TF-IDF pipeline; kept for symmetry with the other scripts.
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}

# tfidf_logistic, tfidf_gradient_boosting, tfidf_random_forest, tfidf_svm_linear, tfidf_svm_rbf
MODEL_TYPE=None  # chosen classifier (-m)
MAX_NGRAM_LENGTH=None  # upper bound of the TF-IDF ngram_range (--max_ngram)
NO_OF_FOLDS=10  # K for cross-validation (--folds)
CLASS_WEIGHT = None
N_ESTIMATORS = None  # tree count for the ensemble models (--estimators)
LOSS_FUN = None
KERNEL = None
# NOTE(review): duplicate assignment -- MAX_NGRAM_LENGTH was already set above.
MAX_NGRAM_LENGTH = None
SEED=42  # RNG seed for the pre-CV shuffle
TOKENIZER=None  # callable handed to TfidfVectorizer (--tokenizer)
def gen_data():
    """Fill the module-level `texts` and `labels` lists from the raw tweet data."""
    label_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
    for tweet in get_data():
        texts.append(tweet['text'].lower())
        labels.append(label_map[tweet['label']])
    print('Found %s texts. (samples)' % len(texts))
def get_model(m_type=None):
if not m_type:
print 'Please specify a model type'
return None
if m_type == "tfidf_svm":
logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)
elif m_type == "tfidf_svm_linear":
logreg = LinearSVC(C=0.01, loss=LOSS_FUN, class_weight=CLASS_WEIGHT)
elif m_type == 'tfidf_logistic':
logreg = LogisticRegression()
elif m_type == "tfidf_gradient_boosting":
logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)
elif m_type == "tfidf_random_forest":
logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)
print "ERROR: Please specify a correct model"
return None
return logreg
def classification_model(X, Y, model_type=None):
X, Y = shuffle(X, Y, random_state=SEED)
print "Model Type:", model_type
#predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)
scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='TF-IDF model for twitter Hate speech detection')
    parser.add_argument('-m', '--model', choices=['tfidf_svm', 'tfidf_svm_linear', 'tfidf_logistic', 'tfidf_gradient_boosting', 'tfidf_random_forest'], required=True)
    parser.add_argument('--max_ngram', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--estimators', default=N_ESTIMATORS)
    parser.add_argument('--loss', default=LOSS_FUN)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')
    parser.add_argument('--use-inverse-doc-freq', action='store_true')

    args = parser.parse_args()
    # Copy CLI values into the module-level configuration globals.
    MODEL_TYPE = args.model
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    # --estimators may be absent (None); only convert when provided.
    N_ESTIMATORS = int(args.estimators) if args.estimators else args.estimators
    LOSS_FUN = args.loss
    KERNEL = args.kernel
    MAX_NGRAM_LENGTH = int(args.max_ngram)
    USE_IDF = args.use_inverse_doc_freq

    # Select the tokenizer callable handed to TfidfVectorizer.
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = TweetTokenizer().tokenize

    print 'Max-ngram-length: %d' %(MAX_NGRAM_LENGTH)

    #filter_vocab(20000)
    # For TFIDF-SVC or any other varient
    # We do not need to run the above code for TFIDF
    # It does not use the filtered data using gen_data()
    gen_data()

    # Vectorise the raw texts with word n-grams (1..MAX_NGRAM_LENGTH).
    tfidf_transformer = TfidfVectorizer(use_idf=USE_IDF, analyzer="word", tokenizer=TOKENIZER, ngram_range=(1, MAX_NGRAM_LENGTH))
    #tfidf_transformer = TfidfVectorizer(use_idf=True, ngram_range=(1, MAX_NGRAM_LENGTH))
    X_train_tfidf = tfidf_transformer.fit_transform(texts)
    X = X_train_tfidf
    Y = labels
    classification_model(X, Y, MODEL_TYPE)
gitextract_19n6zuu9/ ├── .gitignore ├── BoWV.py ├── README.md ├── batch_gen.py ├── cnn.py ├── data_handler.py ├── fast_text.py ├── get_similar_words.py ├── lstm.py ├── my_tokenizer.py ├── nn_classifier.py ├── plot_graph_TSNE.py ├── preprocess_twitter.py └── tfidf.py
SYMBOL INDEX (52 symbols across 12 files) FILE: BoWV.py function select_tweets_whose_embedding_exists (line 55) | def select_tweets_whose_embedding_exists(): function gen_data (line 73) | def gen_data(): function get_model (line 95) | def get_model(m_type=None): function classification_model (line 116) | def classification_model(X, Y, model_type=None): FILE: batch_gen.py function batch_gen (line 6) | def batch_gen(X, batch_size): FILE: cnn.py function get_embedding (line 60) | def get_embedding(word): function get_embedding_weights (line 68) | def get_embedding_weights(): function select_tweets (line 82) | def select_tweets(): function gen_vocab (line 100) | def gen_vocab(): function filter_vocab (line 119) | def filter_vocab(k): function gen_sequence (line 127) | def gen_sequence(): function shuffle_weights (line 148) | def shuffle_weights(model): function cnn_model (line 154) | def cnn_model(sequence_length, embedding_dim): function train_CNN (line 205) | def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=B... FILE: data_handler.py function get_data (line 6) | def get_data(): FILE: fast_text.py function get_embedding (line 63) | def get_embedding(word): function get_embedding_weights (line 71) | def get_embedding_weights(): function select_tweets (line 85) | def select_tweets(): function gen_vocab (line 104) | def gen_vocab(): function filter_vocab (line 124) | def filter_vocab(k): function gen_sequence (line 133) | def gen_sequence(): function Tokenize (line 154) | def Tokenize(tweet): function shuffle_weights (line 160) | def shuffle_weights(model): function fast_text_model (line 166) | def fast_text_model(sequence_length): function train_fasttext (line 177) | def train_fasttext(X, y, model, inp_dim,embedding_weights, epochs=10, ba... 
function check_semantic_sim (line 233) | def check_semantic_sim(embedding_table, word): function tryWord (line 239) | def tryWord(embedding_table): FILE: get_similar_words.py function get_similar_words (line 5) | def get_similar_words(X, vec, K=1): FILE: lstm.py function get_embedding (line 59) | def get_embedding(word): function get_embedding_weights (line 67) | def get_embedding_weights(): function select_tweets (line 80) | def select_tweets(): function gen_vocab (line 99) | def gen_vocab(): function filter_vocab (line 118) | def filter_vocab(k): function gen_sequence (line 127) | def gen_sequence(): function shuffle_weights (line 148) | def shuffle_weights(model): function lstm_model (line 153) | def lstm_model(sequence_length, embedding_dim): function train_LSTM (line 168) | def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=... FILE: my_tokenizer.py function glove_tokenize (line 6) | def glove_tokenize(text): FILE: nn_classifier.py function select_tweets_whose_embedding_exists (line 71) | def select_tweets_whose_embedding_exists(): function gen_data (line 90) | def gen_data(): function get_model (line 114) | def get_model(m_type=None): function classification_model (line 137) | def classification_model(X, Y, model_type="logistic"): FILE: plot_graph_TSNE.py function load_initial_emb (line 12) | def load_initial_emb(): function load_final_emb (line 16) | def load_final_emb(): function get_transform (line 26) | def get_transform(initial_emb, final_emb): function plot (line 47) | def plot(out): FILE: preprocess_twitter.py function hashtag (line 19) | def hashtag(text): function allcaps (line 28) | def allcaps(text): function tokenize (line 33) | def tokenize(text): FILE: tfidf.py function gen_data (line 50) | def gen_data(): function get_model (line 64) | def get_model(m_type=None): function classification_model (line 85) | def classification_model(X, Y, model_type=None):
Condensed preview — 14 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (62K chars).
[
{
"path": ".gitignore",
"chars": 6,
"preview": "*.pyc\n"
},
{
"path": "BoWV.py",
"chars": 5776,
"preview": "from data_handler import get_data\nimport argparse\nimport sys\nimport numpy as np\nimport pdb\nfrom sklearn.metrics import m"
},
{
"path": "README.md",
"chars": 5583,
"preview": "# Hate Speech Detection on Twitter\n\nImplementation of our paper titled - \"Deep Learning for Hate Speech Detection\" (to a"
},
{
"path": "batch_gen.py",
"chars": 635,
"preview": "import numpy as np\nimport random\nimport pdb\nimport math\n\ndef batch_gen(X, batch_size):\n n_batches = X.shape[0]/float("
},
{
"path": "cnn.py",
"chars": 11123,
"preview": "from data_handler import get_data\nimport argparse\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.laye"
},
{
"path": "data_handler.py",
"chars": 1206,
"preview": "import json\nimport pdb\nimport codecs\nimport pdb\n\ndef get_data():\n tweets = []\n files = ['racism.json', 'neither.js"
},
{
"path": "fast_text.py",
"chars": 9008,
"preview": "from data_handler import get_data\nfrom keras.preprocessing.text import Tokenizer\nfrom keras.preprocessing.sequence impor"
},
{
"path": "get_similar_words.py",
"chars": 463,
"preview": "from sklearn.metrics.pairwise import cosine_similarity\nimport numpy as np\nimport pdb\n\ndef get_similar_words(X, vec, K=1)"
},
{
"path": "lstm.py",
"chars": 10103,
"preview": "from data_handler import get_data\nimport argparse\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.laye"
},
{
"path": "my_tokenizer.py",
"chars": 361,
"preview": "from string import punctuation\nfrom preprocess_twitter import tokenize as tokenizer_g\nfrom gensim.parsing.preprocessing "
},
{
"path": "nn_classifier.py",
"chars": 5096,
"preview": "from data_handler import get_data\nimport sys\nimport numpy as np\nimport pdb, json\nfrom sklearn.metrics import make_scorer"
},
{
"path": "plot_graph_TSNE.py",
"chars": 2005,
"preview": "import gensim\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport json\nfrom sklearn.manifold import TSNE\nimport pd"
},
{
"path": "preprocess_twitter.py",
"chars": 2243,
"preview": "\"\"\"\npreprocess-twitter.py\n\npython preprocess-twitter.py \"Some random text with #hashtags, @mentions and http://t.co/kdjf"
},
{
"path": "tfidf.py",
"chars": 5398,
"preview": "from data_handler import get_data\nimport argparse\nimport sys\nimport numpy as np\nfrom sklearn.metrics import make_scorer,"
}
]
About this extraction
This page contains the full source code of the pinkeshbadjatiya/twitter-hatespeech GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 14 files (57.6 KB), approximately 15.8k tokens, and a symbol index with 52 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.