Repository: pinkeshbadjatiya/twitter-hatespeech
Branch: master
Commit: 3b834311953b
Files: 14
Total size: 57.6 KB

Directory structure:
gitextract_19n6zuu9/

├── .gitignore
├── BoWV.py
├── README.md
├── batch_gen.py
├── cnn.py
├── data_handler.py
├── fast_text.py
├── get_similar_words.py
├── lstm.py
├── my_tokenizer.py
├── nn_classifier.py
├── plot_graph_TSNE.py
├── preprocess_twitter.py
└── tfidf.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pyc


================================================
FILE: BoWV.py
================================================
from data_handler import get_data
import argparse
import sys
import numpy as np
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
from nltk.tokenize import TweetTokenizer


### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids


# logistic, gradient_boosting, random_forest, svm_linear, svm_rbf
GLOVE_MODEL_FILE = None
EMBEDDING_DIM = None
MODEL_TYPE = None
CLASS_WEIGHT = None
N_ESTIMATORS = None
LOSS_FUN = None
KERNEL = None
TOKENIZER = None

SEED=42
MAX_NB_WORDS = None
NO_OF_FOLDS=10


# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}

word2vec_model = None


def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    return tweet_return


def gen_data():
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }

    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'].lower())
        emb = np.zeros(EMBEDDING_DIM)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass   # out-of-vocabulary word: contributes nothing
        emb /= max(len(words), 1)   # mean GloVe embedding; guard against empty token lists
        X.append(emb)
        y.append(y_map[tweet['label']])
    return X, y

    
def get_model(m_type=None):
    if not m_type:
        print "ERROR: Please specify a model type!"
        return None
    if m_type == 'logistic':
        logreg = LogisticRegression()
    elif m_type == "gradient_boosting":
        logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)
    elif m_type == "random_forest":
        logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)
    elif m_type == "svm":
        logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)
    elif m_type == "svm_linear":
        logreg = LinearSVC(loss=LOSS_FUN, class_weight=CLASS_WEIGHT)
    else:
        print "ERROR: Please specify a correct model"
        return None

    return logreg


def classification_model(X, Y, model_type=None):
    X, Y = shuffle(X, Y, random_state=SEED)
    print "Model Type:", model_type

    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)

    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
    
    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='BagOfWords model for twitter Hate speech detection')
    parser.add_argument('-m', '--model', choices=['logistic', 'gradient_boosting', 'random_forest', 'svm', 'svm_linear'], required=True)
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--estimators', default=N_ESTIMATORS)
    parser.add_argument('--loss', default=LOSS_FUN)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')


    args = parser.parse_args()
    MODEL_TYPE = args.model
    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    N_ESTIMATORS = int(args.estimators)
    LOSS_FUN = args.loss
    KERNEL = args.kernel
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = TweetTokenizer().tokenize

    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

    #filter_vocab(20000)

    tweets = select_tweets_whose_embedding_exists()
    X, Y = gen_data()

    classification_model(X, Y, MODEL_TYPE)


================================================
FILE: README.md
================================================
# Hate Speech Detection on Twitter

Implementation of our paper titled "Deep Learning for Hate Speech Detection" (to appear in the WWW'17 proceedings).

## Dataset

The dataset can be downloaded from [https://github.com/zeerakw/hatespeech](https://github.com/zeerakw/hatespeech). It contains tweet IDs and the corresponding annotations.

Tweets are labelled as Racist, Sexist, or Neither Racist nor Sexist.

Use your favourite tweet crawler to download the data, and place the tweets in the folder `tweet_data`; the expected file layout is sketched below.
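
`data_handler.py` expects one file per class inside `tweet_data/` (`racism.json`, `sexism.json` and `neither.json`), with one JSON object per line carrying at least the fields `id`, `text`, `Annotation` (one of `racism`, `sexism`, `none`) and `user.name`. A minimal sketch of a record in that shape (the values are placeholders; fill them in with whatever crawler you prefer):

```
import json, codecs

# Hypothetical record in the shape data_handler.get_data() reads:
record = {
    'id': 123456789,
    'text': 'example tweet text',
    'Annotation': 'racism',        # 'racism' / 'sexism' / 'none'
    'user': {'name': 'SomeUser'},
}
with codecs.open('./tweet_data/racism.json', 'a', encoding='utf-8') as f:
    f.write(json.dumps(record) + '\n')
```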


## Requirements
* Keras
* TensorFlow or Theano (we experimented with Theano)
* Gensim
* xgboost
* NLTK
* scikit-learn
* NumPy

## Instructions to run

Before running a model, make sure you have set up the input dataset in a folder named `tweet_data`.
To train a model, use the instructions below, with appropriate parameter settings to test the variations of each model.


### Running NN_model + GBDT (nn_classifier.py)

Steps to run NN_model + GBDT:
 * Run an NN model first (CNN/LSTM/Fast_text). It will create a model file.
 * Change the name of the file at line 50 to point to the model file.
 * Run `nn_classifier.py` as below: it takes the embedding file, the embedding dimension, and the model type (e.g. `gradient_boosting` for XGBoost, or `random_forest`). A concrete invocation is sketched after the command.

```
python nn_classifier.py EMBEDDINGFILE DIMENSION MODEL_TYPE
```
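
For example (the embedding path and dimension are illustrative; as written, the script reads its embeddings from `fast_text.npy` and a `vocab_fast_text` JSON file in the working directory):

```
python nn_classifier.py GENSIM.glove.twitter.27B.200d.txt 200 gradient_boosting
```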


- BagOfWords models - **BoWV.py** (does not support XGBoost; supports sklearn's GBDT)
```
usage: BoWV.py [-h] -m {logistic,gradient_boosting,random_forest,svm,svm_linear}
               -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk} [-s SEED]
               [--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]
               [--kernel KERNEL] [--class_weight CLASS_WEIGHT]

BagOfWords model for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -m {logistic,gradient_boosting,random_forest,svm,svm_linear}, --model {logistic,gradient_boosting,random_forest,svm,svm_linear}
  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
  -d DIMENSION, --dimension DIMENSION
  --tokenizer {glove,nltk}
  -s SEED, --seed SEED
  --folds FOLDS
  --estimators ESTIMATORS
  --loss LOSS
  --kernel KERNEL
  --class_weight CLASS_WEIGHT
```

- TF-IDF based models - **tfidf.py**
```
usage: tfidf.py [-h] -m
                {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}
                --max_ngram MAX_NGRAM --tokenizer {glove,nltk} [-s SEED]
                [--folds FOLDS] [--estimators ESTIMATORS] [--loss LOSS]
                [--kernel KERNEL] [--class_weight CLASS_WEIGHT]
                [--use-inverse-doc-freq]

TF-IDF model for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -m {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}, --model {tfidf_svm,tfidf_svm_linear,tfidf_logistic,tfidf_gradient_boosting,tfidf_random_forest}
  --max_ngram MAX_NGRAM
  --tokenizer {glove,nltk}
  -s SEED, --seed SEED
  --folds FOLDS
  --estimators ESTIMATORS
  --loss LOSS
  --kernel KERNEL
  --class_weight CLASS_WEIGHT
  --use-inverse-doc-freq
```

- LSTM (RNN) based models - **lstm.py**
```
usage: lstm.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
               --loss LOSS --optimizer OPTIMIZER --epochs EPOCHS --batch-size
               BATCH_SIZE [-s SEED] [--folds FOLDS] [--kernel KERNEL]
               [--class_weight CLASS_WEIGHT] --initialize-weights
               {random,glove} [--learn-embeddings] [--scale-loss-function]

LSTM based models for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
  -d DIMENSION, --dimension DIMENSION
  --tokenizer {glove,nltk}
  --loss LOSS
  --optimizer OPTIMIZER
  --epochs EPOCHS
  --batch-size BATCH_SIZE
  -s SEED, --seed SEED
  --folds FOLDS
  --kernel KERNEL
  --class_weight CLASS_WEIGHT
  --initialize-weights {random,glove}
  --learn-embeddings
  --scale-loss-function
```

- CNN based models - **cnn.py**
```
usage: cnn.py [-h] -f EMBEDDINGFILE -d DIMENSION --tokenizer {glove,nltk}
              --loss LOSS --optimizer OPTIMIZER --epochs EPOCHS --batch-size
              BATCH_SIZE [-s SEED] [--folds FOLDS]
              [--class_weight CLASS_WEIGHT] --initialize-weights
              {random,glove} [--learn-embeddings] [--scale-loss-function]

CNN based models for twitter Hate speech detection

optional arguments:
  -h, --help            show this help message and exit
  -f EMBEDDINGFILE, --embeddingfile EMBEDDINGFILE
  -d DIMENSION, --dimension DIMENSION
  --tokenizer {glove,nltk}
  --loss LOSS
  --optimizer OPTIMIZER
  --epochs EPOCHS
  --batch-size BATCH_SIZE
  -s SEED, --seed SEED
  --folds FOLDS
  --class_weight CLASS_WEIGHT
  --initialize-weights {random,glove}
  --learn-embeddings
  --scale-loss-function
```



## Examples:
```
python BoWV.py --model logistic -f glove.twitter.27B.25d.txt -d 25 --seed 42 --folds 10 --tokenizer glove
python tfidf.py -m tfidf_svm_linear --max_ngram 3 --tokenizer glove --loss squared_hinge
python lstm.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer glove --loss categorical_crossentropy --optimizer adam --initialize-weights random --learn-embeddings --epochs 10 --batch-size 512
python cnn.py -f ~/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.25d.txt -d 25 --tokenizer nltk --loss categorical_crossentropy --optimizer adam --epochs 10 --batch-size 128 --initialize-weights random --scale-loss-function

```
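
`fast_text.py` takes a single positional argument, the embedding dimension; its GloVe paths are hardcoded near the top of the script, so adjust them before running. An illustrative invocation:

```
python fast_text.py 200
```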


================================================
FILE: batch_gen.py
================================================
import numpy as np
import random
import pdb
import math

def batch_gen(X, batch_size):
    # Yield consecutive row-batches of X; the final batch carries the remainder
    # (or a full batch when X.shape[0] is an exact multiple of batch_size).
    n_batches = int(math.ceil(X.shape[0] / float(batch_size)))
    end = (n_batches - 1) * batch_size  # start index of the final batch
    for i in xrange(n_batches):
        if i < n_batches - 1:
            yield X[i * batch_size:(i + 1) * batch_size, :]
        else:
            yield X[end:, :]
        


if __name__ == "__main__":
    X = np.random.rand(123, 32)
    for batch in batch_gen(X, 21):
        print batch.shape


================================================
FILE: cnn.py
================================================
from data_handler import get_data
import argparse
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pdb
from nltk import tokenize
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
from string import punctuation
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
import sys

from nltk import tokenize as tokenize_nltk
from my_tokenizer import glove_tokenize


### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}



EMBEDDING_DIM = None
GLOVE_MODEL_FILE = None
NO_OF_CLASSES=3

SEED = 42
NO_OF_FOLDS = 10
CLASS_WEIGHT = None
LOSS_FUN = None
OPTIMIZER = None
TOKENIZER = None
INITIALIZE_WEIGHTS_WITH = None
LEARN_EMBEDDINGS = None
EPOCHS = 10
BATCH_SIZE = 128
SCALE_LOSS_FUN = None


word2vec_model = None



def get_embedding(word):
    #return
    try:
        return word2vec_model[word]
    except Exception, e:
        print 'Encoding not found: %s' %(word)
        return np.zeros(EMBEDDING_DIM)

def get_embedding_weights():
    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
    n = 0
    for k, v in vocab.iteritems():
        try:
            embedding[v] = word2vec_model[k]
        except:
            n += 1
            pass
    print "%d embedding missed"%n
    #pdb.set_trace()
    return embedding


def select_tweets():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding exists in the GloVe model
                _emb+=1
        if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    return tweet_return


def gen_vocab():
    # Processing
    vocab_index = 1
    for tweet in tweets:
        text = ' '.join(TOKENIZER(tweet['text'].lower()))   # the tokenizers return a list; rejoin before the char-level punctuation filter
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]

        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word       # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'


def filter_vocab(k):
    # Keep only the k most frequent tokens (plus UNK).
    global freq, vocab
    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    tokens = [w for w, c in freq_sorted[:k]]
    vocab = dict(zip(tokens, range(1, len(tokens) + 1)))
    vocab['UNK'] = len(vocab) + 1


def gen_sequence():
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }

    X, y = [], []
    for tweet in tweets:
        text = ' '.join(TOKENIZER(tweet['text'].lower()))   # rejoin the token list before the char-level punctuation filter
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        seq, _emb = [], []
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y


def shuffle_weights(model):
    weights = model.get_weights()
    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]
    model.set_weights(weights)


def cnn_model(sequence_length, embedding_dim):
    model_variation = 'CNN-rand'  #  CNN-rand | CNN-non-static | CNN-static
    print('Model variation is %s' % model_variation)

    # Model Hyperparameters
    n_classes = NO_OF_CLASSES
    embedding_dim = EMBEDDING_DIM
    filter_sizes = (3, 4, 5)
    num_filters = 100
    dropout_prob = (0.25, 0.5)
    hidden_dims = 100

    # Training parameters
    # Word2Vec parameters, see train_word2vec
    #min_word_count = 1  # Minimum word count
    #context = 10        # Context window size

    graph_in = Input(shape=(sequence_length, embedding_dim))
    convs = []
    for fsz in filter_sizes:
        conv = Convolution1D(nb_filter=num_filters,
                             filter_length=fsz,
                             border_mode='valid',
                             activation='relu')(graph_in)
                             #,subsample_length=1)(graph_in)
        pool = GlobalMaxPooling1D()(conv)
        #flatten = Flatten()(pool)
        convs.append(pool)

    if len(filter_sizes)>1:
        out = Merge(mode='concat')(convs)
    else:
        out = convs[0]

    graph = Model(input=graph_in, output=out)

    # main sequential model
    model = Sequential()
    #if not model_variation=='CNN-rand':
    model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))
    model.add(Dropout(dropout_prob[0]))#, input_shape=(sequence_length, embedding_dim)))
    model.add(graph)
    model.add(Dropout(dropout_prob[1]))
    model.add(Activation('relu'))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))
    model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])
    print model.summary()
    return model


def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print cv_object
    p, r, f1 = 0., 0., 0.
    p1, r1, f11 = 0., 0., 0.
    sentence_len = X.shape[1]
    for train_index, test_index in cv_object.split(X):
        if INITIALIZE_WEIGHTS_WITH == "glove":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print "ERROR!"
            return

        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in xrange(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                class_weights = None
                if SCALE_LOSS_FUN:
                    class_weights = {}
                    class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
                    class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
                    class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))

                try:
                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
                except Exception as e:
                    print e
                    print y_temp
                print x.shape, y_temp.shape
                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)
                print loss, acc
        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print classification_report(y_test, y_pred)
        print precision_recall_fscore_support(y_test, y_pred)
        print y_pred
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')

    print "macro results are"
    print "average precision is %f" %(p/NO_OF_FOLDS)
    print "average recall is %f" %(r/NO_OF_FOLDS)
    print "average f1 is %f" %(f1/NO_OF_FOLDS)

    print "micro results are"
    print "average precision is %f" %(p1/NO_OF_FOLDS)
    print "average recall is %f" %(r1/NO_OF_FOLDS)
    print "average f1 is %f" %(f11/NO_OF_FOLDS)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='CNN based models for twitter Hate speech detection')
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('--loss', default=LOSS_FUN, required=True)
    parser.add_argument('--optimizer', default=OPTIMIZER, required=True)
    parser.add_argument('--epochs', default=EPOCHS, required=True)
    parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--class_weight')
    parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)
    parser.add_argument('--learn-embeddings', action='store_true', default=False)
    parser.add_argument('--scale-loss-function', action='store_true', default=False)
    args = parser.parse_args()

    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    LOSS_FUN = args.loss
    OPTIMIZER = args.optimizer
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize
    INITIALIZE_WEIGHTS_WITH = args.initialize_weights
    LEARN_EMBEDDINGS = args.learn_embeddings
    EPOCHS = int(args.epochs)
    BATCH_SIZE = int(args.batch_size)
    SCALE_LOSS_FUN = args.scale_loss_function



    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
    print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))

    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)
    np.random.seed(SEED)


    Tweets = select_tweets()
    tweets = Tweets
    gen_vocab()
    #filter_vocab(20000)
    X, y = gen_sequence()
    #Y = y.reshape((len(y), 1))
    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
    print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)
    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    data, y = sklearn.utils.shuffle(data, y)
    W = get_embedding_weights()
    model = cnn_model(data.shape[1], EMBEDDING_DIM)
    train_CNN(data, y, EMBEDDING_DIM, model, W)

    pdb.set_trace()




================================================
FILE: data_handler.py
================================================
import json
import codecs
import pdb

def get_data():
    tweets = []
    files = ['racism.json', 'neither.json', 'sexism.json']
    for file in files:
        with codecs.open('./tweet_data/' + file, 'r', encoding='utf-8') as f:
            data = f.readlines()
        for line in data:
            tweet_full = json.loads(line)
            tweets.append({
                'id': tweet_full['id'],
                'text': tweet_full['text'].lower(),
                'label': tweet_full['Annotation'],
                'name': tweet_full['user']['name'].split()[0]
                })

    #pdb.set_trace()
    return tweets


if __name__=="__main__":
    tweets = get_data()
    males, females = {}, {}
    with open('./tweet_data/males.txt') as f:
        males = set([w.strip() for w in f.readlines()])
    with open('./tweet_data/females.txt') as f:
        females = set([w.strip() for w in f.readlines()])

    males_c, females_c, not_found = 0, 0, 0
    for t in tweets:
        if t['name'] in males:
            males_c += 1
        elif t['name'] in females:
            females_c += 1
        else:
            not_found += 1
    print males_c, females_c, not_found
    pdb.set_trace()


================================================
FILE: fast_text.py
================================================
from data_handler import get_data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
import numpy as np
from preprocess_twitter import tokenize as tokenizer_g
import pdb
from nltk import tokenize
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from string import punctuation
from get_similar_words import get_similar_words
import sys

### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
label_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
tweet_data = get_data()
for tweet in tweet_data:
    texts.append(tweet['text'])
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))

EMBEDDING_DIM = int(sys.argv[1])
np.random.seed(42)
# Load the orginal glove file
# SHASHANK files
#GLOVE_MODEL_FILE="/home/shashank/DL_NLP/glove-twitter" + str(EMBEDDING_DIM) + "-w2v"


# PINKESH files
GLOVE_MODEL_FILE="/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B." + str(EMBEDDING_DIM) + "d.txt"
NO_OF_CLASSES=3

MAX_NB_WORDS = None
VALIDATION_SPLIT = 0.2
word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)


# vocab generation
MyTokenizer = tokenize.casual.TweetTokenizer(strip_handles=True, reduce_len=True)
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}


def get_embedding(word):
    #return
    try:
        return word2vec_model[word]
    except Exception, e:
        print 'Encoding not found: %s' %(word)
        return np.zeros(EMBEDDING_DIM)

def get_embedding_weights():
    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
    n = 0
    for k, v in vocab.iteritems():
        try:
            embedding[v] = word2vec_model[k]
        except KeyError:
            n += 1   # word has no pretrained embedding; its row stays zero
    print "%d embeddings missed" % n
    return embedding


def select_tweets():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = Tokenize(tweet['text']).split()
        for w in words:
            if w in word2vec_model:  # Check if the embedding exists in the GloVe model
                _emb+=1
        if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    #pdb.set_trace()
    return tweet_return


def gen_vocab():
    # Processing
    vocab_index = 1
    for tweet in tweets:
        text = Tokenize(tweet['text'])
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]

        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word       # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
    #pdb.set_trace()


def filter_vocab(k):
    # Keep only the k most frequent tokens (plus UNK).
    global freq, vocab
    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    tokens = [w for w, c in freq_sorted[:k]]
    vocab = dict(zip(tokens, range(1, len(tokens) + 1)))
    vocab['UNK'] = len(vocab) + 1


def gen_sequence():
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }

    X, y = [], []
    for tweet in tweets:
        text = Tokenize(tweet['text'])
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        seq, _emb = [], []
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y


def Tokenize(tweet):
    #return MyTokenizer.tokenize(tweet)
    #pdb.set_trace()
    return tokenizer_g(tweet)


def shuffle_weights(model):
    weights = model.get_weights()
    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]
    model.set_weights(weights)


def fast_text_model(sequence_length):
    model = Sequential()
    model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length))
    #model.add(Embedding(len(vocab)+1, EMBEDDING_DIM, input_length=sequence_length, trainable=False))
    model.add(Dropout(0.5))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    print model.summary()
    return model

def train_fasttext(X, y, model, inp_dim,embedding_weights, epochs=10, batch_size=128):
    cv_object = KFold(n_splits=10, shuffle=True, random_state=42)
    print cv_object
    p, r, f1 = 0., 0., 0.
    p1, r1, f11 = 0., 0., 0.
    sentence_len = X.shape[1]
    lookup_table = np.zeros_like(model.layers[0].get_weights()[0])
    for train_index, test_index in cv_object.split(X):
        shuffle_weights(model)
        #pdb.set_trace()
        #model.layers[0].set_weights([embedding_weights])
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in xrange(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]
                class_weights = {}
                class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
                class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
                class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))
                try:
                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
                except Exception as e:
                    print e
                #print x.shape, y.shape
                loss, acc = model.train_on_batch(x, y_temp)#, class_weight=class_weights)
                print loss, acc
        #pdb.set_trace()
        lookup_table += model.layers[0].get_weights()[0]
        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print classification_report(y_test, y_pred)
        print precision_recall_fscore_support(y_test, y_pred)
        print y_pred
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')

    print "macro results are"
    print "average precision is %f" %(p/10)
    print "average recall is %f" %(r/10)
    print "average f1 is %f" %(f1/10)

    print "micro results are"
    print "average precision is %f" %(p1/10)
    print "average recall is %f" %(r1/10)
    print "average f1 is %f" %(f11/10)
    return lookup_table/float(10)


def check_semantic_sim(embedding_table, word):
    reverse_vocab = {v:k for k,v in vocab.iteritems()}
    sim_word_idx = get_similar_words(embedding_table, embedding_table[vocab[word]], 25)
    sim_words = map(lambda x:reverse_vocab[x[1]], sim_word_idx)
    print sim_words

def tryWord(embedding_table):
    while True:
        print "enter word"
        word = raw_input()
        if word == "pdb":
            pdb.set_trace()
        elif word == 'exit':
            return
        else:
            check_semantic_sim(embedding_table, word)


if __name__ == "__main__":

    Tweets = select_tweets()
    tweets = Tweets
    gen_vocab()
    X, y = gen_sequence()
    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
    print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)
    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    W = get_embedding_weights()
    data, y = sklearn.utils.shuffle(data, y)
    model = fast_text_model(data.shape[1])
    _ = train_fasttext(data, y, model, EMBEDDING_DIM, W)
    table = model.layers[0].get_weights()[0]
    #check_semantic_sim(table)
    tryWord(table)
    pdb.set_trace()




================================================
FILE: get_similar_words.py
================================================
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pdb

def get_similar_words(X, vec, K=1):
    # X: (n_samples, n_features); vec: (1, n_features)
    # Returns (cosine score, row index) pairs for the most similar rows, best
    # first. The top-ranked entry (the query itself) is skipped, so at most
    # K-1 pairs come back.
    scores = cosine_similarity(X, vec)
    scores = sorted([(val, index) for index, val in enumerate(scores.reshape((1, scores.shape[0]))[0])], reverse=True)
    return scores[1:K]
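

# A toy sanity check of the ranking behaviour (illustrative sketch using a
# tiny hand-built table; not tied to any particular embedding):
if __name__ == "__main__":
    # Row 0 is the query; row 2 is the closest other row.
    X = np.array([[1.0, 0.0],
                  [0.0, 1.0],
                  [0.9, 0.1]])
    # K=3 prints two (score, index) pairs because the query row is skipped.
    print get_similar_words(X, X[0].reshape(1, -1), K=3)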



================================================
FILE: lstm.py
================================================
from data_handler import get_data
import argparse
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, LSTM
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D
import numpy as np
import pdb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import KFold
from keras.utils import np_utils
import codecs
import operator
import gensim, sklearn
from string import punctuation
from collections import defaultdict
from batch_gen import batch_gen
import sys

from nltk import tokenize as tokenize_nltk
from my_tokenizer import glove_tokenize



### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}



EMBEDDING_DIM = None
GLOVE_MODEL_FILE = None
SEED = 42
NO_OF_FOLDS = 10
CLASS_WEIGHT = None
LOSS_FUN = None
OPTIMIZER = None
KERNEL = None
TOKENIZER = None
MAX_SEQUENCE_LENGTH = None
INITIALIZE_WEIGHTS_WITH = None
LEARN_EMBEDDINGS = None
EPOCHS = 10
BATCH_SIZE = 512
SCALE_LOSS_FUN = None

word2vec_model = None



def get_embedding(word):
    #return
    try:
        return word2vec_model[word]
    except Exception, e:
        print 'Encoding not found: %s' %(word)
        return np.zeros(EMBEDDING_DIM)

def get_embedding_weights():
    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
    n = 0
    for k, v in vocab.iteritems():
        try:
            embedding[v] = word2vec_model[k]
        except:
            n += 1
            pass
    print "%d embedding missed"%n
    return embedding


def select_tweets():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = TOKENIZER(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # Check if the embedding exists in the GloVe model
                _emb+=1
        if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    #pdb.set_trace()
    return tweet_return


def gen_vocab():
    # Processing
    vocab_index = 1
    for tweet in tweets:
        text = ' '.join(TOKENIZER(tweet['text'].lower()))   # the tokenizers return a list; rejoin before the char-level punctuation filter
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]

        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word       # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'


def filter_vocab(k):
    # Keep only the k most frequent tokens (plus UNK).
    global freq, vocab
    freq_sorted = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)
    tokens = [w for w, c in freq_sorted[:k]]
    vocab = dict(zip(tokens, range(1, len(tokens) + 1)))
    vocab['UNK'] = len(vocab) + 1


def gen_sequence():
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }

    X, y = [], []
    for tweet in tweets:
        text = ' '.join(TOKENIZER(tweet['text'].lower()))   # rejoin the token list before the char-level punctuation filter
        text = ''.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        seq, _emb = [], []
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y


def shuffle_weights(model):
    weights = model.get_weights()
    weights = [np.random.permutation(w.flat).reshape(w.shape) for w in weights]
    model.set_weights(weights)

def lstm_model(sequence_length, embedding_dim):
    model_variation = 'LSTM'
    print('Model variation is %s' % model_variation)
    model = Sequential()
    model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length, trainable=LEARN_EMBEDDINGS))
    model.add(Dropout(0.25))#, input_shape=(sequence_length, embedding_dim)))
    model.add(LSTM(50))
    model.add(Dropout(0.5))
    model.add(Dense(3))
    model.add(Activation('softmax'))
    model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])
    print model.summary()
    return model


def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print cv_object
    p, r, f1 = 0., 0., 0.
    p1, r1, f11 = 0., 0., 0.
    sentence_len = X.shape[1]
    for train_index, test_index in cv_object.split(X):
        if INITIALIZE_WEIGHTS_WITH == "glove":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print "ERROR!"
            return
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in xrange(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                class_weights = None
                if SCALE_LOSS_FUN:
                    class_weights = {}
                    class_weights[0] = np.where(y_temp == 0)[0].shape[0]/float(len(y_temp))
                    class_weights[1] = np.where(y_temp == 1)[0].shape[0]/float(len(y_temp))
                    class_weights[2] = np.where(y_temp == 2)[0].shape[0]/float(len(y_temp))

                try:
                    y_temp = np_utils.to_categorical(y_temp, nb_classes=3)
                except Exception as e:
                    print e
                    print y_temp
                print x.shape, y_temp.shape
                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)
                print loss, acc

        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        print classification_report(y_test, y_pred)
        print precision_recall_fscore_support(y_test, y_pred)
        print y_pred
        p += precision_score(y_test, y_pred, average='weighted')
        p1 += precision_score(y_test, y_pred, average='micro')
        r += recall_score(y_test, y_pred, average='weighted')
        r1 += recall_score(y_test, y_pred, average='micro')
        f1 += f1_score(y_test, y_pred, average='weighted')
        f11 += f1_score(y_test, y_pred, average='micro')


    print "macro results are"
    print "average precision is %f" %(p/NO_OF_FOLDS)
    print "average recall is %f" %(r/NO_OF_FOLDS)
    print "average f1 is %f" %(f1/NO_OF_FOLDS)

    print "micro results are"
    print "average precision is %f" %(p1/NO_OF_FOLDS)
    print "average recall is %f" %(r1/NO_OF_FOLDS)
    print "average f1 is %f" %(f11/NO_OF_FOLDS)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LSTM based models for twitter Hate speech detection')
    parser.add_argument('-f', '--embeddingfile', required=True)
    parser.add_argument('-d', '--dimension', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('--loss', default=LOSS_FUN, required=True)
    parser.add_argument('--optimizer', default=OPTIMIZER, required=True)
    parser.add_argument('--epochs', default=EPOCHS, required=True)
    parser.add_argument('--batch-size', default=BATCH_SIZE, required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')
    parser.add_argument('--initialize-weights', choices=['random', 'glove'], required=True)
    parser.add_argument('--learn-embeddings', action='store_true', default=False)
    parser.add_argument('--scale-loss-function', action='store_true', default=False)


    args = parser.parse_args()
    GLOVE_MODEL_FILE = args.embeddingfile
    EMBEDDING_DIM = int(args.dimension)
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    LOSS_FUN = args.loss
    OPTIMIZER = args.optimizer
    KERNEL = args.kernel
    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = tokenize_nltk.casual.TweetTokenizer(strip_handles=True, reduce_len=True).tokenize
    INITIALIZE_WEIGHTS_WITH = args.initialize_weights    
    LEARN_EMBEDDINGS = args.learn_embeddings
    EPOCHS = int(args.epochs)
    BATCH_SIZE = int(args.batch_size)
    SCALE_LOSS_FUN = args.scale_loss_function



    np.random.seed(SEED)
    print 'GLOVE embedding: %s' %(GLOVE_MODEL_FILE)
    print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
    print 'Allowing embedding learning: %s' %(str(LEARN_EMBEDDINGS))

    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(GLOVE_MODEL_FILE)

    tweets = select_tweets()
    gen_vocab()
    #filter_vocab(20000)
    X, y = gen_sequence()
    #Y = y.reshape((len(y), 1))
    MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
    print "max seq length is %d"%(MAX_SEQUENCE_LENGTH)

    data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    y = np.array(y)
    data, y = sklearn.utils.shuffle(data, y)
    W = get_embedding_weights()

    model = lstm_model(data.shape[1], EMBEDDING_DIM)
    #model = lstm_model(data.shape[1], 25, get_embedding_weights())
    train_LSTM(data, y, model, EMBEDDING_DIM, W)

    pdb.set_trace()


================================================
FILE: my_tokenizer.py
================================================
from string import punctuation
from preprocess_twitter import tokenize as tokenizer_g
from gensim.parsing.preprocessing import STOPWORDS


def glove_tokenize(text):
    text = tokenizer_g(text)
    text = ''.join([c for c in text if c not in punctuation])
    words = text.split()
    words = [word for word in words if word not in STOPWORDS]
    return words
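

# Illustrative check (assumes NLTK/gensim are installed). Note that
# string.punctuation includes '<' and '>', so markers such as <hashtag> and
# <url> produced by the GloVe preprocessing lose their angle brackets here.
if __name__ == "__main__":
    print glove_tokenize("I love #WinterFun :) http://t.co/abc")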



================================================
FILE: nn_classifier.py
================================================
from data_handler import get_data
import sys
import numpy as np
import pdb, json
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
import xgboost as xgb

### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
label_map = {
        'none': 0,
        'racism': 1,
        'sexism': 2
    }
tweet_data = get_data()
for tweet in tweet_data:
    texts.append(tweet['text'].lower())
    labels.append(label_map[tweet['label']])
print('Found %s texts. (samples)' % len(texts))


# logistic, gradient_boosting, random_forest, svm, tfidf_svm_linear, tfidf_svm_rbf
model_count = 2
word_embed_size = 200
GLOVE_MODEL_FILE = str(sys.argv[1])
EMBEDDING_DIM = int(sys.argv[2])
MODEL_TYPE=sys.argv[3]
print 'Embedding Dimension: %d' %(EMBEDDING_DIM)
print 'GloVe Embedding: %s' %(GLOVE_MODEL_FILE)

word2vec_model1 = np.load('fast_text.npy')
word2vec_model1 = word2vec_model1.reshape((word2vec_model1.shape[1], word2vec_model1.shape[2]))
f_vocab = open('vocab_fast_text', 'r')
vocab = json.load(f_vocab)
word2vec_model = {}
for k, v in vocab.iteritems():
    word2vec_model[k] = word2vec_model1[int(v)]
del word2vec_model1


SEED=42
MAX_NB_WORDS = None
VALIDATION_SPLIT = 0.2


# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}


def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'])
        for w in words:
            if w in word2vec_model:  # Check if the embedding exists in the GloVe model
                _emb+=1
        if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
    print 'Tweets selected:', len(tweet_return)
    #pdb.set_trace()
    return tweet_return


def gen_data():
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }

    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except KeyError:
                pass   # out-of-vocabulary word: contributes nothing
        emb /= max(len(words), 1)   # mean embedding; guard against empty token lists
        X.append(emb)
        y.append(y_map[tweet['label']])
    X = np.array(X)
    y = np.array(y)
    return X, y

    
def get_model(m_type=None):
    if not m_type:
        print 'ERROR: Please provide a valid method name'
        return None

    if m_type == 'logistic':
        logreg = LogisticRegression()
    elif m_type == "gradient_boosting":
        #logreg = GradientBoostingClassifier(n_estimators=10)
        logreg = xgb.XGBClassifier(nthread=-1)
    elif m_type == "random_forest":
        logreg = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    elif m_type == "svm_rbf":
        logreg = SVC(class_weight="balanced", kernel='rbf')
    elif m_type == "svm_linear":
        logreg = LinearSVC(class_weight="balanced")
    else:
        print "ERROR: Please specify a correst model"
        return None

    return logreg


def classification_model(X, Y, model_type="logistic"):
    NO_OF_FOLDS=10
    X, Y = shuffle(X, Y, random_state=SEED)
    print "Model Type:", model_type

    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)

    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
    
    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)

    pdb.set_trace()



if __name__ == "__main__":

    #filter_vocab(20000)

    tweets = select_tweets_whose_embedding_exists()
    X, Y = gen_data()

    classification_model(X, Y, MODEL_TYPE)
    pdb.set_trace()




================================================
FILE: plot_graph_TSNE.py
================================================
import gensim
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.manifold import TSNE
import pdb
import codecs


words = ['mohammed', 'murderer', 'pedophile', 'religion', 'terrorism', 'islamic', 'muslim']

def load_initial_emb():
    initial_emb = gensim.models.Word2Vec.load_word2vec_format("/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.200d.txt")
    return initial_emb

def load_final_emb():
    reverse_vocab = codecs.open("reverse_vocab.json", 'r', encoding="utf-8").readlines()
    reverse_vocab = json.loads("".join(reverse_vocab))
    reverse_vocab['0'] = "<UNK>"

    final_emb = {}
    for i, emb in enumerate(np.load("embedding.npy")):
        final_emb[reverse_vocab[str(i)].encode("utf-8")] = emb
    return final_emb

def get_transform(initial_emb, final_emb):
    vec = []
    for w in words:
        vec.append(initial_emb[w])
    for w in words:
        vec.append(final_emb[w])
    
    X = np.array(vec)
    print X.shape
    
    model = TSNE(n_components=2, random_state=0)
    out = model.fit_transform(X)
    
    print out
    print "Will plot now!"
    return out



# Initial are original
# Next are final
def plot(out):
    A = out[:7,:]
    B = out[7:,:]
    area=150
    padding=0.0001
    xmin, xmax = min(out[:, 0]), max(out[:, 0])
    ymin, ymax = min(out[:, 1]), max(out[:, 1])
    
    fig, ax = plt.subplots()
    
    for (color, label, data) in [('red', 'GloVe', A), ('green', 'FastText+GloVe+Dyn', B)]:
        ax.scatter(data[:,0], data[:,1], c=color, s=area, label=label,
                                      alpha=0.3, edgecolors='none')
        for (row, word) in zip(data, words):
            ax.annotate(word, xy=(row[0], row[1]), xytext=(row[0], row[1]),)
    
    plt.axis([xmin-padding,xmax+padding,ymin-padding,ymax+padding])
    plt.legend()
    plt.grid(True)
    
    plt.show()


if __name__=="__main__":
    ini = load_initial_emb()
    fin = load_final_emb()
    out = get_transform(ini, fin)
    plot(out)


================================================
FILE: preprocess_twitter.py
================================================
"""
preprocess-twitter.py

python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"

Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu

Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

import sys
import re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = u"<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(ur"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()


if __name__ == '__main__':
    _, text = sys.argv
    if text == "test":
        text = u"I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    print tokens


================================================
FILE: tfidf.py
================================================
from data_handler import get_data
import argparse
import sys
import numpy as np
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
import pdb
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
import codecs
import operator
import gensim, sklearn
from collections import defaultdict
from batch_gen import batch_gen
from my_tokenizer import glove_tokenize
from nltk.tokenize import TweetTokenizer


### Preparing the text data
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# vocab generation
vocab, reverse_vocab = {}, {}
freq = defaultdict(int)
tweets = {}


# tfidf_logistic, tfidf_gradient_boosting, tfidf_random_forest, tfidf_svm_linear, tfidf_svm_rbf
MODEL_TYPE = None
MAX_NGRAM_LENGTH = None
NO_OF_FOLDS = 10
CLASS_WEIGHT = None
N_ESTIMATORS = None
LOSS_FUN = None
KERNEL = None
SEED = 42
TOKENIZER = None


def gen_data():
    label_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
        }
    tweet_data = get_data()
    for tweet in tweet_data:
        texts.append(tweet['text'].lower())
        labels.append(label_map[tweet['label']])
    print 'Found %d texts (samples)' % len(texts)



def get_model(m_type=None):
    if not m_type:
        print 'Please specify a model type'
        return None

    if m_type == "tfidf_svm":
        logreg = SVC(class_weight=CLASS_WEIGHT, kernel=KERNEL)
    elif m_type == "tfidf_svm_linear":
        logreg = LinearSVC(C=0.01, loss=LOSS_FUN, class_weight=CLASS_WEIGHT)
    elif m_type == 'tfidf_logistic':
        logreg = LogisticRegression()
    elif m_type == "tfidf_gradient_boosting":
        logreg = GradientBoostingClassifier(loss=LOSS_FUN, n_estimators=N_ESTIMATORS)
    elif m_type == "tfidf_random_forest":
        logreg = RandomForestClassifier(class_weight=CLASS_WEIGHT, n_estimators=N_ESTIMATORS)
        print "ERROR: Please specify a correct model"
        return None

    return logreg


def classification_model(X, Y, model_type=None):
    X, Y = shuffle(X, Y, random_state=SEED)
    print "Model Type:", model_type

    #predictions = cross_val_predict(logreg, X, Y, cv=NO_OF_FOLDS)
    scores1 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)

    scores2 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
    
    scores3 = cross_val_score(get_model(model_type), X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)



if __name__ == "__main__":


    parser = argparse.ArgumentParser(description='TF-IDF model for twitter Hate speech detection')
    parser.add_argument('-m', '--model', choices=['tfidf_svm', 'tfidf_svm_linear', 'tfidf_logistic', 'tfidf_gradient_boosting', 'tfidf_random_forest'], required=True)
    parser.add_argument('--max_ngram', required=True)
    parser.add_argument('--tokenizer', choices=['glove', 'nltk'], required=True)
    parser.add_argument('-s', '--seed', default=SEED)
    parser.add_argument('--folds', default=NO_OF_FOLDS)
    parser.add_argument('--estimators', default=N_ESTIMATORS)
    parser.add_argument('--loss', default=LOSS_FUN)
    parser.add_argument('--kernel', default=KERNEL)
    parser.add_argument('--class_weight')
    parser.add_argument('--use-inverse-doc-freq', action='store_true')

    args = parser.parse_args()

    MODEL_TYPE = args.model
    SEED = int(args.seed)
    NO_OF_FOLDS = int(args.folds)
    CLASS_WEIGHT = args.class_weight
    N_ESTIMATORS = int(args.estimators) if args.estimators else args.estimators
    LOSS_FUN = args.loss
    KERNEL = args.kernel
    MAX_NGRAM_LENGTH = int(args.max_ngram)
    USE_IDF = args.use_inverse_doc_freq

    if args.tokenizer == "glove":
        TOKENIZER = glove_tokenize
    elif args.tokenizer == "nltk":
        TOKENIZER = TweetTokenizer().tokenize

    print 'Max-ngram-length: %d' % MAX_NGRAM_LENGTH
    #filter_vocab(20000)

    # For TF-IDF-SVC (or any other variant) the vocabulary filtering above is
    # not needed: the TF-IDF pipeline works on the raw texts collected by
    # gen_data(), not on filtered data.
    gen_data()
    tfidf_transformer = TfidfVectorizer(use_idf=USE_IDF, analyzer="word", tokenizer=TOKENIZER, ngram_range=(1, MAX_NGRAM_LENGTH))
    #tfidf_transformer = TfidfVectorizer(use_idf=True, ngram_range=(1, MAX_NGRAM_LENGTH))
    X_train_tfidf = tfidf_transformer.fit_transform(texts)
    X = X_train_tfidf
    Y = labels

    classification_model(X, Y, MODEL_TYPE)
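
Example invocation (illustrative; the flag values are picks from the argparse choices above, not settings documented by the repository):

    python tfidf.py --model tfidf_svm_linear --max_ngram 3 --tokenizer glove --loss squared_hinge --class_weight balanced --use-inverse-doc-freq

With tfidf_svm_linear, --loss and --class_weight are forwarded to LinearSVC, and omitting --use-inverse-doc-freq leaves use_idf False, i.e. plain term-frequency weighting.
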
SYMBOL INDEX (52 symbols across 12 files)

FILE: BoWV.py
  function select_tweets_whose_embedding_exists (line 55) | def select_tweets_whose_embedding_exists():
  function gen_data (line 73) | def gen_data():
  function get_model (line 95) | def get_model(m_type=None):
  function classification_model (line 116) | def classification_model(X, Y, model_type=None):

FILE: batch_gen.py
  function batch_gen (line 6) | def batch_gen(X, batch_size):

FILE: cnn.py
  function get_embedding (line 60) | def get_embedding(word):
  function get_embedding_weights (line 68) | def get_embedding_weights():
  function select_tweets (line 82) | def select_tweets():
  function gen_vocab (line 100) | def gen_vocab():
  function filter_vocab (line 119) | def filter_vocab(k):
  function gen_sequence (line 127) | def gen_sequence():
  function shuffle_weights (line 148) | def shuffle_weights(model):
  function cnn_model (line 154) | def cnn_model(sequence_length, embedding_dim):
  function train_CNN (line 205) | def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=B...

FILE: data_handler.py
  function get_data (line 6) | def get_data():

FILE: fast_text.py
  function get_embedding (line 63) | def get_embedding(word):
  function get_embedding_weights (line 71) | def get_embedding_weights():
  function select_tweets (line 85) | def select_tweets():
  function gen_vocab (line 104) | def gen_vocab():
  function filter_vocab (line 124) | def filter_vocab(k):
  function gen_sequence (line 133) | def gen_sequence():
  function Tokenize (line 154) | def Tokenize(tweet):
  function shuffle_weights (line 160) | def shuffle_weights(model):
  function fast_text_model (line 166) | def fast_text_model(sequence_length):
  function train_fasttext (line 177) | def train_fasttext(X, y, model, inp_dim,embedding_weights, epochs=10, ba...
  function check_semantic_sim (line 233) | def check_semantic_sim(embedding_table, word):
  function tryWord (line 239) | def tryWord(embedding_table):

FILE: get_similar_words.py
  function get_similar_words (line 5) | def get_similar_words(X, vec, K=1):

FILE: lstm.py
  function get_embedding (line 59) | def get_embedding(word):
  function get_embedding_weights (line 67) | def get_embedding_weights():
  function select_tweets (line 80) | def select_tweets():
  function gen_vocab (line 99) | def gen_vocab():
  function filter_vocab (line 118) | def filter_vocab(k):
  function gen_sequence (line 127) | def gen_sequence():
  function shuffle_weights (line 148) | def shuffle_weights(model):
  function lstm_model (line 153) | def lstm_model(sequence_length, embedding_dim):
  function train_LSTM (line 168) | def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=...

FILE: my_tokenizer.py
  function glove_tokenize (line 6) | def glove_tokenize(text):

FILE: nn_classifier.py
  function select_tweets_whose_embedding_exists (line 71) | def select_tweets_whose_embedding_exists():
  function gen_data (line 90) | def gen_data():
  function get_model (line 114) | def get_model(m_type=None):
  function classification_model (line 137) | def classification_model(X, Y, model_type="logistic"):

FILE: plot_graph_TSNE.py
  function load_initial_emb (line 12) | def load_initial_emb():
  function load_final_emb (line 16) | def load_final_emb():
  function get_transform (line 26) | def get_transform(initial_emb, final_emb):
  function plot (line 47) | def plot(out):

FILE: preprocess_twitter.py
  function hashtag (line 19) | def hashtag(text):
  function allcaps (line 28) | def allcaps(text):
  function tokenize (line 33) | def tokenize(text):

FILE: tfidf.py
  function gen_data (line 50) | def gen_data():
  function get_model (line 64) | def get_model(m_type=None):
  function classification_model (line 85) | def classification_model(X, Y, model_type=None):
