Showing preview only (1,855K chars total). Download the full file or copy to clipboard to get everything.
Repository: JianWenJun/MLDemo
Branch: master
Commit: 641b85464442
Files: 75
Total size: 1.7 MB
Directory structure:
gitextract_sl_11rs9/
├── Financial_NLP/
│ └── final_demo/
│ ├── README.md
│ ├── __init__.py
│ ├── data_prepare.py
│ ├── extract_feature.py
│ ├── main.py
│ ├── train_model.py
│ └── util.py
├── ML/
│ ├── DecisionTree/
│ │ ├── Boosting.py
│ │ ├── RandomForest.py
│ │ ├── decision_tree.py
│ │ ├── titanic_data_analy.ipynb
│ │ ├── tree_main.py
│ │ └── xgboost_demo.py
│ ├── LogisticRegression_MEM/
│ │ └── LR_MEM_demo.py
│ ├── Perce_SVM/
│ │ ├── SVM.py
│ │ └── perceptron.py
│ ├── REDAME.md
│ ├── TensorDemo/
│ │ ├── NN_tf.py
│ │ └── README.md
│ └── data/
│ └── adult/
│ ├── adult_deal_value.data
│ └── adult_deal_value.test
├── NLP/
│ ├── AutoTitle_F/
│ │ ├── configs/
│ │ │ ├── make_vocab.yaml
│ │ │ ├── predict.yaml
│ │ │ ├── process.yaml
│ │ │ └── train_model.yaml
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── batcher.py
│ │ │ ├── data.py
│ │ │ └── data_processed.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── adaptive.py
│ │ │ ├── loss.py
│ │ │ ├── lr_scheduler.py
│ │ │ ├── optims.py
│ │ │ └── seq2seq.py
│ │ ├── pycocoevalcap/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── bleu/
│ │ │ │ ├── LICENSE
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bleu.py
│ │ │ │ └── bleu_scorer.py
│ │ │ ├── cider/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cider.py
│ │ │ │ └── cider_scorer.py
│ │ │ ├── license.txt
│ │ │ ├── meteor/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── meteor-1.5.jar
│ │ │ │ ├── meteor.py
│ │ │ │ └── tests/
│ │ │ │ └── test_meteor.py
│ │ │ ├── rouge/
│ │ │ │ ├── __init__.py
│ │ │ │ └── rouge.py
│ │ │ └── test_eval.py
│ │ ├── submit.py
│ │ └── train.py
│ ├── GAN&NLP.md
│ ├── Multi_Label/
│ │ └── ShengCe/
│ │ ├── generate_submit.py
│ │ ├── train_model.py
│ │ └── util.py
│ ├── Seq2Seq/
│ │ ├── __init__.py
│ │ ├── data_util.py
│ │ ├── main.py
│ │ ├── seq2seq_attn.py
│ │ ├── seq2seq_model.py
│ │ └── text_summarizer.py
│ ├── Text_CNN/
│ │ ├── process_data.py
│ │ ├── text_cnn_main.py
│ │ └── text_cnn_model.py
│ └── daguan/
│ ├── README.md
│ ├── data_analy.py
│ ├── lr_scheduler.py
│ ├── main.py
│ ├── model.py
│ ├── optims.py
│ └── predict.py
└── README.md
================================================
FILE CONTENTS
================================================
================================================
FILE: Financial_NLP/final_demo/README.md
================================================
#### 1.项目结构
##### 1.1代码文件说明
>**./final_demo/util.py**:1.管理整个项目的文件存放路径
2.存储和读取各种方法抽取的特征,并组合成DataFrame输出。
**./final_demo/data_prepare.py**:1.文本的清理工作 2.词向量训练工作</br>
**./final_demo/extract_feature.py**:1.各种方法进行特征抽取,并进行保存</br>
**./final_demo/train_model.py**:1.深度模型的构建工作</br>
**./final_demo/main.py**:1.最后整个项目的运行,读取各个方法抽取的特征并使用分类模型进行分类预测</br>
##### 1.2代码运行期间生成文件的目录说明(根目录为./atec/data)
>1./atec/data :根目录,存放该项目下的相关文件夹,和比赛最初的.csv文件(原始数据文件)</br>
>2./atec/data/aux :存放最后提交测试平台的重要文件,包括分词字典,词向量矩阵,停用词,拼写纠错,疑问词。</br>
>3./atec/data/feature :特征存放文件,存放各个方法进行抽取后的特征,一个抽取方法包括3个文件,特征值文件和特征列名文件。</br>
4./atec/data/preprocessed: 原始数据文本预处理文件目录</br>
5./atec/data/tmp: 存放用于提取特征的深度模型</br>
6./atec/data/trained: 存放最后需要提交测试平台的模型文件</br>
#### 2.项目使用步骤
>1.整个项目的初始化工作(设置整个项目的根目录,决定是否需要创建(第一次为create_dir为True,之后为False))</br>
2.文本的预处理工作</br>
3.特征抽取</br>
4.整个项目的运行(构建最终的分类模型,交叉验证的方式)</br>
```python
#最终的运行方式
python util.py
python data_prepare.py
python extract_feature.py
python main.py
```
#### 3赛题思路
[蚂蚁金融NLP竞赛——文本语义相似度赛题总结](https://jianwenjun.xyz/2018/07/13/%E8%9A%82%E8%9A%81%E9%87%91%E8%9E%8DNLP%E7%AB%9E%E8%B5%9B%E2%80%94%E2%80%94%E6%96%87%E6%9C%AC%E8%AF%AD%E4%B9%89%E7%9B%B8%E4%BC%BC%E5%BA%A6%E8%B5%9B%E9%A2%98%E6%80%BB%E7%BB%93/)
================================================
FILE: Financial_NLP/final_demo/__init__.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/6/15 下午8:52
# @Author : ComeOnJian
# @File : __init__.py.py
================================================
FILE: Financial_NLP/final_demo/data_prepare.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/6/15 下午10:50
# @Author : ComeOnJian
# @File : data_prepare.py
import jieba
from util import *
import json
import re
from collections import defaultdict
from gensim.models import word2vec
from gensim.models import KeyedVectors
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
# file paths (names only; resolved against the project.* directories at use site)
train_data_all = 'atec_nlp_sim_train_all.csv'
train_all = 'atec_nlp_sim_train_0.6.csv'
test_all = 'atec_nlp_sim_test_0.4.csv' # contains 6550 rows whose label is 1
stop_words_path = 'stop_words.txt'
dict_path = 'dict_all.txt'
spelling_corrections_path = 'spelling_corrections.json'
w2v_model_path = 'train_corpus.model'
w2v_vocab_path = 'train_corpus_vocab.txt'
# data
# hyper-parameters
embedding_size = 300       # dimensionality of the word / char vectors
max_sentence_length = 20   # padding length for word-level id sequences
max_word_length = 25       # padding length for char-level id sequences
# os.path.join(project.aux_dir,'fasttext','')
max_vovab_size = 100000    # (sic) cap on the tokenizer vocabulary size
#################### text cleaning ####################
def preprocessing(data_df,fname):
    """
    Clean and segment both sentence columns of *data_df*, then persist.

    For every row: masked digit runs ("***") in columns s1/s2 are replaced
    by 十一, the spelling-correction table is applied, and the text is
    segmented with jieba.  A word-frequency vocabulary is collected over
    the whole DataFrame.

    :param data_df: DataFrame with columns ["index", "s1", "s2", "label"];
                    its s1/s2 cells are overwritten with segmented text
    :param fname: base name for the output files under
                  project.preprocessed_data_dir ({fname}.csv / {fname}.pickle)
    :return: None (results are written to disk)
    """
    # load the stop-word list
    stopwords = load_stopwordslist(project.aux_dir + stop_words_path)
    # load the spelling-correction replacement table
    spelling_corrections = load_spelling_corrections(project.aux_dir + spelling_corrections_path)
    re_object = re.compile(r'\*+') # masked digits "***" get replaced by 十一
    vocabs = defaultdict(int)  # word -> frequency over the whole corpus
    for index, row in data_df.iterrows():
        # progress message every 2000 rows
        if index != 0 and index % 2000 == 0:
            print("{:,} {}-sentence embedding.".format(index,fname))
        # process the two sentences of each row
        for col_name in ["s1", "s2"]:
            # replace the masked digits (unicode(): Python 2 source)
            re_str = re_object.subn(u"十一",unicode(row[col_name],'utf-8'))
            # apply the spelling corrections
            spell_corr_str = transform_other_word(re_str[0],spelling_corrections)
            # segment into a space-separated token string
            seg_str = seg_sentence(spell_corr_str, stopwords)
            for word in seg_str.split(" "):
                vocabs[word] = vocabs[word] + 1
            data_df.at[index, col_name] = seg_str
    data_df.to_csv(project.preprocessed_data_dir + '{}.csv'.format(fname), sep='\t', header=None,index=None,encoding='utf-8')
    project.save(project.preprocessed_data_dir + '{}.pickle'.format(fname),vocabs)
    del data_df
def seg_sentence(sentence,stop_words):
    """
    Segment a sentence with jieba, dropping stop words and bare spaces.

    :param sentence: raw sentence, String
    :param stop_words: container of words to filter out
    :return: space-separated token string (a trailing space is kept after
             the last token, matching the historical output format)
    """
    kept = []
    for token in jieba.cut(sentence.strip()):
        if token not in stop_words and token != " ":
            kept.append(token + " ")
    return "".join(kept)
def load_stopwordslist(filepath):
    """
    Load the stop-word list from a utf-8 file, one word per line.

    :param filepath: path of the stop-word file
    :return: list of stripped stop words, in file order
    """
    with io.open(filepath, "r", encoding="utf-8") as fin:
        return [entry.strip() for entry in fin]
def load_spelling_corrections(filepath):
    """
    Read the spelling-correction mapping (a JSON object: wrong -> right).

    :param filepath: path of the utf-8 JSON file
    :return: dict of replacements
    """
    with io.open(filepath, "r", encoding="utf-8") as fin:
        return json.load(fin)
def load_doubt_words(filpath):
    """
    Load the interrogative-word list, one word per line.

    :param filpath: path of the utf-8 word file
    :return: list of stripped words, in file order
    """
    with io.open(filpath, 'r', encoding="utf-8") as fin:
        return [entry.strip() for entry in fin]
def transform_other_word(str_text,reg_dict):
    """
    Apply every literal substitution in *reg_dict* to *str_text*.

    :param str_text: input text
    :param reg_dict: mapping of substring -> replacement
    :return: text with all replacements applied (dict iteration order)
    """
    result = str_text
    for source_token in reg_dict:
        result = result.replace(source_token, reg_dict[source_token])
    return result
def strip_why(rawq):
    """
    Strip "why"-type interrogative phrases from a question.

    First removes explicit why-words; then, if 怎么 is followed somewhere
    by one of the listed particles, removes 怎么 as well.

    :param rawq: raw question string
    :return: question with the why-phrases removed
    """
    cleaned = re.sub('为什么|为何|为啥|为么|为撒|咋个|为什|怎么回事|是什么原因|什么原因', '', rawq)
    if re.match(r'怎么.*(不|没|了|只|会|又|要|老|总|才|是)', cleaned):
        cleaned = re.sub('怎么', '', cleaned)
    return cleaned
def strip_how(rawq):
    """
    Strip "what should I do" phrases (怎么办 / 咋办) from a question.

    :param rawq: raw question string
    :return: question with the phrases removed
    """
    return re.sub('怎么办|咋办', '', rawq)
#################### 文本的Embeding工作 ####################
def process_save_embedding_wv(nfile,type = 1,isStore_ids = False):
    """
    Build the word-embedding matrix for the full tokenizer vocabulary and
    save it; optionally also pad and pickle the sentence id sequences.

    :param nfile: output file name (under project.aux_dir) for the matrix
    :param type: which pre-trained vectors to use: 1 = zhihu,
                 2 = training corpus, 3 = zhihu with training-corpus fallback
    :param isStore_ids: if True, also save the padded id arrays
    :return: None (everything is written to disk)
    """
    w2v_path = project.aux_dir + 'sgns.zhihu.bigram'
    if type == 2:
        w2v_path = project.aux_dir + 'train_all_data.bigram'
    # vocabs_path = project.preprocessed_data_dir + 'data_all_seg.pickle'
    tokenizer = Tokenizer(
        num_words=max_vovab_size,
        split=' ',
        lower=False,
        char_level=False,
        filters=''
    )
    # load the segmented train and test sets (yields the full vocabulary)
    pre_deal_train_df = pd.read_csv(project.preprocessed_data_dir + 'train_0.6_seg.csv',
                                    names=["index", "s1", "s2", "label"],
                                    header=None,encoding='utf-8',
                                    sep='\t')
    pre_deal_test_df = pd.read_csv(project.preprocessed_data_dir + 'test_0.4_seg.csv',
                                   names=["index", "s1", "s2", "label"],
                                   header=None,encoding='utf-8',
                                   sep='\t',
                                   )
    texts = []
    texts_s1_test = pre_deal_test_df['s1'].tolist()
    texts_s2_test = pre_deal_test_df['s2'].tolist()
    texts_s1_train = pre_deal_train_df['s1'].tolist()
    texts_s2_train = pre_deal_train_df['s2'].tolist()
    texts.extend(texts_s1_test)
    texts.extend(texts_s2_test)
    texts.extend(texts_s1_train)
    texts.extend(texts_s2_train)
    # print pre_deal_train_df.isnull().any()
    # print pre_deal_test_df.isnull().any()
    # build the token dictionary
    # tests = [u'中 国', u'矿业 大学', u'不错哦']
    tokenizer.fit_on_texts(texts)
    # map each sentence to its list of word indices
    s1_train_ids = tokenizer.texts_to_sequences(texts_s1_train)
    s2_train_ids = tokenizer.texts_to_sequences(texts_s2_train)
    s1_test_ids = tokenizer.texts_to_sequences(texts_s1_test)
    s2_test_ids = tokenizer.texts_to_sequences(texts_s2_test)
    num_words_dict = tokenizer.word_index
    # embedding matrix for the vocabulary (index -> vec); row count is the
    # max index + 1, random-initialised so OOV words keep a random vector
    embedding_matrix = 1 * np.random.randn(len(num_words_dict) + 1, embedding_size)
    embedding_matrix[0] = np.random.randn(embedding_size)
    # load the pre-trained w2v vectors
    print 'load w2v_model...'
    w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
    print 'finish w2v_model...'
    if type == 3:
        w2v_path2 = project.aux_dir + 'train_all_data.bigram'
        w2v_model2 = KeyedVectors.load_word2vec_format(w2v_path2, binary=False)
    count = 0
    for word,index in num_words_dict.items():
        if word in w2v_model.vocab:
            embedding_matrix[index] = w2v_model.word_vec(word)
            count = count +1
        else:
            if type == 3:
                # fall back to the second model for words missing from the first
                if word in w2v_model2.vocab:
                    embedding_matrix[index] = w2v_model2.word_vec(word)
                    count = count + 1
    print('total {}, word in model have {}'.format(len(num_words_dict),count))
    project.save(project.aux_dir + nfile,embedding_matrix)
    if isStore_ids:
        # pad every id sequence to max_sentence_length and pickle the arrays
        s1_train_ids_pad = sequence.pad_sequences(s1_train_ids,maxlen=max_sentence_length)
        s2_train_ids_pad = sequence.pad_sequences(s2_train_ids,maxlen=max_sentence_length)
        s1_test_ids_pad = sequence.pad_sequences(s1_test_ids,maxlen=max_sentence_length)
        s2_test_ids_pad = sequence.pad_sequences(s2_test_ids,maxlen=max_sentence_length)
        project.save(project.preprocessed_data_dir + 's1_train_ids_pad.pickle',s1_train_ids_pad)
        project.save(project.preprocessed_data_dir + 's2_train_ids_pad.pickle',s2_train_ids_pad)
        project.save(project.preprocessed_data_dir + 's1_test_ids_pad.pickle',s1_test_ids_pad)
        project.save(project.preprocessed_data_dir + 's2_test_ids_pad.pickle',s2_test_ids_pad)
    print('finish')
def process_save_char_embedding_wv(isStore_ids = False):
    """
    Build the char-level embedding matrix and convert every sentence into
    a list of char indices (unknown non-blank chars map to index 0).

    :param isStore_ids: if True, pad the char id sequences and pickle them
    :return: None (matrix and id arrays are written to disk)
    """
    data_local_df = pd.read_csv(project.data_dir + train_all, sep='\t', header=None,names=["index", "s1", "s2", "label"])
    data_test_df = pd.read_csv(project.data_dir + test_all, sep='\t', header=None, names=["index", "s1", "s2", "label"])
    w2v_char_path = project.aux_dir + 'train_char_all__data.bigram'
    w2v_char_model = KeyedVectors.load_word2vec_format(w2v_char_path, binary=False)
    # load the spelling-correction replacement table
    spelling_corrections = load_spelling_corrections(project.aux_dir + spelling_corrections_path)
    re_object = re.compile(r'\*+') # masked digits "***" get replaced by 十一
    char_vocabs = project.load(project.preprocessed_data_dir + 'train_all_char_vocabs.pickle')
    data_df_list = [data_local_df,data_test_df]
    # char embedding matrix (index -> vec), random-initialised for OOV chars
    embedding_word_matrix = 1 * np.random.randn((len(char_vocabs) + 1), embedding_size)
    embedding_word_matrix[0] = np.random.randn(embedding_size)
    for word,index in char_vocabs.items():
        if word in w2v_char_model.vocab:
            embedding_word_matrix[index] = w2v_char_model.word_vec(word)
        if index % 100 == 0:
            print 'char {}'.format(index)
    project.save(project.aux_dir + 'train_all_char_embedding_matrix.pickle',embedding_word_matrix)
    for data_df in data_df_list:
        for index, row in data_df.iterrows():
            # progress message every 5000 rows
            if index != 0 and index % 5000 == 0:
                print("{:,}sentence word embedding.".format(index))
            # process the two sentences of each row
            for col_name in ["s1", "s2"]:
                # replace the masked digits
                re_str = re_object.subn(u"十一",unicode(row[col_name],'utf-8'))
                # apply the spelling corrections
                spell_corr_str = transform_other_word(re_str[0],spelling_corrections)
                spell_corr_str = list(spell_corr_str)
                indexs = []
                for char in spell_corr_str:
                    if char in char_vocabs:
                        indexs.append(char_vocabs[char])
                    else:
                        # unknown non-blank chars share index 0
                        if not char.strip()==u"":
                            indexs.append(0)
                data_df.at[index, col_name] = indexs
    if isStore_ids:
        # pad every char id sequence to max_word_length and pickle
        s1_train_ids_pad = sequence.pad_sequences(data_local_df['s1'],maxlen=max_word_length)
        s2_train_ids_pad = sequence.pad_sequences(data_local_df['s2'],maxlen=max_word_length)
        s1_test_ids_pad = sequence.pad_sequences(data_test_df['s1'],maxlen=max_word_length)
        s2_test_ids_pad = sequence.pad_sequences(data_test_df['s2'],maxlen=max_word_length)
        project.save(project.preprocessed_data_dir + 's1_train_char_ids_pad.pickle',s1_train_ids_pad)
        project.save(project.preprocessed_data_dir + 's2_train_char_ids_pad.pickle',s2_train_ids_pad)
        project.save(project.preprocessed_data_dir + 's1_test_char_ids_pad.pickle',s1_test_ids_pad)
        project.save(project.preprocessed_data_dir + 's2_test_char_ids_pad.pickle',s2_test_ids_pad)
    print('finish')
#################### 使用训练集train_all做 pre_train word embedding ####################
def pre_train_w2v(binary = False):
    """
    Train word-level word2vec vectors on the segmented train + test corpus
    and save them as "train_all_data.bigram" under project.aux_dir.

    :param binary: whether to store the vector file in binary format
    :return: None
    """
    # load the segmented train and test sets
    pre_deal_train_df = pd.read_csv(project.preprocessed_data_dir + 'train_0.6_seg.csv',
                                    names=["index", "s1", "s2", "label"],
                                    header=None, encoding='utf-8',
                                    sep='\t')
    pre_deal_test_df = pd.read_csv(project.preprocessed_data_dir + 'test_0.4_seg.csv',
                                   names=["index", "s1", "s2", "label"],
                                   header=None, encoding='utf-8',
                                   sep='\t',
                                   )
    texts = []
    # each sentence becomes a list of tokens (sentences were saved
    # space-separated by preprocessing)
    texts_s1_test = [line.strip().split(" ") for line in pre_deal_test_df['s1'].tolist()]
    texts_s2_test = [line.strip().split(" ") for line in pre_deal_test_df['s2'].tolist()]
    texts_s1_train = [line.strip().split(" ") for line in pre_deal_train_df['s1'].tolist()]
    texts_s2_train = [line.strip().split(" ") for line in pre_deal_train_df['s2'].tolist()]
    texts.extend(texts_s1_test)
    texts.extend(texts_s2_test)
    texts.extend(texts_s1_train)
    texts.extend(texts_s2_train)
    model = word2vec.Word2Vec(sentences=texts,size=300,window=2,min_count=3,workers=2)
    model.wv.save_word2vec_format(fname=project.aux_dir + "train_all_data.bigram",binary=binary,fvocab=None)
def pre_train_char_w2v(binary = False):
    """
    Train char-level word2vec vectors on the raw train + test data and
    build the char -> index vocabulary (index 0 is implicitly reserved:
    assignment starts at 1).

    Saves the vectors as "train_char_all__data.bigram" and the vocabulary
    as "train_all_char_vocabs.pickle".

    :param binary: whether to store the vector file in binary format
    :return: None
    """
    data_local_df = pd.read_csv(project.data_dir + train_all, sep='\t', header=None,names=["index", "s1", "s2", "label"])
    data_test_df = pd.read_csv(project.data_dir + test_all, sep='\t', header=None, names=["index", "s1", "s2", "label"])
    # load the stop-word list
    stopwords = load_stopwordslist(project.aux_dir + stop_words_path)
    # load the spelling-correction replacement table
    spelling_corrections = load_spelling_corrections(project.aux_dir + spelling_corrections_path)
    re_object = re.compile(r'\*+') # masked digits "***" get replaced by 十一
    data_df_list = [data_local_df,data_test_df]
    texts = []
    char_vocabs = {}
    char_index = 1
    for data_df in data_df_list:
        for index, row in data_df.iterrows():
            # progress message every 5000 rows
            if index != 0 and index % 5000 == 0:
                print("{:,} sentence word embedding.".format(index))
            # process the two sentences of each row
            for col_name in ["s1", "s2"]:
                # replace the masked digits
                re_str = re_object.subn(u"十一", unicode(row[col_name], 'utf-8'))
                # apply the spelling corrections
                spell_corr_str = transform_other_word(re_str[0], spelling_corrections)
                spell_corr_str = list(spell_corr_str)
                for char in spell_corr_str:
                    if char not in char_vocabs and char not in stopwords and not char.strip()==u"":
                        char_vocabs[char] = char_index
                        char_index = char_index + 1
                # NOTE(review): extend() flattens the chars into `texts`, so
                # Word2Vec sees every character as its own one-char sentence;
                # append(spell_corr_str) may have been intended — confirm.
                texts.extend(spell_corr_str)
    model = word2vec.Word2Vec(sentences=texts,size=300,window=3,workers=2)
    model.wv.save_word2vec_format(fname=project.aux_dir + "train_char_all__data.bigram",binary=binary,fvocab=None)
    project.save(project.preprocessed_data_dir + 'train_all_char_vocabs.pickle', char_vocabs)
if __name__ == '__main__':
    # step 1: preprocess the text
    jieba.load_userdict(project.aux_dir + dict_path)
    data_local_df = pd.read_csv(project.data_dir + train_all, sep='\t', header=None,names=["index", "s1", "s2", "label"])
    data_test_df = pd.read_csv(project.data_dir + test_all, sep='\t', header=None, names=["index", "s1", "s2", "label"])
    data_all_df = pd.read_csv(project.data_dir + train_data_all, sep='\t', header=None, names=["index", "s1", "s2", "label"])
    # runs before preprocessing() overwrites the s1/s2 columns with
    # segmented text, so the char vectors are trained on the raw sentences
    pre_train_char_w2v()
    #
    preprocessing(data_local_df,'train_0.6_seg')
    preprocessing(data_test_df,'test_0.4_seg')
    preprocessing(data_all_df,'data_all_seg')
    # save the labels
    project.save(project.features_dir + 'y_0.4_test.pickle', data_test_df['label'].tolist())
    project.save(project.features_dir + 'y_0.6_train.pickle', data_local_df['label'].tolist())
    project.save(project.features_dir + 'y_train.pickle', data_all_df['label'].tolist())
    # step 2: train word-level vectors on the segmented corpus
    pre_train_w2v()
    # step 3: build the word embedding matrix and the padded id arrays
    process_save_embedding_wv('train_all_w2v_embedding_matrix.pickle',type=2,isStore_ids=True)
    # process_save_embedding_wv('zhihu_w2v_embedding_matrix.pickle',type=2,isStore_ids=False)
    # process_save_embedding_wv('zhihu_w2v_embedding_matrix.pickle',type=3,isStore_ids=False)
    # step 4: char-level embedding matrix and padded char id arrays
    process_save_char_embedding_wv(isStore_ids=True)
================================================
FILE: Financial_NLP/final_demo/extract_feature.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/6/15 下午8:12
# @Author : ComeOnJian
# @File : extract_feature.py
##
import pandas as pd
from train_model import *
from data_prepare import *
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.models import load_model
import gc
def before_extract_feature_load_data(train_file,test_file):
    """
    Read the tab-separated, header-less train and test csv files.

    :param train_file: path of the training csv
    :param test_file: path of the test csv
    :return: (train_df, test_df), each with columns
             ["index", "s1", "s2", "label"]
    """
    column_names = ["index", "s1", "s2", "label"]
    train_data = pd.read_csv(train_file, sep='\t', header=None, names=column_names)
    test_data = pd.read_csv(test_file, sep='\t', header=None, names=column_names)
    return train_data, test_data
def after_extract_feature_save_data(feature_train,feature_test,col_names,feature_name):
    # Thin wrapper: persist the train/test feature arrays together with
    # their column names under *feature_name* via the project helper.
    project.save_features(feature_train, feature_test, col_names, feature_name)
#################### 深度学习特征提取 ####################
def extract_feature_siamese_lstm_manDist():
    """
    Extract features with a siamese LSTM + Manhattan-distance model.

    Runs 5-fold stratified CV.  Each fold trains on the fold's train split
    (augmented with swapped positive pairs) and writes its out-of-fold
    predictions (two softmax columns) into y_train_oofp; the fold-0 model
    is then reloaded to predict the test split.  Both arrays are saved as
    feature columns "dl_siamese_lstm_manDist_0/1".
    """
    # configuration
    embedding_matrix_file_path = 'train_all_w2v_embedding_matrix.pickle'
    feature_name = 'dl_siamese_lstm_manDist'
    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 40
    num_folds = 5
    batch_size = 512
    # load the embedding matrix
    embedding_matrix = project.load(project.aux_dir + embedding_matrix_file_path)
    # load the padded input id sequences
    X_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_ids_pad.pickle')
    X_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_ids_pad.pickle')
    X_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_ids_pad.pickle')
    X_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_ids_pad.pickle')
    # y_0.6_train.pickle stores a plain list
    y_train = np.array(project.load(project.features_dir + 'y_0.6_train.pickle'))
    y_val = np.array(project.load(project.features_dir + 'y_0.4_test.pickle'))
    # model hyper-parameters
    model_param = {
        'lstm_units':50,
        'lstm_dropout_rate':0.,
        'lstm_re_dropout_rate':0.,
        'desen_dropout_rate':0.75,
        'num_dense':128
    }
    # model_checkpoint_path = project.temp_dir + 'fold-checkpoint-'+feature_name + '.h5'
    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=RANOD_SEED
    )
    # holders for the out-of-fold / test predictions (2 softmax columns)
    y_train_oofp = np.zeros((len(y_train),2),dtype='float64')
    y_test_oofp = np.zeros((len(X_test_s1),2),dtype='float64')
    train_y = to_categorical(y_train, 2)
    val_y = to_categorical(y_val,2)
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1,y_train)):
        # augmentation: positive pairs are duplicated with s1/s2 swapped
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = train_y[ix_train][train_true_mask]
        # append the swapped positives to the fold's train data
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train],X_train_true_s2])
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train],X_train_true_s1])
        y_add_train_fold = np.concatenate([train_y[ix_train],y_train_true])
        val_true_mask = y_train[ix_val]==1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = train_y[ix_val][val_true_mask]
        # same augmentation for the fold's validation data
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([train_y[ix_val], y_val_true])
        print 'start train fold {} of {} ......'.format((fold_num + 1), 5)
        # build the model
        model = create_siamese_lstm_ManDistance_model(embedding_matrix, model_param)
        # train with early stopping; best weights checkpointed per fold
        model_checkpoint_path = project.trained_model_dir + 'dl_siamese_lstm_manDist_model{}.h5'.format(fold_num)
        model.fit(x=[X_add_train_fold_s1,X_add_train_fold_s2],y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1,X_add_val_fold_s2],y_add_val_fold),
                  batch_size=batch_size,
                  epochs=nepoch,
                  verbose=1,
                  class_weight={0: 1, 1: 2},
                  callbacks=[
                      EarlyStopping(
                          monitor='val_loss',
                          min_delta=0.005,
                          patience=5,
                          verbose=1,
                          mode='auto'
                      ),
                      ModelCheckpoint(
                          model_checkpoint_path,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False,
                          verbose=1
                      )]
                  )
        # restore the best checkpoint, predict the raw (un-augmented) fold
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model,X_train_s1[ix_val],X_train_s2[ix_val])
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()
    # save feature: test predictions come from the fold-0 model only
    model_path = project.trained_model_dir + 'dl_siamese_lstm_manDist_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={'ManDist': ManDist, 'fbeta_score': fbeta_score, 'precision': precision,
                                        'recall': recall})
    y_test_oofp = predict(model0,X_test_s1,X_test_s2)
    col_names = ['{}_{}'.format(feature_name,index) for index in range(2)]
    after_extract_feature_save_data(y_train_oofp,y_test_oofp,col_names,feature_name)
def extract_feature_siamese_lstm_attention():
    """
    Extract features with a siamese LSTM + attention model.

    Same 5-fold CV scheme as extract_feature_siamese_lstm_manDist, but the
    labels are used directly (single output column, no one-hot) and the
    fold-0 model predicts the test split.  Saved under the feature name
    "dl_siamese_lstm_attention".
    """
    # configuration
    embedding_matrix_file_path = 'train_all_w2v_embedding_matrix.pickle'
    feature_name = 'dl_siamese_lstm_attention'
    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 50
    num_folds = 5
    batch_size = 512
    # load the embedding matrix
    embedding_matrix = project.load(project.aux_dir + embedding_matrix_file_path)
    # load the padded input id sequences
    X_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_ids_pad.pickle')
    X_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_ids_pad.pickle')
    X_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_ids_pad.pickle')
    X_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_ids_pad.pickle')
    # y_0.6_train.pickle stores a plain list
    y_train = np.array(project.load(project.features_dir + 'y_0.6_train.pickle'))
    y_val = np.array(project.load(project.features_dir + 'y_0.4_test.pickle'))
    # model hyper-parameters
    model_param = {
        'lstm_units':50,
        'lstm_dropout_rate':0.,
        'lstm_re_dropout_rate':0.,
        'desen_dropout_rate':0.75,
        'num_dense':128
    }
    model_checkpoint_path = project.temp_dir + 'fold-checkpoint-'+feature_name + '.h5'
    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=RANOD_SEED
    )
    # holders for the out-of-fold / test predictions (single column)
    # y_train_oofp = np.zeros_like(y_train,dtype='float64')
    y_train_oofp = np.zeros((len(y_train),1),dtype='float64')
    y_test_oofp = np.zeros((len(X_test_s1),1),dtype='float64')
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1,y_train)):
        # augmentation: positive pairs are duplicated with s1/s2 swapped
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = y_train[ix_train][train_true_mask]
        # append the swapped positives to the fold's train data
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train],X_train_true_s2])
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train],X_train_true_s1])
        y_add_train_fold = np.concatenate([y_train[ix_train],y_train_true])
        val_true_mask = y_train[ix_val]==1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = y_train[ix_val][val_true_mask]
        # same augmentation for the fold's validation data
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([y_train[ix_val], y_val_true])
        print 'start train fold {} of {} ......'.format((fold_num + 1), 5)
        # build the model
        model = create_siamese_lstm_attention_model(embedding_matrix, model_param)
        # train with early stopping; best weights checkpointed per fold
        model_checkpoint_path = project.trained_model_dir + 'dl_siamese_lstm_attention_model{}.h5'.format(fold_num)
        model.fit(x=[X_add_train_fold_s1,X_add_train_fold_s2],y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1,X_add_val_fold_s2],y_add_val_fold),
                  batch_size=batch_size,
                  epochs=nepoch,
                  verbose=1,
                  class_weight={0: 1, 1: 2},
                  callbacks=[
                      EarlyStopping(
                          monitor='val_loss',
                          min_delta=0.005,
                          patience=5,
                          verbose=1,
                          mode='auto'
                      ),
                      ModelCheckpoint(
                          model_checkpoint_path,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False,
                          verbose=1
                      )]
                  )
        # restore the best checkpoint, predict the raw (un-augmented) fold
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model,X_train_s1[ix_val],X_train_s2[ix_val])
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()
    # test predictions come from the fold-0 model only
    model_path = project.trained_model_dir + 'dl_siamese_lstm_attention_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={'AttentionLayer1': AttentionLayer1, 'fbeta_score': fbeta_score, 'precision': precision,
                                        'recall': recall})
    y_test_oofp = predict(model0, X_test_s1, X_test_s2)
    col_names = ['{}_{}'.format(feature_name,index) for index in range(1)]
    after_extract_feature_save_data(y_train_oofp,y_test_oofp,col_names,feature_name)
def extract_feature_siamese_lstm_dssm():
    """
    Extract features with a siamese LSTM DSSM model that consumes both
    word-level and char-level padded id sequences.

    Same 5-fold CV scheme as the other extract_* functions; the swapped
    positive-pair augmentation is applied to the word and the char inputs
    in parallel.  Out-of-fold predictions plus fold-0 test predictions are
    saved under the feature name "dl_siamese_lstm_dssm".
    """
    # configuration
    embedding_matrix_file_path = 'train_all_w2v_embedding_matrix.pickle'
    embedding_char_matrix_file_path = 'train_all_char_embedding_matrix.pickle'
    feature_name = 'dl_siamese_lstm_dssm'
    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 30
    num_folds = 5
    batch_size = 512
    # load the word- and char-level embedding matrices
    embedding_matrix = project.load(project.aux_dir + embedding_matrix_file_path)
    char_embedding_matrix = project.load(project.aux_dir + embedding_char_matrix_file_path)
    # load the padded input id sequences (word level)
    X_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_ids_pad.pickle')
    X_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_ids_pad.pickle')
    print X_train_s2.shape
    X_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_ids_pad.pickle')
    X_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_ids_pad.pickle')
    # padded input id sequences (char level)
    X_char_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_char_ids_pad.pickle')
    X_char_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_char_ids_pad.pickle')
    X_char_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_char_ids_pad.pickle')
    X_char_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_char_ids_pad.pickle')
    # y_0.6_train.pickle stores a plain list
    y_train = np.array(project.load(project.features_dir + 'y_0.6_train.pickle'))
    y_val = np.array(project.load(project.features_dir + 'y_0.4_test.pickle'))
    # train_y = to_categorical(y_train, 2)
    # val_y = to_categorical(y_val,2)
    # model hyper-parameters
    model_param = {
        'lstm_units': 50,
        'lstm_dropout_rate': 0.,
        'lstm_re_dropout_rate': 0.,
        'desen_dropout_rate': 0.75,
        'num_dense': 128
    }
    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=RANOD_SEED
    )
    # holders for the out-of-fold / test predictions (single column)
    # y_train_oofp = np.zeros_like(y_train,dtype='float64')
    y_train_oofp = np.zeros((len(y_train), 1), dtype='float64')
    y_test_oofp = np.zeros((len(X_test_s1), 1), dtype='float64')
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1, y_train)):
        # augmentation: positive pairs are duplicated with s1/s2 swapped
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = y_train[ix_train][train_true_mask]
        # append the swapped positives to the fold's word-level train data
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train], X_train_true_s2])
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train], X_train_true_s1])
        y_add_train_fold = np.concatenate([y_train[ix_train], y_train_true])
        X_train_true_s1_char = X_char_train_s1[ix_train][train_true_mask]
        X_train_true_s2_char = X_char_train_s2[ix_train][train_true_mask]
        # same augmentation for the char-level train data
        X_add_train_fold_s1_char = np.vstack([X_char_train_s1[ix_train], X_train_true_s2_char])
        X_add_train_fold_s2_char = np.vstack([X_char_train_s2[ix_train], X_train_true_s1_char])
        # validation part
        val_true_mask = y_train[ix_val] == 1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = y_train[ix_val][val_true_mask]
        # augmented word-level validation data
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([y_train[ix_val], y_val_true])
        X_val_true_s1_char = X_char_train_s1[ix_val][val_true_mask]
        X_val_true_s2_char = X_char_train_s2[ix_val][val_true_mask]
        X_add_val_fold_s1_char = np.vstack([X_char_train_s1[ix_val], X_val_true_s2_char])
        X_add_val_fold_s2_char = np.vstack([X_char_train_s2[ix_val], X_val_true_s1_char])
        print 'start train fold {} of {} ......'.format((fold_num + 1), 5)
        # build the model
        model = create_siamese_lstm_dssm_mdoel(embedding_matrix,char_embedding_matrix, model_param)
        # train with early stopping; best weights checkpointed per fold
        model_checkpoint_path = project.trained_model_dir + 'dl_siamese_lstm_dssm_model{}.h5'.format(fold_num)
        model.fit(x=[X_add_train_fold_s1, X_add_train_fold_s2,X_add_train_fold_s1_char,X_add_train_fold_s2_char], y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1, X_add_val_fold_s2,X_add_val_fold_s1_char,X_add_val_fold_s2_char], y_add_val_fold),
                  batch_size=batch_size,
                  epochs=nepoch,
                  class_weight={0:1,1:2},
                  verbose=1,
                  callbacks=[
                      EarlyStopping(
                          monitor='val_loss',
                          min_delta=0.001,
                          patience=3,
                          verbose=1,
                          mode='auto'
                      ),
                      ModelCheckpoint(
                          model_checkpoint_path,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False,
                          verbose=1
                      )]
                  )
        # restore the best checkpoint, predict the raw (un-augmented) fold
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict1(model, X_train_s1[ix_val], X_train_s2[ix_val],X_char_train_s1[ix_val],X_char_train_s2[ix_val])
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()
    # test predictions come from the fold-0 model only
    model_path = project.trained_model_dir + 'dl_siamese_lstm_dssm_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={'AttentionLayer': AttentionLayer,'ManDist': ManDist,'ConsDist':ConsDist, 'fbeta_score': fbeta_score,
                                        'precision': precision,
                                        'recall': recall})
    y_test_oofp = predict1(model0, X_test_s1, X_test_s2,X_char_test_s1,X_char_test_s2)
    col_names = ['{}_{}'.format(feature_name, index) for index in range(1)]
    after_extract_feature_save_data(y_train_oofp, y_test_oofp, col_names, feature_name)
def extract_feature_siamese_lstm_manDist_char():
    """Extract 5-fold out-of-fold deep similarity features.

    NOTE(review): despite the 'manDist_char' feature name, the model trained
    here is create_abcnn_model and the checkpoints are saved as
    'dl_abcnn_model*' -- confirm the feature/model naming mismatch is
    intentional.

    Relies on module-level globals: X_train_s1/X_train_s2, y_train,
    X_test_s1/X_test_s2, embedding_matrix, project, predict() and
    after_extract_feature_save_data().
    """
    feature_name = 'dl_siamese_lstm_manDist_char'
    # NOTE(review): assigned but never read below -- dead variable?
    embedding_char_matrix_file_path = 'train_all_char_embedding_matrix.pickle'
    nb_filter = 300
    filter_width = [4, 3]
    # Out-of-fold predictions for the train set and predictions for test.
    y_train_oofp = np.zeros((len(y_train), 1), dtype='float64')
    y_test_oofp = np.zeros((len(X_test_s1), 1), dtype='float64')
    kfold = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=44
    )
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1,y_train)):
        # Select the positive (label == 1) pairs to augment with.
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = y_train[ix_train][train_true_mask]
        # Augment: similarity is symmetric, so add each positive pair swapped.
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train],X_train_true_s2])
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train],X_train_true_s1])
        y_add_train_fold = np.concatenate([y_train[ix_train],y_train_true])
        val_true_mask = y_train[ix_val]==1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = y_train[ix_val][val_true_mask]
        # Same swap augmentation for the validation split.
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([y_train[ix_val], y_val_true])
        print 'start train fold {} of {} ......'.format((fold_num + 1), 5)
        # Build a fresh model for this fold.
        model = create_abcnn_model(embedding_matrix,nb_filter,filter_width)
        # Train with early stopping, checkpointing the best val_loss weights.
        model_checkpoint_path = project.trained_model_dir + 'dl_abcnn_model{}.h5'.format(fold_num)
        model.fit(x=[X_add_train_fold_s1,X_add_train_fold_s2],y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1,X_add_val_fold_s2],y_add_val_fold),
                  batch_size=512,
                  epochs=30,
                  verbose=1,
                  class_weight={0: 1, 1: 2},  # up-weight the positive class
                  callbacks=[
                      EarlyStopping(
                          monitor='val_loss',
                          min_delta=0.005,
                          patience=5,
                          verbose=1,
                          mode='auto'
                      ),
                      ModelCheckpoint(
                          model_checkpoint_path,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False,
                          verbose=1
                      )]
                  )
        # Restore the best checkpoint before predicting on the held-out fold.
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model,X_train_s1[ix_val],X_train_s2[ix_val])
        K.clear_session()
        # Free the augmented fold arrays before the next iteration.
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()
    # NOTE(review): test predictions come from fold 0's model only,
    # not an average over the five folds.
    model_path = project.trained_model_dir + 'dl_abcnn_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={'fbeta_score': fbeta_score, 'precision': precision,
                                        'recall': recall})
    y_test_oofp = predict(model0, X_test_s1, X_test_s2)
    col_names = ['{}_{}'.format(feature_name,index) for index in range(1)]
    after_extract_feature_save_data(y_train_oofp,y_test_oofp,col_names,feature_name)
#################### NLP特征提取 ####################
def extract_sentece_length_diff():
    """
    Length-difference feature: 1 - |len(s1) - len(s2)| / max(len(s1), len(s2)),
    computed over whitespace-separated tokens of the segmented sentences.
    """
    # Name under which the feature column group is stored.
    feature_name = 'nlp_sentece_length_diff'
    # Load the segmented train / test data.
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.preprocessed_data_dir + 'train_0.6_seg.csv',
        test_file=project.preprocessed_data_dir + 'test_0.4_seg.csv')
    feature_train = np.zeros((train_data.shape[0], 1), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], 1), dtype='float64')

    def token_length_sim(tokens_a, tokens_b):
        # Normalised token-count similarity in [0, 1].
        longest = float(max(len(tokens_a), len(tokens_b)))
        return 1 - abs(len(tokens_a) - len(tokens_b)) / longest

    # Fill the train and test feature arrays with the same logic.
    for frame, target in ((train_data, feature_train), (test_data, feature_test)):
        for row_ix, record in frame.iterrows():
            tokens_a = record['s1'].strip().split(' ')
            tokens_b = record['s2'].strip().split(' ')
            target[row_ix] = round(token_length_sim(tokens_a, tokens_b), 5)
    # Persist: train features, test features, column names, feature name.
    col_names = [feature_name]
    after_extract_feature_save_data(feature_train, feature_test, col_names, feature_name)
def extract_edit_distance():
    """Levenshtein-based similarity feature: 1 - edit_distance / max(len)."""
    # step1 feature-group name
    feature_name = 'nlp_edit_distance'
    # step2 load the raw (unsegmented) data
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.data_dir + 'atec_nlp_sim_train_0.6.csv',
        test_file=project.data_dir + 'atec_nlp_sim_test_0.4.csv')
    feature_train = np.zeros((train_data.shape[0], 1), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], 1), dtype='float64')

    def get_edit_distance(rawq1, rawq2):
        """Return 1 - levenshtein(rawq1, rawq2) / max(len) in [0, 1]."""
        # ROBUSTNESS FIX: two empty strings are identical; this also avoids
        # the ZeroDivisionError the original raised on that input.
        if not rawq1 and not rawq2:
            return 1.0
        m, n = len(rawq1) + 1, len(rawq2) + 1
        # Standard DP table; first row/column are the trivial distances.
        # (Dropped the dead 'matrix[0][0] = 0' and 'cost = 0' pre-inits.)
        matrix = [[0] * n for _ in range(m)]
        for i in range(1, m):
            matrix[i][0] = i
        for j in range(1, n):
            matrix[0][j] = j
        for i in range(1, m):
            for j in range(1, n):
                cost = 0 if rawq1[i - 1] == rawq2[j - 1] else 1
                matrix[i][j] = min(matrix[i - 1][j] + 1,
                                   matrix[i][j - 1] + 1,
                                   matrix[i - 1][j - 1] + cost)
        return 1 - matrix[m - 1][n - 1] / float(max(len(rawq1), len(rawq2)))

    for index, row in train_data.iterrows():
        feature_train[index] = round(get_edit_distance(row['s1'].strip(), row['s2'].strip()), 5)
    for index, row in test_data.iterrows():
        feature_test[index] = round(get_edit_distance(row['s1'].strip(), row['s2'].strip()), 5)
    # step3 persist: train features, test features, column names, feature name
    col_names = [feature_name]
    after_extract_feature_save_data(feature_train, feature_test, col_names, feature_name)
def extract_ngram(max_ngram = 3):
    '''
    Extract character n-gram overlap similarity features for n = 1..max_ngram.
    :param max_ngram: highest n-gram order; one feature column per order
    :return: None -- features are persisted via after_extract_feature_save_data
    '''
    # step1 feature-group name
    feature_name = 'nlp_ngram'
    # step2 load the segmented data
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.preprocessed_data_dir + 'train_0.6_seg.csv',
        test_file=project.preprocessed_data_dir + 'test_0.4_seg.csv')
    feature_train = np.zeros((train_data.shape[0], max_ngram), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], max_ngram), dtype='float64')

    def get_ngram(rawq, ngram_value):
        """All contiguous substrings of rawq with length ngram_value."""
        result = []
        for i in range(len(rawq)):
            if i + ngram_value < len(rawq) + 1:
                result.append(rawq[i:i + ngram_value])
        return result

    def get_ngram_sim(q1_ngram, q2_ngram):
        """Similarity: 1 - (unmatched n-gram mass) / (total n-gram mass)."""
        q1_dict = {}
        q2_dict = {}
        for token in q1_ngram:
            if token not in q1_dict:
                q1_dict[token] = 1
            else:
                q1_dict[token] = q1_dict[token] + 1
        q1_count = np.sum([value for key, value in q1_dict.items()])
        for token in q2_ngram:
            if token not in q2_dict:
                q2_dict[token] = 1
            else:
                q2_dict[token] = q2_dict[token] + 1
        q2_count = np.sum([value for key, value in q2_dict.items()])
        # n-grams present only in q1
        q1_count_only = np.sum([value for key, value in q1_dict.items() if key not in q2_dict])
        # n-grams present only in q2
        q2_count_only = np.sum([value for key, value in q2_dict.items() if key not in q1_dict])
        # shared n-grams: absolute difference of the counts
        q1_q2_count = np.sum([abs(value - q2_dict[key]) for key, value in q1_dict.items() if key in q2_dict])
        # total n-gram mass on both sides; epsilon guards empty inputs
        all_count = q1_count + q2_count
        return (1 - float(q1_count_only + q2_count_only + q1_q2_count) / (float(all_count) + 0.00000001))

    for ngram_value in range(max_ngram):
        for index, row in train_data.iterrows():
            s1 = row['s1'].strip()
            s2 = row['s2'].strip()
            ngram1 = get_ngram(s1, ngram_value + 1)
            ngram2 = get_ngram(s2, ngram_value + 1)
            ngram_sim = get_ngram_sim(ngram1, ngram2)
            feature_train[index, ngram_value] = round(ngram_sim, 5)
        for index, row in test_data.iterrows():
            s1 = row['s1'].strip()
            s2 = row['s2'].strip()
            ngram1 = get_ngram(s1, ngram_value + 1)
            ngram2 = get_ngram(s2, ngram_value + 1)
            # FIX: the similarity was previously computed twice on this path;
            # the duplicated call was pure redundant work (result unchanged).
            ngram_sim = get_ngram_sim(ngram1, ngram2)
            feature_test[index, ngram_value] = round(ngram_sim, 5)
    # step3 persist: train features, test features, column names, group name
    col_names = [('{}_{}'.format(feature_name,ngram_value))for ngram_value in range(max_ngram)]
    after_extract_feature_save_data(feature_train,feature_test,col_names,feature_name)
def extract_sentence_diff_same():
    '''
    Word-overlap features between the two sentences: shared-word ratios
    (vs max / min / average set size), each sentence's unique-word ratio,
    and the Jaccard similarity -- 6 columns in total.
    '''
    # step1 feature-group name
    feature_name = 'nlp_sentece_diff_some'
    col_num = 6
    # step2 load the segmented data
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.preprocessed_data_dir + 'train_0.6_seg.csv',
        test_file=project.preprocessed_data_dir + 'test_0.4_seg.csv')
    feature_train = np.zeros((train_data.shape[0], col_num), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], col_num), dtype='float64')

    def get_word_diff(q1, q2):
        """Return the 6 overlap statistics for the two token sets."""
        set1 = set(q1.split(" "))
        set2 = set(q2.split(" "))
        same_word_len = len(set1 & set2)
        unique_word1_len = len(set1 - set2)
        unique_word2_len = len(set2 - set1)
        word1_len = len(set1)
        word2_len = len(set2)
        avg_len = (word1_len + word2_len) / 2.0
        max_len = max(word1_len, word2_len)
        min_len = min(word1_len, word2_len)
        jaccard_sim = same_word_len / float(len(set1 | set2))
        return same_word_len / float(max_len), same_word_len / float(min_len), same_word_len / float(avg_len), \
               unique_word1_len / float(word1_len), unique_word2_len / float(word2_len), jaccard_sim

    # IDIOM FIX: dropped the dead "features = tuple()" pre-assignments that
    # were immediately overwritten in the original loops.
    for index, row in train_data.iterrows():
        features = get_word_diff(row['s1'].strip(), row['s2'].strip())
        for col_index, feature in enumerate(features):
            feature_train[index, col_index] = round(feature, 5)
    for index, row in test_data.iterrows():
        features = get_word_diff(row['s1'].strip(), row['s2'].strip())
        for col_index, feature in enumerate(features):
            feature_test[index, col_index] = round(feature, 5)
    # step3 persist: train features, test features, column names, group name
    col_names = [('{}_{}'.format(feature_name, col_index)) for col_index in range(col_num)]
    after_extract_feature_save_data(feature_train, feature_test, col_names, feature_name)
def extract_doubt_sim():
    '''
    Overlap of the interrogative (doubt) words used by the two sentences:
    |Q1 & Q2| / (|Q1 | Q2| + 1).
    '''
    # step1 feature-group name
    feature_name = 'nlp_doubt_sim'
    # step2 load the segmented data
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.preprocessed_data_dir + 'train_0.6_seg.csv',
        test_file=project.preprocessed_data_dir + 'test_0.4_seg.csv')
    feature_train = np.zeros((train_data.shape[0], 1), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], 1), dtype='float64')
    doubt_words = load_doubt_words(project.aux_dir + 'doubt_words.txt')

    def doubt_overlap(sent_a, sent_b, vocab):
        # Restrict each sentence to the doubt-word vocabulary first.
        hits_a = set(sent_a.split(" ")) & set(vocab)
        hits_b = set(sent_b.split(" ")) & set(vocab)
        # The +1 keeps the denominator non-zero when neither sentence matches.
        return len(hits_a & hits_b) / float(len(hits_a | hits_b) + 1)

    for frame, target in ((train_data, feature_train), (test_data, feature_test)):
        for row_ix, record in frame.iterrows():
            # The vocabulary is unicode, so decode the raw utf-8 sentences.
            sent_a = record['s1'].strip().decode('utf-8')
            sent_b = record['s2'].strip().decode('utf-8')
            target[row_ix] = round(doubt_overlap(sent_a, sent_b, doubt_words), 5)
    col_names = [feature_name]
    after_extract_feature_save_data(feature_train, feature_test, col_names, feature_name)
def extract_sentence_exist_topic():
    """
    Binary topic co-occurrence features: column 0 is 1.0 when both sentences
    mention Huabei ('花呗'), column 1 is 1.0 when both mention Jiebei
    ('借呗'); otherwise 0.0.
    :return:
    """
    # step1 feature-group name
    feature_name = 'nlp_sentece_exist_topic'
    # step2 load the raw (unsegmented) data
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.data_dir + 'atec_nlp_sim_train_0.6.csv',
        test_file=project.data_dir + 'atec_nlp_sim_test_0.4.csv')
    feature_train = np.zeros((train_data.shape[0], 2), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], 2), dtype='float64')

    def shared_topics(sent_a, sent_b):
        # 1.0 only when the topic word occurs in BOTH sentences.
        topic_hua = 1. if ('花呗' in sent_a and '花呗' in sent_b) else 0.
        topic_jie = 1. if ('借呗' in sent_a and '借呗' in sent_b) else 0.
        return topic_hua, topic_jie

    for frame, target in ((train_data, feature_train), (test_data, feature_test)):
        for row_ix, record in frame.iterrows():
            topic_hua, topic_jie = shared_topics(record['s1'].strip(),
                                                 record['s2'].strip())
            target[row_ix, 0] = topic_hua
            target[row_ix, 1] = topic_jie
    col_names = ['nlp_sentece_exist_topic_hua_flag','nlp_sentece_exist_topic_jie_flag']
    after_extract_feature_save_data(feature_train,feature_test,col_names,feature_name)
def extract_word_embedding_sim(w2v_model_path = 'train_all_data.bigram'):
    '''
    Cosine similarity between the (optionally tf-idf weighted) summed
    word-vector representations of the two sentences.
    :param w2v_model_path: word2vec text-format file under project.aux_dir
    :return: None -- features are persisted via after_extract_feature_save_data
    '''
    # step1 feature-group name
    feature_name = 'nlp_word_embedding_sim'
    # step2 load the segmented data
    train_data, test_data = before_extract_feature_load_data(
        train_file=project.preprocessed_data_dir + 'train_0.6_seg.csv',
        test_file=project.preprocessed_data_dir + 'test_0.4_seg.csv')
    feature_train = np.zeros((train_data.shape[0], 1), dtype='float64')
    feature_test = np.zeros((test_data.shape[0], 1), dtype='float64')
    train_all_w2v_model = KeyedVectors.load_word2vec_format(project.aux_dir + w2v_model_path, binary=False)

    def get_sen_vec(q, w2v_model, tfidf_dict, tfidf_flag=True):
        """Sum of the sentence's word vectors, L2-normalised.

        Words missing from the embedding vocabulary are skipped. BUG FIX:
        the original used tfidf_dict.get(word, None), so the weighted branch
        raised TypeError (None * vector) for any word absent from
        tfidf_dict; missing words now fall back to weight 1.0. Callers in
        this module pass tfidf_flag=False, so their results are unchanged.
        """
        sen_vec = 0
        for word in q.split(' '):
            if word in w2v_model.vocab:
                word_vec = w2v_model.word_vec(word)
                if tfidf_flag == True:
                    sen_vec += word_vec * tfidf_dict.get(word, 1.0)
                else:
                    sen_vec += word_vec
        # Normalise; the epsilon guards the all-out-of-vocab (zero) case.
        sen_vec = sen_vec / np.sqrt(np.sum(np.power(sen_vec, 2)) + 0.000001)
        return sen_vec

    def get_sentece_embedding_sim(q1, q2, w2v_model, tfidf_dict, tfidf_flag=True):
        """Cosine similarity of the two sentence vectors."""
        q1_sec = get_sen_vec(q1, w2v_model, tfidf_dict, tfidf_flag)
        q2_sec = get_sen_vec(q2, w2v_model, tfidf_dict, tfidf_flag)
        molecular = np.sum(np.multiply(q1_sec, q2_sec))
        denominator = np.sqrt(np.sum(np.power(q1_sec, 2))) * np.sqrt(np.sum(np.power(q2_sec, 2)))
        return molecular / (denominator + 0.000001)

    for index, row in train_data.iterrows():
        # The w2v vocabulary is unicode, so decode the raw utf-8 text.
        s1 = row['s1'].strip().decode('utf-8')
        s2 = row['s2'].strip().decode('utf-8')
        feature_train[index] = round(get_sentece_embedding_sim(s1, s2, train_all_w2v_model, {}, False), 5)
    for index, row in test_data.iterrows():
        s1 = row['s1'].strip().decode('utf-8')
        s2 = row['s2'].strip().decode('utf-8')
        feature_test[index] = round(get_sentece_embedding_sim(s1, s2, train_all_w2v_model, {}, False), 5)
    col_names = [feature_name]
    after_extract_feature_save_data(feature_train, feature_test, col_names, feature_name)
if __name__ == '__main__':
    # Deep-learning features (the commented extractors are alternatives).
    # extract_feature_siamese_lstm_manDist()
    # extract_feature_siamese_lstm_attention()
    extract_feature_siamese_lstm_dssm()
    # extract_feature_abcnn()
    # Hand-crafted NLP features.
    extract_sentece_length_diff()
    extract_edit_distance()
    extract_ngram()
    extract_sentence_diff_same()
    extract_doubt_sim()
    extract_sentence_exist_topic()
    extract_word_embedding_sim()
    # model_path = project.trained_model_dir + 'dl_siamese_lstm_dssm_model0.h5'
    # atten1 = AttentionLayer(20)
    # atten2 = AttentionLayer(25)
    #
    # model0 = load_model(model_path,
    #                     custom_objects={'AttentionLayer':AttentionLayer,'fbeta_score': fbeta_score,
    #                                     'precision': precision,
    #                                     'recall': recall})
    pass
================================================
FILE: Financial_NLP/final_demo/main.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/6/15 下午10:49
# @Author : ComeOnJian
# @File : main.py
from data_prepare import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from train_model import *
def star_process(X_train,y_train,X_test,y_test):
    """Train a stratified k-fold LightGBM ensemble and report test metrics.

    :param X_train: train feature matrix
    :param y_train: train labels (0/1)
    :param X_test: test feature matrix
    :param y_test: test labels (0/1)

    Prints fold losses, then accuracy and F1 of the averaged, 0.5-thresholded
    test predictions.
    """
    # FIX: num_folds was previously defined but unused while 5 was
    # hard-coded below; it now drives the CV split, the prediction matrix
    # and the log messages consistently.
    num_folds = 5
    rand_seed = 456
    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=rand_seed
    )
    lgb_param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 1,  # use every feature at each iteration
        'num_leaves': 16,
        'learning_rate': 0.01,
        'verbose': 1,
        'bagging_seed': rand_seed,
        'feature_fraction_seed': rand_seed
    }
    # One column of test predictions per fold; averaged afterwards.
    y_test_pred = np.zeros((len(X_test), num_folds))
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X=X_train, y=y_train)):
        # Per-fold train / validation split.
        X_fold_train = X_train[ix_train]
        X_fold_val = X_train[ix_val]
        y_fold_train = y_train[ix_train]
        y_fold_val = y_train[ix_val]
        print('train fold {} of {} ......'.format((fold_num + 1), num_folds))
        lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
        lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)
        evals_res = {}
        model = lgb.train(
            params=lgb_param,
            train_set=lgb_data_train,
            valid_sets=[lgb_data_train, lgb_data_val],  # monitor both splits
            valid_names=['train', 'val'],
            evals_result=evals_res,
            num_boost_round=2500,
            early_stopping_rounds=10,
            verbose_eval=False,
        )
        fold_train_score = evals_res['train'][lgb_param['metric']]
        fold_val_score = evals_res['val'][lgb_param['metric']]
        print('fold {}: {} rounds ,train loss {:.6f}, val loss {:.6f}'.format(
            (fold_num + 1),
            len(fold_train_score),
            fold_train_score[-1],
            fold_val_score[-1]
        ))
        y_test_pred[:, fold_num] = model.predict(X_test).reshape(-1)
    # Average the fold predictions, then threshold at 0.5.
    # (Removed the leftover debug shape prints.)
    y_test_p = np.mean(y_test_pred, axis=1)
    for index, pre in enumerate(y_test_p):
        if pre >= 0.5:
            y_test_p[index] = 1
        else:
            y_test_p[index] = 0
    print(accuracy_score(y_test, y_test_p))
    print(f1_score(y_test, y_test_p))
if __name__ == '__main__':
    # step1 the hand-picked feature groups to stack into the model input
    feature_names_list = [
        'dl_siamese_lstm_manDist',
        'dl_siamese_lstm_dssm',
        'dl_siamese_lstm_attention',
        'nlp_sentece_length_diff',
        'nlp_edit_distance',
        'nlp_ngram',
        'nlp_sentece_diff_some',
        'nlp_doubt_sim',
        'nlp_sentece_exist_topic',
        'nlp_word_embedding_sim'
    ]
    # Load the concatenated feature matrices and the per-group column spans.
    df_train,df_test,feature_index_ix = project.load_feature_lists(feature_names_list)
    # Inspect which columns belong to which feature group.
    feature_view_df = pd.DataFrame(feature_index_ix, columns=['feature_name', 'start_index', 'end_index'])
    print feature_view_df
    print df_train.head(20)
    print df_train.tail(20)
    y_train = np.array(project.load(project.features_dir + 'y_0.6_train.pickle'))
    y_test = pd.read_csv(project.data_dir + 'atec_nlp_sim_test_0.4.csv', sep='\t', header=None,
                         names=["index", "s1", "s2", "label"])['label'].values.reshape((-1))
    X_test = df_test.values
    X_train = df_train.values
    # star_process(X_train,y_train,X_test,y_test)
    #
    #
    # lr = LogisticRegression(class_weight={0:1,1:4})
    # lr.fit(X_train,y_train)
    # y_p = lr.predict(X_test)
    # print y_p[0:20]
    # for index, pre in enumerate(y_p):
    #     if pre >= 0.5:
    #         y_p[index] = 1
    #     else:
    #         y_p[index] = 0
    #
    # print accuracy_score(y_test, y_p)
    # print f1_score(y_test, y_p)
    #
    # clf = RandomForestClassifier(n_estimators=25,
    #                              max_depth=4,
    #
    #                              class_weight={
    #                                  0: 1,
    #                                  1: 4
    #                              }
    #                              )
    # clf.fit(X_train, y_train)
    # y_p1 = clf.predict(X_test)
    # print accuracy_score(y_test, y_p1)
    # print f1_score(y_test, y_p1)
    #
    #
    # gb = GaussianNB()
    # gb.fit(X_train, y_train)
    # y_p2 = gb.predict(X_test)
    # print accuracy_score(y_test, y_p2)
    # print f1_score(y_test, y_p2)
    #
    # dt = DecisionTreeClassifier(class_weight={
    #     0: 1,
    #     1: 4
    # },max_depth=4)
    # dt.fit(X_train, y_train)
    # y_p3 = dt.predict(X_test)
    # print accuracy_score(y_test, y_p3)
    # print f1_score(y_test, y_p3)
    # Stacking level-1 models: each yields out-of-fold train predictions
    # plus test predictions via get_model_out().
    # lgb_cls = LGBClassifier()
    # lgb_oofp_train, lgb_oofp_val = lgb_cls.get_model_out(X_train,y_train,X_test)
    # print lgb_oofp_val[0:25]
    gnb_cls = GussianNBClassifier()
    gnb_oop_train,gnb_oofp_val = gnb_cls.get_model_out(X_train,y_train,X_test)
    print gnb_oofp_val[0:25]
    rf_cls = RFClassifer()
    rf_oop_train, rf_oofp_val = rf_cls.get_model_out(X_train, y_train, X_test)
    print rf_oofp_val[0:25]
    lg_cls = LogisicClassifier()
    lg_oop_train, lg_oofp_val = lg_cls.get_model_out(X_train, y_train, X_test)
    print lg_oofp_val[0:25]
    dt_cls = DecisionClassifier()
    dt_oop_train, dt_oofp_val = dt_cls.get_model_out(X_train, y_train, X_test)
    print dt_oofp_val[0:25]
    # Column-stack the level-1 predictions as the level-2 model's input.
    input_train = [gnb_oop_train,rf_oop_train,lg_oop_train,dt_oop_train]
    input_test = [gnb_oofp_val,rf_oofp_val,lg_oofp_val,dt_oofp_val]
    stacked_train = np.concatenate([data.reshape(-1,1) for data in input_train],axis=1)
    stacked_test = np.concatenate([data.reshape(-1,1) for data in input_test],axis=1)
    # Stacking level-2 model: a shallow decision tree over the stacked input.
    second_model = DecisionTreeClassifier(max_depth=3,class_weight={0: 1, 1: 4})
    second_model.fit(stacked_train,y_train)
    y_test_p = second_model.predict(stacked_test)
    # Threshold the predictions to hard 0/1 labels.
    for index,pre in enumerate(y_test_p):
        if pre >=0.5:
            y_test_p[index] = 1
        else:
            y_test_p[index] = 0
    print accuracy_score(y_test,y_test_p)
    print f1_score(y_test,y_test_p)
================================================
FILE: Financial_NLP/final_demo/train_model.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/6/16 上午9:34
# @Author : ComeOnJian
# @File : train_model.py
from keras.models import Sequential,Model
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding,LSTM,Layer,initializers,regularizers,constraints,Input,\
Dropout,concatenate,BatchNormalization,Dense,Bidirectional,\
Concatenate,Multiply,Maximum,Subtract,Lambda,dot,Flatten,Reshape
from keras import backend as K
from sklearn.model_selection import KFold
import numpy as np
#################### 模型的一些辅助类 ####################
class AttentionLayer(Layer):
    """Weighted-sum attention over timesteps.

    Learns a per-timestep score tanh(x_t . W + b), exponentiates and
    normalises the scores, and returns the attention-weighted sum of the
    timestep features, reducing (batch, steps, features) to
    (batch, features).
    """
    def __init__(self,step_dim,W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # The layer consumes masks itself (see compute_mask).
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        # step_dim: number of timesteps (padded sentence length).
        self.step_dim = step_dim
        self.features_dim = 0
        super(AttentionLayer,self).__init__(**kwargs)
    # Must be overridden once supports_masking is True: the mask is used in
    # call() and not propagated to downstream layers.
    def compute_mask(self, inputs, mask=None):
        return None
    # Weight creation -- required for a custom layer.
    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            # One bias per timestep.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True
    # input: (None, sentence_length, embedding_size)
    def call(self, x, mask = None):
        # Score each timestep: e_t = tanh(x_t . W + b_t).
        features_dim = self.features_dim
        step_dim = self.step_dim
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
        # in some cases especially in the early stages of training the sum
        # may be almost zero, hence the epsilon before dividing
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        # Attention-weighted sum over the timestep axis.
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)
    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim
    def get_config(self):
        # Persist step_dim so the layer can be restored via load_model().
        config = {'step_dim': self.step_dim}
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
class ManDist(Layer):
    """Similarity layer: exp(-L1(a, b)) between two sentence encodings.

    A custom Keras Layer must implement build, call and
    compute_output_shape.
    """
    def __init__(self, **kwargs):
        # Holds the similarity tensor produced by call().
        self.res = None
        super(ManDist, self).__init__(**kwargs)

    def build(self, input_shape):
        """No trainable weights; just defer to the base implementation."""
        super(ManDist, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Map the two input encodings to exp(-sum(|a - b|, axis=1)).

        inputs[0] and inputs[1] are the two branches of the siamese network.
        """
        l1_distance = K.sum(K.abs(inputs[0] - inputs[1]), axis=1, keepdims=True)
        self.res = K.exp(-l1_distance)
        return self.res

    def compute_output_shape(self, input_shape):
        """The output shape is the static shape of the computed tensor."""
        return K.int_shape(self.res)
class ConsDist(Layer):
    """Similarity layer: cosine similarity of two sentence encodings.

    A custom Keras Layer must implement build, call and
    compute_output_shape.
    """
    def __init__(self, **kwargs):
        # Holds the similarity tensor produced by call().
        self.res = None
        super(ConsDist, self).__init__(**kwargs)

    def build(self, input_shape):
        """No trainable weights; just defer to the base implementation."""
        super(ConsDist, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Return cos(a, b) = <a, b> / (||a|| * ||b||) along axis 1.

        BUG FIX: the denominator previously multiplied the *squared* norms
        (sum(a^2) * sum(b^2)) without taking square roots, so the layer did
        not compute cosine similarity. K.sqrt restores the correct formula
        and K.epsilon() guards against division by zero.
        NOTE(review): checkpoints trained with the old formula will produce
        different outputs after this fix.
        """
        numerator = K.sum(inputs[0] * inputs[1], axis=1, keepdims=True)
        denominator = (K.sqrt(K.sum(inputs[0] ** 2, axis=1, keepdims=True)) *
                       K.sqrt(K.sum(inputs[1] ** 2, axis=1, keepdims=True)))
        self.res = numerator / (denominator + K.epsilon())
        return self.res

    def compute_output_shape(self, input_shape):
        """The output shape is the static shape of the computed tensor."""
        return K.int_shape(self.res)
class AttentionLayer1(Layer):
    """Matching layer: concatenates [a, a - b, a * b, b] for two encodings."""

    def __init__(self, **kwargs):
        # Holds the concatenated matching tensor produced by call().
        self.match_vector = None
        super(AttentionLayer1, self).__init__(**kwargs)

    def build(self, input_shape):
        """No trainable weights; just defer to the base implementation."""
        super(AttentionLayer1, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Build the matching vector from the two input encodings."""
        left = inputs[0]
        right = inputs[1]
        difference = left - right
        product = left * right
        self.match_vector = K.concatenate([left, difference, product, right], 1)
        return self.match_vector

    def compute_output_shape(self, input_shape):
        """The output shape is the static shape of the computed tensor."""
        return K.int_shape(self.match_vector)
def precision(y_true, y_pred):
    """Batch-wise precision metric.

    Fraction of predicted positives that are actually positive, with the
    predictions rounded to {0, 1} first.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())
def recall(y_true, y_pred):
    """Batch-wise recall metric.

    Fraction of actual positives that were predicted positive, with the
    predictions rounded to {0, 1} first.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())
def fbeta_score(y_t, y_p, beta=1):
    """Compute the batch-wise F-beta score.

    The F score is the weighted harmonic mean of precision and recall.
    With beta = 1 this is the F1 measure; beta < 1 favours precision and
    beta > 1 favours recall.

    :param y_t: ground-truth labels
    :param y_p: predicted probabilities/labels
    :param beta: non-negative weight of recall vs precision
    :raises ValueError: if beta is negative
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_t, 0, 1))) == 0:
        return 0
    p = precision(y_t, y_p)
    r = recall(y_t, y_p)
    bb = beta ** 2
    # IDIOM FIX: the local result was previously named 'fbeta_score',
    # shadowing the enclosing function; renamed to 'score'.
    score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return score
def contrastive_loss(y_true, y_pred):
    """
    Contrastive loss for the siamese network. Per sample:
    L = (1 - y) * d + y * max(margin - d, 0), with margin = 0.8.
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    :param y_true: 1 for similar pairs, 0 for non-matching ones
    :param y_pred: similarity d in (0, 1)
    :return: mean loss over the batch
    """
    margin = 0.8
    matched_term = y_true * K.maximum(margin - y_pred, 0)
    unmatched_term = (1 - y_true) * y_pred
    return K.mean(unmatched_term + matched_term)
def create_siamese_lstm_attention_model(embedding_matrix, model_param, embedding_size=300, max_sentence_length=20):
    """
    Build a siamese LSTM model whose two encodings are combined by the
    AttentionLayer1 matching layer ([a, a-b, a*b, b]) and a dense head.

    :param embedding_matrix: pretrained word-embedding weight matrix
    :param model_param: dict with 'lstm_units', 'num_dense',
        'desen_dropout_rate'
    :param embedding_size: dimensionality of the word embeddings
    :param max_sentence_length: padded input sentence length
    :return: a compiled keras Model taking [left, right] index sequences
    """
    # step 1: the shared (siamese) encoder -- embedding + LSTM final state.
    shared_encoder = Sequential()
    shared_encoder.add(Embedding(
        input_dim=len(embedding_matrix,),
        output_dim=embedding_size,
        weights=[embedding_matrix],
        trainable=True,
        input_length=max_sentence_length
    ))
    shared_encoder.add(LSTM(
        units=model_param['lstm_units'],
        return_sequences=False
    ))
    # step 2: the model has two inputs, one index sequence per sentence.
    left_input = Input(shape=(max_sentence_length,), dtype='int32')
    right_input = Input(shape=(max_sentence_length,), dtype='int32')
    # step 3: encode both sentences with the shared encoder.
    s1_net = shared_encoder(left_input)
    s2_net = shared_encoder(right_input)
    # Matching layer followed by a regularised dense block.
    matching_layer = AttentionLayer1()([s1_net, s2_net])
    merge_model = Dense(model_param['num_dense'])(matching_layer)
    merge_model = Dropout(model_param['desen_dropout_rate'])(merge_model)
    merge_model = BatchNormalization()(merge_model)
    # step 4: sigmoid output -- probability that the pair is similar.
    output_layer = Dense(1, activation='sigmoid')(merge_model)
    model = Model(
        inputs=[left_input, right_input],
        outputs=[output_layer], name="simaese_lstm_attention"
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=["accuracy", fbeta_score, precision, recall]
    )
    return model
def create_siamese_lstm_ManDistance_model(embedding_matrix,model_param,embedding_size = 300,max_sentence_length = 20):
    """Build a MaLSTM-style siamese model scored by Manhattan distance.

    :param embedding_matrix: pretrained embedding weights, one row per token id
    :param model_param: dict with 'lstm_units', 'lstm_dropout_rate',
        'lstm_re_dropout_rate'
    :param embedding_size: embedding vector dimension
    :param max_sentence_length: padded token length of each input sentence
    :return: compiled Keras Model over [left, right] int32 id sequences with a
        2-way softmax output (categorical_crossentropy)
    """
    # Shared encoder: embedding then a regularised final-state LSTM.
    encoder = Sequential()
    encoder.add(Embedding(
        input_dim=len(embedding_matrix),
        output_dim=embedding_size,
        weights=[embedding_matrix],
        trainable=True,
        input_length=max_sentence_length
    ))
    encoder.add(LSTM(
        units=model_param['lstm_units'],
        dropout=model_param['lstm_dropout_rate'],
        recurrent_dropout=model_param['lstm_re_dropout_rate'],
        return_sequences=False
    ))

    # The two inputs share the encoder weights (siamese structure).
    left_input = Input(shape=(max_sentence_length,), dtype='int32')
    right_input = Input(shape=(max_sentence_length,), dtype='int32')

    # Manhattan-distance similarity of the two encodings feeds the output.
    distance = ManDist()([encoder(left_input), encoder(right_input)])
    out_put_layer = Dense(2, activation='softmax')(distance)
    model = Model(
        inputs=[left_input, right_input],
        outputs=[out_put_layer], name="simaese_lstm_manDist"
    )
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=["accuracy", fbeta_score, precision, recall]
    )
    return model
def create_siamese_lstm_dssm_mdoel(embedding_matrix,embedding_word_matrix,model_param,embedding_size = 300,max_sentence_length = 20,max_word_length=25):
    """Large combined matcher over char-level and word-level id sequences.

    Part 1 (char inputs): a shared BiLSTM, a stacked 2-layer LSTM, and an
    attention layer encode each side; the dot product of the first-layer
    LSTM sequences forms a match matrix fed through a small 2-D CNN.
    Part 2 (word inputs): BiLSTM + attention encodings per side.
    Part 3: element-wise products / |differences| / maxima of the encodings,
    cosine and Manhattan distances, the CNN features and two dense heads are
    concatenated and scored by a single sigmoid unit.

    :param embedding_matrix: char-level embedding weights
    :param embedding_word_matrix: word-level embedding weights
    :param model_param: dict with 'lstm_units'
    :param embedding_size: dimension shared by both embeddings
    :param max_sentence_length: padded char-sequence length
        (assumed 20 — the Reshape below hard-codes (20, 20, -1); TODO confirm)
    :param max_word_length: padded word-sequence length (AttentionLayer(25))
    :return: compiled Keras Model over four int32 inputs
    """
    # Part 1: char-level branch
    # CNN hyper-parameters for the match-matrix convolutions
    num_conv2d_layers = 1
    filters_2d = [6, 12]
    kernel_size_2d = [[3, 3], [3, 3]]
    mpool_size_2d = [[2, 2], [2, 2]]
    left_input = Input(shape=(max_sentence_length,), dtype='int32')
    right_input = Input(shape=(max_sentence_length,), dtype='int32')
    # Layers shared by both char-level branches
    embedding_layer1 = Embedding(
        input_dim=len(embedding_matrix, ),
        output_dim=embedding_size,
        weights=[embedding_matrix],
        trainable=True,
        input_length=max_sentence_length
    )
    att_layer1 = AttentionLayer(20)
    bi_lstm_layer =Bidirectional(LSTM(model_param['lstm_units']))
    lstm_layer1 = LSTM(model_param['lstm_units'],
                       return_sequences=True)
    lstm_layer2 = LSTM(model_param['lstm_units'])
    # Both inputs go through the same embedding layer
    s1 = embedding_layer1(left_input)
    s2 = embedding_layer1(right_input)
    # Bidirectional LSTM on top of the embeddings
    s1_bi = bi_lstm_layer(s1)
    s2_bi = bi_lstm_layer(s2)
    # Separately, a two-layer stacked LSTM on top of the embeddings
    s1_lstm_lstm = lstm_layer2(lstm_layer1(s1))
    s2_lstm_lstm = lstm_layer2(lstm_layer1(s2))
    s1_lstm = lstm_layer1(s1)
    s2_lstm = lstm_layer1(s2)
    # Match matrix of the per-timestep states, fed through a small CNN
    cnn_input_layer = dot([s1_lstm,s2_lstm],axes=-1)
    cnn_input_layer_dot = Reshape((20,20,-1))(cnn_input_layer)
    layer_conv1 = Conv2D(filters=8,kernel_size=3,padding='same',activation='relu')(cnn_input_layer_dot)
    z = MaxPooling2D(pool_size=(2,2))(layer_conv1)
    for i in range(num_conv2d_layers):
        z = Conv2D(filters=filters_2d[i], kernel_size=kernel_size_2d[i], padding='same', activation='relu')(z)
        z = MaxPooling2D(pool_size=(mpool_size_2d[i][0], mpool_size_2d[i][1]))(z)
    pool1_flat = Flatten()(z)
    pool1_flat_drop = Dropout(rate=0.1)(pool1_flat)
    ccn1 = Dense(32, activation='relu')(pool1_flat_drop)
    ccn2 = Dense(16, activation='relu')(ccn1)
    # Attention over the embeddings
    s1_att = att_layer1(s1)
    s2_att = att_layer1(s2)
    # Concatenate attention and BiLSTM encodings per side
    s1_last = Concatenate(axis=1)([s1_att,s1_bi])
    s2_last = Concatenate(axis=1)([s2_att,s2_bi])
    cos_layer = ConsDist()([s1_last,s2_last])
    man_layer = ManDist()([s1_last,s2_last])
    # Part 2: word-level branch
    left_w_input = Input(shape=(max_word_length,), dtype='int32')
    right_w_input = Input(shape=(max_word_length,), dtype='int32')
    # Layers shared by both word-level branches
    embedding_layer2 = Embedding(
        input_dim=len(embedding_word_matrix, ),
        output_dim=embedding_size,
        weights=[embedding_word_matrix],
        trainable=True,
        input_length=max_word_length
    )
    lstm_word_bi_layer = Bidirectional(LSTM(6))
    att_layer2 = AttentionLayer(25)
    s1_words = embedding_layer2(left_w_input)
    s2_words = embedding_layer2(right_w_input)
    # s1_word_lstm = lstm_layer1(s1_words)
    # s2_word_lstm = lstm_layer1(s2_words)
    #
    # cnn_input_layer1 = dot([s1_word_lstm, s2_word_lstm], axes=-1)
    # cnn_input_layer_dot1 = Reshape((25, 25, -1))(cnn_input_layer1)
    # layer_conv11 = Conv2D(filters=8, kernel_size=3, padding='same', activation='relu')(cnn_input_layer_dot1)
    # z1 = MaxPooling2D(pool_size=(2, 2))(layer_conv11)
    #
    # for i in range(num_conv2d_layers):
    #     z1 = Conv2D(filters=filters_2d[i], kernel_size=kernel_size_2d[i], padding='same', activation='relu')(z1)
    #     z1 = MaxPooling2D(pool_size=(mpool_size_2d[i][0], mpool_size_2d[i][1]))(z1)
    #
    # pool1_flat1 = Flatten()(z1)
    # # print pool1_flat
    # pool1_flat_drop1 = Dropout(rate=0.1)(pool1_flat1)
    # mlp11 = Dense(32, activation='relu')(pool1_flat_drop1)
    # mlp21 = Dense(16, activation='relu')(mlp11)
    s1_words_bi = lstm_word_bi_layer(s1_words)
    s2_words_bi = lstm_word_bi_layer(s2_words)
    s1_words_att = att_layer2(s1_words)
    s2_words_att = att_layer2(s2_words)
    s1_words_last = Concatenate(axis=1)([s1_words_att,s1_words_bi])
    s2_words_last = Concatenate(axis=1)([s2_words_att,s2_words_bi])
    cos_layer1 = ConsDist()([s1_words_last,s2_words_last])
    man_layer1 = ManDist()([s1_words_last,s2_words_last])
    # Part 3: combine both branches' interaction features
    s1_s2_mul = Multiply()([s1_last,s2_last])
    s1_s2_sub = Lambda(lambda x: K.abs(x))(Subtract()([s1_last,s2_last]))
    s1_s2_maxium = Maximum()([Multiply()([s1_last,s1_last]),Multiply()([s2_last,s2_last])])
    s1_s2_sub1 = Lambda(lambda x: K.abs(x))(Subtract()([s1_lstm_lstm,s2_lstm_lstm]))
    s1_words_s2_words_mul = Multiply()([s1_words_last,s2_words_last])
    s1_words_s2_words_sub = Lambda(lambda x: K.abs(x))(Subtract()([s1_words_last,s2_words_last]))
    s1_words_s2_words_maxium = Maximum()([Multiply()([s1_words_last,s1_words_last]),Multiply()([s2_words_last,s2_words_last])])
    last_list_layer = Concatenate(axis=1)([s1_s2_mul,s1_s2_sub,s1_s2_sub1,s1_s2_maxium,s1_words_s2_words_mul,s1_words_s2_words_sub,s1_words_s2_words_maxium])
    last_list_layer = Dropout(0.05)(last_list_layer)
    # Two dense heads over the combined interaction features
    dense_layer1 = Dense(32,activation='relu')(last_list_layer)
    dense_layer2 = Dense(48,activation='sigmoid')(last_list_layer)
    output_layer = Concatenate(axis=1)([dense_layer1,dense_layer2,cos_layer,man_layer,cos_layer1,man_layer1,ccn2])
    # Final sigmoid match score
    output_layer = Dense(1, activation='sigmoid')(output_layer)
    model = Model(
        inputs=[left_input,right_input,left_w_input,right_w_input],
        outputs=[output_layer], name="simaese_lstm_attention"
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=["accuracy", fbeta_score, precision, recall]
    )
    return model
def predict(model,X_s1,X_s2):
    """Predict match scores for two padded sentence batches.

    Fix: the original called model.predict twice with identical inputs and
    averaged the results; Keras inference is deterministic (dropout is
    disabled outside training), so the second call only doubled the cost
    while (y1 + y2) / 2 == y1.

    :param model: compiled two-input Keras model
    :param X_s1: left sentence batch (padded int id matrix)
    :param X_s2: right sentence batch (padded int id matrix)
    :return: model predictions (same shape the model outputs)
    """
    res = model.predict([X_s1, X_s2])
    print(res.shape)
    return res
def predict1(model,X_s1,X_s2,X_s1_char,X_s2_char):
    """Predict match scores for the four-input (word + char) model.

    Fix: like predict(), the original ran model.predict twice on identical
    inputs and averaged the two identical results — one call suffices.

    :param model: compiled four-input Keras model
    :param X_s1: left word-id batch
    :param X_s2: right word-id batch
    :param X_s1_char: left char-id batch
    :param X_s2_char: right char-id batch
    :return: model predictions
    """
    return model.predict([X_s1, X_s2, X_s1_char, X_s2_char])
#################### Stacking 模型的融合 ####################
from sklearn.naive_bayes import GaussianNB
# import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
class StackingBaseClassifier(object):
    """Base class for level-0 models of a stacking ensemble.

    Subclasses implement train() and predict(); get_model_out() produces
    the out-of-fold training predictions plus fold-averaged test
    predictions that become features for the level-1 model.
    """
    def train(self, x_train, y_train, x_val=None, y_val=None):
        """Fit the underlying model and return the fitted model object."""
        pass
    def predict(self, model, x_test):
        """Predict labels for x_test with a fitted model."""
        pass
    def get_model_out(self, x_train, y_train, x_test, n_fold=5):
        """Cross-validated predictions of this base model.

        :param x_train: training features
        :param y_train: training labels
        :param x_test: test features
        :param n_fold: number of folds
        :return: (out-of-fold train predictions, mean test predictions)
        """
        n_train = x_train.shape[0]
        n_test = x_test.shape[0]
        # out-of-fold predictions for every training sample
        train_oofp = np.zeros((n_train,))
        # one test-set prediction column per fold, averaged at the end
        test_oofp = np.zeros((n_test, n_fold))
        kfold = KFold(n_splits=n_fold, random_state=44, shuffle=True)
        for index, (ix_train, ix_val) in enumerate(kfold.split(x_train)):
            print('{} fold of {} start train and predict...'.format(index, n_fold))
            model = self.train(x_train[ix_train], y_train[ix_train],
                               x_train[ix_val], y_train[ix_val])
            train_oofp[ix_val] = self.predict(model, x_train[ix_val])
            test_oofp[:, index] = self.predict(model, x_test)
        return train_oofp, np.mean(test_oofp, axis=1)
class GussianNBClassifier(StackingBaseClassifier):
    """Gaussian naive Bayes level-0 model for the stacking ensemble."""
    def __init__(self):
        # no hyper-parameters to configure
        pass
    def train(self, x_train, y_train, x_val, y_val):
        """Fit a GaussianNB classifier and return it."""
        print('use GaussianNB train model...')
        model = GaussianNB()
        model.fit(x_train, y_train)
        return model
    def predict(self, model, x_test):
        """Hard-label predictions of the fitted model."""
        print('use GaussianNB model test... ')
        return model.predict(x_test)
# class LGBClassifier(StackingBaseClassifier):
# def __init__(self):
# self.lgb_param = {
# 'objective': 'binary',
# 'metric': {'auc', 'binary_logloss'},
# 'boosting': 'gbdt',
# 'device': 'cpu',
# 'feature_fraction': 0.8, # 抽取所有特征的0.75个进行训练
# 'num_leaves': 16,
# 'learning_rate': 0.01,
# 'verbose': 1,
# 'bagging_seed': 456,
# 'feature_fraction_seed': 456
# }
#
# def train(self, x_train, y_train, x_val, y_val):
# print 'use LGB train model...'
# lgb_data_train = lgb.Dataset(x_train, y_train)
# lgb_data_val = lgb.Dataset(x_val, y_val)
# evals_res = {}
#
# model = lgb.train(
# params=self.lgb_param,
# train_set=lgb_data_train,
# valid_sets=[lgb_data_train, lgb_data_val], # 训练集和测试集都需要验证
# valid_names=['train', 'val'],
# evals_result=evals_res,
# num_boost_round=2500,
# early_stopping_rounds=10,
# verbose_eval=False
# )
# return model
#
# def predict(self, model, x_test):
# print 'use LGB model test... '
# return model.predict(x_test)
class RFClassifer(StackingBaseClassifier):
    """Random-forest level-0 model (class weight 1:4 favours class 1)."""
    def train(self, x_train, y_train, x_val, y_val):
        """Fit a small, shallow random forest and return it."""
        print('use RandomForest train model...')
        forest = RandomForestClassifier(
            n_estimators=25,
            max_depth=4,
            class_weight={0: 1, 1: 4},
        )
        forest.fit(x_train, y_train)
        return forest
    def predict(self, model, x_test):
        """Hard-label predictions of the fitted forest."""
        print('use RandomForest test...')
        return model.predict(x_test)
class LogisicClassifier(StackingBaseClassifier):
    """Logistic-regression level-0 model (class weight 1:4 favours class 1)."""
    def train(self, x_train, y_train, x_val=None, y_val=None):
        """Fit a class-weighted logistic regression and return it."""
        print('use LogisticRegression train model...')
        clf = LogisticRegression(class_weight={0: 1, 1: 4})
        clf.fit(x_train, y_train)
        return clf
    def predict(self, model, x_test):
        """Hard-label predictions of the fitted model."""
        print('use LogisticRegression test...')
        return model.predict(x_test)
class DecisionClassifier(StackingBaseClassifier):
    """Decision-tree level-0 model (depth 5, class weight 1:4)."""
    def train(self, x_train, y_train, x_val=None, y_val=None):
        """Fit a shallow class-weighted decision tree and return it."""
        print('use DecisionClassifier train model...')
        tree = DecisionTreeClassifier(class_weight={0: 1, 1: 4}, max_depth=5)
        tree.fit(x_train, y_train)
        return tree
    def predict(self, model, x_test):
        """Hard-label predictions of the fitted tree."""
        print('use DecisionClassifier test...')
        return model.predict(x_test)
================================================
FILE: Financial_NLP/final_demo/util.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/6/15 下午8:54
# @Author : ComeOnJian
# @File : project.py
# 参考https://github.com/YuriyGuts/pygoose/blob/master/pygoose
"""
整个项目的结构
"""
import os
import io
import numpy as np
import pandas as pd
import pickle
class Project:
    """Filesystem layout helper for the project.

    Wraps a root directory and exposes the conventional sub-directories
    (data, external, preprocessed, features, trained, tmp) as read-only
    properties, plus pickle-based helpers for saving/loading extracted
    feature matrices and their column names.
    """
    def __init__(self,root_dir):
        self._root_dir = root_dir
        self._init_all_paths()
    def _init_all_paths(self):
        # Layout:
        #   <root>/data               train / test data
        #   <root>/data/external      external data sources
        #   <root>/data/preprocessed  preprocessed data
        #   <root>/data/features      extracted feature data
        #   <root>/data/trained       trained model files
        #   <root>/data/tmp           temporary files
        self._data_dir = os.path.join(self._root_dir, 'data')
        self._aux_data_dir = os.path.join(self._data_dir, 'external')
        self._preprocessed_data_dir = os.path.join(self._data_dir, 'preprocessed')
        self._features_dir = os.path.join(self._data_dir, 'features')
        self._trained_model_dir = os.path.join(self._data_dir, 'trained')
        self._temp_dir = os.path.join(self._data_dir, 'tmp')
    # Read-only accessors; every path comes back with a trailing separator.
    @property
    def root_dir(self):
        return self._root_dir + os.path.sep
    @property
    def data_dir(self):
        return self._data_dir + os.path.sep
    @property
    def aux_dir(self):
        return self._aux_data_dir + os.path.sep
    @property
    def preprocessed_data_dir(self):
        return self._preprocessed_data_dir + os.path.sep
    @property
    def features_dir(self):
        return self._features_dir + os.path.sep
    @property
    def trained_model_dir(self):
        return self._trained_model_dir + os.path.sep
    @property
    def temp_dir(self):
        return self._temp_dir + os.path.sep
    @staticmethod
    def init(root_dir,create_dir = True):
        """Build a Project rooted at root_dir.

        :param root_dir: project root directory
        :param create_dir: when True, create any missing data directories
        :return: the Project instance
        """
        project = Project(root_dir)
        if create_dir:
            wanted = (
                project.data_dir,
                project.aux_dir,
                project.features_dir,
                project.preprocessed_data_dir,
                project.trained_model_dir,
                project.temp_dir,
            )
            for path in wanted:
                if not os.path.exists(path):
                    os.makedirs(path)
        return project
    # Feature storage convention: one extraction method produces a block of
    # columns stored in two files — feature.names (column names, one per
    # line) and feature.pickle (the sample rows for those columns).
    def load_feature_lists(self,feature_lists):
        """Assemble DataFrames from the per-method feature files.

        :param feature_lists: list of feature (extraction-method) names
        :return: (train DataFrame, test DataFrame,
                  [[name, first column index, last column index], ...])
        """
        column_names = []
        feature_ranges = []
        start = 0
        # Load each method's column names and record its column span.
        for feature_name in feature_lists:
            names = self._load_feature_col_name(
                self.features_dir + 'X_train_{}.names'.format(feature_name))
            column_names.extend(names)
            feature_ranges.append([feature_name, start, start + len(names) - 1])
            start += len(names)
        # Stack each method's column block side by side.
        X_train = np.hstack([
            self._load_feature_data(self.features_dir + 'X_train_{}.pickle'.format(feature_name))
            for feature_name in feature_lists
        ])
        X_test = np.hstack([
            self._load_feature_data(self.features_dir + 'X_test_{}.pickle'.format(feature_name))
            for feature_name in feature_lists
        ])
        train_df = pd.DataFrame(X_train, columns=column_names)
        test_df = pd.DataFrame(X_test, columns=column_names)
        return train_df, test_df, feature_ranges
    def save_features(self,train_fea,test_fea,fea_names,feature_name):
        """Persist one extraction method's train/test columns and names.

        :param train_fea: feature columns extracted from the training data
        :param test_fea: feature columns extracted from the test data
        :param fea_names: list of column names for those columns
        :param feature_name: name of the extraction method
        """
        self.save_feature_names(fea_names, feature_name)
        self.save_feature_col_list(train_fea, 'train', feature_name)
        self.save_feature_col_list(test_fea, 'test', feature_name)
    def save_feature_names(self,fea_names,feature_name):
        """Persist the column names of one extraction method."""
        self._save_feature_col_name(fea_names, self.features_dir + 'X_train_{}.names'.format(feature_name))
    def save_feature_col_list(self,fea_data,type,feature_name):
        """Persist the column data ('train' or 'test') of one method."""
        self._save_feature_data(fea_data, self.features_dir + 'X_{}_{}.pickle'.format(type,feature_name))
    def _load_feature_col_name(self,nfile):
        # one column name per line, UTF-8
        with io.open(nfile, 'r', encoding="utf-8") as file:
            return [line.rstrip('\n') for line in file.readlines()]
    def _load_feature_data(self,nfile):
        with open(nfile, 'rb') as file:
            return pickle.load(file)
    def _save_feature_data(self,data,nfile):
        with open(nfile, 'wb') as file:
            pickle.dump(data, file)
    def _save_feature_col_name(self,col_names,nfile):
        with open(nfile, 'w') as file:
            file.write('\n'.join(col_names))
    def save(self,nfile,object):
        """Pickle an arbitrary object to nfile."""
        with open(nfile, 'wb') as file:
            pickle.dump(object, file)
    def load(self,nfile):
        """Unpickle and return the object stored in nfile."""
        with open(nfile, 'rb') as file:
            return pickle.load(file)
# Module-level Project handle shared by the rest of the package.
# NOTE(review): hard-coded absolute path — this breaks on any other
# machine; consider reading the root from an environment variable.
project = Project.init('/Users/jian/PythonPrMl/Financial_NLP/atec',create_dir=False)
================================================
FILE: ML/DecisionTree/Boosting.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/4/12 下午5:27
# @Author : ComeOnJian
# @File : Boosting.py
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import math
import numpy as np
import time
class ThresholdClass():
    """A one-dimensional threshold (decision stump) classifier.

    Predicts 1 when x < v and 0 when x >= v, where the threshold v is
    chosen to minimise the weighted training error. Used as the weak
    learner for AdaBoost on the binary Adult task.
    (Original docstring said "x >= -1" — typo for "x >= v".)
    """
    def __init__(self,train_x,train_y,w):
        """
        :param train_x: 1-D feature values (one column of the data)
        :param train_y: labels in {0, 1}, aligned with train_x
        :param w: per-sample weights, aligned with train_x
        """
        self.X = train_x
        self.y = train_y
        self.sample_num = train_x.shape[0]
        # per-sample weight distribution supplied by the booster
        self.w = w
        # candidate thresholds derived from the feature's distinct values
        self.values = self._get_V_list(self.X)
        self.best_V = -1
    def train(self):
        """Scan all candidate thresholds and keep the lowest-error one.

        BUG FIX: the error accumulator is now reset for every candidate
        threshold; previously it kept summing across thresholds, so any
        threshold after the first could never look better.

        :return: best non-zero weighted error rate, or the 1.1 sentinel
                 when no candidate made an error — AdaBoostBasic relies
                 on exactly that sentinel value.
        """
        best_error_rate = 1.1
        for V in self.values:
            # weighted error of this particular threshold
            error_rate = 0.0
            for i in range(self.sample_num):
                val = 1 if self.X[i] < V else 0
                if val != self.y[i]:
                    error_rate = error_rate + self.w[i]
            if error_rate != 0.0 and error_rate < best_error_rate:
                best_error_rate = error_rate
                self.best_V = V
        if best_error_rate == 1.1:
            # no candidate produced an error: keep the last threshold
            self.best_V = V
        return best_error_rate
    def predict(self,feature_value):
        """
        :param feature_value: a single feature value
        :return: 1 when feature_value < best_V, else 0
        """
        return 1 if feature_value < self.best_V else 0
    def _get_V_list(self,X):
        """Candidate thresholds: v - 0.5 for every distinct value except the
        smallest. Values are sorted first — set iteration order is
        arbitrary, which previously made the candidate list
        non-deterministic.

        :param X: feature values
        :return: list of candidate thresholds
        """
        return [value - 0.5 for value in sorted(set(X))[1:]]
"""
如果特征有n维,我们针对每一维特征求一个分类器,选取这些分类器中分类误差率最低的分类器作为本轮的分类器,将其特征index与分类器一起存入G(x)中。
"""
class AdaBoostBasic():
    """Basic AdaBoost over one-dimensional threshold stumps.

    Each round picks, across all feature columns, the ThresholdClass stump
    with the lowest weighted error, records its alpha coefficient, and
    reweights the samples.

    NOTE(review): labels here are {0, 1}, while the textbook AdaBoost
    alpha/weight formulas assume {-1, +1}; with y = 0 the exponent in
    _get_Z_m / _updata_w is always 0 — confirm this is intended.
    """
    def __init__(self,M = 10):
        # M weak classifiers are combined
        self.M = M
        pass
    def _init_parameters_(self,train_x,train_y):
        self.X = train_x
        self.y = train_y
        # number of features / samples
        self.feature_num = train_x.shape[1]
        self.sample_num = train_x.shape[0]
        # coefficient list for the classifiers Gm(x)
        self.alpha = []
        # list items are (error rate, feature index, stump object)
        self.classifier = []
        # sample weight distribution, uniform at start
        self.w = [1.0/self.sample_num] * self.sample_num
    def train(self,train_x,train_y):
        """Fit M weak classifiers on (train_x, train_y)."""
        self._init_parameters_(train_x,train_y)
        for iter in range(self.M):
            print('start %d ThresholdClass ...'%(iter))
            # best (error rate, feature index, classifier) seen this round;
            # NOTE(review): if every stump's error is >= 1 the placeholder
            # (1, None, None) survives and the print below fails on None.
            best_ThresholdClass = (1,None,None)
            # pick the feature column whose stump has the lowest error
            for feature_index in range(self.feature_num):
                # single feature column
                feature_X = self.X[:,feature_index]
                thresholdClass = ThresholdClass(feature_X,self.y,self.w)
                error_rate = thresholdClass.train()
                if error_rate < best_ThresholdClass[0]:
                    best_ThresholdClass = (error_rate,feature_index,thresholdClass)
            error_rate_iter = best_ThresholdClass[0]
            print('No %d ThresholdClass error rate is : %f , feature index is :%d'
                  % (iter,best_ThresholdClass[0],best_ThresholdClass[1]))
            # record this round's classifier
            self.classifier.append(best_ThresholdClass)
            # record alpha; 100 acts as "infinite" weight for a perfect stump
            alpha_iter = 100
            if error_rate_iter == 1.1:
                # 1.1 is ThresholdClass.train()'s "no errors" sentinel
                self.alpha.append(alpha_iter)
            else:
                alpha_iter = self._get_alpha(error_rate_iter)
                self.alpha.append(alpha_iter)
            # update the training-sample weight distribution
            Zm = self._get_Z_m(alpha_iter,best_ThresholdClass[1],best_ThresholdClass[2])
            self._updata_w(alpha_iter,best_ThresholdClass[1],best_ThresholdClass[2],Zm)
    def predict(self,sample):
        """Weighted vote of the M stumps, squashed through a sigmoid;
        returns the hard label 1 when the squashed score >= 0.5, else 0."""
        predict = 0
        for index in range(self.M):
            alpha_m = self.alpha[index] # coefficient of stump m
            classfiler_m = self.classifier[index] # (error, feature index, stump)
            feature_index_m = classfiler_m[1] # feature this stump reads
            thresholfclass_m = classfiler_m[2]
            feature_value = sample[feature_index_m]
            Gm = thresholfclass_m.predict(feature_value)
            predict = predict + alpha_m * Gm
        predict = self._sigmoid(predict)
        if predict >= 0.5:
            return 1
        else:
            return 0
    def _sigmoid(self,x):
        return 1.0/(1 + math.exp(-x))
    def _get_alpha(self,error_rate_iter):
        # alpha_m = 1/2 * ln((1 - e_m) / e_m)
        alpha = 0.5 * math.log((1-error_rate_iter)/error_rate_iter)
        return alpha
    # normalisation factor for the weight update
    def _get_Z_m(self,alpha,feature_index,classifler):
        """
        :param alpha: coefficient of the m-th weak classifier
        :param feature_index: feature column the classifier uses
        :param classifler: the m-th weak classifier
        :return: Zm = sum_i w_i * exp(-alpha * y_i * G(x_i))
        """
        Zm = 0.0
        for index in range(self.sample_num):
            # self.y is a (n, 1) array, hence the [index, :][0] indexing
            temp = - alpha * self.y[index,:][0] * classifler.predict(self.X[index,feature_index])
            Zm = Zm + self.w[index] * math.exp(temp)
        return Zm
    def _updata_w(self,alpha,feature_index,classifler,Zm):
        """Update weights: w_i <- w_i / Zm * exp(-alpha * y_i * G(x_i))."""
        for index in range(self.sample_num):
            temp = - alpha * self.y[index, :][0] * classifler.predict(self.X[index, feature_index])
            self.w[index] = self.w[index] / Zm * math.exp(temp)
class AdaBoostTree():
    """Boosting with a CART tree as the weak learner (not implemented)."""
    def __init__(self):
        pass
class AdaBoostGDBT():
    # Placeholder for a gradient-boosting variant (not implemented).
    pass
# Paths to the value-encoded Adult data produced by decision_tree.transToValues.
train_file = '../data/adult/adult_deal_value.data'
test_file = '../data/adult/adult_deal_value.test'
if __name__ == '__main__':
    flods = [train_file, test_file]
    print('load data...')
    from ML.DecisionTree import decision_tree as dt
    train_x, train_y, test_x, test_y = dt.load_data(flods)
    print('finish data load...')
    start_time = time.time()
    # Train a 30-round AdaBoost of threshold stumps and time it.
    adboost = AdaBoostBasic(M = 30)
    adboost.train(train_x,train_y)
    end_time = time.time()
    train_time = end_time - start_time
    print('total train time is :%.3f'%train_time)
    # Predict test samples one by one (AdaBoostBasic has no batch predict).
    pred_y = []
    for sample in test_x:
        pred_yi = adboost.predict(sample)
        pred_y.append(pred_yi)
    # pred_y = [0]* (test_y.shape[0])
    print("accuracy is : ",accuracy_score(y_true=test_y,y_pred=pred_y))
================================================
FILE: ML/DecisionTree/RandomForest.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/4/5 上午10:55
# @Author : ComeOnJian
# @File : RandomForst.py
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import random
"""
随机示例对于回归问题此处采用的是平均法,对于分类问题采用的是投票法
"""
from enum import Enum
class TypeClass(Enum):
    """Which kind of base estimator the random forest should grow."""
    DecisionTreeClassifier_type = 1  # classification forest (majority vote)
    DecisionTreeRegressor_type = 2   # regression forest (mean prediction)
def randomforst(D,N,M,K,type_class):
    """Grow a random forest of M CART trees.

    Each tree is fitted on N randomly sampled rows restricted to K randomly
    chosen feature columns (the label stays in the last column of D).

    Fixes over the original:
      * iterate range(M), not the int M itself (TypeError before);
      * feature count is D.shape[1] - 1 (columns), not shape[0] - 1 (rows);
      * the label column is carried along with the K sampled feature
        columns — previously the y/X selection below raised KeyError
        because column -1 does not exist after the projection;
      * pandas .as_matrix() (removed in pandas 1.0) replaced by .values.

    :param D: data set [features..., label] as np.ndarray
    :param N: number of rows sampled per tree
    :param M: number of base trees
    :param K: number of feature columns sampled per tree
    :param type_class: TypeClass member selecting classifier vs regressor
    :return: list of fitted trees
    :raises Exception: when type_class is not a TypeClass member
    """
    if not isinstance(type_class, TypeClass):
        raise Exception('input param error')
    D_df = pd.DataFrame(D)
    label_col = D_df.shape[1] - 1  # last column holds the label
    trees = []
    for _ in range(M):
        # sample N rows, then K feature columns (plus the label column)
        sample_df = D_df.sample(N)
        choice_features = random.sample(list(range(label_col)), K)
        subset = sample_df[choice_features + [label_col]]
        if type_class == TypeClass.DecisionTreeClassifier_type:
            cart_t = DecisionTreeClassifier(criterion='gini')
        else:
            cart_t = DecisionTreeRegressor(criterion='mse')
        y = subset[label_col].values
        X = subset.drop([label_col], axis=1).values
        trees.append(cart_t.fit(X, y))
    return trees
def randomforst_predict(trees,test_x, type_class):
    """Aggregate the base trees' predictions: majority vote for a
    classification forest, arithmetic mean for a regression forest.

    :param trees: list of fitted trees from randomforst()
    :param test_x: samples to predict
    :param type_class: TypeClass member matching the forest type
    :raises Exception: when type_class is not a TypeClass member
    """
    if not isinstance(type_class, TypeClass):
        raise Exception('input param error')
    all_preds = np.array([tree.predict(test_x) for tree in trees])
    if type_class == TypeClass.DecisionTreeClassifier_type:
        return get_max_count_array(all_preds)
    return np.mean(all_preds)
def get_max_count_array(arr):
    """Return the most frequent non-negative integer in arr (majority vote)."""
    return np.argmax(np.bincount(arr))
================================================
FILE: ML/DecisionTree/decision_tree.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/3/23 下午3:43
# @Author : ComeOnJian
# @File : decision_tree.py
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import pickle
from tqdm import tqdm
import copy
# ######################### 数据集操作 ################
##dataframe中的值数值化,包括一些简单的连续值离散化处理
def adult_label(x):
    """Encode the income label (with or without trailing '.') as 1 for >50K,
    0 for <=50K; raises KeyError on anything else."""
    return {'>50K': 1, '<=50K': 0, '>50K.': 1, '<=50K.': 0}[x]
def adult_age(x):
    """Bucket an age into four ordinal bins: 0 (<=35), 1 (36-53), 2 (54-71),
    3 (72-90); values outside every explicit range fall back to bin 0."""
    x = int(x)
    if 36 <= x <= 53:
        return 1
    if 54 <= x <= 71:
        return 2
    if 72 <= x <= 90:
        return 3
    return 0
def adult_workclass(x):
    """Encode the workclass category as an int 0-6; KeyError on unknown."""
    codes = {'Private': 0, 'Self-emp-not-inc': 1, 'Self-emp-inc': 2,
             'Federal-gov': 3, 'Local-gov': 4, 'State-gov': 5,
             'Without-pay': 6}
    return codes[x]
def adult_education(x):
    """Encode the education category as an int 0-15; KeyError on unknown."""
    codes = {'Bachelors': 0, 'Some-college': 1, '11th': 2, 'HS-grad': 3,
             'Prof-school': 4, 'Assoc-acdm': 5, 'Assoc-voc': 6, '9th': 7,
             '7th-8th': 8, '12th': 9, 'Masters': 10, '1st-4th': 11,
             '10th': 12, 'Doctorate': 13, '5th-6th': 14, 'Preschool': 15}
    return codes[x]
def adult_education_num(x):
    """Bucket education-num into four ordinal bins: 0 (<=4), 1 (5-8),
    2 (9-12), 3 (13-16); out-of-range values fall back to bin 0."""
    x = int(x)
    if 5 <= x <= 8:
        return 1
    if 9 <= x <= 12:
        return 2
    if 13 <= x <= 16:
        return 3
    return 0
def adult_marital_status(x):
    """Encode the marital-status category as an int 0-6; KeyError on unknown."""
    codes = {'Married-civ-spouse': 0, 'Divorced': 1, 'Never-married': 2,
             'Separated': 3, 'Widowed': 4, 'Married-spouse-absent': 5,
             'Married-AF-spouse': 6}
    return codes[x]
def adult_occupation(x):
    """Encode the occupation category as an int 0-13; KeyError on unknown."""
    codes = {'Tech-support': 0, 'Craft-repair': 1, 'Other-service': 2,
             'Sales': 3, 'Exec-managerial': 4, 'Prof-specialty': 5,
             'Handlers-cleaners': 6, 'Machine-op-inspct': 7,
             'Adm-clerical': 8, 'Farming-fishing': 9, 'Transport-moving': 10,
             'Priv-house-serv': 11, 'Protective-serv': 12, 'Armed-Forces': 13}
    return codes[x]
def adult_relationship(x):
    """Encode the relationship category as an int 0-5; KeyError on unknown."""
    codes = {'Wife': 0, 'Own-child': 1, 'Husband': 2, 'Not-in-family': 3,
             'Other-relative': 4, 'Unmarried': 5}
    return codes[x]
def adult_race(x):
    """Encode the race category as an int 0-4; KeyError on unknown."""
    codes = {'White': 0, 'Asian-Pac-Islander': 1, 'Amer-Indian-Eskimo': 2,
             'Other': 3, 'Black': 4}
    return codes[x]
def adult_sex(x):
    """Encode sex: 0 for Female, 1 for Male; KeyError on unknown."""
    return {'Female': 0, 'Male': 1}[x]
def adult_capital_gain_loss(x):
    """Binarise capital-gain / capital-loss: 0 when the value is zero (or
    negative), 1 when positive.

    Bug fix: the original tested `x >= 0`, which routed a value of 0 into
    the '>0' bucket — the '=0' bucket in its mapping was unreachable.
    """
    return 1 if int(x) > 0 else 0
def adult_hours_per_week(x):
    """Encode weekly hours relative to 40: 0 for exactly 40, 1 for more,
    2 for fewer."""
    x = int(x)
    if x > 40:
        return 1
    if x < 40:
        return 2
    return 0
def adult_native_country(x):
    """Binarise native-country: 0 for United-States, 1 for anything else.

    Bug fix: the original compared against 'United-States ' (trailing
    space), which never matches the bare token format every other adult_*
    converter expects, so all rows were encoded as "not USA".
    """
    return 0 if x == 'United-States' else 1
def transToValues(file_name,save_name,remove_unKnowValue=True,remove_duplicates=True):
    """Read an Adult CSV file, value-encode every column through the
    adult_* converters, optionally drop duplicate / missing rows, and
    write the result to save_name (no header, no index).

    :param file_name: input CSV path
    :param save_name: output CSV path
    :param remove_unKnowValue: drop rows containing the '?' placeholder
    :param remove_duplicates: drop exact duplicate rows
    """
    # column index -> converter applied while parsing
    converters = {0: adult_age,1: adult_workclass,3: adult_education,4: adult_education_num,5: adult_marital_status,
                  6: adult_occupation,7: adult_relationship,8: adult_race,9: adult_sex,10: adult_capital_gain_loss,
                  11: adult_capital_gain_loss,12: adult_hours_per_week,13: adult_native_country,14: adult_label}
    adult_df = pd.read_table(file_name, header=None ,sep=',',converters=converters,
                             names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                                    'occupation','relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                                    'hours-per-week', 'native-country','label'],engine='python')
    if remove_duplicates:
        # drop exact duplicate rows in place
        adult_df.drop_duplicates(inplace=True)
        print('delete duplicates shape train-test =================')
        print(adult_df.shape)
    if remove_unKnowValue:
        # '?' marks missing values in the Adult data; drop those rows.
        # NOTE(review): this is an exact-match replace — confirm the raw
        # tokens carry no surrounding whitespace before relying on it.
        adult_df.replace(['?'], np.NaN, inplace=True)
        adult_df.dropna(inplace=True)
        print('delete unKnowValues shape train-test =================')
        print(adult_df.shape)
    # fnlwgt (census sampling weight) is not used as a feature
    adult_df.drop('fnlwgt',axis=1,inplace=True)
    adult_df.to_csv(save_name,header=False,index=False)
if __name__ == '__main__':
    train_file = '../data/adult/adult.data'
    test_file = '../data/adult/adult.test'
    train_deal_file = '../data/adult/adult_deal.data'
    test_deal_file = '../data/adult/adult_deal.test'
    # Pass 1: value-encode while dropping duplicates and rows with '?'.
    transToValues(train_file,train_deal_file)
    transToValues(test_file,test_deal_file)
    # Pass 2: re-encode the cleaned files without further filtering.
    transToValues(train_deal_file,'../data/adult/adult_deal_value.data',remove_duplicates=False,remove_unKnowValue=False)
    transToValues(test_deal_file,'../data/adult/adult_deal_value.test',remove_duplicates=False,remove_unKnowValue=False)
def load_data(flods):
    """Load the value-encoded train/test Adult files, shuffle each, and
    return (train_x, train_y, test_x, test_y) numpy arrays.

    The first 13 columns become x; the label column stays 2-D (n, 1) in y.

    :param flods: [train file path, test file path]
    """
    adult_train_df = pd.read_table(flods[0], header=None ,sep=',',
                                   names=['age', 'workclass', 'education', 'education-num', 'marital-status',
                                          'occupation','relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                                          'hours-per-week', 'native-country','label'],engine='python',dtype=int)
    adult_test_df = pd.read_table(flods[1], header=None, sep=',',
                                  names=['age', 'workclass', 'education', 'education-num', 'marital-status',
                                         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                                         'hours-per-week', 'native-country', 'label'], engine='python',dtype=int)
    # Shuffle sample order; sample(frac=1) is equivalent to sklearn shuffle.
    # adult_train_df = shuffle(adult_train_df)
    adult_train_df = adult_train_df.sample(frac=1).reset_index(drop=True)
    # adult_test_df = shuffle(adult_test_df)
    adult_test_df = adult_test_df.sample(frac=1).reset_index(drop=True)
    print('init shape train-test =================')
    print(adult_train_df.shape)
    print(adult_test_df.shape)
    # Optional continuous-feature discretisation (left disabled):
    # D = np.array(adult_train_df['label']).reshape(adult_train_df.shape[0], 1)
    # age_did = devide_feature_value(adult_train_df['age'],D)
    train_data_x = np.array(adult_train_df.iloc[:,0:13])
    train_data_y = np.array(adult_train_df.iloc[:,13:])
    test_data_x = np.array(adult_test_df.iloc[:, 0:13])
    test_data_y = np.array(adult_test_df.iloc[:, 13:])
    return train_data_x,train_data_y,test_data_x,test_data_y
## 连续值离散处理
def devide_feature_value(series,D):
sets = set(series)
mid_value = []
a = float(sets.pop())
#取相邻点的中值
for par in sets:
a = (a + par) / 2.0
mid_value.append(a)
a = float(par)
max_divide = mid_value[0]
max_ent = 0.0
ent_d = calc_ent(D)
#查找最好的分裂点
for mid in mid_value:
Q1 = D[series < mid]
Q2 = D[series >= mid]
D_length = float(D.shape[0])
Q1_length = Q1.shape[0]
Q2_length = D_length - Q1_length
#条件熵
H_Q_D = Q1_length / D_length * calc_ent(Q1) + Q2_length / D_length * calc_ent(Q2)
H = ent_d - H_Q_D
if(H > max_ent):
max_ent = H
max_divide = mid
return max_divide
# ######################### 数学计算 ################
def calc_ent(D):
    """Empirical entropy H(D) of a label column.

    :param D: labels as a 2-D array of shape (n, 1)
    :return: entropy in bits
    """
    total = D.shape[0]
    labels = set(D[i][0] for i in range(total))  # distinct classes Ck
    ent = 0.0
    for label in labels:
        p = float(D[D == label].shape[0]) / total
        ent -= p * np.log2(p)
    return ent
def calc_condition_ent(A,D):
    """Conditional entropy H(D|A) of labels D given feature column A.

    :param A: 1-D feature values aligned with D
    :param D: labels as a 2-D array of shape (n, 1)
    :return: conditional entropy in bits
    """
    ent = 0.0
    for a in set(A[i] for i in range(A.shape[0])):
        sub_labels = D[A == a]  # the subset Di where A takes value a
        weight = float(sub_labels.shape[0]) / D.shape[0]
        ent += weight * calc_ent(sub_labels)
    return ent
def calc_ent_gain(A,D):
    """Information gain g(D, A) = H(D) - H(D|A).

    :param A: 1-D feature values aligned with D
    :param D: labels as a 2-D array of shape (n, 1)
    """
    return calc_ent(D) - calc_condition_ent(A, D)
def calc_ent_gain_rate(A,D):
    """Information gain ratio: g(D, A) divided by the feature's own
    entropy H_A(D).

    :param A: 1-D feature values aligned with D
    :param D: labels as a 2-D array of shape (n, 1)
    """
    gain = calc_ent_gain(A, D)
    split_info = 0.0
    for a in set(A[i] for i in range(A.shape[0])):
        p = float(D[A == a].shape[0]) / D.shape[0]  # |Di| / |D|
        split_info -= p * np.log2(p)
    return gain / split_info
def calc_gini(D):
    """Gini impurity of a label column: 1 - sum_k p_k^2.

    :param D: labels as a 2-D array of shape (n, 1)
    """
    D_count = D.shape[0]
    sum_sq = 0.0
    for label in set(D[i][0] for i in range(D_count)):
        p = float(D[D == label].shape[0]) / D_count
        sum_sq += np.square(p)
    return 1 - sum_sq
def calc_condition_gini(A,D,a):
    """Gini index of D after the binary split A == a vs A != a.

    :param A: 1-D feature values aligned with D
    :param D: labels as a 2-D array of shape (n, 1)
    :param a: the specific value of A to split on
    """
    # partition the labels into the two sides of the split
    D1 = D[A == a]
    D2 = D[A != a]
    w1 = float(D1.shape[0]) / D.shape[0]
    w2 = float(D2.shape[0]) / D.shape[0]
    return w1 * calc_gini(D1) + w2 * calc_gini(D2)
# ######################### 模型分类效果评价 ################
def eval(y_true,y_predict):
    """Print a battery of binary-classification metrics and plot the ROC curve.

    NOTE(review): shadows the builtin eval(); consider renaming. Also blocks
    on plt.show(), and feeds hard 0/1 predictions into average_precision /
    roc_curve, which normally expect scores — the curves are degenerate.

    :param y_true: ground-truth labels
    :param y_predict: predicted labels (hard 0/1)
    """
    from sklearn.metrics import average_precision_score
    from sklearn.metrics import matthews_corrcoef
    from sklearn.metrics import classification_report
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc
    from sklearn.metrics import precision_score, recall_score, f1_score
    print('average_precision_score: %f' % (average_precision_score(y_true=y_true, y_score=y_predict)))
    print('MMC: %f' % (matthews_corrcoef(y_true=y_true, y_pred=y_predict)))
    print(classification_report(y_true=y_true, y_pred=y_predict))
    # Precision under macro / micro / per-class averaging
    print(precision_score(y_true, y_predict, average='macro'))
    print(precision_score(y_true, y_predict, average='micro'))
    print(precision_score(y_true, y_predict, average=None))
    # Recall under macro / micro / per-class averaging
    print(recall_score(y_true, y_predict, average='macro'))
    print(recall_score(y_true, y_predict, average='micro'))
    print(recall_score(y_true, y_predict, average=None))
    # F1 under macro / micro / per-class averaging
    print(f1_score(y_true, y_predict, average='macro'))
    print(f1_score(y_true, y_predict, average='micro'))
    print(f1_score(y_true, y_predict, average=None))
    fpr, tpr, thresholds = roc_curve(y_true, y_predict)
    roc_auc = auc(fpr, tpr)
    # Plot the ROC curve; roc_auc just records the area under it.
    plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
    plt.xlabel("FPR (False Positive Rate)")
    plt.ylabel("TPR (True Positive Rate)")
    plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)" % (roc_auc))
    plt.show()
# ######################### TreeNode ################
class TreeNode():
    """A single node of the decision tree.

    Leaf nodes carry ``type`` (the predicted class label); internal
    nodes additionally use ``next_nodes``, ``feature_index`` and
    ``select_value``.
    """

    def __init__(self):
        # leaf attribute: the class label this node predicts
        self.type = -1
        # internal-node attributes
        self.next_nodes = []      # child nodes on the next level
        self.feature_index = -1   # index of the feature this node splits on
        # self.feature_value = 0  # the feature value this branch matches
        self.select_value = 0     # selection score (gain / gain ratio / gini)

    def add_next_node(self, node):
        """Append ``node`` as a child; reject anything that is not a TreeNode."""
        if type(node) != TreeNode:
            raise Exception('node not belong to TreeNode type')
        self.next_nodes.append(node)

    def add_attr_and_value(self, attr_name, attr_value):
        """Attach an ad-hoc attribute — leaf and internal nodes need
        different attribute sets, so they are added dynamically.

        :param attr_name: name of the attribute to set
        :param attr_value: its value
        """
        setattr(self, attr_name, attr_value)
# ######################### Decision Tree ################
class DecisionTree():
    """A decision-tree classifier supporting ID3 (information gain) and
    C4.5 (information gain ratio) feature selection.

    ``self._tree`` is a sentinel root; after ``train`` the real root of
    the learned tree is ``self._tree.next_nodes[0]``.
    """
    def __init__(self,mode):
        self._tree = TreeNode() # sentinel that points at the real root node
        if mode == 'ID3' or mode == 'C4.5':
            self._mode = mode
        else:
            raise Exception('mode should is C4.5 or ID3 or CARTClassification or CARTRegression')
    def train(self,train_x,train_y,epsoion):
        """
        Build the tree — this IS the model.
        :param train_x: sample matrix, one row per sample
        :param train_y: label column aligned with train_x, shape (n, 1)
        :param epsoion: threshold on the selection score; below it a node
            becomes a leaf (NOTE(review): "epsoion" looks like a typo for
            "epsilon" — kept for caller compatibility)
        :return: None; the learned tree hangs off self._tree
        """
        feature_list = [index for index in range(train_x.shape[1])]
        self._create_tree(train_x,train_y,feature_list,epsoion,self._tree)
        # print (22)
    def predict(self,test_x):
        """
        Predict a label for every sample in test_x.
        :param test_x: iterable of samples (feature sequences)
        :return: list of predicted labels, one per sample
        :raises Exception: if train() has not been called yet
        """
        if (len(self._tree.next_nodes) == 0):
            raise Exception('no train model')
        # classfiy one sample
        def _classfiy(node,sample):
            feature_index = node.feature_index
            # leaf node: feature_index was never assigned, return its label
            if feature_index == -1:
                return node.type
            #
            sample_feature_v = sample[feature_index]
            next_node = None
            # only children created from a split carry 'feature_value'
            for sub_node in node.next_nodes:
                if hasattr(sub_node,'feature_value'):
                    if sub_node.feature_value == sample_feature_v:
                        next_node = sub_node
                        break;
            # feature value unseen during training: fall back to this
            # node's majority-class label
            if next_node == None:
                return node.type
            else:
                return _classfiy(next_node,sample)
        predict_labels = []
        for sample in tqdm(test_x):
            label = _classfiy(self._tree.next_nodes[0],list(sample))
            predict_labels.append(label)
        return predict_labels
    def _create_tree(self,X,y,feature_list,epsoion,start_node,Vi=-1):
        """
        Recursively grow the tree (the ID3/C4.5 algorithm).
        :param X: sample matrix for this subtree
        :param y: label column for this subtree, shape (n, 1)
        :param feature_list: ids of the features still available
        :param epsoion: selection-score threshold for stopping
        :param start_node: parent node the new node is attached to
        :param Vi: the feature value that led to this branch (-1 for root)
        :return: None; nodes are attached to start_node in place
        """
        # the node created for this call
        node = TreeNode()
        # case 1: all samples share one class -> leaf
        C = set(y[:,0]) # the set C_k of class labels present in the data
        if(len(C) == 1 ):
            node.type = tuple(C)[0] # that single C_k becomes the leaf's label
            start_node.add_next_node(node)
            return
        # case 2: feature set A empty -> leaf labelled with the majority class C_k
        if(len(feature_list) == 0):
            max_value = self._get_max_count_array(y[:,0])
            node.type = max_value
            start_node.add_next_node(node)
            return
        # select feature
        if self._mode == 'ID3' or self._mode == 'C4.5':
            select_func = calc_ent_gain
            if self._mode == 'C4.5':
                select_func = calc_ent_gain_rate
            ent_gain_max, ent_max_feature_index = self._select_feature(X,y,feature_list,select_func)
            # case 3: the best score is below the configured threshold -> leaf
            if ent_gain_max < epsoion:
                type_value = self._get_max_count_array(y[:, 0])
                node.type = type_value
                start_node.add_next_node(node)
                return
            else:
                # internal node: record the chosen feature and its score
                node.feature_index = ent_max_feature_index
                node.select_value = ent_gain_max
                # the majority label doubles as the fallback used by
                # predict() for unseen feature values
                type_value = self._get_max_count_array(y[:,0])
                node.type = type_value
                if (Vi != -1):
                    node.add_attr_and_value("feature_value", Vi)
                start_node.add_next_node(node)
                # all values the chosen feature takes in this subtree
                Ag_v = set(X[:,ent_max_feature_index])
                # A - Ag: the chosen feature is consumed
                feature_list.remove(ent_max_feature_index)
                # recurse into each D_i
                for v in Ag_v:
                    # D_i is (Xi, yi): the samples whose feature equals v
                    mask = X[:,ent_max_feature_index] == v
                    Xi = X[mask]
                    yi = y[mask]
                    # each branch gets its own copy of the remaining features
                    feature_list_new = copy.deepcopy(feature_list)
                    self._create_tree(Xi, yi, feature_list_new, epsoion, node, Vi=v)
                return
        else:
            pass
        pass
    def _select_feature(self,X,y,feature_list,select_func):
        """
        Pick the candidate feature with the highest selection score.
        :param X: sample matrix
        :param y: labels
        :param feature_list: candidate feature ids
        :param select_func: scoring function (gain or gain ratio)
        :return: (best score, id of the best feature)
        """
        ent_gain_max = 0.0
        ent_max_feature_index = 0
        for feature in feature_list:
            A = X[:,feature]
            D = y
            ent_gain = select_func(A,D)
            if(ent_gain > ent_gain_max):
                ent_gain_max = ent_gain
                ent_max_feature_index = feature
        return ent_gain_max,ent_max_feature_index
    def _get_max_count_array(self,arr):
        """Return the most frequent value in arr.
        NOTE(review): np.bincount assumes non-negative integer labels."""
        count = np.bincount(arr)
        max_value = np.argmax(count)
        return max_value
================================================
FILE: ML/DecisionTree/titanic_data_analy.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 891 entries, 0 to 890\nData columns (total 12 columns):\nPassengerId 891 non-null int64\nSurvived 891 non-null int64\nPclass 891 non-null int64\nName 891 non-null object\nSex 891 non-null object\nAge 714 non-null float64\nSibSp 891 non-null int64\nParch 891 non-null int64\nTicket 891 non-null object\nFare 891 non-null float64\nCabin 204 non-null object\nEmbarked 889 non-null object\ndtypes: float64(2), int64(5), object(5)\nmemory usage: 83.6+ KB\nNone\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"train = pd.read_csv(\"ML/data/Titanic/train.csv\")\n",
"test = pd.read_csv(\"ML/data/Titanic/test.csv\")\n",
"full_data = [train,test]\n",
"print(train.info())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 216\n2 184\n3 491\nName: Pclass, dtype: int64\n Pclass Survived\n0 1 0.629630\n1 2 0.472826\n2 3 0.242363\n"
]
}
],
"source": [
"\n",
"print (train['Pclass'].value_counts(sort=False).sort_index())\n",
"print train[['Pclass','Survived']].groupby('Pclass',as_index=False).mean()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"female 314\nmale 577\nName: Sex, dtype: int64\n Sex Survived\n0 female 0.742038\n1 male 0.188908\n"
]
}
],
"source": [
"\n",
"print (train['Sex'].value_counts(sort=False).sort_index())\n",
"print train[['Sex','Survived']].groupby('Sex',as_index=False).mean()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/jian/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame\n\nSee the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n \n"
]
}
],
"source": [
"for dataset in full_data:\n",
" age_avg = dataset['Age'].mean()\n",
" age_std = dataset['Age'].std()\n",
" \n",
" age_null_count = dataset['Age'].isnull().sum()\n",
" age_default_list = np.random.randint(low=age_avg-age_std,high=age_avg+age_std,size=age_null_count,)\n",
" \n",
" dataset['Age'][np.isnan(dataset['Age'])] = age_default_list\n",
" dataset['Age'] = dataset['Age'].astype(int)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" CategoricalAge Survived\n0 (-0.08, 16.0] 0.540541\n1 (16.0, 32.0] 0.362187\n2 (32.0, 48.0] 0.352490\n3 (48.0, 64.0] 0.434783\n4 (64.0, 80.0] 0.090909\n"
]
}
],
"source": [
"train['CategoricalAge'] = pd.cut(train['Age'], 5)\n",
"print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 608\n1 209\n2 28\n3 16\n4 18\n5 5\n8 7\nName: SibSp, dtype: int64\n SibSp Survived\n0 0 0.345395\n1 1 0.535885\n2 2 0.464286\n3 3 0.250000\n4 4 0.166667\n5 5 0.000000\n6 8 0.000000\n"
]
}
],
"source": [
"\n",
"print (train['SibSp'].value_counts(sort=False).sort_index())\n",
"print train[['SibSp','Survived']].groupby('SibSp',as_index=False).mean()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 678\n1 118\n2 80\n3 5\n4 4\n5 5\n6 1\nName: Parch, dtype: int64\n Parch Survived\n0 0 0.343658\n1 1 0.550847\n2 2 0.500000\n3 3 0.600000\n4 4 0.000000\n5 5 0.200000\n6 6 0.000000\n"
]
}
],
"source": [
"\n",
"print (train['Parch'].value_counts(sort=False).sort_index())\n",
"print train[['Parch','Survived']].groupby('Parch',as_index=False).mean()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" FamilySize Survived\n0 1 0.303538\n1 2 0.552795\n2 3 0.578431\n3 4 0.724138\n4 5 0.200000\n5 6 0.136364\n6 7 0.333333\n7 8 0.000000\n8 11 0.000000\n"
]
}
],
"source": [
"for dataset in full_data:\n",
" dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n",
"print (train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" CategoricalFare Survived\n0 (-0.001, 7.775] 0.205128\n1 (7.775, 8.662] 0.190789\n2 (8.662, 14.454] 0.366906\n3 (14.454, 26.0] 0.436242\n4 (26.0, 52.369] 0.417808\n5 (52.369, 512.329] 0.697987\n"
]
}
],
"source": [
"for dataset in full_data:\n",
" dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n",
"train['CategoricalFare'] = pd.qcut(train['Fare'],6)\n",
"print (train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C 168\nQ 77\nS 644\nName: Embarked, dtype: int64\n"
]
}
],
"source": [
"\n",
"print (train['Embarked'].value_counts(sort=False).sort_index())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C 168\nQ 77\nS 646\nName: Embarked, dtype: int64\n"
]
}
],
"source": [
"for data in full_data:\n",
" data['Embarked'] = data['Embarked'].fillna('S')\n",
"print (train['Embarked'].value_counts(sort=False).sort_index())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Embarked Survived\n0 C 0.553571\n1 Q 0.389610\n2 S 0.339009\n"
]
}
],
"source": [
"print (train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sex female male\nTitleName \nCapt 0 1\nCol 0 2\nCountess 1 0\nDon 0 1\nDr 1 6\nJonkheer 0 1\nLady 1 0\nMajor 0 2\nMaster 0 40\nMiss 182 0\nMlle 2 0\nMme 1 0\nMr 0 517\nMrs 125 0\nMs 1 0\nRev 0 6\nSir 0 1\n"
]
}
],
"source": [
"\n",
"import re\n",
"def get_title_name(name):\n",
" title_s = re.search(' ([A-Za-z]+)\\.', name)\n",
" if title_s:\n",
" return title_s.group(1)\n",
" return \"\"\n",
"for dataset in full_data:\n",
" dataset['TitleName'] = dataset['Name'].apply(get_title_name)\n",
"print(pd.crosstab(train['TitleName'],train['Sex']))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" TitleName Survived\n0 Master 0.575000\n1 Miss 0.702703\n2 Mr 0.156673\n3 Mrs 0.793651\n4 Other 0.347826\n"
]
}
],
"source": [
"train['TitleName'] = train['TitleName'].replace('Mme', 'Mrs')\n",
"train['TitleName'] = train['TitleName'].replace('Mlle', 'Miss')\n",
"train['TitleName'] = train['TitleName'].replace('Ms', 'Miss')\n",
"train['TitleName'] = train['TitleName'].replace(['Lady', 'Countess','Capt', 'Col',\\\n",
" 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')\n",
"print (train[['TitleName', 'Survived']].groupby(['TitleName'], as_index=False).mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" IsAlone Survived\n0 0 0.505650\n1 1 0.303538\n"
]
}
],
"source": [
"train['IsAlone'] = 0\n",
"train.loc[train['FamilySize']==1,'IsAlone'] = 1\n",
"print (train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
================================================
FILE: ML/DecisionTree/tree_main.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/3/23 下午8:47
# @Author : ComeOnJian
# @File : tree_main.py
from ML.DecisionTree import decision_tree as dt
import time
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
train_file = '../data/adult/adult_deal_value.data'
test_file = '../data/adult/adult_deal_value.test'
if __name__ == '__main__':
    # Load the adult dataset, then compare the hand-rolled ID3 tree
    # against sklearn's entropy-based tree on the same split.
    data_files = [train_file, test_file]
    print('load data...')
    train_x, train_y, test_x, test_y = dt.load_data(data_files)
    # print(type(train_x[:,0][0]))
    print('finish data load...')
    # hand-written ID3 tree with a 0.01 information-gain threshold
    my_tree = dt.DecisionTree(mode='ID3')
    my_tree.train(train_x, train_y, 0.01)
    my_pred = my_tree.predict(test_x)
    print('my tree accuracy: %f' % (accuracy_score(y_true=test_y, y_pred=my_pred)))
    # sklearn baseline with comparable settings
    sk_tree = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=9)
    sk_tree.fit(train_x, train_y)
    sk_pred = sk_tree.predict(test_x)
    print('sklearn tree accuracy: %f' % (accuracy_score(y_true=test_y, y_pred=sk_pred)))
    print(sk_tree.feature_importances_)
================================================
FILE: ML/DecisionTree/xgboost_demo.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/4/28 下午7:24
# @Author : ComeOnJian
# @File : xgboost.py
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import matplotlib.pylab as plt
train_file = '../data/Titanic/train.csv'
test_file = '../data/Titanic/test.csv'
test_result_file = '../data/Titanic/gender_submission.csv'
def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
    """
    Feature engineering for the Titanic data; mutates the frames in place.
    :param full_data: all datasets — train and test together
    :param age_default_avg: how missing Age is filled — True: random draws
        around the mean; False: predicted by a random-forest regressor
    :param one_hot: True: one-hot encode Embarked; False: map it to ints
    :return: the processed datasets (same list that was passed in)
    """
    for dataset in full_data:
        # Pclass / Parch / SibSp need no processing
        # Sex -> 0 / 1
        dataset['Sex'] = dataset['Sex'].map(Passenger_sex).astype(int)
        # FamilySize = siblings/spouses + parents/children + self
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
        # IsAlone: 1 when FamilySize == 1
        dataset['IsAlone'] = 0
        isAlone_mask = dataset['FamilySize'] == 1
        dataset.loc[isAlone_mask, 'IsAlone'] = 1
        # Fare: fill missing with the median, then bucket into 6 quantiles
        fare_median = dataset['Fare'].median()
        dataset['CategoricalFare'] = dataset['Fare'].fillna(fare_median)
        dataset['CategoricalFare'] = pd.qcut(dataset['CategoricalFare'],6,labels=[0,1,2,3,4,5])
        # Embarked: fill the few missing values with 'S', then encode
        dataset['Embarked'] = dataset['Embarked'].fillna('S')
        dataset['Embarked'] = dataset['Embarked'].astype(str)
        if one_hot:
            # OneHotEncoder only handles numeric input, so LabelBinarizer
            # is used for the one-hot encoding instead
            Embarked_arr = LabelBinarizer().fit_transform(dataset['Embarked'])
            dataset['Embarked_0'] = Embarked_arr[:, 0]
            dataset['Embarked_1'] = Embarked_arr[:, 1]
            dataset['Embarked_2'] = Embarked_arr[:, 2]
            dataset.drop('Embarked',axis=1,inplace=True)
        else:
            # plain string-to-int mapping
            dataset['Embarked'] = dataset['Embarked'].map(Passenger_Embarked).astype(int)
        # Name -> title (Mr/Miss/...), with rare titles collapsed to 'Other'
        dataset['TitleName'] = dataset['Name'].apply(get_title_name)
        dataset['TitleName'] = dataset['TitleName'].replace('Mme', 'Mrs')
        dataset['TitleName'] = dataset['TitleName'].replace('Mlle', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace('Ms', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace(['Lady', 'Countess', 'Capt', 'Col', \
                                                            'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
                                                           'Other')
        dataset['TitleName'] = dataset['TitleName'].map(Passenger_TitleName).astype(int)
        # Age: fill the missing values, then bucket
        if age_default_avg:
            # draw fill values uniformly from [mean - std, mean + std)
            age_avg = dataset['Age'].mean()
            age_std = dataset['Age'].std()
            age_null_count = dataset['Age'].isnull().sum()
            age_default_list = np.random.randint(low=age_avg - age_std, high=age_avg + age_std, size=age_null_count)
            dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_default_list
            dataset['Age'] = dataset['Age'].astype(int)
        else:
            # treat Age as the label and predict the missing values;
            # features: TitleName, Sex, Pclass, SibSp, Parch, IsAlone,
            # CategoricalFare, FamilySize, Embarked
            feature_list = ['TitleName', 'Sex', 'Pclass', 'SibSp', 'Parch', 'IsAlone','CategoricalFare',
                            'FamilySize', 'Embarked','Age']
            if one_hot:
                feature_list.append('Embarked_0')
                feature_list.append('Embarked_1')
                feature_list.append('Embarked_2')
                feature_list.remove('Embarked')
            Age_data = dataset.loc[:,feature_list]
            un_Age_mask = np.isnan(Age_data['Age'])
            Age_train = Age_data[~un_Age_mask] # rows whose Age is known
            # print(Age_train.shape)
            feature_list.remove('Age')
            rf0 = RandomForestRegressor(n_estimators=60,oob_score=True,min_samples_split=10,min_samples_leaf=2,
                                        max_depth=7,random_state=10)
            rf0.fit(Age_train[feature_list],Age_train['Age'])
            def set_default_age(age):
                # row-wise: predict Age only where it is missing
                if np.isnan(age['Age']):
                    # print(age['PassengerId'])
                    # print age.loc[feature_list]
                    data_x = np.array(age.loc[feature_list]).reshape(1,-1)
                    # print data_x
                    age_v = round(rf0.predict(data_x))
                    # print('pred:',age_v)
                    # age['Age'] = age_v
                    return age_v
                # print age
                return age['Age']
            dataset['Age'] = dataset.apply(set_default_age, axis=1)
            # print(dataset.tail())
            #
            # data_age_no_full = dataset[dataset['Age'].]
        # pd.cut splits the value range into equal-width bins, while
        # pd.qcut splits it so each bin holds roughly equal counts
        # print(dataset.tail())
        dataset['CategoricalAge'] = pd.cut(dataset['Age'], 5,labels=[0,1,2,3,4])
    return full_data
def data_feature_select(full_data):
    """
    Drop unused raw columns and convert the frames to numpy arrays.
    :param full_data: [train_df, test_df] after feature engineering
    :return: (train_X, train_y, test_X) as numpy arrays
    """
    # raw columns that carry no engineered signal
    unused = ['PassengerId','Name','Age','Fare','Ticket','Cabin']
    for frame in full_data:
        frame.drop(unused, axis=1, inplace=True)
    train_df, test_df = full_data[0], full_data[1]
    train_y = np.array(train_df['Survived'])
    # print(train_df.head())
    train_X = np.array(train_df.drop('Survived', axis=1, inplace=False))
    test_X = np.array(test_df)
    return train_X, train_y, test_X
def Passenger_sex(x):
    """Map a Sex string ('female'/'male') to its code (0/1)."""
    return {'female': 0, 'male': 1}[x]
def Passenger_Embarked(x):
    """Map an Embarked port code ('S'/'C'/'Q') to 0/1/2."""
    return {'S': 0, 'C': 1, 'Q': 2}[x]
def Passenger_TitleName(x):
    """Map a normalized title string to its integer code (0..4)."""
    return {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other': 4}[x]
def get_title_name(name):
    """Extract the title token (e.g. 'Mr', 'Miss') from a passenger name.

    :param name: full name string such as 'Braund, Mr. Owen Harris'
    :return: the title without the trailing dot, or '' when absent
    """
    match = re.search(' ([A-Za-z]+)\.', name)
    return match.group(1) if match else ""
def modelfit(alg,dtrain_x,dtrain_y,useTrainCV=True,cv_flods=5,early_stopping_rounds=50):
    """
    Fit an XGBoost classifier, optionally sizing n_estimators by CV.

    :param alg: the initial (unfitted) XGBClassifier
    :param dtrain_x: training features
    :param dtrain_y: training labels
    :param useTrainCV: if True, use xgb.cv to pick the best n_estimators
    :param cv_flods: number of cross-validation folds
        (NOTE: "flods" is a typo for "folds", kept for compatibility)
    :param early_stopping_rounds: stop CV when eval_metric has not
        improved for this many rounds
    :return: the fitted estimator
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain_x, dtrain_y)
        cv_result = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                           nfold=cv_flods, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # print(cv_result)
        # one boosting round per CV row actually run before early stop
        alg.set_params(n_estimators=cv_result.shape[0])
    # BUG FIX: the original fitted and predicted on the module-level
    # globals train_X/train_y instead of the dtrain_x/dtrain_y parameters,
    # silently ignoring the data passed in.
    alg.fit(dtrain_x, dtrain_y, eval_metric='auc')
    # report accuracy on the training data itself
    train_y_pre = alg.predict(dtrain_x)
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain_y, train_y_pre))
    # feature-importance bar chart
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind = 'bar',title='Feature Importance')
    plt.ylabel('Feature Importance Score')
    plt.show()
    return alg
def xgboost_change_param(train_X,train_y):
    """
    Step-wise XGBoost hyper-parameter tuning on the Titanic data.

    Each step fixes the best values found so far and grid-searches the
    next group of parameters; the winning values are recorded in the
    comments after each step.

    :param train_X: training features
    :param train_y: training labels
    """
    # step 1: fix the learning rate and find n_estimators via CV
    xgb1 = XGBClassifier(learning_rate=0.1,
                         booster='gbtree',
                         n_estimators=300,
                         max_depth=4,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=2,
                         scale_pos_weight=1,
                         seed=10
                         )
    # best: n_estimators = 59 with learning_rate = 0.1
    modelfit(xgb1,train_X,train_y,early_stopping_rounds=45)
    # step 2: tune min_child_weight and max_depth
    param_test1 = {
        'max_depth': range(3,8,1),
        'min_child_weight':range(1,6,2)
    }
    gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=59,
                                                    max_depth=4,min_child_weight=1,gamma=0,
                                                    subsample=0.8,colsample_bytree=0.8,
                                                    objective='binary:logistic',nthread=2,
                                                    scale_pos_weight=1,seed=10
                                                    ),
                            param_grid=param_test1,
                            scoring='roc_auc',n_jobs=1,cv=5)
    gsearch1.fit(train_X,train_y)
    # BUG FIX: the original used Python-2 print statements here, a
    # SyntaxError under Python 3; the rest of this file uses print() calls.
    print(gsearch1.best_params_, gsearch1.best_score_)
    # best: max_depth = 7, min_child_weight = 3
    # modelfit(gsearch1.best_estimator_) — best model is gsearch1.best_estimator_
    # step 3: tune gamma
    param_test2 = {
        'gamma': [i/10.0 for i in range(0,5)]
    }
    gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=59,
                                                    max_depth=7,min_child_weight=3,gamma=0,
                                                    subsample=0.8,colsample_bytree=0.8,
                                                    objective='binary:logistic',nthread=2,
                                                    scale_pos_weight=1,seed=10),
                            param_grid=param_test2,
                            scoring='roc_auc',
                            cv=5
                            )
    gsearch2.fit(train_X, train_y)
    print(gsearch2.best_params_, gsearch2.best_score_)
    # best: gamma = 0.3
    # modelfit(gsearch2.best_estimator_)
    # step 4: tune subsample and colsample_bytree
    param_test3 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    gsearch3 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=59,
                                                    max_depth=7,min_child_weight=3,gamma=0.3,
                                                    subsample=0.8,colsample_bytree=0.8,
                                                    objective='binary:logistic',nthread=2,
                                                    scale_pos_weight=1,seed=10),
                            param_grid=param_test3,
                            scoring='roc_auc',
                            cv=5
                            )
    gsearch3.fit(train_X, train_y)
    print(gsearch3.best_params_, gsearch3.best_score_)
    # best: subsample = 0.8, colsample_bytree = 0.6
    # step 5: regularization tuning (not implemented)
# step5 正则化参数调优
if __name__ == '__main__':
    # Load the Titanic data, engineer the features, then train and
    # evaluate XGBoost with the parameters found by xgboost_change_param().
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    test_y = pd.read_csv(test_result_file)
    # train.apply(axis=0)
    full_data = data_feature_engineering([train, test], age_default_avg=True, one_hot=False)
    train_X, train_y, test_X = data_feature_select(full_data)
    # hyper-parameter search (already run once):
    # xgboost_change_param(train_X, train_y)
    best_model = XGBClassifier(learning_rate=0.1, n_estimators=59,
                               max_depth=7, min_child_weight=3,
                               gamma=0.3, subsample=0.8,
                               colsample_bytree=0.6, objective='binary:logistic',
                               nthread=2, scale_pos_weight=1, seed=10)
    best_model.fit(train_X, train_y)
    y_test_pre = best_model.predict(test_X)
    y_test_true = np.array(test_y['Survived'])
    print ("the xgboost model Accuracy : %.4g" % metrics.accuracy_score(y_pred=y_test_pre, y_true=y_test_true))
================================================
FILE: ML/LogisticRegression_MEM/LR_MEM_demo.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/5/16 上午11:44
# @Author : ComeOnJian
# @File : LR_MEM_demo.py
import numpy as np
import pandas as pd
import random
import re
import copy
from collections import defaultdict
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer
train_file = '../data/Titanic/train.csv'
test_file = '../data/Titanic/test.csv'
test_result_file = '../data/Titanic/gender_submission.csv'
def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
    """
    Feature engineering for the Titanic data; mutates the frames in place.
    :param full_data: all datasets — train and test together
    :param age_default_avg: how missing Age is filled — True: random draws
        around the mean; False: predicted by a random-forest regressor
    :param one_hot: True: one-hot encode Embarked; False: map it to ints
    :return: the processed datasets (same list that was passed in)
    """
    for dataset in full_data:
        # Pclass / Parch / SibSp need no processing
        # Sex -> 0 / 1
        dataset['Sex'] = dataset['Sex'].map(Passenger_sex).astype(int)
        # FamilySize = siblings/spouses + parents/children + self
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
        # IsAlone: 1 when FamilySize == 1
        dataset['IsAlone'] = 0
        isAlone_mask = dataset['FamilySize'] == 1
        dataset.loc[isAlone_mask, 'IsAlone'] = 1
        # Fare: fill missing with the median, then bucket into 6 quantiles
        fare_median = dataset['Fare'].median()
        dataset['CategoricalFare'] = dataset['Fare'].fillna(fare_median)
        dataset['CategoricalFare'] = pd.qcut(dataset['CategoricalFare'],6,labels=[0,1,2,3,4,5])
        # Embarked: fill the few missing values with 'S', then encode
        dataset['Embarked'] = dataset['Embarked'].fillna('S')
        dataset['Embarked'] = dataset['Embarked'].astype(str)
        if one_hot:
            # OneHotEncoder only handles numeric input, so LabelBinarizer
            # is used for the one-hot encoding instead
            Embarked_arr = LabelBinarizer().fit_transform(dataset['Embarked'])
            dataset['Embarked_0'] = Embarked_arr[:, 0]
            dataset['Embarked_1'] = Embarked_arr[:, 1]
            dataset['Embarked_2'] = Embarked_arr[:, 2]
            dataset.drop('Embarked',axis=1,inplace=True)
        else:
            # plain string-to-int mapping
            dataset['Embarked'] = dataset['Embarked'].map(Passenger_Embarked).astype(int)
        # Name -> title (Mr/Miss/...), with rare titles collapsed to 'Other'
        dataset['TitleName'] = dataset['Name'].apply(get_title_name)
        dataset['TitleName'] = dataset['TitleName'].replace('Mme', 'Mrs')
        dataset['TitleName'] = dataset['TitleName'].replace('Mlle', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace('Ms', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace(['Lady', 'Countess', 'Capt', 'Col', \
                                                            'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
                                                           'Other')
        dataset['TitleName'] = dataset['TitleName'].map(Passenger_TitleName).astype(int)
        # Age: fill the missing values, then bucket
        if age_default_avg:
            # draw fill values uniformly from [mean - std, mean + std)
            age_avg = dataset['Age'].mean()
            age_std = dataset['Age'].std()
            age_null_count = dataset['Age'].isnull().sum()
            age_default_list = np.random.randint(low=age_avg - age_std, high=age_avg + age_std, size=age_null_count)
            dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_default_list
            dataset['Age'] = dataset['Age'].astype(int)
        else:
            # BUG FIX: unlike the xgboost_demo copy of this function, this
            # module never imported RandomForestRegressor, so this branch
            # raised NameError; import it locally here.
            from sklearn.ensemble import RandomForestRegressor
            # treat Age as the label and predict the missing values;
            # features: TitleName, Sex, Pclass, SibSp, Parch, IsAlone,
            # CategoricalFare, FamilySize, Embarked
            feature_list = ['TitleName', 'Sex', 'Pclass', 'SibSp', 'Parch', 'IsAlone','CategoricalFare',
                            'FamilySize', 'Embarked','Age']
            if one_hot:
                feature_list.append('Embarked_0')
                feature_list.append('Embarked_1')
                feature_list.append('Embarked_2')
                feature_list.remove('Embarked')
            Age_data = dataset.loc[:,feature_list]
            un_Age_mask = np.isnan(Age_data['Age'])
            Age_train = Age_data[~un_Age_mask] # rows whose Age is known
            feature_list.remove('Age')
            rf0 = RandomForestRegressor(n_estimators=60,oob_score=True,min_samples_split=10,min_samples_leaf=2,
                                        max_depth=7,random_state=10)
            rf0.fit(Age_train[feature_list],Age_train['Age'])
            def set_default_age(age):
                # row-wise: predict Age only where it is missing
                if np.isnan(age['Age']):
                    data_x = np.array(age.loc[feature_list]).reshape(1,-1)
                    age_v = round(rf0.predict(data_x))
                    return age_v
                return age['Age']
            dataset['Age'] = dataset.apply(set_default_age, axis=1)
        # pd.cut splits the value range into equal-width bins, while
        # pd.qcut splits it so each bin holds roughly equal counts
        dataset['CategoricalAge'] = pd.cut(dataset['Age'], 5,labels=[0,1,2,3,4])
    return full_data
def data_feature_select(full_data):
    """
    Drop the raw columns and convert each frame to a numpy array.
    :param full_data: [train_df, test_df] after feature engineering
    :return: (train_X, train_y, test_X)
    """
    for data_set in full_data:
        data_set.drop(['PassengerId','Name','Age','Fare','Ticket','Cabin'],
                      axis=1, inplace=True)
    labels = np.array(full_data[0]['Survived'])
    features = full_data[0].drop('Survived', axis=1, inplace=False)
    return np.array(features), labels, np.array(full_data[1])
def Passenger_sex(x):
    """Encode Sex: 'female' -> 0, 'male' -> 1."""
    mapping = {'female': 0, 'male': 1}
    return mapping[x]
def Passenger_Embarked(x):
    """Encode Embarked port: 'S' -> 0, 'C' -> 1, 'Q' -> 2."""
    ports = {'S': 0, 'C': 1, 'Q': 2}
    return ports[x]
def Passenger_TitleName(x):
    """Encode a normalized title: Mr/Miss/Mrs/Master/Other -> 0..4."""
    codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other': 4}
    return codes[x]
def get_title_name(name):
    """Return the ' Xxx.' title token from a passenger name, or ''."""
    found = re.search(' ([A-Za-z]+)\.', name)
    if not found:
        return ""
    return found.group(1)
class LR:
    """Binary logistic-regression classifier trained with stochastic
    gradient descent (formulas 6.3/6.4 of Li Hang, *Statistical
    Learning Methods*).
    """
    def __init__(self, iterNum=2000, learn_late=0.005):
        # maximum number of SGD steps
        self.maxIter = iterNum
        # learning rate ("learn_late" kept for caller compatibility)
        self.learn_late = learn_late

    def train(self, train_X, train_y):
        """Fit the weight vector on (train_X, train_y) with per-sample SGD.

        The bias is folded into the weights: a constant 1.0 is appended to
        every sample and self.w holds feature_size + 1 entries.

        :param train_X: 2-D array of samples
        :param train_y: labels in {0, 1}
        """
        feature_size = train_X.shape[1]
        sample_size = train_X.shape[0]
        # w and b merged: the last weight is the bias
        self.w = np.zeros(feature_size + 1)
        correct_num = 0
        for iter in range(self.maxIter):
            # pick one random sample per step
            sample_index = random.randint(0, sample_size - 1)
            sample_select = train_X[sample_index].tolist()
            sample_select.append(1.0)  # bias input
            sample_y = train_y[sample_index]
            if sample_y == self.predict(sample_select):
                # stop early after enough consecutive correct predictions
                correct_num = correct_num + 1
                if correct_num > self.maxIter:
                    break
                continue
            correct_num = 0
            temp = np.exp(sum(self.w * sample_select))
            # BUG FIX: the original looped over range(feature_size) and so
            # never updated the bias weight at index feature_size.
            for index in range(feature_size + 1):
                self.w[index] = self.w[index] - self.learn_late * \
                    (- sample_y * sample_select[index] + float(temp * sample_select[index]) / float(1 + temp))

    def predict(self, sample):
        """Classify one sample (formulas 6.3 and 6.4).

        :param sample: feature values with the bias input 1.0 appended
        :return: 0 or 1, whichever class has the larger probability
        """
        tmp = sum(self.w * sample)
        y_0 = 1 / float(1 + np.exp(tmp))            # P(Y=0 | x)
        y_1 = np.exp(tmp) / float(1 + np.exp(tmp))  # P(Y=1 | x)
        if y_0 > y_1:
            return 0
        else:
            return 1
class MEM:
# 算法模型为:《统计学习方法》公式6.28&6.29
def __init__(self,iterNum = 2000,epsion = 0.01):
self.epsion = epsion # 精度阈值
self.maxIter = iterNum
def train(self,train_X,train_y):
# 使用《统计学习方法》P92算法6.2——BFGS,求解参数
self.feature_size = train_X.shape[1]
self.sample_num = train_X.shape[0]
self.samples = train_X
self.labels = train_y
# 统计数据集中的特征函数个数
self._cal_feature_func()
self._f2id()
self.n = len(self.P_x_y) # n为特征函数的个数
# 计算每个特征函数关于经验分布p(x,y)的期望,并保持于EPxy字典中
self._cal_EPxy()
self.w = np.zeros(self.n) #wi为拉格函数中的乘子
self.g = np.zeros(self.n) #对应g(w),《统计学习方法》P92,最上面g(w)的公式
self.B = np.eye(self.n) #正定对称矩阵
for iter in range(self.maxIter):
# 算法6.2——(2)
self._cal_Gw()
if self._cal_g_l2() < self.epsion:
break
# 算法6.2——(3)
p_k = - (self.B ** -1) * np.reshape(self.g,(self.n,1))
# np.linalg.solve()
# 算法6.2——(4)
r_k = self._liear_search(p_k)
# 算法6.2——(5)
old_g = copy.deepcopy(self.g)
old_w = copy.deepcopy(self.w)
self.w = self.w + r_k * p_k
# 算法6.2——(6)
self._cal_Gw()
if self._cal_g_l2() < self.epsion:
break
y_k = self.g - old_g
fai_k = self.w - old_w
y_k = np.reshape(y_k,(self.n,1))
fai_k = np.reshape(fai_k,(self.n,1))
temp1 = np.dot(y_k,y_k.T) / float((np.dot(y_k.T,fai_k).reshape(1)[0]))
temp2 = np.dot(np.dot(np.dot(self.B,fai_k),fai_k.T),self.B) / float(np.dot(np.dot(fai_k.T,self.B),fai_k).reshape(1)[0])
self.B =self.B + temp1 - temp2
def change_sample_feature_name(self,samples):
new_samples = []
for sample in samples:
new_sample = []
for feature_index,feature_v in enumerate(sample):
new_feature_v = 'x' + str(feature_index) + '_' + str(feature_v)
new_sample.append(new_feature_v)
new_samples.append(np.array(new_sample))
return np.array(new_samples)
def _cal_Pxy_Px(self):
# 从数据集中计算特征函数,f(x,y),有该样本就为1,没有则为0,x为样本X的某一个特征的取值
self.P_x_y = defaultdict(int) # 其中P_x_y的键的个数则为特征函数的个数。
self.P_x = defaultdict(int)
for index in range(self.sample_num):
# 取出样本值
sample = self.samples[index]
label = self.labels[index]
for feature_index in range(self.feature_size):
x = sample[feature_index]
y = label
self.P_x_y[(x,y)] = self.P_x_y[(x,y)] + 1
self.P_x[x] = self.P_x[x] + 1
def _cal_EPxy(self):
#计算特征函数f关于经验分布的P(x,y)的期望值
self.EPxy = defaultdict(int) # 记录每个特征函数关于经验分布的P(x,y)的期望值
#遍历特征函数,求出期望值
for index in range(self.n):
(x,y) = self.id2f[index]
self.EPxy[index] = float(self.P_x_y[(x,y)]) / float(self.sample_num)
def _f2id(self):
#将index与特征函数对应起来
self.id2f = {}
self.f2id = {}
for index,(x,y) in enumerate(self.P_x_y):
self.id2f[index] = (x,y)
self.f2id[(x,y)] = index
def _cal_Pw(self,X,y):
#《统计学习方法》公式6.28,计算Pw(y|x),此处y只取0或1
res = 0.
for feature_v in X:
if self.f2id.has_key((feature_v,y)):
index = self.f2id[(feature_v,y)]
res = res + (self.w[index] * 1)
if y == 0:
y = 1
else:
y = 0
res_y = 0.
for feature_v in X:
if self.f2id.has_key((feature_v,y)):
index = self.f2id[(feature_v,y)]
res_y = res_y + (self.w[index] * 1)
return float(res) / float(res + res_y)
def _cal_Gw(self):
# 计算f(w)对w_i的偏导数,《统计学习方法》P92,最上面g(w)的公式
for index in range(self.n):
res = 0.
(x,y) = self.id2f[index]
feature_index = int(x[1])
# 累加
for sample_index in range(self.sample_num):
sample = self.samples[index]
label = self.labels[index]
if label != y:
continue
if sample[feature_index] != x:
continue
p_w = self._cal_Pw(sample, y)
num = 0
for feature_v in sample:
num = self.P_x[feature_v] + num
#《统计学习方法》P82,计算P(X=x)公式
p_x = float(num) / float(self.sample_num)
res = res + p_w * p_x * 1 # 1为f_i特征函数的值
self.g[index] = res - self.EPxy[index]
def _cal_g_l2(self):
res = sum(self.g * self.g) ** 0.5
return res
    def _liear_search(self,p_k):
        # One-dimensional line search: find r_k minimising f(w_k + r_k * p_k).
        # new_w = self.w + r_k * p_k
        # r_k = argmin f(w_k + r_k * p_k)
        # NOTE(review): unimplemented stub -- it falls through and returns
        # None, so the caller's `self.w + r_k * p_k` will fail at runtime.
        # Needs f(w) (p.91 of "Statistical Learning Methods") implemented
        # before BFGS training can work.
        pass
# def _cal_fw(self):
# # 《统计学习方法》P91,f(w)计算公式
# res
# for index in range(self.n):
# (x,y) = self.id2f(index)
if __name__ == '__main__':
    # Demo entry point: load the Titanic CSVs, run the shared feature
    # pipeline, then preview the MEM feature-renaming step.
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    test_y = pd.read_csv(test_result_file)
    full_data = [train, test]
    # train.apply(axis=0)
    full_data = data_feature_engineering(full_data, age_default_avg=True, one_hot=False)
    train_X, train_y, test_X = data_feature_select(full_data)
    # lr = LR(iterNum=2000,learn_late=0.001)
    #
    # lr.train(train_X, train_y)
    #
    # results = []
    # for test_sample in test_X:
    #     sample = list(test_sample)
    #     sample.append(1.0)
    #     result = lr.predict(sample)
    #     results.append(result)
    #
    # y_test_true = np.array(test_y['Survived'])
    # print("the LR model Accuracy : %.4g" % metrics.accuracy_score(y_pred=results, y_true=y_test_true))
    mem = MEM()
    # Samples with several feature columns need column-qualified values so
    # that identical raw values in different columns map to distinct x's
    # in the feature functions f(x, y).
    print(train_X[0:5])
    print('==============')
    print (mem.change_sample_feature_name(train_X[0:5]))
    # mem.train(train_X,train_y)
================================================
FILE: ML/Perce_SVM/SVM.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/5/10 下午5:14
# @Author : ComeOnJian
# @File : SVM.py
# 参考 SVM https://blog.csdn.net/sinat_33829806/article/details/78388025
import math
import numpy as np
import random
import copy
import re
from sklearn import metrics
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn import svm
# Paths to the Kaggle Titanic data set (relative to this file).
train_file = '../data/Titanic/train.csv'
test_file = '../data/Titanic/test.csv'
test_result_file = '../data/Titanic/gender_submission.csv'
def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
    """
    Feature engineering shared by the train and test Titanic frames.

    :param full_data: list of DataFrames [train, test], modified in place
    :param age_default_avg: fill missing Age with random draws from
                            mean +/- std when True; otherwise predict the
                            missing ages with a random-forest regressor
    :param one_hot: one-hot encode Embarked when True, else map it to ints
    :return: the same list of processed DataFrames

    Fix: ``RandomForestRegressor`` was used in the ``age_default_avg=False``
    branch but never imported by this module (NameError at runtime); it is
    now imported locally where needed.
    """
    for dataset in full_data:
        # Pclass / Parch / SibSp are usable as-is.
        # Sex -> {0, 1}
        dataset['Sex'] = dataset['Sex'].map(Passenger_sex).astype(int)
        # FamilySize = siblings/spouses + parents/children + self
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
        # IsAlone flag
        dataset['IsAlone'] = 0
        isAlone_mask = dataset['FamilySize'] == 1
        dataset.loc[isAlone_mask, 'IsAlone'] = 1
        # Fare: fill missing values with the median, then bin into 6 quantiles
        fare_median = dataset['Fare'].median()
        dataset['CategoricalFare'] = dataset['Fare'].fillna(fare_median)
        dataset['CategoricalFare'] = pd.qcut(dataset['CategoricalFare'],6,labels=[0,1,2,3,4,5])
        # Embarked: fill the few missing values with 'S', then encode
        dataset['Embarked'] = dataset['Embarked'].fillna('S')
        dataset['Embarked'] = dataset['Embarked'].astype(str)
        if one_hot:
            # OneHotEncoder only accepts numeric input, so LabelBinarizer is
            # used for the one-hot encoding instead
            Embarked_arr = LabelBinarizer().fit_transform(dataset['Embarked'])
            dataset['Embarked_0'] = Embarked_arr[:, 0]
            dataset['Embarked_1'] = Embarked_arr[:, 1]
            dataset['Embarked_2'] = Embarked_arr[:, 2]
            dataset.drop('Embarked',axis=1,inplace=True)
        else:
            # plain string-to-int mapping
            dataset['Embarked'] = dataset['Embarked'].map(Passenger_Embarked).astype(int)
        # Name: extract the honorific into TitleName and normalise it
        dataset['TitleName'] = dataset['Name'].apply(get_title_name)
        dataset['TitleName'] = dataset['TitleName'].replace('Mme', 'Mrs')
        dataset['TitleName'] = dataset['TitleName'].replace('Mlle', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace('Ms', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace(['Lady', 'Countess', 'Capt', 'Col', \
                                                             'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
                                                            'Other')
        dataset['TitleName'] = dataset['TitleName'].map(Passenger_TitleName).astype(int)
        # Age: handle missing values, then discretise
        if age_default_avg:
            # fill missing ages with random draws from [mean-std, mean+std]
            age_avg = dataset['Age'].mean()
            age_std = dataset['Age'].std()
            age_null_count = dataset['Age'].isnull().sum()
            age_default_list = np.random.randint(low=age_avg - age_std, high=age_avg + age_std, size=age_null_count)
            dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_default_list
            dataset['Age'] = dataset['Age'].astype(int)
        else:
            # treat Age as the label and predict the missing values
            # features: TitleName, Sex, Pclass, SibSp, Parch, IsAlone,
            # CategoricalFare, FamilySize, Embarked
            from sklearn.ensemble import RandomForestRegressor  # fix: was never imported
            feature_list = ['TitleName', 'Sex', 'Pclass', 'SibSp', 'Parch', 'IsAlone','CategoricalFare',
                            'FamilySize', 'Embarked','Age']
            if one_hot:
                feature_list.append('Embarked_0')
                feature_list.append('Embarked_1')
                feature_list.append('Embarked_2')
                feature_list.remove('Embarked')
            Age_data = dataset.loc[:,feature_list]
            un_Age_mask = np.isnan(Age_data['Age'])
            Age_train = Age_data[~un_Age_mask]  # rows whose Age is known
            feature_list.remove('Age')
            rf0 = RandomForestRegressor(n_estimators=60,oob_score=True,min_samples_split=10,min_samples_leaf=2,
                                        max_depth=7,random_state=10)
            rf0.fit(Age_train[feature_list],Age_train['Age'])
            def set_default_age(age):
                # row-wise helper: predict Age only where it is missing
                if np.isnan(age['Age']):
                    data_x = np.array(age.loc[feature_list]).reshape(1,-1)
                    age_v = round(rf0.predict(data_x))
                    return age_v
                return age['Age']
            dataset['Age'] = dataset.apply(set_default_age, axis=1)
        # pd.cut bins by equal value ranges, pd.qcut by equal frequencies;
        # equal-width bins are used for Age
        dataset['CategoricalAge'] = pd.cut(dataset['Age'], 5,labels=[0,1,2,3,4])
    return full_data
def data_feature_select(full_data):
    """
    Drop unused raw columns and split the frames into model-ready arrays.

    :param full_data: [train_df, test_df]; both frames are modified in place
    :return: (train_X, train_y, test_X) as numpy arrays
    """
    unused_columns = ['PassengerId', 'Name', 'Age', 'Fare', 'Ticket', 'Cabin']
    for frame in full_data:
        frame.drop(unused_columns, axis=1, inplace=True)
    train_frame, test_frame = full_data[0], full_data[1]
    train_y = np.array(train_frame['Survived'])
    train_X = np.array(train_frame.drop('Survived', axis=1, inplace=False))
    test_X = np.array(test_frame)
    return train_X, train_y, test_X
def Passenger_sex(x):
    """Map a Sex string ('female'/'male') to 0/1; raises KeyError otherwise."""
    return {'female': 0, 'male': 1}[x]
def Passenger_Embarked(x):
    """Map an Embarked port code ('S'/'C'/'Q') to 0/1/2."""
    return {'S': 0, 'C': 1, 'Q': 2}[x]
def Passenger_TitleName(x):
    """Map a normalised honorific to its integer code."""
    return {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other': 4}[x]
def Passenger_Survived(x):
    """Map the 0/1 Survived label to the -1/+1 convention used by the SVM."""
    return {0: -1, 1: 1}[x]
def get_title_name(name):
    """Extract the honorific ('Mr', 'Mrs', ...) from a raw passenger name.

    Titles appear as ' <Title>.' in the Name column; returns '' when no
    title is found.

    Fix: the pattern is now a raw string -- the original ``' ([A-Za-z]+)\\.'``
    relied on Python leaving the unknown escape ``\\.`` alone, which emits a
    SyntaxWarning (and is slated to become an error) on modern interpreters.
    """
    title_s = re.search(r' ([A-Za-z]+)\.', name)
    if title_s:
        return title_s.group(1)
    return ""
class SVM():
    """Support vector machine trained with a simplified SMO algorithm.

    Follows "Statistical Learning Methods" (Li Hang), ch. 7; the decision
    function is formula 7.104, parameterised by the multipliers ``a``, the
    intercept ``b`` and the chosen kernel.
    """
    def __init__(self,kernal,maxIter,C,epsilon,sigma = 0.001):
        """
        :param kernal: kernel name, 'Gauss' or 'line'
        :param maxIter: maximum number of SMO sweeps over the data
        :param C: penalty coefficient in front of the slack variables
        :param epsilon: convergence tolerance on the total change of ``a``
        :param sigma: bandwidth of the Gaussian kernel
        """
        self.kernal = kernal
        self.C = C
        self.maxIter = maxIter
        self.epsilon = epsilon
        self.sigma = sigma # sigma of the Gaussian kernel
    def train(self,train_X,train_y):
        # Fit the model; labels are expected in {-1, +1}.
        self.sample_num = train_X.shape[0]
        self.feature_num = train_X.shape[1]
        self.labels =train_y
        self.samples = train_X
        # The model (formula 7.104) consists of a, b and the kernel.
        self.a = np.zeros(self.sample_num)#[0 for a_i in range(self.sample_num)]
        self.b = 0
        self.eCache = np.zeros(shape=(self.sample_num,2))# error cache: col 0 is a valid flag, col 1 the cached E_i
        self._smo()
        # self._update()
    def predict(self,test_x):
        # Formula 7.104: sign of the kernel expansion plus the intercept.
        pre_v = 0
        for index in range(self.sample_num):
            pre_v = pre_v + self.a[index] * self.labels[index] * self._kernel(test_x,self.samples[index])
        pre_v = pre_v + self.b
        return np.sign(pre_v)
    def _smo(self):
        # Sequential minimal optimisation over pairs of multipliers (i, j).
        pre_a = copy.deepcopy(self.a) # pre_a holds the previous ("old") a vector
        for iter in range(self.maxIter):
            flag = 1
            for index in range(self.sample_num):
                diff = 0
                # self._update()
                E_i = self._calE(self.samples[index],self.labels[index])
                j,E_j = self._chooseJ(index,E_i)
                # clipping bounds L and H for a_j
                (L,H) = self._calLH(pre_a,j,index)
                # formula 7.107: n = K11 + K22 - 2 * K12
                n = self._kernel(self.samples[index],self.samples[index]) \
                    + self._kernel(self.samples[j],self.samples[j])\
                    - 2 * self._kernel(self.samples[index],self.samples[j])
                if (n == 0):
                    continue
                # formula 7.106: unclipped optimum of a_j
                self.a[j] = pre_a[j] + float(self.labels[j] * (E_i - E_j))/n
                # formula 7.108: clip a_j into [L, H]
                if self.a[j] > H:
                    self.a[j] = H
                elif self.a[j] < L:
                    self.a[j] = L
                # formula 7.109: update a_i from the change in a_j
                self.a[index] = pre_a[index] + self.labels[index] * self.labels[j] * (pre_a[j] - self.a[j])
                # update b, formulas 7.114 - 7.116
                b1 = self.b - E_i \
                     - self.labels[index] * self._kernel(self.samples[index],self.samples[index]) * (self.a[index] - pre_a[index]) \
                     - self.labels[j] * self._kernel(self.samples[j],self.samples[index]) * (self.a[j] - pre_a[j])
                b2 = self.b - E_j \
                     - self.labels[index] * self._kernel(self.samples[index], self.samples[j]) * (
                self.a[index] - pre_a[index]) \
                     - self.labels[j] * self._kernel(self.samples[j], self.samples[j]) * (self.a[j] - pre_a[j])
                if (0 < self.a[index]< self.C):
                    self.b = b1
                elif (0 < self.a[j]< self.C):
                    self.b = b2
                else:
                    self.b = (b1 + b2)/2.0
                # refresh the cached errors E_i and E_j (formula 7.117)
                self.eCache[j] = [1,self._calE(self.samples[j],self.labels[j])]
                self.eCache[index] = [1,self._calE(self.samples[index],self.labels[index])]
                diff = sum([abs(pre_a[m] - self.a[m]) for m in range(len(self.a))])
                if diff < self.epsilon:
                    # accuracy requirement met
                    flag = 0
                pre_a = copy.deepcopy(self.a)
            if flag == 0:
                break
    def _calE(self,sample,y):
        # E_i: error between the prediction for x_i and its true label
        # (formula 7.105).
        pre_v = self.predict(sample)
        return pre_v - y
    def _calLH(self,a,j,i):
        # Bounds [L, H] for a_j, see p.126 of "Statistical Learning Methods".
        if(self.labels[j] != self.labels[i]):
            return (max(0,a[j]-a[i]),min(self.C,self.C+a[j]-a[i]))
        else:
            return (max(0, a[j] + a[i] - self.C), min(self.C, a[j] + a[i]))
    def _kernel(self,X_i,X_j):
        """
        :param X_i: first sample vector
        :param X_j: second sample vector
        :return: kernel value K(X_i, X_j)
        """
        result = 0.
        # Gaussian kernel
        if self.kernal == 'Gauss':
            temp = -sum((X_i - X_j)**2)/(2 * self.sigma**2)
            result = math.exp(temp)
        # linear kernel
        elif self.kernal == 'line':
            result = sum(X_i * X_j)
        return result
    def _chooseJ(self,i,E_i):
        # Choose the second variable j, maximising |E_i - E_j|.
        self.eCache[i] = [1,E_i]
        choose_list = []
        # collect indices whose error has already been cached
        for cache_index in range(len(self.eCache)):
            if self.eCache[cache_index][0] != 0 and cache_index != i:
                choose_list.append(cache_index)
        if len(choose_list)>1:
            E_k =0
            delta_E = 0
            max_E = 0
            j = 0 # the j to choose
            E_j = 0# and its corresponding error E
            for choose_index in choose_list:
                E_k = self._calE(self.samples[choose_index],self.labels[choose_index])
                delta_E = abs(E_k-E_i)
                if delta_E > max_E:
                    max_E = delta_E
                    j = choose_index
                    E_j = E_k
            return j,E_j
        # initial state: no errors cached yet, so pick j at random
        else:
            j = self._randJ(i)
            E_j = self._calE(self.samples[j],self.labels[j])
            return j , E_j
    def _randJ(self,i):
        # Draw a random sample index different from i.
        j = i
        while(j == i):
            j = random.randint(0,self.sample_num-1)
        return j
if __name__ == '__main__':
    # Train the hand-written SVM on the Titanic data and report accuracy
    # against the gender_submission labels.
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    test_y = pd.read_csv(test_result_file)
    # remap Survived from {0, 1} to the SVM's {-1, +1} convention
    train['Survived'] = train['Survived'].map(Passenger_Survived).astype(int)
    full_data = [train, test]
    full_data = data_feature_engineering(full_data, age_default_avg=True, one_hot=False)
    train_X, train_y, test_X = data_feature_select(full_data)
    # linear kernel, 1000 sweeps, C=0.05, epsilon=0.001
    svm1 = SVM('line',1000,0.05,0.001)
    svm1.train(train_X, train_y)
    results = []
    for test_sample in test_X:
        y = svm1.predict(test_sample)
        results.append(y)
    # NOTE(review): predictions are in {-1, +1} (np.sign) but
    # test_y['Survived'] is {0, 1}, so every -1 prediction is counted as a
    # miss by accuracy_score -- confirm whether a remap back is intended.
    y_test_true = np.array(test_y['Survived'])
    print("the svm model Accuracy : %.4g" % metrics.accuracy_score(y_pred=results, y_true=y_test_true))
    # svm_s = svm.SVC(C=1,kernel='linear')
    # svm_s.fit(train_X, train_y)
    # pre_y = svm_s.predict(test_X)
    # y_test_true = np.array(test_y['Survived'])
    # print("the svm model Accuracy : %.4g" % metrics.accuracy_score(y_pred=pre_y, y_true=y_test_true))
================================================
FILE: ML/Perce_SVM/perceptron.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/5/6 下午3:55
# @Author : ComeOnJian
# @File : perceptron.py
import numpy as np
import pandas as pd
import random
import re
from sklearn import metrics
################特征工程部分###############
# Paths to the Kaggle Titanic data set (relative to this file).
train_file = '../data/Titanic/train.csv'
test_file = '../data/Titanic/test.csv'
test_result_file = '../data/Titanic/gender_submission.csv'
def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
    """
    Feature engineering shared by the train and test Titanic frames.

    :param full_data: list of DataFrames [train, test], modified in place
    :param age_default_avg: fill missing Age with random draws from
                            mean +/- std when True; otherwise predict the
                            missing ages with a random-forest regressor
    :param one_hot: one-hot encode Embarked when True, else map it to ints
    :return: the same list of processed DataFrames

    Fix: neither ``LabelBinarizer`` nor ``RandomForestRegressor`` was ever
    imported by this module, so the ``one_hot=True`` and
    ``age_default_avg=False`` branches raised NameError; both are now
    imported locally where needed.
    """
    for dataset in full_data:
        # Pclass / Parch / SibSp are usable as-is.
        # Sex -> {0, 1}
        dataset['Sex'] = dataset['Sex'].map(Passenger_sex).astype(int)
        # FamilySize = siblings/spouses + parents/children + self
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
        # IsAlone flag
        dataset['IsAlone'] = 0
        isAlone_mask = dataset['FamilySize'] == 1
        dataset.loc[isAlone_mask, 'IsAlone'] = 1
        # Fare: fill missing values with the median, then bin into 6 quantiles
        fare_median = dataset['Fare'].median()
        dataset['CategoricalFare'] = dataset['Fare'].fillna(fare_median)
        dataset['CategoricalFare'] = pd.qcut(dataset['CategoricalFare'],6,labels=[0,1,2,3,4,5])
        # Embarked: fill the few missing values with 'S', then encode
        dataset['Embarked'] = dataset['Embarked'].fillna('S')
        dataset['Embarked'] = dataset['Embarked'].astype(str)
        if one_hot:
            # OneHotEncoder only accepts numeric input, so LabelBinarizer is
            # used for the one-hot encoding instead
            from sklearn.preprocessing import LabelBinarizer  # fix: was never imported
            Embarked_arr = LabelBinarizer().fit_transform(dataset['Embarked'])
            dataset['Embarked_0'] = Embarked_arr[:, 0]
            dataset['Embarked_1'] = Embarked_arr[:, 1]
            dataset['Embarked_2'] = Embarked_arr[:, 2]
            dataset.drop('Embarked',axis=1,inplace=True)
        else:
            # plain string-to-int mapping
            dataset['Embarked'] = dataset['Embarked'].map(Passenger_Embarked).astype(int)
        # Name: extract the honorific into TitleName and normalise it
        dataset['TitleName'] = dataset['Name'].apply(get_title_name)
        dataset['TitleName'] = dataset['TitleName'].replace('Mme', 'Mrs')
        dataset['TitleName'] = dataset['TitleName'].replace('Mlle', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace('Ms', 'Miss')
        dataset['TitleName'] = dataset['TitleName'].replace(['Lady', 'Countess', 'Capt', 'Col', \
                                                             'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
                                                            'Other')
        dataset['TitleName'] = dataset['TitleName'].map(Passenger_TitleName).astype(int)
        # Age: handle missing values, then discretise
        if age_default_avg:
            # fill missing ages with random draws from [mean-std, mean+std]
            age_avg = dataset['Age'].mean()
            age_std = dataset['Age'].std()
            age_null_count = dataset['Age'].isnull().sum()
            age_default_list = np.random.randint(low=age_avg - age_std, high=age_avg + age_std, size=age_null_count)
            dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_default_list
            dataset['Age'] = dataset['Age'].astype(int)
        else:
            # treat Age as the label and predict the missing values
            # features: TitleName, Sex, Pclass, SibSp, Parch, IsAlone,
            # CategoricalFare, FamilySize, Embarked
            from sklearn.ensemble import RandomForestRegressor  # fix: was never imported
            feature_list = ['TitleName', 'Sex', 'Pclass', 'SibSp', 'Parch', 'IsAlone','CategoricalFare',
                            'FamilySize', 'Embarked','Age']
            if one_hot:
                feature_list.append('Embarked_0')
                feature_list.append('Embarked_1')
                feature_list.append('Embarked_2')
                feature_list.remove('Embarked')
            Age_data = dataset.loc[:,feature_list]
            un_Age_mask = np.isnan(Age_data['Age'])
            Age_train = Age_data[~un_Age_mask]  # rows whose Age is known
            feature_list.remove('Age')
            rf0 = RandomForestRegressor(n_estimators=60,oob_score=True,min_samples_split=10,min_samples_leaf=2,
                                        max_depth=7,random_state=10)
            rf0.fit(Age_train[feature_list],Age_train['Age'])
            def set_default_age(age):
                # row-wise helper: predict Age only where it is missing
                if np.isnan(age['Age']):
                    data_x = np.array(age.loc[feature_list]).reshape(1,-1)
                    age_v = round(rf0.predict(data_x))
                    return age_v
                return age['Age']
            dataset['Age'] = dataset.apply(set_default_age, axis=1)
        # pd.cut bins by equal value ranges, pd.qcut by equal frequencies;
        # equal-width bins are used for Age
        dataset['CategoricalAge'] = pd.cut(dataset['Age'], 5,labels=[0,1,2,3,4])
    return full_data
def data_feature_select(full_data):
    """
    Drop unused raw columns and split the frames into model-ready arrays.

    :param full_data: [train_df, test_df]; both frames are modified in place
    :return: (train_X, train_y, test_X) as numpy arrays
    """
    unused_columns = ['PassengerId', 'Name', 'Age', 'Fare', 'Ticket', 'Cabin']
    for frame in full_data:
        frame.drop(unused_columns, axis=1, inplace=True)
    train_frame, test_frame = full_data[0], full_data[1]
    train_y = np.array(train_frame['Survived'])
    train_X = np.array(train_frame.drop('Survived', axis=1, inplace=False))
    test_X = np.array(test_frame)
    return train_X, train_y, test_X
def Passenger_sex(x):
    """Map a Sex string ('female'/'male') to 0/1; raises KeyError otherwise."""
    return {'female': 0, 'male': 1}[x]
def Passenger_Embarked(x):
    """Map an Embarked port code ('S'/'C'/'Q') to 0/1/2."""
    return {'S': 0, 'C': 1, 'Q': 2}[x]
def Passenger_TitleName(x):
    """Map a normalised honorific to its integer code."""
    return {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other': 4}[x]
def get_title_name(name):
    """Extract the honorific ('Mr', 'Mrs', ...) from a raw passenger name.

    Titles appear as ' <Title>.' in the Name column; returns '' when no
    title is found.

    Fix: the pattern is now a raw string -- the original ``' ([A-Za-z]+)\\.'``
    relied on Python leaving the unknown escape ``\\.`` alone, which emits a
    SyntaxWarning (and is slated to become an error) on modern interpreters.
    """
    title_s = re.search(r' ([A-Za-z]+)\.', name)
    if title_s:
        return title_s.group(1)
    return ""
class Perceptron:
    """Binary perceptron trained by randomly sampled stochastic updates.

    Labels are supplied as {0, 1} and mapped internally to {-1, +1};
    training stops after a run of correctly classified random picks or when
    the update budget is exhausted.
    """
    def __init__(self, alpha=0.01, updata_count_total=3000, nochange_count_limit=600):
        """
        :param alpha: gradient-descent learning rate
        :param updata_count_total: cap on the number of weight updates
        :param nochange_count_limit: consecutive correctly classified random
                                     picks required before declaring convergence
        """
        self.alpha = alpha
        self.updata_count_total = updata_count_total
        self.nochange_count_limit = nochange_count_limit

    def train(self, train_X, train_y):
        """Fit w (a column vector) and b on train_X / train_y (labels 0/1)."""
        n_samples, n_features = train_X.shape[0], train_X.shape[1]
        # initialise the w and b parameters
        self.w = np.zeros((n_features, 1))
        self.b = 0
        updates = 0
        streak = 0  # consecutive correctly classified random picks
        while streak <= self.nochange_count_limit:
            # draw one candidate sample at random (kept as a 1 x n row)
            pick = random.randint(0, n_samples - 1)
            row = train_X[[pick]]
            # map the 0/1 label onto -1/+1
            signed_y = 1 if train_y[pick] == 1 else -1
            # margin test: -y_i * (x_i . w + b) >= 0 means misclassified
            if -(np.dot(row, self.w)[0][0] + self.b) * signed_y >= 0:
                # misclassified point: update w and b
                streak = 0
                self.w = self.w + self.alpha * signed_y * np.reshape(row, (n_features, 1))
                self.b += self.alpha * signed_y
                updates += 1
                if updates > self.updata_count_total:
                    break
            else:
                streak += 1

    def predict(self, sample_x):
        """Return 1 if x . w + b > 0 else 0."""
        return int(np.dot(sample_x, self.w) + self.b > 0)
if __name__ == '__main__':
    # Train the perceptron on the Titanic data and report accuracy against
    # the gender_submission labels.
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    test_y = pd.read_csv(test_result_file)
    full_data = [train, test]
    # train.apply(axis=0)
    full_data = data_feature_engineering(full_data, age_default_avg=True, one_hot=False)
    train_X, train_y, test_X = data_feature_select(full_data)
    perce = Perceptron(alpha=0.01,updata_count_total = 3000)
    perce.train(train_X,train_y)
    results = []
    for test_sample in test_X:
        result = perce.predict(test_sample)
        results.append(result)
    y_test_true = np.array(test_y['Survived'])
    print ("the Perceptron model Accuracy : %.4g" % metrics.accuracy_score(y_pred=results, y_true=y_test_true))
================================================
FILE: ML/REDAME.md
================================================
ML文件夹说明
================================================
FILE: ML/TensorDemo/NN_tf.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time : 2018/2/28 下午4:22
# @Author : ComeOnJian
# @File : NN_tf.py
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# step 1: network hyper-parameters
in_unit = 784 # input dimension (flattened 28x28 MNIST images)
h1_unit = 300 # width of the first hidden layer
learningrate = 0.05 # learning rate passed to the optimiser
dropout_keep_prob = 0.75 # keep probability for dropout; surviving activations are scaled by 1/keep_prob
batch_size = 100 # mini-batch size for gradient descent
max_iter = 3000 # number of training iterations
sava_dir = '../data/' # where result data is stored
log_dir = '../log/' # TensorBoard log directory
def variable_summeries(var):
    """
    :param var: Tensor, Attach a lot of summaries to a Tensor (for TensorBoard visualization).
    """
    with tf.name_scope('summeries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean',mean) # record the mean of the variable
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev',stddev)
        tf.summary.scalar('max',tf.reduce_max(var))
        tf.summary.scalar('min',tf.reduce_min(var))
        # record the distribution of the variable as a histogram
        tf.summary.histogram('histogram',var)
def weight_variable(shape):
    """Create a weight Variable of the given shape, initialised from a
    truncated normal distribution with stddev 0.1."""
    initial_value = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial_value, dtype=tf.float32)
def bias_variable(shape):
    """Create a bias Variable of the given shape, initialised to 0.1."""
    initial_value = tf.constant(0.1, shape=shape)
    return tf.Variable(initial_value)
def nn_layer(input_tensor,input_dim,output_dim,layer_name,act=tf.nn.relu):
    """
    Build one fully connected layer (weights, bias, activation) with
    TensorBoard summaries attached to every component.
    :param input_tensor: input features
    :param input_dim: dimensionality of the input
    :param output_dim: number of neurons in this layer
    :param layer_name: name scope for the layer
    :param act: activation function applied to the linear output
    """
    # open the layer's name scope
    with tf.name_scope(layer_name):
        # initialise the weights and record their statistics
        with tf.name_scope('weights'):
            weight = weight_variable([input_dim,output_dim])
            variable_summeries(weight)# track how the weights evolve
        with tf.name_scope('bias'):
            bias = bias_variable([output_dim])
            variable_summeries(bias)
        with tf.name_scope('linear_compute'):
            preact = tf.matmul(input_tensor,weight)+bias
            tf.summary.histogram('linear',preact)
        activeation = act(preact,name = 'activation')
        tf.summary.histogram('activation',activeation)
        return activeation
# def set_computer_Graph():
#
# """
# 设计tf的计算图,并返回
# :return:
# """
# tf.reset_default_graph()
# train_graph = tf.Graph()
#
# with train_graph.as_default():
#
# # step 3.1 设置算法模型中的输入,使用占位符,占用输入的数据(什么情况下使用占位符,什么情况下设置tf变量)
#
# train_x = tf.placeholder(dtype=tf.float32,shape=[None,in_unit],name = 'train_x')
# train_y = tf.placeholder(dtype=tf.float32,shape=[None,10],name = 'train_y')
#
# # step 3.2构造神经网络
#
# # 创建第一层隐藏层
# hidden_layer1 = nn_layer(train_x,input_dim=in_unit,output_dim=h1_unit,layer_name='hider_layer1',act=tf.nn.relu)
#
# #在第一层隐藏层上创建一层 dropout层 —— 随机关闭一些hidden_layer1的神经元
# with tf.name_scope('dropout'):
# dropout_prob = tf.placeholder(dtype=tf.float32, name='dropout_prob')
# tf.summary.scalar('dropout_keep_probability',dropout_prob)
# hidden_layer1_dropout = tf.nn.dropout(hidden_layer1,dropout_prob)
#
# #创建输出层,包括10个类别,输出层的输入是hidden_layer1_dropout,输出是[1,10]
# y = nn_layer(hidden_layer1_dropout,h1_unit,10,layer_name='out_layer',act=tf.identity)
#
# # step 3.3 创建损失函数
#
# with tf.name_scope('loss'):
# cross_entropy_diff = tf.nn.softmax_cross_entropy_with_logits(labels=train_y,logits=y)
#
# with tf.name_scope('total'):
# cross_entropy = tf.reduce_mean(cross_entropy_diff)
# tf.summary.scalar('loss',cross_entropy)
#
# # step 3.4 选择优化器训练并计算准确率
# optimizer = tf.train.AdamOptimizer(learning_rate=learningrate)
# train_op = optimizer.minimize(cross_entropy)
#
# with tf.name_scope('accuracy'):
# with tf.name_scope('correct_prediction'):
# correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(train_y,1))
# with tf.name_scope('accuracy'):
# accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
# tf.summary.scalar('accuracy',accuracy)
# return train_graph,train_op,accuracy
#
if __name__ == '__main__':
    # step 2: load the MNIST data (labels one-hot encoded)
    mnist = input_data.read_data_sets('./MNIST_data/',one_hot=True)
    # step 3: build the TF computation graph
    tf.reset_default_graph()
    train_graph = tf.Graph()
    with train_graph.as_default():
        # step 3.1: placeholders for the model inputs (placeholders are fed
        # at run time, unlike tf.Variable which holds trainable state)
        train_x = tf.placeholder(dtype=tf.float32,shape=[None,in_unit],name = 'train_x')
        train_y = tf.placeholder(dtype=tf.float32,shape=[None,10],name = 'train_y')
        # step 3.2: build the network
        # first hidden layer
        hidden_layer1 = nn_layer(train_x,input_dim=in_unit,output_dim=h1_unit,layer_name='hider_layer1',act=tf.nn.relu)
        # dropout layer on top of the first hidden layer -- randomly disables
        # some of hidden_layer1's neurons during training
        with tf.name_scope('dropout'):
            dropout_prob = tf.placeholder(dtype=tf.float32, name='dropout_prob')
            tf.summary.scalar('dropout_keep_probability',dropout_prob)
            hidden_layer1_dropout = tf.nn.dropout(hidden_layer1,dropout_prob)
        # output layer with 10 classes; identity activation (raw logits)
        y = nn_layer(hidden_layer1_dropout,h1_unit,10,layer_name='out_layer',act=tf.identity)
        # step 3.3: loss
        with tf.name_scope('loss'):
            cross_entropy_diff = tf.nn.softmax_cross_entropy_with_logits(labels=train_y, logits=y)
            with tf.name_scope('total'):
                cross_entropy = tf.reduce_mean(cross_entropy_diff)
                tf.summary.scalar('loss', cross_entropy)
        # step 3.4: optimiser, training op and accuracy
        optimizer = tf.train.AdamOptimizer(learning_rate=learningrate)
        train_op = optimizer.minimize(cross_entropy)
        with tf.name_scope('accuracy'):
            with tf.name_scope('correct_prediction'):
                correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(train_y, 1))
            with tf.name_scope('accuracy'):
                accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
                tf.summary.scalar('accuracy', accuracy)
    session = tf.InteractiveSession(graph=train_graph)
    # step 4: merge all summaries and initialise all variables
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(log_dir+'/train',graph=train_graph)
    test_writer = tf.summary.FileWriter(log_dir+'/test',graph=train_graph)
    tf.global_variables_initializer().run()
    # step 5: train the model and log to TensorBoard
    for iter in range(max_iter):
        trainx_batch_x,train_batch_y = mnist.train.next_batch(batch_size)
        # every 10 iterations, evaluate accuracy (dropout disabled: keep=1.0)
        if iter % 10 == 0:
            summmary,acc,loss = session.run([merged,accuracy,cross_entropy],feed_dict={train_x:trainx_batch_x,train_y:train_batch_y,dropout_prob:1.0})
            test_writer.add_summary(summmary,iter)# write to the log
            print('loss at step %s: %s'%(iter,loss))
            print('Accuracy at step %s: %s'%(iter,acc))
        else:
            # NOTE(review): iter % 100 == 0 implies iter % 10 == 0, which is
            # handled above, so this tracing branch can never execute --
            # probably meant `iter % 100 == 99`; confirm.
            if iter % 100 == 0:
                # record full runtime trace information for this step
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                # pass the options and metadata protos into the run so that
                # per-node time and memory cost is captured
                summmary,_ = session.run([merged,train_op],
                                         feed_dict={train_x:trainx_batch_x,train_y:train_batch_y,dropout_prob:dropout_keep_prob},
                                         options=run_options,
                                         run_metadata=run_metadata)
                # write the per-node runtime info to the log file
                train_writer.add_run_metadata(run_metadata,'step %d' % iter)
                train_writer.add_summary(summmary,iter)
                pass
            else:
                summmary,_ = session.run([merged,train_op],feed_dict={train_x:trainx_batch_x,train_y:train_batch_y,dropout_prob:dropout_keep_prob})
                train_writer.add_summary(summmary,iter)
    train_writer.close()
    test_writer.close()
    session.close()
================================================
FILE: ML/TensorDemo/README.md
================================================
#### TensorDemo文件说明
##### [TensorFlow实现多层感知机及可视化训练过程中的数据记录](http://blog.csdn.net/u014732537/article/details/79412672) NN_tf.py
================================================
FILE: ML/data/adult/adult_deal_value.data
================================================
1,5,0,3,2,8,3,0,1,1,1,0,1,0
1,1,0,3,0,4,2,0,1,1,1,2,1,0
1,0,3,2,1,6,3,0,1,1,1,0,1,0
1,0,2,1,0,6,2,4,1,1,1,0,1,0
0,0,0,3,0,5,0,4,0,1,1,0,1,0
1,0,10,3,0,4,0,0,0,1,1,0,1,0
1,0,7,1,5,2,3,4,0,1,1,2,1,0
1,1,3,2,0,4,2,0,1,1,1,1,1,1
0,0,10,3,2,5,3,0,0,1,1,1,1,1
1,0,0,3,0,4,2,0,1,1,1,0,1,1
1,0,1,2,0,4,2,4,1,1,1,1,1,1
0,5,0,3,0,5,2,1,1,1,1,0,1,1
0,0,0,3,2,8,1,0,0,1,1,2,1,0
0,0,5,2,2,3,3,4,1,1,1,1,1,0
0,0,8,0,0,10,2,2,1,1,1,1,1,0
0,1,3,2,2,9,1,0,1,1,1,2,1,0
0,0,3,2,2,7,5,0,1,1,1,0,1,0
1,0,2,1,0,3,2,0,1,1,1,1,1,0
1,1,10,3,1,4,5,0,0,1,1,1,1,1
1,0,13,3,0,5,2,0,1,1,1,1,1,1
2,0,3,2,3,2,5,4,0,1,1,2,1,0
0,3,7,1,0,9,2,4,1,1,1,0,1,0
1,0,2,1,0,10,2,0,1,1,1,0,1,0
2,0,3,2,1,0,5,0,0,1,1,0,1,0
2,4,0,3,0,0,2,0,1,1,1,0,1,1
0,0,3,2,2,1,1,0,1,1,1,0,1,0
1,0,3,2,1,4,3,0,1,1,1,1,1,0
1,0,3,2,0,1,2,0,1,1,1,0,1,0
0,4,5,2,2,12,3,0,1,1,1,1,1,0
0,0,1,2,2,3,1,4,1,1,1,1,1,0
1,0,0,3,1,4,1,0,1,1,1,0,1,0
0,3,1,2,0,8,1,0,1,1,1,0,1,0
0,5,1,2,0,2,2,4,1,1,1,2,1,0
1,0,2,1,2,7,5,0,1,1,1,0,1,0
0,0,1,2,2,7,1,0,1,1,1,0,1,0
0,0,3,2,6,8,0,0,0,1,1,2,1,0
1,1,5,2,0,5,2,0,1,1,1,0,1,0
0,0,7,1,0,7,2,0,1,1,1,1,1,0
1,1,0,3,0,5,2,0,1,1,1,0,1,0
0,0,0,3,0,0,2,0,1,1,1,1,1,0
1,0,3,2,3,8,5,0,0,1,1,0,1,0
0,0,3,2,2,6,3,0,1,1,1,2,1,0
2,3,0,3,0,5,2,4,1,1,1,0,1,1
1,0,3,2,0,7,2,0,1,1,1,2,1,0
1,0,10,3,1,4,5,0,0,1,1,0,1,0
1,5,6,2,0,1,2,0,1,1,1,0,1,0
0,0,6,2,2,5,3,0,1,1,1,1,1,0
0,0,1,2,0,4,0,3,0,1,1,0,1,0
1,0,4,3,0,5,0,0,0,1,1,1,1,1
1,3,0,3,1,4,3,0,1,1,1,1,1,1
1,2,3,2,1,4,3,0,1,1,1,1,1,0
1,0,1,2,0,0,2,0,1,1,1,0,1,1
1,0,14,0,0,7,2,0,1,1,1,0,1,0
0,0,6,2,0,2,2,0,1,1,1,0,1,0
1,0,3,2,0,8,2,0,1,1,1,1,1,0
0,0,3,2,0,7,2,0,1,1,1,0,1,0
0,0,0,3,0,3,2,0,1,1,1,0,1,0
1,0,3,2,0,10,2,0,1,1,1,0,1,0
1,0,13,3,0,5,2,0,1,1,1,1,1,1
0,0,1,2,1,0,3,0,1,1,1,1,1,0
1,0,3,2,0,1,2,0,1,1,1,0,1,0
0,0,1,2,1,8,3,0,0,1,1,0,1,0
1,0,3,2,0,8,0,0,0,1,1,0,1,1
1,2,1,2,0,4,2,0,1,1,1,1,1,1
0,0,1,2,2,5,1,0,1,1,1,2,1,0
0,0,0,3,3,3,1,4,0,1,1,0,1,0
0,1,0,3,0,3,2,0,1,1,1,1,1,1
0,0,1,2,2,7,3,0,1,1,1,0,1,0
3,0,1,2,0,5,4,0,1,1,1,2,1,0
0,0,3,2,2,2,1,0,1,1,1,0,1,0
1,0,5,2,0,8,2,0,1,1,1,0,1,0
0,0,2,1,2,2,1,0,0,1,1,2,1,0
0,4,8,0,0,9,2,0,1,1,1,0,1,0
0,0,3,2,2,3,3,0,1,1,1,2,1,0
1,0,0,3,0,2,2,0,1,1,1,0,1,0
1,0,3,2,0,2,0,0,0,1,1,0,1,0
2,0,3,2,0,3,2,0,1,1,1,1,1,0
1,0,3,2,1,1,3,0,0,1,1,0,1,1
1,0,3,2,1,3,1,0,0,1,1,2,1,0
1,4,3,2,0,12,2,0,1,1,1,0,1,1
0,0,10,3,0,5,2,0,1,1,1,1,1,0
0,0,7,1,2,3,3,0,1,1,1,0,1,0
1,3,13,3,2,5,3,0,0,1,1,1,1,1
2,0,6,2,0,5,2,0,1,1,1,0,1,0
1,0,1,2,1,1,5,0,0,1,1,0,1,0
0,0,1,2,1,7,5,4,0,1,1,2,1,0
0,4,0,3,0,12,2,0,1,1,1,0,1,1
0,4,1,2,2,6,3,0,1,1,1,1,1,0
1,1,13,3,0,5,2,0,1,1,1,1,1,1
1,0,1,2,0,3,2,0,1,1,1,1,1,1
1,0,5,2,1,4,5,0,0,1,1,0,1,0
0,3,3,2,2,2,1,4,1,1,1,0,1,0
3,0,10,3,0,4,2,0,1,1,1,0,1,1
1,0,0,3,0,4,2,0,1,1,1,1,1,1
1,1,10,3,2,5,3,0,0,1,1,1,1,0
0,0,1,2,2,0,1,0,0,1,1,0,1,0
0,0,3,2,2,1,3,0,1,1,1,0,1,0
0,2,3,2,0,1,2,0,1,1,1,0,1,1
0,0,2,1,2,6,1,0,1,1,1,0,1,0
0,4,3,2,2,8,3,0,0,1,1,0,1,0
1,0,3,2,0,6,2,0,1,1,1,0,1,0
0,0,0,3,2,3,4,0,1,1,1,0,1,0
1,0,4,3,0,5,2,0,1,1,1,0,1,1
2,1,3,2,0,2,2,0,1,1,1,1,1,1
0,0,1,2,0,3,0,0,0,1,1,2,1,0
1,0,3,2,2,7,1,0,0,1,1,0,1,0
1,0,7,1,0,6,2,0,1,1,1,1,1,0
2,2,1,2,0,3,2,0,1,1,1,1,1,0
1,4,6,2,0,1,2,4,1,1,1,0,1,1
2,0,1,2,0,3,2,0,1,1,1,1,1,0
0,0,3,2,2,1,3,0,1,1,1,1,1,0
0,0,1,2,2,2,1,0,0,1,1,0,1,0
1,0,0,3,0,4,2,0,1,1,1,1,1,0
0,0,0,3,2,4,1,4,1,1,1,0,1,0
0,5,0,3,0,5,2,0,1,1,1,1,1,1
0,0,1,2,2,2,1,4,1,1,1,2,1,0
1,0,0,3,0,4,0,0,0,1,1,0,1,1
0,0,1,2,2,8,1,0,0,1,1,2,1,0
0,0,5,2,0,8,2,0,1,1,1,0,1,1
1,0,1,2,1,1,3,0,1,1,1,0,1,0
0,0,5,2,2,3,3,0,0,1,1,1,1,0
0,0,3,2,2,2,3,0,0,1,1,0,1,0
1,1,3,2,0,1,2,0,1,1,1,2,1,0
1,0,0,3,0,3,2,0,1,1,1,1,1,0
1,0,5,2,1,0,3,4,0,1,1,2,1,0
1,3,10,3,0,5,2,0,1,1,1,0,1,1
1,1,3,2,0,1,2,0,1,1,1,1,1,0
0,0,6,2,0,8,0,0,0,1,1,2,1,0
0,0,1,2,2,3,1,0,1,1,1,2,1,0
1,0,1,2,0,1,2,1,1,1,1,0,1,1
2,2,3,2,0,1,2,0,1,1,1,0,1,0
0,0,1,2,2,3,1,0,0,1,1,0,1,0
0,0,1,2,2,2,1,0,1,1,1,2,1,0
1,0,6,2,2,5,3,0,0,1,1,0,1,1
2,0,1,2,2,0,4,0,1,1,1,0,1,0
0,0,3,2,2,10,5,4,0,1,1,2,1,0
0,0,1,2,0,2,2,0,1,1,1,2,1,0
1,0,3,2,4,2,5,0,0,1,1,2,1,0
2,0,2,1,0,1,2,0,1,1,1,0,1,0
1,4,1,2,1,8,5,0,0,1,1,2,1,0
0,0,3,2,2,8,3,0,0,1,1,0,1,0
0,0,0,3,2,4,4,1,0,1,1,0,1,0
0,0,3,2,2,1,1,0,1,1,1,2,1,0
1,0,3,2,0,1,2,0,1,1,1,1,1,0
0,0,0,3,2,6,3,0,1,1,1,0,1,0
2,1,1,2,3,3,5,4,1,1,1,2,1,0
0,0,3,2,1,1,3,0,1,1,1,1,1,0
1,0,0,3,3,2,4,4,1,1,1,1,1,0
0,0,3,2,2,2,5,0,0,1,1,0,1,0
1,2,10,3,1,4,5,1,0,1,1,0,1,0
0,0,3,2,2,4,3,1,0,1,1,0,1,0
1,1,10,3,0,3,2,0,1,1,1,0,1,1
1,0,1,2,1,2,5,0,0,1,1,0,1,0
1,3,5,2,2,4,3,0,1,1,1,1,1,0
1,5,10,3,4,12,5,0,1,1,1,0,1,0
0,0,2,1,2,6,1,0,1,1,1,2,1,0
2,4,6,2,4,5,3,0,0,1,1,2,1,0
0,0,3,2,2,2,3,0,1,1,1,0,1,0
0,5,3,2,0,12,2,0,1,1,1,0,1,0
1,0,1,2,0,3,2,0,1,1,1,0,1,1
0,0,0,3,2,1,3,0,1,1,1,1,1,0
0,5,0,3,0,5,2,0,1,1,1,0,1,1
0,0,3,2,5,3,5,0,0,1,1,2,1,0
0,0,3,2,2,1,1,4,1,1,1,0,1,0
0,0,3,2,2,6,3,4,1,1,1,0,1,0
0,0,3,2,2,2,1,0,1,1,1,2,1,0
0,0,0,3,0,5,2,0,1,1,1,1,1,0
1,4,1,2,0,4,2,4,1,1,1,2,1,1
1,0,1,2,1,8,3,0,0,1,1,0,1,0
0,0,1,2,2,1,3,4,1,1,1,1,1,0
1,0,8,0,0,7,2,0,1,1,1,0,1,1
1,3,1,2,0,8,2,0,1,1,1,1,1,1
0,0,3,2,0,8,2,4,1,1,1,0,1,0
1,1,0,3,1,4,3,0,0,1,1,0,1,0
0,0,10,3,0,5,0,0,0,1,1,1,1,0
2,5,13,3,0,5,2,0,1,1,1,2,1,1
1,0,1,2,1,7,3,4,0,1,1,2,1,0
0,0,1,2,0,7,2,0,1,1,1,0,1,0
1,4,0,3,0,1,2,4,1,1,1,0,1,0
1,0,1,2,0,0,2,0,1,1,1,1,1,0
1,3,0,3,0,0,2,0,1,1,1,2,1,0
0,0,14,0,0,10,4,0,1,1,1,0,1,0
1,0,1,2,1,1,3,0,1,1,1,0,1,0
1,3,10,3,2,4,3,0,0,1,1,1,1,1
0,0,10,3,0,5,4,0,1,1,1,0,1,1
0,1,3,2,2,3,3,0,1,1,1,0,1,0
0,0,10,3,2,5,3,0,0,1,1,0,1,0
1,0,0,3,0,4,2,0,1,1,1,1,1,1
1,4,1,2,2,5,3,0,0,1,1,0,1,0
1,5,3,2,1,8,5,0,0,1,1,2,1,0
0,0,2,1,2,3,1,0,0,1,1,0,1,0
1,0,0,3,0,2,2,4,1,1,1,2,1,1
0,0,3,2,1,1,3,0,1,1,1,1,1,0
2,2,3,2,0,3,0,0,0,1,1,2,1,1
0,0,2,1,2,3,1,0,0,1,1,2,1,0
1,4,3,2,0,10,2,4,1,1,1,0,1,0
1,0,3,2,0,1,2,0,1,1,1,0,1,1
0,0,1,2,2,3,3,2,0,1,1,0,1,0
2,0,3,2,0,1,2,1,1,1,1,0,1,0
2,2,8,0,0,7,2,0,1,1,1,0,1,1
1,0,0,3,2,4,3,4,1,1,1,1,1,1
1,0,5,2,1,3,3,0,0,1,1,0,1,0
1,0,0,3,0,3,2,0,1,1,1,1,1,0
1,0,0,3,0,5,2,0,1,1,1,1,1,0
0,0,12,1,2,1,3,0,1,1,1,0,1,0
0,0,1,2,0,6,2,0,1,1,1,0,1,0
3,0,3,2,2,2,3,4,1,1,1,0,1,0
2,0,3,2,0,8,2,4,1,1,1,2,1,0
1,4,15,0,2,7,3,0,0,1,1,2,1,0
0,0,3,2,2,3,1,0,0,1,1,0,1,0
2,1,3,2,0,9,2,0,1,1,1,1,1,0
3,0,6,2,4,8,3,0,0,1,1,2,1,0
2,0,3,2,1,8,3,0,0,1,1,0,1,0
0,0,2,1,3,10,3,4,1,1,1,0,1,0
1,0,3,2,1,3,5,4,0,1,1,2,1,0
0,0,1,2,2,8,3,0,1,1,1,1,1,0
0,0,1,2,1,1,4,3,0,1,1,0,1,0
0,0,4,3,2,5,3,0,1,1,1,1,1,1
2,5,3,2,0,2,2,0,1,1,1,0,1,0
1,5,1,2,2,8,3,0,0,1,1,2,1,0
1,0,0,3,0,4,2,0,1,1,1,1,1,1
1,4,10,3,0,4,2,0,1,1,1,1,1,1
0,0,3,2,2,10,1,0,1,1,1,0,1,0
1,0,0,3,0,5,2,0,1,1,1,0,1,1
1,1,3,2,0,2,0,0,0,1,1,2,1,0
0,5,5,2,0,12,2,0,1,1,1,1,1,0
1,0,1,2,1,1,3,0,1,1,1,0,1,0
2,0,3,2,0,8,2,0,1,1,1,1,1,1
0,0,3,2,2,2,3,0,0,1,1,2,1,0
1,0,5,2,0,0,2,0,1,1,1,1,1,1
1,0,4,3,0,5,0,0,0,1,1,0,1,1
1,0,0,3,0,8,0,4,0,1,1,0,1,0
2,4,12,1,4,2,5,4,0,1,1,2,1,0
0,4,1,2,2,8,1,0,1,1,1,0,1,0
2,0,1,2,2,10,3,0,1,1,1,2,1,0
1,1,3,2,0,9,2,1,1,1,1,0,1,1
0,0,3,2,2,2,1,0,0,1,1,0,1,0
0,0,3,2,3,2,3,0,0,1,1,2,1,0
1,0,6,2,4,4,3,0,0,1,1,1,1,0
1,0,8,0,1,1,3,0,1,1,1,0,1,0
1,0,0,3,1,5,1,0,0,1,1,0,1,0
1,0,3,2,0,3,2,0,1,1,1,1,1,0
0,0,2,1,2,2,1,0,1,1,1,2,1,0
2,0,1,2,0,3,2,0,1,1,1,0,1,0
0,0,2,1,0,1,2,0,1,1,1,0,1,0
1,2,1,2,0,4,2,1,1,1,1,1,1,1
2,0,3,2,0,4,2,0,1,1,1,0,1,1
0,0,0,3,2,5,3,0,0,1,1,1,1,0
0,0,3,2,0,1,0,0,0,1,1,0,1,1
2,0,3,2,0,4,2,0,1,1,1,0,1,1
0,0,7,1,2,2,1,0,1,1,1,2,1,0
1,1,10,3,0,9,2,0,1,1,1,1,1,0
0,4,10,3,2,5,1,0,1,1,1,2,1,0
0,0,3,2,1,5,1,0,0,1,1,0,1,0
1,0,3,2,0,6,2,4,1,1,1,0,1,0
1,0,1,2,0,1,2,0,1,1,1,0,1,1
0,0,1,2,2,4,3,0,0,1,1,0,1,0
0,0,1,2,2,0,1,0,0,1,1,0,1,0
0,0,1,2,0,7,0,0,0,1,1,1,1,1
0,0,0,3,2,5,3,0,0,1,1,2,1,0
1,4,1,2,0,1,2,0,1,1,1,0,1,1
0,0,5,2,1,3,5,4,0,1,1,1,1,0
2,0,0,3,1,5,3,0,0,1,1,1,1,0
0,0,3,2,2,6,1,0,1,1,1,0,1,0
2,3,10,3,0,3,2,0,1,1,1,0,1,1
1,0,4,3,0,5,2,0,1,1,1,1,1,1
1,0,1,2,1,2,3,0,1,1,1,1,1,0
0,0,0,3,2,4,3,0,0,1,1,0,1,0
1,0,8,0,0,1,2,0,1,1,1,0,1,0
1,0,0,3,0,5,2,0,1,1,1,1,1,0
0,0,3,2,1,5,3,0,0,1,1,0,1,0
0,4,0,3,2,12,3,0,1,1,1,1,1,0
0,0,1,2,2,3,1,0,1,1,1,1,1,0
0,0,0,3,2,0,5,4,0,1,1,0,1,0
2,0,3,2,0,10,2,0,1,1,1,2,1,0
1,2,0,3,1,3,3,0,0,1,1,1,1,0
0,0,3,2,2,1,1,0,1,1,1,0,1,0
1,0,3,2,1,3,3,0,1,1,1,1,1,0
1,0,0,3,0,8,2,0,1,1,1,0,1,1
0,0,1,2,2,2,3,1,0,1,1,2,1,0
0,0,3,2,0,10,2,0,1,1,1,1,1,1
2,0,3,2,0,3,2,0,1,1,1,0,1,1
0,0,1,2,0,8,0,1,0,1,1,0,1,0
1,0,3,2,1,8,3,0,0,1,1,0,1,0
0,0,5,2,2,0,3,0,1,1,1,1,1,0
1,0,5,2,2,5,3,0,0,1,1,2,1,0
0,0,5,2,0,8,2,0,1,1,1,0,1,1
1,2,1,2,0,9,2,0,1,1,1,1,1,1
0,0,0,3,2,3,3,0,1,1,1,0,1,0
0,0,10,3,0,4,2,1,1,1,1,2,1,1
1,1,0,3,1,5,3,4,0,1,1,0,1,1
0,0,1,2,2,3,1,0,1,1,1,2,1,0
0,0,1,2,2,7,3,0,1,1,1,0,1,0
3,1,1,2,0,3,2,0,1,1,1,0,1,0
0,0,14,0,0,7,4,0,0,1,1,0,1,0
0,0,1,2,2,0,3,0,0,1,1,0,1,0
2,0,3,2,4,2,3,0,0,1,1,2,1,0
1,1,3,2,0,4,2,0,1,1,1,0,1,0
1,0,1,2,0,1,2,0,1,1,1,1,1,0
0,0,3,2,1,10,3,0,1,1,1,1,1,0
0,0,2,1,2,4,3,0,0,1,1,0,1,0
3,1,10,3,0,1,2,0,1,1,1,2,1,0
2,1,3,2,0,9,2,0,1,1,1,1,1,0
1,0,1,2,0,5,0,0,0,1,1,1,1,1
2,1,3,2,0,6,2,0,1,1,1,0,1,0
2,0,3,2,2,10,3,0,1,1,1,1,1,0
1,0,1,2,2,1,3,4,1,1,1,1,1,0
0,0,0,3,2,8,3,0,1,1,1,0,1,0
1,0,6,2,2,2,3,0,1,1,1,0,1,0
2,4,12,1,0,8,2,0,1,1,1,0,1,0
1,0,3,2,0,10,2,0,1,1,1,0,1,0
0,0,14,0,2,2,4,0,1,1,1,1,1,0
0,0,3,2,2,1,4,0,1,1,1,0,1,0
1,0,10,3,2,5,3,0,1,1,1,1,1,0
0,0,3,2,2,3,5,4,1,1,1,2,1,0
1,0,3,2,0,2,2,4,1,1,1,0,1,0
0,0,5,2,2,3,3,0,1,1,1,0,1,0
1,0,6,2,0,1,2,0,1,1,1,0,1,1
1,5,3,2,0,12,2,0,1,1,1,0,1,1
0,0,3,2,2,9,3,0,1,1,1,2,1,0
0,5,1,2,2,12,1,4,0,1,1,0,1,0
1,1,0,3,2,3,3,0,1,1,1,1,1,0
2,0,5,2,5,2,3,0,1,1,1,2,1,0
0,0,0,3,2,5,1,0,1,1,1,2,1,0
0,0,0,3,0,4,2,0,1,1,1,0,1,1
0,0,3,2,0,2,2,0,1,1,1,0,1,0
0,0,0,3,0,4,2,0,1,1,1,0,1,1
1,4,2,1,0,10,2,0,1,1,1,2,1,0
1,3,3,2,0,8,2,0,1,1,1,0,1,1
2,0,2,1,4,2,5,3,1,1,1,0,1,0
1,0,6,2,0,0,2,0,1,1,1,0,1,1
1,0,3,2,0,8,0,0,0,1,1,2,1,0
0,1,7,1,0,1,2,0,1,1,1,0,1,0
1,0,12,1,0,2,0,0,0,1,1,2,1,0
2,0,10,3,0,5,2,0,1,1,1,0,1,1
1,0,0,3,0,10,2,0,1,1,1,1,1,0
1,0,0,3,1,4,3,0,1,1,1,1,1,1
0,0,3,2,5,3,3,0,1,1,1,1,1,0
0,0,3,2,0,1,2,0,1,1,1,0,1,0
2,1,1,2,0,3,2,0,1,1,1,1,1,0
0,0,5,2,2,3,3,0,0,1,1,2,1,0
2,0,0,3,1,4,3,0,0,1,1,1,1,1
1,0,3,2,1,3,5,0,0,1,1,0,1,0
0,3,1,2,2,8,5,0,0,1,1,2,1,0
0,0,2,1,2,2,1,0,1,1,1,2,1,0
1,5,6,2,0,7,2,0,1,1,1,2,1,1
1,0,3,2,0,7,2,4,1,1,1,0,1,0
1,0,3,2,2,1,3,4,1,1,1,0,1,0
0,4,1,2,2,8,1,0,0,1,1,2,1,0
0,0,3,2,0,1,0,0,0,1,1,0,1,0
0,0,1,2,2,3,3,0,1,1,1,0,1,0
1,1,10,3,1,4,3,0,1,1,1,2,1,0
1,0,3,2,0,3,2,0,1,1,1,1,1,0
2,0,3,2,1,10,3,0,1,1,1,0,1,0
1,0,14,0,1,1,3,0,0,1,1,1,1,0
0,0,1,2,0,1,2,0,1,1,1,0,1,0
1,0,1,2,1,3,3,0,0,1,1,0,1,0
1,4,1,2,0,10,0,0,0,1,1,0,1,0
0,0,3,2,2,2,3,0,0,1,1,2,1,0
0,0,3,2,3,7,3,0,1,1,1,0,1,0
1,0,1,2,0,8,0,0,0,1,1,2,1,1
0,0,1,2,2,8,1,0,0,1,1,2,1,0
0,0,3,2,2,2,4,0,0,1,1,0,1,0
0,0,3,2,0,2,0,0,0,1,1,0,1,1
0,0,0,3,2,1,3,0,1,1,1,1,1,0
1,1,5,2,3,1,3,0,1,1,1,0,1,0
0,0,3,2,2,2,3,0,0,1,1,2,1,0
1,0,1,2,0,0,2,0,1,1,1,0,1,0
1,0,1,2,1,6,5,0,0,1,1,0,1,0
0,0,0,3,0,4,2,0,1,1,1,1,1,1
1,2,0,3,0,4,2,0,1,1,1,1,1,1
0,0,1,2,2,6,1,0,1,1,1,0,1,0
0,0,1,2,2,2,3,0,0,1,1,0,1,0
1,5,0,3,0,12,2,0,1,1,1,0,1,1
0,0,6,2,1,0,3,0,0,1,1,1,1,0
0,0,8,0,0,6,2,3,1,1,1,0,1,0
1,0,10,3,0,4,2,0,1,1,1,1,1,1
0,0,3,2,0,8,2,0,1,1,1,0,1,0
1,1,3,2,0,1,2,0,1,1,1,2,1,0
0,0,1,2,0,4,2,0,1,1,1,1,1,0
1,4,1,2,1,2,3,4,0,1,1,2,1,0
0,0,3,2,2,6,5,4,0,1,1,0,1,0
0,0,1,2,1,4,5,0,0,1,1,0,1,0
1,0,13,3,0,4,2,0,1,1,1,1,1,1
0,5,1,2,0,7,2,0,1,1,1,0,1,0
0,0,9,1,1,1,3,0,1,1,1,1,1,1
1,0,11,0,0,7,2,0,1,1,1,0,1,0
0,0,2,1,2,2,1,0,0,1,1,2,1,0
0,4,2,1,3,7,3,4,1,1,1,0,1,0
0,5,3,2,1,10,5,0,0,1,1,0,1,0
0,0,1,2,0,3,2,0,1,1,1,2,1,0
0,0,2,1,2,6,1,0,1,1,1,2,1,0
0,5,0,3,0,4,0,0,0,1,1,0,1,1
0,0,3,2,1,9,3,0,1,1,1,0,1,0
0,0,12,1,2,1,5,0,0,1,1,2,1,0
0,0,3,2,0,10,2,0,1,1,1,1,1,0
1,3,10,3,2,5,3,0,1,1,1,0,1,0
0,0,3,2,2,6,3,0,1,1,1,1,1,0
2,2,3,2,0,4,2,0,1,1,1,0,1,1
1,4,10,3,0,5,2,0,1,1,1,0,1,0
1,1,0,3,5,5,3,0,0,1,1,0,1,0
0,0,5,2,0,2,0,0,0,1,1,2,1,1
0,0,1,2,2,1,1,1,1,1,1,0,1,0
0,0,10,3,0,5,2,0,1,1,1,1,1,1
0,4,3,2,2,9,3,4,1,1,1,1,1,0
0,4,1,2,2,12,3,0,0,1,1,0,1,0
0,0,1,2,0,8,2,0,1,1,1,0,1,0
1,4,1,2,0,8,0,0,0,1,1,0,1,0
0,0,0,3,2,5,3,0,1,1,1,0,1,0
1,0,3,2,1,8,3,0,0,1,1,2,1,0
0,3,3,2,2,13,1,0,1,1,1,0,1,0
1,0,1,2,0,5,2,0,1,1,1,0,1,1
0,0,7,1,2,1,4,0,1,1,1,0,1,0
0,0,1,2,2,3,3,0,1,1,1,2,1,0
1,0,3,2,4,4,5,4,0,1,1,0,1,0
1,0,3,2,0,1,2,2,1,1,1,0,1,0
0,0,3,2,0,7,2,0,1,1,1,0,1,0
0,0,3,2,2,7,1,4,0,1,1,0,1,0
1,2,1,2,0,1,2,0,1,1,1,1,1,1
2,0,3,2,0,1,2,0,1,1,1,0,1,1
1,1,3,2,0,3,0,0,0,1,1,0,1,1
1,0,6,2,0,7,2,1,1,1,1,0,1,1
0,0,3,2,0,4,0,0,0,1,1,0,1,0
1,0,1,2,0,10,2,4,1,1,1,1,1,0
0,0,3,2,0,10,2,0,1,1,1,1,1,0
1,0,5,2,2,9,1,0,1,1,1,0,1,0
2,4,3,2,0,10,2,4,1,1,1,0,1,0
0,0,0,3,0,5,2,0,1,1,1,1,1,0
1,0,11,0,5,1,5,0,1,1,1,0,1,0
0,0,0,3,2,0,3,0,0,1,1,2,1,0
1,0,3,2,0,8,0,0,0,1,1,1,1,0
1,0,3,2,1,10,5,0,1,1,1,0,1,0
1,0,1,2,0,10,2,0,1,1,1,0,1,0
1,0,10,3,0,4,2,0,1,1,1,1,1,1
0,0,0,3,0,0,0,0,0,1,1,2,1,1
2,0,1,2,2,5,5,0,1,1,1,0,1,1
0,0,3,2,2,1,3,0,1,1,1,0,1,0
0,0,3,2,2,10,3,0,1,1,1,1,1,0
2,0,6,2,1,2,3,0,0,1,1,2,1,0
1,0,0,3,2,1,3,0,0,1,1,0,1,0
0,0,7,1,2,2,3,0,1,1,1,2,1,0
0,0,1,2,2,6,1,0,1,1,1,2,1,0
0,0,12,1,2,2,3,0,1,1,1,0,1,0
0,0,3,2,2,2,5,0,0,1,1,2,1,0
0,0,5,2
gitextract_sl_11rs9/ ├── Financial_NLP/ │ └── final_demo/ │ ├── README.md │ ├── __init__.py │ ├── data_prepare.py │ ├── extract_feature.py │ ├── main.py │ ├── train_model.py │ └── util.py ├── ML/ │ ├── DecisionTree/ │ │ ├── Boosting.py │ │ ├── RandomForest.py │ │ ├── decision_tree.py │ │ ├── titanic_data_analy.ipynb │ │ ├── tree_main.py │ │ └── xgboost_demo.py │ ├── LogisticRegression_MEM/ │ │ └── LR_MEM_demo.py │ ├── Perce_SVM/ │ │ ├── SVM.py │ │ └── perceptron.py │ ├── REDAME.md │ ├── TensorDemo/ │ │ ├── NN_tf.py │ │ └── README.md │ └── data/ │ └── adult/ │ ├── adult_deal_value.data │ └── adult_deal_value.test ├── NLP/ │ ├── AutoTitle_F/ │ │ ├── configs/ │ │ │ ├── make_vocab.yaml │ │ │ ├── predict.yaml │ │ │ ├── process.yaml │ │ │ └── train_model.yaml │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── batcher.py │ │ │ ├── data.py │ │ │ └── data_processed.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── adaptive.py │ │ │ ├── loss.py │ │ │ ├── lr_scheduler.py │ │ │ ├── optims.py │ │ │ └── seq2seq.py │ │ ├── pycocoevalcap/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bleu/ │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── bleu.py │ │ │ │ └── bleu_scorer.py │ │ │ ├── cider/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cider.py │ │ │ │ └── cider_scorer.py │ │ │ ├── license.txt │ │ │ ├── meteor/ │ │ │ │ ├── __init__.py │ │ │ │ ├── meteor-1.5.jar │ │ │ │ ├── meteor.py │ │ │ │ └── tests/ │ │ │ │ └── test_meteor.py │ │ │ ├── rouge/ │ │ │ │ ├── __init__.py │ │ │ │ └── rouge.py │ │ │ └── test_eval.py │ │ ├── submit.py │ │ └── train.py │ ├── GAN&NLP.md │ ├── Multi_Label/ │ │ └── ShengCe/ │ │ ├── generate_submit.py │ │ ├── train_model.py │ │ └── util.py │ ├── Seq2Seq/ │ │ ├── __init__.py │ │ ├── data_util.py │ │ ├── main.py │ │ ├── seq2seq_attn.py │ │ ├── seq2seq_model.py │ │ └── text_summarizer.py │ ├── Text_CNN/ │ │ ├── process_data.py │ │ ├── text_cnn_main.py │ │ └── text_cnn_model.py │ └── daguan/ │ ├── README.md │ ├── data_analy.py │ ├── lr_scheduler.py │ ├── main.py │ 
├── model.py │ ├── optims.py │ └── predict.py └── README.md
SYMBOL INDEX (539 symbols across 47 files)
FILE: Financial_NLP/final_demo/data_prepare.py
function preprocessing (line 40) | def preprocessing(data_df,fname):
function seg_sentence (line 73) | def seg_sentence(sentence,stop_words):
function load_stopwordslist (line 87) | def load_stopwordslist(filepath):
function load_spelling_corrections (line 97) | def load_spelling_corrections(filepath):
function load_doubt_words (line 102) | def load_doubt_words(filpath):
function transform_other_word (line 112) | def transform_other_word(str_text,reg_dict):
function strip_why (line 117) | def strip_why(rawq):
function strip_how (line 123) | def strip_how(rawq):
function process_save_embedding_wv (line 128) | def process_save_embedding_wv(nfile,type = 1,isStore_ids = False):
function process_save_char_embedding_wv (line 224) | def process_save_char_embedding_wv(isStore_ids = False):
function pre_train_w2v (line 281) | def pre_train_w2v(binary = False):
function pre_train_char_w2v (line 313) | def pre_train_char_w2v(binary = False):
FILE: Financial_NLP/final_demo/extract_feature.py
function before_extract_feature_load_data (line 18) | def before_extract_feature_load_data(train_file,test_file):
function after_extract_feature_save_data (line 28) | def after_extract_feature_save_data(feature_train,feature_test,col_names...
function extract_feature_siamese_lstm_manDist (line 33) | def extract_feature_siamese_lstm_manDist():
function extract_feature_siamese_lstm_attention (line 151) | def extract_feature_siamese_lstm_attention():
function extract_feature_siamese_lstm_dssm (line 268) | def extract_feature_siamese_lstm_dssm():
function extract_feature_siamese_lstm_manDist_char (line 409) | def extract_feature_siamese_lstm_manDist_char():
function extract_sentece_length_diff (line 498) | def extract_sentece_length_diff():
function extract_edit_distance (line 532) | def extract_edit_distance():
function extract_ngram (line 580) | def extract_ngram(max_ngram = 3):
function extract_sentence_diff_same (line 660) | def extract_sentence_diff_same():
function extract_doubt_sim (line 713) | def extract_doubt_sim():
function extract_sentence_exist_topic (line 750) | def extract_sentence_exist_topic():
function extract_word_embedding_sim (line 793) | def extract_word_embedding_sim(w2v_model_path = 'train_all_data.bigram'):
FILE: Financial_NLP/final_demo/main.py
function star_process (line 14) | def star_process(X_train,y_train,X_test,y_test):
FILE: Financial_NLP/final_demo/train_model.py
class AttentionLayer (line 20) | class AttentionLayer(Layer):
method __init__ (line 21) | def __init__(self,step_dim,W_regularizer=None, b_regularizer=None,
method compute_mask (line 40) | def compute_mask(self, inputs, mask=None):
method build (line 44) | def build(self, input_shape):
method call (line 63) | def call(self, x, mask = None):
method compute_output_shape (line 90) | def compute_output_shape(self, input_shape):
method get_config (line 94) | def get_config(self):
class ManDist (line 99) | class ManDist(Layer):
method __init__ (line 104) | def __init__(self, **kwargs):
method build (line 109) | def build(self, input_shape):
method call (line 118) | def call(self, inputs, **kwargs):
method compute_output_shape (line 140) | def compute_output_shape(self, input_shape):
class ConsDist (line 155) | class ConsDist(Layer):
method __init__ (line 160) | def __init__(self, **kwargs):
method build (line 165) | def build(self, input_shape):
method call (line 174) | def call(self, inputs, **kwargs):
method compute_output_shape (line 186) | def compute_output_shape(self, input_shape):
class AttentionLayer1 (line 201) | class AttentionLayer1(Layer):
method __init__ (line 202) | def __init__(self, **kwargs):
method build (line 207) | def build(self, input_shape):
method call (line 216) | def call(self, inputs, **kwargs):
method compute_output_shape (line 232) | def compute_output_shape(self, input_shape):
function precision (line 247) | def precision(y_true, y_pred):
function recall (line 264) | def recall(y_true, y_pred):
function fbeta_score (line 281) | def fbeta_score(y_t, y_p, beta=1):
function contrastive_loss (line 310) | def contrastive_loss(y_true,y_pred):
function create_siamese_lstm_attention_model (line 325) | def create_siamese_lstm_attention_model(embedding_matrix,model_param,emb...
function create_siamese_lstm_ManDistance_model (line 387) | def create_siamese_lstm_ManDistance_model(embedding_matrix,model_param,e...
function create_siamese_lstm_dssm_mdoel (line 439) | def create_siamese_lstm_dssm_mdoel(embedding_matrix,embedding_word_matri...
function predict (line 585) | def predict(model,X_s1,X_s2):
function predict1 (line 597) | def predict1(model,X_s1,X_s2,X_s1_char,X_s2_char):
class StackingBaseClassifier (line 614) | class StackingBaseClassifier(object):
method train (line 616) | def train(self, x_train, y_train, x_val=None, y_val=None):
method predict (line 627) | def predict(self, model, x_test):
method get_model_out (line 630) | def get_model_out(self, x_train, y_train, x_test, n_fold=5):
class GussianNBClassifier (line 664) | class GussianNBClassifier(StackingBaseClassifier):
method __init__ (line 665) | def __init__(self):
method train (line 669) | def train(self, x_train, y_train, x_val, y_val):
method predict (line 675) | def predict(self, model, x_test):
class RFClassifer (line 716) | class RFClassifer(StackingBaseClassifier):
method train (line 717) | def train(self, x_train, y_train, x_val, y_val):
method predict (line 729) | def predict(self, model, x_test):
class LogisicClassifier (line 733) | class LogisicClassifier(StackingBaseClassifier):
method train (line 734) | def train(self, x_train, y_train, x_val=None, y_val=None):
method predict (line 739) | def predict(self, model, x_test):
class DecisionClassifier (line 743) | class DecisionClassifier(StackingBaseClassifier):
method train (line 744) | def train(self, x_train, y_train, x_val=None, y_val=None):
method predict (line 749) | def predict(self, model, x_test):
FILE: Financial_NLP/final_demo/util.py
class Project (line 17) | class Project:
method __init__ (line 19) | def __init__(self,root_dir):
method _init_all_paths (line 23) | def _init_all_paths(self):
method root_dir (line 35) | def root_dir(self):
method data_dir (line 39) | def data_dir(self):
method aux_dir (line 43) | def aux_dir(self):
method preprocessed_data_dir (line 47) | def preprocessed_data_dir(self):
method features_dir (line 51) | def features_dir(self):
method trained_model_dir (line 55) | def trained_model_dir(self):
method temp_dir (line 59) | def temp_dir(self):
method init (line 69) | def init(root_dir,create_dir = True):
method load_feature_lists (line 100) | def load_feature_lists(self,feature_lists):
method save_features (line 132) | def save_features(self,train_fea,test_fea,fea_names,feature_name):
method save_feature_names (line 144) | def save_feature_names(self,fea_names,feature_name):
method save_feature_col_list (line 148) | def save_feature_col_list(self,fea_data,type,feature_name):
method _load_feature_col_name (line 152) | def _load_feature_col_name(self,nfile):
method _load_feature_data (line 157) | def _load_feature_data(self,nfile):
method _save_feature_data (line 161) | def _save_feature_data(self,data,nfile):
method _save_feature_col_name (line 165) | def _save_feature_col_name(self,col_names,nfile):
method save (line 169) | def save(self,nfile,object):
method load (line 172) | def load(self,nfile):
FILE: ML/DecisionTree/Boosting.py
class ThresholdClass (line 13) | class ThresholdClass():
method __init__ (line 20) | def __init__(self,train_x,train_y,w):
method train (line 37) | def train(self):
method predict (line 63) | def predict(self,feature_value):
method _get_V_list (line 73) | def _get_V_list(self,X):
class AdaBoostBasic (line 90) | class AdaBoostBasic():
method __init__ (line 91) | def __init__(self,M = 10):
method _init_parameters_ (line 95) | def _init_parameters_(self,train_x,train_y):
method train (line 110) | def train(self,train_x,train_y):
method predict (line 149) | def predict(self,sample):
method _sigmoid (line 165) | def _sigmoid(self,x):
method _get_alpha (line 169) | def _get_alpha(self,error_rate_iter):
method _get_Z_m (line 173) | def _get_Z_m(self,alpha,feature_index,classifler):
method _updata_w (line 186) | def _updata_w(self,alpha,feature_index,classifler,Zm):
class AdaBoostTree (line 191) | class AdaBoostTree():
method __init__ (line 195) | def __init__(self):
class AdaBoostGDBT (line 198) | class AdaBoostGDBT():
FILE: ML/DecisionTree/RandomForest.py
class TypeClass (line 19) | class TypeClass(Enum):
function randomforst (line 23) | def randomforst(D,N,M,K,type_class):
function randomforst_predict (line 58) | def randomforst_predict(trees,test_x, type_class):
function get_max_count_array (line 74) | def get_max_count_array(arr):
FILE: ML/DecisionTree/decision_tree.py
function adult_label (line 17) | def adult_label(x):
function adult_age (line 24) | def adult_age(x):
function adult_workclass (line 35) | def adult_workclass(x):
function adult_education (line 39) | def adult_education(x):
function adult_education_num (line 45) | def adult_education_num(x):
function adult_marital_status (line 56) | def adult_marital_status(x):
function adult_occupation (line 60) | def adult_occupation(x):
function adult_relationship (line 66) | def adult_relationship(x):
function adult_race (line 69) | def adult_race(x):
function adult_sex (line 72) | def adult_sex(x):
function adult_capital_gain_loss (line 75) | def adult_capital_gain_loss(x):
function adult_hours_per_week (line 82) | def adult_hours_per_week(x):
function adult_native_country (line 91) | def adult_native_country(x):
function transToValues (line 98) | def transToValues(file_name,save_name,remove_unKnowValue=True,remove_dup...
function load_data (line 138) | def load_data(flods):
function devide_feature_value (line 169) | def devide_feature_value(series,D):
function calc_ent (line 200) | def calc_ent(D):
function calc_condition_ent (line 215) | def calc_condition_ent(A,D):
function calc_ent_gain (line 230) | def calc_ent_gain(A,D):
function calc_ent_gain_rate (line 240) | def calc_ent_gain_rate(A,D):
function calc_gini (line 260) | def calc_gini(D):
function calc_condition_gini (line 276) | def calc_condition_gini(A,D,a):
function eval (line 302) | def eval(y_true,y_predict):
class TreeNode (line 343) | class TreeNode():
method __init__ (line 347) | def __init__(self):
method add_next_node (line 358) | def add_next_node(self,node):
method add_attr_and_value (line 363) | def add_attr_and_value(self,attr_name,attr_value):
class DecisionTree (line 373) | class DecisionTree():
method __init__ (line 374) | def __init__(self,mode):
method train (line 380) | def train(self,train_x,train_y,epsoion):
method predict (line 392) | def predict(self,test_x):
method _create_tree (line 423) | def _create_tree(self,X,y,feature_list,epsoion,start_node,Vi=-1):
method _select_feature (line 487) | def _select_feature(self,X,y,feature_list,select_func):
method _get_max_count_array (line 501) | def _get_max_count_array(self,arr):
FILE: ML/DecisionTree/xgboost_demo.py
function data_feature_engineering (line 24) | def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
function data_feature_select (line 129) | def data_feature_select(full_data):
function Passenger_sex (line 143) | def Passenger_sex(x):
function Passenger_Embarked (line 146) | def Passenger_Embarked(x):
function Passenger_TitleName (line 149) | def Passenger_TitleName(x):
function get_title_name (line 152) | def get_title_name(name):
function modelfit (line 158) | def modelfit(alg,dtrain_x,dtrain_y,useTrainCV=True,cv_flods=5,early_stop...
function xgboost_change_param (line 190) | def xgboost_change_param(train_X,train_y):
FILE: ML/LogisticRegression_MEM/LR_MEM_demo.py
function data_feature_engineering (line 20) | def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
function data_feature_select (line 125) | def data_feature_select(full_data):
function Passenger_sex (line 139) | def Passenger_sex(x):
function Passenger_Embarked (line 142) | def Passenger_Embarked(x):
function Passenger_TitleName (line 145) | def Passenger_TitleName(x):
function get_title_name (line 148) | def get_title_name(name):
class LR (line 154) | class LR:
method __init__ (line 155) | def __init__(self,iterNum = 2000,learn_late = 0.005):
method train (line 159) | def train(self,train_X,train_y):
method predict (line 189) | def predict(self,sample):
class MEM (line 199) | class MEM:
method __init__ (line 202) | def __init__(self,iterNum = 2000,epsion = 0.01):
method train (line 206) | def train(self,train_X,train_y):
method change_sample_feature_name (line 259) | def change_sample_feature_name(self,samples):
method _cal_Pxy_Px (line 269) | def _cal_Pxy_Px(self):
method _cal_EPxy (line 285) | def _cal_EPxy(self):
method _f2id (line 293) | def _f2id(self):
method _cal_Pw (line 301) | def _cal_Pw(self,X,y):
method _cal_Gw (line 321) | def _cal_Gw(self):
method _cal_g_l2 (line 348) | def _cal_g_l2(self):
method _liear_search (line 352) | def _liear_search(self,p_k):
FILE: ML/Perce_SVM/SVM.py
function data_feature_engineering (line 24) | def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
function data_feature_select (line 129) | def data_feature_select(full_data):
function Passenger_sex (line 143) | def Passenger_sex(x):
function Passenger_Embarked (line 146) | def Passenger_Embarked(x):
function Passenger_TitleName (line 149) | def Passenger_TitleName(x):
function Passenger_Survived (line 152) | def Passenger_Survived(x):
function get_title_name (line 155) | def get_title_name(name):
class SVM (line 161) | class SVM():
method __init__ (line 162) | def __init__(self,kernal,maxIter,C,epsilon,sigma = 0.001):
method train (line 176) | def train(self,train_X,train_y):
method predict (line 193) | def predict(self,test_x):
method _smo (line 201) | def _smo(self):
method _calE (line 258) | def _calE(self,sample,y):
method _calLH (line 263) | def _calLH(self,a,j,i):
method _kernel (line 271) | def _kernel(self,X_i,X_j):
method _chooseJ (line 288) | def _chooseJ(self,i,E_i):
method _randJ (line 316) | def _randJ(self,i):
FILE: ML/Perce_SVM/perceptron.py
function data_feature_engineering (line 17) | def data_feature_engineering(full_data,age_default_avg=True,one_hot=True):
function data_feature_select (line 122) | def data_feature_select(full_data):
function Passenger_sex (line 136) | def Passenger_sex(x):
function Passenger_Embarked (line 139) | def Passenger_Embarked(x):
function Passenger_TitleName (line 142) | def Passenger_TitleName(x):
function get_title_name (line 145) | def get_title_name(name):
class Perceptron (line 150) | class Perceptron:
method __init__ (line 151) | def __init__(self,alpha = 0.01,updata_count_total = 3000,nochange_coun...
method train (line 161) | def train(self,train_X,train_y):
method predict (line 200) | def predict(self,sample_x):
FILE: ML/TensorDemo/NN_tf.py
function variable_summeries (line 25) | def variable_summeries(var):
function weight_variable (line 42) | def weight_variable(shape):
function bias_variable (line 48) | def bias_variable(shape):
function nn_layer (line 55) | def nn_layer(input_tensor,input_dim,output_dim,layer_name,act=tf.nn.relu):
FILE: NLP/AutoTitle_F/data/batcher.py
class Example (line 20) | class Example(object):
method __init__ (line 22) | def __init__(self, article, abstract_sentence, vocab, config):
method get_dec_inp_seqs (line 59) | def get_dec_inp_seqs(self, sequence, max_len, start_id, stop_id):
method pad_decoder_inp (line 69) | def pad_decoder_inp(self, max_len, pad_id):
method pad_encoder_input (line 75) | def pad_encoder_input(self, max_len, pad_id, pointer_gen=True):
class Batch (line 82) | class Batch(object):
method __init__ (line 83) | def __init__(self, example_list, batch_size):
method init_encoder_seq (line 92) | def init_encoder_seq(self, example_list ,pointer_gen = True):
method init_decoder_seq (line 123) | def init_decoder_seq(self, example_list):
method store_orig_strings (line 139) | def store_orig_strings(self, example_list):
class DocDataset (line 145) | class DocDataset(torch_data.Dataset):
method __init__ (line 147) | def __init__(self, path, vocab, config):
method __getitem__ (line 169) | def __getitem__(self, index):
method __len__ (line 172) | def __len__(self):
method drpout (line 175) | def drpout(self,text, p = 0.5):
method shuffle (line 184) | def shuffle(self, text):
function padding (line 188) | def padding(example_list):
function get_loader (line 195) | def get_loader(dataset, batch_size, shuffle, num_workers, mode='train'):
function get_input_from_batch (line 210) | def get_input_from_batch(batch, use_cuda, use_point_gen = True, use_cove...
function get_temp_vocab (line 249) | def get_temp_vocab(config):
function build_vaildation_set (line 278) | def build_vaildation_set():
function main (line 296) | def main():
FILE: NLP/AutoTitle_F/data/data.py
class Vocab (line 21) | class Vocab(object):
method __init__ (line 23) | def __init__(self, vocab_nfile, max_size=None):
method word2id (line 56) | def word2id(self, word):
method id2word (line 61) | def id2word(self, word_id):
method size (line 66) | def size(self):
method build_vectors (line 69) | def build_vectors(self, pre_word_embedding_path, dim , unk_init=torch....
function article2ids (line 88) | def article2ids(article_words, vocab):
function abstract2ids (line 103) | def abstract2ids(abstract_words, vocab, article_oovs):
function outputids2words (line 118) | def outputids2words(id_list, vocab, article_oovs):
function abstract2sents (line 134) | def abstract2sents(abstract):
function show_art_oovs (line 145) | def show_art_oovs(article, vocab):
function show_abs_oovs (line 152) | def show_abs_oovs(abstract, vocab, article_oovs):
FILE: NLP/AutoTitle_F/data/data_processed.py
function transform_other_word (line 24) | def transform_other_word(str_text,reg_dict):
function clean_text (line 29) | def clean_text(text, contractions):
function pre_word_token (line 38) | def pre_word_token(df, config, test = False, lower = True, is_make_title...
function pre_sentence_token (line 103) | def pre_sentence_token(df, lower=True, makevocab=True):
function word_tokenizer (line 161) | def word_tokenizer(text):
function sentence_tokenizer (line 169) | def sentence_tokenizer(text):
FILE: NLP/AutoTitle_F/models/adaptive.py
class AdaptiveLogSoftmaxWithLoss (line 15) | class AdaptiveLogSoftmaxWithLoss(nn.Module):
method __init__ (line 87) | def __init__(self, in_features, n_classes, cutoffs, div_value=4., head...
method reset_parameters (line 127) | def reset_parameters(self):
method forward (line 133) | def forward(self, input, target):
method _get_full_log_prob (line 188) | def _get_full_log_prob(self, input, head_output):
method log_prob (line 206) | def log_prob(self, input):
method predict (line 222) | def predict(self, input):
FILE: NLP/AutoTitle_F/models/loss.py
function criterion (line 12) | def criterion(tgt_vocab_size, use_cuda):
function adaptive_criterion (line 23) | def adaptive_criterion(config, use_cuda):
function ml_criterion (line 34) | def ml_criterion(hidden_outputs, decoder, targets, criterion, sim_score=0):
function ml_criterion_memory_efficiency (line 50) | def ml_criterion_memory_efficiency(hidden_outputs, decoder, targets, cri...
function ml_criterion_sampled_loss (line 69) | def ml_criterion_sampled_loss(hidden_outputs, decoder, targets, config, ...
function ml_criterion_adaptive_sampled_loss (line 73) | def ml_criterion_adaptive_sampled_loss(hidden_outputs, decoder, targets,...
function rl_criterion (line 88) | def rl_criterion():
FILE: NLP/AutoTitle_F/models/lr_scheduler.py
class _LRScheduler (line 11) | class _LRScheduler(object):
method __init__ (line 12) | def __init__(self, optimizer, last_epoch=-1):
method get_lr (line 29) | def get_lr(self):
method step (line 32) | def step(self, epoch=None):
class LambdaLR (line 40) | class LambdaLR(_LRScheduler):
method __init__ (line 59) | def __init__(self, optimizer, lr_lambda, last_epoch=-1):
method get_lr (line 71) | def get_lr(self):
class StepLR (line 76) | class StepLR(_LRScheduler):
method __init__ (line 99) | def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):
method get_lr (line 104) | def get_lr(self):
class MultiStepLR (line 109) | class MultiStepLR(_LRScheduler):
method __init__ (line 131) | def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):
method get_lr (line 139) | def get_lr(self):
class ExponentialLR (line 144) | class ExponentialLR(_LRScheduler):
method __init__ (line 153) | def __init__(self, optimizer, gamma, last_epoch=-1):
method get_lr (line 157) | def get_lr(self):
class CosineAnnealingLR (line 162) | class CosineAnnealingLR(_LRScheduler):
method __init__ (line 182) | def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1):
method get_lr (line 187) | def get_lr(self):
class ReduceLROnPlateau (line 193) | class ReduceLROnPlateau(object):
method __init__ (line 236) | def __init__(self, optimizer, mode='min', factor=0.1, patience=10,
method _reset (line 274) | def _reset(self):
method step (line 280) | def step(self, metrics, epoch=None):
method _reduce_lr (line 301) | def _reduce_lr(self, epoch):
method in_cooldown (line 312) | def in_cooldown(self):
method _init_is_better (line 315) | def _init_is_better(self, mode, threshold, threshold_mode):
FILE: NLP/AutoTitle_F/models/optims.py
class Optim (line 12) | class Optim(object):
method set_parameters (line 14) | def set_parameters(self, params):
method __init__ (line 32) | def __init__(self, method, lr, max_grad_norm, lr_decay=1, start_decay_...
method step (line 42) | def step(self):
method updateLearningRate (line 49) | def updateLearningRate(self, ppl, epoch):
class AdagradCustom (line 62) | class AdagradCustom(Optimizer):
method __init__ (line 79) | def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initia...
method share_memory (line 90) | def share_memory(self):
method step (line 96) | def step(self, closure=None):
FILE: NLP/AutoTitle_F/models/seq2seq.py
function init_lstm_wt (line 20) | def init_lstm_wt(lstm, init_v):
function init_linear_wt (line 34) | def init_linear_wt(linear, init_v):
function init_wt_normal (line 40) | def init_wt_normal(wt, init_v):
function init_wt_unif (line 43) | def init_wt_unif(wt, init_v):
class Encoder (line 48) | class Encoder(nn.Module):
method __init__ (line 49) | def __init__(self, config, embedding_weight = None):
method forward (line 61) | def forward(self, input, seq_lens):
class ReduceState (line 73) | class ReduceState(nn.Module):
method __init__ (line 74) | def __init__(self,config):
method forward (line 83) | def forward(self, hidden):
class Decoder (line 93) | class Decoder(nn.Module):
method __init__ (line 94) | def __init__(self, config, embedding_weight = None):
method forward (line 118) | def forward(self, y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
class Attention (line 170) | class Attention(nn.Module):
method __init__ (line 171) | def __init__(self,config):
method forward (line 181) | def forward(self, s_t_hat, h, enc_padding_mask, coverage):
class Global_Attention (line 218) | class Global_Attention(nn.Module):
class Beam (line 223) | class Beam(object):
method __init__ (line 224) | def __init__(self, tokens, log_probs, state, context, coverage):
method extend (line 231) | def extend(self, token, log_prob, state, context, coverage):
method latest_token (line 239) | def latest_token(self):
method avg_log_prob (line 243) | def avg_log_prob(self):
class seq2seq (line 247) | class seq2seq(nn.Module):
method __init__ (line 248) | def __init__(self, config, use_cuda, pretrain = None):
method forward (line 265) | def forward(self, sources, sources_lengths, source_padding_mask, sourc...
method rein_forward (line 305) | def rein_forward(self, sources, sources_lengths, source_padding_mask, ...
method beam_sample (line 350) | def beam_sample(self, sources, sources_lengths, source_padding_mask, s...
method sort_beams (line 443) | def sort_beams(self, beams):
method variable_to_init_id (line 447) | def variable_to_init_id(self, v):
FILE: NLP/AutoTitle_F/pycocoevalcap/bleu/bleu.py
class Bleu (line 14) | class Bleu:
method __init__ (line 15) | def __init__(self, n=4):
method compute_score (line 21) | def compute_score(self,hyps, refs):
method method (line 40) | def method(self):
FILE: NLP/AutoTitle_F/pycocoevalcap/bleu/bleu_scorer.py
function precook (line 27) | def precook(s, n=4, out=False):
function cook_refs (line 39) | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "av...
function cook_test (line 64) | def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
class BleuScorer (line 90) | class BleuScorer(object):
method copy (line 97) | def copy(self):
method __init__ (line 105) | def __init__(self, test=None, refs=None, n=4, special_reflen=None):
method cook_append (line 114) | def cook_append(self, test, refs):
method ratio (line 127) | def ratio(self, option=None):
method score_ratio (line 131) | def score_ratio(self, option=None):
method score_ratio_str (line 135) | def score_ratio_str(self, option=None):
method reflen (line 138) | def reflen(self, option=None):
method testlen (line 142) | def testlen(self, option=None):
method retest (line 146) | def retest(self, new_test):
method rescore (line 157) | def rescore(self, new_test):
method size (line 162) | def size(self):
method __iadd__ (line 166) | def __iadd__(self, other):
method compatible (line 180) | def compatible(self, other):
method single_reflen (line 183) | def single_reflen(self, option="average"):
method _single_reflen (line 186) | def _single_reflen(self, reflens, option=None, testlen=None):
method recompute_score (line 199) | def recompute_score(self, option=None, verbose=0):
method compute_score (line 203) | def compute_score(self, option=None, verbose=0):
FILE: NLP/AutoTitle_F/pycocoevalcap/cider/cider.py
class Cider (line 13) | class Cider:
method __init__ (line 18) | def __init__(self, test=None, refs=None, n=4, sigma=6.0):
method compute_score (line 24) | def compute_score(self, hyps, refs):
method method (line 52) | def method(self):
FILE: NLP/AutoTitle_F/pycocoevalcap/cider/cider_scorer.py
function precook (line 13) | def precook(s, n=4, out=False):
function cook_refs (line 30) | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
function cook_test (line 40) | def cook_test(test, n=4):
class CiderScorer (line 49) | class CiderScorer(object):
method copy (line 53) | def copy(self):
method __init__ (line 60) | def __init__(self, test=None, refs=None, n=4, sigma=6.0):
method cook_append (line 70) | def cook_append(self, test, refs):
method size (line 80) | def size(self):
method __iadd__ (line 84) | def __iadd__(self, other):
method compute_doc_freq (line 95) | def compute_doc_freq(self):
method compute_cider (line 108) | def compute_cider(self):
method compute_score (line 185) | def compute_score(self, option=None, verbose=0):
FILE: NLP/AutoTitle_F/pycocoevalcap/meteor/meteor.py
function enc (line 16) | def enc(s):
function dec (line 19) | def dec(s):
class Meteor (line 23) | class Meteor:
method __init__ (line 25) | def __init__(self):
method close (line 42) | def close(self):
method compute_score (line 53) | def compute_score(self, gts, res):
method method (line 73) | def method(self):
method _stat (line 76) | def _stat(self, hypothesis_str, reference_list):
method _score (line 85) | def _score(self, hypothesis_str, reference_list):
method __del__ (line 103) | def __del__(self):
FILE: NLP/AutoTitle_F/pycocoevalcap/meteor/tests/test_meteor.py
class TestMeteor (line 9) | class TestMeteor(unittest.TestCase):
method test_compute_score (line 10) | def test_compute_score(self):
FILE: NLP/AutoTitle_F/pycocoevalcap/rouge/rouge.py
function my_lcs (line 13) | def my_lcs(string, sub):
class Rouge (line 36) | class Rouge():
method __init__ (line 41) | def __init__(self):
method calc_score (line 45) | def calc_score(self, candidate, refs):
method compute_score (line 78) | def compute_score(self, hyps, refs):
method method (line 94) | def method(self):
FILE: NLP/AutoTitle_F/pycocoevalcap/test_eval.py
class EvalCap (line 14) | class EvalCap:
function setEval (line 38) | def setEval(score, method):
FILE: NLP/AutoTitle_F/submit.py
function predict (line 80) | def predict(model, test_loader):
FILE: NLP/AutoTitle_F/train.py
function train (line 112) | def train(epoch):
function eval (line 159) | def eval(epoch):
function main (line 197) | def main():
function save_model (line 213) | def save_model(path):
FILE: NLP/Multi_Label/ShengCe/generate_submit.py
function extract_title_doc (line 33) | def extract_title_doc(id,title, stop_words, words_prob):
function get_key_from_title (line 85) | def get_key_from_title(id, num, title_key_words_sorted, words_prob):
function get_key_from_doc (line 133) | def get_key_from_doc(num, doc_key_words_sorted):
function main (line 155) | def main():
FILE: NLP/Multi_Label/ShengCe/train_model.py
function get_topic_sim (line 43) | def get_topic_sim(model, word_corpus, doc_corpus):
function build_topic_model (line 49) | def build_topic_model(data, stop_nfile, num_topics = 50, save = True):
function bulid_candidate_words (line 91) | def bulid_candidate_words(data, stop_nfile, candidate_save_path, candida...
function build_train_sample (line 222) | def build_train_sample(data, candidate_words_list):
function train_class_model (line 271) | def train_class_model(features,labels):
function get_test_sample_prob (line 292) | def get_test_sample_prob(model, test_data, test_candidates):
FILE: NLP/Multi_Label/ShengCe/util.py
function stopwordslist (line 18) | def stopwordslist(filepath):
function get_shuming (line 22) | def get_shuming(text):
function is_contain_char_num (line 27) | def is_contain_char_num(text):
function get_count_sentence (line 37) | def get_count_sentence(word,sentence):
function cal_sim (line 47) | def cal_sim(word_topic_prob, doc_topic_prob):
function save_object (line 59) | def save_object(obj,nfile):
function load_object (line 63) | def load_object(nfile):
FILE: NLP/Seq2Seq/data_util.py
function strQ2B (line 31) | def strQ2B(ustring):
function strB2Q (line 44) | def strB2Q(ustring):
function remove_url (line 58) | def remove_url(text):
function remove_pun_ch (line 62) | def remove_pun_ch(text):
function remove_pun_en (line 65) | def remove_pun_en(text):
function remove_date (line 69) | def remove_date(text):
function remove_num (line 73) | def remove_num(text):
function remove_num_en (line 77) | def remove_num_en(text):
function remove_tag (line 81) | def remove_tag(text):
function get_title_content (line 90) | def get_title_content(content_fp,title_fp):
function basic_tokenizer (line 118) | def basic_tokenizer(sentence):
function jieba_tokenizer (line 125) | def jieba_tokenizer(sentence):
function create_vocab (line 129) | def create_vocab(vocabulary_path, data_path, max_vocabulary_size,
function initialize_vocabulary (line 164) | def initialize_vocabulary(vocabulary_path):
function sentence_to_token_ids (line 186) | def sentence_to_token_ids(sentence, vocabulary,
function data_to_token_ids (line 209) | def data_to_token_ids(data_path, target_path, vocabulary_path,
function get_train_dev_sets (line 232) | def get_train_dev_sets(data_content, data_title, train_rate, dev_rate,
function prepare_headline_data (line 280) | def prepare_headline_data(data_dir, vocabulary_size, tokenizer=None):
FILE: NLP/Seq2Seq/main.py
class ModelLoader (line 12) | class ModelLoader(object):
method __init__ (line 13) | def __init__(self):
method _init_model (line 19) | def _init_model(self,session):
method func_predict (line 23) | def func_predict(self,sentence):
FILE: NLP/Seq2Seq/seq2seq_attn.py
function _extract_argmax_and_embed (line 80) | def _extract_argmax_and_embed(embedding,
function rnn_decoder (line 110) | def rnn_decoder(decoder_inputs,
function basic_rnn_seq2seq (line 157) | def basic_rnn_seq2seq(encoder_inputs,
function tied_rnn_seq2seq (line 187) | def tied_rnn_seq2seq(encoder_inputs,
function embedding_rnn_decoder (line 230) | def embedding_rnn_decoder(decoder_inputs,
function embedding_rnn_seq2seq (line 298) | def embedding_rnn_seq2seq(encoder_inputs,
function embedding_tied_rnn_seq2seq (line 407) | def embedding_tied_rnn_seq2seq(encoder_inputs,
function attention_decoder (line 536) | def attention_decoder(decoder_inputs,
function embedding_attention_decoder (line 708) | def embedding_attention_decoder(decoder_inputs,
function embedding_attention_seq2seq (line 794) | def embedding_attention_seq2seq(encoder_inputs,
function one2many_rnn_seq2seq (line 922) | def one2many_rnn_seq2seq(encoder_inputs,
function sequence_loss_by_example (line 1037) | def sequence_loss_by_example(logits,
function sequence_loss (line 1086) | def sequence_loss(logits,
function model_with_buckets (line 1127) | def model_with_buckets(encoder_inputs,
FILE: NLP/Seq2Seq/seq2seq_model.py
class Seq2SeqModel (line 32) | class Seq2SeqModel(object):
method __init__ (line 47) | def __init__(self,
method step (line 199) | def step(self, session, encoder_inputs, decoder_inputs, target_weights,
method get_batch (line 258) | def get_batch(self, data, bucket_id):
FILE: NLP/Seq2Seq/text_summarizer.py
class LargeConfig (line 14) | class LargeConfig(object):
class MediumConfig (line 25) | class MediumConfig(object):
function create_model (line 64) | def create_model(session,forward_only):
function read_data (line 101) | def read_data(source_path, target_path, max_size=None):
function train (line 137) | def train():
function main (line 202) | def main():
FILE: NLP/Text_CNN/process_data.py
function load_binary_vec (line 22) | def load_binary_vec(fname, vocab):
function load_data_k_cv (line 52) | def load_data_k_cv(folder,cv=10,clear_flag=True):
function add_unexist_word_vec (line 95) | def add_unexist_word_vec(w2v,vocab):
function clean_string (line 105) | def clean_string(string,TREC=False):
function get_vec_by_sentence_list (line 121) | def get_vec_by_sentence_list(word_vecs,sentence_list,maxlen=56,values=0....
function get_index_by_sentence_list (line 140) | def get_index_by_sentence_list(word_ids,sentence_list,maxlen=56):
function pad_sentences (line 160) | def pad_sentences(data,maxlen=56,values=0.,vec_size = 300):
function get_train_test_data1 (line 172) | def get_train_test_data1(word_vecs,revs,cv_id=0,sent_length = 56,default...
function get_train_test_data2 (line 208) | def get_train_test_data2(word_ids,revs,cv_id=0,sent_length = 56):
function get_contrast (line 236) | def get_contrast(x):
function getWordsVect (line 239) | def getWordsVect(W):
FILE: NLP/Text_CNN/text_cnn_model.py
class TextCNN (line 14) | class TextCNN():
method __init__ (line 19) | def __init__(self,W_list,shuffer_falg, static_falg, filter_numbers, fi...
method train (line 116) | def train(self,train_x,train_y):
method validataion (line 141) | def validataion(self,test_x, test_y):
method close (line 156) | def close(self):
method __get_batchs (line 160) | def __get_batchs(self,Xs,Ys,batch_size):
method __add_conv_layer (line 165) | def __add_conv_layer(self,filter_size,filter_num):
method __variable_summeries (line 185) | def __variable_summeries(self,var):
FILE: NLP/daguan/data_analy.py
function get_stopwords (line 22) | def get_stopwords(docs, min_df, max_d):
function make_vocab (line 33) | def make_vocab(data_se, stop_words, max_size, type, isSave=True):
function save (line 99) | def save(nfile, obj):
function load (line 103) | def load(nfile):
function init_vocab (line 108) | def init_vocab(min_df, max_d, add_test=True, char_vocab_size = 10000, wo...
function pre_train_w2v (line 130) | def pre_train_w2v(all_text):
function sentence_to_indexs (line 137) | def sentence_to_indexs(sentences, dict_label2id, stop_words, max_documen...
function split_train_val (line 171) | def split_train_val(data, article_dicts, word_dicts, rate=0.7, isSave = ...
class dataset (line 199) | class dataset(torch_data.Dataset):
method __init__ (line 201) | def __init__(self, src_article, src_word, y):
method __getitem__ (line 207) | def __getitem__(self, index):
method __len__ (line 210) | def __len__(self):
function padding (line 213) | def padding(data):
function get_loader (line 249) | def get_loader(dataset, batch_size, shuffle, num_workers):
function to_categorical (line 258) | def to_categorical(y, num_classes=None):
class AttrDict (line 273) | class AttrDict(dict):
method __init__ (line 277) | def __init__(self, *args, **kwargs):
method __getattr__ (line 281) | def __getattr__(self, item):
method __setstate__ (line 286) | def __setstate__(self, state):
method __getstate__ (line 288) | def __getstate__(self):
function read_config (line 294) | def read_config(path):
FILE: NLP/daguan/lr_scheduler.py
class _LRScheduler (line 6) | class _LRScheduler(object):
method __init__ (line 7) | def __init__(self, optimizer, last_epoch=-1):
method get_lr (line 24) | def get_lr(self):
method step (line 27) | def step(self, epoch=None):
class LambdaLR (line 35) | class LambdaLR(_LRScheduler):
method __init__ (line 54) | def __init__(self, optimizer, lr_lambda, last_epoch=-1):
method get_lr (line 66) | def get_lr(self):
class StepLR (line 71) | class StepLR(_LRScheduler):
method __init__ (line 94) | def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):
method get_lr (line 99) | def get_lr(self):
class MultiStepLR (line 104) | class MultiStepLR(_LRScheduler):
method __init__ (line 126) | def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):
method get_lr (line 134) | def get_lr(self):
class ExponentialLR (line 139) | class ExponentialLR(_LRScheduler):
method __init__ (line 148) | def __init__(self, optimizer, gamma, last_epoch=-1):
method get_lr (line 152) | def get_lr(self):
class CosineAnnealingLR (line 157) | class CosineAnnealingLR(_LRScheduler):
method __init__ (line 177) | def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1):
method get_lr (line 182) | def get_lr(self):
class ReduceLROnPlateau (line 188) | class ReduceLROnPlateau(object):
method __init__ (line 231) | def __init__(self, optimizer, mode='min', factor=0.1, patience=10,
method _reset (line 269) | def _reset(self):
method step (line 275) | def step(self, metrics, epoch=None):
method _reduce_lr (line 296) | def _reduce_lr(self, epoch):
method in_cooldown (line 307) | def in_cooldown(self):
method _init_is_better (line 310) | def _init_is_better(self, mode, threshold, threshold_mode):
FILE: NLP/daguan/main.py
function train (line 106) | def train(epoch):
function eval (line 161) | def eval(epoch):
function save_model (line 204) | def save_model(path):
function get_metrics (line 215) | def get_metrics(y,y_pre):
FILE: NLP/daguan/model.py
class encoder_cnn (line 11) | class encoder_cnn(nn.Module):
method __init__ (line 15) | def __init__(self,config,filter_sizes,filter_nums,vocab_size,embedding...
method forward (line 39) | def forward(self, inputs):
class Text_WCCNN (line 71) | class Text_WCCNN(nn.Module):
method __init__ (line 73) | def __init__(self,config, word_filter_sizes, word_filter_nums, word_vo...
method forward (line 112) | def forward(self, article, word_seg):
FILE: NLP/daguan/optims.py
class Optim (line 6) | class Optim(object):
method set_parameters (line 8) | def set_parameters(self, params):
method __init__ (line 23) | def __init__(self, method, lr, max_grad_norm, lr_decay=1, start_decay_...
method step (line 32) | def step(self):
method updateLearningRate (line 39) | def updateLearningRate(self, ppl, epoch):
FILE: NLP/daguan/predict.py
function eval (line 67) | def eval(cnn_model):
Condensed preview — 75 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,907K chars).
[
{
"path": "Financial_NLP/final_demo/README.md",
"chars": 1212,
"preview": "\n#### 1.项目结构\n##### 1.1代码文件说明\n>**./final_demo/util.py**:1.管理整个项目的文件存放路径 \n2.存储和读取各种方法抽取的特征,并组合成DataFrame输出。\n**./final_demo"
},
{
"path": "Financial_NLP/final_demo/__init__.py",
"chars": 121,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/6/15 下午8:52 \n# @Author : ComeOnJian \n# @File : __init__.py.py \n"
},
{
"path": "Financial_NLP/final_demo/data_prepare.py",
"chars": 15205,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/6/15 下午10:50 \n# @Author : ComeOnJian \n# @File : data_prepare.py"
},
{
"path": "Financial_NLP/final_demo/extract_feature.py",
"chars": 35475,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/6/15 下午8:12 \n# @Author : ComeOnJian \n# @File : extract_feature."
},
{
"path": "Financial_NLP/final_demo/main.py",
"chars": 6668,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/6/15 下午10:49 \n# @Author : ComeOnJian \n# @File : main.py \n\nfrom "
},
{
"path": "Financial_NLP/final_demo/train_model.py",
"chars": 27237,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/6/16 上午9:34\n# @Author : ComeOnJian\n# @File : train_model.py\n\nfr"
},
{
"path": "Financial_NLP/final_demo/util.py",
"chars": 6194,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/6/15 下午8:54 \n# @Author : ComeOnJian \n# @File : project.py \n# 参考"
},
{
"path": "ML/DecisionTree/Boosting.py",
"chars": 6725,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/4/12 下午5:27 \n# @Author : ComeOnJian \n# @File : Boosting.py\n\nfro"
},
{
"path": "ML/DecisionTree/RandomForest.py",
"chars": 2171,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/4/5 上午10:55 \n# @Author : ComeOnJian \n# @File : RandomForst.py \n"
},
{
"path": "ML/DecisionTree/decision_tree.py",
"chars": 16808,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/3/23 下午3:43 \n# @Author : ComeOnJian \n# @File : decision_tree.py"
},
{
"path": "ML/DecisionTree/titanic_data_analy.ipynb",
"chars": 10458,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"code\",\n \"execution_count\": 1,\n \"metadata\": {\n \"collapsed\": false\n },\n \"out"
},
{
"path": "ML/DecisionTree/tree_main.py",
"chars": 1153,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/3/23 下午8:47 \n# @Author : ComeOnJian \n# @File : tree_main.py\n\nfr"
},
{
"path": "ML/DecisionTree/xgboost_demo.py",
"chars": 11971,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/4/28 下午7:24 \n# @Author : ComeOnJian \n# @File : xgboost.py \n\nimp"
},
{
"path": "ML/LogisticRegression_MEM/LR_MEM_demo.py",
"chars": 13649,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/5/16 上午11:44 \n# @Author : ComeOnJian \n# @File : LR_MEM_demo.py "
},
{
"path": "ML/Perce_SVM/SVM.py",
"chars": 12830,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/5/10 下午5:14 \n# @Author : ComeOnJian \n# @File : SVM.py\n\n# 参考 SVM"
},
{
"path": "ML/Perce_SVM/perceptron.py",
"chars": 8511,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/5/6 下午3:55 \n# @Author : ComeOnJian \n# @File : perceptron.py \n\ni"
},
{
"path": "ML/REDAME.md",
"chars": 7,
"preview": "ML文件夹说明"
},
{
"path": "ML/TensorDemo/NN_tf.py",
"chars": 8192,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/2/28 下午4:22 \n# @Author : ComeOnJian \n# @File : NN_tf.py \n\nimpor"
},
{
"path": "ML/TensorDemo/README.md",
"chars": 127,
"preview": "#### TensorDemo文件说明\n##### [TensorFlow实现多层感知机及可视化训练过程中的数据记录](http://blog.csdn.net/u014732537/article/details/79412672) NN"
},
{
"path": "ML/data/adult/adult_deal_value.data",
"chars": 849559,
"preview": "1,5,0,3,2,8,3,0,1,1,1,0,1,0\n1,1,0,3,0,4,2,0,1,1,1,2,1,0\n1,0,3,2,1,6,3,0,1,1,1,0,1,0\n1,0,2,1,0,6,2,4,1,1,1,0,1,0\n0,0,0,3,"
},
{
"path": "ML/data/adult/adult_deal_value.test",
"chars": 424428,
"preview": "0,0,2,1,2,7,1,4,1,1,1,0,1,0\n1,0,3,2,0,9,2,0,1,1,1,1,1,0\n0,4,5,2,0,12,2,0,1,1,1,0,1,1\n1,0,1,2,0,7,2,4,1,1,1,0,1,1\n0,0,12,"
},
{
"path": "NLP/AutoTitle_F/configs/make_vocab.yaml",
"chars": 326,
"preview": "makevocab: True\nmax_lines: 0\ntrain_path: '../data/preprocessed/train_set_last1_all.csv'\nval_path: '../data/preprocessed/"
},
{
"path": "NLP/AutoTitle_F/configs/predict.yaml",
"chars": 734,
"preview": "seed: 1445\ncheckpoint_restore: './data/logs/best_rouge_l_checkpoint.pt'\nvalidation_test_path: './data/results/test.data'"
},
{
"path": "NLP/AutoTitle_F/configs/process.yaml",
"chars": 680,
"preview": "test: False\naug: False\nmax_enc_steps: 200\nmax_dec_steps: 20\npointer_gen: True\ncoverage: False\ntrain_path: '../data/prepr"
},
{
"path": "NLP/AutoTitle_F/configs/train_model.yaml",
"chars": 1149,
"preview": "seed: 1314\ngpus: [0]\ncheckpoint_restore: './data/logs/2018-12-06-01-53/95000.checkpoint.pt'\nlog: './data/logs/'\nvocab_pa"
},
{
"path": "NLP/AutoTitle_F/data/__init__.py",
"chars": 172,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/27 上午11:09\n# @Author : ComeOnJian \n# @File : __init__.py.py "
},
{
"path": "NLP/AutoTitle_F/data/batcher.py",
"chars": 13805,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/24 下午4:52\n# @Author : ComeOnJian\n# @File : batcher.py\n\nimpor"
},
{
"path": "NLP/AutoTitle_F/data/data.py",
"chars": 6485,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/24 下午4:30 \n# @Author : ComeOnJian \n# @File : data.py\n\nfrom g"
},
{
"path": "NLP/AutoTitle_F/data/data_processed.py",
"chars": 9422,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/24 下午4:51 \n# @Author : ComeOnJian \n# @File : data_processed."
},
{
"path": "NLP/AutoTitle_F/models/__init__.py",
"chars": 243,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/24 上午11:15 \n# @Author : ComeOnJian \n# @File : __init__.py.py"
},
{
"path": "NLP/AutoTitle_F/models/adaptive.py",
"chars": 10546,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/23 下午3:26 \n# @Author : ComeOnJian \n# @File : adaptive.py \n\ni"
},
{
"path": "NLP/AutoTitle_F/models/loss.py",
"chars": 3779,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/17 下午5:10 \n# @Author : ComeOnJian \n# @File : loss.py \nimport"
},
{
"path": "NLP/AutoTitle_F/models/lr_scheduler.py",
"chars": 14056,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/18 上午9:55 \n# @Author : ComeOnJian \n# @File : lr_scheduler.py"
},
{
"path": "NLP/AutoTitle_F/models/optims.py",
"chars": 5694,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/18 上午9:51 \n# @Author : ComeOnJian \n# @File : optims.py \nimpo"
},
{
"path": "NLP/AutoTitle_F/models/seq2seq.py",
"chars": 19867,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/17 上午10:50 \n# @Author : ComeOnJian \n# @File : seq2seq.py \n\ni"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/README.md",
"chars": 362,
"preview": "# coco-caption\n\nOriginal README can be found at [tylin/coco-caption](https://github.com/tylin/coco-caption/blob/3f0fe9b8"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/__init__.py",
"chars": 21,
"preview": "__author__ = 'tylin'\n"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/bleu/LICENSE",
"chars": 1105,
"preview": "Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam\n\nPermission is hereby granted, free of "
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/bleu/__init__.py",
"chars": 32,
"preview": "__author__ = 'tylin'\n# from bleu"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/bleu/bleu.py",
"chars": 1150,
"preview": "#!/usr/bin/env python\n# \n# File Name : bleu.py\n#\n# Description : Wrapper for BLEU scorer.\n#\n# Creation Date : 06-01-2015"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/bleu/bleu_scorer.py",
"chars": 8713,
"preview": "#!/usr/bin/env python\n\n# bleu_scorer.py\n# David Chiang <chiang@isi.edu>\n\n# Copyright (c) 2004-2006 University of Marylan"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/cider/__init__.py",
"chars": 21,
"preview": "__author__ = 'tylin'\n"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/cider/cider.py",
"chars": 1731,
"preview": "# Filename: cider.py\n#\n# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evalua"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/cider/cider_scorer.py",
"chars": 7740,
"preview": "#!/usr/bin/env python\n# Tsung-Yi Lin <tl483@cornell.edu>\n# Ramakrishna Vedantam <vrama91@vt.edu>\n\nimport copy\nimport mat"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/license.txt",
"chars": 1561,
"preview": "Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam\nAll rights reserved.\n\nRedistribution a"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/meteor/__init__.py",
"chars": 21,
"preview": "__author__ = 'tylin'\n"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/meteor/meteor.py",
"chars": 3894,
"preview": "#!/usr/bin/env python\n\n# Python wrapper for METEOR implementation, by Xinlei Chen\n# Acknowledge Michael Denkowski for th"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/meteor/tests/test_meteor.py",
"chars": 431,
"preview": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nimport unittest\n\nfrom nlgeval.pycocoevalcap.meteor.mete"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/rouge/__init__.py",
"chars": 23,
"preview": "__author__ = 'vrama91'\n"
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/rouge/rouge.py",
"chars": 3181,
"preview": "#!/usr/bin/env python\n# \n# File Name : rouge.py\n#\n# Description : Computes ROUGE-L metric as described by Lin and Hovey "
},
{
"path": "NLP/AutoTitle_F/pycocoevalcap/test_eval.py",
"chars": 1251,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/18 下午3:30 \n# @Author : ComeOnJian \n# @File : test_eval.py \ni"
},
{
"path": "NLP/AutoTitle_F/submit.py",
"chars": 3537,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/29 下午12:55 \n# @Author : ComeOnJian \n# @File : submit.py\n\nimp"
},
{
"path": "NLP/AutoTitle_F/train.py",
"chars": 9122,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/25 下午3:45 \n# @Author : ComeOnJian \n# @File : train.py \n\nimpo"
},
{
"path": "NLP/GAN&NLP.md",
"chars": 21435,
"preview": "[转载来自EternalFeather大佬]\n细数生成对抗网络和自然语言处理的那些恩怨情仇\n===\n# 文本生成的基础模型(Introduction)\n在众多NLP的task中,**文本生成(Text Generation)** 是一种结合"
},
{
"path": "NLP/Multi_Label/ShengCe/generate_submit.py",
"chars": 7326,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/6 上午10:41 \n# @Author : ComeOnJian \n# @File : generate_submit"
},
{
"path": "NLP/Multi_Label/ShengCe/train_model.py",
"chars": 14386,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/6 上午10:42 \n# @Author : ComeOnJian \n# @File : train_model.py "
},
{
"path": "NLP/Multi_Label/ShengCe/util.py",
"chars": 1501,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/10/6 上午10:47 \n# @Author : ComeOnJian \n# @File : util.py \nimport"
},
{
"path": "NLP/Seq2Seq/__init__.py",
"chars": 121,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/7/19 下午9:20 \n# @Author : ComeOnJian \n# @File : __init__.py.py \n"
},
{
"path": "NLP/Seq2Seq/data_util.py",
"chars": 11877,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/7/20 上午9:02 \n# @Author : ComeOnJian \n# @File : data_util.py\n# 参"
},
{
"path": "NLP/Seq2Seq/main.py",
"chars": 2066,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/7/24 上午10:25 \n# @Author : ComeOnJian \n# @File : main.py \n# 参考: "
},
{
"path": "NLP/Seq2Seq/seq2seq_attn.py",
"chars": 56183,
"preview": "# Copyright 2015 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "NLP/Seq2Seq/seq2seq_model.py",
"chars": 14433,
"preview": "# Copyright 2015 The TensorFlow Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "NLP/Seq2Seq/text_summarizer.py",
"chars": 9451,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/7/20 下午4:26 \n# @Author : ComeOnJian \n# @File : seq2seq_att_mode"
},
{
"path": "NLP/Text_CNN/process_data.py",
"chars": 9248,
"preview": "#!/usr/bin/python\n# coding=utf-8\n# @Time : 2018/3/8 下午3:02\n# @Author : ComeOnJian\n# @File : process_data.py\n\nimport pick"
},
{
"path": "NLP/Text_CNN/text_cnn_main.py",
"chars": 4043,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/3/7 下午4:06 \n# @Author : ComeOnJian\n# @File : text_cnn.py\n# impl"
},
{
"path": "NLP/Text_CNN/text_cnn_model.py",
"chars": 9743,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/3/9 下午4:35 \n# @Author : ComeOnJian \n# @File : text_cnn_model.py"
},
{
"path": "NLP/daguan/README.md",
"chars": 51,
"preview": "#### pytorch TextCNN的实现\n参考模型:Char_CNN and Word_CNN\n"
},
{
"path": "NLP/daguan/data_analy.py",
"chars": 11546,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/8/28 下午4:03 \n# @Author : ComeOnJian \n# @File : data_analy.py\n\ni"
},
{
"path": "NLP/daguan/lr_scheduler.py",
"chars": 14262,
"preview": "import math\r\nfrom bisect import bisect_right\r\nfrom torch.optim.optimizer import Optimizer\r\n\r\n\r\nclass _LRScheduler(object"
},
{
"path": "NLP/daguan/main.py",
"chars": 9015,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/8/29 下午4:29 \n# @Author : ComeOnJian \n# @File : main.py \nimport "
},
{
"path": "NLP/daguan/model.py",
"chars": 6424,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/8/29 上午11:06 \n# @Author : ComeOnJian \n# @File : model.py \n\nimpo"
},
{
"path": "NLP/daguan/optims.py",
"chars": 1906,
"preview": "import math\nimport torch.optim as optim\nimport torch.nn as nn\nfrom torch.nn.utils import clip_grad_norm\n\nclass Optim(obj"
},
{
"path": "NLP/daguan/predict.py",
"chars": 3349,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n# @Time : 2018/9/1 上午12:16 \n# @Author : ComeOnJian \n# @File : predict.py \nimpo"
},
{
"path": "README.md",
"chars": 4124,
"preview": "HEXO个人博客地址:[小简铺子](https://jianwenjun.xyz)\n\n### 机器学习练手代码\n描述:主要包括机器学习的基础算法的实现、相关竞赛代码,论文和项目复现代码。\n\n### 1ML\n#### 1.1决策树相关算法\n["
}
]
// ... and 1 more file (download for full content)
About this extraction
This page contains the full source code of the JianWenJun/MLDemo GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 75 files (1.7 MB), approximately 1.4M tokens, and a symbol index with 539 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.