Repository: chizhu/yiguan_sex_age_predict_1st_solution
Branch: master
Commit: 6ffeda04746e
Files: 71
Total size: 678.0 KB
Directory structure:
gitextract_j9hppgj3/
├── 2018易观A10大数据应用峰会-RNG_终极版.pptx
├── README.md
├── THLUO/
│   ├── 1.w2c_model_start.py
│   ├── 10.age_bin_prob_oof.py
│   ├── 11.hcc_device_brand_age_sex.py
│   ├── 12.device_age_regression_prob_oof.py
│   ├── 13.device_start_GRU_pred.py
│   ├── 14.device_start_GRU_pred_age.py
│   ├── 15.device_all_GRU_pred.py
│   ├── 16.device_start_capsule_pred.py
│   ├── 17.device_start_textcnn_pred.py
│   ├── 18.device_start_text_dpcnn_pred.py
│   ├── 19.device_start_lstm_pred.py
│   ├── 2.w2c_model_close.py
│   ├── 20.lgb_sex_age_prob_oof.py
│   ├── 21.tfidf_lr_sex_age_prob_oof.py
│   ├── 22.base_feat.py
│   ├── 23.ATT_v6.py
│   ├── 24.thluo_22_lgb.py
│   ├── 25.thluo_22_xgb.py
│   ├── 26.thluo_nb_lgb.py
│   ├── 27.thluo_nb_xgb.py
│   ├── 28.final.py
│   ├── 3.device_quchong_start_app_w2c.py
│   ├── 3.w2c_all_emb.py
│   ├── 3.w2c_model_all.py
│   ├── 4.device_age_prob_oof.py
│   ├── 5.device_sex_prob_oof.py
│   ├── 6.start_close_age_prob_oof.py
│   ├── 7.start_close_sex_prob_oof.py
│   ├── 9.sex_age_bin_prob_oof.py
│   ├── TextModel.py
│   ├── readme.md
│   ├── util.py
│   └── 代码运行.bat
├── chizhu/
│   ├── readme.txt
│   ├── single_model/
│   │   ├── cnn.py
│   │   ├── config.py
│   │   ├── deepnn.py
│   │   ├── get_nn_feat.py
│   │   ├── lgb.py
│   │   ├── user_behavior.py
│   │   ├── xgb.py
│   │   ├── xgb_nb.py
│   │   └── yg_best_nn.py
│   ├── stacking/
│   │   ├── all_feat/
│   │   │   └── xgb__nurbs_nb.ipynb
│   │   └── nurbs_feat/
│   │       ├── xgb_22.py
│   │       └── xgb__nurbs_nb.py
│   └── util/
│       ├── bagging.py
│       └── get_nn_res.py
├── linwangli/
│   ├── code/
│   │   ├── lgb_allfeat_22.py
│   │   ├── lgb_allfeat_condProb.py
│   │   └── utils.py
│   ├── readme.txt
│   ├── yg-1st-lgb.py
│   └── 融合思路.pptx
├── nb_cz_lwl_wcm/
│   ├── 10_lgb.py
│   ├── 11_cnn.py
│   ├── 12_get_feature_lwl.py
│   ├── 13_last_get_all_feature.py
│   ├── 1_get_age_reg.py
│   ├── 2_get_feature_brand.py
│   ├── 3_get_feature_device_package.py
│   ├── 4_get_feature_device_start_close_tfidf_1_2.py
│   ├── 5_get_feature_device_start_close_tfidf.py
│   ├── 6_get_feature_device_start_close.py
│   ├── 7_get_feature_w2v.py
│   ├── 8_get_feature_lwl.py
│   ├── 9_yg_best_nn.py
│   └── 运行说明.txt
└── wangcanming/
    └── deepnet_v33.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
# yiguan_sex_age_predict_1st_solution
First-place solution for the Yiguan (易观/Analysys) gender and age prediction competition
##### [比赛链接](https://www.tinymind.cn/competitions/43)
--------
Team members worked individually and merged their work afterwards, so some feature files overlap across the team. The core approach is stacking different models: the engineered features are very high-dimensional, and stacking the outputs of diverse models reduces that dimensionality without losing too much information.
Run the code in the following order:
* 1. Generate the feature files
> Following the run instructions in the nb_cz_lwl_wcm folder, run all the scripts under nb_cz_lwl_wcm to produce the feature file feature_one.csv
> Following the run instructions in the thluo folder, run the scripts under thluo to generate thluo_train_best_feat.csv
* 2. Weight and blend the models
Note: the model outputs are stored in the linwangli folder
> Running all the scripts under the thluo folder also generates thluo_prob
> The models under linwangli/code, together with the feature files obtained above, produce the corresponding probability files; see 融合思路.pptx (the fusion-approach slides) in the linwangli folder for the probability weighting scheme
CONTRIBUTORS:[THLUO](https://github.com/THLUO) [WangliLin](https://github.com/WangliLin) [Puck Wang](https://github.com/PuckWong) [chizhu](https://github.com/chizhu) [NURBS](https://github.com/suncostanx)
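--------
A minimal sketch of the weighted probability blending described above (the file names and weights here are placeholders for illustration; the actual weights are documented in 融合思路.pptx):
```python
import pandas as pd

# Hypothetical inputs: each CSV holds one model's 22-class probabilities,
# one row per device_id (names are placeholders, not the repo's exact outputs).
files = ["thluo_prob.csv", "lgb_prob.csv", "nn_prob.csv"]
weights = [0.4, 0.3, 0.3]  # placeholder weights

blend = None
for f, w in zip(files, weights):
    df = pd.read_csv(f).set_index("device_id").sort_index()
    blend = w * df if blend is None else blend + w * df

# renormalize so every row is again a valid probability distribution
blend = blend.div(blend.sum(axis=1), axis=0)
blend.to_csv("blended_prob.csv")
```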
================================================
FILE: THLUO/1.w2c_model_start.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
# In[2]:
path='input/'
data=pd.DataFrame()
print ('1.w2c_model_start.py')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# encode brands as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# encode app types as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
df_sorted = deviceid_package_start_close.sort_values(by='start_time')
# In[20]:
df_results = df_sorted.groupby('device_id')['app_id'].apply(lambda x:' '.join(x)).reset_index().rename(columns = {'app_id' : 'app_list'})
df_results.to_csv('01.device_click_app_sorted_by_start.csv', index=None)
del df_results
# In[5]:
df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'})
# In[7]:
app_list = list(df_device_start_app_list.app_list.values)
# In[9]:
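# Train Word2Vec over each device's time-ordered app sequence: apps opened in
# similar contexts end up with similar 10-dimensional embeddings.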
model = Word2Vec(app_list, size=10, window=10, min_count=2, workers=4)
model.save("word2vec.model")
# In[10]:
vocab = list(model.wv.vocab.keys())
w2c_arr = []
for v in vocab :
    w2c_arr.append(list(model.wv[v]))
# In[11]:
df_w2c_start = pd.DataFrame()
df_w2c_start['app_id'] = vocab
df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1)
df_w2c_start.columns = ['app_id'] + ['w2c_start_app_' + str(i) for i in range(10)]
# In[13]:
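# Collapse the per-app embeddings into fixed-width per-device features by
# aggregating each of the 10 w2c dimensions with mean/std/max/min.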
w2c_nums = 10
agg = {}
for l in ['w2c_start_app_' + str(i) for i in range(w2c_nums)] :
    agg[l] = ['mean', 'std', 'max', 'min']
# In[14]:
deviceid_package_start_close = deviceid_package_start_close.merge(df_w2c_start, on='app_id', how='left')
# In[15]:
df_agg = deviceid_package_start_close.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
df_agg.to_csv('device_start_app_w2c.csv', index=None)
# In[16]:
df_results = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_time'].mean().reset_index()
df_results = df_results.merge(df_w2c_start, on='app_id', how='left')
# In[18]:
df_agg = df_results.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_app_unique_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
# In[24]:
df_agg.to_csv('device_app_unique_start_app_w2c.csv', index=None)
print ('success.....')
================================================
FILE: THLUO/10.age_bin_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('10.age_bin_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# encode brands as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# encode app types as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# convert a millisecond epoch timestamp into a formatted local-time string
def timeStamp(timeNum):
    timeStamp = float(timeNum/1000)
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime
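# Note: an equivalent vectorized conversion (in UTC rather than local time) is
#   pd.to_datetime(deviceid_package_start_close['start_time'], unit='ms')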
# parse out concrete date/time fields
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
    df_weight['sum'] += df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
# In[10]:
lda = LatentDirichletAllocation(n_components=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)
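# docres is an (n_devices x 5) matrix of per-device topic proportions; its
# 5 columns are concatenated below as dense "app taste" features.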
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
# explode each device's app list into (device_id, app_id) pairs
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
    device_id = row[0]
    app_list = row[1]
    for app in app_list :
        device_id_arr.append(device_id)
        app_arr.append(app)
# build the pair frame
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# Feature engineering
def open_app_timegap_in_hour() :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
    df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
    df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
    df_mean_temp.fillna(0, inplace=True)
    return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
    # gaps between a device's consecutive app opens and closes
    df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
    df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
    df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
    agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
    df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
    df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
    df_start_gap_agg = df_start_gap_agg.reset_index()
    #del df_
    gc.collect()
    # gaps between consecutive closes
    df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
    df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
    df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
    agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
    df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
    df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
    df_end_gap_agg = df_end_gap_agg.reset_index()
    #del df_
    gc.collect()
    df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
    #df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
    #df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
    return df_agg
def open_app_counts_in_hour() :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
    df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
    df_temp.fillna(0, inplace=True)
    return df_temp
def close_app_counts_in_hour() :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
    df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
    df_temp.fillna(0, inplace=True)
    return df_temp
def app_type_mean_time_gap_one_hot () :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
    df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
    df_temp.fillna(-1, inplace=True)
    return df_temp
def device_active_hour() :
    aggregations = {
        'start_hour' : ['std','mean','max','min'],
        'end_hour' : ['std','mean','max','min']
    }
    df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    return df_agg
def device_brand_encoding() :
    df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
    aggregations = {
        'age' : ['std','mean'],
        'sex' : ['mean'],
    }
    df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
    df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
    df_device_brand = df_device_brand.reset_index()
    df_device_type = df_temp.groupby('device_type').agg(aggregations)
    df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
    df_device_type = df_device_type.reset_index()
    df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
    df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
    aggregations = {
        'device_brand_age_STD' : ['mean'],
        'device_brand_age_MEAN' : ['mean'],
        'device_brand_sex_MEAN' : ['mean'],
        #'device_type_age_STD' : ['mean'],
        #'device_type_age_MEAN' : ['mean'],
        #'device_type_sex_MEAN' : ['mean']
    }
    df_agg = df_temp.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    return df_agg
# statistics of how each device runs its apps
def device_active_time_time_stat() :
    # per-record active time of an app on a device
    deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
    # how many times the device opened apps
    # how many distinct apps the device opened
    aggregations = {
        'app_id' : ['count', 'nunique'],
        'active_time' : ['mean', 'std', 'max', 'min'],
    }
    df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    aggregations = {
        'active_time' : ['mean', 'std', 'max', 'min', 'count'],
    }
    df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
    df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
    df_da_agg = df_da_agg.reset_index()
    # average time the device keeps apps open
    aggregations = {
        'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
    }
    df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
    df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
    df_temp = df_temp.reset_index()
    df_agg = df_agg.merge(df_temp, on='device_id', how='left')
    return df_agg
def app_type_encoding() :
    df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
    aggregations = {
        'age' : ['std','mean'],
        'sex' : ['mean'],
    }
    df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
    df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
    df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
    df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
    df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
    df_agg_app_child_type = df_agg_app_child_type.reset_index()
    df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
    df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
    aggregations = {
        'app_parent_type_age_STD' : ['mean'],
        'app_parent_type_age_MEAN' : ['mean'],
        'app_parent_type_sex_MEAN' : ['mean'],
        'app_child_type_age_STD' : ['mean'],
        'app_child_type_age_MEAN' : ['mean'],
        'app_child_type_sex_MEAN' : ['mean']
    }
    df_agg = df_temp.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    return df_agg
# counts of each app_parent_type per device
def app_type_onehot_in_device(df) :
    df_copy = df.fillna(-1)
    df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
    df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
    df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
    df_temp.fillna(0, inplace=True)
    return df_temp
# In[15]:
# assemble the feature matrix
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[16]:
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
# In[17]:
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
# In[19]:
df_train_w2v['sex'] = df_train_w2v['sex'].apply(lambda x:str(x))
df_train_w2v['age'] = df_train_w2v['age'].apply(lambda x:str(x))
def tool(x):
    if x=='nan':
        return x
    else:
        return str(int(float(x)))
df_train_w2v['sex']=df_train_w2v['sex'].apply(tool)
df_train_w2v['age']=df_train_w2v['age'].apply(tool)
df_train_w2v['sex_age']=df_train_w2v['sex']+'-'+df_train_w2v['age']
df_train_w2v = df_train_w2v.replace({'nan':np.NaN,'nan-nan':np.NaN})
# In[31]:
train = df_train_w2v[df_train_w2v['sex_age'].notnull()]
test = df_train_w2v[df_train_w2v['sex_age'].isnull()]
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# In[32]:
Y = train['age']
train['label'] = Y
# In[35]:
from sklearn.model_selection import KFold, StratifiedKFold
label_set = train.label.unique()
lgb_round = {'3': 363,
'5': 273,
'4': 328,
'7': 228,
'6': 361,
'9': 181,
'10': 338,
'2': 312,
'8': 234,
'1': 220,
'0': 200}
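# One-vs-rest scheme: for each of the 11 age labels, train a binary LightGBM
# with a per-label tuned round count (lgb_round) and keep the 5-fold
# out-of-fold probability as a stacking feature.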
for sex_age in label_set :  # note: the loop variable is named sex_age but iterates over the age labels
    print (sex_age)
    X = train.drop(['sex', 'age', 'sex_age', 'label', 'device_id'],axis=1)
    Y = train.label.apply(lambda x : 1 if x == sex_age else 0)
    print (Y.value_counts())
    seed = 2018
    num_folds = 5
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
    sub_list = []
    oof_preds = np.zeros(train.shape[0])
    sub_preds = np.zeros(test.shape[0])
    params = {
        'boosting_type': 'gbdt',
        'learning_rate' : 0.02,
        #'max_depth':5,
        'num_leaves' : 2 ** 5,
        'metric': {'binary_logloss'},
        #'num_class' : 22,
        'objective' : 'binary',
        'random_state' : 6666,
        'bagging_freq' : 5,
        'feature_fraction' : 0.7,
        'bagging_fraction' : 0.7,
        'min_split_gain' : 0.0970905919552776,
        'min_child_weight' : 9.42012323936088,
    }
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
        train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
        valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
        lgb_train=lgb.Dataset(train_x,label=train_y)
        lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
        gbm = lgb.train(params, lgb_train, num_boost_round=lgb_round[sex_age], valid_sets=[lgb_train, lgb_eval], verbose_eval=50)
        oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
    train['age_bin_prob_oof_' + str(sex_age)] = oof_preds
    # retrain on the full training set to predict test
    lgb_train = lgb.Dataset(X,label=Y)
    gbm = lgb.train(params, lgb_train, num_boost_round=lgb_round[sex_age], valid_sets=lgb_train, verbose_eval=50)
    test['age_bin_prob_oof_' + str(sex_age)] = gbm.predict(test[X.columns.values])
# In[36]:
columns = ['device_id'] + ['age_bin_prob_oof_' + str(i) for i in range(11)]
# In[38]:
pd.concat([train[columns], test[columns]]).to_csv('age_bin_prob_oof.csv', index=None)
================================================
FILE: THLUO/11.hcc_device_brand_age_sex.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.model_selection import StratifiedKFold
# In[2]:
print ('11.hcc_device_brand_age_sex.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
#deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
# encode brands as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# encode app types as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
df_train = deviceid_train.merge(deviceid_brand, how='left', on='device_id')
df_train.fillna(-1, inplace=True)
df_test = deviceid_test.merge(deviceid_brand, how='left', on='device_id')
df_test.fillna(-1, inplace=True)
# In[5]:
df_train['sex'] = df_train.sex.apply(lambda x : 1 if x == 1 else 0)
df_train = df_train.join(pd.get_dummies(df_train["age"], prefix="age").astype(int))
df_train['sex_age'] = df_train['sex'].map(str) + '_' + df_train['age'].map(str)
Y = df_train['sex_age']
Y_CAT = pd.Categorical(Y)
df_train['sex_age'] = pd.Series(Y_CAT.codes)
df_train = df_train.join(pd.get_dummies(df_train["sex_age"], prefix="sex_age").astype(int))
# In[6]:
sex_age_columns = ['sex_age_' + str(i) for i in range(22)]
sex_age_prior_set = df_train[sex_age_columns].mean().values
age_columns = ['age_' + str(i) for i in range(11)]
age_prior_set = df_train[age_columns].mean().values
sex_prior_prob= df_train.sex.mean()
sex_prior_prob
# In[7]:
def hcc_encode(train_df, test_df, variable, target, prior_prob, k=5, f=1, g=1, update_df=None):
    """
    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca
    """
    hcc_name = "_".join(["hcc", variable, target])
    grouped = train_df.groupby(variable)[target].agg(size="size", mean="mean")
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob
    df = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    if update_df is None: update_df = test_df
    if hcc_name not in update_df.columns: update_df[hcc_name] = np.nan
    update_df.update(df)
    return
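# Smoothing used above (Micci-Barreca): with n the category size,
#   lambda(n) = 1 / (g + exp((k - n) / f))
#   encoding  = lambda(n) * category_mean + (1 - lambda(n)) * prior_prob
# so rare categories shrink toward the global prior and frequent ones toward their own mean.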
# In[8]:
# encode the age targets
# fit the test set
# High-Cardinality Categorical encoding
skf = StratifiedKFold(5)
nums = 11
for variable in ['device_brand', 'device_type'] :
    for i in range(nums) :
        target = age_columns[i]
        age_prior_prob = age_prior_set[i]
        print (variable, target, age_prior_prob)
        hcc_encode(df_train, df_test, variable, target, age_prior_prob, k=5, f=1, g=1, update_df=None)
        # fit the out-of-fold train encodings
        for train, test in skf.split(np.zeros(len(df_train)), df_train['age']):
            hcc_encode(df_train.iloc[train], df_train.iloc[test], variable, target, age_prior_prob, k=5, update_df=df_train)
# In[9]:
# encode the sex target
# fit the test set
# High-Cardinality Categorical encoding
skf = StratifiedKFold(5)
for variable in ['device_brand', 'device_type'] :
    target = 'sex'
    print (variable, target, sex_prior_prob)
    hcc_encode(df_train, df_test, variable, target, sex_prior_prob, k=5, f=1, g=1, update_df=None)
    # fit the out-of-fold train encodings
    for train, test in skf.split(np.zeros(len(df_train)), df_train['age']):
        hcc_encode(df_train.iloc[train], df_train.iloc[test], variable, target, sex_prior_prob, k=5, f=1, g=1, update_df=df_train)
# In[10]:
# encode the joint sex_age targets
# fit the test set
# High-Cardinality Categorical encoding
skf = StratifiedKFold(5)
nums = 22
for variable in ['device_brand', 'device_type'] :
    for i in range(nums) :
        target = sex_age_columns[i]
        sex_age_prior_prob = sex_age_prior_set[i]
        print (variable, target, sex_age_prior_prob)
        hcc_encode(df_train, df_test, variable, target, sex_age_prior_prob, k=5, f=1, g=1, update_df=None)
        # fit the out-of-fold train encodings
        for train, test in skf.split(np.zeros(len(df_train)), df_train['sex_age']):
            hcc_encode(df_train.iloc[train], df_train.iloc[test], variable, target, sex_age_prior_prob, k=5, update_df=df_train)
# In[14]:
hcc_columns = ['device_id'] + ['hcc_device_brand_age_' + str(i) for i in range(11)] + ['hcc_device_brand_sex'] + ['hcc_device_type_age_' + str(i) for i in range(11)] + ['hcc_device_type_sex'] + ['hcc_device_type_sex_age_' + str(i) for i in range(22)]
df_total = pd.concat([df_train[hcc_columns], df_test[hcc_columns]])
# In[15]:
df_total.to_csv('hcc_device_brand_age_sex.csv', index=None)
================================================
FILE: THLUO/12.device_age_regression_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
from feat_util import *
# In[2]:
print ('12.device_age_regression_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# encode brands as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# encode app types as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# convert a millisecond epoch timestamp into a formatted local-time string
def timeStamp(timeNum):
    timeStamp = float(timeNum/1000)
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime
# parse out concrete date/time fields
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# Feature engineering
def open_app_timegap_in_hour() :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
    df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
    df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
    df_mean_temp.fillna(0, inplace=True)
    return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
    # gaps between a device's consecutive app opens and closes
    df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
    df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
    df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
    agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
    df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
    df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
    df_start_gap_agg = df_start_gap_agg.reset_index()
    #del df_
    gc.collect()
    # gaps between consecutive closes
    df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
    df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
    df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
    agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
    df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
    df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
    df_end_gap_agg = df_end_gap_agg.reset_index()
    #del df_
    gc.collect()
    df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
    #df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
    #df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
    return df_agg
def open_app_counts_in_hour() :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
    df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
    df_temp.fillna(0, inplace=True)
    return df_temp
def close_app_counts_in_hour() :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
    df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
    df_temp.fillna(0, inplace=True)
    return df_temp
def app_type_mean_time_gap_one_hot () :
    df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
    df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
    df_temp.fillna(-1, inplace=True)
    return df_temp
def device_active_hour() :
    aggregations = {
        'start_hour' : ['std','mean','max','min'],
        'end_hour' : ['std','mean','max','min']
    }
    df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    return df_agg
def device_brand_encoding() :
    df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
    aggregations = {
        'age' : ['std','mean'],
        'sex' : ['mean'],
    }
    df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
    df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
    df_device_brand = df_device_brand.reset_index()
    df_device_type = df_temp.groupby('device_type').agg(aggregations)
    df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
    df_device_type = df_device_type.reset_index()
    df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
    df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
    aggregations = {
        'device_brand_age_STD' : ['mean'],
        'device_brand_age_MEAN' : ['mean'],
        'device_brand_sex_MEAN' : ['mean'],
        #'device_type_age_STD' : ['mean'],
        #'device_type_age_MEAN' : ['mean'],
        #'device_type_sex_MEAN' : ['mean']
    }
    df_agg = df_temp.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    return df_agg
# statistics of how each device runs its apps
def device_active_time_time_stat() :
    # per-record active time of an app on a device
    deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
    # how many times the device opened apps
    # how many distinct apps the device opened
    aggregations = {
        'app_id' : ['count', 'nunique'],
        'active_time' : ['mean', 'std', 'max', 'min'],
    }
    df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    aggregations = {
        'active_time' : ['mean', 'std', 'max', 'min', 'count'],
    }
    df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
    df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
    df_da_agg = df_da_agg.reset_index()
    # average time the device keeps apps open
    aggregations = {
        'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
        'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
    }
    df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
    df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
    df_temp = df_temp.reset_index()
    df_agg = df_agg.merge(df_temp, on='device_id', how='left')
    return df_agg
def app_type_encoding() :
    df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
    aggregations = {
        'age' : ['std','mean'],
        'sex' : ['mean'],
    }
    df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
    df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
    df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
    df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
    df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
    df_agg_app_child_type = df_agg_app_child_type.reset_index()
    df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
    df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
    aggregations = {
        'app_parent_type_age_STD' : ['mean'],
        'app_parent_type_age_MEAN' : ['mean'],
        'app_parent_type_sex_MEAN' : ['mean'],
        'app_child_type_age_STD' : ['mean'],
        'app_child_type_age_MEAN' : ['mean'],
        'app_child_type_sex_MEAN' : ['mean']
    }
    df_agg = df_temp.groupby('device_id').agg(aggregations)
    df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index()
    return df_agg
# counts of each app_parent_type per device
def app_type_onehot_in_device(df) :
    df_copy = df.fillna(-1)
    df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
    df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
    df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
    df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
    df_temp.fillna(0, inplace=True)
    return df_temp
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
    df_weight['sum'] += df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
# In[10]:
lda = LatentDirichletAllocation(n_components=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
# explode each device's app list into (device_id, app_id) pairs
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
    device_id = row[0]
    app_list = row[1]
    for app in app_list :
        device_id_arr.append(device_id)
        app_arr.append(app)
# build the pair frame
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# In[15]:
# assemble the feature matrix
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[16]:
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
df_hcc_device_brand_age_sex = pd.read_csv('hcc_device_brand_age_sex.csv')
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_hcc_device_brand_age_sex, on='device_id', how='left')
# In[22]:
train = df_train_w2v[df_train_w2v['age'].notnull()]
test = df_train_w2v[df_train_w2v['age'].isnull()]
# In[23]:
X = train.drop(['sex', 'age', 'device_id'],axis=1)
Y = train['age']
# In[24]:
from sklearn.model_selection import KFold, StratifiedKFold
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
oof_preds = np.zeros(train.shape[0])
cate_feat = ['device_type','device_brand']
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
'num_leaves' : 2 ** 5,
'objective' : 'regression',
'metric' : 'rmse',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
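# Age is treated as a regression target here; the 5-fold out-of-fold prediction
# becomes a single continuous "expected age" column used as a stacking feature.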
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
    train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
    valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
    lgb_train=lgb.Dataset(train_x,label=train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
    gbm = lgb.train(params, lgb_train, num_boost_round=800, valid_sets=[lgb_train, lgb_eval], verbose_eval=50)
    oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
train['age_regression_prob_oof'] = oof_preds
# In[26]:
# retrain on the full training set to predict test
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=800, valid_sets=lgb_train, verbose_eval=50)
test = test.reset_index(drop=True)
test_preds = gbm.predict(test[X.columns.values])
# In[27]:
test['age_regression_prob_oof'] = test_preds
# In[30]:
df_age_prob_oof = pd.concat([train[['device_id', 'age_regression_prob_oof']],
test[['device_id', 'age_regression_prob_oof']]])
df_age_prob_oof.to_csv('device_age_regression_prob_oof.csv', index=None)
================================================
FILE: THLUO/13.device_start_GRU_pred.py
================================================
# coding: utf-8
# In[1]:
# coding: utf-8
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import warnings
from TextModel import *
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# In[2]:
print ('13.device_start_GRU_pred.py')
df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
    app_id = row[0]
    vector = row[1:]
    dic_w2c_all[app_id] = vector
# In[3]:
df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x))
df_doc['age'] = df_doc['age'].apply(lambda x:str(x))
def tool(x):
    if x=='nan':
        return x
    else:
        return str(int(float(x)))
df_doc['sex']=df_doc['sex'].apply(tool)
df_doc['age']=df_doc['age'].apply(tool)
df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age']
df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN})
train = df_doc[df_doc['sex_age'].notnull()]
test = df_doc[df_doc['sex_age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['sex_age'].values)
train['class'] = train_label
# In[5]:
column_name="app_list"
word_seq_len = 900
victor_size = 200
num_words = 35000
batch_size = 64
classification = 22
kfold=10
# In[6]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
    # recover integer class ids from one-hot rows via argmax
    results = []
    for ele in y_label :
        results.append(ele.argmax())
    return results
class RocAucEvaluation(Callback):
    # despite its name, this callback reports multiclass log loss each epoch
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data
    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            val_y = get_mut_label(self.y_val)
            score = log_loss(val_y, y_pred)
            print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[7]:
# word vectors: tokenize, pad, and build the embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
    tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
    tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
    train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
    test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
    word_index = tokenizer.word_index
    count = 0
    nb_words = len(word_index)
    print(nb_words)
    all_data=pd.concat([df_train[col],df_test[col]])
    file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model'
    if not os.path.exists(file_name):
        model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
                         size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
        model.save(file_name)
    else:
        model = Word2Vec.load(file_name)
    print("add word2vec finished....")
    embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
    for word, i in word_index.items():
        embedding_vector = model[word] if word in model else None
        if embedding_vector is not None:
            count += 1
            embedding_word2vec_matrix[i] = embedding_vector
        else:
            unk_vec = np.random.random(victor_size) * 0.5
            unk_vec = unk_vec - unk_vec.mean()
            embedding_word2vec_matrix[i] = unk_vec
    embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
    for word, i in word_index.items():
        embedding_vector = dic_w2c_all[word]
        embedding_w2c_all[i] = embedding_vector
    #embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
    embedding_matrix = embedding_word2vec_matrix
    return train_, test_, word_index, embedding_matrix
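# Row 0 of embedding_matrix is the padding index; rows 1..nb_words line up with
# tokenizer.word_index, so the matrix can be fed as initial Embedding weights to
# the models (presumably defined in TextModel.py, imported above).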
# In[8]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[11]:
my_opt="bi_gru_model"
# parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
    os.mkdir("cache/"+my_opt)
# In[12]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 10
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
# In[13]:
epochs = 4
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
# In[26]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[27]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_start_GRU_pred_' + str(i) for i in range(22)]
df_test_pred.columns = ['device_start_GRU_pred_' + str(i) for i in range(22)]
# In[35]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[37]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_start_GRU_pred.csv', index=None)
================================================
FILE: THLUO/14.device_start_GRU_pred_age.py
================================================
# coding: utf-8
# In[1]:
# coding: utf-8
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from TextModel import *
import warnings
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# In[2]:
print('14.device_start_GRU_pred_age.py')
df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
app_id = row[0]
vector = row[1:]
dic_w2c_all[app_id] = vector
# In[3]:
train = df_doc[df_doc['age'].notnull()]
test = df_doc[df_doc['age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['age'].values)
train['class'] = train_label
# In[5]:
column_name="app_list"
word_seq_len = 900
victor_size = 200
num_words = 35000
batch_size = 64
classification = 11
kfold=10
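# Age-only variant of the GRU script: 11 age buckets instead of the 22 joint sex-age classes.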
# In[6]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
results = []
for ele in y_label :
results.append(ele.argmax())
return results
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
val_y = get_mut_label(self.y_val)
score = log_loss(val_y, y_pred)
print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[7]:
# Word vectors: tokenize the app sequences, pad them, and build the word2vec embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
word_index = tokenizer.word_index
count = 0
nb_words = len(word_index)
print(nb_words)
all_data=pd.concat([df_train[col],df_test[col]])
file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model'
if not os.path.exists(file_name):
model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
model.save(file_name)
else:
model = Word2Vec.load(file_name)
print("add word2vec finished....")
embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = model[word] if word in model else None
if embedding_vector is not None:
count += 1
embedding_word2vec_matrix[i] = embedding_vector
else:
unk_vec = np.random.random(victor_size) * 0.5
unk_vec = unk_vec - unk_vec.mean()
embedding_word2vec_matrix[i] = unk_vec
embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = dic_w2c_all[word]
embedding_w2c_all[i] = embedding_vector
#embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
embedding_matrix = embedding_word2vec_matrix
return train_, test_, word_index, embedding_matrix
# In[8]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[11]:
my_opt="bi_gru_model"
# Parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
os.mkdir("cache/"+my_opt)
# In[17]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 10
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
epochs = 4
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
# In[21]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[22]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_start_GRU_pred_age_' + str(i) for i in range(11)]
df_test_pred.columns = ['device_start_GRU_pred_age_' + str(i) for i in range(11)]
# In[23]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[24]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_start_GRU_pred_age.csv', index=None)
================================================
FILE: THLUO/15.device_all_GRU_pred.py
================================================
# coding: utf-8
# In[1]:
# coding: utf-8
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from TextModel import *
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# In[2]:
print('15.device_all_GRU_pred.py')
df_doc = pd.read_csv('03.device_click_app_sorted_by_all.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
app_id = row[0]
vector = row[1:]
dic_w2c_all[app_id] = vector
# In[3]:
df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x))
df_doc['age'] = df_doc['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_doc['sex']=df_doc['sex'].apply(tool)
df_doc['age']=df_doc['age'].apply(tool)
df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age']
df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN})
train = df_doc[df_doc['sex_age'].notnull()]
test = df_doc[df_doc['sex_age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['sex_age'].values)
train['class'] = train_label
# In[6]:
column_name="app_list"
word_seq_len = 1800
victor_size = 200
num_words = 35000
batch_size = 64
classification = 22
kfold=10
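# This variant uses the sequence ordered over all events, hence the doubled
# sequence length (1800) and the wider word2vec window (30) inside w2v_pad.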
# In[7]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
results = []
for ele in y_label :
results.append(ele.argmax())
return results
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
val_y = get_mut_label(self.y_val)
score = log_loss(val_y, y_pred)
print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[14]:
# Word vectors: tokenize the app sequences, pad them, and build the word2vec embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
word_index = tokenizer.word_index
count = 0
nb_words = len(word_index)
print(nb_words)
all_data=pd.concat([df_train[col],df_test[col]])
file_name = 'embedding/' + 'Word2Vec_all' + col +"_"+ str(victor_size) + '.model'
if not os.path.exists(file_name):
model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
size=victor_size, window=30, iter=10, workers=11, seed=2018, min_count=2)
model.save(file_name)
else:
model = Word2Vec.load(file_name)
print("add word2vec finished....")
embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = model[word] if word in model else None
if embedding_vector is not None:
count += 1
embedding_word2vec_matrix[i] = embedding_vector
else:
unk_vec = np.random.random(victor_size) * 0.5
unk_vec = unk_vec - unk_vec.mean()
embedding_word2vec_matrix[i] = unk_vec
embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = dic_w2c_all[word]
embedding_w2c_all[i] = embedding_vector
#embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
embedding_matrix = embedding_word2vec_matrix
return train_, test_, word_index, embedding_matrix
# In[15]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[21]:
my_opt="bi_gru_model"
# Parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
os.mkdir("cache/"+my_opt)
# In[22]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 10
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
# In[23]:
epochs = 4
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
del model
del hist
gc.collect()
# In[27]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[28]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_all_GRU_pred_' + str(i) for i in range(22)]
df_test_pred.columns = ['device_all_GRU_pred_' + str(i) for i in range(22)]
# In[29]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[30]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_all_GRU_pred.csv', index=None)
================================================
FILE: THLUO/16.device_start_capsule_pred.py
================================================
# coding: utf-8
# In[1]:
# coding: utf-8
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# In[2]:
print ('16.device_start_capsule_pred.py')
df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
app_id = row[0]
vector = row[1:]
dic_w2c_all[app_id] = vector
# In[3]:
df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x))
df_doc['age'] = df_doc['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_doc['sex']=df_doc['sex'].apply(tool)
df_doc['age']=df_doc['age'].apply(tool)
df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age']
df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN})
train = df_doc[df_doc['sex_age'].notnull()]
test = df_doc[df_doc['sex_age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['sex_age'].values)
train['class'] = train_label
# In[5]:
column_name="app_list"
word_seq_len = 900
victor_size = 200
num_words = 35000
batch_size = 64
classification = 22
kfold=10
# In[6]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
results = []
for ele in y_label :
results.append(ele.argmax())
return results
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
val_y = get_mut_label(self.y_val)
score = log_loss(val_y, y_pred)
print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[7]:
# Word vectors: tokenize the app sequences, pad them, and build the word2vec embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
word_index = tokenizer.word_index
count = 0
nb_words = len(word_index)
print(nb_words)
all_data=pd.concat([df_train[col],df_test[col]])
file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model'
if not os.path.exists(file_name):
model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
model.save(file_name)
else:
model = Word2Vec.load(file_name)
print("add word2vec finished....")
embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = model[word] if word in model else None
if embedding_vector is not None:
count += 1
embedding_word2vec_matrix[i] = embedding_vector
else:
unk_vec = np.random.random(victor_size) * 0.5
unk_vec = unk_vec - unk_vec.mean()
embedding_word2vec_matrix[i] = unk_vec
embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = dic_w2c_all[word]
embedding_w2c_all[i] = embedding_vector
#embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
embedding_matrix = embedding_word2vec_matrix
return train_, test_, word_index, embedding_matrix
# In[8]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[10]:
from TextModel import *
# In[18]:
my_opt="get_text_capsule"
# Parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
os.mkdir("cache/"+my_opt)
# In[19]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 5
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
# In[20]:
epochs = 10
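# The capsule network trains with 5 folds and 10 epochs, unlike the 10-fold / 4-epoch GRU runs.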
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
# In[24]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[25]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_start_capsule_pred_' + str(i) for i in range(22)]
df_test_pred.columns = ['device_start_capsule_pred_' + str(i) for i in range(22)]
# In[26]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[27]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_start_capsule_pred.csv', index=None)
================================================
FILE: THLUO/17.device_start_textcnn_pred.py
================================================
# coding: utf-8
# In[1]:
# coding: utf-8
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# In[2]:
print ('17.device_start_textcnn_pred.py')
df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
app_id = row[0]
vector = row[1:]
dic_w2c_all[app_id] = vector
# In[3]:
df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x))
df_doc['age'] = df_doc['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_doc['sex']=df_doc['sex'].apply(tool)
df_doc['age']=df_doc['age'].apply(tool)
df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age']
df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN})
train = df_doc[df_doc['sex_age'].notnull()]
test = df_doc[df_doc['sex_age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['sex_age'].values)
train['class'] = train_label
# In[5]:
column_name="app_list"
word_seq_len = 900
victor_size = 200
num_words = 35000
batch_size = 64
classification = 22
kfold=10
# In[6]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
results = []
for ele in y_label :
results.append(ele.argmax())
return results
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
val_y = get_mut_label(self.y_val)
score = log_loss(val_y, y_pred)
print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[7]:
# Word vectors: tokenize the app sequences, pad them, and build the word2vec embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
word_index = tokenizer.word_index
count = 0
nb_words = len(word_index)
print(nb_words)
all_data=pd.concat([df_train[col],df_test[col]])
file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model'
if not os.path.exists(file_name):
model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
model.save(file_name)
else:
model = Word2Vec.load(file_name)
print("add word2vec finished....")
embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = model[word] if word in model else None
if embedding_vector is not None:
count += 1
embedding_word2vec_matrix[i] = embedding_vector
else:
unk_vec = np.random.random(victor_size) * 0.5
unk_vec = unk_vec - unk_vec.mean()
embedding_word2vec_matrix[i] = unk_vec
embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = dic_w2c_all[word]
embedding_w2c_all[i] = embedding_vector
#embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
embedding_matrix = embedding_word2vec_matrix
return train_, test_, word_index, embedding_matrix
# In[8]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[10]:
from TextModel import *
# In[19]:
my_opt="get_text_cnn2"
# Parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
os.mkdir("cache/"+my_opt)
# In[20]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 5
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
# In[21]:
epochs = 6
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
# In[25]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[26]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_start_textcnn_pred_' + str(i) for i in range(22)]
df_test_pred.columns = ['device_start_textcnn_pred_' + str(i) for i in range(22)]
# In[27]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[28]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_start_textcnn_pred.csv', index=None)
================================================
FILE: THLUO/18.device_start_text_dpcnn_pred.py
================================================
# coding: utf-8
# In[1]:
# coding: utf-8
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# In[2]:
print ('18.device_start_text_dpcnn_pred.py')
df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
app_id = row[0]
vector = row[1:]
dic_w2c_all[app_id] = vector
# In[3]:
df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x))
df_doc['age'] = df_doc['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_doc['sex']=df_doc['sex'].apply(tool)
df_doc['age']=df_doc['age'].apply(tool)
df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age']
df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN})
train = df_doc[df_doc['sex_age'].notnull()]
test = df_doc[df_doc['sex_age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['sex_age'].values)
train['class'] = train_label
# In[5]:
column_name="app_list"
word_seq_len = 900
victor_size = 200
num_words = 35000
batch_size = 64
classification = 22
kfold=10
# In[6]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
results = []
for ele in y_label :
results.append(ele.argmax())
return results
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
val_y = get_mut_label(self.y_val)
score = log_loss(val_y, y_pred)
print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[7]:
# Word vectors: tokenize the app sequences, pad them, and build the word2vec embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
word_index = tokenizer.word_index
count = 0
nb_words = len(word_index)
print(nb_words)
all_data=pd.concat([df_train[col],df_test[col]])
file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model'
if not os.path.exists(file_name):
model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
model.save(file_name)
else:
model = Word2Vec.load(file_name)
print("add word2vec finished....")
embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = model[word] if word in model else None
if embedding_vector is not None:
count += 1
embedding_word2vec_matrix[i] = embedding_vector
else:
unk_vec = np.random.random(victor_size) * 0.5
unk_vec = unk_vec - unk_vec.mean()
embedding_word2vec_matrix[i] = unk_vec
embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = dic_w2c_all[word]
embedding_w2c_all[i] = embedding_vector
#embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
embedding_matrix = embedding_word2vec_matrix
return train_, test_, word_index, embedding_matrix
# In[8]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[10]:
from TextModel import *
# In[12]:
my_opt="get_text_dpcnn"
# Parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
os.mkdir("cache/"+my_opt)
# In[13]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 5
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
# In[14]:
from keras import backend as K
epochs = 6
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
del model
del hist
gc.collect()
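    # Release the Keras session and reset the TF graph between folds so GPU memory stays bounded.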
K.clear_session()
tf.reset_default_graph()
# In[15]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[16]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_start_text_dpcnn_pred_' + str(i) for i in range(22)]
df_test_pred.columns = ['device_start_text_dpcnn_pred_' + str(i) for i in range(22)]
# In[17]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[18]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_start_text_dpcnn_pred.csv', index=None)
================================================
FILE: THLUO/19.device_start_lstm_pred.py
================================================
import feather
import os
import re
import sys
import gc
import random
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from scipy import stats
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.training_utils import multi_gpu_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
print ('19.device_start_lstm_pred.py')
# In[2]:
df_doc = pd.read_csv('01.device_click_app_sorted_by_start.csv')
deviceid_test=pd.read_csv('input/deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv('input/deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
df_total = pd.concat([deviceid_train, deviceid_test])
df_doc = df_doc.merge(df_total, on='device_id', how='left')
df_wv2_all = pd.read_csv('w2c_all_emb.csv')
dic_w2c_all = {}
for row in df_wv2_all.values :
app_id = row[0]
vector = row[1:]
dic_w2c_all[app_id] = vector
# In[3]:
df_doc['sex'] = df_doc['sex'].apply(lambda x:str(x))
df_doc['age'] = df_doc['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_doc['sex']=df_doc['sex'].apply(tool)
df_doc['age']=df_doc['age'].apply(tool)
df_doc['sex_age']=df_doc['sex']+'-'+df_doc['age']
df_doc = df_doc.replace({'nan':np.NaN,'nan-nan':np.NaN})
train = df_doc[df_doc['sex_age'].notnull()]
test = df_doc[df_doc['sex_age'].isnull()]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
train_label = lb.fit_transform(train['sex_age'].values)
train['class'] = train_label
# In[5]:
column_name="app_list"
word_seq_len = 900
victor_size = 200
num_words = 35000
batch_size = 64
classification = 22
kfold=10
# In[6]:
from sklearn.metrics import log_loss
def get_mut_label(y_label) :
results = []
for ele in y_label :
results.append(ele.argmax())
return results
class RocAucEvaluation(Callback):
def __init__(self, validation_data=(), interval=1):
        super(RocAucEvaluation, self).__init__()
self.interval = interval
self.X_val, self.y_val = validation_data
def on_epoch_end(self, epoch, logs={}):
if epoch % self.interval == 0:
y_pred = self.model.predict(self.X_val, verbose=0)
val_y = get_mut_label(self.y_val)
score = log_loss(val_y, y_pred)
print("\n mlogloss - epoch: %d - score: %.6f \n" % (epoch+1, score))
# In[7]:
# Word vectors: tokenize the app sequences, pad them, and build the word2vec embedding matrix
def w2v_pad(df_train,df_test,col, maxlen_,victor_size, num_words):
tokenizer = text.Tokenizer(num_words=num_words, lower=False,filters="")
tokenizer.fit_on_texts(list(df_train[col].values)+list(df_test[col].values))
train_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_train[col].values), maxlen=maxlen_)
test_ = sequence.pad_sequences(tokenizer.texts_to_sequences(df_test[col].values), maxlen=maxlen_)
word_index = tokenizer.word_index
count = 0
nb_words = len(word_index)
print(nb_words)
all_data=pd.concat([df_train[col],df_test[col]])
file_name = 'embedding/' + 'Word2Vec_start_' + col +"_"+ str(victor_size) + '.model'
if not os.path.exists(file_name):
model = Word2Vec([[word for word in document.split(' ')] for document in all_data.values],
size=victor_size, window=5, iter=10, workers=11, seed=2018, min_count=2)
model.save(file_name)
else:
model = Word2Vec.load(file_name)
print("add word2vec finished....")
embedding_word2vec_matrix = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = model[word] if word in model else None
if embedding_vector is not None:
count += 1
embedding_word2vec_matrix[i] = embedding_vector
else:
unk_vec = np.random.random(victor_size) * 0.5
unk_vec = unk_vec - unk_vec.mean()
embedding_word2vec_matrix[i] = unk_vec
embedding_w2c_all = np.zeros((nb_words + 1, victor_size))
for word, i in word_index.items():
embedding_vector = dic_w2c_all[word]
embedding_w2c_all[i] = embedding_vector
#embedding_matrix = np.concatenate((embedding_word2vec_matrix,embedding_w2c_all),axis=1)
embedding_matrix = embedding_word2vec_matrix
return train_, test_, word_index, embedding_matrix
# In[8]:
train_, test_,word2idx, word_embedding = w2v_pad(train,test,column_name, word_seq_len,victor_size, num_words)
# In[10]:
from TextModel import *
# In[13]:
my_opt="get_text_lstm1"
# Parameters
Y = train['class'].values
if not os.path.exists("cache/"+my_opt):
os.mkdir("cache/"+my_opt)
# In[14]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2006
num_folds = 5
kf = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed).split(train_, Y)
# In[15]:
from keras import backend as K
epochs = 6
my_opt=eval(my_opt)
train_model_pred = np.zeros((train_.shape[0], classification))
test_model_pred = np.zeros((test_.shape[0], classification))
for i, (train_fold, val_fold) in enumerate(kf):
X_train, X_valid, = train_[train_fold, :], train_[val_fold, :]
y_train, y_valid = Y[train_fold], Y[val_fold]
y_tra = to_categorical(y_train)
y_val = to_categorical(y_valid)
    # Build and train this fold's model
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(X_valid, y_val), interval=1)
hist = model.fit(X_train, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_val),
callbacks=[RocAuc])
train_model_pred[val_fold, :] = model.predict(X_valid)
del model
del hist
gc.collect()
K.clear_session()
tf.reset_default_graph()
# In[19]:
# Final model: retrain on all labelled data to produce the test-set predictions
train_label = to_categorical(Y)
name = str(my_opt.__name__)
model = my_opt(word_seq_len, word_embedding, classification)
RocAuc = RocAucEvaluation(validation_data=(train_, train_label), interval=1)
hist = model.fit(train_, train_label, batch_size=batch_size, epochs=epochs, validation_data=(train_, train_label),
callbacks=[RocAuc])
test_model_pred = model.predict(test_)
# In[20]:
df_train_pred = pd.DataFrame(train_model_pred)
df_test_pred = pd.DataFrame(test_model_pred)
df_train_pred.columns = ['device_start_lstm_pred_' + str(i) for i in range(22)]
df_test_pred.columns = ['device_start_lstm_pred_' + str(i) for i in range(22)]
# In[21]:
df_train_pred = pd.concat([train[['device_id']], df_train_pred], axis=1)
df_test_pred = pd.concat([test[['device_id']], df_test_pred], axis=1)
# In[22]:
df_results = pd.concat([df_train_pred, df_test_pred])
df_results.to_csv('device_start_lstm_pred.csv', index=None)
================================================
FILE: THLUO/2.w2c_model_close.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('2.w2c_model_close.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
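# Collapse rare brands: brands seen only 1, 2, or 3 times are re-bucketed as
# 'other', 'other_2', 'other_3' before label encoding.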
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# Encode brand/type strings as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# Encode app category strings as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
df_sorted = deviceid_package_start_close.sort_values(by='close_time')
# In[6]:
df_results = df_sorted.groupby('device_id')['app_id'].apply(lambda x:' '.join(x)).reset_index().rename(columns = {'app_id' : 'app_list'})
# In[7]:
df_results.to_csv('02.device_click_app_sorted_by_close.csv', index=None)
# In[6]:
df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'})
# In[7]:
app_list = list(df_device_start_app_list.app_list.values)
# In[8]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
# In[9]:
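# Train compact 10-dim app embeddings; they are aggregated per device (mean/std/max/min) below.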
model = Word2Vec(app_list, size=10, window=10, min_count=2, workers=4)
model.save("word2vec.model")
# In[11]:
vocab = list(model.wv.vocab.keys())
w2c_arr = []
for v in vocab :
w2c_arr.append(list(model.wv[v]))
# In[12]:
df_w2c_start = pd.DataFrame()
df_w2c_start['app_id'] = vocab
df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1)
df_w2c_start.columns = ['app_id'] + ['w2c_close_app_' + str(i) for i in range(10)]
# In[ ]:
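# Aggregation spec: summarize every embedding dimension per device with mean/std/max/min.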
w2c_nums = 10
agg = {}
for l in ['w2c_close_app_' + str(i) for i in range(w2c_nums)] :
agg[l] = ['mean', 'std', 'max', 'min']
# In[14]:
deviceid_package_start_close = deviceid_package_start_close.merge(df_w2c_start, on='app_id', how='left')
# In[ ]:
df_agg = deviceid_package_start_close.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
df_agg.to_csv('device_close_app_w2c.csv', index=None)
# In[14]:
df_results = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_time'].mean().reset_index()
df_results = df_results.merge(df_w2c_start, on='app_id', how='left')
# In[17]:
df_agg = df_results.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_app_unique_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
# In[18]:
df_agg.to_csv('device_app_unique_close_app_w2c.csv', index=None)
================================================
FILE: THLUO/20.lgb_sex_age_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('20.lgb_sex_age_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# Encode brand/type strings as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# Encode app category strings as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# Convert a millisecond epoch timestamp into a 'YYYY-MM-DD HH:MM:SS' string
def timeStamp(timeNum):
    ts = float(timeNum / 1000)
    timeArray = time.localtime(ts)
    return time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
# Parse the concrete start/close datetimes and derive hour and duration features
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# Feature engineering
def open_app_timegap_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
df_mean_temp.fillna(0, inplace=True)
return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
    # Gaps between a device's consecutive app start times
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
df_start_gap_agg = df_start_gap_agg.reset_index()
#del df_
gc.collect()
    # Gaps between consecutive app close times
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
df_end_gap_agg = df_end_gap_agg.reset_index()
#del df_
gc.collect()
df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
return df_agg
def open_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def close_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def app_type_mean_time_gap_one_hot () :
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
df_temp.fillna(-1, inplace=True)
return df_temp
def device_active_hour() :
aggregations = {
'start_hour' : ['std','mean','max','min'],
'end_hour' : ['std','mean','max','min']
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
def device_brand_encoding() :
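    # Target-encode brand and type with train-set age/sex statistics. Note this is computed
    # over all labelled rows at once, so the encoding can leak label information into training folds.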
df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
df_device_brand = df_device_brand.reset_index()
df_device_type = df_temp.groupby('device_type').agg(aggregations)
df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
df_device_type = df_device_type.reset_index()
df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
aggregations = {
'device_brand_age_STD' : ['mean'],
'device_brand_age_MEAN' : ['mean'],
'device_brand_sex_MEAN' : ['mean'],
#'device_type_age_STD' : ['mean'],
#'device_type_age_MEAN' : ['mean'],
#'device_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# per-device statistics of app usage
def device_active_time_time_stat() :
    # active time of each record, in raw timestamp units (milliseconds)
deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
    # how many app launches, and how many distinct apps, per device
aggregations = {
'app_id' : ['count', 'nunique'],
'active_time' : ['mean', 'std', 'max', 'min'],
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
aggregations = {
'active_time' : ['mean', 'std', 'max', 'min', 'count'],
}
df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
df_da_agg = df_da_agg.reset_index()
    # per-device aggregates of the per-(device, app) active-time statistics
aggregations = {
'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
}
df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
df_temp = df_temp.reset_index()
df_agg = df_agg.merge(df_temp, on='device_id', how='left')
return df_agg
def app_type_encoding() :
df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
df_agg_app_child_type = df_agg_app_child_type.reset_index()
df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
aggregations = {
'app_parent_type_age_STD' : ['mean'],
'app_parent_type_age_MEAN' : ['mean'],
'app_parent_type_sex_MEAN' : ['mean'],
'app_child_type_age_STD' : ['mean'],
'app_child_type_age_MEAN' : ['mean'],
'app_child_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# per-device counts of each app_parent_type (pivoted to one column per type)
def app_type_onehot_in_device(df) :
df_copy = df.fillna(-1)
df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
df_temp.fillna(0, inplace=True)
return df_temp
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
# row-wise TF-IDF mass per device (vectorized; equivalent to summing every column)
deviceid_packages['tfidf_sum'] = df_weight[feature].sum(axis=1)
# In[10]:
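# 5-topic LDA over the raw app-count matrix; each device's topic mixture
# becomes five dense features in the concat below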
lda = LatentDirichletAllocation(n_components=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
# expand each device's app list into (device_id, app_id) rows
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
device_id = row[0]
app_list = row[1]
for app in app_list :
device_id_arr.append(device_id)
app_arr.append(app)
# assemble the pair DataFrame and attach the app type labels
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# In[15]:
# assemble the feature table: merge every feature block onto the train+test ids
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[16]:
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
# In[17]:
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
# In[19]:
df_train_w2v['sex'] = df_train_w2v['sex'].apply(lambda x:str(x))
df_train_w2v['age'] = df_train_w2v['age'].apply(lambda x:str(x))
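# build the 22-class target 'sex-age' (sex in {1,2} x age in {0..10});
# unlabeled test rows stringify to 'nan' and are mapped back to NaN below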
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_train_w2v['sex']=df_train_w2v['sex'].apply(tool)
df_train_w2v['age']=df_train_w2v['age'].apply(tool)
df_train_w2v['sex_age']=df_train_w2v['sex']+'-'+df_train_w2v['age']
df_train_w2v = df_train_w2v.replace({'nan':np.NaN,'nan-nan':np.NaN})
# In[33]:
train = df_train_w2v[df_train_w2v['sex'].notnull()]
test = df_train_w2v[df_train_w2v['sex'].isnull()]
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
X = train.drop(['sex','age','sex_age','device_id'],axis=1)
Y = train['sex_age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[36]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
oof_preds = np.zeros([train.shape[0], 22])
sub_list = []
cate_feat = ['device_type','device_brand']
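# out-of-fold scheme: each fold's model predicts only its held-out rows, so the
# stacked lgb_sex_age_prob_oof features carry no label leakage into later models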
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
params = {
'boosting_type': 'gbdt',
#'learning_rate' : 0.02,
'learning_rate' : 0.02,
'max_depth':5,
'num_leaves' : 2 ** 4,
'metric': {'multi_logloss'},
'num_class' : 22,
'objective' : 'multiclass',
'random_state' : 2018,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
gbm = lgb.train(params,
lgb_train,
num_boost_round=600,
valid_sets=[lgb_train, lgb_eval],
#early_stopping_rounds=200,
verbose_eval=100)
oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
oof_train = pd.DataFrame(oof_preds)
oof_train.columns = ['lgb_sex_age_prob_oof_' + str(i) for i in range(22)]
train = pd.concat([train, oof_train], axis=1)
# In[37]:
# refit on the full labeled training set and predict the test set
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_train, verbose_eval=100)
test = test.reset_index(drop=True)
test_preds = gbm.predict(test[X.columns.values])
oof_test = pd.DataFrame(test_preds)
oof_test.columns = ['lgb_sex_age_prob_oof_' + str(i) for i in range(22)]
test = pd.concat([test, oof_test], axis=1)
# In[39]:
df_sex_age_prob_oof = pd.concat([train[['device_id'] + ['lgb_sex_age_prob_oof_' + str(i) for i in range(22)] ],
test[['device_id'] + ['lgb_sex_age_prob_oof_' + str(i) for i in range(22)] ]])
df_sex_age_prob_oof.to_csv('lgb_sex_age_prob_oof.csv', index=None)
================================================
FILE: THLUO/21.tfidf_lr_sex_age_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import gc
from sklearn.metrics import log_loss
# In[2]:
print('21.tfidf_lr_sex_age_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
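# collapse brands seen only 1, 2 or 3 times into shared 'other*' buckets to cut
# the cardinality of device_brand before label encoding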
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# label-encode brand / type strings to integers
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# label-encode app category strings to integers
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
deviceid_train = pd.concat([deviceid_train, deviceid_test])
# In[4]:
deviceid_package_start = deviceid_package_start_close[['device_id', 'app_id', 'start_time']]
deviceid_package_start.columns = ['device_id', 'app_id', 'all_time']
deviceid_package_close = deviceid_package_start_close[['device_id', 'app_id', 'close_time']]
deviceid_package_close.columns = ['device_id', 'app_id', 'all_time']
deviceid_package_all = pd.concat([deviceid_package_start, deviceid_package_close])
deviceid_package_all = deviceid_package_all.sort_values(by='all_time')
#deviceid_package_all = deviceid_package_all.merge(deviceid_train, on='device_id', how='left')
# In[5]:
df = deviceid_package_all.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'})
# In[6]:
df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv')
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv')
df_start_close_sex_prob_oof = pd.read_csv('start_close_sex_prob_oof.csv')
df_start_close_age_prob_oof = pd.read_csv('start_close_age_prob_oof.csv')
df_start_close_sex_age_prob_oof = pd.read_csv('start_close_sex_age_prob_oof.csv')
gc.collect()
df = df.merge(df_sex_prob_oof, on='device_id', how='left')
df = df.merge(df_age_prob_oof, on='device_id', how='left')
df = df.merge(df_start_close_sex_prob_oof, on='device_id', how='left')
df = df.merge(df_start_close_age_prob_oof, on='device_id', how='left')
df = df.merge(df_start_close_sex_age_prob_oof, on='device_id', how='left')
df.fillna(0, inplace=True)
apps = df['app_list'].apply(lambda x:' '.join(x)).tolist()
del df['app_list']
df = df.merge(deviceid_train, on='device_id', how='left')
# In[8]:
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
# In[9]:
# copy device_id, the labels and the earlier OOF probability columns onto the dense TF-IDF frame
for i in df.columns.values:
    df_weight[i] = df[i]
# In[11]:
df_weight['sex'] = df_weight['sex'].apply(lambda x:str(x))
df_weight['age'] = df_weight['age'].apply(lambda x:str(x))
def tool(x):
if x == 'nan':
return x
else:
return str(int(float(x)))
df_weight['sex'] = df_weight['sex'].apply(tool)
df_weight['age'] = df_weight['age'].apply(tool)
df_weight['sex_age'] = df_weight['sex']+'-'+df_weight['age']
df_weight['sex_age'] = df_weight.sex_age.replace({'nan':np.NaN,'nan-nan':np.NaN})
# In[12]:
train = df_weight[df_weight.sex_age.notnull()]
train.reset_index(drop=True, inplace=True)
test = df_weight[df_weight.sex_age.isnull()]
test.reset_index(drop=True, inplace=True)
gc.collect()
# In[16]:
X = train.drop(['sex','age','sex_age','device_id'],axis=1)
Y = train['sex_age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[18]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 666
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
oof_preds = np.zeros([train.shape[0], 22])
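# logistic regression on the dense TF-IDF matrix plus the earlier OOF probabilities;
# C=4 is the inverse L2 regularization strength, i.e. fairly weak regularization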
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
clf = LogisticRegression(C=4)
clf.fit(train_x, train_y)
valid_preds=clf.predict_proba(valid_x)
train_preds=clf.predict_proba(train_x)
oof_preds[valid_idx] = valid_preds
print (log_loss(train_y.values, train_preds), log_loss(valid_y.values, valid_preds))
oof_train = pd.DataFrame(oof_preds)
oof_train.columns = ['tfidf_lr_sex_age_prob_oof_' + str(i) for i in range(22)]
train_temp = pd.concat([train[['device_id']], oof_train], axis=1)
# In[20]:
# refit on the full labeled training set and predict the test set
clf = LogisticRegression(C=4)
clf.fit(X, Y)
train_preds=clf.predict_proba(X)
test_preds=clf.predict_proba(test[X.columns])
print (log_loss(Y.values, train_preds))
oof_test = pd.DataFrame(test_preds)
oof_test.columns = ['tfidf_lr_sex_age_prob_oof_' + str(i) for i in range(22)]
# In[24]:
oof_test
# In[25]:
test_temp = pd.concat([test[['device_id']], oof_test], axis=1)
test_temp
# In[26]:
sex_age_oof = pd.concat([train_temp, test_temp])
sex_age_oof
# In[29]:
sex_age_oof.to_csv('tfidf_lr_sex_age_prob_oof.csv', index=None)
================================================
FILE: THLUO/22.base_feat.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
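# collapse brands seen only 1, 2 or 3 times into shared 'other*' buckets to cut
# the cardinality of device_brand before label encoding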
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# label-encode brand / type strings to integers
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# label-encode app category strings to integers
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# convert a millisecond epoch timestamp into a local-time string,
# e.g. timeStamp(1500000000000) -> '2017-07-14 10:40:00' in UTC+8 (depends on the local timezone)
def timeStamp(timeNum):
    time_array = time.localtime(float(timeNum) / 1000)
    return time.strftime("%Y-%m-%d %H:%M:%S", time_array)
# parse raw timestamps into datetimes and derive hour / duration features
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# feature engineering
def open_app_timegap_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
df_mean_temp.fillna(0, inplace=True)
return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
    # gaps between a device's consecutive app start times; the frame is sorted
    # descending, so shift(-1) picks up the chronologically previous event
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
df_start_gap_agg = df_start_gap_agg.reset_index()
#del df_
gc.collect()
    # same gap statistics over consecutive close times
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
df_end_gap_agg = df_end_gap_agg.reset_index()
#del df_
gc.collect()
df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
return df_agg
def open_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def close_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def app_type_mean_time_gap_one_hot () :
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
df_temp.fillna(-1, inplace=True)
return df_temp
def device_active_hour() :
aggregations = {
'start_hour' : ['std','mean','max','min'],
'end_hour' : ['std','mean','max','min']
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
def device_brand_encoding() :
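    # target encoding: per-brand (and per-type) mean/std of the train labels;
    # test rows carry NaN sex/age, so the aggregations simply ignore them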
df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
df_device_brand = df_device_brand.reset_index()
df_device_type = df_temp.groupby('device_type').agg(aggregations)
df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
df_device_type = df_device_type.reset_index()
df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
aggregations = {
'device_brand_age_STD' : ['mean'],
'device_brand_age_MEAN' : ['mean'],
'device_brand_sex_MEAN' : ['mean'],
#'device_type_age_STD' : ['mean'],
#'device_type_age_MEAN' : ['mean'],
#'device_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# per-device statistics of app usage
def device_active_time_time_stat() :
    # active time of each record, in raw timestamp units (milliseconds)
deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
    # how many app launches, and how many distinct apps, per device
aggregations = {
'app_id' : ['count', 'nunique'],
'active_time' : ['mean', 'std', 'max', 'min'],
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
aggregations = {
'active_time' : ['mean', 'std', 'max', 'min', 'count'],
}
df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
df_da_agg = df_da_agg.reset_index()
    # per-device aggregates of the per-(device, app) active-time statistics
aggregations = {
'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
}
df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
df_temp = df_temp.reset_index()
df_agg = df_agg.merge(df_temp, on='device_id', how='left')
return df_agg
def app_type_encoding() :
df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
df_agg_app_child_type = df_agg_app_child_type.reset_index()
df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
aggregations = {
'app_parent_type_age_STD' : ['mean'],
'app_parent_type_age_MEAN' : ['mean'],
'app_parent_type_sex_MEAN' : ['mean'],
'app_child_type_age_STD' : ['mean'],
'app_child_type_age_MEAN' : ['mean'],
'app_child_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# per-device counts of each app_parent_type (pivoted to one column per type)
def app_type_onehot_in_device(df) :
df_copy = df.fillna(-1)
df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
df_temp.fillna(0, inplace=True)
return df_temp
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
# row-wise TF-IDF mass per device (vectorized; equivalent to summing every column)
deviceid_packages['tfidf_sum'] = df_weight[feature].sum(axis=1)
# In[10]:
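# 5-topic LDA over the raw app-count matrix; each device's topic mixture
# becomes five dense features in the concat below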
lda = LatentDirichletAllocation(n_components=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
# expand each device's app list into (device_id, app_id) rows
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
device_id = row[0]
app_list = row[1]
for app in app_list :
device_id_arr.append(device_id)
app_arr.append(app)
# assemble the pair DataFrame and attach the app type labels
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# In[15]:
# assemble the feature table: merge every feature block onto the train+test ids
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[33]:
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv')
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_start_close_sex_prob_oof = pd.read_csv('start_close_sex_prob_oof.csv')
# the next two: offline CV and the leaderboard disagree; they overfit offline
df_start_close_age_prob_oof = pd.read_csv('start_close_age_prob_oof.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_tfidf_lr_sex_age_prob_oof = pd.read_csv('tfidf_lr_sex_age_prob_oof.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
# earlier features that proved useful
df_sex_age_bin_prob_oof = pd.read_csv('sex_age_bin_prob_oof.csv')
df_age_bin_prob_oof = pd.read_csv('age_bin_prob_oof.csv')
df_hcc_device_brand_age_sex = pd.read_csv('hcc_device_brand_age_sex.csv')
df_device_age_regression_prob_oof = pd.read_csv('device_age_regression_prob_oof.csv')
df_device_start_GRU_pred = pd.read_csv('device_start_GRU_pred.csv')
df_device_start_GRU_pred_age = pd.read_csv('device_start_GRU_pred_age.csv')
df_device_all_GRU_pred = pd.read_csv('device_all_GRU_pred.csv')
df_device_start_capsule_pred = pd.read_csv('device_start_capsule_pred.csv')
df_lgb_sex_age_prob_oof = pd.read_csv('lgb_sex_age_prob_oof.csv')
df_device_start_textcnn_pred = pd.read_csv('device_start_textcnn_pred.csv')
df_device_start_text_dpcnn_pred = pd.read_csv('device_start_text_dpcnn_pred.csv')
df_device_start_lstm_pred = pd.read_csv('device_start_lstm_pred.csv')
# drop the feature columns that were found to overfit
del df_start_close_age_prob_oof['device_app_groupedstart_close_age_prob_oof_4_MEAN']
del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MIN']
del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MAX']
# In[35]:
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_sex_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_start_close_sex_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_start_close_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_tfidf_lr_sex_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_sex_age_bin_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_age_bin_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_hcc_device_brand_age_sex, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_age_regression_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred_age, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_all_GRU_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_capsule_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_lgb_sex_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_textcnn_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_text_dpcnn_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_lstm_pred, on='device_id', how='left')
# In[24]:
df_train_w2v['sex'] = df_train_w2v['sex'].apply(lambda x:str(x))
df_train_w2v['age'] = df_train_w2v['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_train_w2v['sex']=df_train_w2v['sex'].apply(tool)
df_train_w2v['age']=df_train_w2v['age'].apply(tool)
df_train_w2v['sex_age']=df_train_w2v['sex']+'-'+df_train_w2v['age']
df_train_w2v = df_train_w2v.replace({'nan':np.NaN,'nan-nan':np.NaN})
# In[ ]:
df_train_w2v.to_csv('thluo_train_best_feat.csv', index=None)
================================================
FILE: THLUO/23.ATT_v6.py
================================================
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
# explicit imports for names used by AttentionWithContext / AdamW below
# (otherwise they are only supplied via the wildcard keras imports above)
from keras import initializers, regularizers, constraints
from keras.legacy import interfaces
from keras.optimizers import *
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
import tensorflow as tf
#from keras.backend.tensorflow_backend import set_session
#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.9
#set_session(tf.Session(config=config))
print ('23.ATT_v6.py')
path="input/"
np.random.seed(1337)
packages = pd.read_csv(path+'deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
data = pd.read_csv('thluo_train_best_feat.csv')
data.head()
train = pd.merge(train, data, on='device_id', how='left')
test = pd.merge(test, data, on='device_id', how='left')
train.head()
X_h = train.drop(['device_id', 'sex', 'age'], axis=1).values
X_h_test = test.drop(['device_id'], axis=1).values
packages['app_lenghth'] = packages['apps'].apply(lambda x:x.split(',')).apply(lambda x:len(x))
packages['app_list'] = packages['apps'].apply(lambda x:x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
embed_size = 128
fastmodel = FastText(list(packages['app_list']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
Y_sex = train['sex']-1
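# embedding lookup: row i holds the FastText vector of the app whose tokenizer
# index is i; apps FastText never saw (min_count=3) keep all-zero rows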
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
if word not in fastmodel.wv.vocab:
continue
embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
def dot_product(x, kernel):
"""
Wrapper for dot product operation, in order to be compatible with both
Theano and Tensorflow
Args:
x (): input
kernel (): weights
Returns:
"""
if K.backend() == 'tensorflow':
return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
else:
return K.dot(x, kernel)
class AttentionWithContext(Layer):
"""
Attention operation, with a context/query vector, for temporal data.
Supports Masking.
Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
"Hierarchical Attention Networks for Document Classification"
by using a context vector to assist the attention
# Input shape
3D tensor with shape: `(samples, steps, features)`.
# Output shape
2D tensor with shape: `(samples, features)`.
How to use:
Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
The dimensions are inferred based on the output shape of the RNN.
Note: The layer has been tested with Keras 2.0.6
Example:
model.add(LSTM(64, return_sequences=True))
model.add(AttentionWithContext())
# next add a Dense layer (for classification/regression) or whatever...
"""
def __init__(self,
W_regularizer=None, u_regularizer=None, b_regularizer=None,
W_constraint=None, u_constraint=None, b_constraint=None,
bias=True, **kwargs):
self.supports_masking = True
self.init = initializers.get('glorot_uniform')
self.W_regularizer = regularizers.get(W_regularizer)
self.u_regularizer = regularizers.get(u_regularizer)
self.b_regularizer = regularizers.get(b_regularizer)
self.W_constraint = constraints.get(W_constraint)
self.u_constraint = constraints.get(u_constraint)
self.b_constraint = constraints.get(b_constraint)
self.bias = bias
super(AttentionWithContext, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = self.add_weight((input_shape[-1], input_shape[-1],),
initializer=self.init,
name='{}_W'.format(self.name),
regularizer=self.W_regularizer,
constraint=self.W_constraint)
if self.bias:
self.b = self.add_weight((input_shape[-1],),
initializer='zero',
name='{}_b'.format(self.name),
regularizer=self.b_regularizer,
constraint=self.b_constraint)
self.u = self.add_weight((input_shape[-1],),
initializer=self.init,
name='{}_u'.format(self.name),
regularizer=self.u_regularizer,
constraint=self.u_constraint)
super(AttentionWithContext, self).build(input_shape)
def compute_mask(self, input, input_mask=None):
# do not pass the mask to the next layers
return None
def call(self, x, mask=None):
uit = dot_product(x, self.W)
if self.bias:
uit += self.b
uit = K.tanh(uit)
ait = dot_product(uit, self.u)
a = K.exp(ait)
# apply mask after the exp. will be re-normalized next
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
a *= K.cast(mask, K.floatx())
# in some cases especially in the early stages of training the sum may be almost zero
# and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
# a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
a = K.expand_dims(a)
weighted_input = x * a
return K.sum(weighted_input, axis=1)
def compute_output_shape(self, input_shape):
return input_shape[0], input_shape[-1]
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
self.wd = K.variable(weight_decay, name='weight_decay') # decoupled weight decay (2/4)
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p # decoupled weight decay (4/4)
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def model_conv1D(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False)
lstm_layer = Bidirectional(GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
att = AttentionWithContext()
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
att1 = att(lstm)
hin = Input(shape=(X_h.shape[1], ))
htime = Dense(64, activation='relu')(hin)
merge1 = concatenate([att1, htime])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[seq, hin], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.08,))###
return model
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
count=0
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
print("FOLD | ",count+1)
filepath="sex_weights_best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=6, verbose=1, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[train_index], X_h[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit([X_tr, X_tr2], y_tr, batch_size=256, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks, verbose=2, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict([X_test, X_h_test]))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict([X_vl, X_vl2])
score.append(np.min(hist.history['val_loss']))
count+=1
print('log loss:',np.mean(score))
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1 = pd.concat([oof_pref1, sub1])
res1['sex1'] = 1-res1['sex2']
res1.to_csv("res1.csv", index=False)
def model_age_conv(embedding_matrix):
# The embedding layer containing the word vectors
K.clear_session()
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
att = AttentionWithContext()
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
att1 = att(lstm)
hin = Input(shape=(X_h.shape[1], ))
htime = Dense(64, activation='relu')(hin)
merge1 = concatenate([att1, htime])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=[seq, hin], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer=AdamW(weight_decay=0.08,))
return model
Y_age = to_categorical(train['age'])
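# the age model is conditioned on sex: append the true sex for training rows,
# then score the test set twice, once assuming sex=1 and once assuming sex=2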
X_h = np.hstack([X_h, train['sex'].values.reshape((-1, 1))])
X_h_test1 = np.hstack([X_h_test, np.ones((X_h_test.shape[0], 1))])
X_h_test2 = np.hstack([X_h_test, np.ones((X_h_test.shape[0], 1))*2])
sub2_1 = np.zeros((X_test.shape[0], 11))
sub2_2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count=0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ",count+1)
filepath2="age_weights_best_%d.h5"%count
checkpoint2 = ModelCheckpoint(filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
model_age = model_age_conv(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[train_index], X_h[test_index], Y_age[train_index], Y_age[test_index]
hist = model_age.fit([X_tr, X_tr2], y_tr, batch_size=256, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks2, verbose=2, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict([X_vl, X_vl2])
sub2_1 += model_age.predict([X_test, X_h_test1])/kfold.n_splits
sub2_2 += model_age.predict([X_test, X_h_test2])/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count+=1
print('log loss:',np.mean(score))
res2_1 = np.vstack((oof_pref2, sub2_1))
res2_1 = pd.DataFrame(res2_1)
res2_1.to_csv("res2_1.csv",index=False)
res2_2 = np.vstack((oof_pref2, sub2_2))
res2_2 = pd.DataFrame(res2_2)
res2_2.to_csv("res2_2.csv",index=False)
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
res2_2.index=range(len(res2_2))
final_1 = res2_1
final_2 = res2_2
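# combine the two heads via the chain rule:
# P(sex=s, age=a) = P(sex=s) * P(age=a | sex=s) for the 22 classes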
for i in range(11):
final_1[i] = res1['sex1']*res2_1[i]
final_2[i] = res1['sex2']*res2_2[i]
id_list = pd.concat([train[['device_id']],test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns= ['device_id']
final_pred = pd.concat([final_1, final_2], axis=1)
final = pd.concat([final, final_pred], axis=1)
final.columns = ['device_id', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('att_nn_feat_v6.csv', index=False)
sub = pd.merge(test[['device_id']], final, on='device_id', how='left')
sub.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
sub.to_csv('Att_v6.csv', index=False)
================================================
FILE: THLUO/24.thluo_22_lgb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[24]:
df_train_w2v = pd.read_csv('thluo_train_best_feat.csv')
df_att_nn_feat_v6 = pd.read_csv('att_nn_feat_v6.csv')
df_att_nn_feat_v6.columns = ['device_id'] + ['att_nn_feat_' + str(i) for i in range(22)]
df_train_w2v = df_train_w2v.merge(df_att_nn_feat_v6, on='device_id', how='left')
# In[ ]:
df_train_w2v.to_csv('thluo_train_best_feat.csv', index=None)
# In[26]:
train = df_train_w2v[df_train_w2v['sex'].notnull()]
test = df_train_w2v[df_train_w2v['sex'].isnull()]
X = train.drop(['sex','age','sex_age','device_id'],axis=1)
Y = train['sex_age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[28]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 666
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
cate_feat = ['device_type','device_brand']
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
params = {
'boosting_type': 'gbdt',
#'learning_rate' : 0.02,
'learning_rate' : 0.01,
'max_depth':5,
'num_leaves' : 2 ** 4,
'metric': {'multi_logloss'},
'num_class' : 22,
'objective' : 'multiclass',
'random_state' : 2018,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
gbm = lgb.train(params,
lgb_train,
num_boost_round=1000,
valid_sets=lgb_eval,
early_stopping_rounds=200, verbose_eval=100)
sub = pd.DataFrame(gbm.predict(test[X.columns.values],num_iteration=gbm.best_iteration))
sub_list.append(sub)
# In[29]:
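# average the five fold models' test predictions (simple bagging of the CV models)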
sub = (sub_list[0] + sub_list[1] + sub_list[2] + sub_list[3] + sub_list[4]) / num_folds
# In[31]:
sub.columns=Y_CAT.categories
sub['DeviceID']=test['device_id'].values
sub=sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
# In[32]:
sub.to_csv('th_22_results_lgb.csv',index=False)
================================================
FILE: THLUO/25.thluo_22_xgb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
from feat_util import *
# In[2]:
print ('25.thluo_22_xgb.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
# In[4]:
df_train = pd.concat([deviceid_train, deviceid_test])
# In[5]:
df_train
# In[6]:
df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv')
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv')
df_start_close_sex_prob_oof = pd.read_csv('start_close_sex_prob_oof.csv')
#The following two don't track between offline CV and the leaderboard; they overfit offline
df_start_close_age_prob_oof = pd.read_csv('start_close_age_prob_oof.csv')
df_tfidf_lr_sex_age_prob_oof = pd.read_csv('tfidf_lr_sex_age_prob_oof.csv')
#Features from earlier that proved useful
df_sex_age_bin_prob_oof = pd.read_csv('sex_age_bin_prob_oof.csv')
df_age_bin_prob_oof = pd.read_csv('age_bin_prob_oof.csv')
df_hcc_device_brand_age_sex = pd.read_csv('hcc_device_brand_age_sex.csv')
df_device_age_regression_prob_oof = pd.read_csv('device_age_regression_prob_oof.csv')
df_device_start_GRU_pred = pd.read_csv('device_start_GRU_pred.csv')
df_device_start_GRU_pred_age = pd.read_csv('device_start_GRU_pred_age.csv')
df_device_all_GRU_pred = pd.read_csv('device_all_GRU_pred.csv')
df_lgb_sex_age_prob_oof = pd.read_csv('lgb_sex_age_prob_oof.csv')
df_device_start_capsule_pred = pd.read_csv('device_start_capsule_pred.csv')
df_device_start_textcnn_pred = pd.read_csv('device_start_textcnn_pred.csv')
df_device_start_text_dpcnn_pred = pd.read_csv('device_start_text_dpcnn_pred.csv')
df_device_start_lstm_pred = pd.read_csv('device_start_lstm_pred.csv')
df_att_nn_feat_v6 = pd.read_csv('att_nn_feat_v6.csv')
df_att_nn_feat_v6.columns = ['device_id'] + ['att_nn_feat_' + str(i) for i in range(22)]
#Drop overfitting features
del df_start_close_age_prob_oof['device_app_groupedstart_close_age_prob_oof_4_MEAN']
del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MIN']
del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MAX']
# In[7]:
df_train_w2v = df_train.merge(df_sex_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_start_close_sex_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_start_close_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_sex_age_bin_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_age_bin_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_hcc_device_brand_age_sex, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_age_regression_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred_age, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_all_GRU_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_lgb_sex_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_capsule_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_textcnn_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_text_dpcnn_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_lstm_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_att_nn_feat_v6, on='device_id', how='left')
# In[9]:
df_train_w2v['sex'] = df_train_w2v['sex'].apply(lambda x:str(x))
df_train_w2v['age'] = df_train_w2v['age'].apply(lambda x:str(x))
def tool(x):
if x=='nan':
return x
else:
return str(int(float(x)))
df_train_w2v['sex']=df_train_w2v['sex'].apply(tool)
df_train_w2v['age']=df_train_w2v['age'].apply(tool)
df_train_w2v['sex_age']=df_train_w2v['sex']+'-'+df_train_w2v['age']
df_train_w2v = df_train_w2v.replace({'nan':np.NaN,'nan-nan':np.NaN})
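# sex_age now holds string labels like '1-0' ... '2-10' for train rows and NaN
# for test rows, ready to be used as the 22-class target.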
# In[11]:
train = df_train_w2v[df_train_w2v['sex'].notnull()]
test = df_train_w2v[df_train_w2v['sex'].isnull()]
X = train.drop(['sex','age','sex_age','device_id'],axis=1)
Y = train['sex_age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[14]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
#seed = 2048
seed = 666
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
cate_feat = ['device_type','device_brand']
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
xg_train = xgb.DMatrix(train_x, label=train_y)
xg_val = xgb.DMatrix(valid_x, label=valid_y)
param = {
'objective' : 'multi:softprob',
'eta' : 0.03,
'max_depth' : 3,
'num_class' : 22,
'eval_metric' : 'mlogloss',
'min_child_weight' : 3,
'subsample' : 0.7,
'colsample_bytree' : 0.7,
'seed' : 2006,
'nthread' : 5
}
num_rounds = 1000
watchlist = [ (xg_train,'train'), (xg_val, 'val') ]
model = xgb.train(param, xg_train, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=50)
test_matrix = xgb.DMatrix(test[X.columns.values])
sub = pd.DataFrame(model.predict(test_matrix))
sub_list.append(sub)
# In[15]:
sub = (sub_list[0] + sub_list[1] + sub_list[2] + sub_list[3] + sub_list[4]) / num_folds
sub
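# Note (added): with early stopping, this era's xgboost Booster.predict scores
# with all trained trees unless an ntree_limit is given, e.g.
# model.predict(test_matrix, ntree_limit=model.best_ntree_limit)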
# In[16]:
sub.columns=Y_CAT.categories
sub['DeviceID']=test['device_id'].values
sub=sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
sub.to_csv('th_22_results_xgb.csv',index=False)
================================================
FILE: THLUO/26.thluo_nb_lgb.py
================================================
# coding: utf-8
# In[1]:
from sklearn.metrics import log_loss
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
import os
path="./"
os.listdir(path)
# In[2]:
print ('26.thluo_nb_lgb.py')
train_id=pd.read_csv("input/deviceid_train.tsv",sep="\t",names=['device_id','sex','age'])
test_id=pd.read_csv("input/deviceid_test.tsv",sep="\t",names=['device_id'])
all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]])
#nurbs=pd.read_csv("nurbs_feature_all.csv")
#nurbs.columns=["nurbs_"+str(i) for i in nurbs.columns]
thluo = pd.read_csv("thluo_train_best_feat.csv")
del thluo['age']
del thluo['sex']
del thluo['sex_age']
# In[7]:
feat = thluo.copy()
# In[8]:
train=pd.merge(train_id,feat,on="device_id",how="left")
test=pd.merge(test_id,feat,on="device_id",how="left")
# In[11]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age",]]
Y = train['sex'] - 1
# In[12]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 1024
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
# In[13]:
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'binary_logloss'},
#'num_class' : 22,
'objective' : 'binary',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
# In[14]:
#Predict sex
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(folds.split(train[features], Y)):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
lgb_train=lgb.Dataset(tr_x,label=tr_y)
lgb_eval = lgb.Dataset(te_x, te_y, reference=lgb_train)
gbm = lgb.train(params, lgb_train, num_boost_round=300,
valid_sets=[lgb_train, lgb_eval], verbose_eval=100)
pred = gbm.predict(te_x[tr_x.columns.values])
pred_oob1[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[15]:
#Train one lgb on the full training data
#and use it to predict test
lgb_train = lgb.Dataset(train[features],label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=300, valid_sets=lgb_train, verbose_eval=100)
sub1 = gbm.predict(test[features])
# In[16]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
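# res1['sex2'] is P(sex == 2) from the binary model (labels were sex - 1), so
# res1['sex1'] = 1 - P(sex == 2) is P(sex == 1).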
# In[18]:
# In[50]:
features = [x for x in train.columns if x not in ['device_id',"age"]]
Y = train['age']
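# 'sex' is deliberately left in the feature list above: this model learns
# P(age | sex, x), and test-time sex is set to 1 and then 2 further below.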
# In[51]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
# In[19]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 1024
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
# In[20]:
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'multi_logloss'},
'num_class' : 11,
'objective' : 'multiclass',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
# In[22]:
#Predict age (11 classes)
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
models=[]
iters=[]
for i,(train_index,test_index) in enumerate(folds.split(train[features], Y)):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
lgb_train=lgb.Dataset(tr_x,label=tr_y)
lgb_eval = lgb.Dataset(te_x, te_y, reference=lgb_train)
gbm = lgb.train(params, lgb_train, num_boost_round=430,
valid_sets=[lgb_train, lgb_eval], verbose_eval=100)
pred = gbm.predict(te_x[tr_x.columns.values])
pred_oob2[test_index] = pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
#sub2 += gbm.predict(test[features], num_iteration=gbm.best_iteration) / 5
models.append(gbm)
iters.append(gbm.best_iteration)
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[23]:
#Predict conditional probabilities P(age | sex)
####sex = 1
test['sex']=1
#train one lgb for age on the full training data
#and use it to predict test (with sex fixed to 1)
lgb_train = lgb.Dataset(train[features],label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=430, valid_sets=lgb_train, verbose_eval=100)
sub2 = gbm.predict(test[features])
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[24]:
####sex = 2
#reuse the full-data age model, now with sex set to 2
test['sex']=2
sub2 = np.zeros((len(test),11))
sub2 = gbm.predict(test[features], num_iteration = gbm.best_iteration)
res2_2=np.vstack((pred_oob2,sub2))
res2_2 = pd.DataFrame(res2_2)
# In[27]:
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
res2_2.index=range(len(res2_2))
final_1=res2_1.copy()
final_2=res2_2.copy()
# In[28]:
for i in range(11):
final_1[i]=res1['sex1'] * res2_1[i]
final_2[i]=res1['sex2'] * res2_2[i]
id_list = pd.concat([train[['device_id']],test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1,final_2], axis=1)
final = pd.concat([final,final_pred], axis=1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
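# Illustrative check (added): each P(age|sex) row is a softmax distribution,
# so P(sex=1)*1 + P(sex=2)*1 = 1 and every combined row should sum to ~1.
prob_cols = [c for c in final.columns if c != 'DeviceID']
print(final[prob_cols].sum(axis=1).describe())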
# In[30]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("th_lgb_nb.csv",index=False)
================================================
FILE: THLUO/27.thluo_nb_xgb.py
================================================
# coding: utf-8
# In[1]:
from sklearn.metrics import log_loss
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
import os
import xgboost as xgb
path="./"
os.listdir(path)
# In[2]:
print ('27.thluo_nb_xgb.py')
train_id=pd.read_csv("input/deviceid_train.tsv",sep="\t",names=['device_id','sex','age'])
test_id=pd.read_csv("input/deviceid_test.tsv",sep="\t",names=['device_id'])
all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]])
df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv')
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv')
df_start_close_sex_prob_oof = pd.read_csv('start_close_sex_prob_oof.csv')
#The following two don't track between offline CV and the leaderboard; they overfit offline
df_start_close_age_prob_oof = pd.read_csv('start_close_age_prob_oof.csv')
#df_start_close_sex_age_prob_oof = pd.read_csv('start_close_sex_age_prob_oof.csv')
df_tfidf_lr_sex_age_prob_oof = pd.read_csv('tfidf_lr_sex_age_prob_oof.csv')
#Features from earlier that proved useful
df_sex_age_bin_prob_oof = pd.read_csv('sex_age_bin_prob_oof.csv')
df_age_bin_prob_oof = pd.read_csv('age_bin_prob_oof.csv')
df_hcc_device_brand_age_sex = pd.read_csv('hcc_device_brand_age_sex.csv')
df_device_age_regression_prob_oof = pd.read_csv('device_age_regression_prob_oof.csv')
df_device_start_GRU_pred = pd.read_csv('device_start_GRU_pred.csv')
df_device_start_GRU_pred_age = pd.read_csv('device_start_GRU_pred_age.csv')
df_device_all_GRU_pred = pd.read_csv('device_all_GRU_pred.csv')
#df_boost_sex_age_prob_oof = pd.read_csv('boost_sex_age_prob_oof.csv')
df_lgb_sex_age_prob_oof = pd.read_csv('lgb_sex_age_prob_oof.csv')
df_device_start_capsule_pred = pd.read_csv('device_start_capsule_pred.csv')
df_device_start_textcnn_pred = pd.read_csv('device_start_textcnn_pred.csv')
df_device_start_text_dpcnn_pred = pd.read_csv('device_start_text_dpcnn_pred.csv')
df_device_start_lstm_pred = pd.read_csv('device_start_lstm_pred.csv')
df_att_nn_feat_v6 = pd.read_csv('att_nn_feat_v6.csv')
df_att_nn_feat_v6.columns = ['device_id'] + ['att_nn_feat_' + str(i) for i in range(22)]
#Drop overfitting features
del df_start_close_age_prob_oof['device_app_groupedstart_close_age_prob_oof_4_MEAN']
del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MIN']
del df_start_close_sex_prob_oof['device_app_groupedstart_close_sex_prob_oof_MAX']
# In[3]:
df_train_w2v = all_id.merge(df_sex_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_start_close_sex_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_start_close_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_sex_age_bin_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_age_bin_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_hcc_device_brand_age_sex, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_age_regression_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_GRU_pred_age, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_all_GRU_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_lgb_sex_age_prob_oof, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_capsule_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_textcnn_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_text_dpcnn_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_start_lstm_pred, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_att_nn_feat_v6, on='device_id', how='left')
# In[5]:
feat = df_train_w2v.copy()
# In[6]:
train=pd.merge(train_id,feat,on="device_id",how="left")
test=pd.merge(test_id,feat,on="device_id",how="left")
# In[8]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age",]]
Y = train['sex'] - 1
# In[9]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 1024
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
# In[10]:
params={
'booster':'gbtree',
'objective': 'binary:logistic',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "logloss",
'gamma':0.2,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':5,
}
num_round = 3500
early_stopping_rounds = 100
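# Note (added): num_round and early_stopping_rounds are defined here but never
# passed on; the xgb.train calls in this script use fixed num_boost_round
# values (530 / 550) with no early stopping.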
# In[11]:
#Predict sex
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(folds.split(train[features], Y)):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=530,
evals=watchlist,verbose_eval=100)
pred = model.predict(d_te)
pred_oob1[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[12]:
#Train one xgb on the full training data
#and use it to predict test
xgb_train = xgb.DMatrix(train[features], label=Y)
watchlist = [(xgb_train,'train')]
gbm = xgb.train(params, xgb_train, num_boost_round=530, evals=watchlist, verbose_eval=100)
sub1 = gbm.predict(xgb.DMatrix(test[features]))
# In[13]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
# In[15]:
# In[50]:
features = [x for x in train.columns if x not in ['device_id',"age"]]
Y = train['age']
# In[51]:
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
# In[16]:
from sklearn.model_selection import KFold, StratifiedKFold
gc.collect()
seed = 1024
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
# In[17]:
params={
'booster':'gbtree',
'objective': 'multi:softprob',
'eval_metric': "mlogloss",
'num_class':11,
'gamma':0.1,#0.2 is ok
'max_depth':5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.02,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':5,
}
# In[19]:
#Predict age (11 classes)
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
models=[]
iters=[]
for i,(train_index,test_index) in enumerate(folds.split(train[features], Y)):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')]
model = xgb.train(params, d_tr, num_boost_round=550,
evals=watchlist,verbose_eval=100)
pred = model.predict(d_te)
pred_oob2[test_index] = pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
#sub2 += gbm.predict(test[features], num_iteration=gbm.best_iteration) / 5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[20]:
#Predict conditional probabilities P(age | sex)
####sex = 1
test['sex']=1
#train one xgb for age on the full training data
#and use it to predict test (with sex fixed to 1)
xgb_train = xgb.DMatrix(train[features], label=Y)
watchlist = [(xgb_train,'train')]
gbm = xgb.train(params, xgb_train, num_boost_round=550, evals=watchlist, verbose_eval=100)
sub2 = gbm.predict(xgb.DMatrix(test[features]))
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[21]:
####sex = 2
#reuse the full-data age model, now with sex set to 2
test['sex']=2
sub2 = np.zeros((len(test),11))
sub2 = gbm.predict(xgb.DMatrix(test[features]))
res2_2=np.vstack((pred_oob2,sub2))
res2_2 = pd.DataFrame(res2_2)
# In[24]:
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
res2_2.index=range(len(res2_2))
final_1=res2_1.copy()
final_2=res2_2.copy()
# In[25]:
for i in range(11):
final_1[i]=res1['sex1'] * res2_1[i]
final_2[i]=res1['sex2'] * res2_2[i]
id_list = pd.concat([train[['device_id']],test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1,final_2], axis=1)
final = pd.concat([final,final_pred], axis=1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
# In[27]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("th_xgb_nb.csv",index=False)
================================================
FILE: THLUO/28.final.py
================================================
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
# In[2]:
th_22_results_lgb = pd.read_csv('th_22_results_lgb.csv')
th_22_results_xgb = pd.read_csv('th_22_results_xgb.csv')
th_lgb_nb = pd.read_csv('th_lgb_nb.csv')
th_xgb_nb = pd.read_csv('th_xgb_nb.csv')
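# Added guard (assumption, not in the original): the positional blends below
# rely on all four submissions sharing one DeviceID order.
print((th_22_results_lgb['DeviceID'] == th_22_results_xgb['DeviceID']).all())
print((th_22_results_lgb['DeviceID'] == th_xgb_nb['DeviceID']).all())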
# In[5]:
#Direct 22-class results: blend lgb and xgb with 0.55/0.45 weights
results_22 = pd.DataFrame(th_22_results_lgb.values[:,1:] * 0.55 + th_22_results_xgb.values[:,1:] * 0.45)
results_22.columns = th_22_results_lgb.columns[1:]
results_22['DeviceID'] = th_22_results_lgb['DeviceID']
# In[6]:
#Conditional-probability results: blend xgb and lgb with 0.65/0.35 weights
results_nb = pd.DataFrame(th_xgb_nb.values[:,1:] * 0.65 + th_lgb_nb.values[:,1:] * 0.35)
results_nb.columns = th_xgb_nb.columns[1:]
results_nb['DeviceID'] = th_xgb_nb['DeviceID']
# In[ ]:
#Blend the two result sets with a further 0.65/0.35 weighting
results_final = pd.DataFrame(results_22.values[:,1:] * 0.65 + results_nb.values[:,1:] * 0.35)
results_final.columns = results_22.columns[1:]
results_final['DeviceID'] = results_22['DeviceID']
# In[ ]:
results_final.to_csv('result/thluo_final.csv', index=None)
================================================
FILE: THLUO/3.device_quchong_start_app_w2c.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
# In[2]:
print ('3.device_quchong_start_app_w2c.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
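# Pool rare brands: brands seen only 1 / 2 / 3 times collapse into
# 'other' / 'other_2' / 'other_3' before label encoding.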
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# Convert a millisecond timestamp into a formatted datetime string
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
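# e.g. timeStamp(1488556800000) -> '2017-03-04 00:00:00' on a UTC+8 machine
# (time.localtime makes the output timezone-dependent).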
#Parse concrete time fields from the raw timestamps
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[9]:
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_hour'].mean().reset_index()
df_temp
# In[10]:
df_sorted = df_temp.sort_values(by='start_hour')
# In[13]:
df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'})
# In[17]:
app_list = list(df_device_start_app_list.app_list.values)
# In[35]:
model = Word2Vec(app_list, size=10, window=4, min_count=2, workers=4)
model.save("word2vec.model")
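# Illustrative only (added, commented out): inspect nearest neighbours in the
# app-embedding space; any in-vocabulary app id would do.
# print(model.wv.most_similar(app_list[0][0], topn=5))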
# In[37]:
vocab = list(model.wv.vocab.keys())
w2c_arr = []
for v in vocab :
w2c_arr.append(list(model.wv[v]))
# In[38]:
df_w2c_start = pd.DataFrame()
df_w2c_start['app_id'] = vocab
df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1)
df_w2c_start.columns = ['app_id'] + ['w2c_start_app_' + str(i) for i in range(10)]
# In[47]:
df_sorted = df_sorted.merge(df_w2c_start, on='app_id', how='left')
df_sorted
# In[48]:
w2c_nums = 10
agg = {}
for l in ['w2c_start_app_' + str(i) for i in range(w2c_nums)] :
agg[l] = ['mean', 'std', 'max', 'min']
# In[50]:
df_agg = df_sorted.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_quchong' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
# In[52]:
df_agg.to_csv('device_quchong_start_app_w2c.csv', index=None)
================================================
FILE: THLUO/3.w2c_all_emb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
deviceid_package_start = deviceid_package_start_close[['device_id', 'app_id', 'start_time']]
deviceid_package_start.columns = ['device_id', 'app_id', 'all_time']
deviceid_package_close = deviceid_package_start_close[['device_id', 'app_id', 'close_time']]
deviceid_package_close.columns = ['device_id', 'app_id', 'all_time']
deviceid_package_all = pd.concat([deviceid_package_start, deviceid_package_close])
# In[6]:
df_sorted = deviceid_package_all.sort_values(by='all_time')
# In[8]:
df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'})
df_device_start_app_list
# In[9]:
app_list = list(df_device_start_app_list.app_list.values)
# In[10]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
# In[11]:
word_dim = 200
model = Word2Vec(app_list, size=word_dim, window=20, min_count=2, workers=4)
model.save("word2vec.model")
# In[13]:
vocab = list(model.wv.vocab.keys())
w2c_arr = []
for v in vocab :
w2c_arr.append(list(model.wv[v]))
# In[14]:
df_w2c_start = pd.DataFrame()
df_w2c_start['app_id'] = vocab
df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1)
df_w2c_start.columns = ['app_id'] + ['w2c_all_app_' + str(i) for i in range(word_dim)]
# In[16]:
df_w2c_start.to_csv('w2c_all_emb.csv', index=None)
================================================
FILE: THLUO/3.w2c_model_all.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import gc
# In[2]:
print ('3.w2c_model_all.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
deviceid_package_start = deviceid_package_start_close[['device_id', 'app_id', 'start_time']]
deviceid_package_start.columns = ['device_id', 'app_id', 'all_time']
deviceid_package_close = deviceid_package_start_close[['device_id', 'app_id', 'close_time']]
deviceid_package_close.columns = ['device_id', 'app_id', 'all_time']
deviceid_package_all = pd.concat([deviceid_package_start, deviceid_package_close])
# In[5]:
df_sorted = deviceid_package_all.sort_values(by='all_time')
# In[7]:
df_results = df_sorted.groupby('device_id')['app_id'].apply(lambda x:' '.join(x)).reset_index().rename(columns = {'app_id' : 'app_list'})
df_results.to_csv('03.device_click_app_sorted_by_all.csv', index=None)
del df_results
# In[8]:
df_device_start_app_list = df_sorted.groupby('device_id').apply(lambda x : list(x.app_id)).reset_index().rename(columns = {0 : 'app_list'})
# In[9]:
app_list = list(df_device_start_app_list.app_list.values)
# In[11]:
model = Word2Vec(app_list, size=10, window=50, min_count=2, workers=4)
model.save("word2vec.model")
# In[12]:
vocab = list(model.wv.vocab.keys())
w2c_arr = []
for v in vocab :
w2c_arr.append(list(model.wv[v]))
# In[13]:
df_w2c_start = pd.DataFrame()
df_w2c_start['app_id'] = vocab
df_w2c_start = pd.concat([df_w2c_start, pd.DataFrame(w2c_arr)], axis=1)
df_w2c_start.columns = ['app_id'] + ['w2c_all_app_' + str(i) for i in range(10)]
# In[14]:
w2c_nums = 10
agg = {}
for l in ['w2c_all_app_' + str(i) for i in range(w2c_nums)] :
agg[l] = ['mean', 'std', 'max', 'min']
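# agg maps each of the 10 embedding dims to [mean, std, max, min], so each
# groupby-agg below yields 40 device-level embedding statistics.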
# In[15]:
deviceid_package_start_close = deviceid_package_start_close.merge(df_w2c_start, on='app_id', how='left')
# In[16]:
df_agg = deviceid_package_start_close.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
df_agg.to_csv('device_all_app_w2c.csv', index=None)
# In[18]:
df_results = deviceid_package_start_close.groupby(['device_id', 'app_id'])['start_time'].mean().reset_index()
df_results = df_results.merge(df_w2c_start, on='app_id', how='left')
# In[22]:
df_agg = df_results.groupby('device_id').agg(agg)
df_agg.columns = pd.Index(['device_app_unique' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
# In[20]:
df_agg.to_csv('device_app_unique_all_app_w2c.csv', index=None)
================================================
FILE: THLUO/4.device_age_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('4.device_age_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
#Encode categories as integers
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# Convert a millisecond timestamp into a formatted datetime string
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
#Parse concrete time fields from the raw timestamps
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[10]:
#Note for later: some dates are anomalous; the commented lines below would drop records whose app open and close fall in different years
#df_temp = deviceid_package_start_close[deviceid_package_start_close.start_year != 2017]
#df_temp['year_gap'] = df_temp['end_year'] - df_temp['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# In[7]:
#Feature engineering
def open_app_timegap_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
df_mean_temp.fillna(0, inplace=True)
return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
#gaps between a device's consecutive app start times (and, below, close times)
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
df_start_gap_agg = df_start_gap_agg.reset_index()
#del df_
gc.collect()
#gaps between consecutive close times
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
df_end_gap_agg = df_end_gap_agg.reset_index()
#del df_
gc.collect()
df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
return df_agg
def open_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def close_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def app_type_mean_time_gap_one_hot () :
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
df_temp.fillna(-1, inplace=True)
return df_temp
def device_active_hour() :
aggregations = {
'start_hour' : ['std','mean','max','min'],
'end_hour' : ['std','mean','max','min']
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
def device_brand_encoding() :
df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
df_device_brand = df_device_brand.reset_index()
df_device_type = df_temp.groupby('device_type').agg(aggregations)
df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
df_device_type = df_device_type.reset_index()
df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
aggregations = {
'device_brand_age_STD' : ['mean'],
'device_brand_age_MEAN' : ['mean'],
'device_brand_sex_MEAN' : ['mean'],
#'device_type_age_STD' : ['mean'],
#'device_type_age_MEAN' : ['mean'],
#'device_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
#Stats describing how each device runs its apps
def device_active_time_time_stat() :
#statistics of app active time per device
deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
#how many app launches the device made
#how many distinct apps the device opened
aggregations = {
'app_id' : ['count', 'nunique'],
'active_time' : ['mean', 'std', 'max', 'min'],
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
aggregations = {
'active_time' : ['mean', 'std', 'max', 'min', 'count'],
}
df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
df_da_agg = df_da_agg.reset_index()
#device-level stats over the per-app active-time aggregates
aggregations = {
'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
}
df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
df_temp = df_temp.reset_index()
df_agg = df_agg.merge(df_temp, on='device_id', how='left')
return df_agg
def app_type_encoding() :
df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
df_agg_app_child_type = df_agg_app_child_type.reset_index()
df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
aggregations = {
'app_parent_type_age_STD' : ['mean'],
'app_parent_type_age_MEAN' : ['mean'],
'app_parent_type_sex_MEAN' : ['mean'],
'app_child_type_age_STD' : ['mean'],
'app_child_type_age_MEAN' : ['mean'],
'app_child_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
#Counts of each app_parent_type per device
def app_type_onehot_in_device(df) :
df_copy = df.fillna(-1)
df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
df_temp.fillna(0, inplace=True)
return df_temp
# In[9]:
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
df_weight['sum']+=df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
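# Equivalent vectorised form (added note): the loop above just computes row
# sums of the tf-idf matrix, i.e.
# deviceid_packages['tfidf_sum'] = np.asarray(tfidf.sum(axis=1)).ravel()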
# In[10]:
lda = LatentDirichletAllocation(n_components=5,
learning_offset=50.,
random_state=666)
docres = lda.fit_transform(cntTf)
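# docres has shape (n_devices, 5): a 5-topic mixture per device, concatenated
# below as dense features.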
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
#Expand every (device_id, app_id) pair from the installed-app lists
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
device_id = row[0]
app_list = row[1]
for app in app_list :
device_id_arr.append(device_id)
app_arr.append(app)
#build the pair frame
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# In[15]:
#Extract features
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[16]:
#Word2vec (w2c) feature files
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
# In[23]:
train = df_train_w2v[df_train_w2v['age'].notnull()]
test = df_train_w2v[df_train_w2v['age'].isnull()]
# In[24]:
X = train.drop(['sex', 'age', 'device_id'],axis=1)
Y = train['age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[25]:
from sklearn.model_selection import KFold, StratifiedKFold
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
oof_preds = np.zeros([train.shape[0], 11])
cate_feat = ['device_type','device_brand']
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'multi_logloss'},
'num_class' : 11,
'objective' : 'multiclass',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=[lgb_train, lgb_eval], verbose_eval=50)
oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
oof_train = pd.DataFrame(oof_preds)
oof_train.columns = ['age_prob_oof_' + str(i) for i in range(11)]
train = pd.concat([train, oof_train], axis=1)
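# Note: this positional concat relies on the labeled rows occupying index 0..n-1;
# that appears to hold here because the train rows come first after the merges,
# but a train.reset_index(drop=True) beforehand would make the alignment explicit.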
# In[27]:
# Retrain on the full training set to predict test
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=600, valid_sets=lgb_train, verbose_eval=50)
test = test.reset_index(drop=True)
test_preds = gbm.predict(test[X.columns.values])
# In[28]:
oof_test = pd.DataFrame(test_preds)
oof_test.columns = ['age_prob_oof_' + str(i) for i in range(11)]
test = pd.concat([test, oof_test], axis=1)
# In[30]:
df_age_prob_oof = pd.concat([train[['device_id'] + ['age_prob_oof_' + str(i) for i in range(11)] ],
test[['device_id'] + ['age_prob_oof_' + str(i) for i in range(11)] ]])
df_age_prob_oof.to_csv('device_age_prob_oof.csv', index=None)
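# device_age_prob_oof.csv is consumed as a stacking feature by scripts 6 and 7.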
================================================
FILE: THLUO/5.device_sex_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('5.device_sex_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# Encode device brand/type as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# Encode app parent/child categories as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# Convert a millisecond epoch timestamp into a formatted datetime string
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
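# A vectorized alternative would be pd.to_datetime(col, unit='ms'); note it yields
# UTC timestamps, while time.localtime() uses the machine's local timezone, which
# would shift the derived hour features.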
# Parse out concrete date/time fields
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[10]:
# Note for later: some dates are anomalous; rows whose app open and close fall in different years could be filtered out
#df_temp = deviceid_package_start_close[deviceid_package_start_close.start_year != 2017]
#df_temp['year_gap'] = df_temp['end_year'] - df_temp['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# Feature engineering
def open_app_timegap_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
df_mean_temp.fillna(0, inplace=True)
return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
# Time gaps between a device's consecutive app opens and closes
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
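# Sorted descending by start_date, so shift(-1) fetches the chronologically
# previous launch within each device and the gap is a positive number of seconds.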
agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
df_start_gap_agg = df_start_gap_agg.reset_index()
#del df_
gc.collect()
# Gaps between consecutive closes
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
df_end_gap_agg = df_end_gap_agg.reset_index()
#del df_
gc.collect()
df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
return df_agg
def open_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def close_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def app_type_mean_time_gap_one_hot () :
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
df_temp.fillna(-1, inplace=True)
return df_temp
def device_active_hour() :
aggregations = {
'start_hour' : ['std','mean','max','min'],
'end_hour' : ['std','mean','max','min']
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
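# Caution: device_brand_encoding() below target-encodes brand/type with global
# label means, so each labeled row's own target contributes to its feature;
# an out-of-fold encoding would be the leak-free variant.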
def device_brand_encoding() :
df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
df_device_brand = df_device_brand.reset_index()
df_device_type = df_temp.groupby('device_type').agg(aggregations)
df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
df_device_type = df_device_type.reset_index()
df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
aggregations = {
'device_brand_age_STD' : ['mean'],
'device_brand_age_MEAN' : ['mean'],
'device_brand_sex_MEAN' : ['mean'],
#'device_type_age_STD' : ['mean'],
#'device_type_age_MEAN' : ['mean'],
#'device_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# Statistics on how each device runs its apps
def device_active_time_time_stat() :
# Timing stats of the device's app sessions
deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
# how many times the device opened apps
# how many distinct apps the device opened
aggregations = {
'app_id' : ['count', 'nunique'],
'active_time' : ['mean', 'std', 'max', 'min'],
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
aggregations = {
'active_time' : ['mean', 'std', 'max', 'min', 'count'],
}
df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
df_da_agg = df_da_agg.reset_index()
# device-level stats over the per-app active times
aggregations = {
'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
}
df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
df_temp = df_temp.reset_index()
df_agg = df_agg.merge(df_temp, on='device_id', how='left')
return df_agg
def app_type_encoding() :
df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
df_agg_app_child_type = df_agg_app_child_type.reset_index()
df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
aggregations = {
'app_parent_type_age_STD' : ['mean'],
'app_parent_type_age_MEAN' : ['mean'],
'app_parent_type_sex_MEAN' : ['mean'],
'app_child_type_age_STD' : ['mean'],
'app_child_type_age_MEAN' : ['mean'],
'app_child_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# Count of each app_parent_type per device
def app_type_onehot_in_device(df) :
df_copy = df.fillna(-1)
df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
df_temp.fillna(0, inplace=True)
return df_temp
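# TF-IDF over each device's installed-app list; tfidf_sum is the row total of the
# TF-IDF weights, a rough proxy for how distinctive the device's app set is.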
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
df_weight['sum']+=df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
# In[10]:
lda = LatentDirichletAllocation(n_components=5,
learning_offset=50.,
random_state=666)
docres = lda.fit_transform(cntTf)
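# docres is the per-device distribution over 5 LDA topics, a dense low-dimensional
# summary of the installed-app bag-of-words.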
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
# Expand every (device_id, app_id) pair
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
device_id = row[0]
app_list = row[1]
for app in app_list :
device_id_arr.append(device_id)
app_arr.append(app)
# assemble the pairs
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# In[14]:
df_device_app_pair
# In[15]:
# Extract features
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[79]:
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
# In[81]:
train = df_train_w2v[df_train_w2v['sex'].notnull()]
test = df_train_w2v[df_train_w2v['sex'].isnull()]
# In[82]:
train['sex'] = train.sex.apply(lambda x : x if x == 1 else 0)
# In[83]:
X = train.drop(['sex', 'age', 'device_id'],axis=1)
Y = train['sex']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[84]:
from sklearn.model_selection import KFold, StratifiedKFold
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
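# Binary OOF: oof_preds collects the out-of-fold P(sex=1) per training device;
# sub_preds is allocated but unused, since the test prediction comes from a model
# refit on the full training set below.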
cate_feat = ['device_type','device_brand']
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'binary_logloss'},
#'num_class' : 22,
'objective' : 'binary',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
gbm = lgb.train(params, lgb_train, num_boost_round=450, valid_sets=lgb_eval, verbose_eval=50)
oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
train['sex_prob_oof'] = oof_preds
# In[85]:
# Retrain on the full training set to predict test
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=450, valid_sets=lgb_train, verbose_eval=50)
test['sex_prob_oof'] = gbm.predict(test[X.columns.values])
# In[88]:
df_sex_prob_oof = pd.concat([train[['device_id', 'sex_prob_oof']], test[['device_id', 'sex_prob_oof']]])
df_sex_prob_oof.to_csv('device_sex_prob_oof.csv', index=None)
================================================
FILE: THLUO/6.start_close_age_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('6.start_close_age_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# Encode device brand/type as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# Encode app parent/child categories as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[4]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# In[6]:
import time
# Convert a millisecond epoch timestamp into a formatted datetime string
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
# Parse out concrete date/time fields
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[8]:
agg_func = {
'start_hour' : ['min', 'max', 'mean', 'std', 'count'],
'end_hour' : ['min', 'max', 'mean', 'std'],
'time_gap' : ['min', 'max', 'mean', 'std']
}
df_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(agg_func)
df_agg.columns = pd.Index(['device_app_grouped' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
df_agg = df_agg.merge(package_label, on='app_id', how='left')
# In[10]:
# How many times the device opens each app in each hour of the day
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_id', 'start_hour'])['start_time'].count().reset_index()
df_temp = pd.pivot_table(df_temp, index=['device_id', 'app_id'], columns='start_hour', values='start_time').reset_index()
df_temp.columns = ['device_id', 'app_id'] + ['device_app_start_counts'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
# In[11]:
df_agg = df_agg.merge(df_temp, on=['device_id', 'app_id'], how='left')
# In[13]:
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
df_weight['sum']+=df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
lda = LatentDirichletAllocation(n_components=5,
learning_offset=50.,
random_state=666)
docres = lda.fit_transform(cntTf)
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
del deviceid_packages['apps']
deviceid_packages.columns = ['device_id', 'app_lenghth', 'tfidf_sum', 'LDA_0', 'LDA_1', 'LDA_2', 'LDA_3', 'LDA_4']
# In[14]:
df_temp = df_agg.merge(deviceid_packages, on='device_id', how='left')
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv')
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv')
df_temp = df_temp.merge(df_w2c_start, on='device_id', how='left')
df_temp = df_temp.merge(df_w2c_close, on='device_id', how='left')
df_temp = df_temp.merge(df_w2c_all, on='device_id', how='left')
df_temp = df_temp.merge(df_sex_prob_oof, on='device_id', how='left')
df_temp = df_temp.merge(df_age_prob_oof, on='device_id', how='left')
# In[16]:
agg_func = {
'device_id' : ['count'],
'app_lenghth' : ['min', 'mean', 'std', 'max'],
'tfidf_sum' : ['min', 'mean', 'std', 'max'],
'LDA_1' : ['min', 'mean', 'std', 'max'],
'LDA_2' : ['min', 'mean', 'std', 'max'],
'LDA_3' : ['min', 'mean', 'std', 'max'],
'LDA_4' : ['min', 'mean', 'std', 'max'],
}
for j in [i for i in df_age_prob_oof.columns.values if i != 'device_id'] :
agg_func[j] = ['min', 'mean', 'std', 'max']
for j in [i for i in df_sex_prob_oof.columns.values if i != 'device_id'] :
agg_func[j] = ['min', 'mean', 'std', 'max']
for j in [i for i in df_w2c_all.columns.values if i != 'device_id'] :
agg_func[j] = ['mean']
for j in [i for i in df_w2c_start.columns.values if i != 'device_id'] :
agg_func[j] = ['mean']
for j in [i for i in df_w2c_close.columns.values if i != 'device_id'] :
agg_func[j] = ['mean']
# In[18]:
df_app_temp = df_temp.groupby('app_id').agg(agg_func)
df_app_temp.columns = pd.Index(['app_grouped' + e[0] + "_" + e[1].upper() for e in df_app_temp.columns.tolist()])
df_app_temp = df_app_temp.reset_index()
df_train = df_agg.merge(df_app_temp, on='app_id', how='left')
df_train = df_train.merge(deviceid_train, on='device_id', how='left')
# In[26]:
train = df_train[df_train['age'].notnull()]
test = df_train[df_train['age'].isnull()]
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
X = train.drop(['sex','age', 'app_id', 'device_id'],axis=1)
Y = train['age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
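# Training rows here are (device_id, app_id) pairs; the per-pair age probabilities
# are averaged back to one row per device at the end of this script.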
# In[30]:
from sklearn.model_selection import KFold, StratifiedKFold
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
oof_preds = np.zeros([train.shape[0], 11])
cate_feat = ['device_type','device_brand']
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'multi_logloss'},
'num_class' : 11,
'objective' : 'multiclass',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
'nthread': 5,
}
gbm = lgb.train(params,
lgb_train,
num_boost_round=2100,
valid_sets=[lgb_train, lgb_eval],
verbose_eval=100)
oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
# In[32]:
oof_train = pd.DataFrame(oof_preds)
oof_train.columns = ['start_close_age_prob_oof_' + str(i) for i in range(11)]
train = pd.concat([train, oof_train], axis=1)
# In[38]:
# Retrain on the full training set to predict test
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=2100, valid_sets=lgb_train, verbose_eval=100)
test_preds = gbm.predict(test[X.columns.values])
oof_test = pd.DataFrame(test_preds)
oof_test.columns = ['start_close_age_prob_oof_' + str(i) for i in range(11)]
test = pd.concat([test, oof_test], axis=1)
# In[76]:
df_age_prob_oof = pd.concat([train[['device_id'] + ['start_close_age_prob_oof_' + str(i) for i in range(11)]],
test[['device_id'] + ['start_close_age_prob_oof_' + str(i) for i in range(11)]]])
# In[72]:
agg_func = {
'start_close_age_prob_oof_0' : ['mean'],
'start_close_age_prob_oof_1' : ['mean'],
'start_close_age_prob_oof_2' : ['mean'],
'start_close_age_prob_oof_3' : ['mean'],
'start_close_age_prob_oof_4' : ['mean'],
'start_close_age_prob_oof_5' : ['mean'],
'start_close_age_prob_oof_6' : ['mean'],
'start_close_age_prob_oof_7' : ['mean'],
'start_close_age_prob_oof_8' : ['mean'],
'start_close_age_prob_oof_9' : ['mean'],
'start_close_age_prob_oof_10' : ['mean'],
}
df_age_prob_oof = df_age_prob_oof.groupby('device_id').agg(agg_func)
df_age_prob_oof.columns = pd.Index(['device_app_grouped' + e[0] + "_" + e[1].upper() for e in df_age_prob_oof.columns.tolist()])
df_age_prob_oof = df_age_prob_oof.reset_index()
# In[73]:
df_age_prob_oof.to_csv('start_close_age_prob_oof.csv', index=None)
================================================
FILE: THLUO/7.start_close_sex_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('7.start_close_sex_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# Encode device brand/type as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# Encode app parent/child categories as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# In[4]:
import time
# Convert a millisecond epoch timestamp into a formatted datetime string
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
# Parse out concrete date/time fields
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[9]:
agg_func = {
'start_hour' : ['min', 'max', 'mean', 'std', 'count'],
'end_hour' : ['min', 'max', 'mean', 'std'],
'time_gap' : ['min', 'max', 'mean', 'std']
}
df_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(agg_func)
df_agg.columns = pd.Index(['device_app_grouped' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
df_agg = df_agg.merge(package_label, on='app_id', how='left')
# In[11]:
# How many times the device opens each app in each hour of the day
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_id', 'start_hour'])['start_time'].count().reset_index()
df_temp = pd.pivot_table(df_temp, index=['device_id', 'app_id'], columns='start_hour', values='start_time').reset_index()
df_temp.columns = ['device_id', 'app_id'] + ['device_app_start_counts'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
# In[13]:
df_agg = df_agg.merge(df_temp, on=['device_id', 'app_id'], how='left')
# In[15]:
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
df_weight['sum']+=df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
lda = LatentDirichletAllocation(n_components=5,
learning_offset=50.,
random_state=666)
docres = lda.fit_transform(cntTf)
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
del deviceid_packages['apps']
deviceid_packages.columns = ['device_id', 'app_lenghth', 'tfidf_sum', 'LDA_0', 'LDA_1', 'LDA_2', 'LDA_3', 'LDA_4']
# In[207]:
df_temp = df_agg.merge(deviceid_packages, on='device_id', how='left')
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_sex_prob_oof = pd.read_csv('device_sex_prob_oof.csv')
df_age_prob_oof = pd.read_csv('device_age_prob_oof.csv')
df_temp = df_temp.merge(df_w2c_start, on='device_id', how='left')
df_temp = df_temp.merge(df_w2c_close, on='device_id', how='left')
df_temp = df_temp.merge(df_w2c_all, on='device_id', how='left')
df_temp = df_temp.merge(df_sex_prob_oof, on='device_id', how='left')
df_temp = df_temp.merge(df_age_prob_oof, on='device_id', how='left')
# In[224]:
agg_func = {
'device_id' : ['count'],
'app_lenghth' : ['min', 'mean', 'std', 'max'],
'tfidf_sum' : ['min', 'mean', 'std', 'max'],
'LDA_1' : ['min', 'mean', 'std', 'max'],
'LDA_2' : ['min', 'mean', 'std', 'max'],
'LDA_3' : ['min', 'mean', 'std', 'max'],
'LDA_4' : ['min', 'mean', 'std', 'max'],
}
for j in [i for i in df_age_prob_oof.columns.values if i != 'device_id'] :
agg_func[j] = ['min', 'mean', 'std', 'max']
for j in [i for i in df_sex_prob_oof.columns.values if i != 'device_id'] :
agg_func[j] = ['min', 'mean', 'std', 'max']
for j in [i for i in df_w2c_all.columns.values if i != 'device_id'] :
agg_func[j] = ['mean']
for j in [i for i in df_w2c_start.columns.values if i != 'device_id'] :
agg_func[j] = ['mean']
for j in [i for i in df_w2c_close.columns.values if i != 'device_id'] :
agg_func[j] = ['mean']
# In[226]:
df_app_temp = df_temp.groupby('app_id').agg(agg_func)
df_app_temp.columns = pd.Index(['app_grouped' + e[0] + "_" + e[1].upper() for e in df_app_temp.columns.tolist()])
df_app_temp = df_app_temp.reset_index()
df_train = df_agg.merge(df_app_temp, on='app_id', how='left')
# In[228]:
df_train = df_train.merge(deviceid_train, on='device_id', how='left')
# In[235]:
train = df_train[df_train['sex'].notnull()]
test = df_train[df_train['sex'].isnull()]
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
X = train.drop(['sex','age', 'app_id', 'device_id'],axis=1)
Y = train['sex']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
# In[237]:
from sklearn.model_selection import KFold, StratifiedKFold
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
cate_feat = ['device_type','device_brand']
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'binary_logloss'},
'objective' : 'binary',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
gbm = lgb.train(params, lgb_train, num_boost_round=2100, valid_sets=[lgb_train, lgb_eval],
verbose_eval=100)
oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
train['sex_prob_oof'] = oof_preds
# In[239]:
# Retrain on the full training set to predict test
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=2100, valid_sets=lgb_train, verbose_eval=100)
test['sex_prob_oof'] = gbm.predict(test[X.columns.values])
# In[240]:
df_sex_prob_oof = pd.concat([train[['device_id', 'sex_prob_oof']], test[['device_id', 'sex_prob_oof']]])
df_sex_prob_oof.columns = ['device_id', 'start_close_sex_prob_oof']
agg_func = {
'start_close_sex_prob_oof' : ['min', 'max', 'mean', 'std']
}
df_sex_prob_oof = df_sex_prob_oof.groupby('device_id').agg(agg_func)
df_sex_prob_oof.columns = pd.Index(['device_app_grouped' + e[0] + "_" + e[1].upper() for e in df_sex_prob_oof.columns.tolist()])
df_sex_prob_oof = df_sex_prob_oof.reset_index()
# In[242]:
df_sex_prob_oof.to_csv('start_close_sex_prob_oof.csv', index=None)
================================================
FILE: THLUO/9.sex_age_bin_prob_oof.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import gc
# In[2]:
print ('9.sex_age_bin_prob_oof.py')
path='input/'
data=pd.DataFrame()
#sex_age=pd.read_excel('./data/性别年龄对照表.xlsx')
# In[3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device_id','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device_id'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device_id','sex','age'])
deviceid_brand = pd.read_csv(path+'deviceid_brand.tsv',sep='\t', names=['device_id','device_brand', 'device_type'])
deviceid_package_start_close = pd.read_csv(path+'deviceid_package_start_close.tsv',sep='\t', names=['device_id','app_id','start_time','close_time'])
package_label = pd.read_csv(path+'package_label.tsv',sep='\t',names=['app_id','app_parent_type', 'app_child_type'])
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : str(x).split(' ')[0])
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 1].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 2].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_2' if x in one_time_brand else x)
df_temp = deviceid_brand.groupby('device_brand')['device_id'].count().reset_index().rename(columns={'device_id':'brand_counts'})
one_time_brand = df_temp[df_temp.brand_counts == 3].device_brand.values
deviceid_brand['device_brand'] = deviceid_brand.device_brand.apply(lambda x : 'other_3' if x in one_time_brand else x)
# Encode device brand/type as integer ids
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_brand.values))
deviceid_brand['device_brand'] = lbl.transform(list(deviceid_brand.device_brand.values))
lbl = LabelEncoder()
lbl.fit(list(deviceid_brand.device_type.values))
deviceid_brand['device_type'] = lbl.transform(list(deviceid_brand.device_type.values))
# Encode app parent/child categories as integer ids
lbl = LabelEncoder()
lbl.fit(list(package_label.app_parent_type.values))
package_label['app_parent_type'] = lbl.transform(list(package_label.app_parent_type.values))
lbl = LabelEncoder()
lbl.fit(list(package_label.app_child_type.values))
package_label['app_child_type'] = lbl.transform(list(package_label.app_child_type.values))
# In[4]:
import time
# Convert a millisecond epoch timestamp into a formatted datetime string
def timeStamp(timeNum):
timeStamp = float(timeNum/1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
# Parse out concrete date/time fields
deviceid_package_start_close['start_date'] = pd.to_datetime(deviceid_package_start_close.start_time.apply(timeStamp))
deviceid_package_start_close['end_date'] = pd.to_datetime(deviceid_package_start_close.close_time.apply(timeStamp))
deviceid_package_start_close['start_hour'] = deviceid_package_start_close.start_date.dt.hour
deviceid_package_start_close['end_hour'] = deviceid_package_start_close.end_date.dt.hour
deviceid_package_start_close['time_gap'] = (deviceid_package_start_close['end_date'] - deviceid_package_start_close['start_date']).astype('timedelta64[s]')
deviceid_package_start_close = deviceid_package_start_close.merge(package_label, on='app_id', how='left')
deviceid_package_start_close.app_parent_type.fillna(-1, inplace=True)
deviceid_package_start_close.app_child_type.fillna(-1, inplace=True)
deviceid_package_start_close['start_year'] = deviceid_package_start_close.start_date.dt.year
deviceid_package_start_close['end_year'] = deviceid_package_start_close.end_date.dt.year
deviceid_package_start_close['year_gap'] = deviceid_package_start_close['end_year'] - deviceid_package_start_close['start_year']
# In[5]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])
# In[6]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))
# Feature engineering
def open_app_timegap_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['time_gap'].mean().reset_index().rename(columns = {'time_gap': 'mean_time_gap'})
df_mean_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='mean_time_gap').reset_index()
df_mean_temp.columns = ['device_id'] + ['open_app_timegap_in_'+str(i) + '_mean_hour' for i in range(0,24)]
df_mean_temp.fillna(0, inplace=True)
return df_mean_temp
# In[8]:
def device_start_end_app_timegap() :
# Time gaps between a device's consecutive app opens and closes
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'start_date'], ascending=False)
df_['prev_start_date'] = df_.groupby('device_id')['start_date'].shift(-1)
df_['start_date_gap'] = (df_['start_date'] - df_['prev_start_date']).astype('timedelta64[s]')
agg_dic = {'start_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_start_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_start_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_start_gap_agg.columns.tolist()])
df_start_gap_agg = df_start_gap_agg.reset_index()
#del df_
gc.collect()
# Gaps between consecutive closes
df_ = deviceid_package_start_close.sort_values(by=['device_id', 'end_date'], ascending=False)
df_['prev_end_date'] = df_.groupby('device_id')['end_date'].shift(-1)
df_['end_date_gap'] = (df_['end_date'] - df_['prev_end_date']).astype('timedelta64[s]')
agg_dic = {'end_date_gap' : ['min', 'max', 'mean', 'median', 'std']}
df_end_gap_agg = df_.groupby('device_id').agg(agg_dic)
df_end_gap_agg.columns = pd.Index(['device_' + e[0] + "_" + e[1].upper() for e in df_end_gap_agg.columns.tolist()])
df_end_gap_agg = df_end_gap_agg.reset_index()
#del df_
gc.collect()
df_agg = df_start_gap_agg.merge(df_end_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_start_gap_agg, on='device_id', how='left')
#df_agg = df_agg.merge(df_app_end_gap_agg, on='device_id', how='left')
return df_agg
def open_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'start_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='start_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['open_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def close_app_counts_in_hour() :
df_temp = deviceid_package_start_close.groupby(['device_id', 'end_hour'])['app_id'].count().reset_index().rename(columns = {'app_id': 'app_counts'})
df_temp = pd.pivot_table(df_temp, index='device_id', columns='end_hour', values='app_counts').reset_index()
df_temp.columns = ['device_id'] + ['close_app_counts_in'+str(i) + '_hour' for i in range(0,24)]
df_temp.fillna(0, inplace=True)
return df_temp
def app_type_mean_time_gap_one_hot () :
df_temp = deviceid_package_start_close.groupby(['device_id', 'app_parent_type'])['time_gap'].mean().reset_index()
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='time_gap').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type_mean_time_gap'+str(i) for i in range(-1,45)]
df_temp.fillna(-1, inplace=True)
return df_temp
def device_active_hour() :
aggregations = {
'start_hour' : ['std','mean','max','min'],
'end_hour' : ['std','mean','max','min']
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
def device_brand_encoding() :
df_temp = deviceid_brand.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_device_brand = df_temp.groupby('device_brand').agg(aggregations)
df_device_brand.columns = pd.Index(['device_brand_' + e[0] + "_" + e[1].upper() for e in df_device_brand.columns.tolist()])
df_device_brand = df_device_brand.reset_index()
df_device_type = df_temp.groupby('device_type').agg(aggregations)
df_device_type.columns = pd.Index(['device_type_' + e[0] + "_" + e[1].upper() for e in df_device_type.columns.tolist()])
df_device_type = df_device_type.reset_index()
df_temp = df_temp.merge(df_device_brand, on='device_brand', how='left')
df_temp = df_temp.merge(df_device_type, on='device_type', how='left')
aggregations = {
'device_brand_age_STD' : ['mean'],
'device_brand_age_MEAN' : ['mean'],
'device_brand_sex_MEAN' : ['mean'],
#'device_type_age_STD' : ['mean'],
#'device_type_age_MEAN' : ['mean'],
#'device_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# Statistics on how each device runs its apps
def device_active_time_time_stat() :
# Timing stats of the device's app sessions
deviceid_package_start_close['active_time'] = deviceid_package_start_close['close_time'] - deviceid_package_start_close['start_time']
# how many times the device opened apps
# how many distinct apps the device opened
aggregations = {
'app_id' : ['count', 'nunique'],
'active_time' : ['mean', 'std', 'max', 'min'],
}
df_agg = deviceid_package_start_close.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
aggregations = {
'active_time' : ['mean', 'std', 'max', 'min', 'count'],
}
df_da_agg = deviceid_package_start_close.groupby(['device_id', 'app_id']).agg(aggregations)
df_da_agg.columns = pd.Index(['device_app_grouped_' + e[0] + "_" + e[1].upper() for e in df_da_agg.columns.tolist()])
df_da_agg = df_da_agg.reset_index()
# device-level stats over the per-app active times
aggregations = {
'device_app_grouped_active_time_MEAN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_STD' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MAX' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_MIN' : ['mean', 'std', 'max', 'min'],
'device_app_grouped_active_time_COUNT' : ['mean', 'std', 'max', 'min'],
}
df_temp = df_da_agg.groupby(['device_id']).agg(aggregations)
df_temp.columns = pd.Index([e[0] + "_" + e[1].upper() for e in df_temp.columns.tolist()])
df_temp = df_temp.reset_index()
df_agg = df_agg.merge(df_temp, on='device_id', how='left')
return df_agg
def app_type_encoding() :
df_temp = df_device_app_pair.merge(deviceid_train[['device_id', 'age', 'sex']], on='device_id', how='left')
aggregations = {
'age' : ['std','mean'],
'sex' : ['mean'],
}
df_agg_app_parent_type = df_temp.groupby('app_parent_type').agg(aggregations)
df_agg_app_parent_type.columns = pd.Index(['app_parent_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_parent_type.columns.tolist()])
df_agg_app_parent_type = df_agg_app_parent_type.reset_index()
df_agg_app_child_type = df_temp.groupby('app_child_type').agg(aggregations)
df_agg_app_child_type.columns = pd.Index(['app_child_type_' + e[0] + "_" + e[1].upper() for e in df_agg_app_child_type.columns.tolist()])
df_agg_app_child_type = df_agg_app_child_type.reset_index()
df_temp = df_temp.merge(df_agg_app_parent_type, on='app_parent_type', how='left')
df_temp = df_temp.merge(df_agg_app_child_type, on='app_child_type', how='left')
aggregations = {
'app_parent_type_age_STD' : ['mean'],
'app_parent_type_age_MEAN' : ['mean'],
'app_parent_type_sex_MEAN' : ['mean'],
'app_child_type_age_STD' : ['mean'],
'app_child_type_age_MEAN' : ['mean'],
'app_child_type_sex_MEAN' : ['mean']
}
df_agg = df_temp.groupby('device_id').agg(aggregations)
df_agg.columns = pd.Index(['device_grouped_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
df_agg = df_agg.reset_index()
return df_agg
# Count of each app_parent_type per device
def app_type_onehot_in_device(df) :
df_copy = df.fillna(-1)
df_temp = df_copy.groupby(['device_id', 'app_parent_type'])['app_id'].size().reset_index()
df_temp.rename(columns = {'app_id' : 'app_parent_type_counts'}, inplace=True)
df_temp = pd.pivot_table(df_temp, index='device_id', columns='app_parent_type', values='app_parent_type_counts').reset_index()
df_temp.columns = ['device_id'] + ['app_parent_type'+str(i) for i in range(-1,45)]
df_temp.fillna(0, inplace=True)
return df_temp
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
df_weight['sum']+=df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']
# In[10]:
lda = LatentDirichletAllocation(n_components=5,
learning_offset=50.,
random_state=666)
docres = lda.fit_transform(cntTf)
# In[11]:
deviceid_packages = pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)
# In[12]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device_id',how='left')
# In[13]:
# Expand every (device_id, app_id) pair
device_id_arr = []
app_arr = []
df_device_app_pair = pd.DataFrame()
for row in deviceid_packages.values :
device_id = row[0]
app_list = row[1]
for app in app_list :
device_id_arr.append(device_id)
app_arr.append(app)
# assemble the pairs
df_device_app_pair['device_id'] = device_id_arr
df_device_app_pair['app_id'] = app_arr
df_device_app_pair = df_device_app_pair.merge(package_label, how='left', on='app_id')
# In[15]:
# Extract features
df_train = deviceid_train.merge(device_active_time_time_stat(), on='device_id', how='left')
df_train = df_train.merge(deviceid_brand, on='device_id', how='left')
df_train = df_train.merge(app_type_onehot_in_device(df_device_app_pair), on='device_id', how='left')
df_train = df_train.merge(app_type_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_active_hour(), on='device_id', how='left')
df_train = df_train.merge(app_type_mean_time_gap_one_hot(), on='device_id', how='left')
df_train = df_train.merge(open_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(close_app_counts_in_hour(), on='device_id', how='left')
df_train = df_train.merge(device_brand_encoding(), on='device_id', how='left')
df_train = df_train.merge(device_start_end_app_timegap(), on='device_id', how='left')
df_train = df_train.merge(open_app_timegap_in_hour(), on='device_id', how='left')
# In[16]:
df_w2c_start = pd.read_csv('device_start_app_w2c.csv')
df_w2c_close = pd.read_csv('device_close_app_w2c.csv')
df_w2c_all = pd.read_csv('device_all_app_w2c.csv')
df_device_quchong_start_app_w2c = pd.read_csv('device_quchong_start_app_w2c.csv')
df_device_app_unique_start_app_w2c = pd.read_csv('device_app_unique_start_app_w2c.csv')
df_device_app_unique_close_app_w2c = pd.read_csv('device_app_unique_close_app_w2c.csv')
df_device_app_unique_all_app_w2c = pd.read_csv('device_app_unique_all_app_w2c.csv')
# In[17]:
df_train_w2v = df_train.merge(df_w2c_start, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_close, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_w2c_all, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_quchong_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_start_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_close_app_w2c, on='device_id', how='left')
df_train_w2v = df_train_w2v.merge(df_device_app_unique_all_app_w2c, on='device_id', how='left')
# In[19]:
df_train_w2v['sex'] = df_train_w2v['sex'].apply(lambda x:str(x))
df_train_w2v['age'] = df_train_w2v['age'].apply(lambda x:str(x))
def tool(x):
    # normalize labels like '1.0' -> '1' while leaving 'nan' markers intact
    if x == 'nan':
        return x
    else:
        return str(int(float(x)))
df_train_w2v['sex']=df_train_w2v['sex'].apply(tool)
df_train_w2v['age']=df_train_w2v['age'].apply(tool)
df_train_w2v['sex_age']=df_train_w2v['sex']+'-'+df_train_w2v['age']
df_train_w2v = df_train_w2v.replace({'nan':np.NaN,'nan-nan':np.NaN})
# In[42]:
train = df_train_w2v[df_train_w2v['sex_age'].notnull()]
test = df_train_w2v[df_train_w2v['sex_age'].isnull()]
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# In[43]:
Y = train['sex_age']
Y_CAT = pd.Categorical(Y)
Y = pd.Series(Y_CAT.codes)
train['label'] = Y
# In[45]:
from sklearn.model_selection import KFold, StratifiedKFold
# pre-tuned num_boost_round for each of the 22 sex-age classes
lgb_round = {4: 267,
6: 199,
17: 151,
5: 166,
15: 188,
16: 147,
8: 195,
7: 250,
21: 107,
2: 254,
3: 282,
19: 139,
9: 169,
13: 153,
1: 167,
18: 178,
10: 153,
20: 177,
14: 208,
12: 194,
11: 211,
0: 132}
label_set = train.label.unique()
for sex_age in label_set :
print (sex_age)
X = train.drop(['sex', 'age', 'sex_age', 'label', 'device_id'],axis=1)
Y = train.label.apply(lambda x : 1 if x == sex_age else 0)
print (Y.value_counts())
seed = 2018
num_folds = 5
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=seed)
sub_list = []
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
params = {
'boosting_type': 'gbdt',
'learning_rate' : 0.02,
#'max_depth':5,
'num_leaves' : 2 ** 5,
'metric': {'binary_logloss'},
#'num_class' : 22,
'objective' : 'binary',
'random_state' : 6666,
'bagging_freq' : 5,
'feature_fraction' : 0.7,
'bagging_fraction' : 0.7,
'min_split_gain' : 0.0970905919552776,
'min_child_weight' : 9.42012323936088,
}
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, Y)):
train_x, train_y = X.iloc[train_idx], Y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], Y.iloc[valid_idx]
lgb_train=lgb.Dataset(train_x,label=train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
gbm = lgb.train(params, lgb_train, num_boost_round=lgb_round[sex_age], valid_sets=[lgb_train, lgb_eval], verbose_eval=50)
oof_preds[valid_idx] = gbm.predict(valid_x[X.columns.values])
train['sex_age_bin_prob_oof_' + str(sex_age)] = oof_preds
    # retrain on the full train set to predict test
lgb_train = lgb.Dataset(X,label=Y)
gbm = lgb.train(params, lgb_train, num_boost_round=lgb_round[sex_age], valid_sets=lgb_train, verbose_eval=50)
test['sex_age_bin_prob_oof_' + str(sex_age)] = gbm.predict(test[X.columns.values])
# In[49]:
columns = ['device_id'] + ['sex_age_bin_prob_oof_' + str(i) for i in range(22)]
columns
# In[53]:
pd.concat([train[columns], test[columns]]).to_csv('sex_age_bin_prob_oof.csv', index=None)
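# Note on the scheme above: each of the 22 sex-age classes gets its own
# one-vs-rest binary LightGBM, trained with 5-fold out-of-fold prediction so
# that the resulting sex_age_bin_prob_oof_* columns can serve as leak-free
# stacking features downstream.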
================================================
FILE: THLUO/TextModel.py
================================================
import os
import re
import sys
import gc
import random
import warnings
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import tensorflow as tf
import keras
from keras import backend as K  # K is used by the Lambda layers below
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical, np_utils
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')
from util import *
def capsule_lstm(sent_length, embeddings_weight,class_num):
print("get_text_capsule")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNLSTM(200, return_sequences=True))(embed)
capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, share_weights=True)(x)
capsule = Flatten()(capsule)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(capsule))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_capsule(sent_length, embeddings_weight,class_num):
print("get_text_capsule")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(embed)
capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, share_weights=True)(x)
capsule = Flatten()(capsule)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(capsule))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_cnn1(sent_length, embeddings_weight,class_num):
print("get_text_cnn1")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = embedding(content)
embed = SpatialDropout1D(0.2)(embed)
conv2 = Activation('relu')(BatchNormalization()(Conv1D(128, 2, padding='same')(embed)))
conv2 = Activation('relu')(BatchNormalization()(Conv1D(64, 2, padding='same')(conv2)))
conv2 = MaxPool1D(pool_size=50)(conv2)
conv3 = Activation('relu')(BatchNormalization()(Conv1D(128, 3, padding='same')(embed)))
conv3 = Activation('relu')(BatchNormalization()(Conv1D(64, 3, padding='same')(conv3)))
conv3 = MaxPool1D(pool_size=50)(conv3)
conv4 = Activation('relu')(BatchNormalization()(Conv1D(128, 4, padding='same')(embed)))
conv4 = Activation('relu')(BatchNormalization()(Conv1D(64, 4, padding='same')(conv4)))
conv4 = MaxPool1D(pool_size=50)(conv4)
conv5 = Activation('relu')(BatchNormalization()(Conv1D(128, 5, padding='same')(embed)))
conv5 = Activation('relu')(BatchNormalization()(Conv1D(64, 5, padding='same')(conv5)))
conv5 = MaxPool1D(pool_size=50)(conv5)
cnn = concatenate([conv2, conv3, conv4, conv5], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(drop))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_cnn2(sent_length, embeddings_weight,class_num):
print("get_text_cnn2")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = embedding(content)
filter_sizes = [2, 3, 4,5]
num_filters = 128
embed_size = embeddings_weight.shape[1]
x = SpatialDropout1D(0.2)(embed)
x = Reshape((sent_length, embed_size, 1))(x)
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
activation='relu')(x)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
activation='relu')(x)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
activation='relu')(x)
conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size), kernel_initializer='normal',
activation='relu')(x)
maxpool_0 = MaxPool2D(pool_size=(sent_length - filter_sizes[0] + 1, 1))(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sent_length - filter_sizes[1] + 1, 1))(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sent_length - filter_sizes[2] + 1, 1))(conv_2)
maxpool_3 = MaxPool2D(pool_size=(sent_length - filter_sizes[3] + 1, 1))(conv_3)
z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])
z = Flatten()(z)
z = Dropout(0.1)(z)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(z))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_cnn3(sent_length, embeddings_weight,class_num):
print("get_text_cnn3")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)(content)
embedding = SpatialDropout1D(0.2)(embedding)
cnn1 = Conv1D(128, 2, padding='same', strides=1, activation='relu')(embedding)
cnn2 = Conv1D(128, 3, padding='same', strides=1, activation='relu')(embedding)
cnn3 = Conv1D(128, 4, padding='same', strides=1, activation='relu')(embedding)
cnn4 = Conv1D(128, 5, padding='same', strides=1, activation='relu')(embedding)
cnn = concatenate([cnn1, cnn2, cnn3, cnn4], axis=-1)
cnn1 = Conv1D(64, 2, padding='same', strides=1, activation='relu')(cnn)
cnn1 = MaxPooling1D(pool_size=100)(cnn1)
cnn2 = Conv1D(64, 3, padding='same', strides=1, activation='relu')(cnn)
cnn2 = MaxPooling1D(pool_size=100)(cnn2)
cnn3 = Conv1D(64, 4, padding='same', strides=1, activation='relu')(cnn)
cnn3 = MaxPooling1D(pool_size=100)(cnn3)
cnn4 = Conv1D(64, 5, padding='same', strides=1, activation='relu')(cnn)
cnn4 = MaxPooling1D(pool_size=100)(cnn4)
cnn = concatenate([cnn1, cnn2, cnn3, cnn4], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(drop))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_gru1(sent_length, embeddings_weight,class_num):
print("get_text_gru1")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(conc))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_gru2(sent_length, embeddings_weight,class_num):
print("get_text_gru2")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
x = Conv1D(100, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(conc))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_gru4(sent_length, embeddings_weight,class_num):
print("get_text_gru4")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNLSTM(200, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_gru5(sent_length, embeddings_weight,class_num):
print("get_text_gru5")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(embed)
x = Dropout(0.35)(x)
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
last = Lambda(lambda t: t[:, -1])(x)
maxpool = GlobalMaxPooling1D()(x)
average = GlobalAveragePooling1D()(x)
x = concatenate([last, maxpool, average])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_gru6(sent_length, embeddings_weight,class_num):
print("get_text_gru6")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(embed)
x = Conv1D(60, kernel_size=3, padding='valid', activation='relu', strides=1)(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
embed = SpatialDropout1D(0.2)(embedding(content))
y = Bidirectional(CuDNNGRU(100, return_sequences=True))(embed)
y = Conv1D(40, kernel_size=3, padding='valid', activation='relu', strides=1)(y)
avg_pool2 = GlobalAveragePooling1D()(y)
max_pool2 = GlobalMaxPooling1D()(y)
x = concatenate([avg_pool, max_pool, avg_pool2, max_pool2], -1)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_rcnn1(sent_length, embeddings_weight,class_num):
print("get_text_rcnn1")
document = Input(shape=(None,), dtype="int32")
embedder = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
doc_embedding = SpatialDropout1D(0.2)(embedder(document))
forward = Bidirectional(CuDNNLSTM(200, return_sequences=True))(doc_embedding)
together = concatenate([forward, doc_embedding], axis=2)
semantic = Conv1D(100, 2, padding='same', strides=1, activation='relu')(together)
pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(pool_rnn))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=document, outputs=output)
# model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_rcnn2(sent_length, embeddings_weight,class_num):
print("get_text_rcnn2")
content = Input(shape=(None,), dtype="int32")
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
x = Convolution1D(filters=256, kernel_size=3, padding='same', strides=1, activation="relu")(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.2)(CuDNNGRU(units=200, return_sequences=True)(x))
x = Dropout(0.2)(CuDNNGRU(units=100)(x))
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_rcnn3(sent_length, embeddings_weight,class_num):
print("get_text_rcnn3")
content = Input(shape=(None,), dtype="int32")
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
cnn = Convolution1D(filters=200, kernel_size=3, padding="same", strides=1, activation="relu")(x)
cnn_avg_pool = GlobalAveragePooling1D()(cnn)
cnn_max_pool = GlobalMaxPooling1D()(cnn)
rnn = Dropout(0.2)(CuDNNGRU(200, return_sequences=True)(x))
rnn_avg_pool = GlobalAveragePooling1D()(rnn)
rnn_max_pool = GlobalMaxPooling1D()(rnn)
con = concatenate([cnn_avg_pool, cnn_max_pool, rnn_avg_pool, rnn_max_pool], axis=-1)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(con))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_rcnn4(sent_length, embeddings_weight,class_num):
print("get_text_rcnn4")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
rnn_1 = Bidirectional(CuDNNGRU(128, return_sequences=True))(embed)
conv_2 = Conv1D(128, 2, kernel_initializer="normal", padding="valid", activation="relu", strides=1)(rnn_1)
maxpool = GlobalMaxPooling1D()(conv_2)
attn = AttentionWeightedAverage()(conv_2)
average = GlobalAveragePooling1D()(conv_2)
x = concatenate([maxpool, attn, average])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_rcnn5(sent_length, embeddings_weight,class_num):
print("get_text_rcnn5")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
rnn_1 = Bidirectional(CuDNNGRU(200, return_sequences=True))(embed)
rnn_2 = Bidirectional(CuDNNGRU(200, return_sequences=True))(rnn_1)
x = concatenate([rnn_1, rnn_2], axis=2)
last = Lambda(lambda t: t[:, -1], name='last')(x)
maxpool = GlobalMaxPooling1D()(x)
attn = AttentionWeightedAverage()(x)
average = GlobalAveragePooling1D()(x)
x = concatenate([last, maxpool, average, attn])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_lstm1(sent_length, embeddings_weight,class_num):
print("get_text_lstm1")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Dropout(0.2)(Bidirectional(CuDNNLSTM(200, return_sequences=True))(embed))
semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(pool_rnn))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_lstm2(sent_length, embeddings_weight,class_num):
print("get_text_lstm2")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Dropout(0.2)(Bidirectional(CuDNNLSTM(200, return_sequences=True))(embed))
x = Dropout(0.2)(Bidirectional(CuDNNLSTM(100, return_sequences=True))(x))
semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
pool_rnn = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(pool_rnn))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_lstm3(sent_length, embeddings_weight,class_num):
print("get_text_lstm3")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
x = Dropout(0.2)(Bidirectional(CuDNNLSTM(200, return_sequences=True))(embed))
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_lstm_attention(sent_length, embeddings_weight,class_num):
print("get_text_lstm_attention")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embedded_sequences = SpatialDropout1D(0.2)(embedding(content))
x = Dropout(0.25)(CuDNNLSTM(200, return_sequences=True)(embedded_sequences))
merged = Attention(sent_length)(x)
merged = Dense(100, activation='relu')(merged)
merged = Dropout(0.25)(merged)
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(merged))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def get_text_dpcnn(sent_length, embeddings_weight,class_num):
print("get_text_dpcnn")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
embed = SpatialDropout1D(0.2)(embedding(content))
block1 = Conv1D(128, kernel_size=3, padding='same', activation='linear')(embed)
block1 = BatchNormalization()(block1)
block1 = PReLU()(block1)
block1 = Conv1D(128, kernel_size=3, padding='same', activation='linear')(block1)
block1 = BatchNormalization()(block1)
block1 = PReLU()(block1)
resize_emb = Conv1D(128, kernel_size=3, padding='same', activation='linear')(embed)
resize_emb = PReLU()(resize_emb)
block1_output = add([block1, resize_emb])
block1_output = MaxPooling1D(pool_size=10)(block1_output)
block2 = Conv1D(128, kernel_size=4, padding='same', activation='linear')(block1_output)
block2 = BatchNormalization()(block2)
block2 = PReLU()(block2)
block2 = Conv1D(128, kernel_size=4, padding='same', activation='linear')(block2)
block2 = BatchNormalization()(block2)
block2 = PReLU()(block2)
block2_output = add([block2, block1_output])
block2_output = MaxPooling1D(pool_size=10)(block2_output)
block3 = Conv1D(128, kernel_size=5, padding='same', activation='linear')(block2_output)
block3 = BatchNormalization()(block3)
block3 = PReLU()(block3)
block3 = Conv1D(128, kernel_size=5, padding='same', activation='linear')(block3)
block3 = BatchNormalization()(block3)
block3 = PReLU()(block3)
output = add([block3, block2_output])
maxpool = GlobalMaxPooling1D()(output)
average = GlobalAveragePooling1D()(output)
x = concatenate([maxpool, average])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
#model = multi_gpu_model(model, 2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def bi_gru_model(sent_length, embeddings_weight,class_num):
print("get_text_gru3")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(conc))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def bi_gru_model_binary(sent_length, embeddings_weight,class_num):
print("bi_gru_model_binary")
content = Input(shape=(sent_length,), dtype='int32')
embedding = Embedding(
name="word_embedding",
input_dim=embeddings_weight.shape[0],
weights=[embeddings_weight],
output_dim=embeddings_weight.shape[1],
trainable=False)
x = SpatialDropout1D(0.2)(embedding(content))
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(conc))))
x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
output = Dense(class_num, activation="softmax")(x)
model = Model(inputs=content, outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
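# ---------------------------------------------------------------------------
# Minimal smoke test (not part of the original pipeline). Every factory above
# shares the signature (sent_length, embeddings_weight, class_num), so any of
# them can be sanity-checked with a small random embedding matrix. The sizes
# below are illustrative assumptions, and a CUDA-capable GPU is required
# because the models use CuDNNGRU/CuDNNLSTM layers.
if __name__ == '__main__':
    vocab_size, embed_dim, sent_length, class_num = 100, 8, 30, 22
    dummy_embeddings = np.random.rand(vocab_size, embed_dim).astype('float32')
    model = bi_gru_model(sent_length, dummy_embeddings, class_num)
    batch = np.random.randint(1, vocab_size, size=(4, sent_length))
    print(model.predict(batch).shape)  # expected: (4, class_num)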
================================================
FILE: THLUO/readme.md
================================================
This code was run on Windows 10 with 48 GB of RAM and a 1070 Ti GPU. Since there are quite a few .py files to run, the full pipeline takes a long time to finish.
Folder layout:
> cache stores the model outputs
> embedding stores the w2c word embeddings
> input stores the competition data
> result holds THLUO's final result
What each .py file does:
* 1.w2c_model_start.py sorts each device's apps by open time to form an app_list, treats each app as a word and each device_id as a document, and learns app embeddings
* 2.w2c_model_close.py sorts each device's apps by close time to form an app_list, treats each app as a word and each device_id as a document, and learns app embeddings
* 3.w2c_model_all.py sorts each device's apps by the combined open and close times to form an app_list, treats each app as a word and each device_id as a document, and learns app embeddings
* 4.device_quchong_start_app_w2c.py sorts each device's apps by open time to form an app_list, deduplicates the app_list, treats each app as a word and each device_id as a document, and learns app embeddings
* 5.device_age_prob_oof.py predicts user age on its own
* 6.device_sex_prob_oof.py predicts user sex on its own
* 7.start_close_age_prob_oof.py predicts the age probabilities associated with each app
* 8.start_close_sex_prob_oof.py predicts the sex probabilities associated with each app
* 9.sex_age_bin_prob_oof.py predicts the probability of a user belonging to each sex-age class with a binary-classification approach
* 10.age_bin_prob_oof.py predicts the probability of a user belonging to each age class with a binary-classification approach
* 11.hcc_device_brand_age_sex.py phone brand and phone model are high-cardinality categoricals; following the paper "A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems", this predicts the sex-age probabilities for phone brand and phone model
* 12.device_age_regression_prob_oof.py predicts the probability of a user belonging to each age class with a regression approach
* 13.device_start_GRU_pred.py sorts each device's apps by open time to form an app_list, treats each app as a word and each device_id as a document, and runs a GRU text model to predict the sex-age probabilities
* 14.device_start_GRU_pred_age.py sorts each device's apps by open time to form an app_list, treats each app as a word and each device_id as a document, and runs a GRU text model to predict the age probabilities
* 15.device_all_GRU_pred.py sorts each device's apps by the combined open and close times to form an app_list, treats each app as a word and each device_id as a document, and runs a GRU text model to predict the sex-age probabilities
* 16.device_start_capsule_pred.py predicts the sex-age probabilities with a capsule model
* 17.device_start_textcnn_pred.py predicts the sex-age probabilities with a textcnn model
* 18.device_start_text_dpcnn_pred.py predicts the sex-age probabilities with a dpcnn model
* 19.device_start_lstm_pred.py predicts the sex-age probabilities with an lstm model
* 20.lgb_sex_age_prob_oof.py a baseline model that predicts the sex-age probabilities
* 21.tfidf_lr_sex_age_prob_oof.py applies tf-idf to the apps and trains an LR model to predict the user's sex-age probabilities
* 22.base_feat.py builds the base hand-crafted features plus the probability features produced above
* 23.ATT_v6.py trains an attention model on the features from 22.base_feat.py to compute the sex-age probabilities
* 24.thluo_22_lgb.py trains a 22-class lgb model and writes the test probability file
* 25.thluo_22_xgb.py trains a 22-class xgb model and writes the test probability file
* 26.thluo_nb_lgb.py trains a conditional-probability model with lgb and writes the test probability file; the conditional model first predicts p(sex), then p(age|sex), so that p(sex, age) = p(sex) * p(age|sex) (a minimal sketch of this combination follows this list)
* 27.thluo_nb_xgb.py trains a conditional-probability model with xgb and writes the test probability file; the conditional model first predicts p(sex), then p(age|sex), so that p(sex, age) = p(sex) * p(age|sex)
* 28.final.py linearly blends the outputs of the four models above into THLUO's individual final result
* TextModel.py contains the text models used in this competition
* util.py contains shared utility functions
> note: the code was packaged for submission in a hurry (the competition had been done mostly in notebooks), so please contact the team if anything fails to run
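
For reference, here is a minimal sketch of the p(sex) * p(age|sex) combination used by 26/27; the array shapes and variable names are illustrative assumptions, not the actual script code:

```python
import numpy as np

n = 1000
p_sex = np.random.dirichlet(np.ones(2), size=n)                  # (n, 2): p(sex)
p_age_given_sex = np.random.dirichlet(np.ones(11), size=(n, 2))  # (n, 2, 11): p(age | sex)

# p(sex, age) = p(sex) * p(age | sex), flattened into the 22 sex-age classes
p_sex_age = (p_sex[:, :, None] * p_age_given_sex).reshape(n, 22)
assert np.allclose(p_sex_age.sum(axis=1), 1.0)
```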
================================================
FILE: THLUO/util.py
================================================
import os
import re
import sys
import time  # used by the performance decorator below
import gc
import random
import warnings
import pandas as pd
import numpy as np
import gensim
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical, np_utils
from keras.engine.topology import Layer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
warnings.filterwarnings('ignore')
gru_len = 128
Routings = 5
Num_capsule = 10
Dim_capsule = 16
dropout_p = 0.25
rate_drop_dense = 0.28
class Attention(Layer):
def __init__(self, step_dim,
W_regularizer=None, b_regularizer=None,
W_constraint=None, b_constraint=None,
bias=True, **kwargs):
"""
Keras Layer that implements an Attention mechanism for temporal data.
Supports Masking.
Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
# Input shape
3D tensor with shape: `(samples, steps, features)`.
# Output shape
2D tensor with shape: `(samples, features)`.
:param kwargs:
Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
The dimensions are inferred based on the output shape of the RNN.
Example:
model.add(LSTM(64, return_sequences=True))
model.add(Attention())
"""
self.supports_masking = True
#self.init = initializations.get('glorot_uniform')
self.init = initializers.get('glorot_uniform')
self.W_regularizer = regularizers.get(W_regularizer)
self.b_regularizer = regularizers.get(b_regularizer)
self.W_constraint = constraints.get(W_constraint)
self.b_constraint = constraints.get(b_constraint)
self.bias = bias
self.step_dim = step_dim
self.features_dim = 0
super(Attention, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = self.add_weight((input_shape[-1],),
initializer=self.init,
name='{}_W'.format(self.name),
regularizer=self.W_regularizer,
constraint=self.W_constraint)
self.features_dim = input_shape[-1]
if self.bias:
self.b = self.add_weight((input_shape[1],),
initializer='zero',
name='{}_b'.format(self.name),
regularizer=self.b_regularizer,
constraint=self.b_constraint)
else:
self.b = None
self.built = True
def compute_mask(self, input, input_mask=None):
# do not pass the mask to the next layers
return None
def call(self, x, mask=None):
# eij = K.dot(x, self.W) TF backend doesn't support it
# features_dim = self.W.shape[0]
# step_dim = x._keras_shape[1]
features_dim = self.features_dim
step_dim = self.step_dim
eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
if self.bias:
eij += self.b
eij = K.tanh(eij)
a = K.exp(eij)
# apply mask after the exp. will be re-normalized next
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
a *= K.cast(mask, K.floatx())
# in some cases especially in the early stages of training the sum may be almost zero
a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
a = K.expand_dims(a)
weighted_input = x * a
#print weigthted_input.shape
return K.sum(weighted_input, axis=1)
def compute_output_shape(self, input_shape):
#return input_shape[0], input_shape[-1]
return input_shape[0], self.features_dim
def squash(x, axis=-1):
s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
scale = K.sqrt(s_squared_norm + K.epsilon())
return x / scale
# A Capsule Implement with Pure Keras
class Capsule(Layer):
def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
activation='default', **kwargs):
super(Capsule, self).__init__(**kwargs)
self.num_capsule = num_capsule
self.dim_capsule = dim_capsule
self.routings = routings
self.kernel_size = kernel_size
self.share_weights = share_weights
if activation == 'default':
self.activation = squash
else:
self.activation = Activation(activation)
def build(self, input_shape):
super(Capsule, self).build(input_shape)
input_dim_capsule = input_shape[-1]
if self.share_weights:
self.W = self.add_weight(name='capsule_kernel',
shape=(1, input_dim_capsule,
self.num_capsule * self.dim_capsule),
# shape=self.kernel_size,
initializer='glorot_uniform',
trainable=True)
else:
input_num_capsule = input_shape[-2]
self.W = self.add_weight(name='capsule_kernel',
shape=(input_num_capsule,
input_dim_capsule,
self.num_capsule * self.dim_capsule),
initializer='glorot_uniform',
trainable=True)
def call(self, u_vecs):
if self.share_weights:
u_hat_vecs = K.conv1d(u_vecs, self.W)
else:
u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])
batch_size = K.shape(u_vecs)[0]
input_num_capsule = K.shape(u_vecs)[1]
u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
self.num_capsule, self.dim_capsule))
u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
# final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]
b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule]
for i in range(self.routings):
b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule]
c = K.softmax(b)
c = K.permute_dimensions(c, (0, 2, 1))
b = K.permute_dimensions(b, (0, 2, 1))
outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
if i < self.routings - 1:
b = K.batch_dot(outputs, u_hat_vecs, [2, 3])
return outputs
def compute_output_shape(self, input_shape):
return (None, self.num_capsule, self.dim_capsule)
class AttentionWeightedAverage(Layer):
"""
Computes a weighted average of the different channels across timesteps.
Uses 1 parameter pr. channel to compute the attention value for a single timestep.
"""
def __init__(self, return_attention=False, **kwargs):
self.init = initializers.get('uniform')
self.supports_masking = True
self.return_attention = return_attention
super(AttentionWeightedAverage, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(ndim=3)]
assert len(input_shape) == 3
self.W = self.add_weight(shape=(input_shape[2], 1),
name='{}_W'.format(self.name),
initializer=self.init)
self.trainable_weights = [self.W]
super(AttentionWeightedAverage, self).build(input_shape)
def call(self, x, mask=None):
# computes a probability distribution over the timesteps
# uses 'max trick' for numerical stability
# reshape is done to avoid issue with Tensorflow
# and 1-dimensional weights
logits = K.dot(x, self.W)
x_shape = K.shape(x)
logits = K.reshape(logits, (x_shape[0], x_shape[1]))
ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
# masked timesteps have zero weight
if mask is not None:
mask = K.cast(mask, K.floatx())
ai = ai * mask
att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
weighted_input = x * K.expand_dims(att_weights)
result = K.sum(weighted_input, axis=1)
if self.return_attention:
return [result, att_weights]
return result
def get_output_shape_for(self, input_shape):
return self.compute_output_shape(input_shape)
def compute_output_shape(self, input_shape):
output_len = input_shape[2]
if self.return_attention:
return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
return (input_shape[0], output_len)
def compute_mask(self, input, input_mask=None):
if isinstance(input_mask, list):
return [None] * len(input_mask)
else:
return None
class KMaxPooling(Layer):
"""
K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
TensorFlow backend.
"""
def __init__(self, k=1, **kwargs):
super().__init__(**kwargs)
self.input_spec = InputSpec(ndim=3)
self.k = k
def compute_output_shape(self, input_shape):
return (input_shape[0], (input_shape[2] * self.k))
def call(self, inputs):
# swap last two dimensions since top_k will be applied along the last dimension
shifted_input = tf.transpose(inputs, [0, 2, 1])
# extract top_k, returns two tensors [values, indices]
top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]
# return flattened output
return Flatten()(top_k)
def performance(f):  # decorator: wrap the given function and return the wrapped version
    def fn(*args, **kw):  # wrapper around the decorated function
        t_start = time.time()  # record the start time
        r = f(*args, **kw)  # call the decorated function
        t_end = time.time()  # record the end time
        print('call %s() in %fs' % (f.__name__, (t_end - t_start)))  # report the name and elapsed time
        return r  # pass the result through
    return fn
from keras import backend as K
def f1(y_true, y_pred):
def recall(y_true, y_pred):
"""Recall metric.
Only computes a batch-wise average of recall.
Computes the recall, a metric for multi-label classification of
how many relevant items are selected.
"""
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def precision(y_true, y_pred):
"""Precision metric.
Only computes a batch-wise average of precision.
Computes the precision, a metric for multi-label classification of
how many selected items are relevant.
"""
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
precision = precision(y_true, y_pred)
recall = recall(y_true, y_pred)
return 2*((precision*recall)/(precision+recall+K.epsilon()))
def evalation_score(y_true,y_pred):
for row,column in zip(range(y_pred.shape[0]),np.argmax(y_pred,axis=1)):
y_pred[row,column]=1
y_pred[y_pred<1]=0
y_pred=y_pred.astype("int")
macro=f1_score(y_true, y_pred, average='macro')
micro=f1_score(y_true, y_pred, average='micro')
print(macro,micro)
score=(macro+ micro)/2
print(score)
return score
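# ---------------------------------------------------------------------------
# Minimal check of evalation_score (illustrative only): it expects a one-hot
# y_true matrix and a probability matrix y_pred, hard-assigns each row of
# y_pred to its argmax, and prints macro-F1, micro-F1, and their mean.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    y_true = np.eye(22, dtype='int')[rng.randint(0, 22, size=100)]
    y_pred = rng.dirichlet(np.ones(22), size=100)
    evalation_score(y_true, y_pred)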
================================================
FILE: THLUO/代码运行.bat
================================================
python 1.w2c_model_start.py
python 2.w2c_model_close.py
python 3.w2c_model_all.py
python 3.device_quchong_start_app_w2c.py
python 3.w2c_all_emb.py
python 4.device_age_prob_oof.py
python 5.device_sex_prob_oof.py
python 6.start_close_age_prob_oof.py
python 7.start_close_sex_prob_oof.py
python 9.sex_age_bin_prob_oof.py
python 10.age_bin_prob_oof.py
python 11.hcc_device_brand_age_sex.py
python 12.device_age_regression_prob_oof.py
python 13.device_start_GRU_pred.py
python 14.device_start_GRU_pred_age.py
python 15.device_all_GRU_pred.py
python 16.device_start_capsule_pred.py
python 17.device_start_textcnn_pred.py
python 18.device_start_text_dpcnn_pred.py
python 19.device_start_lstm_pred.py
python 20.lgb_sex_age_prob_oof.py
python 21.tfidf_lr_sex_age_prob_oof.py
python 22.base_feat.py
python 23.ATT_v6.py
python 24.thluo_22_lgb.py
python 25.thluo_22_xgb.py
python 26.thluo_nb_lgb.py
python 27.thluo_nb_xgb.py
python 28.final.py
================================================
FILE: chizhu/readme.txt
================================================
|-single_model/
    |-data/             processed features and data are stored here
    |-model/            model weight files
    |-submit/           model probability files, usable as stacking inputs
    |-config.py         configures the raw-data file paths
    |-user_behavior.py  builds the user_behavior feature set
    |-get_nn_feat.py    builds the statistical-feature inputs for the nn models
    |-lgb.py
    |-xgb.py
    |-xgb_nb.py         conditional probability
    |-cnn.py
    |-deepnn.py
    |-yg_best_nn.py
|-stacking/
    |-all_feat/         conditional-probability xgb using all probability files
    |-nurbs_feat/       22-class and conditional-probability xgb using the nurbs probability files
        |-xgb__nurbs_nb.py  conditional probability
        |-xgb_22.py         22-class
|-util/
    |-bagging.py        weighted-blending script
    |-get_nn_res.py     produces the nn probability files and a submittable result
Usage:
single_model: 1) first set the file paths in config.py
              2) run user_behavior.py
              3) run get_nn_feat.py
              4) then run the nn and tree models one by one; the resulting probability files land in submit/
stacking: these scripts cannot be run as-is because they need the probability files (about 2 GB), which are not included; ask us for them if needed
util: for the weighted blend; it needs one result file, xgb_22_nb.csv, obtained by averaging the outputs of xgb_22.py and xgb__nurbs_nb.py under stacking/nurbs_feat (see the sketch below)
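
For reference, a minimal sketch of that averaging step; the input file names and the 'device_id' column are placeholder assumptions (the two scripts are assumed to write probability csvs with identical columns):

    import pandas as pd
    p22 = pd.read_csv('xgb_22.csv')          # output of xgb_22.py (placeholder name)
    pnb = pd.read_csv('xgb_nurbs_nb.csv')    # output of xgb__nurbs_nb.py (placeholder name)
    cols = [c for c in p22.columns if c != 'device_id']
    p22[cols] = (p22[cols] + pnb[cols]) / 2
    p22.to_csv('xgb_22_nb.csv', index=False)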
================================================
FILE: chizhu/single_model/cnn.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# get_ipython().run_line_magic('matplotlib', 'inline')
#add
# from category_encoders import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
import tensorflow as tf
import os  # needed for the CUDA device selection below
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9
set_session(tf.Session(config=config))
from config import path
# path = "/dev/shm/chizhu_data/data/"
# In[2]:
packages = pd.read_csv(path+'deviceid_packages.tsv',
sep='\t', names=['device_id', 'apps'])
test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv', sep='\t',
names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv',
names=['device_id', 'vendor', 'version'])
behave_train = pd.read_csv('data/train_statistic_feat.csv')
behave_test = pd.read_csv('data/test_statistic_feat.csv')
# In[3]:
behave_train.drop(['sex', 'age', 'label', 'app'], 1, inplace=True)
behave_test.drop(['sex', 'age', 'label', 'app'], 1, inplace=True)
# In[4]:
brand['phone_version'] = brand['vendor'] + ' ' + brand['version']
train = pd.merge(brand[['device_id', 'phone_version']],
train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'phone_version']],
test, on='device_id', how='right')
# In[5]:
train = pd.merge(train, behave_train, on='device_id', how='left')
test = pd.merge(test, behave_test, on='device_id', how='left')
# In[6]:
packages['app_lenghth'] = packages['apps'].apply(
lambda x: x.split(',')).apply(lambda x: len(x))
packages['app_list'] = packages['apps'].apply(lambda x: x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
# In[7]:
embed_size = 128
# despite the name, this trains a skip-gram Word2Vec model over each device's app list
fastmodel = Word2Vec(list(packages['app_list']), size=embed_size, window=4, min_count=3, negative=2,
                     sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word]
for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns = ["fdim_%s" %
str(i) for i in range(embed_size)]+["app"]
# In[8]:
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
Y_sex = train['sex']-1
# In[9]:
# build the embedding matrix aligned with the tokenizer's word indices
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
    if word not in fastmodel.wv.vocab:
        continue
    embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
# In[10]:
# behave_train=behave_train.loc[:,"ph_ver_0":'week_day_6']
# behave_test=behave_test.loc[:,"h0":'week_day_6']
behave_train = pd.merge(train[['device_id']],
behave_train, on='device_id', how="left")
behave_test = pd.merge(test[['device_id']],
behave_test, on='device_id', how="left")
X_h = behave_train.iloc[:, 1:].values
X_h_test = behave_test.iloc[:, 1:].values
# In[11]:
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
# decoupled weight decay (2/4)
self.wd = K.variable(weight_decay, name='weight_decay')
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
# decoupled weight decay (4/4)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
# In[12]:
def model_conv1D(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
    hin = Input(shape=(396, ))  # statistical-feature input (396 columns)
htime = Dense(64, activation='relu')(hin)
merge1 = concatenate([gap1a, gmp1a, htime])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
model = Model(inputs=[seq, hin], outputs=pred)
# model=multi_gpu_model(model,2)
model.compile(loss='binary_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
# model.summary()
return model
# In[ ]:
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
print("FOLD | ", count+1)
filepath = "model/sex_weights_best_%d.h5" % count
checkpoint = ModelCheckpoint(
filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=6, verbose=1, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit([X_tr, X_tr2], y_tr, batch_size=128, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks, verbose=1, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict([X_test, X_h_test]))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict([X_vl, X_vl2])
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
# pd.DataFrame(oof_pref1).to_csv('cnn_oof_sex.csv', index=False)
# In[ ]:
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1 = pd.concat([oof_pref1, sub1])
res1['sex1'] = 1-res1['sex2']
res1.to_csv("data/res1.csv", index=False)
# In[ ]:
def model_age_conv(embedding_matrix):
# The embedding layer containing the word vectors
K.clear_session()
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
    hin = Input(shape=(397, ))  # statistical features plus the injected sex column
htime = Dense(64, activation='relu')(hin)
merge1 = concatenate([gap1a, gmp1a, htime])
# merge1 = concatenate([gap1a, gap2a, gap3a, gap5a])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=[seq, hin], outputs=pred)
model.compile(loss='categorical_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
# model.summary()
return model
# In[ ]:
Y_age = to_categorical(train['age'])
# #### sex1
# In[ ]:
behave_train['sex'] = train['sex']
behave_test['sex'] = 1
X_h = behave_train.iloc[:, 1:].values
X_h_test = behave_test.iloc[:, 1:].values
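# The assumed sex is appended as a behavior feature: train rows keep their true
# sex while every test row is forced to 1, so this age model is conditioned on
# the sex=1 hypothesis; the sex2 block below repeats the fit with sex forced to 2.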
# In[ ]:
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "model/age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
model_age = model_age_conv(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_age[train_index], Y_age[test_index]
hist = model_age.fit([X_tr, X_tr2], y_tr, batch_size=128, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks2, verbose=1, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict([X_vl, X_vl2])
sub2 += model_age.predict([X_test, X_h_test])/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
# pd.DataFrame(oof_pref2).to_csv('cnn_oof_age.csv', index=False)
# In[ ]:
res2_1 = np.vstack((oof_pref2, sub2))
res2_1 = pd.DataFrame(res2_1)
res2_1.to_csv("submit/res2_1.csv", index=False)
# ### sex2
# In[ ]:
behave_train['sex'] = train['sex']
behave_test['sex'] = 2
X_h = behave_train.iloc[:, 1:].values
X_h_test = behave_test.iloc[:, 1:].values
# In[ ]:
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "model/age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
model_age = model_age_conv(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_age[train_index], Y_age[test_index]
hist = model_age.fit([X_tr, X_tr2], y_tr, batch_size=128, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks2, verbose=1, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict([X_vl, X_vl2])
sub2 += model_age.predict([X_test, X_h_test])/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
# pd.DataFrame(oof_pref2).to_csv('cnn_oof_age.csv', index=False)
# In[ ]:
res2_2 = np.vstack((oof_pref2, sub2))
res2_2 = pd.DataFrame(res2_2)
# In[ ]:
res2_2.to_csv("submit/res2_2.csv", index=False)
# In[ ]:
res1.index = range(len(res1))
res2_1.index = range(len(res2_1))
res2_2.index = range(len(res2_2))
final_1 = res2_1.copy()
final_2 = res2_2.copy()
for i in range(11):
final_1[i] = res1['sex1']*res2_1[i]
final_2[i] = res1['sex2']*res2_2[i]
id_list = pd.concat([train[['device_id']], test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1, final_2], axis=1)
final = pd.concat([final, final_pred], axis=1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('submit/nn_feat.csv', index=False)
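# The loop above builds the 22 submission columns as joint probabilities
#   P(sex=s, age=a) = P(sex=s) * P(age=a | sex=s),
# where P(age|sex) comes from the age model retrained under each assumed sex.
# A minimal vectorized sketch of the same product (assuming res1, res2_1 and
# res2_2 as built above; equivalent to the final_1/final_2 loop):
joint_1 = res2_1.mul(res1['sex1'], axis=0)  # sex=1 block
joint_2 = res2_2.mul(res1['sex2'], axis=0)  # sex=2 block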
================================================
FILE: chizhu/single_model/config.py
================================================
path = "/Users/chizhu/data/competition_data/易观/"
================================================
FILE: chizhu/single_model/deepnn.py
================================================
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
from category_encoders import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
from keras.legacy import interfaces  # needed by the AdamW get_updates decorator below
from config import path  # data directory, defined in config.py
packages = pd.read_csv(path+'deviceid_packages.tsv',
sep='\t', names=['device_id', 'apps'])
test = pd.read_csv(path+'deviceid_test.tsv',
sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv',
sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv',
names=['device_id', 'vendor', 'version'])
behave = pd.read_csv('data/user_behavior.csv')
packages['app_lenghth'] = packages['apps'].apply(
lambda x: x.split(',')).apply(lambda x: len(x))
packages['app_list'] = packages['apps'].apply(lambda x: x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
embed_size = 128
fastmodel = Word2Vec(list(packages['app_list']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word]
for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns = ["fdim_%s" %
str(i) for i in range(embed_size)]+["app"]
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
Y_sex = train['sex']-1
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
if word not in fastmodel.wv.vocab:
continue
embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
# decoupled weight decay (2/4)
self.wd = K.variable(weight_decay, name='weight_decay')
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
# decoupled weight decay (4/4)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
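# For reference, get_updates above implements the decoupled AdamW step of
# Loshchilov & Hutter, matching the code exactly:
#   m_t  = beta_1*m + (1 - beta_1)*g
#   v_t  = beta_2*v + (1 - beta_2)*g**2
#   lr_t = lr * sqrt(1 - beta_2**t) / (1 - beta_1**t)
#   p   <-  p - lr_t * m_t / (sqrt(v_t) + epsilon) - lr * wd * p
# i.e. the weight decay acts directly on p, outside the adaptive gradient term.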
def model_conv1D_sex(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm_layer = Bidirectional(GRU(128, recurrent_dropout=0.15, dropout=0.15,))
lstm = lstm_layer(emb)
translate = TimeDistributed(Dense(128, activation='relu'))
t1 = translate(emb)
t1 = TimeDistributed(Dropout(0.15))(t1)
sum_op = Lambda(lambda x: K.sum(x, axis=1), output_shape=(128,))
t1 = sum_op(t1)
lstm_layer2 = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1_2 = Conv1D(filters=128, kernel_size=2,
padding='same', activation='relu',)
lstm2 = lstm_layer2(emb)
# Run through CONV + GAP layers
conv1a2 = conv1_2(lstm2)
gap1a2 = GlobalAveragePooling1D()(conv1a2)
gmp1a2 = GlobalMaxPool1D()(conv1a2)
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv4 = Conv1D(filters=64, kernel_size=4,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
conv6 = Conv1D(filters=32, kernel_size=6,
padding='same', activation='relu',)
# Run through CONV + GAP layers
conv1a = conv1(emb)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(emb)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(emb)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv4a = conv4(emb)
gap4a = GlobalAveragePooling1D()(conv4a)
#gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(emb)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
conv6a = conv6(emb)
gap6a = GlobalAveragePooling1D()(conv6a)
#hin = Input(shape=(X_h.shape[1], ))
#htime = Dense(X_h.shape[1]//4, activation='relu')(hin)
merge1 = concatenate([gap1a2, gmp1a2, lstm, t1])
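# Note: the conv1..conv6 branches on `emb` above are computed but unused;
# merge1 keeps only the BiGRU+conv(k=2) poolings, the pooled BiGRU output and
# the summed TimeDistributed projection. The same holds in model_age_conv below.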
# The MLP that determines the outcome
x = Dropout(0.38)(merge1)
#x = BatchNormalization()(x)
#x = Dense(200, activation='relu',)(x)
#x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
model = Model(inputs=seq, outputs=pred)
model.compile(loss='binary_crossentropy',
optimizer=AdamW(weight_decay=0.1,))
return model
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
print("FOLD | ", count+1)
filepath = "sex_weights_best_%d.h5" % count
checkpoint = ModelCheckpoint(
filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=6, verbose=1, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D_sex(embedding_matrix)
X_tr, X_vl, y_tr, y_vl = X[train_index], X[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit(X_tr, y_tr, batch_size=512, epochs=50, validation_data=(X_vl, y_vl),
callbacks=callbacks, verbose=2, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict(X_test))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict(X_vl)
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1 = pd.concat([oof_pref1, sub1])
res1['sex1'] = 1-res1['sex2']
# res1.to_csv("res1.csv", index=False)
def model_age_conv(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm_layer = Bidirectional(GRU(128, recurrent_dropout=0.15, dropout=0.15,))
lstm = lstm_layer(emb)
translate = TimeDistributed(Dense(128, activation='relu'))
t1 = translate(emb)
t1 = TimeDistributed(Dropout(0.15))(t1)
sum_op = Lambda(lambda x: K.sum(x, axis=1), output_shape=(128,))
t1 = sum_op(t1)
lstm_layer2 = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1_2 = Conv1D(filters=128, kernel_size=2,
padding='same', activation='relu',)
lstm2 = lstm_layer2(emb)
# Run through CONV + GAP layers
conv1a2 = conv1_2(lstm2)
gap1a2 = GlobalAveragePooling1D()(conv1a2)
gmp1a2 = GlobalMaxPool1D()(conv1a2)
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv4 = Conv1D(filters=64, kernel_size=4,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
conv6 = Conv1D(filters=32, kernel_size=6,
padding='same', activation='relu',)
# Run through CONV + GAP layers
conv1a = conv1(emb)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(emb)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(emb)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv4a = conv4(emb)
gap4a = GlobalAveragePooling1D()(conv4a)
#gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(emb)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
conv6a = conv6(emb)
gap6a = GlobalAveragePooling1D()(conv6a)
#hin = Input(shape=(X_h.shape[1], ))
#htime = Dense(X_h.shape[1]//4, activation='relu')(hin)
merge1 = concatenate([gap1a2, gmp1a2, lstm, t1])
# The MLP that determines the outcome
x = Dropout(0.38)(merge1)
#x = BatchNormalization()(x)
#x = Dense(200, activation='relu',)(x)
#x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=seq, outputs=pred)
model.compile(loss='categorical_crossentropy',
optimizer=AdamW(weight_decay=0.1,))
return model
Y_age = to_categorical(train['age'])
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
X_tr, X_vl, y_tr, y_vl = X[train_index], X[test_index], Y_age[train_index], Y_age[test_index]
model_age = model_age_conv(embedding_matrix)
hist = model_age.fit(X_tr, y_tr, batch_size=512, epochs=50, validation_data=(X_vl, y_vl),
callbacks=callbacks2, verbose=2, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict(X_vl)
sub2 += model_age.predict(X_test)/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
res2_1 = np.vstack((oof_pref2, sub2))
res2_1 = pd.DataFrame(res2_1)
# res2_1.to_csv("res2.csv", index=False)
res1.index = range(len(res1))
res2_1.index = range(len(res2_1))
final_1 = res2_1.copy()
final_2 = res2_1.copy()
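# Unlike the first network, this script trains a single unconditional age
# model, so both sex blocks reuse the same res2_1 distribution.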
for i in range(11):
final_1[i] = res1['sex1']*res2_1[i]
final_2[i] = res1['sex2']*res2_1[i]
id_list = pd.concat([train[['device_id']], test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1, final_2], axis=1)
final = pd.concat([final, final_pred], axis=1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('submit/deepnn_fix.csv', index=False)
================================================
FILE: chizhu/single_model/get_nn_feat.py
================================================
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
from config import path
# path = "/dev/shm/chizhu_data/data/"
### This is the path to the raw data files; be sure to change it to your own.
test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv', sep='\t',
names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv',
names=['device_id', 'vendor', 'version'])
packtime = pd.read_table(path+'deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'])
packages = pd.read_csv(path+'deviceid_packages.tsv',
sep='\t', names=['device_id', 'apps'])
packtime['period'] = (packtime['close'] - packtime['start'])/1000
packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
app_use_time = packtime.groupby(['app'])['period'].agg('sum').reset_index()
app_use_top100 = app_use_time.sort_values(
by='period', ascending=False)[:100]['app']
device_app_use_time = packtime.groupby(['device_id', 'app'])[
'period'].agg('sum').reset_index()
use_time_top100_statis = device_app_use_time.set_index(
'app').loc[list(app_use_top100)].reset_index()
top100_statis = use_time_top100_statis.pivot(
index='device_id', columns='app', values='period').reset_index()
top100_statis = top100_statis.fillna(0)
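# top100_statis: one row per device, one column per top-100 app (ranked by
# total usage time), values = that device's total usage seconds for the app.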
# Phone brand preprocessing
brand['vendor'] = brand['vendor'].astype(
str).apply(lambda x: x.split(' ')[0].upper())
brand['ph_ver'] = brand['vendor'] + '_' + brand['version']
ph_ver = brand['ph_ver'].value_counts()
ph_ver_cnt = pd.DataFrame(ph_ver).reset_index()
ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt']
brand = pd.merge(left=brand, right=ph_ver_cnt, on='ph_ver')
# A small tweak for the long-tail distribution: group rare models as 'other'
mask = (brand.ph_ver_cnt < 100)
brand.loc[mask, 'ph_ver'] = 'other'
train = pd.merge(brand[['device_id', 'ph_ver']],
train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'ph_ver']],
test, on='device_id', how='right')
train['ph_ver'] = train['ph_ver'].astype(str)
test['ph_ver'] = test['ph_ver'].astype(str)
# Label-encode ph_ver
ph_ver_le = preprocessing.LabelEncoder()
train['ph_ver'] = ph_ver_le.fit_transform(train['ph_ver'])
test['ph_ver'] = ph_ver_le.transform(test['ph_ver'])
train['label'] = train['sex'].astype(str) + '-' + train['age'].astype(str)
label_le = preprocessing.LabelEncoder()
train['label'] = label_le.fit_transform(train['label'])
test['sex'] = -1
test['age'] = -1
test['label'] = -1
data = pd.concat([train, test], ignore_index=True)
# data.shape
ph_ver_dummy = pd.get_dummies(data['ph_ver'])
ph_ver_dummy.columns = ['ph_ver_' + str(i)
for i in range(ph_ver_dummy.shape[1])]
data = pd.concat([data, ph_ver_dummy], axis=1)
del data['ph_ver']
train = data[data.sex != -1]
test = data[data.sex == -1]
# train.shape, test.shape
# Total usage count for each app
app_num = packtime['app'].value_counts().reset_index()
app_num.columns = ['app', 'app_num']
packtime = pd.merge(left=packtime, right=app_num, on='app')
# Likewise, handle the long tail (tried no grouping and other thresholds; 100 scored best)
packtime.loc[packtime.app_num < 100, 'app'] = 'other'
# Count the apps on each device
df_app = packtime[['device_id', 'app']]
apps = df_app.drop_duplicates().groupby(['device_id'])[
'app'].apply(' '.join).reset_index()
apps['app_length'] = apps['app'].apply(lambda x: len(x.split(' ')))
train = pd.merge(train, apps, on='device_id', how='left')
test = pd.merge(test, apps, on='device_id', how='left')
# packtime['period'] = (packtime['close'] - packtime['start'])/1000
# packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
packtime['dayofweek'] = packtime['start'].dt.dayofweek
packtime['hour'] = packtime['start'].dt.hour
# packtime = packtime[(packtime['start'] < '2017-03-31 23:59:59') & (packtime['start'] > '2017-03-01 00:00:00')]
app_use_time = packtime.groupby(['device_id', 'dayofweek'])[
'period'].agg('sum').reset_index()
week_app_use = app_use_time.pivot_table(
values='period', columns='dayofweek', index='device_id').reset_index()
week_app_use = week_app_use.fillna(0)
week_app_use.columns = ['device_id'] + \
['week_day_' + str(i) for i in range(0, 7)]
week_app_use['week_max'] = week_app_use.max(axis=1)
week_app_use['week_min'] = week_app_use.min(axis=1)
week_app_use['week_sum'] = week_app_use.sum(axis=1)
week_app_use['week_std'] = week_app_use.std(axis=1)
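# Note: each aggregate is computed over all columns present at that point, so
# week_sum also includes week_max/week_min and week_std includes all three;
# left as-is to reproduce the original features.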
# '''
# for i in range(0, 7):
# week_app_use['week_day_' + str(i)] = week_app_use['week_day_' + str(i)] / week_app_use['week_sum']
# '''
user_behavior = pd.read_csv('data/user_behavior.csv')
user_behavior['app_len_max'] = user_behavior['app_len_max'].astype(np.float64)
del user_behavior['app']
train = pd.merge(train, user_behavior, on='device_id', how='left')
test = pd.merge(test, user_behavior, on='device_id', how='left')
train = pd.merge(train, week_app_use, on='device_id', how='left')
test = pd.merge(test, week_app_use, on='device_id', how='left')
top100_statis.columns = ['device_id'] + \
['top100_statis_' + str(i) for i in range(0, 100)]
train = pd.merge(train, top100_statis, on='device_id', how='left')
test = pd.merge(test, top100_statis, on='device_id', how='left')
train.to_csv("data/train_statistic_feat.csv", index=False)
test.to_csv("data/test_statistic_feat.csv", index=False)
================================================
FILE: chizhu/single_model/lgb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
from config import path
# path="/Users/chizhu/data/competition_data/易观/"
# In[2]:
test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
packtime = pd.read_table(path+'deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'])
packages = pd.read_csv(path+'deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
# In[3]:
def get_str(df):
    # join the comma-separated app list with spaces
    return " ".join(df.split(","))
packages["str_app"] = packages['apps'].apply(get_str)
# In[4]:
tfidf = CountVectorizer()
train_str_app=pd.merge(train[['device_id']],packages[["device_id",'str_app']],on="device_id",how="left")
test_str_app=pd.merge(test[['device_id']],packages[["device_id",'str_app']],on="device_id",how="left")
tfidf.fit(packages['str_app'])
train_app = tfidf.transform(list(train_str_app['str_app'])).tocsr()
test_app = tfidf.transform(list(test_str_app['str_app'])).tocsr()
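# Note: despite the name, `tfidf` is a CountVectorizer, so train_app/test_app
# hold raw token counts rather than tf-idf weights.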
# In[5]:
all_id=pd.concat([train[["device_id"]],test[['device_id']]])
# In[6]:
all_id.index=range(len(all_id))
# In[7]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
import os
if not os.path.exists("data"):
os.mkdir("data")
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = train_app
test_feature = test_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=all_id['device_id']
for label in ["sex"]:
score = train[label]-1
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])[:,1]
score_te = clf.predict_proba(test_feature)[:,1]
print('Score: ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_lr_classfiy_{}'.format(label)] = stack[:, 0]
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])[:,1]
score_te = sgd.predict_proba(test_feature)[:,1]
print('Score: ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_sgd_classfiy_{}'.format(label)] = stack[:, 0]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])[:,1]
score_te = pac._predict_proba_lr(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_pac_classfiy_{}'.format(label)] = stack[:, 0]
########################### ridge(RidgeClassfiy) ################################
print('RidgeClassfiy stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])[:,1]
score_te = ridge._predict_proba_lr(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_ridge_classfiy_{}'.format(label)] = stack[:, 0]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])[:,1]
score_te = bnb.predict_proba(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_bnb_classfiy_{}'.format(label)] = stack[:, 0]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])[:,1]
score_te = mnb.predict_proba(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_mnb_classfiy_{}'.format(label)] = stack[:, 0]
############################ Linersvc(LinerSVC) ################################
print('LinerSVC stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])[:,1]
score_te = lsvc._predict_proba_lr(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_lsvc_classfiy_{}'.format(label)] = stack[:, 0]
df_stack.to_csv('data/tfidf_classfiy_package.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
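# The seven blocks above all repeat one pattern: 5-fold out-of-fold
# probabilities on train, fold-averaged probabilities on test, stacked
# train-then-test. A minimal sketch of that pattern as a helper (a
# hypothetical oof_stack, not part of the original pipeline; assumes a
# binary 0/1 target indexable by position and this repo's old
# sklearn.cross_validation API):
def oof_stack(clf, train_feature, test_feature, y, n_folds=5, seed=1017):
    oof = np.zeros(train_feature.shape[0])
    te = np.zeros(test_feature.shape[0])
    for tr, va in StratifiedKFold(y, n_folds=n_folds, random_state=seed):
        clf.fit(train_feature[tr], y[tr])
        # fall back to the private LR-style scores for margin classifiers,
        # mirroring the _predict_proba_lr calls in the blocks above
        proba = clf.predict_proba if hasattr(clf, 'predict_proba') else clf._predict_proba_lr
        oof[va] = proba(train_feature[va])[:, 1]
        te += proba(test_feature)[:, 1] / n_folds
    return np.hstack([oof, te])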
# In[8]:
packtime['period'] = (packtime['close'] - packtime['start'])/1000
packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
app_use_time = packtime.groupby(['app'])['period'].agg('sum').reset_index()
app_use_top100 = app_use_time.sort_values(by='period', ascending=False)[:100]['app']
device_app_use_time = packtime.groupby(['device_id', 'app'])['period'].agg('sum').reset_index()
use_time_top100_statis = device_app_use_time.set_index('app').loc[list(app_use_top100)].reset_index()
top100_statis = use_time_top100_statis.pivot(index='device_id', columns='app', values='period').reset_index()
# In[9]:
top100_statis = top100_statis.fillna(0)
# In[10]:
# Phone brand preprocessing
brand['vendor'] = brand['vendor'].astype(str).apply(lambda x : x.split(' ')[0].upper())
brand['ph_ver'] = brand['vendor'] + '_' + brand['version']
ph_ver = brand['ph_ver'].value_counts()
ph_ver_cnt = pd.DataFrame(ph_ver).reset_index()
ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt']
brand = pd.merge(left=brand, right=ph_ver_cnt,on='ph_ver')
# In[11]:
# A small tweak for the long-tail distribution: group rare models as 'other'
mask = (brand.ph_ver_cnt < 100)
brand.loc[mask, 'ph_ver'] = 'other'
train_data = pd.merge(brand[['device_id', 'ph_ver']], train, on='device_id', how='right')
test_data = pd.merge(brand[['device_id', 'ph_ver']], test, on='device_id', how='right')
train_data['ph_ver'] = train_data['ph_ver'].astype(str)
test_data['ph_ver'] = test_data['ph_ver'].astype(str)
# Label-encode ph_ver
ph_ver_le = preprocessing.LabelEncoder()
train_data['ph_ver'] = ph_ver_le.fit_transform(train_data['ph_ver'])
test_data['ph_ver'] = ph_ver_le.transform(test_data['ph_ver'])
train_data['label'] = train_data['sex'].astype(str) + '-' + train_data['age'].astype(str)
label_le = preprocessing.LabelEncoder()
train_data['label'] = label_le.fit_transform(train_data['label'])
# In[12]:
test_data['sex'] = -1
test_data['age'] = -1
test_data['label'] = -1
data = pd.concat([train_data, test_data], ignore_index=True)
print(data.shape)
# In[13]:
train_data = data[data.sex != -1]
test_data = data[data.sex == -1]
print(train_data.shape, test_data.shape)
# In[14]:
# Total usage count for each app
app_num = packtime['app'].value_counts().reset_index()
app_num.columns = ['app', 'app_num']
packtime = pd.merge(left=packtime, right=app_num, on='app')
# Likewise, handle the long tail (tried no grouping and other thresholds; 100 scored best)
packtime.loc[packtime.app_num < 100, 'app'] = 'other'
# In[15]:
# Count the apps on each device
df_app = packtime[['device_id', 'app']]
apps = df_app.drop_duplicates().groupby(['device_id'])['app'].apply(' '.join).reset_index()
apps['app_length'] = apps['app'].apply(lambda x:len(x.split(' ')))
train_data = pd.merge(train_data, apps, on='device_id', how='left')
test_data = pd.merge(test_data, apps, on='device_id', how='left')
# In[16]:
# Vectorize each device's installed apps (token counts via CountVectorizer, despite the tf-idf naming)
tfidf = CountVectorizer()
tfidf.fit(apps['app'])
X_tr_app = tfidf.transform(list(train_data['app'])).tocsr()
X_ts_app = tfidf.transform(list(test_data['app'])).tocsr()
# In[17]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = X_tr_app
test_feature = X_ts_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=data['device_id']
for label in ["sex"]:
score = train_data[label]-1
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])[:,1]
score_te = clf.predict_proba(test_feature)[:,1]
print('Score: ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_lr_classfiy_{}'.format(label)] = stack[:, 0]
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])[:,1]
score_te = sgd.predict_proba(test_feature)[:,1]
print('Score: ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_sgd_classfiy_{}'.format(label)] = stack[:, 0]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])[:,1]
score_te = pac._predict_proba_lr(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_pac_classfiy_{}'.format(label)] = stack[:, 0]
########################### ridge(RidgeClassfiy) ################################
print('RidgeClassfiy stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])[:,1]
score_te = ridge._predict_proba_lr(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_ridge_classfiy_{}'.format(label)] = stack[:, 0]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])[:,1]
score_te = bnb.predict_proba(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_bnb_classfiy_{}'.format(label)] = stack[:, 0]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])[:,1]
score_te = mnb.predict_proba(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_mnb_classfiy_{}'.format(label)] = stack[:, 0]
############################ Linersvc(LinerSVC) ################################
print('LinerSVC stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])[:,1]
score_te = lsvc._predict_proba_lr(test_feature)[:,1]
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_lsvc_classfiy_{}'.format(label)] = stack[:, 0]
df_stack.to_csv('data/tfidf_classfiy.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
# ### Word2vec embedding of each device's installed apps
# In[18]:
packages['apps'] = packages['apps'].apply(lambda x:x.split(','))
packages['app_length'] = packages['apps'].apply(lambda x:len(x))
# In[19]:
embed_size = 128
fastmodel = Word2Vec(list(packages['apps']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
print(embedding_fast.head())
# In[20]:
id_list = []
for i in range(packages.shape[0]):
id_list += [list(packages['device_id'])[i]]*packages['app_length'].iloc[i]
app_list = [word for item in packages['apps'] for word in item]
app_vect = pd.DataFrame({'device_id':id_list})
app_vect['app'] = app_list
# In[21]:
app_vect = app_vect.merge(embedding_fast, on='app', how='left')
app_vect = app_vect.drop('app', axis=1)
seqfeature = app_vect.groupby(['device_id']).agg('mean')
seqfeature.reset_index(inplace=True)
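# seqfeature: per-device mean of its installed apps' word2vec vectors
# (simple average pooling over the app embeddings).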
# In[22]:
print(seqfeature.head())
# ### Per-device phone usage time across the seven days of the week
# In[23]:
# packtime['period'] = (packtime['close'] - packtime['start'])/1000
# packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
packtime['dayofweek'] = packtime['start'].dt.dayofweek
packtime['hour'] = packtime['start'].dt.hour
# packtime = packtime[(packtime['start'] < '2017-03-31 23:59:59') & (packtime['start'] > '2017-03-01 00:00:00')]
# In[24]:
app_use_time = packtime.groupby(['device_id', 'dayofweek'])['period'].agg('sum').reset_index()
week_app_use = app_use_time.pivot_table(values='period', columns='dayofweek', index='device_id').reset_index()
week_app_use = week_app_use.fillna(0)
week_app_use.columns = ['device_id'] + ['week_day_' + str(i) for i in range(0, 7)]
week_app_use['week_max'] = week_app_use.max(axis=1)
week_app_use['week_min'] = week_app_use.min(axis=1)
week_app_use['week_sum'] = week_app_use.sum(axis=1)
week_app_use['week_std'] = week_app_use.std(axis=1)
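# As in get_nn_feat.py, later aggregates include the earlier-added columns
# (week_sum counts week_max/week_min; week_std counts all three).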
# ### Merge all the features together
# In[25]:
print(train_data.columns[4:])
# In[26]:
user_behavior = pd.read_csv('data/user_behavior.csv')
user_behavior['app_len_max'] = user_behavior['app_len_max'].astype(np.float64)
del user_behavior['app']
train_data = pd.merge(train_data, user_behavior, on='device_id', how='left')
test_data = pd.merge(test_data, user_behavior, on='device_id', how='left')
# In[27]:
train_data = pd.merge(train_data, seqfeature, on='device_id', how='left')
test_data = pd.merge(test_data, seqfeature, on='device_id', how='left')
# In[28]:
train_data = pd.merge(train_data, week_app_use, on='device_id', how='left')
test_data = pd.merge(test_data, week_app_use, on='device_id', how='left')
# In[29]:
top100_statis.columns = ['device_id'] + ['top100_statis_' + str(i) for i in range(0, 100)]
train_data = pd.merge(train_data, top100_statis, on='device_id', how='left')
test_data = pd.merge(test_data, top100_statis, on='device_id', how='left')
# In[30]:
train_data.to_csv("./data/train_data.csv",index=False)
test_data.to_csv("./data/test_data.csv",index=False)
# In[31]:
tfidf_feat=pd.read_csv("data/tfidf_classfiy.csv")
tf2=pd.read_csv("data/tfidf_classfiy_package.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
# app_w2v=pd.read_csv("./data/w2v_tfidf.csv")
# In[32]:
train = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
# train = pd.merge(train_data,tf2,on="device_id",how="left")
# train = pd.merge(train_data,app_w2v,on="device_id",how="left")
test = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
# test = pd.merge(test_data,tf2,on="device_id",how="left")
# test = pd.merge(test_data,app_w2v,on="device_id",how="left")
# In[85]:
train_dt = pd.merge(train_data[['device_id','ph_ver']],tfidf_feat,on="device_id",how="left")
train_dt = pd.merge(train_dt,tf2,on="device_id",how="left")
test_dt = pd.merge(test_data[['device_id',"ph_ver"]],tfidf_feat,on="device_id",how="left")
test_dt = pd.merge(test_dt,tf2,on="device_id",how="left")
feat=pd.concat([train_dt,test_dt])
feat.to_csv("data/sex_chizhu_feat.csv",index=False)
# In[33]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age","label","app"]]
Y = train['sex'] - 1
# ### Train the model
# In[34]:
import lightgbm as lgb
# import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params = {
'boosting_type': 'gbdt',
'metric': {'binary_logloss',},
# 'is_unbalance':'True',
'learning_rate' : 0.01,
'verbose': 0,
'num_leaves':32 ,
# 'max_depth':8,
# 'max_bin':10,
# 'lambda_l2': 1,
# 'min_child_weight':50,
'objective': 'binary',
'feature_fraction': 0.4,
'bagging_fraction':0.7, # 0.9 was the best so far
'bagging_freq':3, # 3 was the best so far
# 'min_data': 500,
'seed': 1024,
'nthread': 8,
# 'silent': True,
}
num_round = 3500
early_stopping_rounds = 100
# In[35]:
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
d_tr = lgb.Dataset(tr_x, label=tr_y)
d_te = lgb.Dataset(te_x, label=te_y)
model = lgb.train(params, d_tr, num_boost_round=num_round,
valid_sets=d_te,verbose_eval=200,
early_stopping_rounds=early_stopping_rounds)
pred= model.predict(te_x, num_iteration=model.best_iteration)
pred_oob1[test_index] =pred
a = log_loss(te_y, pred)
sub1 += model.predict(test[features], num_iteration=model.best_iteration)/5
print ("idx: ", i)
print (" loss: %.5f" % a)
print ("best tree num: ", model.best_iteration)
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[36]:
##### Feature importance
# get_ipython().run_line_magic('matplotlib', 'inline')
# import matplotlib.pyplot as plt
# f=dict(zip(list(train[features].keys()),model.feature_importance()))
# f=sorted(f.items(),key=lambda d:d[1], reverse = True)
# f=pd.DataFrame(f,columns=['feature','imp'])
# plt.bar(range(len(f)),f.imp)
# plt.xticks(range(len(f)),f.feature,rotation=70,fontsize=20)
# fig = plt.gcf()
# fig.set_size_inches(50, 20)
# In[37]:
# f.ix[:450,:]
# In[38]:
# features=f.ix[:434,"feature"].values
# In[39]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
# In[40]:
import gc
gc.collect()
# In[41]:
train_id = pd.read_csv(path+'deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
# In[42]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = train_app
test_feature = test_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=all_id['device_id']
for label in ["age"]:
score = train_id[label]
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
print('Score: ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_lr_classfiy_{}'.format(i)] = stack[:, i]
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
print('Score: ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_sgd_classfiy_{}'.format(i)] = stack[:, i]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_pac_classfiy_{}'.format(i)] = stack[:, i]
########################### ridge(RidgeClassfiy) ################################
print('RidgeClassfiy stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_ridge_classfiy_{}'.format(i)] = stack[:, i]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_bnb_classfiy_{}'.format(i)] = stack[:, i]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_mnb_classfiy_{}'.format(i)] = stack[:, i]
############################ Linersvc(LinerSVC) ################################
print('LinerSVC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])
score_te = lsvc._predict_proba_lr(test_feature)
print(score_va)
print('Score: ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_lsvc_classfiy_{}'.format(i)] = stack[:, i]
df_stack.to_csv('data/pack_tfidf_age.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
# #### tfidf
# In[43]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
############################ split the dataset ##########################
print('Starting preprocessing')
train_feature = X_tr_app
test_feature = X_ts_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=data['device_id']
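# Out-of-fold (OOF) stacking: for each base model below, fit on 4/5 of the
# training data, write the held-out fold's class probabilities into
# stack_train, and average the five test-set predictions into stack_test.
# The 11 probability columns (one per age bucket) become meta-features.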
for label in ["age"]:
score = train[label]
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
print('score: ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_lr_classfiy_{}'.format(i)] = stack[:, i]
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
print('score: ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_sgd_classfiy_{}'.format(i)] = stack[:, i]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
print('score: ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_pac_classfiy_{}'.format(i)] = stack[:, i]
########################### ridge(RidgeClassifier) ################################
print('RidgeClassifier stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
print('score: ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_ridge_classfiy_{}'.format(i)] = stack[:, i]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
print('score: ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_bnb_classfiy_{}'.format(i)] = stack[:, i]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
print('score: ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_mnb_classfiy_{}'.format(i)] = stack[:, i]
############################ lsvc(LinearSVC) ################################
print('LinearSVC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])
score_te = lsvc._predict_proba_lr(test_feature)
print(score_va)
print('score: ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_lsvc_classfiy_{}'.format(i)] = stack[:, i]
df_stack.to_csv('data/tfidf_age.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
# In[44]:
tfidf_feat=pd.read_csv("data/tfidf_age.csv")
tf2=pd.read_csv("data/pack_tfidf_age.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
# In[41]:
train_dt = pd.merge(train_data[['device_id','ph_ver']],tfidf_feat,on="device_id",how="left")
train_dt = pd.merge(train_dt,tf2,on="device_id",how="left")
test_dt = pd.merge(test_data[['device_id',"ph_ver"]],tfidf_feat,on="device_id",how="left")
test_dt = pd.merge(test_dt,tf2,on="device_id",how="left")
feat=pd.concat([train_dt,test_dt])
feat.to_csv("data/age_chizhu_feat.csv",index=False)
# In[40]:
# In[45]:
tfidf_feat=pd.read_csv("data/tfidf_age.csv")
tf2=pd.read_csv("data/pack_tfidf_age.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
train = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
# train = pd.merge(train_data,tf2,on="device_id",how="left")
# train = pd.merge(train_data,app_w2v,on="device_id",how="left")
test = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
# test = pd.merge(test_data,tf2,on="device_id",how="left")
# test = pd.merge(test_data,app_w2v,on="device_id",how="left")
features = [x for x in train.columns if x not in ['device_id',"age","sex","label","app"]]
Y = train['age']
# In[46]:
import lightgbm as lgb
# import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params = {
'boosting_type': 'gbdt',
'metric': {'multi_logloss',},
# 'is_unbalance':'True',
'learning_rate' : 0.01,
'verbose': 0,
'num_leaves':32 ,
# 'max_depth':8,
# 'max_bin':10,
# 'lambda_l2': 1,
# 'min_child_weight':50,
"num_class":11,
'objective': 'multiclass',
'feature_fraction': 0.4,
'bagging_fraction':0.7, # 0.9 was the best so far
'bagging_freq':3, # 3 was the best so far
# 'min_data': 500,
'seed': 1024,
'nthread': 8,
# 'silent': True,
}
num_round = 3500
early_stopping_rounds = 100
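# 5-fold OOF LightGBM for the 11-class age target on the TF-IDF stacking
# features: pred_oob2 collects out-of-fold probabilities and sub2 averages
# the five test-set predictions.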
# In[47]:
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
d_tr = lgb.Dataset(tr_x, label=tr_y)
d_te = lgb.Dataset(te_x, label=te_y)
model = lgb.train(params, d_tr, num_boost_round=num_round,
valid_sets=d_te,verbose_eval=200,
early_stopping_rounds=early_stopping_rounds)
pred= model.predict(te_x, num_iteration=model.best_iteration)
pred_oob2[test_index] =pred
a = log_loss(te_y, pred)
sub2 += model.predict(test[features], num_iteration=model.best_iteration)/5
print ("idx: ", i)
print (" loss: %.5f" % a)
print ("best tree num: ", model.best_iteration)
aus.append(a)
print ("mean")
print ("loss: %s" % (sum(aus) / 5.0))
# In[55]:
##### feature importance
# import matplotlib.pyplot as plt
# f=dict(zip(list(train[features].keys()),model.feature_importance()))
# f=sorted(f.items(),key=lambda d:d[1], reverse = True)
# f=pd.DataFrame(f,columns=['feature','imp'])
# plt.bar(range(len(f)),f.imp)
# plt.xticks(range(len(f)),f.feature,rotation=70,fontsize=20)
# fig = plt.gcf()
# fig.set_size_inches(50, 20)
# In[56]:
# f.ix[:650,:]
# In[57]:
# features=f.ix[:641,"feature"].values
# In[58]:
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[59]:
if not os.path.exists("submit"):
os.mkdir("submit")
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
final_1=res2_1.copy()
final_2=res2_1.copy()
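# Combine the two heads into joint probabilities:
# P(sex=s, age=a) = P(sex=s) * P(age=a), with sex probabilities from res1
# and the 11 age probabilities from res2_1.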
for i in range(11):
final_1[i]=res1['sex1']*res2_1[i]
final_2[i]=res1['sex2']*res2_1[i]
id_list=pd.concat([train[['device_id']],test[['device_id']]])
final=id_list
final.index=range(len(final))
final.columns= ['DeviceID']
final_pred = pd.concat([final_1,final_2],1)
final=pd.concat([final,final_pred],1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
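# Submission format: column 's-a' holds P(sex=s, age=a) for sex s in {1, 2}
# and age bucket a in 0..10.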
final.to_csv('submit/lgb_feat_chizhu.csv', index=False)
# In[60]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("submit/lgb_chizhu.csv",index=False)
# In[61]:
# sub.sum(1)
================================================
FILE: chizhu/single_model/user_behavior.py
================================================
# coding: utf-8
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
from config import path
#add
import gc
packtime = pd.read_table(path+'deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'], low_memory=True)
# packtime.head()
packtime['peroid'] = (packtime['close'] - packtime['start'])/1000
packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
#packtime['closetime'] = pd.to_datetime(packtime['close'], unit='ms')
del packtime['close']
gc.collect()
#packtime['day'] = packtime['start'].dt.day
#packtime['month'] = packtime['start'].dt.month
packtime['hour'] = packtime['start'].dt.hour
packtime['date'] = packtime['start'].dt.date
packtime['dayofweek'] = packtime['start'].dt.dayofweek
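# Slice each app session by hour of day, calendar date and day of week;
# these are aggregated below into per-device usage-time profiles.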
#packtime['hour'] = pd.cut(packtime['hour'], bins=4).cat.codes
# average device usage time per day
dtime = packtime.groupby(['device_id', 'date'])['peroid'].agg('sum')
# usage-time share across different periods
qtime = packtime.groupby(['device_id', 'hour'])['peroid'].agg('sum')
wtime = packtime.groupby(['device_id', 'dayofweek'])['peroid'].agg('sum')
atime = packtime.groupby(['device_id', 'app'])['peroid'].agg('sum')
dapp = packtime[['device_id', 'date', 'app']].drop_duplicates().groupby(
['device_id', 'date'])['app'].agg(' '.join)
dapp = dapp.reset_index()
dapp['app_len'] = dapp['app'].apply(lambda x: x.split(' ')).apply(len)
dapp_stat = dapp.groupby('device_id')['app_len'].agg(
{'std': 'std', 'mean': 'mean', 'max': 'max'})
dapp_stat = dapp_stat.reset_index()
dapp_stat.columns = ['device_id', 'app_len_std', 'app_len_mean', 'app_len_max']
# dapp_stat.head()
dtime = dtime.reset_index()
dtime_stat = dtime.groupby(['device_id'])['peroid'].agg(
{'sum': 'sum', 'mean': 'mean', 'std': 'std', 'max': 'max'}).reset_index()
dtime_stat.columns = ['device_id', 'date_sum',
'date_mean', 'date_std', 'date_max']
# dtime_stat.head()
qtime = qtime.reset_index()
ftime = qtime.pivot(index='device_id', columns='hour',
values='peroid').fillna(0)
ftime.columns = ['h%s' % i for i in range(24)]
ftime.reset_index(inplace=True)
# ftime.head()
wtime = wtime.reset_index()
weektime = wtime.pivot(
index='device_id', columns='dayofweek', values='peroid').fillna(0)
weektime.columns = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']
weektime.reset_index(inplace=True)
# weektime.head()
atime = atime.reset_index()
app = atime.groupby(['device_id'])['peroid'].idxmax()
#dapp_stat.shape, dtime_stat.shape, ftime.shape, weektime.shape, app.shape
user = pd.merge(dapp_stat, dtime_stat, on='device_id', how='left')
user = pd.merge(user, ftime, on='device_id', how='left')
user = pd.merge(user, weektime, on='device_id', how='left')
user = pd.merge(user, atime.iloc[app], on='device_id', how='left')
app_cat = pd.read_table(path+'package_label.tsv',
names=['app', 'category', 'app_name'])
cat_enc = pd.DataFrame(app_cat['category'].value_counts())
cat_enc['idx'] = range(45)
app_cat['cat_enc'] = app_cat['category'].map(cat_enc['idx'])
app_cat.set_index(['app'], inplace=True)
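# Map each app to one of the 45 labelled categories; apps missing from
# package_label.tsv fall into an extra "unknown" bucket (code 45), so the
# pivots below produce 46 category columns.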
atime['app_cat_enc'] = atime['app'].map(app_cat['cat_enc']).fillna(45)
cat_num = atime.groupby(['device_id', 'app_cat_enc'])[
'app'].agg('count').reset_index()
cat_time = atime.groupby(['device_id', 'app_cat_enc'])[
'peroid'].agg('sum').reset_index()
app_cat_num = cat_num.pivot(
index='device_id', columns='app_cat_enc', values='app').fillna(0)
app_cat_num.columns = ['cat%s' % i for i in range(46)]
app_cat_time = cat_time.pivot(
index='device_id', columns='app_cat_enc', values='peroid').fillna(0)
app_cat_time.columns = ['time%s' % i for i in range(46)]
user = pd.merge(user, app_cat_num, on='device_id', how='left')
user = pd.merge(user, app_cat_time, on='device_id', how='left')
user.to_csv('data/user_behavior.csv', index=False)
================================================
FILE: chizhu/single_model/xgb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# get_ipython().run_line_magic('matplotlib', 'inline')
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
# path="/dev/shm/chizhu_data/data/"
# In[2]:
tfidf_feat=pd.read_csv("data/tfidf_classfiy.csv")
tf2=pd.read_csv("data/tfidf_classfiy_package.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
# In[3]:
train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
train = pd.merge(train_data,tf2,on="device_id",how="left")
test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
test = pd.merge(test_data,tf2,on="device_id",how="left")
# In[4]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age","label","app"]]
Y = train['sex'] - 1
# In[19]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
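# 5-fold OOF XGBoost binary model for sex (logloss objective) on the TF-IDF
# stacking features; the OOF and averaged test predictions feed the later
# sex * age probability product.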
params={
'booster':'gbtree',
'objective': 'binary:logistic',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "logloss",
'gamma':0.2,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[20]:
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
pred = model.predict(d_te)
pred_oob1[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub1 += model.predict(xgb.DMatrix(test[features]))/5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[21]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
# In[22]:
import gc
gc.collect()
# In[23]:
tfidf_feat=pd.read_csv("data/tfidf_age.csv")
tf2=pd.read_csv("data/pack_tfidf_age.csv")
train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
train = pd.merge(train_data,tf2,on="device_id",how="left")
test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
test = pd.merge(test_data,tf2,on="device_id",how="left")
features = [x for x in train.columns if x not in ['device_id',"age","sex","label","app"]]
Y = train['age']
# In[34]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params={
'booster':'gbtree',
'objective': 'multi:softprob',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "mlogloss",
'num_class':11,
'gamma':0.1,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[ ]:
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
pred = model.predict(d_te)
pred_oob2[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub2 += model.predict(xgb.DMatrix(test[features]))/5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[ ]:
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[ ]:
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
final_1=res2_1.copy()
final_2=res2_1.copy()
for i in range(11):
final_1[i]=res1['sex1']*res2_1[i]
final_2[i]=res1['sex2']*res2_1[i]
id_list=pd.concat([train[['device_id']],test[['device_id']]])
final=id_list
final.index=range(len(final))
final.columns= ['DeviceID']
final_pred = pd.concat([final_1,final_2],1)
final=pd.concat([final,final_pred],1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('submit/xgb_feat_chizhu.csv', index=False)
# In[ ]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("submit/xgb_chizhu.csv",index=False)
================================================
FILE: chizhu/single_model/xgb_nb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# get_ipython().run_line_magic('matplotlib', 'inline')
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
# path="/dev/shm/chizhu_data/data/"
# In[2]:
tfidf_feat=pd.read_csv("data/tfidf_classfiy.csv")
tf2=pd.read_csv("data/tfidf_classfiy_package.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
# In[4]:
train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
train = pd.merge(train_data,tf2,on="device_id",how="left")
test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
test = pd.merge(test_data,tf2,on="device_id",how="left")
# In[5]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age","label","app"]]
Y = train['sex'] - 1
# In[5]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params={
'booster':'gbtree',
'objective': 'binary:logistic',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "logloss",
'gamma':0.2,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[6]:
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
pred = model.predict(d_te)
pred_oob1[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub1 += model.predict(xgb.DMatrix(test[features]))/5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[7]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
# In[8]:
import gc
gc.collect()
# In[9]:
tfidf_feat=pd.read_csv("data/tfidf_age.csv")
tf2=pd.read_csv("data/pack_tfidf_age.csv")
train_data = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
train = pd.merge(train_data,tf2,on="device_id",how="left")
test_data = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
test = pd.merge(test_data,tf2,on="device_id",how="left")
# In[10]:
####sex1
test['sex']=1
# In[11]:
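# Conditional-probability ("nb") variant: 'sex' is kept in the age model's
# feature list, so the fold models learn P(age | sex, x). The test set is
# scored twice, with sex forced to 1 and then to 2, and each conditional
# age distribution is later multiplied by the matching sex probability.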
features = [x for x in train.columns if x not in ['device_id',"age","label","app"]]
Y = train['age']
# In[12]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params={
'booster':'gbtree',
'objective': 'multi:softprob',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "mlogloss",
'num_class':11,
'gamma':0.1,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[13]:
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
pred = model.predict(d_te)
pred_oob2[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub2 += model.predict(xgb.DMatrix(test[features]))/5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[14]:
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[ ]:
###sex2
test['sex']=2
features = [x for x in train.columns if x not in ['device_id',"age","label","app"]]
Y = train['age']
# In[ ]:
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
pred = model.predict(d_te)
pred_oob2[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub2 += model.predict(xgb.DMatrix(test[features]))/5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[ ]:
res2_2=np.vstack((pred_oob2,sub2))
res2_2 = pd.DataFrame(res2_2)
# In[ ]:
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
res2_2.index=range(len(res2_2))
final_1=res2_1.copy()
final_2=res2_2.copy()
for i in range(11):
final_1[i]=res1['sex1']*res2_1[i]
final_2[i]=res1['sex2']*res2_2[i]
id_list=pd.concat([train[['device_id']],test[['device_id']]])
final=id_list
final.index=range(len(final))
final.columns= ['DeviceID']
final_pred = pd.concat([final_1,final_2],1)
final=pd.concat([final,final_pred],1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('submit/xgb_feat_chizhu_nb.csv', index=False)
# In[ ]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("submit/xgb_chizhu_nb.csv",index=False)
================================================
FILE: chizhu/single_model/yg_best_nn.py
================================================
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
from category_encoders import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
from config import path
packages = pd.read_csv(path+'deviceid_packages.tsv',
sep='\t', names=['device_id', 'apps'])
test = pd.read_csv(path+'deviceid_test.tsv',
sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv',
sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv',
names=['device_id', 'vendor', 'version'])
behave = pd.read_csv('data/user_behavior.csv')
brand['phone_version'] = brand['vendor'] + ' ' + brand['version']
train = pd.merge(brand[['device_id', 'phone_version']],
train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'phone_version']],
test, on='device_id', how='right')
train = pd.merge(train, behave, on='device_id', how='left')
test = pd.merge(test, behave, on='device_id', how='left')
packages['app_lenghth'] = packages['apps'].apply(
lambda x: x.split(',')).apply(lambda x: len(x))
packages['app_list'] = packages['apps'].apply(lambda x: x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
embed_size = 128
fastmodel = Word2Vec(list(packages['app_list']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word]
for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns = ["fdim_%s" %
str(i) for i in range(embed_size)]+["app"]
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
Y_sex = train['sex']-1
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
if word not in fastmodel.wv.vocab:
continue
embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
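# Frozen embedding matrix: row i holds the 128-dim word2vec vector of the
# app with tokenizer index i; apps unseen by word2vec keep zero rows.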
X_h = train[['h%s' % i for i in range(24)]].values
X_h_test = test[['h%s' % i for i in range(24)]].values
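# AdamW (decoupled weight decay, Loshchilov & Hutter): identical to Adam
# except the weight-decay term is applied directly to the weights
# (the 'lr * wd * p' term in p_t below) instead of through the gradient.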
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
# decoupled weight decay (2/4)
self.wd = K.variable(weight_decay, name='weight_decay')
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
# decoupled weight decay (4/4)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def model_conv1D(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
hin = Input(shape=(24, ))
htime = Dense(6, activation='relu')(hin)
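# Note: the merge below concatenates four copies of gmp1a plus the hour
# features; the other conv poolings computed above are left unused.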
merge1 = concatenate([gmp1a, gmp1a, gmp1a, gmp1a, htime])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
model = Model(inputs=[seq, hin], outputs=pred)
model.compile(loss='binary_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
return model
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
print("FOLD | ", count+1)
filepath = "sex_weights_best_%d.h5" % count
checkpoint = ModelCheckpoint(
filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=6, verbose=1, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit([X_tr, X_tr2], y_tr, batch_size=256, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks, verbose=1, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict([X_test, X_h_test]))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict([X_vl, X_vl2])
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1 = pd.concat([oof_pref1, sub1])
res1['sex1'] = 1-res1['sex2']
# res1.to_csv("res1.csv", index=False)
def model_age_conv(embedding_matrix):
# The embedding layer containing the word vectors
K.clear_session()
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
merge1 = concatenate([gap1a, gap2a, gap3a, gap5a])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=seq, outputs=pred)
model.compile(loss='categorical_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
return model
Y_age = to_categorical(train['age'])
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
X_tr, X_vl, y_tr, y_vl = X[train_index], X[test_index], Y_age[train_index], Y_age[test_index]
model_age = model_age_conv(embedding_matrix)
hist = model_age.fit(X_tr, y_tr, batch_size=256, epochs=50, validation_data=(X_vl, y_vl),
callbacks=callbacks2, verbose=2, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict(X_vl)
sub2 += model_age.predict(X_test)/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
res2_1 = np.vstack((oof_pref2, sub2))
res2_1 = pd.DataFrame(res2_1)
# res2_1.to_csv("res2.csv", index=False)
res1.index = range(len(res1))
res2_1.index = range(len(res2_1))
final_1 = res2_1.copy()
final_2 = res2_1.copy()
for i in range(11):
final_1[i] = res1['sex1']*res2_1[i]
final_2[i] = res1['sex2']*res2_1[i]
id_list = pd.concat([train[['device_id']], test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1, final_2], 1)
final = pd.concat([final, final_pred], 1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('submit/yg_best_nn.csv', index=False)
================================================
FILE: chizhu/stacking/all_feat/xgb__nurbs_nb.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['xgb_final_nb.csv',\n",
" 'deviceid_train.tsv',\n",
" 'feat.csv.zip',\n",
" '.DS_Store',\n",
" 'thluo_train_best_feat.csv',\n",
" 'feat.csv',\n",
" 'xgb_feat_final_nb.csv',\n",
" 'xgb_nb.ipynb',\n",
" 'nurbs_feature_all.csv',\n",
" '.ipynb_checkpoints',\n",
" 'deviceid_test.tsv']"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from sklearn.decomposition import LatentDirichletAllocation\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"import lightgbm as lgb\n",
"from datetime import datetime,timedelta \n",
"import matplotlib.pyplot as plt\n",
"import time\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"%matplotlib inline\n",
"\n",
"#add\n",
"import gc\n",
"from sklearn import preprocessing\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"from scipy.sparse import hstack, vstack\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.model_selection import cross_val_score\n",
"# from skopt.space import Integer, Categorical, Real, Log10\n",
"# from skopt.utils import use_named_args\n",
"# from skopt import gp_minimize\n",
"from gensim.models import Word2Vec, FastText\n",
"import gensim \n",
"import re\n",
"import os\n",
"path=\"./\"\n",
"os.listdir(path)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train_id=pd.read_csv(\"deviceid_train.tsv\",sep=\"\\t\",names=['device_id','sex','age'])\n",
"test_id=pd.read_csv(\"deviceid_test.tsv\",sep=\"\\t\",names=['device_id'])\n",
"all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]])\n",
"nurbs=pd.read_csv(\"nurbs_feature_all.csv\")\n",
"nurbs.columns=[\"nurbs_\"+str(i) for i in nurbs.columns]\n",
"all_id.index=range(len(all_id))\n",
"nurbs['device_id']=all_id"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"th=pd.read_csv(\"thluo_train_best_feat.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"feat=pd.merge(th,nurbs,on=\"device_id\",how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"feat.to_csv(\"feat.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"train=pd.merge(train_id,feat,on=\"device_id\",how=\"left\")\n",
"test=pd.merge(test_id,feat,on=\"device_id\",how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"features = [x for x in train.columns if x not in ['device_id', 'sex',\"age\",]]\n",
"Y = train['sex'] - 1"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/chizhu/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n"
]
}
],
"source": [
"\n",
"import xgboost as xgb\n",
"from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score\n",
"from sklearn.cross_validation import StratifiedKFold\n",
"\n",
"kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)\n",
"params={\n",
"\t'booster':'gbtree',\n",
"\t'objective': 'binary:logistic',\n",
"# 'is_unbalance':'True',\n",
"# \t'scale_pos_weight': 1500.0/13458.0,\n",
" 'eval_metric': \"logloss\",\n",
" \n",
"\t'gamma':0.2,#0.2 is ok\n",
"\t'max_depth':6,\n",
"# \t'lambda':20,\n",
" # \"alpha\":5,\n",
" 'subsample':0.7,\n",
" 'colsample_bytree':0.4 ,\n",
"# 'min_child_weight':2.5, \n",
" 'eta': 0.01,\n",
" # 'learning_rate':0.01,\n",
" \"silent\":1,\n",
"\t'seed':1024,\n",
"\t'nthread':12,\n",
" \n",
" }\n",
"num_round = 3500\n",
"early_stopping_rounds = 100"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-logloss:0.691359\tval-logloss:0.691488\n",
"Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.\n",
"\n",
"Will train until val-logloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-logloss:0.566693\tval-logloss:0.595722\n",
"[400]\ttrain-logloss:0.53806\tval-logloss:0.590461\n",
"[600]\ttrain-logloss:0.519054\tval-logloss:0.590032\n",
"Stopping. Best iteration:\n",
"[529]\ttrain-logloss:0.525748\tval-logloss:0.589953\n",
"\n",
"idx: 0\n",
" loss: 0.59015\n",
"[0]\ttrain-logloss:0.691215\tval-logloss:0.691369\n",
"Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.\n",
"\n",
"Will train until val-logloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-logloss:0.56648\tval-logloss:0.596397\n",
"[400]\ttrain-logloss:0.538516\tval-logloss:0.591125\n",
"[600]\ttrain-logloss:0.51823\tval-logloss:0.590809\n",
"Stopping. Best iteration:\n",
"[595]\ttrain-logloss:0.518718\tval-logloss:0.590732\n",
"\n",
"idx: 1\n",
" loss: 0.59099\n",
"[0]\ttrain-logloss:0.691228\tval-logloss:0.69143\n",
"Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.\n",
"\n",
"Will train until val-logloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-logloss:0.566822\tval-logloss:0.596484\n",
"[400]\ttrain-logloss:0.538456\tval-logloss:0.591576\n",
"[600]\ttrain-logloss:0.518551\tval-logloss:0.590934\n",
"Stopping. Best iteration:\n",
"[641]\ttrain-logloss:0.514957\tval-logloss:0.590818\n",
"\n",
"idx: 2\n",
" loss: 0.59091\n",
"[0]\ttrain-logloss:0.691224\tval-logloss:0.691404\n",
"Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.\n",
"\n",
"Will train until val-logloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-logloss:0.565394\tval-logloss:0.598566\n",
"[400]\ttrain-logloss:0.536792\tval-logloss:0.594022\n",
"Stopping. Best iteration:\n",
"[458]\ttrain-logloss:0.531227\tval-logloss:0.593837\n",
"\n",
"idx: 3\n",
" loss: 0.59396\n",
"[0]\ttrain-logloss:0.691344\tval-logloss:0.691511\n",
"Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.\n",
"\n",
"Will train until val-logloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-logloss:0.566356\tval-logloss:0.595648\n",
"[400]\ttrain-logloss:0.537421\tval-logloss:0.591302\n",
"[600]\ttrain-logloss:0.518249\tval-logloss:0.591042\n",
"Stopping. Best iteration:\n",
"[525]\ttrain-logloss:0.525041\tval-logloss:0.590956\n",
"\n",
"idx: 4\n",
" loss: 0.59108\n",
"mean\n",
"auc: 0.5914183145833928\n"
]
}
],
"source": [
"aus = []\n",
"sub1 = np.zeros((len(test), ))\n",
"pred_oob1=np.zeros((len(train),))\n",
"for i,(train_index,test_index) in enumerate(kf):\n",
" \n",
" tr_x = train[features].reindex(index=train_index, copy=False)\n",
" tr_y = Y[train_index]\n",
" te_x = train[features].reindex(index=test_index, copy=False)\n",
" te_y = Y[test_index]\n",
"\n",
" # tr_y=tr_y.apply(lambda x:1 if x>0 else 0)\n",
" # te_y=te_y.apply(lambda x:1 if x>0 else 0)\n",
" d_tr = xgb.DMatrix(tr_x, label=tr_y)\n",
" d_te = xgb.DMatrix(te_x, label=te_y)\n",
" watchlist = [(d_tr,'train'),\n",
" (d_te,'val')\n",
" ]\n",
" model = xgb.train(params, d_tr, num_boost_round=5500, \n",
" evals=watchlist,verbose_eval=200,\n",
" early_stopping_rounds=100)\n",
" pred = model.predict(d_te)\n",
" pred_oob1[test_index] =pred\n",
" # te_y=te_y.apply(lambda x:1 if x>0 else 0)\n",
" a = log_loss(te_y, pred)\n",
"\n",
" sub1 += model.predict(xgb.DMatrix(test[features]))/5\n",
" \n",
"\n",
" print (\"idx: \", i) \n",
" print (\" loss: %.5f\" % a)\n",
"# print \" gini: %.5f\" % g\n",
" aus.append(a)\n",
"\n",
"print (\"mean\")\n",
"print (\"auc: %s\" % (sum(aus) / 5.0))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])\n",
"sub1 = pd.DataFrame(sub1, columns=['sex2'])\n",
"res1=pd.concat([pred_oob1,sub1])\n",
"res1['sex1'] = 1-res1['sex2']"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1012"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import gc\n",
"gc.collect()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"####sex1\n",
"test['sex']=1"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"features = [x for x in train.columns if x not in ['device_id',\"age\"]]\n",
"Y = train['age'] "
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"import xgboost as xgb\n",
"from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score\n",
"from sklearn.cross_validation import StratifiedKFold\n",
"\n",
"kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)\n",
"params={\n",
"\t'booster':'gbtree',\n",
"\t'objective': 'multi:softprob',\n",
"# 'is_unbalance':'True',\n",
"# \t'scale_pos_weight': 1500.0/13458.0,\n",
" 'eval_metric': \"mlogloss\",\n",
" 'num_class':11,\n",
"\t'gamma':0.1,#0.2 is ok\n",
"\t'max_depth':6,\n",
"# \t'lambda':20,\n",
" # \"alpha\":5,\n",
" 'subsample':0.7,\n",
" 'colsample_bytree':0.4 ,\n",
" # 'min_child_weight':2.5, \n",
" 'eta': 0.01,\n",
" # 'learning_rate':0.01,\n",
" \"silent\":1,\n",
"\t'seed':1024,\n",
"\t'nthread':12,\n",
" \n",
" }\n",
"num_round = 3500\n",
"early_stopping_rounds = 100"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\ttrain-mlogloss:2.39131\tval-mlogloss:2.39264\n",
"Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.\n",
"\n",
"Will train until val-mlogloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-mlogloss:1.80941\tval-mlogloss:2.00508\n",
"[400]\ttrain-mlogloss:1.60383\tval-mlogloss:1.94\n",
"[600]\ttrain-mlogloss:1.472\tval-mlogloss:1.9241\n",
"[800]\ttrain-mlogloss:1.36689\tval-mlogloss:1.92024\n",
"[1000]\ttrain-mlogloss:1.273\tval-mlogloss:1.91999\n",
"Stopping. Best iteration:\n",
"[918]\ttrain-mlogloss:1.31045\tval-mlogloss:1.91983\n",
"\n",
"idx: 0\n",
" loss: 1.91985\n",
"[0]\ttrain-mlogloss:2.39114\tval-mlogloss:2.39277\n",
"Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.\n",
"\n",
"Will train until val-mlogloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-mlogloss:1.8078\tval-mlogloss:2.0115\n",
"[400]\ttrain-mlogloss:1.60116\tval-mlogloss:1.9457\n",
"[600]\ttrain-mlogloss:1.46953\tval-mlogloss:1.93011\n",
"[800]\ttrain-mlogloss:1.36553\tval-mlogloss:1.92647\n",
"Stopping. Best iteration:\n",
"[825]\ttrain-mlogloss:1.35318\tval-mlogloss:1.92626\n",
"\n",
"idx: 1\n",
" loss: 1.92627\n",
"[0]\ttrain-mlogloss:2.39122\tval-mlogloss:2.3928\n",
"Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.\n",
"\n",
"Will train until val-mlogloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-mlogloss:1.8065\tval-mlogloss:2.01298\n",
"[400]\ttrain-mlogloss:1.60091\tval-mlogloss:1.94872\n",
"[600]\ttrain-mlogloss:1.4685\tval-mlogloss:1.93313\n",
"[800]\ttrain-mlogloss:1.36383\tval-mlogloss:1.92927\n",
"Stopping. Best iteration:\n",
"[899]\ttrain-mlogloss:1.3168\tval-mlogloss:1.92877\n",
"\n",
"idx: 2\n",
" loss: 1.92879\n",
"[0]\ttrain-mlogloss:2.39105\tval-mlogloss:2.39257\n",
"Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.\n",
"\n",
"Will train until val-mlogloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-mlogloss:1.80767\tval-mlogloss:2.01163\n",
"[400]\ttrain-mlogloss:1.6018\tval-mlogloss:1.94808\n",
"[600]\ttrain-mlogloss:1.47112\tval-mlogloss:1.93282\n",
"[800]\ttrain-mlogloss:1.36743\tval-mlogloss:1.92918\n",
"[1000]\ttrain-mlogloss:1.27495\tval-mlogloss:1.92918\n",
"Stopping. Best iteration:\n",
"[953]\ttrain-mlogloss:1.29641\tval-mlogloss:1.92904\n",
"\n",
"idx: 3\n",
" loss: 1.92906\n",
"[0]\ttrain-mlogloss:2.39143\tval-mlogloss:2.39284\n",
"Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.\n",
"\n",
"Will train until val-mlogloss hasn't improved in 100 rounds.\n",
"[200]\ttrain-mlogloss:1.81054\tval-mlogloss:2.00446\n",
"[400]\ttrain-mlogloss:1.6046\tval-mlogloss:1.93723\n",
"[600]\ttrain-mlogloss:1.47282\tval-mlogloss:1.92063\n",
"[800]\ttrain-mlogloss:1.36819\tval-mlogloss:1.91661\n",
"[1000]\ttrain-mlogloss:1.27547\tval-mlogloss:1.91579\n",
"Stopping. Best iteration:\n",
"[1014]\ttrain-mlogloss:1.26898\tval-mlogloss:1.91575\n",
"\n",
"idx: 4\n",
" loss: 1.91579\n",
"mean\n",
"auc: 1.923953299949125\n"
]
}
],
"source": [
"aus = []\n",
"sub2 = np.zeros((len(test),11 ))\n",
"pred_oob2=np.zeros((len(train),11))\n",
"models=[]\n",
"iters=[]\n",
"for i,(train_index,test_index) in enumerate(kf):\n",
" \n",
" tr_x = train[features].reindex(index=train_index, copy=False)\n",
" tr_y = Y[train_index]\n",
" te_x = train[features].reindex(index=test_index, copy=False)\n",
" te_y = Y[test_index]\n",
"\n",
" # tr_y=tr_y.apply(lambda x:1 if x>0 else 0)\n",
" # te_y=te_y.apply(lambda x:1 if x>0 else 0)\n",
" d_tr = xgb.DMatrix(tr_x, label=tr_y)\n",
" d_te = xgb.DMatrix(te_x, label=te_y)\n",
" watchlist = [(d_tr,'train'),\n",
" (d_te,'val')\n",
" ]\n",
" model = xgb.train(params, d_tr, num_boost_round=5500, \n",
" evals=watchlist,verbose_eval=200,\n",
" early_stopping_rounds=100)\n",
" models.append(model)\n",
" iters.append(model.best_iteration)\n",
" pred = model.predict(d_te,ntree_limit=model.best_iteration)\n",
" pred_oob2[test_index] =pred\n",
" # te_y=te_y.apply(lambda x:1 if x>0 else 0)\n",
" a = log_loss(te_y, pred)\n",
"\n",
" sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/5\n",
" \n",
"\n",
" print (\"idx: \", i) \n",
" print (\" loss: %.5f\" % a)\n",
"# print \" gini: %.5f\" % g\n",
" aus.append(a)\n",
"\n",
"print (\"mean\")\n",
"print (\"auc: %s\" % (sum(aus) / 5.0))"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"res2_1=np.vstack((pred_oob2,sub2))\n",
"res2_1 = pd.DataFrame(res2_1)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"###sex2\n",
"test['sex']=2\n",
"features = [x for x in train.columns if x not in ['device_id',\"age\",\"label\",\"app\"]]\n",
"Y = train['age'] "
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"aus = []\n",
"sub2 = np.zeros((len(test),11 ))\n",
"for model,it in zip(models,iters):\n",
" sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=it)/5\n",
"res2_2=np.vstack((pred_oob2,sub2))\n",
"res2_2 = pd.DataFrame(res2_2) "
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"res1.index=range(len(res1))\n",
"res2_1.index=range(len(res2_1))\n",
"res2_2.index=range(len(res2_2))\n",
"final_1=res2_1.copy()\n",
"final_2=res2_2.copy()\n",
"for i in range(11):\n",
" final_1[i]=res1['sex1']*res2_1[i]\n",
" final_2[i]=res1['sex2']*res2_2[i]\n",
"id_list=pd.concat([train[['device_id']],test[['device_id']]])\n",
"final=id_list\n",
"final.index=range(len(final))\n",
"final.columns= ['DeviceID']\n",
"final_pred = pd.concat([final_1,final_2],1)\n",
"final=pd.concat([final,final_pred],1)\n",
"final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', \n",
" '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', \n",
" '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']\n",
"\n",
"final.to_csv('xgb_feat_final_nb.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"test['DeviceID']=test['device_id']\n",
"sub=pd.merge(test[['DeviceID']],final,on=\"DeviceID\",how=\"left\")\n",
"sub.to_csv(\"xgb_final_nb.csv\",index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
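Both this notebook and the scripts below assemble the 22 submission columns the same way: a binary model supplies P(sex), an 11-class age model, scored twice with the candidate sex written into the test features, supplies P(age | sex), and the product fills the '1-0' ... '2-10' columns. A minimal, self-contained sketch of just that combination step, with random numbers standing in for the real model outputs:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 4  # pretend there are 4 devices

# stand-ins for the model outputs (made-up numbers, not real predictions)
p_sex2 = rng.uniform(size=n)                    # binary model: P(sex=2)
p_sex = np.column_stack([1 - p_sex2, p_sex2])   # columns: P(sex=1), P(sex=2)

def random_probs(n, k):
    # random rows that sum to 1, mimicking softmax output
    m = rng.uniform(size=(n, k))
    return m / m.sum(axis=1, keepdims=True)

p_age_if_sex1 = random_probs(n, 11)  # age model scored with the sex feature set to 1
p_age_if_sex2 = random_probs(n, 11)  # same fold models scored with sex set to 2

# P(sex=s, age=a) = P(sex=s) * P(age=a | sex=s)
joint = np.hstack([p_sex[:, [0]] * p_age_if_sex1,
                   p_sex[:, [1]] * p_age_if_sex2])
cols = ['%d-%d' % (s, a) for s in (1, 2) for a in range(11)]
sub = pd.DataFrame(joint, columns=cols)
print(sub.sum(axis=1))  # every row sums to 1, as a joint distribution should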
================================================
FILE: chizhu/stacking/nurbs_feat/xgb_22.py
================================================
# coding: utf-8
# In[2]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# get_ipython().run_line_magic('matplotlib', 'inline')
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
import os
path="./feature/"###nurbs概率文件路径
o_path="/dev/shm/chizhu_data/data/"###原始文件路径
os.listdir(path)
# In[4]:
all_feat=pd.read_csv(path+"feature_22_all.csv")
train_id=pd.read_csv(o_path+"deviceid_train.tsv",sep="\t",names=['device_id','sex','age'])
test_id=pd.read_csv(o_path+"deviceid_test.tsv",sep="\t",names=['device_id'])
all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]])
all_id.index=range(len(all_id))
all_feat['device_id']=all_id
# deepnn_feat=pd.read_csv(path+"deepnn_fix.csv")
# deepnn_feat['device_id']=deepnn_feat['DeviceID']
# del deepnn_feat['DeviceID']
# In[9]:
train=pd.merge(train_id,all_feat,on="device_id",how="left")
# train=pd.merge(train,deepnn_feat,on="device_id",how="left")
test=pd.merge(test_id,all_feat,on="device_id",how="left")
# test=pd.merge(test,deepnn_feat,on="device_id",how="left")
# In[10]:
train['sex-age']=train.apply(lambda x:str(x['sex'])+"-"+str(x['age']),1)
# In[11]:
features = [x for x in train.columns if x not in ['device_id',"sex",'age','sex-age']]
label="sex-age"
# In[12]:
Y_CAT=pd.Categorical(train[label])
# In[13]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y_CAT, n_folds=5, shuffle=True, random_state=1024)
params={
'booster':'gbtree',
"tree_method":"gpu_hist",
"gpu_id":"1",
'objective': 'multi:softprob',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "mlogloss",
'num_class':22,
'gamma':0.1,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[14]:
aus = []
sub2 = np.zeros((len(test),22 ))
pred_oob2=np.zeros((len(train),22))
models=[]
iters=[]
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y_CAT.codes[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y_CAT.codes[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
models.append(model)
iters.append(model.best_iteration)
pred = model.predict(d_te,ntree_limit=model.best_iteration)
pred_oob2[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/5
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("loss: %s" % (sum(aus) / 5.0))
# In[15]:
res=np.vstack((pred_oob2,sub2))
res = pd.DataFrame(res,columns=Y_CAT.categories)
res['DeviceID']=all_id
res=res[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
res.to_csv("xgb_nurbs_22_feat.csv",index=False)
# In[16]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],res,on="DeviceID",how="left")
sub.to_csv("xgb_nurbs_22.csv",index=False)
================================================
FILE: chizhu/stacking/nurbs_feat/xgb__nurbs_nb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# get_ipython().run_line_magic('matplotlib', 'inline')  # notebook magic; commented out so this runs as a plain script
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
import os
path="./feature/"##nurbs概率文件路径
o_path="/dev/shm/chizhu_data/data/"###原始文件路径
os.listdir(path)
# In[2]:
sex_feat=pd.read_csv(path+"feature_sex_all.csv")
age_feat=pd.read_csv(path+"feature_age_all.csv")
# all_feat=pd.read_csv(path+"feature_22_all.csv")
train_id=pd.read_csv(o_path+"deviceid_train.tsv",sep="\t",names=['device_id','sex','age'])
test_id=pd.read_csv(o_path+"deviceid_test.tsv",sep="\t",names=['device_id'])
all_id=pd.concat([train_id[['device_id']],test_id[['device_id']]])
all_id.index=range(len(all_id))
sex_feat['device_id']=all_id
age_feat['device_id']=all_id
# deepnn_feat=pd.read_csv(path+"deepnn_fix.csv")
# deepnn_feat['device_id']=deepnn_feat['DeviceID']
# del deepnn_feat['DeviceID']
# In[3]:
train=pd.merge(train_id,sex_feat,on="device_id",how="left")
# train=pd.merge(train,deepnn_feat,on="device_id",how="left")
test=pd.merge(test_id,sex_feat,on="device_id",how="left")
# test=pd.merge(test,deepnn_feat,on="device_id",how="left")
# In[4]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age",]]
Y = train['sex'] - 1
# In[5]:
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=10, shuffle=True, random_state=1024)
params={
'booster':'gbtree',
"tree_method":"gpu_hist",
"gpu_id":"2",
'objective': 'binary:logistic',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "logloss",
'gamma':0.2,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[6]:
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
pred = model.predict(d_te,ntree_limit=model.best_iteration)
pred_oob1[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub1 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/10
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 10.0))
# In[7]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
# In[8]:
import gc
gc.collect()
# In[9]:
train=pd.merge(train_id,age_feat,on="device_id",how="left")
# train=pd.merge(train,deepnn_feat,on="device_id",how="left")
test=pd.merge(test_id,age_feat,on="device_id",how="left")
# test=pd.merge(test,deepnn_feat,on="device_id",how="left")
# In[10]:
#### sex = 1: score the age models with the sex feature set to 1
test['sex']=1
# In[11]:
features = [x for x in train.columns if x not in ['device_id',"age"]]
Y = train['age']
# In[12]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=10, shuffle=True, random_state=1024)
params={
'booster':'gbtree',
"tree_method":"gpu_hist",
"gpu_id":"2",
'objective': 'multi:softprob',
# 'is_unbalance':'True',
# 'scale_pos_weight': 1500.0/13458.0,
'eval_metric': "mlogloss",
'num_class':11,
'gamma':0.1,#0.2 is ok
'max_depth':6,
# 'lambda':20,
# "alpha":5,
'subsample':0.7,
'colsample_bytree':0.4 ,
# 'min_child_weight':2.5,
'eta': 0.01,
# 'learning_rate':0.01,
"silent":1,
'seed':1024,
'nthread':12,
}
num_round = 3500
early_stopping_rounds = 100
# In[13]:
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
models=[]
iters=[]
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
# tr_y=tr_y.apply(lambda x:1 if x>0 else 0)
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
d_tr = xgb.DMatrix(tr_x, label=tr_y)
d_te = xgb.DMatrix(te_x, label=te_y)
watchlist = [(d_tr,'train'),
(d_te,'val')
]
model = xgb.train(params, d_tr, num_boost_round=5500,
evals=watchlist,verbose_eval=200,
early_stopping_rounds=100)
models.append(model)
iters.append(model.best_iteration)
pred = model.predict(d_te,ntree_limit=model.best_iteration)
pred_oob2[test_index] =pred
# te_y=te_y.apply(lambda x:1 if x>0 else 0)
a = log_loss(te_y, pred)
sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=model.best_iteration)/10
print ("idx: ", i)
print (" loss: %.5f" % a)
# print " gini: %.5f" % g
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 10.0))
# In[14]:
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[15]:
### sex = 2: rescore the same fold models with the sex feature set to 2
test['sex']=2
features = [x for x in train.columns if x not in ['device_id',"age"]]
Y = train['age']
# In[16]:
aus = []
sub2 = np.zeros((len(test),11 ))
for model,it in zip(models,iters):
sub2 += model.predict(xgb.DMatrix(test[features]),ntree_limit=it)/10
res2_2=np.vstack((pred_oob2,sub2))
res2_2 = pd.DataFrame(res2_2)
# In[17]:
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
res2_2.index=range(len(res2_2))
final_1=res2_1.copy()
final_2=res2_2.copy()
for i in range(11):
final_1[i]=res1['sex1']*res2_1[i]
final_2[i]=res1['sex2']*res2_2[i]
id_list=pd.concat([train[['device_id']],test[['device_id']]])
final=id_list
final.index=range(len(final))
final.columns= ['DeviceID']
final_pred = pd.concat([final_1,final_2],1)
final=pd.concat([final,final_pred],1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('xgb_feat_nurbs_nb_10fold.csv', index=False)
# In[18]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("xgb_nurbs_nb_10fold.csv",index=False)
================================================
FILE: chizhu/util/bagging.py
================================================
import os
import pandas as pd
path = "/Users/chizhu/data/competition_data/易观/"
os.listdir(path)
train = pd.read_csv(path+"deviceid_train.tsv", sep="\t",
names=["id", "sex", "age"])
test = pd.read_csv(path+"deviceid_test.tsv", sep="\t", names=['DeviceID'])
pred = pd.read_csv(path+"nn_feat_v6.csv")
lgb1 = pd.read_csv(path+"th_results_ems_22_nb_5400.csv") # 576
lgb1 = pd.merge(test, lgb1, on="DeviceID", how="left")
submit = lgb1.copy()
nn1 = pd.read_csv(path+"xgb_and_nurbs.csv") # 573
nn1 = pd.merge(test, nn1, on="DeviceID", how="left")
# nn2=pd.read_csv(path+"th_results_ems_2547.csv")##574
# nn2=pd.merge(test,nn2,on="DeviceID",how="left")
# lgb2=pd.read_csv(path+"th_results_ems_2.549.csv")##570
# lgb2=pd.merge(test,lgb2,on="DeviceID",how="left")
# lgb3=pd.read_csv(path+"th_results_ems_2547.csv")##547
# lgb3=pd.merge(test,lgb3,on="DeviceID",how="left")
for i in['1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']:
# submit[i]=(lgb1[i]+lgb2[i]+nn1[i]+nn2[i])/4.0
submit[i] = 0.75*lgb1[i]+0.25*nn1[i]
# submit[i]=0.1*lgb1[i]+0.1*nn1[i]+0.2*nn2[i]+0.2*lgb2[i]+0.4*lgb3[i]
submit.to_csv(path+"th_nurbs_7525.csv", index=False)
================================================
FILE: chizhu/util/get_nn_res.py
================================================
import pandas as pd
path = "/Users/chizhu/data/competition_data/易观/"
res1 = pd.read_csv(path+"res1.csv")
res2_1 = pd.read_csv(path+"res2_1.csv")
res2_2 = pd.read_csv(path+"res2_2.csv")
res1.index = range(len(res1))
res2_1.index = range(len(res2_1))
res2_2.index = range(len(res2_2))
final_1 = res2_1.copy()
final_2 = res2_2.copy()
for i in range(11):
final_1[str(i)] = res1['sex1']*res2_1[str(i)]
final_2[str(i)] = res1['sex2']*res2_2[str(i)]
pred = pd.read_csv(path+"nn_feat_v6.csv")  # read first: it supplies the DeviceID order (train ids, then test ids)
id_list = pred[['DeviceID']]
final = id_list.copy()
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1, final_2], 1)
final = pd.concat([final, final_pred], 1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv(path+'nn_feat_v12.csv', index=False)
train = pd.read_csv(path+"deviceid_train.tsv", sep="\t",
names=["id", "sex", "age"])
test = pd.read_csv(path+"deviceid_test.tsv", sep="\t", names=['DeviceID'])
pred = pd.read_csv(path+"nn_feat_v6.csv")
sub = pd.merge(test, pred, on="DeviceID", how="left")
sub.to_csv(path+"nn_v6.csv", index=False)
================================================
FILE: linwangli/code/lgb_allfeat_22.py
================================================
#!/usr/bin/env python
# coding: utf-8
from catboost import Pool, CatBoostClassifier, cv
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10  # unused; commented out so the script runs without skopt
# from skopt.utils import use_named_args
# from skopt import gp_minimize
import re
train = pd.read_csv('../dataset/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
all_feat = pd.read_csv('../dataset/all_feat.csv')
train['label'] = train['sex'].astype(str) + '-' + train['age'].astype(str)
label_le = preprocessing.LabelEncoder()
train['label'] = label_le.fit_transform(train['label'])
data_all = pd.merge(left=all_feat, right=train, on='device_id', how='left')
train = data_all[:50000]
test = data_all[50000:]
train = train.fillna(-1)
test = test.fillna(-1)
del data_all
gc.collect()
use_feats = all_feat.columns[1:]
use_feats
X_train = train[use_feats]
X_test = test[use_feats]
Y = train['label']
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
sub = np.zeros((X_test.shape[0], 22))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=[-1])
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'metric': {'multi_logloss'},
'num_class':22,
'objective':'multiclass',
'num_leaves':7,
'subsample': 0.9,
'colsample_bytree': 0.2,
'lambda_l1':0.0001,
'lambda_l2':0.00111,
'subsample_freq':12,
'learning_rate': 0.012,
'min_child_weight':12
}
model = lgb.train(params,
dtrain,
num_boost_round=6000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
sub += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
sub = pd.DataFrame(sub)
cols = [x for x in range(0, 22)]
cols = label_le.inverse_transform(cols)
sub.columns = cols
sub['DeviceID'] = test['device_id'].values
sub = sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
sub.to_csv('lgb_22.csv', index=False)
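The same lexicographic pitfall applies to LabelEncoder here: '1-10' sorts before '1-2', so the inverse_transform call above is what maps column i of predict_proba back to its class name. A quick sketch:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(['%d-%d' % (s, a) for s in (1, 2) for a in range(11)])
print(list(le.classes_[:4]))               # ['1-0', '1-1', '1-10', '1-2']
print(le.inverse_transform([0, 1, 2, 3]))  # column i of predict_proba belongs to class le.classes_[i]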
================================================
FILE: linwangli/code/lgb_allfeat_condProb.py
================================================
#!/usr/bin/env python
# coding: utf-8
from catboost import Pool, CatBoostClassifier, cv
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10  # unused; commented out so the script runs without skopt
# from skopt.utils import use_named_args
# from skopt import gp_minimize
import re
# Load the data
train = pd.read_csv('../dataset/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
all_feat = pd.read_csv('../dataset/all_feat.csv')
data_all = pd.merge(left=all_feat, right=train, on='device_id', how='left')
train = data_all[:50000]
test = data_all[50000:]
train = train.fillna(-1)
test = test.fillna(-1)
del data_all
gc.collect()
use_feats = all_feat.columns[1:]
use_feats
# P(sex)
Y = train['sex'] - 1
X_train = train[use_feats]
X_test = test[use_feats]
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
oof_preds1 = np.zeros((X_train.shape[0], ))
sub1 = np.zeros((X_test.shape[0], ))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'objective':'binary',
'num_leaves':31,
'subsample': 0.85,
'colsample_bytree': 0.2,
'lambda_l1':0.00007995302080034896,
'lambda_l2':0.0003648648811380991,
'subsample_freq':12,
'learning_rate': 0.012,
'min_child_weight':5.5
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds1[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub1 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# P(age | sex = 1)
train['sex_pred'] = train['sex']
test['sex_pred'] = 1
use_feats = list(train.columns[1:-3])
use_feats = use_feats + ['sex_pred']
X_train = train[use_feats]
X_test = test[use_feats]
Y = train['age']
kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
oof_preds2_1 = np.zeros((X_train.shape[0], 11))
sub2_1 = np.zeros((X_test.shape[0], 11))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'metric': {'multi_logloss'},
'num_class':11,
'objective':'multiclass',
'num_leaves':31,
'subsample': 0.9,
'colsample_bytree': 0.2,
'lambda_l1':0.0001,
'lambda_l2':0.00111,
'subsample_freq':10,
'learning_rate': 0.012,
'min_child_weight':10
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds2_1[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub2_1 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# P(age|sex = 2)
train['sex_pred'] = train['sex']
test['sex_pred'] = 2
use_feats = list(train.columns[1:-3])
use_feats = use_feats + ['sex_pred']
X_train = train[use_feats]
X_test = test[use_feats]
Y = train['age']
kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
oof_preds2_2 = np.zeros((X_train.shape[0], 11))
sub2_2 = np.zeros((X_test.shape[0], 11))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train.iloc[train_index], X_train.iloc[test_index], Y.iloc[train_index], Y.iloc[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'metric': {'multi_logloss'},
'num_class':11,
'objective':'multiclass',
'num_leaves':31,
'subsample': 0.9,
'colsample_bytree': 0.2,
'lambda_l1':0.0001,
'lambda_l2':0.00111,
'subsample_freq':10,
'learning_rate': 0.012,
'min_child_weight':10
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds2_2[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub2_2 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# Save the test-set predictions
sub1 = pd.DataFrame(sub1, columns=['sex2'])
sub1['sex1'] = 1-sub1['sex2']
sub2_1 = pd.DataFrame(sub2_1, columns=['age%s'%i for i in range(11)])
sub2_2 = pd.DataFrame(sub2_2, columns=['age%s'%i for i in range(11)])
sub = pd.DataFrame(test['device_id'].values, columns=['DeviceID'])
# pair each sex with its own conditional age distribution (mirrors the OOF block below)
for j in ['age%s'%i for i in range(11)]:
    sub['sex1_'+j] = sub1['sex1'] * sub2_1[j]
for j in ['age%s'%i for i in range(11)]:
    sub['sex2_'+j] = sub1['sex2'] * sub2_2[j]
sub.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
               '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
               '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
sub.to_csv('test_pred.csv', index=False)
# Save the out-of-fold predictions for the training set
oof_preds1 = pd.DataFrame(oof_preds1, columns=['sex2'])
oof_preds1['sex1'] = 1-oof_preds1['sex2']
oof_preds2_1 = pd.DataFrame(oof_preds2_1, columns=['age%s'%i for i in range(11)])
oof_preds2_2 = pd.DataFrame(oof_preds2_2, columns=['age%s'%i for i in range(11)])
oof_preds = train[['device_id']]
oof_preds.columns = ['DeviceID']
for i in ['age%s'%i for i in range(11)]:
oof_preds['sex1_'+i] = oof_preds1['sex1'] * oof_preds2_1[i]
for i in ['age%s'%i for i in range(11)]:
oof_preds['sex2_'+i] = oof_preds1['sex2'] * oof_preds2_2[i]
oof_preds.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
oof_preds.to_csv('train_pred.csv', index=False)
================================================
FILE: linwangli/code/utils.py
================================================
import pandas as pd
import numpy as np
def weights_ensemble(results, weights):
    '''
    Weighted model-ensembling helper written for this competition.
    results: list of paths of the result files to blend
    weights: list of weights, one per result
    return: a result that can be written with to_csv and submitted directly
    '''
for i in range(len(results)):
if i == 0:
sub = pd.read_csv(results[0])
final_cols = list(sub.columns)
cols = list(sub.columns)
cols[1:] = [col + '_0' for col in cols[1:]]
sub.columns = cols
else:
result = pd.read_csv(results[i])
cols = list(result.columns)
cols[1:] = [col + '_' + str(i) for col in cols[1:]]
result.columns = cols
sub = pd.merge(left=sub, right=result, on='DeviceID')
for i in range(len(weights)):
for col in final_cols[1:]:
if col not in sub.columns:
sub[col] = weights[i] * sub[col + '_' + str(i)]
else:
sub[col] = sub[col] + weights[i] * sub[col + '_' + str(i)]
sub = sub[final_cols]
return sub
def result_corr(path1, path2):
    '''
    Measures the correlation between two submission files, written for this competition.
    path1: path of result 1
    path2: path of result 2
    return: the mean column-wise correlation between the two results
    '''
result_1 = pd.read_csv(path1)
result_2 = pd.read_csv(path2)
result = pd.merge(left=result_1, right=result_2, on='DeviceID', suffixes=('_x', '_y'))
cols = result_1.columns[1:]
col_list = []
for col in cols:
col_pair = [col + '_x', col + '_y']
col_list.append(result[col_pair].corr().loc[col + '_x', col + '_y'])
return np.mean(col_list)
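A short usage sketch of the two helpers above; lgb_22.csv and test_pred.csv are the outputs of the two scripts in this folder, while blend.csv and the 0.6/0.4 weights are illustrative placeholders, not the team's tuned values:

from utils import weights_ensemble, result_corr

results = ['lgb_22.csv', 'test_pred.csv']  # produced by the two scripts above

# check how correlated the two submissions are first; the closer to 1,
# the less a blend can add
print(result_corr(results[0], results[1]))

# blend with weights summing to 1 and write a submittable file
sub = weights_ensemble(results, weights=[0.6, 0.4])
sub.to_csv('blend.csv', index=False)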
================================================
FILE: linwangli/readme.txt
================================================
|—— code
    |—— lgb_allfeat_22.py: LightGBM results trained on [all features]
    |—— lgb_allfeat_condProb.py: LightGBM results trained on [all features + conditional probabilities]
    |—— utils.py: helper scripts, e.g. weighted ensembling / correlation between submissions
|—— dataset
    |—— deviceid_train.tsv: file provided by the competition organizers
    |—— all_feat.csv: all features extracted by the team
|—— result: stores the various submission files
================================================
FILE: linwangli/yg-1st-lgb.py
================================================
# coding: utf-8
# In[ ]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10  # unused; commented out so the script runs without skopt
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
# In[ ]:
test = pd.read_csv('../input/yiguan/demo/Demo/deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv('../input/yiguan/demo/Demo/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table('../input/yiguan/demo/Demo/deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
packtime = pd.read_table('../input/yiguan/demo/Demo/deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'])
packages = pd.read_csv('../input/yiguan/demo/Demo/deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
# In[ ]:
packtime['period'] = (packtime['close'] - packtime['start'])/1000
packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
app_use_time = packtime.groupby(['app'])['period'].agg('sum').reset_index()
# could also try the top 200
app_use_top100 = app_use_time.sort_values(by='period', ascending=False)[:100]['app']
device_app_use_time = packtime.groupby(['device_id', 'app'])['period'].agg('sum').reset_index()
use_time_top100_statis = device_app_use_time.set_index('app').loc[list(app_use_top100)].reset_index()
top100_statis = use_time_top100_statis.pivot(index='device_id', columns='app', values='period').reset_index()
# In[ ]:
top100_statis = top100_statis.fillna(0)
# In[ ]:
# Phone-brand preprocessing
brand['vendor'] = brand['vendor'].astype(str).apply(lambda x : x.split(' ')[0].upper())
brand['ph_ver'] = brand['vendor'] + '_' + brand['version']
ph_ver = brand['ph_ver'].value_counts()
ph_ver_cnt = pd.DataFrame(ph_ver).reset_index()
ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt']
brand = pd.merge(left=brand, right=ph_ver_cnt,on='ph_ver')
# In[ ]:
# A small tweak for the long-tail distribution
mask = (brand.ph_ver_cnt < 100)
brand.loc[mask, 'ph_ver'] = 'other'
train = pd.merge(brand[['device_id', 'ph_ver']], train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'ph_ver']], test, on='device_id', how='right')
train['ph_ver'] = train['ph_ver'].astype(str)
test['ph_ver'] = test['ph_ver'].astype(str)
# Label-encode ph_ver
ph_ver_le = preprocessing.LabelEncoder()
train['ph_ver'] = ph_ver_le.fit_transform(train['ph_ver'])
test['ph_ver'] = ph_ver_le.transform(test['ph_ver'])
train['label'] = train['sex'].astype(str) + '-' + train['age'].astype(str)
label_le = preprocessing.LabelEncoder()
train['label'] = label_le.fit_transform(train['label'])
# In[ ]:
test['sex'] = -1
test['age'] = -1
test['label'] = -1
data = pd.concat([train, test], ignore_index=True)
data.shape
# In[ ]:
ph_ver_dummy = pd.get_dummies(data['ph_ver'])
ph_ver_dummy.columns = ['ph_ver_' + str(i) for i in range(ph_ver_dummy.shape[1])]
# In[ ]:
data = pd.concat([data, ph_ver_dummy], axis=1)
# In[ ]:
del data['ph_ver']
# In[ ]:
train = data[data.sex != -1]
test = data[data.sex == -1]
train.shape, test.shape
# In[ ]:
# Total launch count of each app
app_num = packtime['app'].value_counts().reset_index()
app_num.columns = ['app', 'app_num']
packtime = pd.merge(left=packtime, right=app_num, on='app')
# Likewise, trim the long tail (tried leaving it alone and other thresholds; the 100 cutoff scored best)
packtime.loc[packtime.app_num < 100, 'app'] = 'other'
# In[ ]:
# Count the apps on each device
df_app = packtime[['device_id', 'app']]
apps = df_app.drop_duplicates().groupby(['device_id'])['app'].apply(' '.join).reset_index()
apps['app_length'] = apps['app'].apply(lambda x:len(x.split(' ')))
train = pd.merge(train, apps, on='device_id', how='left')
test = pd.merge(test, apps, on='device_id', how='left')
# In[ ]:
# Vectorize the apps installed on each device (CountVectorizer, despite the tfidf name)
tfidf = CountVectorizer(lowercase=False, min_df=3, stop_words=top100_statis.columns.tolist()[1:7])
tfidf.fit(apps['app'])  # fit the vocabulary only; the transformed column itself is never used downstream
X_tr_app = tfidf.transform(list(train['app']))
X_ts_app = tfidf.transform(list(test['app']))
# In[ ]:
'''
svd = TruncatedSVD(n_components=100, random_state=42)
X = vstack([X_tr_app, X_ts_app])
svd.fit(X)
X_tr_app = svd.fit_transform(X_tr_app)
X_ts_app = svd.fit_transform(X_ts_app)
X_tr_app = pd.DataFrame(X_tr_app)
X_ts_app = pd.DataFrame(X_ts_app)
X_tr_app.columns = ['app_' + str(i) for i in range(0, 100)]
X_ts_app.columns = ['app_' + str(i) for i in range(0, 100)]
'''
# ### Use word2vec to get an embedding representation of each device's installed apps
# In[ ]:
packages['apps'] = packages['apps'].apply(lambda x:x.split(','))
packages['app_length'] = packages['apps'].apply(lambda x:len(x))
# In[ ]:
embed_size = 128
fastmodel = Word2Vec(list(packages['apps']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
embedding_fast.head()
# In[ ]:
id_list = []
for i in range(packages.shape[0]):
id_list += [list(packages['device_id'])[i]]*packages['app_length'].iloc[i]
app_list = [word for item in packages['apps'] for word in item]
app_vect = pd.DataFrame({'device_id':id_list})
app_vect['app'] = app_list
# In[ ]:
app_vect = app_vect.merge(embedding_fast, on='app', how='left')
app_vect = app_vect.drop('app', axis=1)
seqfeature = app_vect.groupby(['device_id']).agg('mean')
seqfeature.reset_index(inplace=True)
# In[ ]:
seqfeature.head()
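# Note: seqfeature is a mean-pooled device embedding -- each device is the
# average of the 128-d word2vec vectors of its installed apps, giving one
# fixed-length dense feature block no matter how many apps the device has.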
# ### Per-device phone-usage duration across the seven days of the week
# In[ ]:
# packtime['period'] = (packtime['close'] - packtime['start'])/1000
# packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
packtime['dayofweek'] = packtime['start'].dt.dayofweek
packtime['hour'] = packtime['start'].dt.hour
# packtime = packtime[(packtime['start'] < '2017-03-31 23:59:59') & (packtime['start'] > '2017-03-01 00:00:00')]
# In[ ]:
app_use_time = packtime.groupby(['device_id', 'dayofweek'])['period'].agg('sum').reset_index()
week_app_use = app_use_time.pivot_table(values='period', columns='dayofweek', index='device_id').reset_index()
week_app_use = week_app_use.fillna(0)
week_app_use.columns = ['device_id'] + ['week_day_' + str(i) for i in range(0, 7)]
week_app_use['week_max'] = week_app_use.max(axis=1)
week_app_use['week_min'] = week_app_use.min(axis=1)
week_app_use['week_sum'] = week_app_use.sum(axis=1)
week_app_use['week_std'] = week_app_use.std(axis=1)
'''
for i in range(0, 7):
week_app_use['week_day_' + str(i)] = week_app_use['week_day_' + str(i)] / week_app_use['week_sum']
'''
# In[ ]:
'''
app_use_time = packtime.groupby(['device_id', 'hour'])['period'].agg('sum').reset_index()
hour_app_use = app_use_time.pivot_table(values='period', columns='hour', index='device_id').reset_index()
hour_app_use = hour_app_use.fillna(0)
hour_app_use.columns = ['device_id'] + ['hour_' + str(i) for i in range(0, 24)]
# hour_app_use['hour_max'] = hour_app_use.max(axis=1)
# hour_app_use['hour_min'] = hour_app_use.min(axis=1)
# hour_app_use['hour_sum'] = hour_app_use.sum(axis=1)
# hour_app_use['hour_std'] = hour_app_use.std(axis=1)
# for i in range(0, 24):
# hour_app_use['hour_' + str(i)] = hour_app_use['hour_' + str(i)] / hour_app_use['hour_sum']
'''
# ### Merge all the features together
# In[ ]:
train.columns[4:]
# In[ ]:
user_behavior = pd.read_csv('../input/yg-user-behavior/user_behavior.csv')
user_behavior['app_len_max'] = user_behavior['app_len_max'].astype(np.float64)
del user_behavior['app']
train = pd.merge(train, user_behavior, on='device_id', how='left')
test = pd.merge(test, user_behavior, on='device_id', how='left')
# In[ ]:
train = pd.merge(train, seqfeature, on='device_id', how='left')
test = pd.merge(test, seqfeature, on='device_id', how='left')
# In[ ]:
train = pd.merge(train, week_app_use, on='device_id', how='left')
test = pd.merge(test, week_app_use, on='device_id', how='left')
# In[ ]:
'''
app_top50_list = list(packtime.groupby(by='app')['period'].sum().sort_values(ascending=False)[:50].index)
for app in app_top50_list:
app_cnt = packtime[packtime['app'] == app]
start_num_app = app_cnt.groupby(by='device_id')['start'].count().reset_index()
start_num_app.columns = ['device_id', 'start_num_app_' + app[0:4]]
train = train.merge(start_num_app, on='device_id', how='left')
test = test.merge(start_num_app, on='device_id', how='left')
print(app + ' done')
'''
# In[ ]:
'''
# all_top50: per-device usage-time statistics of the 50 apps with the highest total usage time
all_top50 = pd.read_csv('../input/yg-feature/all_top50_statis.csv')
train = pd.merge(train, all_top50, on='device_id', how='left')
test = pd.merge(test, all_top50, on='device_id', how='left')
'''
# In[ ]:
top100_statis.columns = ['device_id'] + ['top100_statis_' + str(i) for i in range(0, 100)]
train = pd.merge(train, top100_statis, on='device_id', how='left')
test = pd.merge(test, top100_statis, on='device_id', how='left')
# In[ ]:
train.to_csv('train_feature.csv', index=None)
test.to_csv('test_feature.csv', index=None)
# In[ ]:
feats = train.columns[4:]
feats
# In[ ]:
feats = feats.delete(153)
feats[153]
# In[ ]:
'''
train = pd.merge(train, hour_app_use, on='device_id', how='left')
test = pd.merge(test, hour_app_use, on='device_id', how='left')
'''
# In[ ]:
X_train = hstack([X_tr_app, train[feats].astype(float)])
X_test = hstack([X_ts_app, test[feats].astype(float)])
X_train = X_train.tocsr().astype('float')
X_test = X_test.tocsr().astype('float')
# ### Start training the models
# In[ ]:
Y = train['sex'] - 1
kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
oof_preds1 = np.zeros((X_train.shape[0], ))
sub1 = np.zeros((X_test.shape[0], ))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train[train_index], X_train[test_index], Y[train_index], Y[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'objective':'binary',
'num_leaves':31,
'subsample': 0.85,
'colsample_bytree': 0.2,
'lambda_l1':0.00007995302080034896,
'lambda_l2':0.0003648648811380991,
'subsample_freq':12,
'learning_rate': 0.012,
'min_child_weight':5.5
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds1[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub1 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# In[ ]:
Y = train['age']
kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
oof_preds2 = np.zeros((X_train.shape[0], 11))
sub2 = np.zeros((X_test.shape[0], 11))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train[train_index], X_train[test_index], Y[train_index], Y[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'metric': {'multi_logloss'},
'num_class':11,
'objective':'multiclass',
'num_leaves':31,
'subsample': 0.9,
'colsample_bytree': 0.2,
'lambda_l1':0.0001,
'lambda_l2':0.00111,
'subsample_freq':10,
'learning_rate': 0.012,
'min_child_weight':10
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds2[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub2 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# In[ ]:
oof_preds1 = pd.DataFrame(oof_preds1, columns=['sex2'])
oof_preds1['sex1'] = 1-oof_preds1['sex2']
oof_preds2 = pd.DataFrame(oof_preds2, columns=['age%s'%i for i in range(11)])
oof_preds = train[['device_id']]
oof_preds.columns = ['DeviceID']
for i in ['sex1', 'sex2']:
for j in ['age%s'%i for i in range(11)]:
oof_preds[i+'_'+j] = oof_preds1[i] * oof_preds2[j]
oof_preds.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
oof_preds.to_csv('train.csv', index=False)
# In[ ]:
sub1 = pd.DataFrame(sub1, columns=['sex2'])
sub1['sex1'] = 1-sub1['sex2']
sub2 = pd.DataFrame(sub2, columns=['age%s'%i for i in range(11)])
sub = test[['device_id']]
sub.columns = ['DeviceID']
for i in ['sex1', 'sex2']:
for j in ['age%s'%i for i in range(11)]:
sub[i+'_'+j] = sub1[i] * sub2[j]
sub.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
sub.to_csv('lgb_l_v54.csv', index=False)
# In[ ]:
'''
Y = train['label']
#best params: [31, 11, 0.015955854914003094, 0.12122664084283229, 0.7645440142264772, 24, 1048, 0.00552258737237652, 0.005810068328090833, 7]
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
sub = np.zeros((X_test.shape[0], 22))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train[train_index], X_train[test_index], Y[train_index], Y[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':7,
'objective':'multiclass',
'metric': {'multi_logloss'},
'num_class':22,
'num_leaves':20,
'subsample': 0.86,
'colsample_bytree': 0.8,
#'lambda_l1':0.00007995302080034896,
'lambda_l2':0.005,
'subsample_freq':11,
'learning_rate': 0.01,
'min_child_weight':5.5,
}
model = lgb.train(params,
dtrain,
num_boost_round=6000,
valid_sets=dvalid,
early_stopping_rounds=20,
verbose_eval=100)
sub += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
'''
# In[ ]:
'''
sub = pd.DataFrame(sub)
cols = [x for x in range(0, 22)]
cols = label_le.inverse_transform(cols)
sub.columns = cols
sub['DeviceID'] = test['device_id'].values
sub = sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
sub.to_csv('30.csv', index=False)
'''
================================================
FILE: nb_cz_lwl_wcm/10_lgb.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
# from skopt.space import Integer, Categorical, Real, Log10
# from skopt.utils import use_named_args
# from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
from config import path
# path="/Users/chizhu/data/competition_data/易观/"
# In[2]:
test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
packtime = pd.read_table(path+'deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'])
packages = pd.read_csv(path+'deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
# In[3]:
def get_str(df):
res=""
for i in df.split(","):
res+=i+" "
return res
packages["str_app"]=packages['apps'].apply(lambda x:get_str(x),1)
# In[4]:
tfidf = CountVectorizer()
train_str_app=pd.merge(train[['device_id']],packages[["device_id",'str_app']],on="device_id",how="left")
test_str_app=pd.merge(test[['device_id']],packages[["device_id",'str_app']],on="device_id",how="left")
tfidf.fit(packages['str_app'])  # fit the vocabulary only; the transformed column itself is never used downstream
train_app = tfidf.transform(list(train_str_app['str_app'])).tocsr()
test_app = tfidf.transform(list(test_str_app['str_app'])).tocsr()
# In[5]:
all_id=pd.concat([train[["device_id"]],test[['device_id']]])
# In[6]:
all_id.index=range(len(all_id))
# In[7]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
import os
if not os.path.exists("data"):
os.mkdir("data")
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = train_app
test_feature = test_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=all_id['device_id']
for label in ["sex"]:
score = train[label]-1
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])[:,1]
score_te = clf.predict_proba(test_feature)[:,1]
        print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_lr_classfiy_{}'.format(label)] = stack[:, 0]
    ########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])[:,1]
score_te = sgd.predict_proba(test_feature)[:,1]
        print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_sgd_classfiy_{}'.format(label)] = stack[:, 0]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])[:,1]
score_te = pac._predict_proba_lr(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_pac_classfiy_{}'.format(label)] = stack[:, 0]
########################### ridge(RidgeClassfiy) ################################
print('RidgeClassfiy stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])[:,1]
score_te = ridge._predict_proba_lr(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_ridge_classfiy_{}'.format(label)] = stack[:, 0]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])[:,1]
score_te = bnb.predict_proba(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_bnb_classfiy_{}'.format(label)] = stack[:, 0]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])[:,1]
score_te = mnb.predict_proba(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_mnb_classfiy_{}'.format(label)] = stack[:, 0]
############################ Linersvc(LinerSVC) ################################
print('LinerSVC stacking')
stack_train = np.zeros((len(train), 1))
stack_test = np.zeros((len(test), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])[:,1]
score_te = lsvc._predict_proba_lr(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_lsvc_classfiy_{}'.format(label)] = stack[:, 0]
df_stack.to_csv('data/tfidf_classfiy_package.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
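# Note: the block above is the standard out-of-fold stacking recipe: each base
# classifier is fit on the in-fold rows, its positive-class probability fills
# the held-out slots of stack_train, and its test probability is averaged over
# the folds into stack_test; np.vstack([stack_train, stack_test]) then lines up
# with all_id (train device_ids first, then test), so every df_stack column is
# one leak-free meta-feature for the downstream GBM models.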
# In[8]:
packtime['period'] = (packtime['close'] - packtime['start'])/1000
packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
app_use_time = packtime.groupby(['app'])['period'].agg('sum').reset_index()
app_use_top100 = app_use_time.sort_values(by='period', ascending=False)[:100]['app']
device_app_use_time = packtime.groupby(['device_id', 'app'])['period'].agg('sum').reset_index()
use_time_top100_statis = device_app_use_time.set_index('app').loc[list(app_use_top100)].reset_index()
top100_statis = use_time_top100_statis.pivot(index='device_id', columns='app', values='period').reset_index()
# In[9]:
top100_statis = top100_statis.fillna(0)
# In[10]:
# Phone-brand preprocessing
brand['vendor'] = brand['vendor'].astype(str).apply(lambda x : x.split(' ')[0].upper())
brand['ph_ver'] = brand['vendor'] + '_' + brand['version']
ph_ver = brand['ph_ver'].value_counts()
ph_ver_cnt = pd.DataFrame(ph_ver).reset_index()
ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt']
brand = pd.merge(left=brand, right=ph_ver_cnt,on='ph_ver')
# In[11]:
# A small tweak for the long-tail distribution
mask = (brand.ph_ver_cnt < 100)
brand.loc[mask, 'ph_ver'] = 'other'
train_data = pd.merge(brand[['device_id', 'ph_ver']], train, on='device_id', how='right')
test_data = pd.merge(brand[['device_id', 'ph_ver']], test, on='device_id', how='right')
train_data['ph_ver'] = train_data['ph_ver'].astype(str)
test_data['ph_ver'] = test_data['ph_ver'].astype(str)
# Label-encode ph_ver
ph_ver_le = preprocessing.LabelEncoder()
train_data['ph_ver'] = ph_ver_le.fit_transform(train_data['ph_ver'])
test_data['ph_ver'] = ph_ver_le.transform(test_data['ph_ver'])
train_data['label'] = train_data['sex'].astype(str) + '-' + train_data['age'].astype(str)
label_le = preprocessing.LabelEncoder()
train_data['label'] = label_le.fit_transform(train_data['label'])
# In[12]:
test_data['sex'] = -1
test_data['age'] = -1
test_data['label'] = -1
data = pd.concat([train_data, test_data], ignore_index=True)
print(data.shape)
# In[13]:
train_data = data[data.sex != -1]
test_data = data[data.sex == -1]
print(train_data.shape, test_data.shape)
# In[14]:
# Total launch count of each app
app_num = packtime['app'].value_counts().reset_index()
app_num.columns = ['app', 'app_num']
packtime = pd.merge(left=packtime, right=app_num, on='app')
# Likewise, trim the long tail (tried leaving it alone and other thresholds; the 100 cutoff scored best)
packtime.loc[packtime.app_num < 100, 'app'] = 'other'
# In[15]:
# Count the apps on each device
df_app = packtime[['device_id', 'app']]
apps = df_app.drop_duplicates().groupby(['device_id'])['app'].apply(' '.join).reset_index()
apps['app_length'] = apps['app'].apply(lambda x:len(x.split(' ')))
train_data = pd.merge(train_data, apps, on='device_id', how='left')
test_data = pd.merge(test_data, apps, on='device_id', how='left')
# In[16]:
# Vectorize the apps installed on each device (CountVectorizer, despite the tfidf name)
tfidf = CountVectorizer()
tfidf.fit(apps['app'])  # fit the vocabulary only; the transformed column itself is never used downstream
X_tr_app = tfidf.transform(list(train_data['app'])).tocsr()
X_ts_app = tfidf.transform(list(test_data['app'])).tocsr()
# In[17]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = X_tr_app
test_feature = X_ts_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=data['device_id']
for label in ["sex"]:
score = train_data[label]-1
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])[:,1]
score_te = clf.predict_proba(test_feature)[:,1]
        print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_lr_classfiy_{}'.format(label)] = stack[:, 0]
    ########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])[:,1]
score_te = sgd.predict_proba(test_feature)[:,1]
        print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va,0] = score_va
stack_test[:,0]+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_sgd_classfiy_{}'.format(label)] = stack[:, 0]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])[:,1]
score_te = pac._predict_proba_lr(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_pac_classfiy_{}'.format(label)] = stack[:, 0]
    ########################### ridge(RidgeClassifier) ################################
    print('RidgeClassifier stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])[:,1]
score_te = ridge._predict_proba_lr(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_ridge_classfiy_{}'.format(label)] = stack[:, 0]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])[:,1]
score_te = bnb.predict_proba(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_bnb_classfiy_{}'.format(label)] = stack[:, 0]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])[:,1]
score_te = mnb.predict_proba(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_mnb_classfiy_{}'.format(label)] = stack[:, 0]
    ############################ lsvc(LinearSVC) ################################
    print('LinearSVC stacking')
stack_train = np.zeros((len(train_data), 1))
stack_test = np.zeros((len(test_data), 1))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])[:,1]
score_te = lsvc._predict_proba_lr(test_feature)[:,1]
print(score_va)
        print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va,0] += score_va
stack_test[:,0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['tfidf_lsvc_classfiy_{}'.format(label)] = stack[:, 0]
df_stack.to_csv('data/tfidf_classfiy.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
# ### Use word2vec to get an embedding representation of each device's installed apps
# In[18]:
packages['apps'] = packages['apps'].apply(lambda x:x.split(','))
packages['app_length'] = packages['apps'].apply(lambda x:len(x))
# In[19]:
embed_size = 128
fastmodel = Word2Vec(list(packages['apps']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
print(embedding_fast.head())
# In[20]:
id_list = []
for i in range(packages.shape[0]):
id_list += [list(packages['device_id'])[i]]*packages['app_length'].iloc[i]
app_list = [word for item in packages['apps'] for word in item]
app_vect = pd.DataFrame({'device_id':id_list})
app_vect['app'] = app_list
# In[21]:
app_vect = app_vect.merge(embedding_fast, on='app', how='left')
app_vect = app_vect.drop('app', axis=1)
seqfeature = app_vect.groupby(['device_id']).agg('mean')
seqfeature.reset_index(inplace=True)
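# Each device's vector (seqfeature) is the mean of the 128-dim word2vec
# vectors of its installed apps; apps dropped by min_count=3 have no
# embedding, so the left merge leaves NaNs that the mean skips.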
# In[22]:
print(seqfeature.head())
# ### Per-user phone usage time across the seven days of the week
# In[23]:
# packtime['period'] = (packtime['close'] - packtime['start'])/1000
# packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
packtime['dayofweek'] = packtime['start'].dt.dayofweek
packtime['hour'] = packtime['start'].dt.hour
# packtime = packtime[(packtime['start'] < '2017-03-31 23:59:59') & (packtime['start'] > '2017-03-01 00:00:00')]
# In[24]:
app_use_time = packtime.groupby(['device_id', 'dayofweek'])['period'].agg('sum').reset_index()
week_app_use = app_use_time.pivot_table(values='period', columns='dayofweek', index='device_id').reset_index()
week_app_use = week_app_use.fillna(0)
week_app_use.columns = ['device_id'] + ['week_day_' + str(i) for i in range(0, 7)]
week_app_use['week_max'] = week_app_use.max(axis=1)
week_app_use['week_min'] = week_app_use.min(axis=1)
week_app_use['week_sum'] = week_app_use.sum(axis=1)
week_app_use['week_std'] = week_app_use.std(axis=1)
# ### Merge all the features together
# In[25]:
print(train_data.columns[4:])
# In[26]:
user_behavior = pd.read_csv('data/user_behavior.csv')
user_behavior['app_len_max'] = user_behavior['app_len_max'].astype(np.float64)
del user_behavior['app']
train_data = pd.merge(train_data, user_behavior, on='device_id', how='left')
test_data = pd.merge(test_data, user_behavior, on='device_id', how='left')
# In[27]:
train_data = pd.merge(train_data, seqfeature, on='device_id', how='left')
test_data = pd.merge(test_data, seqfeature, on='device_id', how='left')
# In[28]:
train_data = pd.merge(train_data, week_app_use, on='device_id', how='left')
test_data = pd.merge(test_data, week_app_use, on='device_id', how='left')
# In[29]:
top100_statis.columns = ['device_id'] + ['top100_statis_' + str(i) for i in range(0, 100)]
train_data = pd.merge(train_data, top100_statis, on='device_id', how='left')
test_data = pd.merge(test_data, top100_statis, on='device_id', how='left')
# In[30]:
train_data.to_csv("./data/train_data.csv",index=False)
test_data.to_csv("./data/test_data.csv",index=False)
# In[31]:
tfidf_feat=pd.read_csv("data/tfidf_classfiy.csv")
tf2=pd.read_csv("data/tfidf_classfiy_package.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
# app_w2v=pd.read_csv("./data/w2v_tfidf.csv")
# In[32]:
train = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
# train = pd.merge(train_data,tf2,on="device_id",how="left")
# train = pd.merge(train_data,app_w2v,on="device_id",how="left")
test = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
# test = pd.merge(test_data,tf2,on="device_id",how="left")
# test = pd.merge(test_data,app_w2v,on="device_id",how="left")
# In[85]:
train_dt = pd.merge(train_data[['device_id','ph_ver']],tfidf_feat,on="device_id",how="left")
train_dt = pd.merge(train_dt,tf2,on="device_id",how="left")
test_dt = pd.merge(test_data[['device_id',"ph_ver"]],tfidf_feat,on="device_id",how="left")
test_dt = pd.merge(test_dt,tf2,on="device_id",how="left")
feat=pd.concat([train_dt,test_dt])
feat.to_csv("data/sex_chizhu_feat.csv",index=False)
# In[33]:
features = [x for x in train.columns if x not in ['device_id', 'sex',"age","label","app"]]
Y = train['sex'] - 1
# ### Start training the model
# In[34]:
import lightgbm as lgb
# import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params = {
'boosting_type': 'gbdt',
'metric': {'binary_logloss',},
# 'is_unbalance':'True',
'learning_rate' : 0.01,
'verbose': 0,
'num_leaves':32 ,
# 'max_depth':8,
# 'max_bin':10,
# 'lambda_l2': 1,
# 'min_child_weight':50,
'objective': 'binary',
'feature_fraction': 0.4,
    'bagging_fraction':0.7, # 0.9 was the best so far
    'bagging_freq':3, # 3 was the best so far
# 'min_data': 500,
'seed': 1024,
'nthread': 8,
# 'silent': True,
}
num_round = 3500
early_stopping_rounds = 100
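# feature_fraction=0.4 samples columns per tree and bagging_fraction=0.7 with
# bagging_freq=3 resamples rows every 3 iterations, regularizing the model on
# this wide stacked feature set; early stopping on the fold's validation
# logloss then picks the effective number of trees (at most 3500).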
# In[35]:
aus = []
sub1 = np.zeros((len(test), ))
pred_oob1=np.zeros((len(train),))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
d_tr = lgb.Dataset(tr_x, label=tr_y)
d_te = lgb.Dataset(te_x, label=te_y)
model = lgb.train(params, d_tr, num_boost_round=num_round,
valid_sets=d_te,verbose_eval=200,
early_stopping_rounds=early_stopping_rounds)
pred= model.predict(te_x, num_iteration=model.best_iteration)
pred_oob1[test_index] =pred
a = log_loss(te_y, pred)
sub1 += model.predict(test[features], num_iteration=model.best_iteration)/5
print ("idx: ", i)
print (" loss: %.5f" % a)
print ("best tree num: ", model.best_iteration)
aus.append(a)
print ("mean")
print ("auc: %s" % (sum(aus) / 5.0))
# In[36]:
##### Feature importance
# get_ipython().run_line_magic('matplotlib', 'inline')
# import matplotlib.pyplot as plt
# f=dict(zip(list(train[features].keys()),model.feature_importance()))
# f=sorted(f.items(),key=lambda d:d[1], reverse = True)
# f=pd.DataFrame(f,columns=['feature','imp'])
# plt.bar(range(len(f)),f.imp)
# plt.xticks(range(len(f)),f.feature,rotation=70,fontsize=20)
# fig = plt.gcf()
# fig.set_size_inches(50, 20)
# In[37]:
# f.ix[:450,:]
# In[38]:
# features=f.ix[:434,"feature"].values
# In[39]:
pred_oob1 = pd.DataFrame(pred_oob1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1=pd.concat([pred_oob1,sub1])
res1['sex1'] = 1-res1['sex2']
# In[40]:
import gc
gc.collect()
# In[41]:
train_id = pd.read_csv(path+'deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
# In[42]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = train_app
test_feature = test_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=all_id['device_id']
for label in ["age"]:
score = train_id[label]
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
        print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_lr_classfiy_{}'.format(i)] = stack[:, i]
    ########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
        print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_sgd_classfiy_{}'.format(i)] = stack[:, i]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_pac_classfiy_{}'.format(i)] = stack[:, i]
    ########################### ridge(RidgeClassifier) ################################
    print('RidgeClassifier stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_ridge_classfiy_{}'.format(i)] = stack[:, i]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_bnb_classfiy_{}'.format(i)] = stack[:, i]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_mnb_classfiy_{}'.format(i)] = stack[:, i]
    ############################ lsvc(LinearSVC) ################################
    print('LinearSVC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])
score_te = lsvc._predict_proba_lr(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['pack_tfidf_lsvc_classfiy_{}'.format(i)] = stack[:, i]
df_stack.to_csv('data/pack_tfidf_age.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
# #### tfidf
# In[43]:
# encoding:utf-8
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
############################ Split the dataset ##########################
print('Starting some preprocessing')
train_feature = X_tr_app
test_feature = X_ts_app
# 5-fold cross-validation
n_folds = 5
print('Preprocessing done')
df_stack = pd.DataFrame()
df_stack['device_id']=data['device_id']
for label in ["age"]:
score = train[label]
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
        print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_lr_classfiy_{}'.format(i)] = stack[:, i]
    ########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
        print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] = score_va
stack_test+= score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_sgd_classfiy_{}'.format(i)] = stack[:, i]
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_pac_classfiy_{}'.format(i)] = stack[:, i]
    ########################### ridge(RidgeClassifier) ################################
    print('RidgeClassifier stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_ridge_classfiy_{}'.format(i)] = stack[:, i]
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_bnb_classfiy_{}'.format(i)] = stack[:, i]
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
df_stack['tfidf_mnb_classfiy_{}'.format(i)] = stack[:, i]
    ############################ lsvc(LinearSVC) ################################
    print('LinearSVC stacking')
stack_train = np.zeros((len(train), 11))
stack_test = np.zeros((len(test), 11))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])
score_te = lsvc._predict_proba_lr(test_feature)
print(score_va)
        print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
for i in range(stack.shape[1]):
        df_stack['tfidf_lsvc_classfiy_{}'.format(i)] = stack[:, i]
df_stack.to_csv('data/tfidf_age.csv', index=None, encoding='utf8')
print('tfidf features saved\n')
# In[44]:
tfidf_feat=pd.read_csv("data/tfidf_age.csv")
tf2=pd.read_csv("data/pack_tfidf_age.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
# In[41]:
train_dt = pd.merge(train_data[['device_id','ph_ver']],tfidf_feat,on="device_id",how="left")
train_dt = pd.merge(train_dt,tf2,on="device_id",how="left")
test_dt = pd.merge(test_data[['device_id',"ph_ver"]],tfidf_feat,on="device_id",how="left")
test_dt = pd.merge(test_dt,tf2,on="device_id",how="left")
feat=pd.concat([train_dt,test_dt])
feat.to_csv("data/age_chizhu_feat.csv",index=False)
# In[40]:
# In[45]:
tfidf_feat=pd.read_csv("data/tfidf_age.csv")
tf2=pd.read_csv("data/pack_tfidf_age.csv")
train_data=pd.read_csv("data/train_data.csv")
test_data=pd.read_csv("data/test_data.csv")
train = pd.merge(train_data,tfidf_feat,on="device_id",how="left")
# train = pd.merge(train_data,tf2,on="device_id",how="left")
# train = pd.merge(train_data,app_w2v,on="device_id",how="left")
test = pd.merge(test_data,tfidf_feat,on="device_id",how="left")
# test = pd.merge(test_data,tf2,on="device_id",how="left")
# test = pd.merge(test_data,app_w2v,on="device_id",how="left")
features = [x for x in train.columns if x not in ['device_id',"age","sex","label","app"]]
Y = train['age']
# In[46]:
import lightgbm as lgb
# import xgboost as xgb
from sklearn.metrics import auc, log_loss, roc_auc_score,f1_score,recall_score,precision_score
from sklearn.cross_validation import StratifiedKFold
kf = StratifiedKFold(Y, n_folds=5, shuffle=True, random_state=1024)
params = {
'boosting_type': 'gbdt',
'metric': {'multi_logloss',},
# 'is_unbalance':'True',
'learning_rate' : 0.01,
'verbose': 0,
'num_leaves':32 ,
# 'max_depth':8,
# 'max_bin':10,
# 'lambda_l2': 1,
# 'min_child_weight':50,
"num_class":11,
'objective': 'multiclass',
'feature_fraction': 0.4,
    'bagging_fraction':0.7, # 0.9 was the best so far
    'bagging_freq':3, # 3 was the best so far
# 'min_data': 500,
'seed': 1024,
'nthread': 8,
# 'silent': True,
}
num_round = 3500
early_stopping_rounds = 100
# In[47]:
aus = []
sub2 = np.zeros((len(test),11 ))
pred_oob2=np.zeros((len(train),11))
for i,(train_index,test_index) in enumerate(kf):
tr_x = train[features].reindex(index=train_index, copy=False)
tr_y = Y[train_index]
te_x = train[features].reindex(index=test_index, copy=False)
te_y = Y[test_index]
d_tr = lgb.Dataset(tr_x, label=tr_y)
d_te = lgb.Dataset(te_x, label=te_y)
model = lgb.train(params, d_tr, num_boost_round=num_round,
valid_sets=d_te,verbose_eval=200,
early_stopping_rounds=early_stopping_rounds)
pred= model.predict(te_x, num_iteration=model.best_iteration)
pred_oob2[test_index] =pred
a = log_loss(te_y, pred)
sub2 += model.predict(test[features], num_iteration=model.best_iteration)/5
print ("idx: ", i)
print (" loss: %.5f" % a)
print ("best tree num: ", model.best_iteration)
aus.append(a)
print ("mean")
print ("loss: %s" % (sum(aus) / 5.0))
# In[55]:
##### Feature importance
# import matplotlib.pyplot as plt
# f=dict(zip(list(train[features].keys()),model.feature_importance()))
# f=sorted(f.items(),key=lambda d:d[1], reverse = True)
# f=pd.DataFrame(f,columns=['feature','imp'])
# plt.bar(range(len(f)),f.imp)
# plt.xticks(range(len(f)),f.feature,rotation=70,fontsize=20)
# fig = plt.gcf()
# fig.set_size_inches(50, 20)
# In[56]:
# f.ix[:650,:]
# In[57]:
# features=f.ix[:641,"feature"].values
# In[58]:
res2_1=np.vstack((pred_oob2,sub2))
res2_1 = pd.DataFrame(res2_1)
# In[59]:
import os  # ensure os is available for the directory check below
if not os.path.exists("submit"):
    os.mkdir("submit")
res1.index=range(len(res1))
res2_1.index=range(len(res2_1))
final_1=res2_1.copy()
final_2=res2_1.copy()
for i in range(11):
final_1[i]=res1['sex1']*res2_1[i]
final_2[i]=res1['sex2']*res2_1[i]
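# The 22 submission columns are joint probabilities factored as
# P(sex=s, age=a) = P(sex=s) * P(age=a): res1 supplies the two sex
# probabilities and res2_1 the eleven age probabilities. Because one shared
# age model is used for both sexes here, age is treated as independent of sex.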
id_list=pd.concat([train[['device_id']],test[['device_id']]])
final=id_list
final.index=range(len(final))
final.columns= ['DeviceID']
final_pred = pd.concat([final_1,final_2],1)
final=pd.concat([final,final_pred],1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('feature/lgb_feat_chizhu.csv', index=False)
# In[60]:
test['DeviceID']=test['device_id']
sub=pd.merge(test[['DeviceID']],final,on="DeviceID",how="left")
sub.to_csv("submit/lgb_chizhu.csv",index=False)
# In[61]:
# sub.sum(1)
================================================
FILE: nb_cz_lwl_wcm/11_cnn.py
================================================
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# get_ipython().run_line_magic('matplotlib', 'inline')
#add
# from category_encoders import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
import tensorflow as tf
import os  # required for the environment variable below
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9
set_session(tf.Session(config=config))
from config import path
# path = "/dev/shm/chizhu_data/data/"
# In[2]:
packages = pd.read_csv(path+'deviceid_packages.tsv',
sep='\t', names=['device_id', 'apps'])
test = pd.read_csv(path+'deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv', sep='\t',
names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv',
names=['device_id', 'vendor', 'version'])
behave_train = pd.read_csv('data/train_statistic_feat.csv')
behave_test = pd.read_csv('data/test_statistic_feat.csv')
# In[3]:
behave_train.drop(['sex', 'age', 'label', 'app'], 1, inplace=True)
behave_test.drop(['sex', 'age', 'label', 'app'], 1, inplace=True)
# In[4]:
brand['phone_version'] = brand['vendor'] + ' ' + brand['version']
train = pd.merge(brand[['device_id', 'phone_version']],
train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'phone_version']],
test, on='device_id', how='right')
# In[5]:
train = pd.merge(train, behave_train, on='device_id', how='left')
test = pd.merge(test, behave_test, on='device_id', how='left')
# In[6]:
packages['app_length'] = packages['apps'].apply(
    lambda x: x.split(',')).apply(lambda x: len(x))
packages['app_list'] = packages['apps'].apply(lambda x: x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
# In[7]:
embed_size = 128
fastmodel = Word2Vec(list(packages['app_list']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word]
for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns = ["fdim_%s" %
str(i) for i in range(embed_size)]+["app"]
# In[8]:
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
Y_sex = train['sex']-1
# In[9]:
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
if word not in fastmodel.wv.vocab:
continue
embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
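# Keras' Tokenizer indexes words from 1, so row 0 of embedding_matrix stays
# all-zero and serves as the padding vector; apps absent from the word2vec
# vocabulary likewise keep an all-zero row.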
# In[10]:
# behave_train=behave_train.loc[:,"ph_ver_0":'week_day_6']
# behave_test=behave_test.loc[:,"h0":'week_day_6']
behave_train = pd.merge(train[['device_id']],
behave_train, on='device_id', how="left")
behave_test = pd.merge(test[['device_id']],
behave_test, on='device_id', how="left")
X_h = behave_train.iloc[:, 1:].values
X_h_test = behave_test.iloc[:, 1:].values
# In[11]:
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
# decoupled weight decay (2/4)
self.wd = K.variable(weight_decay, name='weight_decay')
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
# decoupled weight decay (4/4)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
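# For reference, the decoupled weight-decay step implemented above follows
# Loshchilov & Hutter, "Decoupled Weight Decay Regularization":
#   p_t = p - lr_t * m_t / (sqrt(v_t) + eps) - lr * wd * p
# i.e. the decay is applied directly to the weights rather than folded into
# the gradient as in L2-regularized Adam.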
# In[12]:
def model_conv1D(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
hin = Input(shape=(396, ))
htime = Dense(64, activation='relu')(hin)
merge1 = concatenate([gap1a, gmp1a, htime])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
model = Model(inputs=[seq, hin], outputs=pred)
# model=multi_gpu_model(model,2)
model.compile(loss='binary_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
# model.summary()
return model
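# Note: only gap1a, gmp1a and the dense htime branch feed merge1; the
# conv2/conv3/conv5 branches are defined but never concatenated into the
# output path, so Keras excludes them from the compiled graph -- presumably
# leftovers from earlier experiments.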
# In[ ]:
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
print("FOLD | ", count+1)
filepath = "model/sex_weights_best_%d.h5" % count
checkpoint = ModelCheckpoint(
filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=6, verbose=1, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit([X_tr, X_tr2], y_tr, batch_size=128, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks, verbose=1, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict([X_test, X_h_test]))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict([X_vl, X_vl2])
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
# pd.DataFrame(oof_pref1).to_csv('cnn_oof_sex.csv', index=False)
# In[ ]:
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1 = pd.concat([oof_pref1, sub1])
res1['sex1'] = 1-res1['sex2']
res1.to_csv("data/res1.csv", index=False)
# In[ ]:
def model_age_conv(embedding_matrix):
# The embedding layer containing the word vectors
K.clear_session()
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
hin = Input(shape=(397, ))
htime = Dense(64, activation='relu')(hin)
merge1 = concatenate([gap1a, gmp1a, htime])
# merge1 = concatenate([gap1a, gap2a, gap3a, gap5a])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=[seq, hin], outputs=pred)
model.compile(loss='categorical_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
# model.summary()
return model
# In[ ]:
Y_age = to_categorical(train['age'])
# #### sex1
# In[ ]:
behave_train['sex'] = train['sex']
behave_test['sex'] = 1
X_h = behave_train.iloc[:, 1:].values
X_h_test = behave_test.iloc[:, 1:].values
# In[ ]:
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "model/age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
model_age = model_age_conv(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_age[train_index], Y_age[test_index]
hist = model_age.fit([X_tr, X_tr2], y_tr, batch_size=128, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks2, verbose=1, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict([X_vl, X_vl2])
sub2 += model_age.predict([X_test, X_h_test])/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
# pd.DataFrame(oof_pref2).to_csv('cnn_oof_age.csv', index=False)
# In[ ]:
res2_1 = np.vstack((oof_pref2, sub2))
res2_1 = pd.DataFrame(res2_1)
res2_1.to_csv("submit/res2_1.csv", index=False)
# ### sex2
# In[ ]:
behave_train['sex'] = train['sex']
behave_test['sex'] = 2
X_h = behave_train.iloc[:, 1:].values
X_h_test = behave_test.iloc[:, 1:].values
# In[ ]:
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "model/age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
model_age = model_age_conv(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_age[train_index], Y_age[test_index]
hist = model_age.fit([X_tr, X_tr2], y_tr, batch_size=128, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks2, verbose=1, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict([X_vl, X_vl2])
sub2 += model_age.predict([X_test, X_h_test])/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
# pd.DataFrame(oof_pref2).to_csv('cnn_oof_age.csv', index=False)
# In[ ]:
res2_2 = np.vstack((oof_pref2, sub2))
res2_2 = pd.DataFrame(res2_2)
# In[ ]:
res2_2.to_csv("submit/res2_2.csv", index=False)
# In[ ]:
res1.index = range(len(res1))
res2_1.index = range(len(res2_1))
res2_2.index = range(len(res2_2))
final_1 = res2_1
final_2 = res2_2
for i in range(11):
final_1[i] = res1['sex1']*res2_1[i]
final_2[i] = res1['sex2']*res2_2[i]
id_list = pd.concat([train[['device_id']], test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1, final_2], 1)
final = pd.concat([final, final_pred], 1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('feature/nn_feat.csv', index=False)
================================================
FILE: nb_cz_lwl_wcm/12_get_feature_lwl.py
================================================
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from skopt.space import Integer, Categorical, Real  # skopt.space exposes no Log10 dimension; importing it raises ImportError
from skopt.utils import use_named_args
from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
# Usage-time statistics for the 100 apps with the highest total usage time in the app start/close log
def get_top100_statis_feat(start_close):
start_close['period'] = (start_close['close'] - start_close['start'])/1000
start_close['start'] = pd.to_datetime(start_close['start'], unit='ms')
app_use_time = start_close.groupby(['app'])['period'].agg('sum').reset_index()
app_use_top100 = app_use_time.sort_values(by='period', ascending=False)[:100]['app']
device_app_use_time = start_close.groupby(['device_id', 'app'])['period'].agg('sum').reset_index()
use_time_top100_statis = device_app_use_time.set_index('app').loc[list(app_use_top100)].reset_index()
top100_statis = use_time_top100_statis.pivot(index='device_id', columns='app', values='period').reset_index()
top100_statis = top100_statis.fillna(0)
top100_statis.columns = ['device_id'] + ['top100_statis_' + str(i) for i in range(0, 100)]
print('top100_statis_feat done')
return top100_statis
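# Sketch of the returned frame (columns are positional, not app ids):
#   device_id, top100_statis_0, ..., top100_statis_99
# where each cell is the device's total usage time in seconds for one of the
# 100 globally most-used apps (0 if the device never used it).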
# Phone-brand features
def get_brand_feat(brand):
    # Preprocess the phone brand strings
brand['vendor'] = brand['vendor'].astype(str).apply(lambda x : x.split(' ')[0].upper())
brand['ph_ver'] = brand['vendor'] + '_' + brand['version']
ph_ver = brand['ph_ver'].value_counts()
ph_ver_cnt = pd.DataFrame(ph_ver).reset_index()
ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt']
brand = pd.merge(left=brand, right=ph_ver_cnt,on='ph_ver')
    # A small tweak for the long-tail distribution
mask = (brand.ph_ver_cnt < 100)
brand.loc[mask, 'ph_ver'] = 'other'
ph_ver_le = preprocessing.LabelEncoder()
brand['ph_ver'] = ph_ver_le.fit_transform(brand['ph_ver'].astype(str))
print('brand_feat done')
return brand[['device_id', 'ph_ver']]
# Count/tfidf-style features from the app start/close log (returns a sparse matrix, not a DataFrame)
def get_start_close_tfidf_feat(data_all, start_close):
    # Total usage count of each app
app_num = start_close['app'].value_counts().reset_index()
app_num.columns = ['app', 'app_num']
start_close = pd.merge(left=start_close, right=app_num, on='app')
    # As before, bucket the long tail (tried no bucketing and other cutoffs; the threshold of 100 scored best)
start_close.loc[start_close.app_num < 100, 'app'] = 'other'
df_app = start_close[['device_id', 'app']]
apps = df_app.drop_duplicates().groupby(['device_id'])['app'].apply(' '.join).reset_index()
apps['app_length'] = apps['app'].apply(lambda x:len(x.split(' ')))
data_all = pd.merge(data_all, apps, on='device_id', how='left')
    # Count-vector features over each device's installed apps
    tfidf = CountVectorizer()
    tfidf.fit(apps['app'])  # fit only; writing fit_transform's sparse matrix back into apps['app'] would corrupt the column
    # Transform for all devices
start_close_tfidf = tfidf.transform(list(data_all['app']))
print('start_close_tfidf_feat done')
return start_close_tfidf
# Use word2vec to get an embedding representation of each device's installed apps
def get_packages_w2c_feat(packages):
packages['apps'] = packages['apps'].apply(lambda x:x.split(','))
packages['app_length'] = packages['apps'].apply(lambda x:len(x))
embed_size = 128
fastmodel = Word2Vec(list(packages['apps']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
id_list = []
for i in range(packages.shape[0]):
id_list += [list(packages['device_id'])[i]]*packages['app_length'].iloc[i]
app_list = [word for item in packages['apps'] for word in item]
app_vect = pd.DataFrame({'device_id':id_list})
app_vect['app'] = app_list
app_vect = app_vect.merge(embedding_fast, on='app', how='left')
app_vect = app_vect.drop('app', axis=1)
seqfeature = app_vect.groupby(['device_id']).agg('mean')
seqfeature.reset_index(inplace=True)
print('packages_w2c_feat done')
return seqfeature
# Statistics of per-user phone usage time across the seven days of the week
def get_week_statis_feat(start_close):
start_close['dayofweek'] = start_close['start'].dt.dayofweek
start_close['hour'] = start_close['start'].dt.hour
app_use_time = start_close.groupby(['device_id', 'dayofweek'])['period'].agg('sum').reset_index()
week_app_use = app_use_time.pivot_table(values='period', columns='dayofweek', index='device_id').reset_index()
week_app_use = week_app_use.fillna(0)
week_app_use.columns = ['device_id'] + ['week_day_' + str(i) for i in range(0, 7)]
week_app_use['week_max'] = week_app_use.max(axis=1)
week_app_use['week_min'] = week_app_use.min(axis=1)
week_app_use['week_sum'] = week_app_use.sum(axis=1)
week_app_use['week_std'] = week_app_use.std(axis=1)
print('week_statis_feat done')
return week_app_use
def get_user_behaviour_feat(start_close):
# start_close['peroid'] = (start_close['close'] - start_close['start'])/1000
# start_close['start'] = pd.to_datetime(start_close['start'], unit='ms')
#start_close['closetime'] = pd.to_datetime(start_close['close'], unit='ms')
# del start_close['close']
# gc.collect();
start_close['hour'] = start_close['start'].dt.hour
start_close['date'] = start_close['start'].dt.date
start_close['dayofweek'] = start_close['start'].dt.dayofweek
    # Average daily device usage time
dtime = start_close.groupby(['device_id', 'date'])['period'].agg('sum')
    # Usage share across different time buckets
qtime = start_close.groupby(['device_id', 'hour'])['period'].agg('sum')
wtime = start_close.groupby(['device_id', 'dayofweek'])['period'].agg('sum')
atime = start_close.groupby(['device_id', 'app'])['period'].agg('sum')
dapp = start_close[['device_id', 'date', 'app']].drop_duplicates().groupby(['device_id', 'date'])['app'].agg(' '.join)
dapp = dapp.reset_index()
dapp['app_len'] = dapp['app'].apply(lambda x:x.split(' ')).apply(len)
dapp_stat = dapp.groupby('device_id')['app_len'].agg({'std':'std', 'mean':'mean', 'max':'max'})
dapp_stat = dapp_stat.reset_index()
dapp_stat.columns = ['device_id', 'app_len_std', 'app_len_mean', 'app_len_max']
dtime = dtime.reset_index()
dtime_stat = dtime.groupby(['device_id'])['period'].agg({'sum':'sum', 'mean':'mean', 'std':'std', 'max':'max'}).reset_index()
dtime_stat.columns = ['device_id', 'date_sum', 'date_mean', 'date_std', 'date_max']
qtime = qtime.reset_index()
ftime = qtime.pivot(index='device_id', columns='hour', values='period').fillna(0)
ftime.columns = ['h%s'%i for i in range(24)]
ftime.reset_index(inplace=True)
wtime = wtime.reset_index()
weektime = wtime.pivot(index='device_id', columns='dayofweek', values='period').fillna(0)
weektime.columns = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6']
weektime.reset_index(inplace=True)
atime = atime.reset_index()
app = atime.groupby(['device_id'])['period'].idxmax()
user = pd.merge(dapp_stat, dtime_stat, on='device_id', how='left')
user = pd.merge(user, ftime, on='device_id', how='left')
user = pd.merge(user, weektime, on='device_id', how='left')
user = pd.merge(user, atime.iloc[app], on='device_id', how='left')
app_cat = pd.read_table('Demo/package_label.tsv', names=['app', 'category', 'app_name'])
cat_enc = pd.DataFrame(app_cat['category'].value_counts())
cat_enc['idx'] = range(45)
app_cat['cat_enc'] = app_cat['category'].map(cat_enc['idx'])
app_cat.set_index(['app'], inplace=True)
atime['app_cat_enc'] = atime['app'].map(app_cat['cat_enc']).fillna(45)
cat_num = atime.groupby(['device_id', 'app_cat_enc'])['app'].agg('count').reset_index()
cat_time = atime.groupby(['device_id', 'app_cat_enc'])['period'].agg('sum').reset_index()
app_cat_num = cat_num.pivot(index='device_id', columns='app_cat_enc', values='app').fillna(0)
app_cat_num.columns = ['cat%s'%i for i in range(46)]
app_cat_time = cat_time.pivot(index='device_id', columns='app_cat_enc', values='period').fillna(0)
app_cat_time.columns = ['time%s'%i for i in range(46)]
user = pd.merge(user, app_cat_num, on='device_id', how='left')
user = pd.merge(user, app_cat_time, on='device_id', how='left')
del user['app']
print('user_behaviour_feat done')
return user
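# get_user_behaviour_feat thus returns, per device: stats (std/mean/max) of
# the number of distinct apps used per day, total/mean/std/max daily usage
# time, 24 hourly and 7 weekday usage-time pivots, the most-used app's total
# usage time, and per-app-category counts and times (45 known categories plus
# an 'unknown' bucket).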
if __name__ == '__main__':
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table('Demo/deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
start_close = pd.read_table('Demo/deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'])
packages = pd.read_csv('Demo/deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
data_all = pd.concat([train, test], axis=0, ignore_index=True)
print('data done')
top100_statis_feat = get_top100_statis_feat(start_close)
brand_feat = get_brand_feat(brand)
# start_close_tfidf_feat = get_start_close_tfidf_feat(data_all, start_close)
packages_w2c_feat = get_packages_w2c_feat(packages)
week_statis_feat = get_week_statis_feat(start_close)
user_behaviour_feat = get_user_behaviour_feat(start_close)
print('feats done')
data_all = pd.merge(data_all, top100_statis_feat, on='device_id', how='left')
data_all = pd.merge(data_all, brand_feat, on='device_id', how='left')
data_all = pd.merge(data_all, packages_w2c_feat, on='device_id', how='left')
data_all = pd.merge(data_all, week_statis_feat, on='device_id', how='left')
data_all = pd.merge(data_all, user_behaviour_feat, on='device_id', how='left')
print('merge done')
    # Drop the labels
del data_all['age'], data_all['sex']
data_all.to_csv('feature/feat_lwl.csv', index=None)
================================================
FILE: nb_cz_lwl_wcm/13_last_get_all_feature.py
================================================
# -*- coding:utf-8 -*-
import pandas as pd
df_brand = pd.read_csv('feature/deviceid_brand_feature.csv')
df_lr = pd.read_csv('feature/tfidf_lr_error_single_classfiy.csv')
df_pac = pd.read_csv('feature/tfidf_pac_error_single_classfiy.csv')
df_sgd = pd.read_csv('feature/tfidf_sgd_error_single_classfiy.csv')
df_ridge = pd.read_csv('feature/tfidf_ridge_error_single_classfiy.csv')
df_bnb = pd.read_csv('feature/tfidf_bnb_error_single_classfiy.csv')
df_mnb = pd.read_csv('feature/tfidf_mnb_error_single_classfiy.csv')
df_lsvc = pd.read_csv('feature/tfidf_lsvc_error_single_classfiy.csv')
df_lr_2 = pd.read_csv('feature/tfidf_lr_1_3_error_single_classfiy.csv')
df_pac_2 = pd.read_csv('feature/tfidf_pac_1_3_error_single_classfiy.csv')
df_sgd_2 = pd.read_csv('feature/tfidf_sgd_1_3_error_single_classfiy.csv')
df_ridge_2 = pd.read_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv')
df_bnb_2 = pd.read_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv')
df_mnb_2 = pd.read_csv('feature/tfidf_mnb_1_3_error_single_classfiy.csv')
df_lsvc_2 = pd.read_csv('feature/tfidf_lsvc_2_error_single_classfiy.csv')
df_kmeans_2 = pd.read_csv('feature/cluster_2_tfidf_feature.csv')
df_start_close = pd.read_csv('feature/feature_start_close.csv')
df_ling_reg = pd.read_csv('feature/tfidf_ling_reg.csv')
df_par_reg = pd.read_csv('feature/tfidf_par_reg.csv')
df_svr_reg = pd.read_csv('feature/tfidf_svr_reg.csv')
df_w2v = pd.read_csv('feature/w2v_avg.csv')
del df_w2v['DeviceID']
df_best_nn = pd.read_csv('feature/yg_best_nn.csv')
del df_best_nn['DeviceID']
df_chizhu_lgb = pd.read_csv('feature/lgb_feat_chizhu.csv')
del df_chizhu_lgb['DeviceID']
df_chizhu_nn = pd.read_csv('feature/nn_feat.csv')
del df_chizhu_nn['DeviceID']
df_lwl_lgb = pd.read_csv('feature/feat_lwl.csv')
del df_lwl_lgb['DeviceID']
df_feature = pd.concat([
df_brand,
df_lr, df_pac, df_sgd,
df_ridge, df_bnb, df_mnb, df_lsvc,
df_start_close, df_ling_reg, df_par_reg,df_svr_reg,
df_lr_2, df_pac_2, df_sgd_2, df_ridge_2, df_bnb_2, df_mnb_2,
    df_lsvc_2, df_kmeans_2, df_w2v, df_best_nn, df_chizhu_lgb, df_chizhu_nn,
    df_lwl_lgb
], axis=1)
df_feature.to_csv('feature/feature_one.csv', encoding='utf8', index=None)
================================================
FILE: nb_cz_lwl_wcm/1_get_age_reg.py
================================================
# -*- coding:utf-8 -*-
####### Trying some hacky tricks, targeted at this table alone
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier, Ridge, \
PassiveAggressiveRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, LinearSVR
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
test_id = test[0]
def get_label(row):
return row[2]
train['label'] = train.apply(lambda row:get_label(row), axis=1)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
deviceid_packages = pd.read_csv('Demo/deviceid_packages.tsv', sep='\t', header=None)
deviceid_packages = deviceid_packages.rename({0: 'id', 1: 'packages_names'}, axis=1)
package_label = pd.read_csv('Demo/package_label.tsv', sep='\t', header=None)
package_label = package_label.rename({0:'packages_name', 1:'packages_type'},axis=1)
dict_label = dict(zip(list(package_label['packages_name']), list(package_label['packages_type'])))
data_all = pd.merge(data_all, deviceid_packages, on='id', how='left')
feature = pd.DataFrame()
import numpy as np
# number of apps
# possibly a harmful ("toxic") feature?
# feature['app_count'] = data_all['packages_names'].apply(lambda row: len(str(row).split(',')))
# build CountVectorizer and TfidfVectorizer features from this data, combine them, and run several learners
# on the derived count / tf-idf features, run basic machine-learning classification models
data_all['package_str'] = data_all['packages_names'].apply(lambda row: str(row).replace(',', ' '))
def get_more_information(row):
result = ' '
start = True
row_list = row.split(',')
for i in row_list:
try:
if start:
result = dict_label[i]
start = False
else:
result = result + ' ' + dict_label[i]
except KeyError:
pass
return result
data_all['package_str_more_information'] = data_all['packages_names'].apply(lambda row: get_more_information(str(row)))
print(data_all)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
count_vec = CountVectorizer()
count_csr_basic = count_vec.fit_transform(data_all['package_str'])
tfidf_vec = TfidfVectorizer()
tfidf_vec_basic = tfidf_vec.fit_transform(data_all['package_str'])
count_vec = CountVectorizer()
count_csr_more = count_vec.fit_transform(data_all['package_str_more_information'])
tfidf_vec = TfidfVectorizer()
tfidf_vec_more = tfidf_vec.fit_transform(data_all['package_str_more_information'])
data_feature = scipy.sparse.csr_matrix(scipy.sparse.hstack([count_csr_basic, tfidf_vec_basic,
count_csr_more, tfidf_vec_more]))
train_feature = data_feature[:len(train)]
score = train['label']
test_feature = data_feature[len(train):]
number = len(np.unique(score))
X = train_feature
test = test_feature
y = score
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=1017)
kf = kf.split(X)
def xx_mse_s(y_true, y_pre):
return mean_squared_error(y_true, list(y_pre))
######################## ridge reg #########################
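# Out-of-fold stacking: 'stack' collects each fold's validation predictions for the
# train rows, 's' averages the five fold models' test predictions, and concatenating
# stack on top of s preserves the train-then-test row order used by the feature files.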
cv_pred = []
xx_mse = []
stack = np.zeros((len(y),1))
stack_te = np.zeros((len(test_id),1))
model_1 = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, normalize=False, tol=0.01,random_state=1017)
for i ,(train_fold,test_fold) in enumerate(kf):
X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold]
model_1.fit(X_train, label_train)
val_ = model_1.predict(X=X_validate)
stack[test_fold] = np.array(val_).reshape(len(val_),1)
print(xx_mse_s(label_validate, val_))
cv_pred.append(model_1.predict(test))
xx_mse.append(xx_mse_s(label_validate, val_))
import numpy as np
print('xx_result',np.mean(xx_mse))
s = 0
for i in cv_pred:
s = s+i
s = s / n_folds
print(stack)
print(s)
df_stack1 = pd.DataFrame(stack)
df_stack2 = pd.DataFrame(s)
df_stack = pd.concat([df_stack1, df_stack2], axis=0)
df_stack.to_csv('feature/tfidf_ling_reg.csv', encoding='utf8', index=None)
######################## par reg #########################
kf = KFold(n_splits=n_folds, shuffle=True, random_state=1017)
kf = kf.split(X)
cv_pred = []
xx_mse = []
stack = np.zeros((len(y),1))
model_1 = PassiveAggressiveRegressor(fit_intercept=True, max_iter=280, tol=0.01,random_state=1017)
for i ,(train_fold,test_fold) in enumerate(kf):
X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold]
model_1.fit(X_train, label_train)
val_ = model_1.predict(X=X_validate)
stack[test_fold] = np.array(val_).reshape(len(val_),1)
print(xx_mse_s(label_validate, val_))
cv_pred.append(model_1.predict(test))
xx_mse.append(xx_mse_s(label_validate, val_))
import numpy as np
print('xx_result',np.mean(xx_mse))
s = 0
for i in cv_pred:
s = s+i
s = s / n_folds
print(stack)
print(s)
df_stack1 = pd.DataFrame(stack)
df_stack2 = pd.DataFrame(s)
df_stack = pd.concat([df_stack1, df_stack2], axis=0)
df_stack.to_csv('feature/tfidf_par_reg.csv', encoding='utf8', index=None)
######################## svr reg #########################
kf = KFold(n_splits=n_folds, shuffle=True, random_state=1017)
kf = kf.split(X)
cv_pred = []
xx_mse = []
stack = np.zeros((len(y),1))
model_1 = LinearSVR(random_state=1017)
for i ,(train_fold,test_fold) in enumerate(kf):
X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold]
model_1.fit(X_train, label_train)
val_ = model_1.predict(X=X_validate)
stack[test_fold] = np.array(val_).reshape(len(val_),1)
print(xx_mse_s(label_validate, val_))
cv_pred.append(model_1.predict(test))
xx_mse.append(xx_mse_s(label_validate, val_))
import numpy as np
print('xx_result',np.mean(xx_mse))
s = 0
for i in cv_pred:
s = s+i
s = s / n_folds
print(stack)
print(s)
df_stack1 = pd.DataFrame(stack)
df_stack2 = pd.DataFrame(s)
df_stack = pd.concat([df_stack1, df_stack2], axis=0)
df_stack.to_csv('feature/tfidf_svr_reg.csv', encoding='utf8', index=None)
================================================
FILE: nb_cz_lwl_wcm/2_get_feature_brand.py
================================================
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import preprocessing
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
deviced_brand = pd.read_csv('Demo/deviceid_brand.tsv', sep='\t', header=None)
deviced_brand = deviced_brand.rename({0: 'id'}, axis=1)
data_all = pd.merge(data_all, deviced_brand, on='id', how='left')
print(data_all)
# directly label-encode the categorical brand columns
feature = pd.DataFrame()
label_encoder = preprocessing.LabelEncoder()
feature['phone_type'] = label_encoder.fit_transform(data_all[1])
feature['phone_type_detail'] = label_encoder.fit_transform(data_all[2])
feature.to_csv('feature/deviceid_brand_feature.csv', index=False)
================================================
FILE: nb_cz_lwl_wcm/3_get_feature_device_package.py
================================================
# -*- coding:utf-8 -*-
####### experimental tricks, applied to this table only
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
def get_label(row):
if row[1] == 1:
return row[2]
else:
return row[2] + 11
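# maps (sex, age) to a single class in 0..21: ages 0-10 for sex==1, 11-21 for sex==2;
# this 22-class target is what all of the stacking models below predict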
train['label'] = train.apply(lambda row:get_label(row), axis=1)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
deviceid_packages = pd.read_csv('Demo/deviceid_packages.tsv', sep='\t', header=None)
deviceid_packages = deviceid_packages.rename({0: 'id', 1: 'packages_names'}, axis=1)
package_label = pd.read_csv('Demo/package_label.tsv', sep='\t', header=None)
package_label = package_label.rename({0:'packages_name', 1:'packages_type'},axis=1)
# package_label['packages_type'] = package_label.apply(lambda row:row['packages_type'] + ' ' + row[2], axis=1)
dict_label = dict(zip(list(package_label['packages_name']), list(package_label['packages_type'])))
data_all = pd.merge(data_all, deviceid_packages, on='id', how='left')
feature = pd.DataFrame()
import numpy as np
# number of apps
# possibly a harmful ("toxic") feature?
# feature['app_count'] = data_all['packages_names'].apply(lambda row: len(str(row).split(',')))
# build CountVectorizer and TfidfVectorizer features from this data, combine them, and run several learners
# on the derived count / tf-idf features, run basic machine-learning classification models
data_all['package_str'] = data_all['packages_names'].apply(lambda row: str(row).replace(',', ' '))
def get_more_information(row):
result = ' '
start = True
row_list = row.split(',')
for i in row_list:
try:
if start:
result = dict_label[i]
start = False
else:
result = result + ' ' + dict_label[i]
except KeyError:
pass
return result
data_all['package_str_more_information'] = data_all['packages_names'].apply(lambda row: get_more_information(str(row)))
print(data_all)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.cross_validation import StratifiedKFold
count_vec = CountVectorizer()
count_csr_basic = count_vec.fit_transform(data_all['package_str'])
tfidf_vec = TfidfVectorizer()
tfidf_vec_basic = tfidf_vec.fit_transform(data_all['package_str'])
count_vec = CountVectorizer()
count_csr_more = count_vec.fit_transform(data_all['package_str_more_information'])
tfidf_vec = TfidfVectorizer()
tfidf_vec_more = tfidf_vec.fit_transform(data_all['package_str_more_information'])
data_feature = scipy.sparse.csr_matrix(scipy.sparse.hstack([count_csr_basic, tfidf_vec_basic,
count_csr_more, tfidf_vec_more]))
train_feature = data_feature[:len(train)]
score = train['label']
test_feature = data_feature[len(train):]
number = len(np.unique(score))
# five-fold cross-validation
n_folds = 5
print('preprocessing done')
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_lr_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_lr_error_single_classfiy.csv', index=None, encoding='utf8')
print('lr features saved\n')
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_sgd_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_sgd_error_single_classfiy.csv', index=None, encoding='utf8')
print('sgd features saved\n')
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_pac_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_pac_error_single_classfiy.csv', index=None, encoding='utf8')
print('pac features saved\n')
########################### ridge (RidgeClassifier) ################################
print('RidgeClassifier stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_ridge_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_ridge_error_single_classfiy.csv', index=None, encoding='utf8')
print('ridge features saved\n')
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_bnb_error_single_classfiy.csv', index=None, encoding='utf8')
print('BernoulliNB features saved\n')
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_mnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_mnb_error_single_classfiy.csv', index=None, encoding='utf8')
print('MultinomialNB features saved\n')
############################ LinearSVC ################################
print('LinearSVC stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])
score_te = lsvc._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_lsvc_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_lsvc_error_single_classfiy.csv', index=None, encoding='utf8')
print('LSVC features saved\n')
kmeans_result = pd.DataFrame()
###### kmeans ###
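# KMeans cluster ids over the same sparse count/tf-idf matrix, at several values of k,
# saved as extra categorical features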
def get_cluster(num_clusters):
print('starting ' + str(num_clusters))
name = 'kmean'
print(name)
model = KMeans(n_clusters=num_clusters, max_iter=300, n_init=1, \
init='k-means++', n_jobs=10, random_state=1017)
result = model.fit_predict(data_feature)
kmeans_result[name + 'word_' + str(num_clusters)] = result
get_cluster(5)
get_cluster(10)
get_cluster(19)
get_cluster(30)
get_cluster(40)
get_cluster(50)
get_cluster(60)
get_cluster(70)
kmeans_result.to_csv('feature/cluster_tfidf_feature.csv', index=False)
# note: 'feature' has no columns here (the app_count feature above is disabled), so this writes an empty placeholder file
feature.to_csv('feature/deviceid_package_feature.csv', index=False)
================================================
FILE: nb_cz_lwl_wcm/4_get_feature_device_start_close_tfidf_1_2.py
================================================
# -*- coding:utf-8 -*-
import pandas as pd
import scipy.sparse
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
start_close_time = pd.read_csv('Demo/deviceid_package_start_close.tsv', sep='\t', header=None)
start_close_time = start_close_time.rename({0:'id', 1:'app_name', 2:'start_time', 3:'close_time'}, axis=1)
start_close_time = start_close_time.sort_values(by='start_time')
start_close_time['start_time'] = (start_close_time['start_time'] / 1000).astype(int)  # map() returns a lazy iterator on Python 3
start_close_time['close_time'] = (start_close_time['close_time'] / 1000).astype(int)
unique_app_name = np.unique(start_close_time['app_name'])
dict_label = dict(zip(list(unique_app_name), list(np.arange(0, len(unique_app_name), 1))))
import time
start_close_time['app_name'] = start_close_time['app_name'].apply(lambda row: str(dict_label[row]))
del start_close_time['start_time'], start_close_time['close_time']
from tqdm import tqdm, tqdm_pandas
tqdm_pandas(tqdm())
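# Each device's rows (already sorted by start_time above) are joined into one
# space-separated "document" of app-id tokens, so the 1-3 gram vectorizers below can
# capture short launch sequences rather than just app frequencies.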
def dealed_row(row):
app_name_list = list(row['app_name'])
return ' '.join(app_name_list)
data_feature = start_close_time.groupby('id').progress_apply(lambda row:dealed_row(row)).reset_index()
data_feature = pd.merge(data_all, data_feature, on='id', how='left')
del data_feature['id']
count_vec = CountVectorizer(ngram_range=(1,3))
count_csr_basic = count_vec.fit_transform(data_feature[0])
tfidf_vec = TfidfVectorizer(ngram_range=(1,3))
tfidf_vec_basic = tfidf_vec.fit_transform(data_feature[0])
data_feature = scipy.sparse.csr_matrix(scipy.sparse.hstack([count_csr_basic, tfidf_vec_basic]))
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import StratifiedKFold
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
def get_label(row):
if row[1] == 1:
return row[2]
else:
return row[2] + 11
train['label'] = train.apply(lambda row:get_label(row), axis=1)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
train_feature = data_feature[:len(train)]
score = train['label']
test_feature = data_feature[len(train):]
number = len(np.unique(score))
# five-fold cross-validation
n_folds = 5
print('preprocessing done')
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_lr_2_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_lr_1_3_error_single_classfiy.csv', index=None, encoding='utf8')
print('lr features saved\n')
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_2_sgd_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_sgd_1_3_error_single_classfiy.csv', index=None, encoding='utf8')
print('sgd features saved\n')
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_pac_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_pac_1_3_error_single_classfiy.csv', index=None, encoding='utf8')
print('pac features saved\n')
########################### ridge (RidgeClassifier) ################################
print('RidgeClassifier stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_ridge_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv', index=None, encoding='utf8')
print('ridge features saved\n')
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv', index=None, encoding='utf8')
print('BernoulliNB features saved\n')
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_mnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_mnb_1_3_error_single_classfiy.csv', index=None, encoding='utf8')
print('MultinomialNB features saved\n')
================================================
FILE: nb_cz_lwl_wcm/5_get_feature_device_start_close_tfidf.py
================================================
# -*- coding:utf-8 -*-
import pandas as pd
import scipy.sparse
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
start_close_time = pd.read_csv('Demo/deviceid_package_start_close.tsv', sep='\t', header=None)
start_close_time = start_close_time.rename({0:'id', 1:'app_name', 2:'start_time', 3:'close_time'}, axis=1)
start_close_time['start_time'] = (start_close_time['start_time'] / 1000).astype(int)  # map() returns a lazy iterator on Python 3
start_close_time['close_time'] = (start_close_time['close_time'] / 1000).astype(int)
unique_app_name = np.unique(start_close_time['app_name'])
dict_label = dict(zip(list(unique_app_name), list(np.arange(0, len(unique_app_name), 1))))
import time
start_close_time['app_name'] = start_close_time['app_name'].apply(lambda row: str(dict_label[row]))
del start_close_time['start_time'], start_close_time['close_time']
from tqdm import tqdm, tqdm_pandas
tqdm_pandas(tqdm())
def dealed_row(row):
app_name_list = list(row['app_name'])
return ' '.join(app_name_list)
data_feature = start_close_time.groupby('id').progress_apply(lambda row:dealed_row(row)).reset_index()
data_feature = pd.merge(data_all, data_feature, on='id', how='left')
del data_feature['id']
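# Same per-device app-token documents as in script 4 (tfidf_1_2), but without the
# start_time sort and with unigram-only count/tf-idf vectorizers.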
count_vec = CountVectorizer()
count_csr_basic = count_vec.fit_transform(data_feature[0])
tfidf_vec = TfidfVectorizer()
tfidf_vec_basic = tfidf_vec.fit_transform(data_feature[0])
data_feature = scipy.sparse.csr_matrix(scipy.sparse.hstack([count_csr_basic, tfidf_vec_basic]))
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import StratifiedKFold
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
def get_label(row):
if row[1] == 1:
return row[2]
else:
return row[2] + 11
train['label'] = train.apply(lambda row:get_label(row), axis=1)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
train_feature = data_feature[:len(train)]
score = train['label']
test_feature = data_feature[len(train):]
number = len(np.unique(score))
# five-fold cross-validation
n_folds = 5
print('preprocessing done')
########################### lr(LogisticRegression) ################################
print('lr stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
clf = LogisticRegression(random_state=1017, C=8)
clf.fit(train_feature[tr], score[tr])
score_va = clf.predict_proba(train_feature[va])
score_te = clf.predict_proba(test_feature)
print('score ' + str(mean_squared_error(score[va], clf.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_lr_2_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_lr_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('lr features saved\n')
########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
sgd = SGDClassifier(random_state=1017, loss='log')
sgd.fit(train_feature[tr], score[tr])
score_va = sgd.predict_proba(train_feature[va])
score_te = sgd.predict_proba(test_feature)
print('score ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_2_sgd_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_sgd_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('sgd features saved\n')
########################### pac(PassiveAggressiveClassifier) ################################
print('PAC stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
pac = PassiveAggressiveClassifier(random_state=1017)
pac.fit(train_feature[tr], score[tr])
score_va = pac._predict_proba_lr(train_feature[va])
score_te = pac._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], pac.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_pac_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_pac_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('pac features saved\n')
########################### ridge (RidgeClassifier) ################################
print('RidgeClassifier stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
ridge = RidgeClassifier(random_state=1017)
ridge.fit(train_feature[tr], score[tr])
score_va = ridge._predict_proba_lr(train_feature[va])
score_te = ridge._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], ridge.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_ridge_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_ridge_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('ridge features saved\n')
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
bnb = BernoulliNB()
bnb.fit(train_feature[tr], score[tr])
score_va = bnb.predict_proba(train_feature[va])
score_te = bnb.predict_proba(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_bnb_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('BernoulliNB features saved\n')
########################### mnb(MultinomialNB) ################################
print('MultinomialNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
mnb = MultinomialNB()
mnb.fit(train_feature[tr], score[tr])
score_va = mnb.predict_proba(train_feature[va])
score_te = mnb.predict_proba(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], mnb.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_mnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_mnb_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('MultinomialNB features saved\n')
############################ LinearSVC ################################
print('LinearSVC stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
for i, (tr, va) in enumerate(StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
print('stack:%d/%d' % ((i + 1), n_folds))
lsvc = LinearSVC(random_state=1017)
lsvc.fit(train_feature[tr], score[tr])
score_va = lsvc._predict_proba_lr(train_feature[va])
score_te = lsvc._predict_proba_lr(test_feature)
print(score_va)
print('score ' + str(mean_squared_error(score[va], lsvc.predict(train_feature[va]))))
stack_train[va] += score_va
stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
df_stack['tfidf_lsvc_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_lsvc_2_error_single_classfiy.csv', index=None, encoding='utf8')
print('LSVC features saved\n')
kmeans_result = pd.DataFrame()
###### kmeans ###
def get_cluster(num_clusters):
print('starting ' + str(num_clusters))
name = 'kmean'
print(name)
model = KMeans(n_clusters=num_clusters, max_iter=300, n_init=1, \
init='k-means++', n_jobs=10, random_state=1017)
result = model.fit_predict(data_feature)
kmeans_result[name + 'word_' + str(num_clusters)] = result
get_cluster(5)
get_cluster(10)
get_cluster(19)
get_cluster(30)
get_cluster(40)
get_cluster(50)
get_cluster(60)
get_cluster(70)
kmeans_result.to_csv('feature/cluster_2_tfidf_feature.csv', index=False)
================================================
FILE: nb_cz_lwl_wcm/6_get_feature_device_start_close.py
================================================
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import preprocessing
train = pd.read_csv('Demo/deviceid_train.tsv', sep='\t', header=None)
test = pd.read_csv('Demo/deviceid_test.tsv', sep='\t', header=None)
data_all = pd.concat([train, test], axis=0)
data_all = data_all.rename({0:'id'}, axis=1)
del data_all[1],data_all[2]
start_close_time = pd.read_csv('Demo/deviceid_package_start_close.tsv', sep='\t', header=None)
start_close_time = start_close_time.rename({0:'id', 1:'app_name', 2:'start_time', 3:'close_time'}, axis=1)
start_close_time['diff_time'] = (start_close_time['close_time'] - start_close_time['start_time'])/1000
print('start converting timestamps')
import time
start_close_time['close_time'] = start_close_time['close_time'].apply(lambda row: int(time.localtime(row/1000).tm_hour))
start_close_time['start_time'] = start_close_time['start_time'].apply(lambda row: int(time.localtime(row/1000).tm_hour))
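# start_time/close_time now hold the local hour of day (0-23) of each record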
# total number of records per device in this table
print('total records per device')
feature = pd.DataFrame()
feature['start_close_count'] = pd.merge(data_all, start_close_time.groupby('id').size().reset_index(), on='id', how='left')[0]
# usage count between 0:00 and 5:00
temp = start_close_time[(start_close_time['close_time'] >=0)&(start_close_time['close_time'] <=5)]
temp = temp.groupby('id').size().reset_index()
feature['zero_five_count'] = pd.merge(data_all, temp, on='id', how='left').fillna(0)[0]
# label-encode the name of the app with the longest (single-session) play time
def get_max_label(row):
row_name = list(row['app_name'])
row_diff_time = list(row['diff_time'])
return row_name[np.argmax(row_diff_time)]
start_close_max_name = start_close_time.groupby('id').apply(lambda row:get_max_label(row)).reset_index()
label_encoder = preprocessing.LabelEncoder()
feature['start_close_max_name'] = label_encoder.fit_transform(pd.merge(data_all, start_close_max_name, on='id', how='left').fillna(0)[0])
feature.to_csv('feature/feature_start_close.csv', index=False)
================================================
FILE: nb_cz_lwl_wcm/7_get_feature_w2v.py
================================================
from gensim.models import Word2Vec
import pandas as pd
path="Demo/"
packages = pd.read_csv(path+"deviceid_packages.tsv",
sep="\t", names=['id', 'app_list'])
packages['app_count'] = packages['app_list'].apply(
lambda x: len(x.split(",")), 1)
documents = packages['app_list'].values.tolist()
texts = [[word for word in str(document).split(',')] for document in documents]
# frequency = defaultdict(int)
# for text in texts:
# for token in text:
# frequency[token] += 1
# texts = [[token for token in text if frequency[token] >= 5] for text in texts]
w2v = Word2Vec(texts, size=128, window=10, iter=45,
workers=12, seed=1017, min_count=5)
w2v.wv.save_word2vec_format('./w2v_128.txt')
import gensim
import numpy as np
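# Average-pooled word2vec document vectors: each device's feature vector is the mean
# of the 128-d embeddings of its installed apps (zeros for out-of-vocabulary apps).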
def get_w2v_avg(text, w2v_out_path, word2vec_Path):
texts = []
w2v_dim = 128
data = text
# data = pd.read_csv(text_path)
data['app_list'] = data['app_list'].apply(
lambda x: x.strip().split(","), 1)
texts = data['app_list'].values.tolist()
model = gensim.models.KeyedVectors.load_word2vec_format(
word2vec_Path, binary=False)
vocab = model.vocab.keys()
w2v_feature = np.zeros((len(texts), w2v_dim))
w2v_feature_avg = np.zeros((len(texts), w2v_dim))
for i, line in enumerate(texts):
num = 0
if not line:  # empty app list; also avoids division by zero below
w2v_feature_avg[i, :] = np.zeros(w2v_dim)
else:
for word in line:
num += 1
vec = model[word] if word in vocab else np.zeros(w2v_dim)
w2v_feature[i, :] += vec
w2v_feature_avg[i, :] = w2v_feature[i, :] / num
w2v_avg = pd.DataFrame(w2v_feature_avg)
w2v_avg.columns = ['w2v_avg_' + str(i) for i in w2v_avg.columns]
w2v_avg['id'] = data['id']
w2v_avg.to_csv(w2v_out_path, encoding='utf-8', index=None)
return w2v_avg
w2v_feat = get_w2v_avg(packages, "feature/w2v_avg.csv", "w2v_128.txt")
================================================
FILE: nb_cz_lwl_wcm/8_get_feature_lwl.py
================================================
# coding: utf-8
# In[ ]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
import gc
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from skopt.space import Integer, Categorical, Real  # note: skopt.space does not provide Log10, so importing it would fail
from skopt.utils import use_named_args
from skopt import gp_minimize
from gensim.models import Word2Vec, FastText
import gensim
import re
# In[ ]:
test = pd.read_csv('../input/yiguan/demo/Demo/deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv('../input/yiguan/demo/Demo/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table('../input/yiguan/demo/Demo/deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
packtime = pd.read_table('../input/yiguan/demo/Demo/deviceid_package_start_close.tsv',
names=['device_id', 'app', 'start', 'close'])
packages = pd.read_csv('../input/yiguan/demo/Demo/deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
# In[ ]:
packtime['period'] = (packtime['close'] - packtime['start'])/1000
packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
app_use_time = packtime.groupby(['app'])['period'].agg('sum').reset_index()
# try 200 as well?
app_use_top100 = app_use_time.sort_values(by='period', ascending=False)[:100]['app']
device_app_use_time = packtime.groupby(['device_id', 'app'])['period'].agg('sum').reset_index()
use_time_top100_statis = device_app_use_time.set_index('app').loc[list(app_use_top100)].reset_index()
top100_statis = use_time_top100_statis.pivot(index='device_id', columns='app', values='period').reset_index()
# In[ ]:
top100_statis = top100_statis.fillna(0)
# In[ ]:
# phone brand preprocessing
brand['vendor'] = brand['vendor'].astype(str).apply(lambda x : x.split(' ')[0].upper())
brand['ph_ver'] = brand['vendor'] + '_' + brand['version']
ph_ver = brand['ph_ver'].value_counts()
ph_ver_cnt = pd.DataFrame(ph_ver).reset_index()
ph_ver_cnt.columns = ['ph_ver', 'ph_ver_cnt']
brand = pd.merge(left=brand, right=ph_ver_cnt,on='ph_ver')
# In[ ]:
# a bit of handling for the long-tail distribution
mask = (brand.ph_ver_cnt < 100)
brand.loc[mask, 'ph_ver'] = 'other'
train = pd.merge(brand[['device_id', 'ph_ver']], train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'ph_ver']], test, on='device_id', how='right')
train['ph_ver'] = train['ph_ver'].astype(str)
test['ph_ver'] = test['ph_ver'].astype(str)
# label-encode ph_ver
ph_ver_le = preprocessing.LabelEncoder()
train['ph_ver'] = ph_ver_le.fit_transform(train['ph_ver'])
test['ph_ver'] = ph_ver_le.transform(test['ph_ver'])
train['label'] = train['sex'].astype(str) + '-' + train['age'].astype(str)
label_le = preprocessing.LabelEncoder()
train['label'] = label_le.fit_transform(train['label'])
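# 'label' is the 22-class sex-age target ('1-0' ... '2-10'), label-encoded to 0..21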
# In[ ]:
test['sex'] = -1
test['age'] = -1
test['label'] = -1
data = pd.concat([train, test], ignore_index=True)
data.shape
# In[ ]:
ph_ver_dummy = pd.get_dummies(data['ph_ver'])
ph_ver_dummy.columns = ['ph_ver_' + str(i) for i in range(ph_ver_dummy.shape[1])]
# In[ ]:
data = pd.concat([data, ph_ver_dummy], axis=1)
# In[ ]:
del data['ph_ver']
# In[ ]:
train = data[data.sex != -1]
test = data[data.sex == -1]
train.shape, test.shape
# In[ ]:
# total usage count for each app
app_num = packtime['app'].value_counts().reset_index()
app_num.columns = ['app', 'app_num']
packtime = pd.merge(left=packtime, right=app_num, on='app')
# likewise, handle the long tail (tried leaving it alone and other thresholds; 100 scored best)
packtime.loc[packtime.app_num < 100, 'app'] = 'other'
# In[ ]:
# count the number of apps on each device
df_app = packtime[['device_id', 'app']]
apps = df_app.drop_duplicates().groupby(['device_id'])['app'].apply(' '.join).reset_index()
apps['app_length'] = apps['app'].apply(lambda x:len(x.split(' ')))
train = pd.merge(train, apps, on='device_id', how='left')
test = pd.merge(test, apps, on='device_id', how='left')
# In[ ]:
# vectorize the apps installed on each device (a CountVectorizer, despite the 'tfidf' name)
tfidf = CountVectorizer(lowercase=False, min_df=3, stop_words=top100_statis.columns.tolist()[1:7])
tfidf.fit(apps['app'])  # fit the vocabulary only; the original assigned fit_transform() output to a column, which breaks
X_tr_app = tfidf.transform(list(train['app']))
X_ts_app = tfidf.transform(list(test['app']))
# In[ ]:
'''
svd = TruncatedSVD(n_components=100, random_state=42)
X = vstack([X_tr_app, X_ts_app])
svd.fit(X)
X_tr_app = svd.fit_transform(X_tr_app)
X_ts_app = svd.fit_transform(X_ts_app)
X_tr_app = pd.DataFrame(X_tr_app)
X_ts_app = pd.DataFrame(X_ts_app)
X_tr_app.columns = ['app_' + str(i) for i in range(0, 100)]
X_ts_app.columns = ['app_' + str(i) for i in range(0, 100)]
'''
# ### use word2vec to get an embedding representation of each device's installed apps
# In[ ]:
packages['apps'] = packages['apps'].apply(lambda x:x.split(','))
packages['app_length'] = packages['apps'].apply(lambda x:len(x))
# In[ ]:
embed_size = 128
fastmodel = Word2Vec(list(packages['apps']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
embedding_fast.head()
# In[ ]:
id_list = []
for i in range(packages.shape[0]):
id_list += [list(packages['device_id'])[i]]*packages['app_length'].iloc[i]
app_list = [word for item in packages['apps'] for word in item]
app_vect = pd.DataFrame({'device_id':id_list})
app_vect['app'] = app_list
# In[ ]:
app_vect = app_vect.merge(embedding_fast, on='app', how='left')
app_vect = app_vect.drop('app', axis=1)
seqfeature = app_vect.groupby(['device_id']).agg('mean')
seqfeature.reset_index(inplace=True)
# In[ ]:
seqfeature.head()
# ### how long users play their phones across the seven days of the week
# In[ ]:
# packtime['period'] = (packtime['close'] - packtime['start'])/1000
# packtime['start'] = pd.to_datetime(packtime['start'], unit='ms')
packtime['dayofweek'] = packtime['start'].dt.dayofweek
packtime['hour'] = packtime['start'].dt.hour
# packtime = packtime[(packtime['start'] < '2017-03-31 23:59:59') & (packtime['start'] > '2017-03-01 00:00:00')]
# In[ ]:
app_use_time = packtime.groupby(['device_id', 'dayofweek'])['period'].agg('sum').reset_index()
week_app_use = app_use_time.pivot_table(values='period', columns='dayofweek', index='device_id').reset_index()
week_app_use = week_app_use.fillna(0)
week_app_use.columns = ['device_id'] + ['week_day_' + str(i) for i in range(0, 7)]
week_app_use['week_max'] = week_app_use.max(axis=1)
week_app_use['week_min'] = week_app_use.min(axis=1)
week_app_use['week_sum'] = week_app_use.sum(axis=1)
week_app_use['week_std'] = week_app_use.std(axis=1)
'''
for i in range(0, 7):
week_app_use['week_day_' + str(i)] = week_app_use['week_day_' + str(i)] / week_app_use['week_sum']
'''
# In[ ]:
'''
app_use_time = packtime.groupby(['device_id', 'hour'])['period'].agg('sum').reset_index()
hour_app_use = app_use_time.pivot_table(values='period', columns='hour', index='device_id').reset_index()
hour_app_use = hour_app_use.fillna(0)
hour_app_use.columns = ['device_id'] + ['hour_' + str(i) for i in range(0, 24)]
# hour_app_use['hour_max'] = hour_app_use.max(axis=1)
# hour_app_use['hour_min'] = hour_app_use.min(axis=1)
# hour_app_use['hour_sum'] = hour_app_use.sum(axis=1)
# hour_app_use['hour_std'] = hour_app_use.std(axis=1)
# for i in range(0, 24):
# hour_app_use['hour_' + str(i)] = hour_app_use['hour_' + str(i)] / hour_app_use['hour_sum']
'''
# ### merge all the features together
# In[ ]:
train.columns[4:]
# In[ ]:
user_behavior = pd.read_csv('../input/yg-user-behavior/user_behavior.csv')
user_behavior['app_len_max'] = user_behavior['app_len_max'].astype(np.float64)
del user_behavior['app']
train = pd.merge(train, user_behavior, on='device_id', how='left')
test = pd.merge(test, user_behavior, on='device_id', how='left')
# In[ ]:
train = pd.merge(train, seqfeature, on='device_id', how='left')
test = pd.merge(test, seqfeature, on='device_id', how='left')
# In[ ]:
train = pd.merge(train, week_app_use, on='device_id', how='left')
test = pd.merge(test, week_app_use, on='device_id', how='left')
# In[ ]:
'''
app_top50_list = list(packtime.groupby(by='app')['period'].sum().sort_values(ascending=False)[:50].index)
for app in app_top50_list:
app_cnt = packtime[packtime['app'] == app]
start_num_app = app_cnt.groupby(by='device_id')['start'].count().reset_index()
start_num_app.columns = ['device_id', 'start_num_app_' + app[0:4]]
train = train.merge(start_num_app, on='device_id', how='left')
test = test.merge(start_num_app, on='device_id', how='left')
print(app + ' done')
'''
# In[ ]:
'''
# all_top50 : per-user usage-time statistics for the 50 apps with the highest total usage time
all_top50 = pd.read_csv('../input/yg-feature/all_top50_statis.csv')
train = pd.merge(train, all_top50, on='device_id', how='left')
test = pd.merge(test, all_top50, on='device_id', how='left')
'''
# In[ ]:
top100_statis.columns = ['device_id'] + ['top100_statis_' + str(i) for i in range(0, 100)]
train = pd.merge(train, top100_statis, on='device_id', how='left')
test = pd.merge(test, top100_statis, on='device_id', how='left')
# In[ ]:
train.to_csv('train_feature.csv', index=None)
test.to_csv('test_feature.csv', index=None)
# In[ ]:
feats = train.columns[4:]
feats
# In[ ]:
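# The delete(153) below presumably drops a non-numeric column (likely the raw 'app'
# string) so the astype(float) casts further down succeed; feats[153] prints whichever
# column now sits at that index.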
feats = feats.delete(153)
feats[153]
# In[ ]:
'''
train = pd.merge(train, hour_app_use, on='device_id', how='left')
test = pd.merge(test, hour_app_use, on='device_id', how='left')
'''
# In[ ]:
X_train = hstack([X_tr_app, train[feats].astype(float)])
X_test = hstack([X_ts_app, test[feats].astype(float)])
X_train = X_train.tocsr().astype('float')
X_test = X_test.tocsr().astype('float')
# ### start training the models
# In[ ]:
Y = train['sex'] - 1
kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
oof_preds1 = np.zeros((X_train.shape[0], ))
sub1 = np.zeros((X_test.shape[0], ))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train[train_index], X_train[test_index], Y[train_index], Y[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'objective':'binary',
'num_leaves':31,
'subsample': 0.85,
'colsample_bytree': 0.2,
'lambda_l1':0.00007995302080034896,
'lambda_l2':0.0003648648811380991,
'subsample_freq':12,
'learning_rate': 0.012,
'min_child_weight':5.5
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds1[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub1 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# In[ ]:
Y = train['age']
kfold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
oof_preds2 = np.zeros((X_train.shape[0], 11))
sub2 = np.zeros((X_test.shape[0], 11))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train[train_index], X_train[test_index], Y[train_index], Y[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':6,
'metric': {'multi_logloss'},
'num_class':11,
'objective':'multiclass',
'num_leaves':31,
'subsample': 0.9,
'colsample_bytree': 0.2,
'lambda_l1':0.0001,
'lambda_l2':0.00111,
'subsample_freq':10,
'learning_rate': 0.012,
'min_child_weight':10
}
model = lgb.train(params,
dtrain,
num_boost_round=4000,
valid_sets=dvalid,
early_stopping_rounds=100,
verbose_eval=100)
oof_preds2[test_index] = model.predict(X_vl, num_iteration=model.best_iteration)
sub2 += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
# In[ ]:
oof_preds1 = pd.DataFrame(oof_preds1, columns=['sex2'])
oof_preds1['sex1'] = 1-oof_preds1['sex2']
oof_preds2 = pd.DataFrame(oof_preds2, columns=['age%s'%i for i in range(11)])
oof_preds = train[['device_id']]
oof_preds.columns = ['DeviceID']
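# The 22 submission columns are the outer product of the sex and age probabilities,
# i.e. P(sex=s, age=a) is approximated as P(sex=s) * P(age=a).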
for i in ['sex1', 'sex2']:
for j in ['age%s'%i for i in range(11)]:
oof_preds[i+'_'+j] = oof_preds1[i] * oof_preds2[j]
oof_preds.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
oof_preds.to_csv('train.csv', index=False)
# In[ ]:
sub1 = pd.DataFrame(sub1, columns=['sex2'])
sub1['sex1'] = 1-sub1['sex2']
sub2 = pd.DataFrame(sub2, columns=['age%s'%i for i in range(11)])
sub = test[['device_id']]
sub.columns = ['DeviceID']
for i in ['sex1', 'sex2']:
for j in ['age%s'%i for i in range(11)]:
sub[i+'_'+j] = sub1[i] * sub2[j]
sub.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
sub.to_csv('lgb_l_v54.csv', index=False)
# In[ ]:
'''
Y = train['label']
#best params: [31, 11, 0.015955854914003094, 0.12122664084283229, 0.7645440142264772, 24, 1048, 0.00552258737237652, 0.005810068328090833, 7]
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
sub = np.zeros((X_test.shape[0], 22))
for i, (train_index, test_index) in enumerate(kfold.split(X_train, Y)):
X_tr, X_vl, y_tr, y_vl = X_train[train_index], X_train[test_index], Y[train_index], Y[test_index]
dtrain = lgb.Dataset(X_tr, label=y_tr)
dvalid = lgb.Dataset(X_vl, y_vl, reference=dtrain)
params = {
'boosting_type': 'gbdt',
'max_depth':7,
'objective':'multiclass',
'metric': {'multi_logloss'},
'num_class':22,
'num_leaves':20,
'subsample': 0.86,
'colsample_bytree': 0.8,
#'lambda_l1':0.00007995302080034896,
'lambda_l2':0.005,
'subsample_freq':11,
'learning_rate': 0.01,
'min_child_weight':5.5,
}
model = lgb.train(params,
dtrain,
num_boost_round=6000,
valid_sets=dvalid,
early_stopping_rounds=20,
verbose_eval=100)
sub += model.predict(X_test, num_iteration=model.best_iteration)/kfold.n_splits
'''
# In[ ]:
'''
sub = pd.DataFrame(sub)
cols = [x for x in range(0, 22)]
cols = label_le.inverse_transform(cols)
sub.columns = cols
sub['DeviceID'] = test['device_id'].values
sub = sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
sub.to_csv('30.csv', index=False)
'''
================================================
FILE: nb_cz_lwl_wcm/9_yg_best_nn.py
================================================
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
from category_encoders import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.legacy import interfaces  # used by AdamW's @interfaces.legacy_get_updates_support below
from keras.utils import to_categorical
from config import path
packages = pd.read_csv(path+'deviceid_packages.tsv',
sep='\t', names=['device_id', 'apps'])
test = pd.read_csv(path+'deviceid_test.tsv',
sep='\t', names=['device_id'])
train = pd.read_csv(path+'deviceid_train.tsv',
sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table(path+'deviceid_brand.tsv',
names=['device_id', 'vendor', 'version'])
behave = pd.read_csv('data/user_behavior.csv')
brand['phone_version'] = brand['vendor'] + ' ' + brand['version']
train = pd.merge(brand[['device_id', 'phone_version']],
train, on='device_id', how='right')
test = pd.merge(brand[['device_id', 'phone_version']],
test, on='device_id', how='right')
train = pd.merge(train, behave, on='device_id', how='left')
test = pd.merge(test, behave, on='device_id', how='left')
packages['app_lenghth'] = packages['apps'].apply(
lambda x: x.split(',')).apply(lambda x: len(x))
packages['app_list'] = packages['apps'].apply(lambda x: x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
embed_size = 128
fastmodel = Word2Vec(list(packages['app_list']), size=embed_size, window=4, min_count=3, negative=2,
sg=1, sample=0.002, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word]
for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns = ["fdim_%s" %
str(i) for i in range(embed_size)]+["app"]
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
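# Each comma-separated app list is mapped to integer ids and padded (or
# truncated) to maxlen=50, producing the sequence input for the embedding
# layer.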
Y_sex = train['sex']-1
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
if word not in fastmodel.wv.vocab:
continue
embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
X_h = train[['h%s' % i for i in range(24)]].values
X_h_test = test[['h%s' % i for i in range(24)]].values
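# AdamW: Adam with decoupled weight decay (Loshchilov & Hutter, 2017).
# Instead of folding an L2 penalty into the loss, the decay is applied
# directly in the parameter update,
#     p <- p - lr_t * m_t / (sqrt(v_t) + eps) - lr * wd * p,
# so the decay strength is not rescaled by the adaptive step size.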
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
# decoupled weight decay (2/4)
self.wd = K.variable(weight_decay, name='weight_decay')
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
# decoupled weight decay (4/4)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def model_conv1D(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
hin = Input(shape=(24, ))
htime = Dense(6, activation='relu')(hin)
    merge1 = concatenate([gmp1a, gmp2a, gmp3a, gmp5a, htime])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.25)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
model = Model(inputs=[seq, hin], outputs=pred)
model.compile(loss='binary_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
return model
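# 5-fold out-of-fold scheme: each fold restores its best checkpoint, fills
# its held-out slice of oof_pref1, and adds 1/n_splits of its test
# prediction to sub1, so train rows get leak-free probabilities and test
# rows get a fold-averaged ensemble.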
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
print("FOLD | ", count+1)
filepath = "sex_weights_best_%d.h5" % count
checkpoint = ModelCheckpoint(
filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=6, verbose=1, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D(embedding_matrix)
X_tr, X_vl, X_tr2, X_vl2, y_tr, y_vl = X[train_index], X[test_index], X_h[
train_index], X_h[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit([X_tr, X_tr2], y_tr, batch_size=256, epochs=50, validation_data=([X_vl, X_vl2], y_vl),
callbacks=callbacks, verbose=1, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict([X_test, X_h_test]))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict([X_vl, X_vl2])
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1 = pd.DataFrame(sub1, columns=['sex2'])
res1 = pd.concat([oof_pref1, sub1])
res1['sex1'] = 1-res1['sex2']
# res1.to_csv("res1.csv", index=False)
def model_age_conv(embedding_matrix):
# The embedding layer containing the word vectors
K.clear_session()
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False
)
lstm_layer = Bidirectional(
GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))
# 1D convolutions that can iterate over the word vectors
conv1 = Conv1D(filters=128, kernel_size=1,
padding='same', activation='relu',)
conv2 = Conv1D(filters=64, kernel_size=2,
padding='same', activation='relu', )
conv3 = Conv1D(filters=64, kernel_size=3,
padding='same', activation='relu',)
conv5 = Conv1D(filters=32, kernel_size=5,
padding='same', activation='relu',)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm = lstm_layer(emb)
# Run through CONV + GAP layers
conv1a = conv1(lstm)
gap1a = GlobalAveragePooling1D()(conv1a)
gmp1a = GlobalMaxPool1D()(conv1a)
conv2a = conv2(lstm)
gap2a = GlobalAveragePooling1D()(conv2a)
gmp2a = GlobalMaxPool1D()(conv2a)
conv3a = conv3(lstm)
gap3a = GlobalAveragePooling1D()(conv3a)
gmp3a = GlobalMaxPooling1D()(conv3a)
conv5a = conv5(lstm)
gap5a = GlobalAveragePooling1D()(conv5a)
gmp5a = GlobalMaxPooling1D()(conv5a)
merge1 = concatenate([gap1a, gap2a, gap3a, gap5a])
# The MLP that determines the outcome
x = Dropout(0.3)(merge1)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=seq, outputs=pred)
model.compile(loss='categorical_crossentropy',
optimizer=AdamW(weight_decay=0.08,))
return model
Y_age = to_categorical(train['age'])
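# Age labels take the integer values 0..10, so to_categorical produces an
# 11-way one-hot target matching the Dense(11, softmax) head.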
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
count = 0
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
print("FOLD | ", count+1)
filepath2 = "age_weights_best_%d.h5" % count
checkpoint2 = ModelCheckpoint(
filepath2, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(
monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
earlystopping2 = EarlyStopping(
monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
X_tr, X_vl, y_tr, y_vl = X[train_index], X[test_index], Y_age[train_index], Y_age[test_index]
model_age = model_age_conv(embedding_matrix)
hist = model_age.fit(X_tr, y_tr, batch_size=256, epochs=50, validation_data=(X_vl, y_vl),
callbacks=callbacks2, verbose=2, shuffle=True)
model_age.load_weights(filepath2)
oof_pref2[test_index] = model_age.predict(X_vl)
sub2 += model_age.predict(X_test)/kfold.n_splits
score.append(np.min(hist.history['val_loss']))
count += 1
print('log loss:', np.mean(score))
res2_1 = np.vstack((oof_pref2, sub2))
res2_1 = pd.DataFrame(res2_1)
# res2_1.to_csv("res2.csv", index=False)
res1.index = range(len(res1))
res2_1.index = range(len(res2_1))
final_1 = res2_1.copy()
final_2 = res2_1.copy()
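# Combine the two heads into 22 joint classes under an independence
# assumption: P(sex=s, age=a) is approximated by P(sex=s) * P(age=a),
# filling the '1-0'...'1-10' (sex=1) and '2-0'...'2-10' (sex=2) columns.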
for i in range(11):
final_1[i] = res1['sex1']*res2_1[i]
final_2[i] = res1['sex2']*res2_1[i]
id_list = pd.concat([train[['device_id']], test[['device_id']]])
final = id_list
final.index = range(len(final))
final.columns = ['DeviceID']
final_pred = pd.concat([final_1, final_2], axis=1)
final = pd.concat([final, final_pred], axis=1)
final.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
final.to_csv('feature/yg_best_nn.csv', index=False)
================================================
FILE: nb_cz_lwl_wcm/运行说明.txt
================================================
The Demo folder holds the raw dataset.
Run the scripts in numerical order (1, 2, 3, ...); the last step writes feature_nurbs.csv to the feature folder.
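
A minimal driver sketch for the above (illustrative only: it assumes every
step script lives in this folder and starts with its step number):

import glob
import re
import subprocess

# Gather the numbered scripts and run them in ascending numeric order.
scripts = sorted(glob.glob('[0-9]*.py'),
                 key=lambda name: int(re.match(r'(\d+)', name).group(1)))
for script in scripts:
    print('running', script)
    subprocess.run(['python', script], check=True)  # stop on the first failure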
================================================
FILE: wangcanming/deepnet_v33.py
================================================
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime,timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# %matplotlib inline
#add
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.legacy import interfaces  # the AdamW class below relies on @interfaces.legacy_get_updates_support
from keras.utils import to_categorical
packages = pd.read_csv('../input/yiguan/demo/Demo/deviceid_packages.tsv', sep='\t', names=['device_id', 'apps'])
test = pd.read_csv('../input/yiguan/demo/Demo/deviceid_test.tsv', sep='\t', names=['device_id'])
train = pd.read_csv('../input/yiguan/demo/Demo/deviceid_train.tsv', sep='\t', names=['device_id', 'sex', 'age'])
brand = pd.read_table('../input/yiguan/demo/Demo/deviceid_brand.tsv', names=['device_id', 'vendor', 'version'])
packages['app_length'] = packages['apps'].apply(lambda x: len(x.split(',')))
packages['app_list'] = packages['apps'].apply(lambda x:x.split(','))
train = pd.merge(train, packages, on='device_id', how='left')
test = pd.merge(test, packages, on='device_id', how='left')
embed_size = 128
fastmodel = Word2Vec(list(packages['app_list']), size=embed_size, window=4, min_count=1, negative=2,
sg=1, sample=0.001, hs=1, workers=4)
embedding_fast = pd.DataFrame([fastmodel[word] for word in (fastmodel.wv.vocab)])
embedding_fast['app'] = list(fastmodel.wv.vocab)
embedding_fast.columns= ["fdim_%s" % str(i) for i in range(embed_size)]+["app"]
tokenizer = Tokenizer(lower=False, char_level=False, split=',')
tokenizer.fit_on_texts(list(packages['apps']))
X_seq = tokenizer.texts_to_sequences(train['apps'])
X_test_seq = tokenizer.texts_to_sequences(test['apps'])
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)
Y_sex = train['sex']-1
max_features = 35001
embedding_matrix = np.zeros((max_features, embed_size))
for word in tokenizer.word_index:
if word not in fastmodel.wv.vocab:
continue
embedding_matrix[tokenizer.word_index[word]] = fastmodel[word]
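# Apps absent from the Word2Vec vocabulary (and index 0, which is reserved
# for padding) keep all-zero rows in the embedding matrix.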
class AdamW(Optimizer):
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4, # decoupled weight decay (1/4)
epsilon=1e-8, decay=0., **kwargs):
super(AdamW, self).__init__(**kwargs)
with K.name_scope(self.__class__.__name__):
self.iterations = K.variable(0, dtype='int64', name='iterations')
self.lr = K.variable(lr, name='lr')
self.beta_1 = K.variable(beta_1, name='beta_1')
self.beta_2 = K.variable(beta_2, name='beta_2')
self.decay = K.variable(decay, name='decay')
self.wd = K.variable(weight_decay, name='weight_decay') # decoupled weight decay (2/4)
self.epsilon = epsilon
self.initial_decay = decay
@interfaces.legacy_get_updates_support
def get_updates(self, loss, params):
grads = self.get_gradients(loss, params)
self.updates = [K.update_add(self.iterations, 1)]
wd = self.wd # decoupled weight decay (3/4)
lr = self.lr
if self.initial_decay > 0:
lr *= (1. / (1. + self.decay * K.cast(self.iterations,
K.dtype(self.decay))))
t = K.cast(self.iterations, K.floatx()) + 1
lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
(1. - K.pow(self.beta_1, t)))
ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
self.weights = [self.iterations] + ms + vs
for p, g, m, v in zip(params, grads, ms, vs):
m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p # decoupled weight decay (4/4)
self.updates.append(K.update(m, m_t))
self.updates.append(K.update(v, v_t))
new_p = p_t
# Apply constraints.
if getattr(p, 'constraint', None) is not None:
new_p = p.constraint(new_p)
self.updates.append(K.update(p, new_p))
return self.updates
def get_config(self):
config = {'lr': float(K.get_value(self.lr)),
'beta_1': float(K.get_value(self.beta_1)),
'beta_2': float(K.get_value(self.beta_2)),
'decay': float(K.get_value(self.decay)),
'weight_decay': float(K.get_value(self.wd)),
'epsilon': self.epsilon}
base_config = super(AdamW, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def model_conv1D_sex(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm_layer = Bidirectional(GRU(128, recurrent_dropout=0.15, dropout=0.15,))
lstm = lstm_layer(emb)
translate = TimeDistributed(Dense(128, activation='relu'))
t1 = translate(emb)
t1 = TimeDistributed(Dropout(0.15))(t1)
sum_op = Lambda(lambda x: K.sum(x, axis=1), output_shape=(128,))
t1 = sum_op(t1)
merge1 = concatenate([lstm, t1])
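    # Two complementary views of the app list are merged: the BiGRU output
    # summarizes the sequence, while the TimeDistributed Dense + sum branch
    # is an order-invariant bag-of-apps representation.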
# The MLP that determines the outcome
x = Dropout(0.24)(merge1)
#x = BatchNormalization()(x)
#x = Dense(200, activation='relu',)(x)
#x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)
# model = Model(inputs=[seq1, seq2, magic_input, distance_input], outputs=pred)
model = Model(inputs=seq, outputs=pred)
model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.1,))###
return model
kfold = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
sub1 = np.zeros((X_test.shape[0], ))
oof_pref1 = np.zeros((X.shape[0], 1))
score = []
for i, (train_index, test_index) in enumerate(kfold.split(X, Y_sex)):
filepath="weights_best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.0001, verbose=2)
earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=8, verbose=2, mode='auto')
callbacks = [checkpoint, reduce_lr, earlystopping]
model_sex = model_conv1D_sex(embedding_matrix)
X_tr, X_vl, y_tr, y_vl = X[train_index], X[test_index], Y_sex[train_index], Y_sex[test_index]
hist = model_sex.fit(X_tr, y_tr, batch_size=512, epochs=50, validation_data=(X_vl, y_vl),
callbacks=callbacks, verbose=2, shuffle=True)
model_sex.load_weights(filepath)
sub1 += np.squeeze(model_sex.predict(X_test))/kfold.n_splits
oof_pref1[test_index] = model_sex.predict(X_vl)
score.append(np.min(hist.history['val_loss']))
print('log loss:',np.mean(score))
def model_age_conv(embedding_matrix):
K.clear_session()
# The embedding layer containing the word vectors
emb_layer = Embedding(
input_dim=embedding_matrix.shape[0],
output_dim=embedding_matrix.shape[1],
weights=[embedding_matrix],
input_length=maxlen,
trainable=False)
# Define inputs
seq = Input(shape=(maxlen,))
# Run inputs through embedding
emb = emb_layer(seq)
lstm_layer = Bidirectional(GRU(128, recurrent_dropout=0.15, dropout=0.15,))
lstm = lstm_layer(emb)
translate = TimeDistributed(Dense(128, activation='relu'))
t1 = translate(emb)
t1 = TimeDistributed(Dropout(0.15))(t1)
sum_op = Lambda(lambda x: K.sum(x, axis=1), output_shape=(128,))
t1 = sum_op(t1)
merge1 = concatenate([lstm, t1])
# The MLP that determines the outcome
x = Dropout(0.24)(merge1)
#x = BatchNormalization()(x)
#x = Dense(200, activation='relu',)(x)
#x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
x = Dense(200, activation='relu',)(x)
x = Dropout(0.22)(x)
x = BatchNormalization()(x)
pred = Dense(11, activation='softmax')(x)
model = Model(inputs=seq, outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer=AdamW(weight_decay=0.1,))
return model
Y_age = to_categorical(train['age'])
sub2 = np.zeros((X_test.shape[0], 11))
oof_pref2 = np.zeros((X.shape[0], 11))
score = []
for i, (train_index, test_index) in enumerate(kfold.split(X, train['age'])):
filepath2="weights_best2.h5"
checkpoint2 = ModelCheckpoint(filepath2, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
reduce_lr2 = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.0001, verbose=2)
earlystopping2 = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=8, verbose=2, mode='auto')
callbacks2 = [checkpoint2, reduce_lr2, earlystopping2]
model_age = model_age_conv(embedding_matrix)
X_tr, X_vl, y_tr, y_vl = X[train_index], X[test_index], Y_age[train_index], Y_age[test_index]
hist = model_age.fit(X_tr, y_tr, batch_size=512, epochs=50, validation_data=(X_vl, y_vl),
callbacks=callbacks2, verbose=2, shuffle=True)
model_age.load_weights(filepath2)
sub2 += model_age.predict(X_test)/kfold.n_splits
oof_pref2[test_index] = model_age.predict(X_vl)
score.append(np.min(hist.history['val_loss']))
print('log loss:',np.mean(score))
sub1 = pd.DataFrame(sub1, columns=['sex2'])
oof_pref1 = pd.DataFrame(oof_pref1, columns=['sex2'])
sub1['sex1'] = 1-sub1['sex2']
oof_pref1['sex1'] = 1-oof_pref1['sex2']
sub2 = pd.DataFrame(sub2, columns=['age%s'%i for i in range(11)])
oof_pref2 = pd.DataFrame(oof_pref2, columns=['age%s'%i for i in range(11)])
sub = test[['device_id']].copy()
sub.columns = ['DeviceID']
oof = train[['device_id']].copy()
oof.columns = ['DeviceID']
for i in ['sex1', 'sex2']:
    for j in ['age%s' % k for k in range(11)]:
sub[i+'_'+j] = sub1[i]*sub2[j]
oof[i+'_'+j] = oof_pref1[i]*oof_pref2[j]
sub.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
oof.columns = ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6',
'1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4',
'2-5', '2-6', '2-7', '2-8', '2-9', '2-10']
sub.to_csv('deepnet_v33.csv', index=False)
oof.to_csv('deepnet_oof_v33.csv', index=False)
df_stack = pd.concat([oof, sub])
df_stack.to_csv('feature_wcm.csv')
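# feature_wcm.csv stacks the out-of-fold train predictions on top of the
# averaged test predictions (train rows first, then test rows), to be
# consumed as stacking features downstream.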