Repository: yanshengli/tianchi_bigdata Branch: master Commit: 36aa99234c93 Files: 11 Total size: 20.0 KB Directory structure: gitextract_wuij7jog/ ├── README.md ├── classify_user_item.py ├── combine_feature_txt.py ├── cut_data_set.py ├── fetch_feature.py ├── fetch_negative_sample.py ├── fetch_sample.py ├── get_feature_vector_txt_4.py ├── get_recommend_result_6.py ├── global_feature.py └── produt_test_data.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # tianchi_bigdata 任务: [详见天池大数据任务介绍](http://tianchi.aliyun.com/competition/information.htm?spm=0.0.0.0.y1LXeD&raceId=1) 特征(39维): user特征、item特征、user-item特征、全局比例特征 数据采样 采用移动窗口target(17、15、13、11、9)+移动窗口样本采样(1、3、7、全部) 训练数据 正样本:15000,负样本:130000 测试数据 同样采用移动窗口变换采样,取了3天、5天、9天的做实验,最优提交为9天的,测试样本大小:155万 结果划分 结果最终取置信度0.78,取470条结果(子集结果),最终f1值:11.46% 排名:25/7200,队伍名:叮当 学习模型 RF 程序架构 combine_feature_txt:混合正负样本特征 cut_data_set.py:按照移动窗口方式,分割数据集 fetch_feature.py:提取特征 fetch_negative_sample:负样本抽样 fetch_sample:提取正、负样本 get_feature_vector_txt_4.py:提取特征向量,去掉用户-商品标示 get_recommend_result_6.py:对最后分类结果取置信度,并得到相应的推荐结果 global_feature.py:提取全局比例特征 product_test_data.py:产生测试数据 classify_user_item.py:训练学习特征,并预测 [大赛排名] (http://tianchi.aliyun.com/competition/rankingList.htm?spm=0.0.0.0.OyeBsu&season=0&raceId=1&pageIndex=2) ================================================ FILE: classify_user_item.py ================================================ __author__ = 'LiGe' #encoding:utf-8 from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier import numpy as np f = open("train_sample.txt") f.readline() data = np.loadtxt(f) X = data[:, :-1] # select columns 1 through end y = data[:, -1] # select column 0, the stock price print X print y print 'start train' clf2 = RandomForestClassifier(n_estimators=100) #clf2=GradientBoostingClassifier() clf2.fit(X,y) #clf2 = LogisticRegression().fit(X, y) print clf2.classes_ f1=open("test_data_9feature.txt") data1=np.loadtxt(f1) X_new=data1[:,:] print 'testing data is ok' result=clf2.predict_proba(X_new) print 'output result' print result f_result=open('result9.txt','w') for i in range(0,len(result)): f_result.write(str(result[i])+'\n') ================================================ FILE: combine_feature_txt.py ================================================ __author__ = 'LiGe' #encoding:utf-8 i=9 f1=open('combine_txt_positive_feature.txt','wb') while (i<18): filename='2014-12-'+str(i) filename=filename+'_positive_user_item_feature.txt' f=open(filename,'r') lines=f.readlines() for line in lines: f1.write(line) i=i+1 ================================================ FILE: cut_data_set.py ================================================ __author__ = 'LiGe' #encoding:utf-8 import csv reader=csv.reader(file('filter_user.csv', 'rb')) csvfile = file('9_1_data.csv', 'wb') writer1=csv.writer(csvfile) csvfile = file('9_3_data.csv', 'wb') writer2=csv.writer(csvfile) csvfile = file('9_7_data.csv', 'wb') writer3=csv.writer(csvfile) csvfile = file('9_all_data.csv', 'wb') writer4=csv.writer(csvfile) ##############################################取样本子集,后三天的作为验证日期,正样本10000个,负样本取100000个,最后对样本子集空间进行预测###################################### #################################取的是样本全集####################################################################### positive_user_item=set() num=0 for line in reader: if num==0: num=num+1 continue time_s=line[5].split(' ') time_slot=time_s[0].split('-') month=int(time_slot[1]) day=int(time_slot[2]) dis_day=(12-month)*30+(19-day) if dis_day>=10 and dis_day<=11 : writer1.writerow(line) if dis_day>=10 and dis_day<=13: writer2.writerow(line) if dis_day>=10 and dis_day<=17: writer3.writerow(line) if dis_day>=10 : writer4.writerow(line) num=num+1 ================================================ FILE: fetch_feature.py ================================================ __author__ = 'LiGe' #encoding:utf-8 ##################抽取如下特征,浏览数、收藏数、购物车、购买数、平均活跃天数、最后活跃天数距离最终时间的天数,先不考虑平均活跃天数#### import csv def fetch_feature(sample_filename,feature_filename,item_brand): reader=csv.reader(file(sample_filename, 'rb')) csvfile = file(feature_filename, 'wb') writer=csv.writer(csvfile) ###################################定义统计变量########################################### user_item_click=dict()#(u,i)点击次数 usr_item_hide=dict()#(u,i)收藏次数 usr_item_shop_basket=dict()#(u,i)购物车次数 num=0 user_item_pair=set()#(u,i)对 user_basket=dict()#(u)购物车件数 usr_item_shop=dict()#(用户-item购买次数) item_num=dict() item_user=dict() item_click=dict() item_basket=dict() item_hide=dict() user_buy_brand=dict() user_buy_item_brand=dict() user_item_num=dict() user_brand=dict() user_buy=dict() user_click=dict() user_hide=dict() user_item_time=dict() user_click_item_brand=dict() user_basket_item_brand=dict() catogery_buy=dict() catogery_click=dict() catogery_basket=dict() catogery_hide=dict() ###################################初始化############################# for line in reader: if line[5].find('2014-12-17')<0: item_brand[line[1]]=line[4] user_hide[line[0]]=0 user_click[line[0]]=0 user_buy[line[0]]=0 user_brand[line[0]]=set() user_item_num[line[0]]=set() user_item_click[(line[0],line[1])]=0 usr_item_hide[(line[0],line[1])]=0 usr_item_shop_basket[(line[0],line[1])]=0 user_item_pair.add((line[0],line[1])) usr_item_shop[(line[0],line[1])]=0 item_num[line[1]]=0 item_user[line[1]]=set() item_click[line[1]]=0 item_basket[line[1]]=0 item_hide[line[1]]=0 user_buy_brand[line[0]]=set() user_buy_item_brand[(line[0],item_brand[line[1]])]=0 user_click_item_brand[(line[0],item_brand[line[1]])]=0 user_basket_item_brand[(line[0],item_brand[line[1]])]=0 user_basket[line[0]]=0 catogery_buy[item_brand[line[1]]]=0 catogery_click[item_brand[line[1]]]=0 catogery_basket[item_brand[line[1]]]=0 catogery_hide[item_brand[line[1]]]=0 num=num+1 #####################################统计特征############################################ for line in csv.reader(file(sample_filename, 'rb')): if line[5].find('2014-12-17')<0: time_s=line[5].split(' ') time_slot=time_s[0].split('-') month=int(time_slot[1]) day=int(time_slot[2]) dis_day=(12-month)*30+(17-day)####间隔时间 if (line[0],line[1]) not in user_item_time: user_item_time[line[0],line[1]]=set() user_item_time[line[0],line[1]].add(dis_day) else: user_item_time[line[0],line[1]].add(dis_day) if line[2]=='1': #################用户对该商品的点击总数############################ if (line[0],line[1]) not in user_item_click: user_item_click[(line[0],line[1])]=1 else: user_item_click[(line[0],line[1])]=1+user_item_click[(line[0],line[1])] ####用户点击总数####### user_click[line[0]]=user_click[line[0]]+1 ###############统计点击次数############################### #########################统计用户对该商品所对应类型的次数####################### user_click_item_brand[(line[0],item_brand[line[1]])]=1+user_click_item_brand[(line[0],item_brand[line[1]])] #########################商品对应的种类被点击的次数############# catogery_click[item_brand[line[1]]]=catogery_click[item_brand[line[1]]]+1 ###################商品被点击的总数#################### item_click[line[1]]=item_click[line[1]]+1 if line[2]=='2': if (line[0],line[1]) not in usr_item_hide: usr_item_hide[(line[0],line[1])]=1 else: usr_item_hide[(line[0],line[1])]=1+usr_item_hide[(line[0],line[1])] #############用户收藏总数################ user_hide[line[0]]=user_hide[line[0]]+1 ################商品类型被收藏的次数############ catogery_hide[item_brand[line[1]]]=catogery_hide[item_brand[line[1]]]+1 ################商品被加入收藏的次数############ item_hide[line[1]]=item_hide[line[1]]+1 if line[2]=='3': ################(u,i)加入购物车的次数############## if (line[0],line[1]) not in usr_item_shop_basket: usr_item_shop_basket[(line[0],line[1])]=1 else: usr_item_shop_basket[(line[0],line[1])]=1+usr_item_shop_basket[(line[0],line[1])] ############用户加入购物车的总数####################### user_basket[line[0]]=user_basket[line[0]]+1 #########################统计用户对该商品所对应类型的购物车次数################### user_basket_item_brand[(line[0],item_brand[line[1]])]=1+user_basket_item_brand[(line[0],item_brand[line[1]])] ########################商品种类被加入购物车的次数##################### catogery_basket[item_brand[line[1]]]=catogery_basket[item_brand[line[1]]]+1 ################商品被加入购物车的次数############ item_basket[line[1]]=item_basket[line[1]]+1 if line[2]=='4': ##############################该用户购买该商品的次数############################ if (line[0],line[1]) not in usr_item_shop: usr_item_shop[(line[0],line[1])]=1 else: usr_item_shop[(line[0],line[1])]=usr_item_shop[(line[0],line[1])]+1 #############用户购买商品的总次数######################### user_buy[line[0]]=user_buy[line[0]]+1 ###########################统计该商品被购买的次数############################## item_num[line[1]]=item_num[line[1]]+1 ###############商品被多少人购买#################### item_user[line[1]].add((line[0])) ##########################种类被购买的次数###################### catogery_buy[item_brand[line[1]]]=catogery_buy[item_brand[line[1]]]+1 ################用户购买商品类型的总数############ user_buy_brand[line[0]].add(item_brand[line[1]]) ####################用户购买该类型商品种类的数目########### user_buy_item_brand[(line[0],item_brand[line[1]])]=1+user_buy_item_brand[(line[0],item_brand[line[1]])] #############################用户交互的商品数################ user_item_num[line[0]].add((line[1])) ############################用户交互的商品品牌数#################### user_brand[line[0]].add(item_brand[line[1]]) #####################################写结果################################################################## for k in user_item_pair: ####################用户交互的商品数与购买的商品数之比###################### if user_buy[k[0]]!=0: comm_item_ratio=float("%.2f"%(len(user_item_num[k[0]])/user_buy[k[0]])) else: comm_item_ratio=0 #################用户交互的商品品牌数与购买的商品品牌数之比################## if len(user_buy_brand[k[0]])!=0: comm_brand_buy_ratio=float("%.2f"%(len(user_brand[k[0]])/len(user_buy_brand[k[0]]))) else: comm_brand_buy_ratio=0 ###################该类型商品点击与购买的比例############### if catogery_buy[item_brand[k[1]]]!=0: catogry_click_buy=float("%.2f"%(catogery_click[item_brand[k[1]]]/catogery_buy[item_brand[k[1]]])) else: catogry_click_buy=0 ###################该类型商品加入购物车与购买的比例############### if catogery_buy[item_brand[k[1]]]!=0: catogry_basket_buy=float("%.2f"%(catogery_basket[item_brand[k[1]]]/catogery_buy[item_brand[k[1]]])) else: catogry_basket_buy=0 ####购买该商品所对应的类型占总的购买量的比例######################## if user_buy[k[0]]!=0: buy_catogry_ratio=float("%.2f"%(user_buy_item_brand[(k[0],item_brand[k[1]])]/user_buy[k[0]])) else: buy_catogry_ratio=0 ##########################################点击该商品所对应的类型占总的点击量的比例#################### if user_click[k[0]]!=0: click_catogry_ratio=float("%.2f"%(user_click_item_brand[(k[0],item_brand[k[1]])]/user_click[k[0]])) else: click_catogry_ratio=0 #########################################购物车该商品所对应的类型占总的购物车的比例###################### if user_basket[k[0]]!=0: basket_catogry_ratio=float("%.2f"%(user_basket_item_brand[(k[0],item_brand[k[1]])]/user_basket[k[0]])) else: basket_catogry_ratio=0 ####用户点击购买比例############### if user_buy[k[0]]!=0: click_buy_user_ratio=float("%.2f"%(user_click[k[0]]/user_buy[k[0]])) else: click_buy_user_ratio=0 ######用户-商品对购物车与购买的比例#################### if usr_item_shop[k]!=0: basket_buy_ratio=float("%.2f"%(usr_item_shop_basket[k]/usr_item_shop[k])) else: basket_buy_ratio=0 ##########用户-商品点击与购物车的比例###### if usr_item_shop_basket[k]!=0: click_basket=float("%.2f"%(user_item_click[k]/usr_item_shop_basket[k])) else: click_basket=0 ##################用户购物车与购买的比例##################### if user_buy[k[0]]!=0: basket_buy_user_ratio=float("%.2f"%(user_basket[k[0]]/user_buy[k[0]])) else: basket_buy_user_ratio=0 #################用户点击与购物车的比例################### if user_basket[k[0]]!=0: ratio_click_basket=float("%.2f"%(user_click[k[0]]/user_basket[k[0]])) else: ratio_click_basket=0 ######################用户收藏与购物的比例################# if user_buy[k[0]]!=0: ratio_hide_buy=float("%.2f"%(user_hide[k[0]]/user_buy[k[0]])) else: ratio_hide_buy=0 ######################该类型商品收藏与购买的比例################# if catogery_buy[item_brand[k[1]]]!=0: catogry_hide_buy=float("%.2f"%(catogery_hide[item_brand[k[1]]]/catogery_buy[item_brand[k[1]]])) else: catogry_hide_buy=0 ###################用户最早接触该物品的时间以及最晚接触该物品的时间##################### sort_user_item_time=list(user_item_time[k]) eraliest_time=sort_user_item_time[-1] latest_time=sort_user_item_time[0] writer.writerow((k[0],k[1],user_item_click[k],user_click[k[0]],usr_item_hide[k],user_hide[k[0]],\ usr_item_shop_basket[k],user_basket[k[0]],usr_item_shop[k],user_buy[k[0]],item_num[k[1]],len(item_user[k[1]]),\ len(user_buy_brand[k[0]]),user_buy_item_brand[(k[0],item_brand[k[1]])],len(user_item_num[k[0]]),len(user_brand[k[0]]), user_click_item_brand[(k[0],item_brand[k[1]])],user_basket_item_brand[(k[0],item_brand[k[1]])],catogery_click[item_brand[k[1]]], catogery_hide[item_brand[k[1]]],catogery_basket[item_brand[k[1]]],catogery_buy[item_brand[k[1]]],item_click[k[1]], item_hide[k[1]],item_basket[k[1]], buy_catogry_ratio,click_buy_user_ratio,basket_buy_ratio,click_basket,basket_buy_user_ratio,ratio_hide_buy, ratio_click_basket,click_catogry_ratio,basket_catogry_ratio,catogry_click_buy,catogry_basket_buy, catogry_hide_buy,comm_item_ratio,comm_brand_buy_ratio,eraliest_time,latest_time)) ####################39维特征################################## if __name__=='__main__': item_brand=dict() fetch_feature('./17/17_1_data.csv',\ './17/17_1_data_feature.csv',item_brand) ================================================ FILE: fetch_negative_sample.py ================================================ __author__ = 'LiGe' #encoding:utf-8 import csv import random num=1 csvfile = file('sample_17_negative_user.csv', 'wb') writer=csv.writer(csvfile) for line in csv.reader(file('17_negative.csv','r')): if num%200==0: writer.writerow(line) num=num+1 print num ================================================ FILE: fetch_sample.py ================================================ __author__ = 'LiGe' #encoding:utf-8 import csv import os buy=set() for line in csv.reader(file('./17/17_1_data.csv','rb')): if line[5].find('2014-12-17')>=0: if line[2]=='4': buy.add((line[0],line[1])) csvfile = file('17_negative.csv', 'wb') writer=csv.writer(csvfile) files=os.listdir('./17/') for filename in files: if filename.find('feature')>=0: for line in csv.reader(file('./17/'+filename,'rb')): if (line[0],line[1]) not in buy: writer.writerow(line) ================================================ FILE: get_feature_vector_txt_4.py ================================================ __author__ = 'LiGe' #encoding:Utf-8 import csv ##########################加类标,去用户-商品名,取纯特征文档############################# def put_on_label(feature_csv,feature_txt_label): f=open(feature_txt_label,'w') for line in csv.reader(file(feature_csv, 'rb')): f.write(line[2]+' '+line[3]+' '+line[4]+' '+line[5]+' '+line[6] +' '+line[7]+line[8] +' '+line[9] +' '+line[10] +' '+line[11] +' '+line[12] +' '+line[13] +' '+line[14] +' '+line[15] +' '+line[16] +' '+line[17] +' '+line[18] +' '+line[19] +' '+line[20] +' '+line[21] +' '+line[22] +' '+line[23] +' '+line[24] +' '+line[25] +' '+line[26] +' '+line[27] +' '+line[28] +' '+line[29] +' '+line[30] +' '+line[31] +' '+line[32] +' '+line[33] +' '+line[34] +' '+line[35] +' '+line[36] +' '+line[37] +' '+line[38] +' '+line[39] +' '+line[40] +'\n') if __name__=="__main__": feature_csv='global_test_data_feature.csv' feature_txt_lable='test_data_9feature.txt' put_on_label(feature_csv,feature_txt_lable) ================================================ FILE: get_recommend_result_6.py ================================================ __author__ = 'LiGe' #encoding:utf-8 import csv def get_result(input_source_result_txt,output_final_result_csv,test_sourc_file_csv): result_line_num=set() f=open(input_source_result_txt,'r') lines=f.readlines() num=1 for line in lines: linedata=line[1:-2].strip() data=linedata.split(' ') #print data[1] if float(data[1].strip())>0.78: result_line_num.add(num) num=num+1 print len(result_line_num) csvfile = file(output_final_result_csv, 'wb') writer=csv.writer(csvfile) num=1 suspect_user_item=set() for line in csv.reader(file(test_sourc_file_csv, 'rb')): if num in result_line_num: suspect_user_item.add((line[0],line[1])) #writer.writerow((line[0],line[1])) num=num+1 sub_item=set() for line in csv.reader(file('tianchi_mobile_recommend_train_item.csv','rb')): sub_item.add(line[0]) count=0 for k in suspect_user_item: if k[1] in sub_item: writer.writerow((k[0],k[1])) count=count+1 print count if __name__=='__main__': input_source_result_txt='result9.txt' output_final_result_csv='tianchi_mobile_recommendation_predict_9_434.csv' test_sourc_file_csv='global_test_data_feature.csv' get_result(input_source_result_txt,output_final_result_csv,test_sourc_file_csv) ================================================ FILE: global_feature.py ================================================ __author__ = 'LiGe' #encoding:utf-8 import csv global_user_feature=dict() for line in csv.reader(file('9_all_data.csv','rb')): global_user_feature[(line[0],line[1])]=line[25:] csvfile = file('gloabal_9_1_data.csv', 'wb') writer=csv.writer(csvfile) for line in csv.reader(file('9_1_data.csv','rb')): k=global_user_feature[(line[0],line[1])] writer.writerow((line[0],line[1],line[2],line[3], line[4],line[5],line[6],line[7], line[8],line[9],line[10],line[11], line[12],line[13],line[14],line[15], line[16],line[17],line[18],line[19], line[20],line[21],line[22],line[23], line[24],k[0],k[1],k[2],k[3],k[4],k[5], k[6],k[7],k[8],k[9],k[10],k[11],k[12], k[12],k[13],k[14],k[15] )) ================================================ FILE: produt_test_data.py ================================================ __author__ = 'LiGe' #encoding:utf-8 import csv num=0 csvfile = file('test_data_9.csv', 'wb') writer=csv.writer(csvfile) for line in csv.reader(file('tianchi_mobile_recommend_train_user.csv','rb')): if num==0: num=num+1 continue time_s=line[5].split(' ') time_slot=time_s[0].split('-') month=int(time_slot[1]) day=int(time_slot[2]) dis_day=(12-month)*30+(19-day) if dis_day<=9: writer.writerow(line)