Repository: chenkkkk/TianChi_YiDongTuiJian_forecast
Branch: master
Commit: 0da7516d0616
Files: 6
Total size: 31.2 KB
Directory structure:
gitextract_1fl6mnq7/
├── .gitattributes
├── Preprocess/
│ └── Drop_Day_and_sub_item.py
├── README.md
├── feature/
│ ├── add_feture.py
│ └── extract_feture.py
└── result/
└── your result
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
# Auto detect text files and perform LF normalization
* text=auto
*.zip filter=lfs diff=lfs merge=lfs -text
================================================
FILE: Preprocess/Drop_Day_and_sub_item.py
================================================
import pandas as pd
import numpy as np
if __name__ == '__main__':
    # Load the raw behaviour log and the item table it is restricted to.
    behaviour_log = pd.read_csv('../DataSet/tianchi_fresh_comp_train_user.csv')
    item_subset = pd.read_csv('../DataSet/tianchi_fresh_comp_train_item.csv')

    # Keep only interactions whose item appears in the item table.
    wanted_items = list(item_subset.item_id)
    in_subset = behaviour_log.item_id.isin(wanted_items)
    behaviour_log = behaviour_log[in_subset]

    # Split "time" on the space: the first token is stored as "days",
    # the second as "hours".
    behaviour_log['days'] = behaviour_log['time'].map(lambda stamp: stamp.split(' ')[0])
    behaviour_log['hours'] = behaviour_log['time'].map(lambda stamp: stamp.split(' ')[1])

    # Remove every record of the two excluded days.
    for excluded_day in ('2014-12-12', '2014-12-11'):
        behaviour_log = behaviour_log[behaviour_log['days'] != excluded_day]

    behaviour_log.to_csv('../DataSet/drop1112_sub_item.csv', index=None)
================================================
FILE: README.md
================================================
# TianChi_YiDongTuiJian_forecast
移动推荐新手实战赛学习代码(简单版本)
首先解压数据,并将 CSV 文件放入仓库根目录下的 `DataSet/` 目录(脚本通过相对路径 `../DataSet/` 读取数据)。
运行说明(请在各脚本所在的目录下运行):
1. 运行 `./Preprocess/Drop_Day_and_sub_item.py`
2. 运行 `./feature/extract_feture.py`
================================================
FILE: feature/add_feture.py
================================================
import pandas as pd
import numpy as np
import datetime
import sys
import time
def user_click(beforesomeday):
    """Count each user's actions per hour and behaviour type.

    Args:
        beforesomeday: DataFrame with ``user_id``, ``behavior_type`` and
            ``hours`` columns.

    Returns:
        DataFrame indexed by ``user_id`` with one count column per
        (hour, behaviour type) combination.  Column labels are flat strings
        of the form ``user_hour<H>_type<T>``.  The original returned a
        two-level column index, which pandas >= 2.0 refuses to merge with
        the single-level sample table (MergeError); values and column order
        are unchanged.
    """
    user_act_count = pd.crosstab(
        [beforesomeday.user_id, beforesomeday.behavior_type],
        beforesomeday.hours,
        dropna=False,
    )
    # Move behaviour type from the row index into the columns.
    user_act_count = user_act_count.unstack(fill_value=0)
    # Flatten (hour, behaviour type) tuples into unique string labels.
    user_act_count.columns = [
        'user_hour%s_type%s' % (hour, behavior)
        for hour, behavior in user_act_count.columns
    ]
    return user_act_count
def user_liveday(train_user_window1):
    """Count, per user and behaviour type, the number of distinct active days.

    Args:
        train_user_window1: DataFrame with ``user_id``, ``behavior_type`` and
            ``daystime`` columns.

    Returns:
        DataFrame indexed by ``user_id`` with one column per behaviour type,
        labelled ``user_live_days_type<T>``; combinations that never occur
        are 0.  The original returned two-level columns
        (``('daystime', T)``), which pandas >= 2.0 refuses to merge with the
        single-level sample table; values and column order are unchanged.
    """
    user_live = (
        train_user_window1
        .groupby(by=['user_id', 'behavior_type'])['daystime']
        .nunique()
        .unstack(fill_value=0)
    )
    user_live.columns = ['user_live_days_type%s' % behavior for behavior in user_live.columns]
    return user_live
def user_item_click(beforesomeday):
    """Count each (user, item) pair's actions per hour and behaviour type.

    Args:
        beforesomeday: DataFrame with ``user_id``, ``item_id``,
            ``behavior_type`` and ``hours`` columns.

    Returns:
        DataFrame indexed by (``user_id``, ``item_id``) with one count column
        per (hour, behaviour type) combination, labelled
        ``user_item_hour<H>_type<T>``.  The original returned a two-level
        column index, which pandas >= 2.0 refuses to merge with the
        single-level sample table; values and column order are unchanged.
    """
    user_item_act_count = pd.crosstab(
        [beforesomeday.user_id, beforesomeday.item_id, beforesomeday.behavior_type],
        beforesomeday.hours,
    )
    # Move behaviour type from the row index into the columns.
    user_item_act_count = user_item_act_count.unstack(fill_value=0)
    # Flatten (hour, behaviour type) tuples into unique string labels.
    user_item_act_count.columns = [
        'user_item_hour%s_type%s' % (hour, behavior)
        for hour, behavior in user_item_act_count.columns
    ]
    return user_item_act_count
def user_cate_click(beforesomeday):
    """Count each (user, category) pair's actions per hour and behaviour type.

    Args:
        beforesomeday: DataFrame with ``user_id``, ``item_category``,
            ``behavior_type`` and ``hours`` columns.

    Returns:
        DataFrame indexed by (``user_id``, ``item_category``) with one count
        column per (hour, behaviour type) combination, labelled
        ``user_cate_hour<H>_type<T>``.  The original returned a two-level
        column index, which pandas >= 2.0 refuses to merge with the
        single-level sample table; values and column order are unchanged.
    """
    user_cate_act_count = pd.crosstab(
        [beforesomeday.user_id, beforesomeday.item_category, beforesomeday.behavior_type],
        beforesomeday.hours,
    )
    # Move behaviour type from the row index into the columns.
    user_cate_act_count = user_cate_act_count.unstack(fill_value=0)
    # Flatten (hour, behaviour type) tuples into unique string labels.
    user_cate_act_count.columns = [
        'user_cate_hour%s_type%s' % (hour, behavior)
        for hour, behavior in user_cate_act_count.columns
    ]
    return user_cate_act_count
def user_item_long_touch(train_user_window1):
    """Return, per (user, item) pair, the days between first and last contact.

    The result is indexed by (``user_id``, ``item_id``) and has a single
    ``daystime`` column holding ``(latest - earliest).days`` for that pair.
    """
    def _span_in_days(stamps):
        earliest, latest = stamps.min(), stamps.max()
        return (latest - earliest).days

    per_pair = train_user_window1.groupby(by=['user_id', 'item_id'])
    return per_pair.agg({"daystime": _span_in_days})
def user_cate_long_touch(train_user_window1):
    """Return, per (user, category) pair, the days between first and last contact.

    The result is indexed by (``user_id``, ``item_category``) and has a single
    ``daystime`` column holding ``(latest - earliest).days`` for that pair.
    """
    def _span_in_days(stamps):
        earliest, latest = stamps.min(), stamps.max()
        return (latest - earliest).days

    per_pair = train_user_window1.groupby(by=['user_id', 'item_category'])
    return per_pair.agg({"daystime": _span_in_days})
================================================
FILE: feature/extract_feture.py
================================================
import pandas as pd
import numpy as np
import datetime
import sys
import time
import xgboost as xgb
from add_feture import *
# Length, in days, of the look-back window used when slicing Data for features.
FEATURE_EXTRACTION_SLOT = 10
# Starting label day; the training loop in the main section steps it backwards.
LabelDay = datetime.datetime(2014, 12, 18, 0, 0, 0)
# Behaviour log written by Preprocess/Drop_Day_and_sub_item.py.
Data = pd.read_csv("../DataSet/drop1112_sub_item.csv")
# Parse the "days" strings (format %Y-%m-%d) into datetime values at midnight.
Data['daystime'] = Data['days'].map(
    lambda day_text: datetime.datetime.strptime(day_text, "%Y-%m-%d")
)
def get_train(train_user, end_time):
    """Build the labelled training samples for one label day.

    Candidate samples are the distinct (user, item) pairs that interacted on
    the day before ``end_time``; a pair is labelled 1 when the same pair has
    a ``behavior_type == 4`` record on ``end_time`` itself, else 0.

    Pairs are matched exactly on (user_id, item_id).  The original matched on
    the float ratio ``user_id / item_id``, so unrelated pairs with the same
    ratio (e.g. 1/2 and 2/4) produced false positive labels.

    Args:
        train_user: DataFrame with ``user_id``, ``item_id``,
            ``item_category``, ``behavior_type`` and ``daystime`` columns.
        end_time: datetime of the label day.

    Returns:
        DataFrame with columns ``user_id``, ``item_id``, ``item_category``
        and ``label``, keeping the row index of ``train_user``.
    """
    # Interactions of the day before the label day are the candidate samples.
    day_before = end_time - datetime.timedelta(days=1)
    data_train = train_user[train_user['daystime'] == day_before]
    # One sample per (user, item) pair; copy so the label can be assigned
    # without writing into a view of ``train_user``.
    data_train = data_train.drop_duplicates(['user_id', 'item_id']).copy()

    # Records on the label day with behavior_type 4 supply the positive labels.
    data_label = train_user[train_user['daystime'] == end_time]
    data_label_buy = data_label[data_label['behavior_type'] == 4]

    candidate_pairs = pd.MultiIndex.from_arrays(
        [data_train['user_id'], data_train['item_id']])
    bought_pairs = pd.MultiIndex.from_arrays(
        [data_label_buy['user_id'], data_label_buy['item_id']])
    data_train['label'] = candidate_pairs.isin(bought_pairs).astype(int)
    return data_train[['user_id', 'item_id', 'item_category', 'label']]
def get_label_testset(train_user, LabelDay):
    """Return the distinct (user, item) pairs that interacted on ``LabelDay``.

    These pairs form the prediction candidates.  The result keeps the columns
    ``user_id``, ``item_id`` and ``item_category``; for a repeated pair the
    first record is the one retained.
    """
    on_label_day = train_user['daystime'] == LabelDay
    candidates = train_user[on_label_day].drop_duplicates(['user_id', 'item_id'])
    return candidates[['user_id', 'item_id', 'item_category']]
def item_category_feture(data, end_time, beforeoneday):
    """Build per-category behaviour-count features.

    Feature groups, in column order: counts over the whole window, counts on
    the last day, whole-window counts divided by FEATURE_EXTRACTION_SLOT,
    ratios of type-1/2/3 counts to type-4 counts, and counts over the most
    recent 5, 3 and 7 days.

    Every group gets its own ``cate_...`` column prefix.  The original merged
    tables that all carried the raw labels 1..4, which produced duplicated
    ``1_x``/``1_y`` labels; pandas >= 2.0 rejects that with MergeError.
    Column order and values are unchanged.  The unused ``buyRate_2`` /
    ``buyRate_3`` computations (which could raise KeyError on a short window
    without type-4 rows) were removed.

    Args:
        data: rows of the feature window; needs ``item_category``,
            ``behavior_type`` and ``daystime`` columns.
        end_time: datetime the recent windows are measured back from.
        beforeoneday: rows of the last day before ``end_time``.

    Returns:
        DataFrame indexed by ``item_category``; missing values are 0.

    Raises:
        KeyError: if ``data`` lacks any of the behaviour types 1-4.
    """
    # Windows are lengthened by two days when the date five days before
    # end_time falls strictly between 2014-12-10 and 2014-12-13.
    # NOTE(review): presumably this compensates for the two days removed in
    # preprocessing; the original applies this same 5-day probe to the 3- and
    # 7-day windows as well -- confirm that is intended.
    probe = end_time - datetime.timedelta(days=5)
    extend = (datetime.datetime(2014, 12, 10, 0, 0, 0) < probe
              < datetime.datetime(2014, 12, 13, 0, 0, 0))
    extra_days = 2 if extend else 0

    def _recent_counts(days):
        """Cross-tabulate category x behaviour type over the recent window."""
        start = end_time - datetime.timedelta(days=days + extra_days)
        recent = data[data['daystime'] >= start]
        return pd.crosstab(recent.item_category, recent.behavior_type)

    item_count = pd.crosstab(data.item_category, data.behavior_type)
    beforeonedayitem_count = pd.crosstab(beforeoneday.item_category, beforeoneday.behavior_type)
    countAverage = item_count / FEATURE_EXTRACTION_SLOT

    rate_columns = ['cate_buyrate_click', 'cate_buyrate_skim', 'cate_buyrate_collect']
    buyRate = pd.DataFrame({
        'cate_buyrate_click': item_count[1] / item_count[4],
        'cate_buyrate_skim': item_count[2] / item_count[4],
        'cate_buyrate_collect': item_count[3] / item_count[4],
    }, columns=rate_columns)
    # x/0 gives +/-inf; treat it as 0 (0/0 gives NaN, zero-filled below).
    buyRate = buyRate.replace([np.inf, -np.inf], 0)

    parts = [
        item_count.add_prefix('cate_count_all_type'),
        beforeonedayitem_count.add_prefix('cate_count_1d_type'),
        countAverage.add_prefix('cate_avg_type'),
        buyRate,
        _recent_counts(5).add_prefix('cate_count_5d_type'),
        _recent_counts(3).add_prefix('cate_count_3d_type'),
        _recent_counts(7).add_prefix('cate_count_7d_type'),
    ]
    features = parts[0]
    for part in parts[1:]:
        features = pd.merge(features, part, how='left', right_index=True, left_index=True)
    return features.fillna(0)
def item_id_feture(data, end_time, beforeoneday):
    """Build per-item behaviour-count features.

    Feature groups, in column order: counts over the whole window, counts on
    the last day, whole-window counts divided by FEATURE_EXTRACTION_SLOT,
    ratios of type-1/2/3 counts to type-4 counts, distinct users per
    behaviour type, and counts over the most recent 5, 3 and 7 days.

    Every group gets its own ``item_...`` column prefix.  The original merged
    tables that all carried the raw labels 1..4 (duplicated ``1_x``/``1_y``
    labels) plus a two-level distinct-user table; pandas >= 2.0 rejects both
    with MergeError.  Column order and values are unchanged.  The unused
    ``buyRate_2`` / ``buyRate_3`` computations were removed.

    Args:
        data: rows of the feature window; needs ``item_id``, ``user_id``,
            ``behavior_type`` and ``daystime`` columns.
        end_time: datetime the recent windows are measured back from.
        beforeoneday: rows of the last day before ``end_time``.

    Returns:
        DataFrame indexed by ``item_id``; missing values are 0.

    Raises:
        KeyError: if ``data`` lacks any of the behaviour types 1-4.
    """
    # Windows are lengthened by two days when the date five days before
    # end_time falls strictly between 2014-12-10 and 2014-12-13.
    # NOTE(review): presumably this compensates for the two days removed in
    # preprocessing; the original applies this same 5-day probe to the 3- and
    # 7-day windows as well -- confirm that is intended.
    probe = end_time - datetime.timedelta(days=5)
    extend = (datetime.datetime(2014, 12, 10, 0, 0, 0) < probe
              < datetime.datetime(2014, 12, 13, 0, 0, 0))
    extra_days = 2 if extend else 0

    def _recent_counts(days):
        """Cross-tabulate item x behaviour type over the recent window."""
        start = end_time - datetime.timedelta(days=days + extra_days)
        recent = data[data['daystime'] >= start]
        return pd.crosstab(recent.item_id, recent.behavior_type)

    item_count = pd.crosstab(data.item_id, data.behavior_type)
    beforeonedayitem_count = pd.crosstab(beforeoneday.item_id, beforeoneday.behavior_type)
    countAverage = item_count / FEATURE_EXTRACTION_SLOT

    # Distinct users per (item, behaviour type); absent combinations stay NaN
    # here and are zero-filled at the end.
    item_count_unq = (
        data.groupby(by=['item_id', 'behavior_type'])['user_id']
        .nunique()
        .unstack()
    )

    rate_columns = ['item_buyrate_click', 'item_buyrate_skim', 'item_buyrate_collect']
    buyRate = pd.DataFrame({
        'item_buyrate_click': item_count[1] / item_count[4],
        'item_buyrate_skim': item_count[2] / item_count[4],
        'item_buyrate_collect': item_count[3] / item_count[4],
    }, columns=rate_columns)
    # x/0 gives +/-inf; treat it as 0 (0/0 gives NaN, zero-filled below).
    buyRate = buyRate.replace([np.inf, -np.inf], 0)

    parts = [
        item_count.add_prefix('item_count_all_type'),
        beforeonedayitem_count.add_prefix('item_count_1d_type'),
        countAverage.add_prefix('item_avg_type'),
        buyRate,
        item_count_unq.add_prefix('item_unique_users_type'),
        _recent_counts(5).add_prefix('item_count_5d_type'),
        _recent_counts(3).add_prefix('item_count_3d_type'),
        _recent_counts(7).add_prefix('item_count_7d_type'),
    ]
    features = parts[0]
    for part in parts[1:]:
        features = pd.merge(features, part, how='left', right_index=True, left_index=True)
    return features.fillna(0)
def user_id_feture(data, end_time, beforeoneday):
    """Build per-user behaviour-count features.

    Feature groups, in column order: counts over the whole window, counts on
    the last day, whole-window counts divided by FEATURE_EXTRACTION_SLOT,
    ratios of type-1/2/3 counts to type-4 counts, counts over the most
    recent 5, 3 and 7 days, and the min / max / range of the user's
    ``hours`` values on the last day.

    Every group gets its own ``user_...`` column prefix.  The original merged
    tables that all carried the raw labels 1..4 (duplicated ``1_x``/``1_y``
    labels) plus a two-level pivot table; pandas >= 2.0 rejects both with
    MergeError.  Column order and values are unchanged.  The unused
    ``buyRate_2`` / ``buyRate_3`` computations were removed.

    Args:
        data: rows of the feature window; needs ``user_id``,
            ``behavior_type`` and ``daystime`` columns.
        end_time: datetime the recent windows are measured back from.
        beforeoneday: rows of the last day before ``end_time``; additionally
            needs a numeric ``hours`` column.

    Returns:
        DataFrame indexed by ``user_id``; missing values are 0.

    Raises:
        KeyError: if ``data`` lacks any of the behaviour types 1-4.
    """
    # Windows are lengthened by two days when the date five days before
    # end_time falls strictly between 2014-12-10 and 2014-12-13.
    # NOTE(review): presumably this compensates for the two days removed in
    # preprocessing; the original applies this same 5-day probe to the 3- and
    # 7-day windows as well -- confirm that is intended.
    probe = end_time - datetime.timedelta(days=5)
    extend = (datetime.datetime(2014, 12, 10, 0, 0, 0) < probe
              < datetime.datetime(2014, 12, 13, 0, 0, 0))
    extra_days = 2 if extend else 0

    def _recent_counts(days):
        """Cross-tabulate user x behaviour type over the recent window."""
        start = end_time - datetime.timedelta(days=days + extra_days)
        recent = data[data['daystime'] >= start]
        return pd.crosstab(recent.user_id, recent.behavior_type)

    user_count = pd.crosstab(data.user_id, data.behavior_type)
    beforeonedayuser_count = pd.crosstab(beforeoneday.user_id, beforeoneday.behavior_type)
    countAverage = user_count / FEATURE_EXTRACTION_SLOT

    rate_columns = ['user_buyrate_click', 'user_buyrate_skim', 'user_buyrate_collect']
    buyRate = pd.DataFrame({
        'user_buyrate_click': user_count[1] / user_count[4],
        'user_buyrate_skim': user_count[2] / user_count[4],
        'user_buyrate_collect': user_count[3] / user_count[4],
    }, columns=rate_columns)
    # x/0 gives +/-inf; treat it as 0 (0/0 gives NaN, zero-filled below).
    buyRate = buyRate.replace([np.inf, -np.inf], 0)

    # Earliest hour, latest hour and their difference (peak-to-peak) on the
    # last day, kept as flat columns in the original min / max / ptp order.
    hours_by_user = beforeoneday.groupby('user_id')['hours']
    online_columns = ['user_hours_min_1d', 'user_hours_max_1d']
    long_online = pd.DataFrame({
        'user_hours_min_1d': hours_by_user.min(),
        'user_hours_max_1d': hours_by_user.max(),
    }, columns=online_columns)
    long_online['user_hours_ptp_1d'] = (
        long_online['user_hours_max_1d'] - long_online['user_hours_min_1d'])

    parts = [
        user_count.add_prefix('user_count_all_type'),
        beforeonedayuser_count.add_prefix('user_count_1d_type'),
        countAverage.add_prefix('user_avg_type'),
        buyRate,
        _recent_counts(5).add_prefix('user_count_5d_type'),
        _recent_counts(3).add_prefix('user_count_3d_type'),
        _recent_counts(7).add_prefix('user_count_7d_type'),
        long_online,
    ]
    features = parts[0]
    for part in parts[1:]:
        features = pd.merge(features, part, how='left', right_index=True, left_index=True)
    return features.fillna(0)
def user_item_feture(data, end_time, beforeoneday):
    """Build per-(user, item) behaviour-count features.

    Feature groups, in column order: counts over the whole window, counts on
    the last day, min / max ``hours`` on the last day, highest behaviour type
    on the last day, and counts over the most recent 5, 3 and 7 days.

    Every group gets its own ``user_item_...`` column prefix.  The original
    merged tables that all carried the raw labels 1..4 (duplicated
    ``1_x``/``1_y`` labels) plus a two-level pivot table; pandas >= 2.0
    rejects both with MergeError.  Column order and values are unchanged.

    Args:
        data: rows of the feature window; needs ``user_id``, ``item_id``,
            ``behavior_type`` and ``daystime`` columns.
        end_time: datetime the recent windows are measured back from.
        beforeoneday: rows of the last day before ``end_time``; additionally
            needs an ``hours`` column.

    Returns:
        DataFrame indexed by (``user_id``, ``item_id``); missing values are 0.
    """
    pair = ['user_id', 'item_id']

    # Windows are lengthened by two days when the date five days before
    # end_time falls strictly between 2014-12-10 and 2014-12-13.
    # NOTE(review): presumably this compensates for the two days removed in
    # preprocessing; the original applies this same 5-day probe to the 3- and
    # 7-day windows as well -- confirm that is intended.
    probe = end_time - datetime.timedelta(days=5)
    extend = (datetime.datetime(2014, 12, 10, 0, 0, 0) < probe
              < datetime.datetime(2014, 12, 13, 0, 0, 0))
    extra_days = 2 if extend else 0

    def _counts(frame):
        """Cross-tabulate (user, item) x behaviour type."""
        return pd.crosstab([frame.user_id, frame.item_id], frame.behavior_type)

    def _recent_counts(days):
        """Counts restricted to the recent window of ``days`` (+ extension)."""
        start = end_time - datetime.timedelta(days=days + extra_days)
        return _counts(data[data['daystime'] >= start])

    # Earliest / latest hour and highest behaviour type on the last day.
    last_day_pairs = beforeoneday.groupby(pair)
    touch_columns = ['user_item_hours_min_1d', 'user_item_hours_max_1d']
    max_touchtime = pd.DataFrame({
        'user_item_hours_min_1d': last_day_pairs['hours'].min(),
        'user_item_hours_max_1d': last_day_pairs['hours'].max(),
    }, columns=touch_columns)
    max_touchtype = last_day_pairs['behavior_type'].max().to_frame('user_item_max_type_1d')

    parts = [
        _counts(data).add_prefix('user_item_count_all_type'),
        _counts(beforeoneday).add_prefix('user_item_count_1d_type'),
        max_touchtime,
        max_touchtype,
        _recent_counts(5).add_prefix('user_item_count_5d_type'),
        _recent_counts(3).add_prefix('user_item_count_3d_type'),
        _recent_counts(7).add_prefix('user_item_count_7d_type'),
    ]
    features = parts[0]
    for part in parts[1:]:
        features = pd.merge(features, part, how='left', right_index=True, left_index=True)
    return features.fillna(0)
def user_cate_feture(data, end_time, beforeoneday):
    """Build per-(user, category) behaviour-count features.

    Feature groups, in column order: counts over the whole window, counts on
    the last day, min / max ``hours`` on the last day, highest behaviour type
    on the last day, and counts over the most recent 5, 3 and 7 days.

    Every group gets its own ``user_cate_...`` column prefix.  The original
    merged tables that all carried the raw labels 1..4 (duplicated
    ``1_x``/``1_y`` labels) plus a two-level pivot table; pandas >= 2.0
    rejects both with MergeError.  Column order and values are unchanged.

    Args:
        data: rows of the feature window; needs ``user_id``,
            ``item_category``, ``behavior_type`` and ``daystime`` columns.
        end_time: datetime the recent windows are measured back from.
        beforeoneday: rows of the last day before ``end_time``; additionally
            needs an ``hours`` column.

    Returns:
        DataFrame indexed by (``user_id``, ``item_category``); missing values
        are 0.
    """
    pair = ['user_id', 'item_category']

    # Windows are lengthened by two days when the date five days before
    # end_time falls strictly between 2014-12-10 and 2014-12-13.
    # NOTE(review): presumably this compensates for the two days removed in
    # preprocessing; the original applies this same 5-day probe to the 3- and
    # 7-day windows as well -- confirm that is intended.
    probe = end_time - datetime.timedelta(days=5)
    extend = (datetime.datetime(2014, 12, 10, 0, 0, 0) < probe
              < datetime.datetime(2014, 12, 13, 0, 0, 0))
    extra_days = 2 if extend else 0

    def _counts(frame):
        """Cross-tabulate (user, category) x behaviour type."""
        return pd.crosstab([frame.user_id, frame.item_category], frame.behavior_type)

    def _recent_counts(days):
        """Counts restricted to the recent window of ``days`` (+ extension)."""
        start = end_time - datetime.timedelta(days=days + extra_days)
        return _counts(data[data['daystime'] >= start])

    # Earliest / latest hour and highest behaviour type on the last day.
    last_day_pairs = beforeoneday.groupby(pair)
    touch_columns = ['user_cate_hours_min_1d', 'user_cate_hours_max_1d']
    max_touchtime = pd.DataFrame({
        'user_cate_hours_min_1d': last_day_pairs['hours'].min(),
        'user_cate_hours_max_1d': last_day_pairs['hours'].max(),
    }, columns=touch_columns)
    max_touchtype = last_day_pairs['behavior_type'].max().to_frame('user_cate_max_type_1d')

    parts = [
        _counts(data).add_prefix('user_cate_count_all_type'),
        _counts(beforeoneday).add_prefix('user_cate_count_1d_type'),
        max_touchtime,
        max_touchtype,
        _recent_counts(5).add_prefix('user_cate_count_5d_type'),
        _recent_counts(3).add_prefix('user_cate_count_3d_type'),
        _recent_counts(7).add_prefix('user_cate_count_7d_type'),
    ]
    features = parts[0]
    for part in parts[1:]:
        features = pd.merge(features, part, how='left', right_index=True, left_index=True)
    return features.fillna(0)
if __name__ == '__main__':

    def _attach_features(samples, window, last_day, end_time):
        """Left-join every feature table onto ``samples`` and zero-fill gaps.

        ``window`` is the slice of Data the aggregate features are computed
        from, ``last_day`` the slice holding only the most recent day, and
        ``end_time`` the date handed to the feature builders.  The join order
        matches the original script so the feature-column order is the same
        for the training and the prediction matrix.
        """
        add_user_click = user_click(last_day)
        add_user_item_click = user_item_click(last_day)
        add_user_cate_click = user_cate_click(last_day)
        liveday = user_liveday(window)

        a = user_id_feture(window, end_time, last_day).reset_index()
        b = item_id_feture(window, end_time, last_day).reset_index()
        c = item_category_feture(window, end_time, last_day).reset_index()
        d = user_cate_feture(window, end_time, last_day).reset_index()
        e = user_item_feture(window, end_time, last_day).reset_index()

        samples = pd.merge(samples, a, on=['user_id'], how='left')
        samples = pd.merge(samples, b, on=['item_id'], how='left')
        samples = pd.merge(samples, c, on=['item_category'], how='left')
        samples = pd.merge(samples, d, on=['user_id', 'item_category'], how='left')
        samples = pd.merge(samples, e, on=['user_id', 'item_id'], how='left')
        samples = pd.merge(samples, add_user_click, left_on=['user_id'], right_index=True, how='left')
        samples = pd.merge(samples, add_user_item_click, left_on=['user_id', 'item_id'], right_index=True, how='left')
        samples = pd.merge(samples, add_user_cate_click, left_on=['user_id', 'item_category'], right_index=True, how='left')
        samples = pd.merge(samples, liveday, left_on=['user_id'], right_index=True, how='left')
        return samples.fillna(0)

    # ------------------------------------------------------------------
    # Training set: one labelled sample table per label day, walking
    # backwards from the initial LabelDay.
    # ------------------------------------------------------------------
    result = []
    for i in range(15):
        # For label days on or after 2014-12-12 the window is two days longer.
        # NOTE(review): presumably compensates for the two days removed in
        # preprocessing -- confirm.
        if LabelDay >= datetime.datetime(2014, 12, 12, 0, 0, 0):
            window_days = FEATURE_EXTRACTION_SLOT + 2
        else:
            window_days = FEATURE_EXTRACTION_SLOT
        window_start = LabelDay - datetime.timedelta(days=window_days)
        train_user_window1 = Data[(Data['daystime'] > window_start) & (Data['daystime'] < LabelDay)]
        beforeoneday = Data[Data['daystime'] == (LabelDay - datetime.timedelta(days=1))]

        x = get_train(Data, LabelDay)
        x = _attach_features(x, train_user_window1, beforeoneday, LabelDay)
        print(i, LabelDay, len(x))

        # Step back one day; 2014-12-13 is skipped by jumping to 2014-12-10.
        LabelDay = LabelDay - datetime.timedelta(days=1)
        if LabelDay == datetime.datetime(2014, 12, 13, 0, 0, 0):
            LabelDay = datetime.datetime(2014, 12, 10, 0, 0, 0)
        result.append(x)
    train_set = pd.concat(result, axis=0, ignore_index=True)

    # ------------------------------------------------------------------
    # Prediction set: pairs seen on the last available day.
    # ------------------------------------------------------------------
    LabelDay = datetime.datetime(2014, 12, 18, 0, 0, 0)
    test = get_label_testset(Data, LabelDay)
    # NOTE(review): this window includes LabelDay itself while end_time is
    # still LabelDay, so the "recent N days" features cover one more day than
    # in training -- confirm whether end_time should be LabelDay + 1 day.
    train_user_window1 = Data[(Data['daystime'] > (LabelDay - datetime.timedelta(days=FEATURE_EXTRACTION_SLOT - 1))) & (Data['daystime'] <= LabelDay)]
    beforeoneday = Data[Data['daystime'] == LabelDay]
    test = _attach_features(test, train_user_window1, beforeoneday, LabelDay)

    # ------------------------------------------------------------------
    # Down-sample negatives to at most 90 per positive.
    # ------------------------------------------------------------------
    train_set_1 = train_set[train_set['label'] == 1]
    train_set_0 = train_set[train_set['label'] == 0]
    # sample() raises when asked for more rows than exist, so cap the request.
    negatives_wanted = min(len(train_set_0), len(train_set_1) * 90)
    new_train_set_0 = train_set_0.sample(negatives_wanted)
    train_set = pd.concat([train_set_1, new_train_set_0], axis=0)

    train_y = train_set['label'].values
    train_x = train_set.drop(['user_id', 'item_id', 'item_category', 'label'], axis=1).values
    test_x = test.drop(['user_id', 'item_id', 'item_category'], axis=1).values

    num_round = 900
    # The key used to be 'eval_metric ' (trailing space), which is not a
    # parameter name, so the metric was never set.
    params = {'max_depth': 4, 'colsample_bytree': 0.8, 'subsample': 0.8, 'eta': 0.02, 'silent': 1,
              'objective': 'binary:logistic', 'eval_metric': 'error', 'min_child_weight': 2.5,
              'seed': 10}
    plst = list(params.items())
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x)
    bst = xgb.train(plst, dtrain, num_round)
    predicted_proba = bst.predict(dtest)

    # Attach scores by position so the result does not depend on the row
    # index that the merges left on ``test``.
    predicted = test[['user_id', 'item_id']].copy()
    predicted['prob'] = predicted_proba
    predicted = predicted.sort_values('prob', axis=0, ascending=False)

    # Save the 700 highest-scoring (user, item) pairs.
    predict2 = predicted.iloc[:700, [0, 1]]
    predict2.to_csv("../result/result.csv", index=False)
    sys.exit()

    # ------------------------------------------------------------------
    # Offline validation.
    # NOTE(review): unreachable because of the sys.exit() above.  It also
    # identifies pairs by user_id / item_id, which can collide for distinct
    # pairs, and divides by zero when there is no true positive.
    # ------------------------------------------------------------------
    reference = Data[Data['daystime'] == (LabelDay + datetime.timedelta(days=1))]
    reference = reference[reference['behavior_type'] == 4]  # purchase records
    reference = reference[['user_id', 'item_id']]  # keep the (user, item) pair
    reference = reference.drop_duplicates(['user_id', 'item_id'])  # de-duplicate
    ui = predicted['user_id'] / predicted['item_id']
    predicted = predicted[ui.duplicated() == False]
    predicted_ui = predicted['user_id'] / predicted['item_id']
    reference_ui = reference['user_id'] / reference['item_id']
    is_in = predicted_ui.isin(reference_ui)
    true_positive = predicted[is_in]
    tp = len(true_positive)
    predictedSetCount = len(predicted)
    referenceSetCount = len(reference)
    precision = tp / predictedSetCount
    recall = tp / referenceSetCount
    f_score = 2 * precision * recall / (precision + recall)
    tp = recall * referenceSetCount
    predictedSetCount = tp / precision
    print('%.8f%% %.8f %.8f %.0f %.0f' %
          (f_score * 100, precision, recall, tp, predictedSetCount))
================================================
FILE: result/your result
================================================
gitextract_1fl6mnq7/
├── .gitattributes
├── Preprocess/
│ └── Drop_Day_and_sub_item.py
├── README.md
├── feature/
│ ├── add_feture.py
│ └── extract_feture.py
└── result/
└── your result
SYMBOL INDEX (13 symbols across 2 files) FILE: feature/add_feture.py function user_click (line 7) | def user_click(beforesomeday):#用户在前几天各种操作在各个小时的计数 function user_liveday (line 12) | def user_liveday(train_user_window1):#用户各个行为活跃的天数 function user_item_click (line 18) | def user_item_click(beforesomeday): function user_cate_click (line 23) | def user_cate_click(beforesomeday): function user_item_long_touch (line 28) | def user_item_long_touch(train_user_window1): function user_cate_long_touch (line 32) | def user_cate_long_touch(train_user_window1): FILE: feature/extract_feture.py function get_train (line 14) | def get_train(train_user,end_time): function get_label_testset (line 35) | def get_label_testset(train_user,LabelDay): function item_category_feture (line 43) | def item_category_feture(data,end_time,beforeoneday): function item_id_feture (line 105) | def item_id_feture(data,end_time,beforeoneday): function user_id_feture (line 170) | def user_id_feture(data,end_time,beforeoneday): function user_item_feture (line 240) | def user_item_feture(data,end_time,beforeoneday): function user_cate_feture (line 285) | def user_cate_feture(data,end_time,beforeoneday):
Condensed preview — 6 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (33K chars).
[
{
"path": ".gitattributes",
"chars": 108,
"preview": "# Auto detect text files and perform LF normalization\n* text=auto\n*.zip filter=lfs diff=lfs merge=lfs -text\n"
},
{
"path": "Preprocess/Drop_Day_and_sub_item.py",
"chars": 625,
"preview": "import pandas as pd\nimport numpy as np\n\n\n\nif __name__ == '__main__':\n\tuser_table = pd.read_csv('../DataSet/tianchi_fresh"
},
{
"path": "README.md",
"chars": 153,
"preview": "# TianChi_YiDongTuiJian_forecast\n\n移动推荐新手实战赛学习代码(简单版本)\n\n首先解压数据\n\n\n运行说明:\n\n1.run ./Preprocess/Drop_Day_and_sub_item.py \n2.r"
},
{
"path": "feature/add_feture.py",
"chars": 1440,
"preview": "import pandas as pd\nimport numpy as np\nimport datetime\nimport sys\nimport time\n\ndef user_click(beforesomeday):#用户在前几天各种操作"
},
{
"path": "feature/extract_feture.py",
"chars": 29647,
"preview": "import pandas as pd\nimport numpy as np\nimport datetime\nimport sys\nimport time\nimport xgboost as xgb\nfrom add_feture impo"
},
{
"path": "result/your result",
"chars": 0,
"preview": ""
}
]
About this extraction
This page contains the full source code of the chenkkkk/TianChi_YiDongTuiJian_forecast GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 6 files (31.2 KB), approximately 9.4k tokens, and a symbol index with 13 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.