Repository: ChuanyuXue/KDDCUP-2020
Branch: master
Commit: b675a1b01ba4
Files: 68
Total size: 373.2 KB

Directory structure:
gitextract_b6ebyh89/
├── README.txt
├── README_CN.md
├── code/
│   ├── 1_DataPreprocessing/
│   │   ├── 01_Generate_Offline_Dataset_origin.py
│   │   ├── 02_Generate_Model1_Dataset_origin.py
│   │   ├── 03_Create_Model1_Answer.py
│   │   ├── 03_Create_Offline_Answer.py
│   │   ├── 04_TransformDateTime-Copy1.py
│   │   └── 05_Generate_img_txt_vec.py
│   ├── 2_Similarity/
│   │   ├── 01_itemCF_Mundane_model1.py
│   │   ├── 01_itemCF_Mundane_offline.py
│   │   ├── 01_itemCF_Mundane_online.py
│   │   ├── RA_Wu_model1.py
│   │   ├── RA_Wu_offline.py
│   │   ├── RA_Wu_online.py
│   │   └── deep_node_model.py
│   ├── 3_NN/
│   │   ├── ItemFeat2.py
│   │   ├── Readme
│   │   ├── config.py
│   │   ├── model2.py
│   │   ├── modules.py
│   │   ├── sampler2.py
│   │   ├── sas_rec.py
│   │   └── util.py
│   ├── 3_Recall/
│   │   ├── 01_Recall-Wu-model1.py
│   │   ├── 01_Recall-Wu-offline.py
│   │   └── 01_Recall-Wu-online.py
│   ├── 4_RankFeature/
│   │   ├── 01_sim_feature_model1.py
│   │   ├── 01_sim_feature_model1_RA_AA.py
│   │   ├── 01_sim_feature_offline.py
│   │   ├── 01_sim_feature_offline_RA_AA.py
│   │   ├── 01_sim_feature_online.py
│   │   ├── 01_sim_feature_online_RA_AA.py
│   │   ├── 02_itemtime_feature_model1.py
│   │   ├── 02_itemtime_feature_offline.py
│   │   ├── 02_itemtime_feature_online.py
│   │   ├── 03_count_feature_model1.py
│   │   ├── 03_count_feature_offline.py
│   │   ├── 03_count_feature_online.py
│   │   ├── 04_NN_feature_model1.py
│   │   ├── 04_NN_feature_offline.py
│   │   ├── 04_NN_feature_online.csv.py
│   │   ├── 05_txt_feature_model1.py
│   │   ├── 05_txt_feature_offline.py
│   │   ├── 05_txt_feature_online.py
│   │   ├── 06_interactive_model1.py
│   │   ├── 06_interactive_offline.py
│   │   ├── 06_interactive_online.py
│   │   ├── 07_count_detail_model1.py
│   │   ├── 07_count_detail_offline.py
│   │   ├── 07_count_detail_online.py
│   │   ├── 08_user_feature_model1.py
│   │   ├── 08_user_feature_offline.py
│   │   ├── 08_user_feature_online.py
│   │   ├── 09_partial_sim_feature_model1.py
│   │   ├── 09_partial_sim_feature_offline.py
│   │   ├── 09_partial_sim_feature_online.py
│   │   ├── 10_emergency_feature_model1.py
│   │   ├── 10_emergency_feature_offline.py
│   │   ├── 10_emergency_feature_online.py
│   │   ├── 10_紧急feature_model1.py
│   │   ├── 10_紧急feature_offline.py
│   │   └── 10_紧急feature_online.py
│   └── 5_Modeling/
│       ├── Model_Offline.py
│       └── Model_Online.py
├── feature_list.csv
├── main.sh
├── project_structure.txt
└── requirements.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: README.txt
================================================
This repository contains the 6th-place solution to the KDD Cup 2020 Challenges for Modern E-Commerce Platform: Debiasing Challenge.

skewcy@gmail.com

================================================
FILE: README_CN.md
================================================
# KDDCUP-2020

KDD Cup 2020, Debiasing track, 6th-place solution.

This repository contains the 6th-place solution to the KDD Cup 2020 Challenges for Modern E-Commerce Platform: Debiasing Challenge.
Competition link: https://tianchi.aliyun.com/competition/entrance/231785/introduction

Solution write-up (blog): https://zhuanlan.zhihu.com/p/149424540

Dataset download links:

underexpose_train.zip 271.62MB http://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/231785/underexpose_train.zip
underexpose_test.zip 3.27MB http://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/231785/underexpose_test.zip

Dataset unzip passwords:

7c2d2b8a636cbd790ff12a007907b2ba
underexpose_train_click-1 ea0ec486b76ae41ed836a8059726aa85
underexpose_train_click-2 65255c3677a40bf4d341b0c739ad6dff
underexpose_train_click-3 c8376f1c4ed07b901f7fe5c60362ad7b
underexpose_train_click-4 63b326dc07d39c9afc65ed81002ff2ab
underexpose_train_click-5 f611f3e477b458b718223248fd0d1b55
underexpose_train_click-6 ec191ea68e0acc367da067133869dd60
underexpose_train_click-7 90129a980cb0a4ba3879fb9a4b177cd2
underexpose_train_click-8 f4ff091ab62d849ba1e6ea6f7c4fb717
underexpose_train_click-9 96d071a532e801423be614e9e8414992
underexpose_test_click-1 503bf7a5882d3fac5ca9884d9010078c
underexpose_test_click-2 dd3de82d0b3a7fe9c55e0b260027f50f
underexpose_test_click-3 04e966e4f6c7b48f1272a53d8f9ade5d
underexpose_test_click-4 13a14563bf5528121b8aaccfa7a0dd73
underexpose_test_click-5 dee22d5e4a7b1e3c409ea0719aa0a715
underexpose_test_click-6 69416eedf810b56f8a01439e2061e26d
underexpose_test_click-7 55588c1cddab2fa5c63abe5c4bf020e5
underexpose_test_click-8 caacb2c58d01757f018d6b9fee0c8095

## Solution

1. As shown in the file structure below, we first preprocess the data ("1_DataPreprocessing"). Taking each user's second-to-last click as the answer yields the offline training set (stored in user_data/model_1), and taking the last click as the answer yields the offline validation set (stored in user_data/offline); the online data to be predicted is stored in user_data/dataset. Based on the periodic pattern of click counts, we convert `time` into calendar dates (04_TransformDateTime-Copy1.py), and we also generate the text-similarity and image-similarity files (05_Generate_img_txt_vec.py).
2. We train deepwalk and node2vec models ("deep_node_model.py") on the click logs of the offline training set, the offline validation set, and the online data in turn. We then modify the ItemCF algorithm by fusing text similarity, deepwalk, and node2vec, and compute and store item-item similarities ("01_itemCF_Mundane_model1.py" etc.). On top of the recalled item similarities we build an item-similarity network and compute and store second-order similarities such as RA, AA, CN, HDI, HPI, and LHN1 ("RA_Wu_model1.py" etc.).
3. We implement a Self-Attentive Sequential model to predict the click probability of each recalled user-item pair ("3_NN").
4. Based on the stored item similarities, we recall 1000 candidate items for every user to be predicted ("3_Recall").
5. We generate ranking features for the user-item pairs in the recall lists ("4_RankFeature").
6. User-item pairs in the recall list that were actually clicked are treated as positive samples, and negatives are randomly drawn from the recall list at a 1:5 positive-to-negative ratio, producing 6 datasets. We then build CatBoost and LightGBM models, assign larger weights to items with few clicks, blend the models with the arithmetic, geometric, and harmonic means (a minimal sketch of this blending is given at the end of this README), and post-process the results according to item click counts ("5_Modeling").

**Our solution finally ranked 1st on Track A and 6th on Track B.**

## File structure

The data can be downloaded from the official competition website; create the folders and place the data according to the paths below.

│  feature_list.csv        # List of the features we used in the ranking process
│  main.sh                 # Run this script to start the whole process
│  project_structure.txt   # The tree structure of this project
│
├─code
│  │  __init__.py
│  │
│  ├─1_DataPreprocessing   # Generate the validation set, create timestamps and generate item feature vectors
│  │      01_Generate_Offline_Dataset_origin.py
│  │      02_Generate_Model1_Dataset_origin.py
│  │      03_Create_Model1_Answer.py
│  │      03_Create_Offline_Answer.py
│  │      04_TransformDateTime-Copy1.py
│  │      05_Generate_img_txt_vec.py
│  │      ipynb_file.zip
│  │
│  ├─2_Similarity          # Generate the item-item similarity matrices
│  │      01_itemCF_Mundane_model1.py
│  │      01_itemCF_Mundane_offline.py
│  │      01_itemCF_Mundane_online.py
│  │      deep_node_model.py
│  │      ipynb_file.zip
│  │      RA_Wu_model1.py
│  │      RA_Wu_offline.py
│  │      RA_Wu_online.py
│  │
│  ├─3_NN                  # Generate the deep-learning based result
│  │      config.py
│  │      ItemFeat2.py
│  │      model2.py
│  │      modules.py
│  │      Readme
│  │      sampler2.py
│  │      sas_rec.py
│  │      util.py
│  │
│  ├─3_Recall              # Recall candidates
│  │      01_Recall-Wu-model1.py
│  │      01_Recall-Wu-offline.py
│  │      01_Recall-Wu-online.py
│  │      ipynb_file.zip
│  │
│  ├─4_RankFeature         # Generate features for ranking
│  │      01_sim_feature_model1.py
│  │      01_sim_feature_model1_RA_AA.py
│  │      01_sim_feature_offline.py
│  │      01_sim_feature_offline_RA_AA.py
│  │      ……
│  │      10_emergency_feature_offline.py
│  │      10_emergency_feature_online.py
│  │      4_RankFeature.zip
│  │
│  └─5_Modeling            # Build the CatBoost and LightGBM models
│          ipynb_file.zip
│          Model_Offline.py
│          Model_Online.py
│
├─data                     # Original dataset
│  ├─underexpose_test
│  └─underexpose_train
├─prediction_result
└─user_data                # Intermediate files
    ├─dataset
    │  ├─new_recall
    │  ├─new_similarity
    │  └─nn
    ├─model_1
    │  ├─new_recall
    │  ├─new_similarity
    │  └─nn
    └─offline
        ├─new_recall
        ├─new_similarity
        └─nn

## Python dependencies

lightgbm==2.2.1
tensorflow==1.13.1
joblib==0.15.1
gensim==3.4.0
pandas==0.25.1
numpy==1.16.3
networkx==2.4
tqdm==4.46.0

## Disclaimer

This repository holds the code for the KDD Cup 2020 Debiasing challenge; all code is provided for learning and reference purposes only. If you have any questions about the code, please contact cs_xcy@126.com.

If you have any issues please feel free to contact me at cs_xcy@126.com

Tianchi IDs: GrandRookie, BruceQD, 七里z, 青禹小生, 蓝绿黄红, LSH123, XMNG, wenwen_123, **小雨姑娘**, wbbhcb
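As a concrete illustration of the mean blending mentioned in step 6, here is a minimal sketch; the function and variable names are illustrative and not taken from 5_Modeling, and the final combination of the three means is only one simple choice:

    import numpy as np

    def blend_scores(score_lists):
        """Blend click-probability scores from several rankers
        (e.g. the CatBoost and LightGBM runs) with three classical means."""
        s = np.vstack(score_lists)    # shape: (n_models, n_samples)
        s = np.clip(s, 1e-12, None)   # guard the geometric/harmonic means
        arithmetic = s.mean(axis=0)
        geometric = np.exp(np.log(s).mean(axis=0))
        harmonic = s.shape[0] / (1.0 / s).sum(axis=0)
        # How the three means are finally combined is defined in 5_Modeling;
        # averaging them is one simple option.
        return (arithmetic + geometric + harmonic) / 3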
================================================
FILE: code/1_DataPreprocessing/01_Generate_Offline_Dataset_origin.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[29]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore")

# In[30]:
current_stage = 9
path = './data/'
output_path = './user_data/offline/'
input_header = 'underexpose_'
output_header = 'offline_'
#path = 'offline/'
#input_header = 'offline_'
#output_header = 'model1/model_1_'

# In[31]:
df_train_list = [pd.read_csv(path+'underexpose_train/'+input_header+'train_click-%d.csv'%x,
                             header=None,
                             names=['user_id', 'item_id', 'time']) for x in range(current_stage + 1)]
for x, df_train in enumerate(df_train_list):
    df_train.to_csv('./user_data/dataset/' + input_header + 'train_click-%d.csv'%x, index=False, header=None)
df_train = pd.concat(df_train_list)
df_train = df_train.drop_duplicates(subset=['user_id','item_id','time'], keep='last')
df_train = df_train.reset_index(drop=True)

# In[32]:
df_test_list = [pd.read_csv(path+'underexpose_test/'+input_header+'test_click-%d.csv'%x,
                            header=None,
                            names=['user_id', 'item_id', 'time']) for x in range(current_stage + 1)]
for x, df_test in enumerate(df_test_list):
    df_test.to_csv('./user_data/dataset/' + input_header + 'test_click-%d.csv'%x, index=False, header=None)
df_test = pd.concat(df_test_list)
df_test = df_test.drop_duplicates(subset=['user_id','item_id','time'], keep='last')
df_test = df_test.reset_index(drop=True)

# In[33]:
df = pd.concat([df_train, df_test])
df = df.drop_duplicates(subset=['user_id','item_id','time'], keep='last')
df = df.reset_index(drop=True)

# In[34]:
# if you are generating the offline dataset please use the commented sentence
# df_pred_list = [pd.read_csv(path+input_header+'test_qtime-%d.csv'%x,
#                             header=None,
#                             names=['user_id','item_id','time']) for x in range(current_stage + 1)]
#online
df_pred_list = [pd.read_csv(path+'underexpose_test/'+input_header+'test_qtime-%d.csv'%x,
                            header=None,
                            names=['user_id','time']) for x in range(current_stage + 1)]
for x, df_pred in enumerate(df_pred_list):
    df_pred.to_csv('./user_data/dataset/' + input_header + 'test_qtime-%d.csv'%x, index=False, header=None)

# In[35]:
for i in range(current_stage + 1):
    if 'item_id' in df_pred_list[i].columns:
        df_pred_list[i] = df_pred_list[i][['user_id','time']]

# In[36]:
df_list = []
for i in range(current_stage + 1):
    df_0 = pd.concat([df_train_list[i], df_test_list[i], df_pred_list[i]])
    df_0 = df_0.sort_values(by=['time'])
    df_0 = df_0.reset_index(drop=True)
    df_list.append(df_0)

# In[37]:
for i in range(current_stage + 1):
    count_log = []
    for index, row in df_pred_list[i].iterrows():
        count_log.append(sum((df_list[i]['user_id']==row['user_id']) & (df_list[i]['time'] 1:
            row_tmp = df_list[each_stage_out].loc[df_tmp[(df_tmp['time']==max(df_tmp['time'])) & (~np.isnan(df_tmp['item_id']))].index[0]]
            user_id_tmp = row_tmp['user_id']
            item_id_tmp = row_tmp['item_id']
            time_tmp = row_tmp['time']
            fout.write(str(int(user_id_tmp)) + ',' + str(int(item_id_tmp)) + ',' + str(time_tmp) + '\n')
        else:
            row_tmp = df_list[each_stage_out].loc[df_tmp.index[-2]]
            user_id_tmp = row_tmp['user_id']
            item_id_tmp = row_tmp['item_id']
            time_tmp = row_tmp['time']
            fout.write(str(int(user_id_tmp)) + ',' + str(int(item_id_tmp)) + ',' + str(time_tmp) + '\n')
        for each_stage_in in range(current_stage + 1):
            list_train_list[each_stage_in] += list(df_train_list[each_stage_in][(df_train_list[each_stage_in]['user_id']==row['user_id']) & (df_train_list[each_stage_in]['item_id']==item_id_tmp)].index)
            list_test_list[each_stage_in] += list(df_test_list[each_stage_in][(df_test_list[each_stage_in]['user_id']==row['user_id']) & (df_test_list[each_stage_in]['item_id']==item_id_tmp)].index)
    fout.close()

# In[39]:
df_train_list = [x.drop(labels=list_train_list[i], axis=0) for i,x in enumerate(df_train_list)]

# In[40]:
df_test_list = [x.drop(labels=list_test_list[i], axis=0) for i,x in enumerate(df_test_list)]

# In[41]:
df_train_list = [x.reset_index(drop=True) for x in df_train_list]
df_test_list = [x.reset_index(drop=True) for x in df_test_list]

# In[42]:
for i in range(current_stage + 1):
    df_train_list[i].to_csv(output_path + output_header+'train_click-%d.csv'%i, index=False, header=None)
    df_test_list[i].to_csv(output_path + output_header+'test_click-%d.csv'%i, index=False, header=None)

================================================
FILE: code/1_DataPreprocessing/02_Generate_Model1_Dataset_origin.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import
warnings warnings.filterwarnings("ignore") # In[23]: current_stage = 9 #path = 'dataset/' #input_header = 'underexpose_' #output_header = 'offline/offline_' path = './user_data/offline/' output_path = './user_data/model_1/' input_header = 'offline_' output_header = 'model_1_' # In[12]: df_train_list = [pd.read_csv(path+input_header+'train_click-%d.csv'%x, header=None, names=['user_id', 'item_id', 'time']) for x in range(current_stage + 1)] df_train = pd.concat(df_train_list) df_train = df_train.drop_duplicates(subset=['user_id','item_id','time'],keep='last') df_train = df_train.reset_index(drop=True) # In[13]: df_test_list = [pd.read_csv(path+input_header+'test_click-%d.csv'%x, header=None, names=['user_id', 'item_id', 'time']) for x in range(current_stage + 1)] df_test = pd.concat(df_test_list) df_test = df_test.drop_duplicates(subset=['user_id','item_id','time'],keep='last') df_test = df_test.reset_index(drop=True) # In[14]: df = pd.concat([df_train,df_test]) df = df.drop_duplicates(subset=['user_id','item_id','time'],keep='last') df = df.reset_index(drop=True) # In[15]: # if you are generating the offline dataset please use the comment sentense df_pred_list = [pd.read_csv(path+input_header+'test_qtime-%d.csv'%x, header=None, names=['user_id','item_id','time']) for x in range(current_stage + 1)] #online #df_pred_list = [pd.read_csv(path+input_header+'test_qtime-%d.csv'%x, # header=None, # names=['user_id','time']) for x in range(current_stage + 1)] # In[16]: for i in range(current_stage + 1): if 'item_id' in df_pred_list[i].columns: df_pred_list[i] = df_pred_list[i][['user_id','time']] # In[17]: df_list = [] for i in range(current_stage + 1): df_0 = pd.concat([df_train_list[i], df_test_list[i],df_pred_list[i]]) df_0 = df_0.sort_values(by=['time']) df_0 = df_0.reset_index(drop=True) df_list.append(df_0) # In[18]: for i in range(current_stage + 1): count_log = [] for index, row in df_pred_list[i].iterrows(): count_log.append(sum((df_list[i]['user_id']==row['user_id']) & (df_list[i]['time'] 1: row_tmp = df_list[each_stage_out].loc[df_tmp[ (df_tmp['time']==max(df_tmp['time']) ) & (~np.isnan(df_tmp['item_id'] )) ].index[0]] user_id_tmp = row_tmp['user_id'] item_id_tmp = row_tmp['item_id'] time_tmp = row_tmp['time'] fout.write(str(int(user_id_tmp)) + ',' + str(int(item_id_tmp)) + ',' + str(time_tmp) + '\n') else: row_tmp = df_list[each_stage_out].loc[df_tmp.index[-2]] user_id_tmp = row_tmp['user_id'] item_id_tmp = row_tmp['item_id'] time_tmp = row_tmp['time'] fout.write(str(int(user_id_tmp)) + ',' + str(int(item_id_tmp)) + ',' + str(time_tmp) + '\n') for each_stage_in in range(current_stage + 1): list_train_list[each_stage_in] += list(df_train_list[each_stage_in][(df_train_list[each_stage_in]['user_id']==row['user_id']) &(df_train_list[each_stage_in]['item_id']==item_id_tmp)].index) list_test_list[each_stage_in] += list(df_test_list[each_stage_in][(df_test_list[each_stage_in]['user_id']==row['user_id']) &(df_test_list[each_stage_in]['item_id']==item_id_tmp)].index) fout.close() # In[ ]: # In[25]: df_train_list = [x.drop(labels=list_train_list[i],axis=0) for i,x in enumerate(df_train_list)] # In[26]: df_test_list = [x.drop(labels=list_test_list[i],axis=0) for i,x in enumerate(df_test_list)] # In[27]: df_train_list = [x.reset_index(drop=True) for x in df_train_list] df_test_list = [x.reset_index(drop=True) for x in df_test_list] # In[28]: for i in range(current_stage + 1): df_train_list[i].to_csv(output_path+output_header+'train_click-%d.csv'%i,index=False,header=None) 
    df_test_list[i].to_csv(output_path+output_header+'test_click-%d.csv'%i, index=False, header=None)

================================================
FILE: code/1_DataPreprocessing/03_Create_Model1_Answer.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[6]:
from collections import defaultdict

current_phases = 9
number = 1

def _create_answer_file_for_evaluation(answer_fname='debias_track_answer.csv'):
    train = './user_data/model_'+str(number)+'/model_'+str(number)+'_train_click-%d.csv'
    test = './user_data/model_'+str(number)+'/model_'+str(number)+'_test_click-%d.csv'
    answer = './user_data/model_'+str(number)+'/model_'+str(number)+'_test_qtime-%d.csv'
    item_deg = defaultdict(lambda: 0)
    with open(answer_fname, 'w') as fout:
        for phase_id in range(current_phases+1):
            with open(train % phase_id) as fin:
                for line in fin:
                    user_id, item_id, timestamp = line.split(',')
                    user_id, item_id, timestamp = (
                        int(user_id), int(item_id), float(timestamp))
                    item_deg[item_id] += 1
            with open(test % phase_id) as fin:
                for line in fin:
                    user_id, item_id, timestamp = line.split(',')
                    user_id, item_id, timestamp = (
                        int(user_id), int(item_id), float(timestamp))
                    item_deg[item_id] += 1
            with open(answer % phase_id) as fin:
                for line in fin:
                    user_id, item_id, timestamp = line.split(',')
                    user_id, item_id, timestamp = (
                        int(user_id), int(item_id), float(timestamp))
                    assert user_id % 11 == phase_id
                    print(phase_id, user_id, item_id, item_deg[item_id], sep=',', file=fout)

# In[7]:
_create_answer_file_for_evaluation('./user_data/model_'+str(number)+'/model_'+str(number)+'_debias_track_answer.csv')

================================================
FILE: code/1_DataPreprocessing/03_Create_Offline_Answer.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:
from collections import defaultdict

current_phases = 9

def _create_answer_file_for_evaluation(answer_fname='debias_track_answer.csv'):
    train = './user_data/offline/offline_train_click-%d.csv'
    test = './user_data/offline/offline_test_click-%d.csv'
    # train = 'model'+str(number)+'/model_'+str(number)+'_train_click-%d.csv'
    # test = 'model'+str(number)+'/model_'+str(number)+'_test_click-%d.csv'
    # underexpose_test_qtime-T.csv contains only (user_id, query_time)
    # underexpose_test_qtime_with_answer-T.csv contains (user_id, item_id, query_time)
    #answer = 'model/model_test_qtime-%d.csv'  # not released
    answer = './user_data/offline/offline_test_qtime-%d.csv'
    # answer = 'model'+str(number)+'/model_'+str(number)+'_test_qtime-%d.csv'
    item_deg = defaultdict(lambda: 0)
    with open(answer_fname, 'w') as fout:
        for phase_id in range(current_phases+1):
            with open(train % phase_id) as fin:
                for line in fin:
                    user_id, item_id, timestamp = line.split(',')
                    user_id, item_id, timestamp = (
                        int(user_id), int(item_id), float(timestamp))
                    item_deg[item_id] += 1
            with open(test % phase_id) as fin:
                for line in fin:
                    user_id, item_id, timestamp = line.split(',')
                    user_id, item_id, timestamp = (
                        int(user_id), int(item_id), float(timestamp))
                    item_deg[item_id] += 1
            with open(answer % phase_id) as fin:
                for line in fin:
                    user_id, item_id, timestamp = line.split(',')
                    user_id, item_id, timestamp = (
                        int(user_id), int(item_id), float(timestamp))
                    assert user_id % 11 == phase_id
                    print(phase_id, user_id, item_id, item_deg[item_id], sep=',', file=fout)

# In[2]:
_create_answer_file_for_evaluation('./user_data/offline/offline_debias_track_answer.csv')

# In[3]:
# _create_answer_file_for_evaluation('model'+str(number)+'/model_'+str(number)+'_debias_track_answer.csv')
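# The next file, 04_TransformDateTime-Copy1.py, recovers calendar dates from the
# anonymized competition timestamps with a fixed linear map. A minimal, self-contained
# sketch of that mapping (the two constants are the ones defined in that file; the
# helper name `to_datetime` is ours):

import datetime

RANDOM_NUMBER_1 = 41152582      # additive offset
RANDOM_NUMBER_2 = 1570909091    # multiplicative scale

def to_datetime(normalized_time):
    """Map an anonymized click timestamp back to an (approximate) real datetime."""
    unix_time = normalized_time * RANDOM_NUMBER_2 + RANDOM_NUMBER_1
    return datetime.datetime.fromtimestamp(unix_time)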
================================================ FILE: code/1_DataPreprocessing/04_TransformDateTime-Copy1.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd from tqdm import tqdm from collections import defaultdict import math import numpy as np import datetime # In[2]: random_number_1 = 41152582 random_number_2 = 1570909091 # In[3]: train_path = './user_data/offline/' test_path = './user_data/offline/' now_phase = 9 for c in range(now_phase + 1): print('phase:', c) click_train = pd.read_csv(train_path + '/offline_train_click-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(test_path + '/offline_test_click-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time']) click_query = pd.read_csv(test_path + '/offline_test_qtime-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time']) click_train['unix_time'] = click_train['time'].apply(lambda x: x * random_number_2 + random_number_1) click_train['datetime'] = click_train['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x)) click_train.to_csv(train_path+'/offline_train_click_{}_time.csv'.format(c),index=False) click_test['unix_time'] = click_test['time'].apply(lambda x: x * random_number_2 + random_number_1) click_test['datetime'] = click_test['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x)) click_test.to_csv(test_path+'/offline_test_click_{}_time.csv'.format(c),index=False) click_query['unix_time'] = click_query['time'].apply(lambda x: x * random_number_2 + random_number_1) click_query['datetime'] = click_query['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x)) click_query.to_csv(test_path+'/offline_test_qtime_{}_time.csv'.format(c),index=False) # In[4]: num = 1 train_path = './user_data/model_'+str(num) test_path = './user_data/model_'+str(num) now_phase = 9 for c in range(now_phase + 1): print('phase:', c) click_train = pd.read_csv(train_path + '/model_'+str(num)+'_train_click-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(test_path + '/model_'+str(num)+'_test_click-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time']) click_query = pd.read_csv(test_path + '/model_'+str(num)+'_test_qtime-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time']) click_train['unix_time'] = click_train['time'].apply(lambda x: x * random_number_2 + random_number_1) click_train['datetime'] = click_train['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x)) click_train.to_csv(train_path+'/model_'+str(num)+'_train_click_{}_time.csv'.format(c),index=False) click_test['unix_time'] = click_test['time'].apply(lambda x: x * random_number_2 + random_number_1) click_test['datetime'] = click_test['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x)) click_test.to_csv(test_path+'/model_'+str(num)+'_test_click_{}_time.csv'.format(c),index=False) click_query['unix_time'] = click_query['time'].apply(lambda x: x * random_number_2 + random_number_1) click_query['datetime'] = click_query['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x)) click_query.to_csv(test_path+'/model_'+str(num)+'_test_qtime_{}_time.csv'.format(c),index=False) # In[5]: train_path = './user_data/dataset' test_path = './user_data/dataset' now_phase = 9 for c in range(now_phase + 1): print('phase:', c) click_train = pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(c), header=None, names=['user_id', 
'item_id', 'time'])
    click_test = pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(c), header=None, names=['user_id', 'item_id', 'time'])
    click_query = pd.read_csv(test_path + '/underexpose_test_qtime-{}.csv'.format(c), header=None, names=['user_id', 'time'])

    click_train['unix_time'] = click_train['time'].apply(lambda x: x * random_number_2 + random_number_1)
    click_train['datetime'] = click_train['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
    click_train.to_csv(train_path+'/underexpose_train_click_{}_time.csv'.format(c), index=False)

    click_test['unix_time'] = click_test['time'].apply(lambda x: x * random_number_2 + random_number_1)
    click_test['datetime'] = click_test['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
    click_test.to_csv(test_path+'/underexpose_test_click_{}_time.csv'.format(c), index=False)

    click_query['unix_time'] = click_query['time'].apply(lambda x: x * random_number_2 + random_number_1)
    click_query['datetime'] = click_query['unix_time'].apply(lambda x: datetime.datetime.fromtimestamp(x))
    click_query.to_csv(test_path+'/underexpose_test_qtime_{}_time.csv'.format(c), index=False)

================================================
FILE: code/1_DataPreprocessing/05_Generate_img_txt_vec.py
================================================
import pandas as pd
from gensim.models import KeyedVectors

train_path = './data/underexpose_train/'
item = pd.read_csv(train_path+'underexpose_item_feat.csv', header=None)

# strip the '[' / ']' brackets that wrap the raw text and image vectors
item[1] = item[1].apply(lambda x: float(str(x).replace('[', '')))
item[256] = item[256].apply(lambda x: float(str(x).replace(']', '')))
item[128] = item[128].apply(lambda x: float(str(x).replace(']', '')))
item[129] = item[129].apply(lambda x: float(str(x).replace('[', '')))

item.columns = ['item_id'] + ['txt_vec_{}'.format(f) for f in range(0, 128)] + ['img_vec_{}'.format(f) for f in range(0, 128)]
item_nun = item['item_id'].nunique()

item[['item_id'] + ['img_vec_{}'.format(f) for f in range(0, 128)]].to_csv("user_data/w2v_img_vec.txt", sep=" ", header=[str(item_nun), '128'] + [""] * 127, index=False, encoding='UTF-8')
item[['item_id'] + ['txt_vec_{}'.format(f) for f in range(0, 128)]].to_csv("user_data/w2v_txt_vec.txt", sep=" ", header=[str(item_nun), '128'] + [""] * 127, index=False, encoding='UTF-8')

# sanity-check that each exported file loads in word2vec format
txt_vec_model = KeyedVectors.load_word2vec_format("./user_data/" + 'w2v_txt_vec.txt', binary=False)
img_vec_model = KeyedVectors.load_word2vec_format("./user_data/" + 'w2v_img_vec.txt', binary=False)

================================================
FILE: code/2_Similarity/01_itemCF_Mundane_model1.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[13]:
from __future__ import division
from __future__ import print_function
from gensim.models import KeyedVectors
import gc
import os
import math
import time
import random
import joblib
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import pickle
from multiprocessing import Pool as ProcessPool
import json

# In[14]:
random.seed(2020)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

# In[15]:
def process(each_item):
    # round every similarity entry of one item to 4 decimal places
    dict_tmp = item_sim_list[each_item]
    for j in dict_tmp:
        dict_tmp[j] = round(dict_tmp[j], 4)
    return (each_item, dict_tmp)
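# NOTE: myround below truncates toward zero at `thres` decimal places rather than
# rounding to the nearest value: myround(0.98769, 4) == 0.9876, while
# round(0.98769, 4) == 0.9877. The lambda re-definition in the next cell (# In[16])
# shadows this def with an equivalent one-liner; both are kept as in the notebook export.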
def myround(x, thres): temp = 10**thres return int(x * temp) / temp # In[16]: myround = lambda x,thres : int(x * 10**thres) / 10**thres # In[17]: def phase_predict(df, pred_col, top_fill, topk=50): """recom_df, 'sim', top50_click, "click_valid" """ top_fill = [int(t) for t in top_fill.split(',')] top_fill = top_fill[:topk] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df.sort_values("rank", inplace=True) df = df[df["rank"] <= topk] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df def get_sim_item(df_, user_col, item_col):#, nodewalk_model,deepwalk_model,txt_vec_model): global txt_similarity global deepwalk_similarity global nodewalk_similarity df = df_.copy() user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index() user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col])) user_time_ = df.groupby(user_col)['time'].agg(list).reset_index() # 引入时间因素 user_time_dict = dict(zip(user_time_[user_col], user_time_['time'])) item_user_ = df.groupby(item_col)[user_col].agg(set).reset_index() item_user_dict = dict(zip(item_user_[item_col], item_user_[user_col])) item_dic = df[item_col].value_counts().to_dict() df.sort_values('time', inplace=True) df.drop_duplicates('item_id', keep='first', inplace=True) item_time_ = df.groupby(item_col)['time'].agg(list).reset_index() # 引入时间因素 item_time_dict = dict(zip(item_time_[item_col], item_time_['time'])) sim_item = {} item_cnt = defaultdict(int) # 商品被点击次数 for user, items in tqdm(user_item_dict.items()): for loc1, item in enumerate(items): users = item_user_dict[item] item_cnt[item] += 1 sim_item.setdefault(item, {}) user_item_len = len(items) for loc2, relate_item in enumerate(items): if item == relate_item: continue t1 = user_time_dict[user][loc1] # 点击时间提取 t2 = user_time_dict[user][loc2] delta_t = abs(t1 - t2) * 650000 delta_loc = abs(loc1 - loc2) ''' The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' sim_item[item].setdefault(relate_item, [0,0,0,np.inf,np.inf,-1e8,0,-1e8,0] ) key = [str(int(item)), str(int(relate_item))] key_tmp = "_".join(key) ##nodewalk if key_tmp in nodewalk_similarity: node_sim = nodewalk_similarity[key_tmp] else: try: node_sim = 0.5 * nodewalk_model.similarity(str(item), str(relate_item))+ 0.5 except: node_sim = 0.5 nodewalk_similarity[key_tmp] = node_sim ##deepwalk if key_tmp in deepwalk_similarity: deep_sim = deepwalk_similarity[key_tmp] else: try: deep_sim = 0.5 * deepwalk_model.similarity(str(item), str(relate_item))+ 0.5 except: deep_sim = 0.5 deepwalk_similarity[key_tmp] = deep_sim #txt if key_tmp in txt_similarity: txt_sim = txt_similarity[key_tmp] else: try: txt_sim = 0.5 * txt_model.similarity(str(item), str(relate_item))+ 0.5 except: txt_sim = 0.5 txt_similarity[key_tmp] = 
txt_sim ''' WIJ The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' if loc1 - loc2 > 0: sim_item[item][relate_item][0] += (node_sim**2)*deep_sim*txt_sim * 0.8 * max(0.5, (0.9 ** (loc1 - loc2 - 1))) * ( max(0.5, 1 / (1 + delta_t))) / (math.log(len(users) + 1) * math.log( 1 + user_item_len)) else: sim_item[item][relate_item][0] += (node_sim**2)*deep_sim*txt_sim * 1.0 * max(0.5, (0.9 ** (loc2 - loc1 - 1))) * ( max(0.5, 1 / (1 + delta_t))) / (math.log(len(users) + 1) * math.log( 1 + user_item_len)) if delta_t < sim_item[item][relate_item][3]: sim_item[item][relate_item][3] = delta_t if delta_loc < sim_item[item][relate_item][4]: sim_item[item][relate_item][4] = delta_loc sim_item[item][relate_item][1] += 1 sim_item[item][relate_item][2] += (0.8**(loc2-loc1-1)) * (1 - (t2 - t1) * 2000) / math.log(1 + len(items)) if node_sim > sim_item[item][relate_item][5]: sim_item[item][relate_item][5] = node_sim sim_item[item][relate_item][6] += node_sim if deep_sim > sim_item[item][relate_item][7]: sim_item[item][relate_item][7] = deep_sim sim_item[item][relate_item][8] += deep_sim sim_item_corr = sim_item.copy() for i, related_items in tqdm(sim_item.items()): for j, cij in related_items.items(): cosine_sim = cij[0] / ((item_cnt[i] * item_cnt[j]) ** 0.2) sim_item_corr[i][j][0] = cosine_sim sim_item_corr[i][j] = [myround(x, 4) for x in sim_item_corr[i][j]] return sim_item_corr, user_item_dict, user_time_dict, item_dic, item_time_dict def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num): ''' input:item_sim_list, user_item, uid, 500, 50 # 用户历史序列中的所有商品均有关联商品,整合这些关联商品,进行相似性排序 ''' rank = {} interacted_items = user_item_dict[user_id] interacted_items = interacted_items[::-1] times = times[::-1] t0 = times[0] for loc, i in enumerate(interacted_items): for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1][0], reverse=True)[0:top_k]: if j not in interacted_items: rank.setdefault(j, [0,0,0,np.inf,np.inf,np.inf,np.inf,np.inf,-1e8,0,-1e8,0]) ''' RANK {'sim': 0,---------------------------------0 'item_cf': 0,------------------------------1 'item_cf_weighted': 0,---------------------2 'time_diff': np.inf,-----------------------3 'loc_diff': np.inf,------------------------4 # Some feature generated by recall 'time_diff_recall': np.inf,----------------5 'time_diff_recall_1': np.inf,--------------6 'loc_diff_recall': np.inf,-----------------7 # Nodesim and Deepsim 'node_sim_max': -1e8,--------------------8 'node_sim_sum':0,------------------------9 'deep_sim_max': -1e8,--------------------10 'deep_sim_sum':0,------------------------11 } ''' t1 = times[loc] t2 = item_time_dict[j][0] delta_t1 = abs(t0 - t1) * 650000 delta_t2 = abs(t0 - t2) * 650000 alpha = max(0.2, 1 / (1 + item_dict[j])) beta = max(0.5, (0.9 ** loc)) theta = max(0.5, 1 / (1 + delta_t1)) gamma = max(0.5, 1 / (1 + delta_t2)) ''' RANK {'sim': 0,---------------------------------0 'item_cf': 0,------------------------------1 'item_cf_weighted': 0,---------------------2 'time_diff': np.inf,-----------------------3 'loc_diff': np.inf,------------------------4 # Some feature generated by recall 'time_diff_recall': np.inf,----------------5 'time_diff_recall_1': np.inf,--------------6 'loc_diff_recall': 
np.inf,-----------------7 # Nodesim and Deepsim 'node_sim_max': -1e8,--------------------8 'node_sim_sum':0,------------------------9 'deep_sim_max': -1e8,--------------------10 'deep_sim_sum':0,------------------------11 } ''' ''' WIJ The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' rank[j][0] += myround(wij[0] * (alpha ** 2) * (beta) * (theta ** 2) * gamma, 4) rank[j][1] += wij[1] rank[j][2] += wij[2] if wij[3] < rank[j][3]: rank[j][3] = wij[3] if wij[4] < rank[j][4]: rank[j][4] = wij[4] if delta_t1 < rank[j][5]: rank[j][5] = myround(delta_t1, 4) if delta_t2 < rank[j][6]: rank[j][6] = myround(delta_t2, 4) if loc < rank[j][7]: rank[j][7] = loc if wij[5] > rank[j][8]: rank[j][8] = wij[5] rank[j][9] += wij[6] / wij[1] if wij[7] > rank[j][10]: rank[j][10] = wij[7] rank[j][11] += wij[8] / wij[1] return sorted(rank.items(), key=lambda d: d[1][0], reverse=True)[:item_num] # In[18]: now_phase = 9 header = 'model_1' txt_similarity = {} deepwalk_similarity = {} nodewalk_similarity = {} offline = "./user_data/model_1/" out_path = './user_data/model_1/new_similarity/' print("start") print("read sim") nodewalk_model = KeyedVectors.load_word2vec_format(offline + 'node2vec_' + header + '.bin',binary=True) deepwalk_model = KeyedVectors.load_word2vec_format(offline + 'deepwalk_' + header + '.bin',binary=True) txt_model = KeyedVectors.load_word2vec_format('./user_data/w2v_txt_vec.txt') # In[19]: recom_item = [] for phase in range(now_phase + 1): a = time.time() history_list = [] for i in range(now_phase + 1): click_train = pd.read_csv(offline + header + '_train_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header + '_test_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) all_click = click_train.append(click_test) history_list.append(all_click) qtime_test = pd.read_csv(offline + header + '_test_qtime-{}.csv'.format(phase), header=None, names=['user_id', 'item_id', 'time']) print('phase:', phase) time_diff = max(history_list[now_phase]['time']) - min(history_list[0]['time']) for i in range(phase + 1, now_phase + 1): history_list[i]['time'] = history_list[i]['time'] - time_diff whole_click = pd.DataFrame() for i in range(now_phase + 1): whole_click = whole_click.append(history_list[i]) whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last') whole_click = whole_click.sort_values('time') whole_click = whole_click.reset_index(drop=True) item_sim_list, user_item, user_time_dict, item_dic, item_time_dict = get_sim_item(whole_click, 'user_id', 'item_id' ) print("phase finish time:{:6.4f} mins".format((time.time() - a) / 60)) for user in tqdm(qtime_test['user_id'].unique()): if user in user_time_dict: times = user_time_dict[user] rank_item = recommend(item_sim_list, user_item, user, times, item_dic, item_time_dict, 500, 1000) for j in rank_item: recom_item.append([user, int(j[0])] + j[1]) for i, related_items in tqdm(item_sim_list.items()): for j, cij in related_items.items(): item_sim_list[i][j] = cij[0] write_file = open(out_path+'itemCF_new'+str(phase)+'.pkl', 'wb') pickle.dump(item_sim_list, write_file) write_file.close() write_file = open(out_path+'user2item_new'+str(phase)+'.pkl', 'wb') 
pickle.dump(user_item, write_file) write_file.close() write_file = open(out_path+'item2cnt_new'+str(phase)+'.pkl', 'wb') pickle.dump(item_dic, write_file) write_file.close() write_file = open(out_path+'userTime'+str(phase)+'.pkl', 'wb') pickle.dump(user_time_dict, write_file) write_file.close() write_file = open(out_path+'itemTime'+str(phase)+'.pkl', 'wb') pickle.dump(item_time_dict, write_file) write_file.close() write_file = open(out_path+'recom_item'+'.pkl', 'wb') pickle.dump(recom_item, write_file) write_file.close() del item_sim_list del user_item del user_time_dict del item_dic del item_time_dict gc.collect() ================================================ FILE: code/2_Similarity/01_itemCF_Mundane_offline.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[30]: from __future__ import division from __future__ import print_function from gensim.models import KeyedVectors import gc import os import math import time import random import joblib import itertools import numpy as np import pandas as pd from tqdm import tqdm from collections import defaultdict import pickle from multiprocessing import Pool as ProcessPool import json # In[31]: random.seed(2020) pd.set_option('display.unicode.ambiguous_as_wide', True) pd.set_option('display.unicode.east_asian_width', True) pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) pd.set_option("display.max_colwidth", 100) pd.set_option('display.width', 1000) # In[32]: def process(each_item): dict_tmp = item_sim_list[each_item] for j in dict_tmp: dict_tmp[j] = round(dict_tmp[j],4) dict_tmp[j] = round(dict_tmp[j],4) return (each_item,dict_tmp) def myround(x, thres): temp = 10**thres return int(x * temp) / temp # In[33]: myround = lambda x,thres : int(x * 10**thres) / 10**thres # In[34]: def phase_predict(df, pred_col, top_fill, topk=50): """recom_df, 'sim', top50_click, "click_valid" """ top_fill = [int(t) for t in top_fill.split(',')] top_fill = top_fill[:topk] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df.sort_values("rank", inplace=True) df = df[df["rank"] <= topk] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df def get_sim_item(df_, user_col, item_col):#, nodewalk_model,deepwalk_model,txt_vec_model): global txt_similarity global deepwalk_similarity global nodewalk_similarity df = df_.copy() user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index() user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col])) user_time_ = df.groupby(user_col)['time'].agg(list).reset_index() # 引入时间因素 user_time_dict = dict(zip(user_time_[user_col], user_time_['time'])) item_user_ = df.groupby(item_col)[user_col].agg(set).reset_index() item_user_dict = dict(zip(item_user_[item_col], item_user_[user_col])) item_dic = df[item_col].value_counts().to_dict() df.sort_values('time', inplace=True) df.drop_duplicates('item_id', keep='first', inplace=True) item_time_ = df.groupby(item_col)['time'].agg(list).reset_index() # 引入时间因素 
item_time_dict = dict(zip(item_time_[item_col], item_time_['time'])) sim_item = {} item_cnt = defaultdict(int) # 商品被点击次数 for user, items in tqdm(user_item_dict.items()): for loc1, item in enumerate(items): users = item_user_dict[item] item_cnt[item] += 1 sim_item.setdefault(item, {}) user_item_len = len(items) for loc2, relate_item in enumerate(items): if item == relate_item: continue t1 = user_time_dict[user][loc1] # 点击时间提取 t2 = user_time_dict[user][loc2] delta_t = abs(t1 - t2) * 650000 delta_loc = abs(loc1 - loc2) ''' The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' sim_item[item].setdefault(relate_item, [0,0,0,np.inf,np.inf,-1e8,0,-1e8,0] ) key = [str(int(item)), str(int(relate_item))] key_tmp = "_".join(key) ##nodewalk if key_tmp in nodewalk_similarity: node_sim = nodewalk_similarity[key_tmp] else: try: node_sim = 0.5 * nodewalk_model.similarity(str(item), str(relate_item))+ 0.5 except: node_sim = 0.5 nodewalk_similarity[key_tmp] = node_sim ##deepwalk if key_tmp in deepwalk_similarity: deep_sim = deepwalk_similarity[key_tmp] else: try: deep_sim = 0.5 * deepwalk_model.similarity(str(item), str(relate_item))+ 0.5 except: deep_sim = 0.5 deepwalk_similarity[key_tmp] = deep_sim #txt if key_tmp in txt_similarity: txt_sim = txt_similarity[key_tmp] else: try: txt_sim = 0.5 * txt_model.similarity(str(item), str(relate_item))+ 0.5 except: txt_sim = 0.5 txt_similarity[key_tmp] = txt_sim ''' WIJ The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' if loc1 - loc2 > 0: sim_item[item][relate_item][0] += (node_sim**2)*deep_sim*txt_sim * 0.8 * max(0.5, (0.9 ** (loc1 - loc2 - 1))) * ( max(0.5, 1 / (1 + delta_t))) / (math.log(len(users) + 1) * math.log( 1 + user_item_len)) else: sim_item[item][relate_item][0] += (node_sim**2)*deep_sim*txt_sim * 1.0 * max(0.5, (0.9 ** (loc2 - loc1 - 1))) * ( max(0.5, 1 / (1 + delta_t))) / (math.log(len(users) + 1) * math.log( 1 + user_item_len)) if delta_t < sim_item[item][relate_item][3]: sim_item[item][relate_item][3] = delta_t if delta_loc < sim_item[item][relate_item][4]: sim_item[item][relate_item][4] = delta_loc sim_item[item][relate_item][1] += 1 sim_item[item][relate_item][2] += (0.8**(loc2-loc1-1)) * (1 - (t2 - t1) * 2000) / math.log(1 + len(items)) if node_sim > sim_item[item][relate_item][5]: sim_item[item][relate_item][5] = node_sim sim_item[item][relate_item][6] += node_sim if deep_sim > sim_item[item][relate_item][7]: sim_item[item][relate_item][7] = deep_sim sim_item[item][relate_item][8] += deep_sim sim_item_corr = sim_item.copy() for i, related_items in tqdm(sim_item.items()): for j, cij in related_items.items(): cosine_sim = cij[0] / ((item_cnt[i] * item_cnt[j]) ** 0.2) sim_item_corr[i][j][0] = cosine_sim sim_item_corr[i][j] = [myround(x, 4) for x in sim_item_corr[i][j]] return sim_item_corr, user_item_dict, user_time_dict, item_dic, item_time_dict def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num): ''' input:item_sim_list, 
user_item, uid, 500, 50 # 用户历史序列中的所有商品均有关联商品,整合这些关联商品,进行相似性排序 ''' rank = {} interacted_items = user_item_dict[user_id] interacted_items = interacted_items[::-1] times = times[::-1] t0 = times[0] for loc, i in enumerate(interacted_items): for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1][0], reverse=True)[0:top_k]: if j not in interacted_items: rank.setdefault(j, [0,0,0,np.inf,np.inf,np.inf,np.inf,np.inf,-1e8,0,-1e8,0]) ''' RANK {'sim': 0,---------------------------------0 'item_cf': 0,------------------------------1 'item_cf_weighted': 0,---------------------2 'time_diff': np.inf,-----------------------3 'loc_diff': np.inf,------------------------4 # Some feature generated by recall 'time_diff_recall': np.inf,----------------5 'time_diff_recall_1': np.inf,--------------6 'loc_diff_recall': np.inf,-----------------7 # Nodesim and Deepsim 'node_sim_max': -1e8,--------------------8 'node_sim_sum':0,------------------------9 'deep_sim_max': -1e8,--------------------10 'deep_sim_sum':0,------------------------11 } ''' t1 = times[loc] t2 = item_time_dict[j][0] delta_t1 = abs(t0 - t1) * 650000 delta_t2 = abs(t0 - t2) * 650000 alpha = max(0.2, 1 / (1 + item_dict[j])) beta = max(0.5, (0.9 ** loc)) theta = max(0.5, 1 / (1 + delta_t1)) gamma = max(0.5, 1 / (1 + delta_t2)) ''' RANK {'sim': 0,---------------------------------0 'item_cf': 0,------------------------------1 'item_cf_weighted': 0,---------------------2 'time_diff': np.inf,-----------------------3 'loc_diff': np.inf,------------------------4 # Some feature generated by recall 'time_diff_recall': np.inf,----------------5 'time_diff_recall_1': np.inf,--------------6 'loc_diff_recall': np.inf,-----------------7 # Nodesim and Deepsim 'node_sim_max': -1e8,--------------------8 'node_sim_sum':0,------------------------9 'deep_sim_max': -1e8,--------------------10 'deep_sim_sum':0,------------------------11 } ''' ''' WIJ The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' rank[j][0] += myround(wij[0] * (alpha ** 2) * (beta) * (theta ** 2) * gamma, 4) rank[j][1] += wij[1] rank[j][2] += wij[2] if wij[3] < rank[j][3]: rank[j][3] = wij[3] if wij[4] < rank[j][4]: rank[j][4] = wij[4] if delta_t1 < rank[j][5]: rank[j][5] = myround(delta_t1, 4) if delta_t2 < rank[j][6]: rank[j][6] = myround(delta_t2, 4) if loc < rank[j][7]: rank[j][7] = loc if wij[5] > rank[j][8]: rank[j][8] = wij[5] rank[j][9] += wij[6] / wij[1] if wij[7] > rank[j][10]: rank[j][10] = wij[7] rank[j][11] += wij[8] / wij[1] return sorted(rank.items(), key=lambda d: d[1][0], reverse=True)[:item_num] # In[35]: now_phase = 9 header = 'offline' txt_similarity = {} deepwalk_similarity = {} nodewalk_similarity = {} offline = "./user_data/offline/" out_path = './user_data/offline/new_similarity/' print("start") print("read sim") nodewalk_model = KeyedVectors.load_word2vec_format(offline + 'node2vec_' + header + '.bin',binary=True) deepwalk_model = KeyedVectors.load_word2vec_format(offline + 'deepwalk_' + header + '.bin',binary=True) txt_model = KeyedVectors.load_word2vec_format('./user_data/w2v_txt_vec.txt') # In[ ]: recom_item = [] for phase in range(0, now_phase + 1): a = time.time() history_list = [] for i in range(now_phase + 1): click_train = pd.read_csv(offline + header + 
'_train_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header + '_test_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) all_click = click_train.append(click_test) history_list.append(all_click) qtime_test = pd.read_csv(offline + header + '_test_qtime-{}.csv'.format(phase), header=None, names=['user_id', 'item_id', 'time']) print('phase:', phase) time_diff = max(history_list[now_phase]['time']) - min(history_list[0]['time']) for i in range(phase + 1, now_phase + 1): history_list[i]['time'] = history_list[i]['time'] - time_diff whole_click = pd.DataFrame() for i in range(now_phase + 1): whole_click = whole_click.append(history_list[i]) whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last') whole_click = whole_click.sort_values('time') whole_click = whole_click.reset_index(drop=True) item_sim_list, user_item, user_time_dict, item_dic, item_time_dict = get_sim_item(whole_click, 'user_id', 'item_id' ) print("phase finish time:{:6.4f} mins".format((time.time() - a) / 60)) for user in tqdm(qtime_test['user_id'].unique()): if user in user_time_dict: times = user_time_dict[user] rank_item = recommend(item_sim_list, user_item, user, times, item_dic, item_time_dict, 500, 1000) for j in rank_item: recom_item.append([user, int(j[0])] + j[1]) for i, related_items in tqdm(item_sim_list.items()): for j, cij in related_items.items(): item_sim_list[i][j] = cij[0] write_file = open(out_path+'itemCF_new'+str(phase)+'.pkl', 'wb') pickle.dump(item_sim_list, write_file) write_file.close() write_file = open(out_path+'user2item_new'+str(phase)+'.pkl', 'wb') pickle.dump(user_item, write_file) write_file.close() write_file = open(out_path+'item2cnt_new'+str(phase)+'.pkl', 'wb') pickle.dump(item_dic, write_file) write_file.close() write_file = open(out_path+'userTime'+str(phase)+'.pkl', 'wb') pickle.dump(user_time_dict, write_file) write_file.close() write_file = open(out_path+'itemTime'+str(phase)+'.pkl', 'wb') pickle.dump(item_time_dict, write_file) write_file.close() write_file = open(out_path+'recom_item'+'.pkl', 'wb') pickle.dump(recom_item, write_file) write_file.close() del item_sim_list del user_item del user_time_dict del item_dic del item_time_dict gc.collect() ================================================ FILE: code/2_Similarity/01_itemCF_Mundane_online.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[1]: from __future__ import division from __future__ import print_function from gensim.models import KeyedVectors import gc import os import math import time import random import joblib import itertools import numpy as np import pandas as pd from tqdm import tqdm from collections import defaultdict import pickle from multiprocessing import Pool as ProcessPool import json import time # In[2]: # print('俺睡着了') # time.sleep(6 * 60 * 60) # print('俺睡醒了') # In[3]: random.seed(2020) pd.set_option('display.unicode.ambiguous_as_wide', True) pd.set_option('display.unicode.east_asian_width', True) pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) pd.set_option("display.max_colwidth", 100) pd.set_option('display.width', 1000) # In[4]: def process(each_item): dict_tmp = item_sim_list[each_item] for j in dict_tmp: dict_tmp[j] = round(dict_tmp[j],4) dict_tmp[j] = round(dict_tmp[j],4) return (each_item,dict_tmp) def myround(x, thres): temp = 10**thres return int(x * temp) / temp # In[5]: myround = lambda x,thres : 
int(x * 10**thres) / 10**thres # In[6]: def phase_predict(df, pred_col, top_fill, topk=50): """recom_df, 'sim', top50_click, "click_valid" """ top_fill = [int(t) for t in top_fill.split(',')] top_fill = top_fill[:topk] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df.sort_values("rank", inplace=True) df = df[df["rank"] <= topk] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df def get_sim_item(df_, user_col, item_col):#, nodewalk_model,deepwalk_model,txt_vec_model): global txt_similarity global deepwalk_similarity global nodewalk_similarity df = df_.copy() user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index() user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col])) user_time_ = df.groupby(user_col)['time'].agg(list).reset_index() # 引入时间因素 user_time_dict = dict(zip(user_time_[user_col], user_time_['time'])) item_user_ = df.groupby(item_col)[user_col].agg(set).reset_index() item_user_dict = dict(zip(item_user_[item_col], item_user_[user_col])) item_dic = df[item_col].value_counts().to_dict() df.sort_values('time', inplace=True) df.drop_duplicates('item_id', keep='first', inplace=True) item_time_ = df.groupby(item_col)['time'].agg(list).reset_index() # 引入时间因素 item_time_dict = dict(zip(item_time_[item_col], item_time_['time'])) sim_item = {} item_cnt = defaultdict(int) # 商品被点击次数 for user, items in tqdm(user_item_dict.items()): for loc1, item in enumerate(items): users = item_user_dict[item] item_cnt[item] += 1 sim_item.setdefault(item, {}) user_item_len = len(items) for loc2, relate_item in enumerate(items): if item == relate_item: continue t1 = user_time_dict[user][loc1] # 点击时间提取 t2 = user_time_dict[user][loc2] delta_t = abs(t1 - t2) * 650000 delta_loc = abs(loc1 - loc2) ''' The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' sim_item[item].setdefault(relate_item, [0,0,0,np.inf,np.inf,-1e8,0,-1e8,0] ) key = [str(int(item)), str(int(relate_item))] key_tmp = "_".join(key) ##nodewalk if key_tmp in nodewalk_similarity: node_sim = nodewalk_similarity[key_tmp] else: try: node_sim = 0.5 * nodewalk_model.similarity(str(item), str(relate_item))+ 0.5 except: node_sim = 0.5 nodewalk_similarity[key_tmp] = node_sim ##deepwalk if key_tmp in deepwalk_similarity: deep_sim = deepwalk_similarity[key_tmp] else: try: deep_sim = 0.5 * deepwalk_model.similarity(str(item), str(relate_item))+ 0.5 except: deep_sim = 0.5 deepwalk_similarity[key_tmp] = deep_sim #txt if key_tmp in txt_similarity: txt_sim = txt_similarity[key_tmp] else: try: txt_sim = 0.5 * txt_model.similarity(str(item), str(relate_item))+ 0.5 except: txt_sim = 0.5 txt_similarity[key_tmp] = txt_sim ''' WIJ The meaning of each columns: {'sim': 0,------------------------0 'item_cf': 
0,-------------------1 'item_cf_weighted': 0,----------2 'time_diff': np.inf,------------3 'loc_diff': np.inf,-------------4 'node_sim_max': -1e8,-----------5 'node_sim_sum':0,---------------6 'deep_sim_max': -1e8,-----------7 'deep_sim_sum':0----------------8 } ''' if loc1 - loc2 > 0: sim_item[item][relate_item][0] += (node_sim**2)*deep_sim*txt_sim * 0.8 * max(0.5, (0.9 ** (loc1 - loc2 - 1))) * ( max(0.5, 1 / (1 + delta_t))) / (math.log(len(users) + 1) * math.log( 1 + user_item_len)) else: sim_item[item][relate_item][0] += (node_sim**2)*deep_sim*txt_sim * 1.0 * max(0.5, (0.9 ** (loc2 - loc1 - 1))) * ( max(0.5, 1 / (1 + delta_t))) / (math.log(len(users) + 1) * math.log( 1 + user_item_len)) if delta_t < sim_item[item][relate_item][3]: sim_item[item][relate_item][3] = delta_t if delta_loc < sim_item[item][relate_item][4]: sim_item[item][relate_item][4] = delta_loc sim_item[item][relate_item][1] += 1 sim_item[item][relate_item][2] += (0.8**(loc2-loc1-1)) * (1 - (t2 - t1) * 2000) / math.log(1 + len(items)) if node_sim > sim_item[item][relate_item][5]: sim_item[item][relate_item][5] = node_sim sim_item[item][relate_item][6] += node_sim if deep_sim > sim_item[item][relate_item][7]: sim_item[item][relate_item][7] = deep_sim sim_item[item][relate_item][8] += deep_sim sim_item_corr = sim_item.copy() for i, related_items in tqdm(sim_item.items()): for j, cij in related_items.items(): cosine_sim = cij[0] / ((item_cnt[i] * item_cnt[j]) ** 0.2) sim_item_corr[i][j][0] = cosine_sim sim_item_corr[i][j] = [myround(x, 4) for x in sim_item_corr[i][j]] return sim_item_corr, user_item_dict, user_time_dict, item_dic, item_time_dict def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num): ''' input:item_sim_list, user_item, uid, 500, 50 # 用户历史序列中的所有商品均有关联商品,整合这些关联商品,进行相似性排序 ''' rank = {} interacted_items = user_item_dict[user_id] interacted_items = interacted_items[::-1] times = times[::-1] t0 = times[0] for loc, i in enumerate(interacted_items): for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1][0], reverse=True)[0:top_k]: if j not in interacted_items: rank.setdefault(j, [0,0,0,np.inf,np.inf,np.inf,np.inf,np.inf,-1e8,0,-1e8,0]) ''' RANK {'sim': 0,---------------------------------0 'item_cf': 0,------------------------------1 'item_cf_weighted': 0,---------------------2 'time_diff': np.inf,-----------------------3 'loc_diff': np.inf,------------------------4 # Some feature generated by recall 'time_diff_recall': np.inf,----------------5 'time_diff_recall_1': np.inf,--------------6 'loc_diff_recall': np.inf,-----------------7 # Nodesim and Deepsim 'node_sim_max': -1e8,--------------------8 'node_sim_sum':0,------------------------9 'deep_sim_max': -1e8,--------------------10 'deep_sim_sum':0,------------------------11 } ''' t1 = times[loc] t2 = item_time_dict[j][0] delta_t1 = abs(t0 - t1) * 650000 delta_t2 = abs(t0 - t2) * 650000 alpha = max(0.2, 1 / (1 + item_dict[j])) beta = max(0.5, (0.9 ** loc)) theta = max(0.5, 1 / (1 + delta_t1)) gamma = max(0.5, 1 / (1 + delta_t2)) ''' RANK {'sim': 0,---------------------------------0 'item_cf': 0,------------------------------1 'item_cf_weighted': 0,---------------------2 'time_diff': np.inf,-----------------------3 'loc_diff': np.inf,------------------------4 # Some feature generated by recall 'time_diff_recall': np.inf,----------------5 'time_diff_recall_1': np.inf,--------------6 'loc_diff_recall': np.inf,-----------------7 # Nodesim and Deepsim 'node_sim_max': -1e8,--------------------8 
def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num):
    '''
    input: item_sim_list, user_item, uid, 500, 50
    Every item in the user's history sequence has related items; merge those
    related items and rank them by similarity.
    '''
    rank = {}
    interacted_items = user_item_dict[user_id]
    interacted_items = interacted_items[::-1]
    times = times[::-1]
    t0 = times[0]
    for loc, i in enumerate(interacted_items):
        for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1][0], reverse=True)[0:top_k]:
            if j not in interacted_items:
                rank.setdefault(j, [0, 0, 0, np.inf, np.inf, np.inf, np.inf, np.inf, -1e8, 0, -1e8, 0])
                '''
                RANK -- the meaning of each column:
                {'sim': 0,---------------------------------0
                 'item_cf': 0,-----------------------------1
                 'item_cf_weighted': 0,--------------------2
                 'time_diff': np.inf,----------------------3
                 'loc_diff': np.inf,-----------------------4
                 # Some features generated by recall
                 'time_diff_recall': np.inf,---------------5
                 'time_diff_recall_1': np.inf,-------------6
                 'loc_diff_recall': np.inf,----------------7
                 # Nodesim and Deepsim
                 'node_sim_max': -1e8,---------------------8
                 'node_sim_sum': 0,------------------------9
                 'deep_sim_max': -1e8,---------------------10
                 'deep_sim_sum': 0,------------------------11
                }
                (wij columns follow the WIJ layout documented in get_sim_item.)
                '''
                t1 = times[loc]
                t2 = item_time_dict[j][0]
                delta_t1 = abs(t0 - t1) * 650000
                delta_t2 = abs(t0 - t2) * 650000
                alpha = max(0.2, 1 / (1 + item_dict[j]))
                beta = max(0.5, (0.9 ** loc))
                theta = max(0.5, 1 / (1 + delta_t1))
                gamma = max(0.5, 1 / (1 + delta_t2))
                rank[j][0] += myround(wij[0] * (alpha ** 2) * (beta) * (theta ** 2) * gamma, 4)
                rank[j][1] += wij[1]
                rank[j][2] += wij[2]
                if wij[3] < rank[j][3]:
                    rank[j][3] = wij[3]
                if wij[4] < rank[j][4]:
                    rank[j][4] = wij[4]
                if delta_t1 < rank[j][5]:
                    rank[j][5] = myround(delta_t1, 4)
                if delta_t2 < rank[j][6]:
                    rank[j][6] = myround(delta_t2, 4)
                if loc < rank[j][7]:
                    rank[j][7] = loc
                if wij[5] > rank[j][8]:
                    rank[j][8] = wij[5]
                rank[j][9] += wij[6] / wij[1]
                if wij[7] > rank[j][10]:
                    rank[j][10] = wij[7]
                rank[j][11] += wij[8] / wij[1]

    return sorted(rank.items(), key=lambda d: d[1][0], reverse=True)[:item_num]
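# In[ ]:


# Illustration only: the four dampening factors in recommend() and the floors
# that keep any single factor from zeroing out a candidate. Values are made up.
def _demo_rank_decay():
    item_popularity = 120          # clicks on candidate j
    loc = 3                        # how far back the source item sits in history
    delta_t1, delta_t2 = 0.8, 2.5  # scaled time gaps
    alpha = max(0.2, 1 / (1 + item_popularity))  # popularity penalty, floored at 0.2
    beta = max(0.5, 0.9 ** loc)                  # position decay, floored at 0.5
    theta = max(0.5, 1 / (1 + delta_t1))         # recency of the source click
    gamma = max(0.5, 1 / (1 + delta_t2))         # age of the candidate item
    print(alpha, beta, theta, gamma)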
# In[7]:


now_phase = 9
header = 'underexpose'

txt_similarity = {}
deepwalk_similarity = {}
nodewalk_similarity = {}

offline = "./user_data/dataset/"
out_path = './user_data/dataset/new_similarity/'

print("start")
print("read sim")
nodewalk_model = KeyedVectors.load_word2vec_format(offline + 'node2vec_' + header + '.bin', binary=True)
deepwalk_model = KeyedVectors.load_word2vec_format(offline + 'deepwalk_' + header + '.bin', binary=True)
txt_model = KeyedVectors.load_word2vec_format('./user_data/w2v_txt_vec.txt')


# In[ ]:


recom_item = []
for phase in range(now_phase + 1):
    a = time.time()
    history_list = []
    for i in range(now_phase + 1):
        click_train = pd.read_csv(offline + header + '_train_click-{}.csv'.format(i), header=None,
                                  names=['user_id', 'item_id', 'time'])
        click_test = pd.read_csv(offline + header + '_test_click-{}.csv'.format(i), header=None,
                                 names=['user_id', 'item_id', 'time'])
        all_click = click_train.append(click_test)
        history_list.append(all_click)

    qtime_test = pd.read_csv(offline + header + '_test_qtime-{}.csv'.format(phase), header=None,
                             names=['user_id', 'item_id', 'time'])
    print('phase:', phase)

    # shift the clicks of phases later than the current one back in time,
    # so they sort before the phase currently being predicted
    time_diff = max(history_list[now_phase]['time']) - min(history_list[0]['time'])
    for i in range(phase + 1, now_phase + 1):
        history_list[i]['time'] = history_list[i]['time'] - time_diff

    whole_click = pd.DataFrame()
    for i in range(now_phase + 1):
        whole_click = whole_click.append(history_list[i])
    whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last')
    whole_click = whole_click.sort_values('time')
    whole_click = whole_click.reset_index(drop=True)

    item_sim_list, user_item, user_time_dict, item_dic, item_time_dict = get_sim_item(
        whole_click, 'user_id', 'item_id')
    print("phase finish time:{:6.4f} mins".format((time.time() - a) / 60))

    for user in tqdm(qtime_test['user_id'].unique()):
        if user in user_time_dict:
            times = user_time_dict[user]
            rank_item = recommend(item_sim_list, user_item, user, times, item_dic, item_time_dict, 500, 1000)
            for j in rank_item:
                recom_item.append([user, int(j[0])] + j[1])

    # keep only the scalar similarity before persisting, to shrink the pickle
    for i, related_items in tqdm(item_sim_list.items()):
        for j, cij in related_items.items():
            item_sim_list[i][j] = cij[0]

    write_file = open(out_path + 'itemCF_new' + str(phase) + '.pkl', 'wb')
    pickle.dump(item_sim_list, write_file)
    write_file.close()

    write_file = open(out_path + 'user2item_new' + str(phase) + '.pkl', 'wb')
    pickle.dump(user_item, write_file)
    write_file.close()

    write_file = open(out_path + 'item2cnt_new' + str(phase) + '.pkl', 'wb')
    pickle.dump(item_dic, write_file)
    write_file.close()

    write_file = open(out_path + 'userTime' + str(phase) + '.pkl', 'wb')
    pickle.dump(user_time_dict, write_file)
    write_file.close()

    write_file = open(out_path + 'itemTime' + str(phase) + '.pkl', 'wb')
    pickle.dump(item_time_dict, write_file)
    write_file.close()

    write_file = open(out_path + 'recom_item' + '.pkl', 'wb')
    pickle.dump(recom_item, write_file)
    write_file.close()

    del item_sim_list
    del user_item
    del user_time_dict
    del item_dic
    del item_time_dict
    gc.collect()


# In[ ]:


import sys


# In[ ]:


del deepwalk_similarity


# In[ ]:


del nodewalk_similarity


# In[ ]:


del txt_similarity
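# In[ ]:


# Illustration only: the phase-alignment trick in the loop above. Clicks from
# phases later than the one being predicted are pushed back by the full
# observed time span, so they can never leak ahead of the current window.
def _demo_phase_shift():
    span = 0.9 - 0.1          # max time in the last phase minus min time in the first
    later_click_time = 0.85   # a click from a future phase
    print(later_click_time - span)  # 0.05: now earlier than everything current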
================================================ FILE: code/2_Similarity/RA_Wu_model1.py ================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle


# # RA and AA are computed in the same pass

# In[2]:


now_phase = 9
input_path = './user_data/model_1/new_similarity/'
out_path = './user_data/model_1/new_similarity/'

for num in range(now_phase + 1):
    # load the itemCF similarities
    with open(input_path + 'itemCF_new' + str(num) + '.pkl', 'rb') as f:
        item_sim_list_tmp = pickle.load(f)

    # keep only similarity edges above the 0.005 threshold
    item_sim = {}
    for item in item_sim_list_tmp:
        item_sim.setdefault(item, {})
        for related_item in item_sim_list_tmp[item]:
            if item_sim_list_tmp[item][related_item] > 0.005:
                item_sim[item][related_item] = item_sim_list_tmp[item][related_item]
    item_sim_list_tmp = []

    strengh_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_dict[item] = sum(item_sim[item].values())

    strengh_AA_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_AA_dict[item] = math.log(1 + sum(item_sim[item].values()))

    # RA
    RA_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                RA_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        RA_sim[item1].setdefault(item2, 0)
                        RA_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2] / strengh_dict[item]

    new_RA = dict()
    for item1 in tqdm(RA_sim):
        new_RA[item1] = {i: int(x * 1e3) / 1e3 for i, x in RA_sim[item1].items() if x > 1e-3}
    RA_sim = []

    print('Saving')
    write_file = open(out_path + 'RA_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_RA, write_file)
    write_file.close()
    new_RA = []

    # AA
    AA_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                AA_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        AA_sim[item1].setdefault(item2, 0)
                        AA_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2] / strengh_AA_dict[item]

    new_AA = dict()
    for item1 in tqdm(AA_sim):
        new_AA[item1] = {i: int(x * 1e3) / 1e3 for i, x in AA_sim[item1].items() if x > 1e-3}
    AA_sim = []

    print('Saving')
    write_file = open(out_path + 'AA_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_AA, write_file)
    write_file.close()
    new_AA = []


# # CN, HPI, HDI and LHN1 are computed in the same pass

# In[3]:


now_phase = 9
input_path = './user_data/model_1/new_similarity/'
out_path = './user_data/model_1/new_similarity/'

for num in range(now_phase + 1):
    # load the itemCF similarities
    with open(input_path + 'itemCF_new' + str(num) + '.pkl', 'rb') as f:
        item_sim_list_tmp = pickle.load(f)

    item_sim = {}
    for item in item_sim_list_tmp:
        item_sim.setdefault(item, {})
        for related_item in item_sim_list_tmp[item]:
            if item_sim_list_tmp[item][related_item] > 0.005:
                item_sim[item][related_item] = item_sim_list_tmp[item][related_item]
    item_sim_list_tmp = []

    # CN
    CN_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                CN_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        CN_sim[item1].setdefault(item2, 0)
                        CN_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2]

    new_CN = dict()
    for item1 in tqdm(CN_sim):
        new_CN[item1] = {i: int(x * 1e3) / 1e3 for i, x in CN_sim[item1].items() if x > 1e-3}
    CN_sim = []

    print('Saving')
    write_file = open(out_path + 'CN_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_CN, write_file)
    write_file.close()

    strengh_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_dict[item] = sum(item_sim[item].values())

    # HPI
    HPI_sim = dict()
    for item in tqdm(new_CN):
        HPI_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            HPI_sim[item][related_item] = new_CN[item][related_item] / max(0.005, min(strengh_dict[item], strengh_dict[related_item]))

    print('Saving')
    write_file = open(out_path + 'HPI_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(HPI_sim, write_file)
    write_file.close()
    HPI_sim = []

    # HDI
    HDI_sim = dict()
    for item in tqdm(new_CN):
        HDI_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            HDI_sim[item][related_item] = new_CN[item][related_item] / max(strengh_dict[item], strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'HDI_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(HDI_sim, write_file)
    write_file.close()
    HDI_sim = []

    # LHN1
    LHN1_sim = dict()
    for item in tqdm(new_CN):
        LHN1_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            LHN1_sim[item][related_item] = new_CN[item][related_item] / (max(0.005, strengh_dict[item]) * max(0.005, strengh_dict[related_item]))

    print('Saving')
    write_file = open(out_path + 'LHN1_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(LHN1_sim, write_file)
    write_file.close()
    LHN1_sim = []
    new_CN = []
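# In[ ]:


# Illustration only: RA vs. AA on a tiny weighted similarity dict. Both count
# two-hop paths i -> z -> j; RA divides each path by z's strength (sum of edge
# weights), AA by log(1 + strength), so AA punishes hub items more gently.
def _demo_ra_aa():
    sim = {'a': {'z': 1.0}, 'b': {'z': 0.5}, 'z': {'a': 1.0, 'b': 0.5}}
    strength = {k: sum(v.values()) for k, v in sim.items()}
    z = 'z'
    ra = sim['a'][z] * sim[z]['b'] / strength[z]
    aa = sim['a'][z] * sim[z]['b'] / math.log(1 + strength[z])
    print(ra, aa)  # RA: 0.5/1.5 ~ 0.333, AA: 0.5/log(2.5) ~ 0.546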
================================================ FILE: code/2_Similarity/RA_Wu_offline.py ================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle


# # RA and AA are computed in the same pass

# In[2]:


now_phase = 9
input_path = './user_data/offline/new_similarity/'
out_path = './user_data/offline/new_similarity/'

for num in range(now_phase + 1):
    # load the itemCF similarities
    with open(input_path + 'itemCF_new' + str(num) + '.pkl', 'rb') as f:
        item_sim_list_tmp = pickle.load(f)

    item_sim = {}
    for item in item_sim_list_tmp:
        item_sim.setdefault(item, {})
        for related_item in item_sim_list_tmp[item]:
            if item_sim_list_tmp[item][related_item] > 0.005:
                item_sim[item][related_item] = item_sim_list_tmp[item][related_item]
    item_sim_list_tmp = []

    strengh_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_dict[item] = sum(item_sim[item].values())

    strengh_AA_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_AA_dict[item] = math.log(1 + sum(item_sim[item].values()))

    # RA
    RA_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                RA_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        RA_sim[item1].setdefault(item2, 0)
                        RA_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2] / strengh_dict[item]

    new_RA = dict()
    for item1 in tqdm(RA_sim):
        new_RA[item1] = {i: int(x * 1e3) / 1e3 for i, x in RA_sim[item1].items() if x > 1e-3}
    RA_sim = []

    print('Saving')
    write_file = open(out_path + 'RA_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_RA, write_file)
    write_file.close()
    new_RA = []

    # AA
    AA_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                AA_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        AA_sim[item1].setdefault(item2, 0)
                        AA_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2] / strengh_AA_dict[item]

    new_AA = dict()
    for item1 in tqdm(AA_sim):
        new_AA[item1] = {i: int(x * 1e3) / 1e3 for i, x in AA_sim[item1].items() if x > 1e-3}
    AA_sim = []

    print('Saving')
    write_file = open(out_path + 'AA_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_AA, write_file)
    write_file.close()
    new_AA = []


# # CN, HPI, HDI and LHN1 are computed in the same pass

# In[3]:


now_phase = 9
input_path = './user_data/offline/new_similarity/'
out_path = './user_data/offline/new_similarity/'

for num in range(now_phase + 1):
    # load the itemCF similarities
    with open(input_path + 'itemCF_new' + str(num) + '.pkl', 'rb') as f:
        item_sim_list_tmp = pickle.load(f)

    item_sim = {}
    for item in item_sim_list_tmp:
        item_sim.setdefault(item, {})
        for related_item in item_sim_list_tmp[item]:
            if item_sim_list_tmp[item][related_item] > 0.005:
                item_sim[item][related_item] = item_sim_list_tmp[item][related_item]
    item_sim_list_tmp = []

    # CN
    CN_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                CN_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        CN_sim[item1].setdefault(item2, 0)
                        CN_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2]

    new_CN = dict()
    for item1 in tqdm(CN_sim):
        new_CN[item1] = {i: int(x * 1e3) / 1e3 for i, x in CN_sim[item1].items() if x > 1e-3}
    CN_sim = []

    print('Saving')
    write_file = open(out_path + 'CN_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_CN, write_file)
    write_file.close()

    strengh_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_dict[item] = sum(item_sim[item].values())

    # HPI
    HPI_sim = dict()
    for item in tqdm(new_CN):
        HPI_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            HPI_sim[item][related_item] = new_CN[item][related_item] / min(strengh_dict[item], strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'HPI_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(HPI_sim, write_file)
    write_file.close()
    HPI_sim = []

    # HDI
    HDI_sim = dict()
    for item in tqdm(new_CN):
        HDI_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            HDI_sim[item][related_item] = new_CN[item][related_item] / max(strengh_dict[item], strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'HDI_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(HDI_sim, write_file)
    write_file.close()
    HDI_sim = []

    # LHN1
    LHN1_sim = dict()
    for item in tqdm(new_CN):
        LHN1_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            LHN1_sim[item][related_item] = new_CN[item][related_item] / (strengh_dict[item] * strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'LHN1_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(LHN1_sim, write_file)
    write_file.close()
    LHN1_sim = []
    new_CN = []
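# In[ ]:


# Illustration only: the four local indices derived from the common-neighbour
# weight CN(i, j) = sum_z w(i, z) * w(z, j). With strengths s_i and s_j:
#   HPI  = CN / min(s_i, s_j)   (Hub Promoted)
#   HDI  = CN / max(s_i, s_j)   (Hub Depressed)
#   LHN1 = CN / (s_i * s_j)     (Leicht-Holme-Newman)
def _demo_neighbour_indices():
    cn, s_i, s_j = 0.5, 1.5, 0.5
    print(cn / min(s_i, s_j), cn / max(s_i, s_j), cn / (s_i * s_j))
    # HPI = 1.0, HDI ~ 0.333, LHN1 ~ 0.667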
================================================ FILE: code/2_Similarity/RA_Wu_online.py ================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle


# In[2]:


now_phase = 9
input_path = './user_data/dataset/new_similarity/'
out_path = './user_data/dataset/new_similarity/'

for num in range(now_phase + 1):
    # load the itemCF similarities
    with open(input_path + 'itemCF_new' + str(num) + '.pkl', 'rb') as f:
        item_sim_list_tmp = pickle.load(f)

    item_sim = {}
    for item in item_sim_list_tmp:
        item_sim.setdefault(item, {})
        for related_item in item_sim_list_tmp[item]:
            if item_sim_list_tmp[item][related_item] > 0.005:
                item_sim[item][related_item] = item_sim_list_tmp[item][related_item]
    item_sim_list_tmp = []

    strengh_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_dict[item] = sum(item_sim[item].values())

    strengh_AA_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_AA_dict[item] = math.log(1 + sum(item_sim[item].values()))

    # RA
    RA_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                RA_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        RA_sim[item1].setdefault(item2, 0)
                        RA_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2] / strengh_dict[item]

    new_RA = dict()
    for item1 in tqdm(RA_sim):
        new_RA[item1] = {i: int(x * 1e3) / 1e3 for i, x in RA_sim[item1].items() if x > 1e-3}
    RA_sim = []

    print('Saving')
    write_file = open(out_path + 'RA_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_RA, write_file)
    write_file.close()
    new_RA = []

    # AA
    AA_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                AA_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        AA_sim[item1].setdefault(item2, 0)
                        AA_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2] / strengh_AA_dict[item]

    new_AA = dict()
    for item1 in tqdm(AA_sim):
        new_AA[item1] = {i: int(x * 1e3) / 1e3 for i, x in AA_sim[item1].items() if x > 1e-3}
    AA_sim = []

    print('Saving')
    write_file = open(out_path + 'AA_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_AA, write_file)
    write_file.close()
    new_AA = []


# In[3]:


now_phase = 9
input_path = './user_data/dataset/new_similarity/'
out_path = './user_data/dataset/new_similarity/'

for num in range(now_phase + 1):
    # load the itemCF similarities
    with open(input_path + 'itemCF_new' + str(num) + '.pkl', 'rb') as f:
        item_sim_list_tmp = pickle.load(f)

    item_sim = {}
    for item in item_sim_list_tmp:
        item_sim.setdefault(item, {})
        for related_item in item_sim_list_tmp[item]:
            if item_sim_list_tmp[item][related_item] > 0.005:
                item_sim[item][related_item] = item_sim_list_tmp[item][related_item]
    item_sim_list_tmp = []

    # CN
    CN_sim = dict()
    for item in tqdm(item_sim):
        neighbors = list(set(item_sim[item].keys()))
        for item1 in neighbors:
            if item in item_sim[item1]:
                CN_sim.setdefault(item1, {})
                for item2 in neighbors:
                    if item1 != item2:
                        CN_sim[item1].setdefault(item2, 0)
                        CN_sim[item1][item2] += item_sim[item1][item] * item_sim[item][item2]

    new_CN = dict()
    for item1 in tqdm(CN_sim):
        new_CN[item1] = {i: int(x * 1e3) / 1e3 for i, x in CN_sim[item1].items() if x > 1e-3}
    CN_sim = []

    print('Saving')
    write_file = open(out_path + 'CN_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(new_CN, write_file)
    write_file.close()

    strengh_dict = dict()
    print('Counting degree')
    for item in tqdm(item_sim):
        strengh_dict[item] = sum(item_sim[item].values())

    # HPI
    HPI_sim = dict()
    for item in tqdm(new_CN):
        HPI_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            HPI_sim[item][related_item] = new_CN[item][related_item] / min(strengh_dict[item], strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'HPI_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(HPI_sim, write_file)
    write_file.close()
    HPI_sim = []

    # HDI
    HDI_sim = dict()
    for item in tqdm(new_CN):
        HDI_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            HDI_sim[item][related_item] = new_CN[item][related_item] / max(strengh_dict[item], strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'HDI_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(HDI_sim, write_file)
    write_file.close()
    HDI_sim = []

    # LHN1
    LHN1_sim = dict()
    for item in tqdm(new_CN):
        LHN1_sim.setdefault(item, {})
        for related_item in new_CN[item]:
            LHN1_sim[item][related_item] = new_CN[item][related_item] / (strengh_dict[item] * strengh_dict[related_item])

    print('Saving')
    write_file = open(out_path + 'LHN1_P' + str(num) + '_new.pkl', 'wb')
    pickle.dump(LHN1_sim, write_file)
    write_file.close()
    LHN1_sim = []
    new_CN = []
================================================ FILE: code/2_Similarity/deep_node_model.py ================================================
# coding=utf-8
'''
Created on May 1, 2020

@author: LSH
'''
import os
import time
import random
import itertools
import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
from joblib import Parallel, delayed

random.seed(2020)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

now_phase = 9
user_data = "./user_data/"


def create_alias_table(area_ratio):
    """Build the accept/alias tables for O(1) alias sampling.

    :param area_ratio: probabilities, sum(area_ratio) = 1
    :return: accept, alias
    """
    l = len(area_ratio)
    accept, alias = [0] * l, [None] * l
    small, large = [], []
    area_ratio_ = np.array(area_ratio) * l
    for i, prob in enumerate(area_ratio_):
        if prob < 1.0:
            small.append(i)
        else:
            large.append(i)

    while small and large:
        small_idx, large_idx = small.pop(), large.pop()
        accept[small_idx] = area_ratio_[small_idx]
        alias[small_idx] = large_idx
        area_ratio_[large_idx] = area_ratio_[large_idx] - (1 - area_ratio_[small_idx])
        if area_ratio_[large_idx] < 1.0:
            small.append(large_idx)
        else:
            large.append(large_idx)

    while large:
        large_idx = large.pop()
        accept[large_idx] = 1
    while small:
        small_idx = small.pop()
        accept[small_idx] = 1

    return accept, alias


def alias_sample(accept, alias):
    """Draw one index in O(1) from the prepared alias tables.

    :return: sample index
    """
    N = len(accept)
    i = int(np.random.random() * N)
    r = np.random.random()
    if r < accept[i]:
        return i
    else:
        return alias[i]


def partition_num(num, workers):
    if num % workers == 0:
        return [num // workers] * workers
    else:
        return [num // workers] * workers + [num % workers]
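# Illustration only: round-trip through the alias method defined above.
# Sampling frequencies should approach the input distribution as draws grow.
def _demo_alias_sampling():
    probs = [0.5, 0.3, 0.2]
    accept, alias = create_alias_table(probs)
    counts = [0, 0, 0]
    for _ in range(100000):
        counts[alias_sample(accept, alias)] += 1
    print([c / 100000 for c in counts])  # roughly [0.5, 0.3, 0.2]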
class RandomWalker:
    def __init__(self, G, p=1, q=1):
        """
        :param G:
        :param p: Return parameter, controls the likelihood of immediately
                  revisiting a node in the walk.
        :param q: In-out parameter, allows the search to differentiate
                  between "inward" and "outward" nodes.
        """
        self.G = G
        self.p = p
        self.q = q

    def deepwalk_walk(self, walk_length, start_node):
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk

    def node2vec_walk(self, walk_length, start_node):
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                # node2vec sampling needs the current node v and the previous node t;
                # when there is no previous node yet, sample directly from the edge
                # weights between the current node and its neighbours
                if len(walk) == 1:
                    walk.append(cur_nbrs[alias_sample(alias_nodes[cur][0], alias_nodes[cur][1])])
                else:
                    prev = walk[-2]
                    edge = (prev, cur)
                    next_node = cur_nbrs[alias_sample(alias_edges[edge][0], alias_edges[edge][1])]
                    walk.append(next_node)
            else:
                break
        return walk

    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        """ """
        G = self.G
        nodes = list(G.nodes())
        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length)
            for num in partition_num(num_walks, workers))
        walks = list(itertools.chain(*results))
        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length, ):
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                if self.p == 1 and self.q == 1:
                    walks.append(self.deepwalk_walk(walk_length=walk_length, start_node=v))
                else:
                    walks.append(self.node2vec_walk(walk_length=walk_length, start_node=v))
        return walks

    def get_alias_edge(self, t, v):
        """
        Compute the unnormalized transition probabilities between node v and
        its neighbours, given the previously visited node t.

        :param t:
        :param v:
        :return:
        """
        G = self.G
        p = self.p
        q = self.q
        unnormalized_probs = []
        for x in G.neighbors(v):
            weight = G[v][x].get('weight', 1.0)  # w_vx
            if x == t:  # d_tx == 0
                unnormalized_probs.append(weight / p)
            elif G.has_edge(x, t):  # d_tx == 1
                unnormalized_probs.append(weight)
            else:  # d_tx > 1
                unnormalized_probs.append(weight / q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
        return create_alias_table(normalized_probs)

    def preprocess_transition_probs(self):
        """
        Preprocessing of transition probabilities for guiding the random walks.
        """
        G = self.G
        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr].get('weight', 1.0) for nbr in G.neighbors(node)]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = create_alias_table(normalized_probs)

        alias_edges = {}
        for edge in G.edges():
            alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])

        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges
        return
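# Illustration only: how p and q reshape one step of a node2vec walk. Coming
# from t and standing at v, a neighbour x is weighted w/p if x == t (return),
# w if x is also adjacent to t (stay close), and w/q otherwise (explore).
def _demo_pq_bias():
    w = 1.0
    p, q = 2.0, 0.5  # the values this repo actually trains with
    weights = {'back_to_t': w / p, 'common_neighbour': w, 'outward': w / q}
    total = sum(weights.values())
    print({k: v / total for k, v in weights.items()})
    # with p=2, q=0.5 the walk is pushed outward (DFS-like exploration)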
""" G = self.G alias_nodes = {} for node in G.nodes(): unnormalized_probs = [G[node][nbr].get('weight', 1.0) for nbr in G.neighbors(node)] norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] alias_nodes[node] = create_alias_table(normalized_probs) alias_edges = {} for edge in G.edges(): alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) self.alias_nodes = alias_nodes self.alias_edges = alias_edges return class DeepWalk: def __init__(self, graph, walk_length, num_walks, workers=1): self.graph = graph self.w2v_model = None self._embeddings = {} self.walker = RandomWalker(graph, p=1, q=1, ) self.sentences = self.walker.simulate_walks( num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) def train(self, embed_size=128, window_size=5, workers=3, iters=5, **kwargs): kwargs["sentences"] = self.sentences kwargs["min_count"] = kwargs.get("min_count", 0) kwargs["size"] = embed_size kwargs["sg"] = 1 # skip gram kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax kwargs["workers"] = workers kwargs["window"] = window_size kwargs["iter"] = iters print("Learning embedding vectors...") model = Word2Vec(**kwargs) print("Learning embedding vectors done!") self.w2v_model = model return model def get_embeddings(self, ): if self.w2v_model is None: print("model not train") return {} self._embeddings = {} for word in self.graph.nodes(): self._embeddings[word] = self.w2v_model.wv[word] return self._embeddings def get_topK(self, item, k=50): if not isinstance(item, str): item=str(item) recom_list = list(map(lambda x: [x[0], x[1]], self.w2v_model.wv.most_similar(positive=[item], topn=k))) return recom_list class Node2Vec: def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1): self.graph = graph self._embeddings = {} self.walker = RandomWalker(graph, p=p, q=q, ) self.walker.preprocess_transition_probs() self.sentences = self.walker.simulate_walks( num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) def train(self, embed_size=128, window_size=5, workers=3, iters=5, **kwargs): kwargs["sentences"] = self.sentences kwargs["min_count"] = kwargs.get("min_count", 0) kwargs["size"] = embed_size kwargs["sg"] = 1 kwargs["hs"] = 0 # node2vec not use Hierarchical Softmax kwargs["workers"] = workers kwargs["window"] = window_size kwargs["iter"] = iters print("Learning embedding vectors...") model = Word2Vec(**kwargs) print("Learning embedding vectors done!") self.w2v_model = model return model def get_embeddings(self,): if self.w2v_model is None: print("model not train") return {} self._embeddings = {} for word in self.graph.nodes(): self._embeddings[word] = self.w2v_model.wv[word] return self._embeddings def get_topK(self, item, k=50): if not isinstance(item, str): item = str(item) recom_list = list(map(lambda x: [x[0], x[1]], self.w2v_model.wv.most_similar(positive=[item], topn=k))) return recom_list def get_item_graph(df, user_col, item_col, direction=True, new_wei=False): """构造图 """ user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index() user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col])) edgelist = [] user_time_ = df.groupby(user_col)['time'].agg(list).reset_index() # 引入时间因素 user_time_dict = dict(zip(user_time_[user_col], user_time_['time'])) item_cnt=df[item_col].value_counts().to_dict() for user, items in user_item_dict.items(): for i in range(len(items) - 1): if direction: t1 = user_time_dict[user][i] # 点击时间提取 t2 = user_time_dict[user][i+1] 
def get_item_graph(df, user_col, item_col, direction=True, new_wei=False):
    """Build the item-item click graph."""
    user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))
    edgelist = []
    user_time_ = df.groupby(user_col)['time'].agg(list).reset_index()  # incorporate the time factor
    user_time_dict = dict(zip(user_time_[user_col], user_time_['time']))
    item_cnt = df[item_col].value_counts().to_dict()
    for user, items in user_item_dict.items():
        for i in range(len(items) - 1):
            if direction:
                t1 = user_time_dict[user][i]  # extract click times
                t2 = user_time_dict[user][i + 1]
                delta_t = abs(t1 - t2) * 50000  # median 0.01, 75th percentile 0.02
                # directed weighted graph: the weight of the hot item --> cold item
                # edge is driven by the ratio of their click counts
                ai, aj = item_cnt[items[i]], item_cnt[items[i + 1]]
                edgelist.append([items[i], items[i + 1],
                                 max(3, np.log(1 + ai / aj)) * 1 / (1 + delta_t)])
                edgelist.append([items[i + 1], items[i],
                                 max(3, np.log(1 + aj / ai)) * 0.8 * 1 / (1 + delta_t)])
            else:
                edgelist.append([items[i], items[i + 1], 1])
    if direction:
        G = nx.DiGraph()
    else:
        G = nx.Graph()
    for edge in edgelist:
        G.add_edge(str(edge[0]), str(edge[1]), weight=edge[2])
    if new_wei:
        for u, v, d in G.edges(data=True):
            deg = G.degree(u) / G.degree(v)
            if deg < 1:
                deg = max(0.1, deg)
            else:
                deg = min(3, deg)
            new_weight = d["weight"] * deg
            G[u][v].update({"weight": new_weight})
    return G


def deep_node_recom():
    """Train deepwalk and node2vec models on the full click data,
    for both offline and online."""
    global now_phase
    novalid_click = pd.DataFrame()
    whole_click = pd.DataFrame()
    for i in range(now_phase + 1):
        click_train = pd.read_csv(user_data + 'offline/offline_train_click-{}.csv'.format(i),
                                  header=None, names=['user_id', 'item_id', 'time'])
        click_test = pd.read_csv(user_data + 'offline/offline_test_click-{}.csv'.format(i),
                                 header=None, names=['user_id', 'item_id', 'time'])
        qtime_test = pd.read_csv(user_data + 'offline/offline_test_qtime-{}.csv'.format(i),
                                 header=None, names=['user_id', 'item_id', 'time'])
        click_train["time"] += i
        click_test["time"] += i
        qtime_test["time"] += i
        all_click = click_train.append(click_test)
        novalid_click = novalid_click.append(all_click)
        all_click = all_click.append(qtime_test)  # append returns a new frame; keep the result
        whole_click = whole_click.append(all_click)

    # click data without each test user's last click, used for offline recall
    novalid_click = novalid_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last')
    novalid_click = novalid_click.sort_values('time')
    novalid_click = novalid_click.reset_index(drop=True)

    # full click data, used for online recall
    whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last')
    whole_click = whole_click.sort_values('time')
    whole_click = whole_click.reset_index(drop=True)

    cpu_jobs = os.cpu_count() - 1

    # node2vec models trained on the directed graph
    G = get_item_graph(novalid_click, 'user_id', 'item_id')
    novalidmodel = Node2Vec(G, walk_length=20, num_walks=80, p=2, q=0.5, workers=1)
    novalidmodel.train(embed_size=128, window_size=10, workers=cpu_jobs, iters=3)
    novalidmodel.w2v_model.wv.save_word2vec_format(user_data + "offline/node2vec_offline.bin", binary=True)

    G = get_item_graph(whole_click, 'user_id', 'item_id')
    model = Node2Vec(G, walk_length=20, num_walks=80, p=2, q=0.5, workers=1)
    model.train(embed_size=128, window_size=10, workers=cpu_jobs, iters=3)
    model.w2v_model.wv.save_word2vec_format(user_data + "dataset/node2vec_underexpose.bin", binary=True)

    # deepwalk
    G = get_item_graph(novalid_click, 'user_id', 'item_id', direction=False)
    novalidmodel = DeepWalk(G, walk_length=20, num_walks=80, workers=8)
    novalidmodel.train(embed_size=128, window_size=10, workers=cpu_jobs, iters=3)
    novalidmodel.w2v_model.wv.save_word2vec_format(user_data + "offline/deepwalk_offline.bin", binary=True)

    G = get_item_graph(whole_click, 'user_id', 'item_id', direction=False)
    model = DeepWalk(G, walk_length=20, num_walks=80, workers=8)
    model.train(embed_size=128, window_size=10, workers=cpu_jobs, iters=3)
    model.w2v_model.wv.save_word2vec_format(user_data + "dataset/deepwalk_underexpose.bin", binary=True)
    # model = KeyedVectors.load_word2vec_format(deepwalk + "deep_model_whoclick_model.bin", binary=True)
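# Illustration only: the directed edge weighting in get_item_graph. For a
# consecutive click pair (i -> j), the hot-to-cold direction gets weight
# max(3, log(1 + cnt_i / cnt_j)) scaled by a time-gap factor; the reverse
# direction is additionally damped by 0.8.
def _demo_edge_weight():
    ai, aj = 500, 10  # click counts: i is hot, j is cold
    delta_t = 0.5     # scaled time gap between the two clicks
    fwd = max(3, np.log(1 + ai / aj)) * 1 / (1 + delta_t)
    bwd = max(3, np.log(1 + aj / ai)) * 0.8 * 1 / (1 + delta_t)
    print(fwd, bwd)  # the forward edge outweighs the backward one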
def model_deep_node_recom():
    """Train the deepwalk and node2vec models used for model_1."""
    global now_phase
    novalid_click = pd.DataFrame()
    for i in range(now_phase + 1):
        click_train = pd.read_csv(user_data + 'model_1/model_1_train_click-{}.csv'.format(i),
                                  header=None, names=['user_id', 'item_id', 'time'])
        click_test = pd.read_csv(user_data + 'model_1/model_1_test_click-{}.csv'.format(i),
                                 header=None, names=['user_id', 'item_id', 'time'])
        click_train["time"] += i
        click_test["time"] += i
        all_click = click_train.append(click_test)
        novalid_click = novalid_click.append(all_click)

    novalid_click = novalid_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last')
    novalid_click = novalid_click.sort_values('time')
    novalid_click = novalid_click.reset_index(drop=True)

    cpu_jobs = os.cpu_count() - 1

    G = get_item_graph(novalid_click, 'user_id', 'item_id')
    novalidmodel = Node2Vec(G, walk_length=20, num_walks=80, p=2, q=0.5, workers=1)
    novalidmodel.train(embed_size=128, window_size=10, workers=cpu_jobs, iters=3)
    novalidmodel.w2v_model.wv.save_word2vec_format(user_data + "model_1/node2vec_model_1.bin", binary=True)

    G = get_item_graph(novalid_click, 'user_id', 'item_id', direction=False)
    novalidmodel = DeepWalk(G, walk_length=20, num_walks=80, workers=8)
    novalidmodel.train(embed_size=128, window_size=10, workers=cpu_jobs, iters=3)
    novalidmodel.w2v_model.wv.save_word2vec_format(user_data + "model_1/deepwalk_model_1.bin", binary=True)


if __name__ == "__main__":
    print("start")
    a = time.time()
    if not os.path.exists(user_data):
        os.mkdir(user_data)
    deep_node_recom()
    model_deep_node_recom()
    print("time:{:6.4f} mins".format((time.time() - a) / 60))

================================================ FILE: code/3_NN/ItemFeat2.py ================================================
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 29 10:01:01 2020

@author: hcb
"""
import pandas as pd
import os
from config import config


def get_feat(now_phase=3, base_path=None):
    # if base_path is None:
    #     train_path = 'underexpose_train'
    #     test_path = 'underexpose_test'
    # else:
    #     train_path = os.path.join(base_path, 'underexpose_train')
    #     test_path = os.path.join(base_path, 'underexpose_test')
    train_path = config.train_path
    test_path = config.test_path

    click_train = pd.DataFrame()
    click_test = pd.DataFrame()
    for c in range(now_phase + 1):
        click_tmp = pd.read_csv(train_path + f'/underexpose_train_click-{c}.csv', header=None,
                                names=['user_id', 'item_id', 'time'])
        click_tmp['user_id'] = '1_{}_'.format(c) + click_tmp['user_id'].astype(str)
        click_test_tmp = pd.read_csv(test_path + f'/underexpose_test_click-{c}.csv', header=None,
                                     names=['user_id', 'item_id', 'time'])
        click_test_tmp['user_id'] = '0_{}_'.format(c) + click_test_tmp['user_id'].astype(str)
        click_train = click_train.append(click_tmp)
        click_test = click_test.append(click_test_tmp)

    all_click = click_train.append(click_test)
    print(all_click['item_id'].nunique())
    item_df = all_click.groupby('item_id')['time'].count().reset_index()
    item_df.columns = ['item_id', 'degree']

    feat = pd.read_csv('./data/underexpose_train/underexpose_item_feat.csv', header=None)
    # strip the '[' / ']' brackets surrounding the txt and img vector columns
    feat[1] = feat[1].apply(lambda x: x[1:]).astype(float)
    feat[128] = feat[128].apply(lambda x: x[:-1]).astype(float)
    feat[129] = feat[129].apply(lambda x: x[1:]).astype(float)
    feat[256] = feat[256].apply(lambda x: x[:-1]).astype(float)
    feat.columns = ['item_id'] + ['feat' + str(i) for i in range(256)]
    item_df = item_df.merge(feat, on='item_id', how='left')
    print(item_df['item_id'].nunique())

    def transform(x):
        if x > 150 and x < 400:
            x = (x - 150) // 25 * 25 + 150
        elif x >= 400:
            x = 400
        return x

    item_df['degree'] = item_df['degree'].apply(lambda x: transform(x))
    degree_df = item_df.groupby('degree')[['feat' + str(i) for i in range(256)]].mean().reset_index()
    # items with missing features inherit the mean feature vector of their degree bucket
    na_df = item_df[item_df['feat0'].isna()][['item_id', 'degree']].merge(degree_df, on='degree', how='left')
    item_df.dropna(inplace=True)
    item_df = pd.concat((item_df, na_df))
    item_df.to_csv('item_feat.csv', index=None)


if __name__ == '__main__':
    get_feat(now_phase=9)
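# Illustration only: the degree bucketing used in get_feat. Degrees in
# (150, 400) are floored to 25-wide bins starting at 150; 400 and above is
# capped. The transform is restated locally because the original is nested.
def _demo_degree_buckets():
    def transform(x):
        if 150 < x < 400:
            x = (x - 150) // 25 * 25 + 150
        elif x >= 400:
            x = 400
        return x
    print([transform(x) for x in [10, 160, 210, 399, 1000]])
    # [10, 150, 200, 375, 400]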
================================================ FILE: code/3_NN/Readme ================================================
pandas==0.25.1
numpy==1.17.2
tensorflow-gpu==1.13.1
tqdm
argparse
cudatoolkit==9.0
cudnn==7.6.5

================================================ FILE: code/3_NN/config.py ================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 11 12:36:15 2020

@author: hcb
"""


class config:
    train_path = './user_data/dataset'
    test_path = './user_data/dataset'
    offline_path = './user_data/offline'
    model1_path = './user_data/model_1'
    save_path_offline = './user_data/offline/nn/nn_offline.csv'
    save_path_online = './user_data/dataset/nn/nn_underexpose.csv'
    save_path_model1 = './user_data/model_1/nn/nn_model_1.csv'
    online_item_file = './user_data/dataset/new_recall/user_item_index.csv'
    offline_item_file = './user_data/offline/new_recall/user_item_index.csv'
    model1_item_file = './user_data/model_1/new_recall/user_item_index.csv'
    # online_path = ''
================================================ FILE: code/3_NN/model2.py ================================================
from modules import *


class Model:
    def __init__(self, usernum, itemnum, args, emb=None, num_neg=2, dec_step=None, emb_usr=None, reuse=None):
        self.is_training = tf.placeholder(tf.bool, shape=())
        self.u = tf.placeholder(tf.int32, shape=(None))
        self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen))
        self.pos = tf.placeholder(tf.int32, shape=(None, args.maxlen))
        self.neg = tf.placeholder(tf.int32, shape=(None, args.maxlen, num_neg))
        pos = self.pos
        neg = self.neg
        mask = tf.expand_dims(tf.to_float(tf.not_equal(self.input_seq, 0)), -1)

        with tf.variable_scope("SASRec", reuse=reuse):
            # sequence embedding, item embedding table
            self.seq, item_emb_table = embedding(self.input_seq,
                                                 vocab_size=itemnum + 1,
                                                 num_units=args.hidden_units,
                                                 zero_pad=False,
                                                 scale=True,
                                                 l2_reg=args.l2_emb,
                                                 scope="input_embeddings",
                                                 with_t=True,
                                                 reuse=reuse)
            # self.lookup_table2 = tf.get_variable('lookup_table2',
            #                                      dtype=tf.float32,
            #                                      shape=[itemnum + 1, args.hidden_units],
            #                                      trainable=False)
            # item_emb_table = lookup_table2 + item_emb_table
            # self.seq = tf.nn.embedding_lookup(item_emb_table, self.input_seq)

            # Positional Encoding
            t, pos_emb_table = embedding(
                tf.tile(tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0),
                        [tf.shape(self.input_seq)[0], 1]),
                vocab_size=args.maxlen,
                num_units=args.hidden_units,
                zero_pad=False,
                scale=False,
                l2_reg=args.l2_emb,
                scope="dec_pos",
                reuse=reuse,
                with_t=True)

            # user embedding
            u_, user_emb_table = embedding(
                self.u,
                vocab_size=usernum + 1,
                num_units=args.hidden_units,
                zero_pad=False,
                scale=False,
                l2_reg=args.l2_emb,
                scope="user_embedding",
                reuse=reuse,
                with_t=True)
            self.seq += t
            # user_emb = tf.reshape(u_, [tf.shape(self.input_seq)[0], 1, args.hidden_units])
            # self.seq = user_emb + self.seq

            # Dropout
            self.seq = tf.layers.dropout(self.seq,
                                         rate=args.dropout_rate,
                                         training=tf.convert_to_tensor(self.is_training))
            self.seq *= mask

            # Build blocks
            for i in range(args.num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):
                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=args.hidden_units,
                                                   num_heads=args.num_heads,
                                                   dropout_rate=args.dropout_rate,
                                                   is_training=self.is_training,
                                                   causality=True,
                                                   scope="self_attention")
                    # Feed forward
                    self.seq = feedforward(normalize(self.seq),
                                           num_units=[args.hidden_units, args.hidden_units],
                                           dropout_rate=args.dropout_rate,
                                           is_training=self.is_training)
                    self.seq *= mask

            self.seq = normalize(self.seq)

        self.emb_item = tf.Variable(emb, dtype=tf.float32)
        self.usr_emb = tf.Variable(emb_usr, dtype=tf.float32)
        self.item_emb_table = item_emb_table
        # self.lookup_table2 = lookup_table2

        pos = tf.reshape(pos, [tf.shape(self.input_seq)[0] * args.maxlen])
        # neg = tf.reshape(neg, [tf.shape(self.input_seq)[0] * args.maxlen])
        neg = tf.reshape(neg, [tf.shape(self.input_seq)[0] * args.maxlen * num_neg])
        pos_emb = tf.nn.embedding_lookup(item_emb_table, pos)
        neg_emb = tf.nn.embedding_lookup(item_emb_table, neg)

        # user embedding
        self.user_emb_table = user_emb_table
        user_emb = tf.nn.embedding_lookup(self.user_emb_table, self.u)
        user_emb = tf.reshape(user_emb, [tf.shape(self.input_seq)[0], 1, args.hidden_units])
        seq_emb = tf.reshape(self.seq, [tf.shape(self.input_seq)[0], args.maxlen, args.hidden_units])
        self.seq = user_emb + seq_emb

        # last 5 emb (abandoned experiment)
        # item_emb2 = tf.nn.embedding_lookup(item_emb_table, self.input_seq)
        # item_emb2 = tf.reshape(item_emb2, [tf.shape(self.input_seq)[0], args.maxlen, args.hidden_units])
        # item_emb2 = tf.layers.dense(item_emb2, args.hidden_units, activation=None)
        # self.seq = self.seq + item_emb2
        # item_emb2 = tf.reduce_mean(item_emb2[:, -10:, :], axis=1)

        seq_emb = tf.reshape(self.seq, [tf.shape(self.input_seq)[0] * args.maxlen, args.hidden_units])

        self.test_item = tf.placeholder(tf.int32, shape=(None))
        test_item_emb = tf.nn.embedding_lookup(item_emb_table, self.test_item)
        self.test_logits = tf.matmul(seq_emb, tf.transpose(test_item_emb))
        self.test_logits = tf.reshape(self.test_logits, [tf.shape(self.input_seq)[0], args.maxlen, -1])
        self.test_logits = self.test_logits[:, -1, :]

        # prediction layer
        self.pos_logits = tf.reduce_sum(pos_emb * seq_emb, -1)
        tmp_seq_emb = tf.reshape(seq_emb, [-1, 1, args.hidden_units])
        neg_emb = tf.reshape(neg_emb, [-1, num_neg, args.hidden_units])
        self.neg_logits = tf.reduce_sum(neg_emb * tmp_seq_emb, -1)
        self.neg_logits = tf.reshape(self.neg_logits, [tf.shape(self.input_seq)[0] * args.maxlen, num_neg])

        # ignore padding items (0)
        istarget = tf.reshape(tf.to_float(tf.not_equal(pos, 0)), [tf.shape(self.input_seq)[0] * args.maxlen])
        # alternative pairwise loss (abandoned):
        # err = self.pos_logits - self.neg_logits
        # self.loss = tf.reduce_sum(
        #     -tf.reduce_sum(tf.log(tf.sigmoid(err) + 1e-24), axis=-1) * istarget
        # ) / tf.reduce_sum(istarget)
        self.loss = tf.reduce_sum(
            - tf.log(tf.sigmoid(self.pos_logits) + 1e-24) * istarget
            - tf.reduce_sum(tf.log(1 - tf.sigmoid(self.neg_logits) + 1e-24), axis=-1) * istarget
        ) / tf.reduce_sum(istarget)
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        self.loss += sum(reg_losses)

        tf.summary.scalar('loss', self.loss)
        self.auc = tf.reduce_sum(
            ((tf.sign(self.pos_logits - self.neg_logits) + 1) / 2) * istarget
        ) / tf.reduce_sum(istarget)

        if reuse is None:
            tf.summary.scalar('auc', self.auc)
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.lr = tf.train.exponential_decay(args.lr, self.global_step, dec_step, 0.5, staircase=True)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta2=0.98)
            self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
        else:
            tf.summary.scalar('test_auc', self.auc)

        self.merged = tf.summary.merge_all()

    def predict(self, sess, u, seq, item_idx):
        return sess.run(self.test_logits,
                        {self.u: u, self.input_seq: seq, self.test_item: item_idx,
                         self.is_training: False})
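# Illustration only (a numpy stand-in for the TF loss above): binary cross-
# entropy with one positive and num_neg negatives per sequence position,
# masked so padding positions (pos == 0) contribute nothing. Values made up.
def _demo_sasrec_loss():
    import numpy as np
    pos_logits = np.array([2.0, 0.5])                    # one per position
    neg_logits = np.array([[-1.0, 0.0], [1.0, -0.5]])    # num_neg = 2
    istarget = np.array([1.0, 1.0])                      # both positions are real items
    sig = lambda x: 1 / (1 + np.exp(-x))
    loss = np.sum((-np.log(sig(pos_logits) + 1e-24)
                   - np.sum(np.log(1 - sig(neg_logits) + 1e-24), axis=-1)) * istarget) / np.sum(istarget)
    print(loss)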
================================================ FILE: code/3_NN/modules.py ================================================
# -*- coding: utf-8 -*-
# /usr/bin/python2
'''
June 2017 by kyubyong park.
kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/transformer
'''
from __future__ import print_function
import tensorflow as tf
import numpy as np


def positional_encoding(dim, sentence_length, dtype=tf.float32):
    encoded_vec = np.array([pos / np.power(10000, 2 * i / dim)
                            for pos in range(sentence_length) for i in range(dim)])
    encoded_vec[::2] = np.sin(encoded_vec[::2])
    encoded_vec[1::2] = np.cos(encoded_vec[1::2])
    return tf.convert_to_tensor(encoded_vec.reshape([sentence_length, dim]), dtype=dtype)


def normalize(inputs, epsilon=1e-8, scope="ln", reuse=None):
    '''Applies layer normalization.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
      epsilon: A floating number. A very small number for preventing ZeroDivision Error.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta
    return outputs


def embedding(inputs, vocab_size, num_units, zero_pad=True, scale=True, l2_reg=0.0,
              scope="embedding", with_t=False, trainable=True, reuse=None):
    '''Embeds a given tensor.

    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
        to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the first row (id 0)
        should be constant zeros.
      scale: A boolean. If True, the outputs are multiplied by sqrt(num_units).
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.

    For example,
    ```
    import tensorflow as tf
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```
    ```
    import tensorflow as tf
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]
     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]
    ```
    '''
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table',
                                       dtype=tf.float32,
                                       shape=[vocab_size, num_units],
                                       initializer=tf.contrib.layers.xavier_initializer(),
                                       regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
                                       trainable=trainable)
        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)
        if scale:
            outputs = outputs * (num_units ** 0.5)
    if with_t:
        return outputs, lookup_table
    else:
        return outputs
def multihead_attention(queries, keys, num_units=None, num_heads=8, dropout_rate=0,
                        is_training=True, causality=False, scope="multihead_attention",
                        reuse=None, with_qk=False):
    '''Applies multihead attention.

    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size.
      dropout_rate: A floating point number.
      is_training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked.
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.

    Returns:
      A 3d tensor with shape of (N, T_q, C)
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fallback option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections
        # Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
        # K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)    # (N, T_k, C)
        # V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)    # (N, T_k, C)
        Q = tf.layers.dense(queries, num_units, activation=None)  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=None)     # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=None)     # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Key Masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # broadcasting. (N, T_q, C)

        # Dropouts
        outputs = tf.layers.dropout(outputs, rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Normalize
        # outputs = normalize(outputs)  # (N, T_q, C)

    if with_qk:
        return Q, K
    else:
        return outputs
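# Illustration only: the future-blinding mask built in the causality branch
# above, restated in numpy. Positions attend only to themselves and earlier
# steps; masked entries get a huge negative value so softmax sends them to ~0.
def _demo_causal_mask():
    T = 4
    tril = np.tril(np.ones((T, T)))
    scores = np.random.randn(T, T)
    masked = np.where(tril == 0, -2.0**32 + 1, scores)
    print(masked)  # the upper triangle is effectively -inf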
def feedforward(inputs, num_units=[2048, 512], scope="multihead_attention",
                dropout_rate=0.2, is_training=True, reuse=None):
    '''Point-wise feed forward net.

    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer by the same name.

    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  # 'padding': 'same',
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        outputs = tf.layers.dropout(outputs, rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))
        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  # 'padding': 'same',
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        outputs = tf.layers.dropout(outputs, rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))

        # Residual connection
        outputs += inputs

        # Normalize
        # outputs = normalize(outputs)

    return outputs

================================================ FILE: code/3_NN/sampler2.py ================================================
import numpy as np
from multiprocessing import Process, Queue
import random


def random_neq(l, r, s, num_neg):
    negs = []
    for i in range(num_neg):
        t = np.random.randint(l, r)
        while t in s:
            t = np.random.randint(l, r)
        negs.append(t)
    return negs


def sample_function(user_train, usernum, itemnum, batch_size, maxlen, num_neg,
                    id2user, user2idmap2, result_queue, SEED):
    def sample():
        # num_neg = 2
        user = np.random.randint(1, usernum + 1)
        while len(user_train[user]) <= 1:
            user = np.random.randint(1, usernum + 1)

        seq = np.zeros([maxlen], dtype=np.int32)
        pos = np.zeros([maxlen], dtype=np.int32)
        neg = np.zeros([maxlen, num_neg], dtype=np.int32)
        # nxt = user_train[user][-1]
        idx = maxlen - 1

        seq_ = user_train[user]
        st = 0
        if len(seq_) > (maxlen + 1):
            st = np.random.randint(0, len(seq_) - maxlen - 1)
        seq_ = seq_[st:st + (maxlen + 1)]
        nxt = seq_[-1]
        # nexts = [nxt]
        ts = set(seq_)
        for i in reversed(seq_[:-1]):
            seq[idx] = i
            pos[idx] = nxt
            if nxt != 0:
                neg[idx, :] = random_neq(1, itemnum + 1, ts, num_neg)
            nxt = i
            # nexts.append(i)
            # nxt = random.choice(nexts)
            idx -= 1
            if idx == -1:
                break

        user = id2user[user]
        # user = user2idmap2[int(user.split('_')[-1])]
        user = user2idmap2[user[2:]]
        return (user, seq, pos, neg)

    np.random.seed(SEED)
    while True:
        one_batch = []
        for i in range(batch_size):
            one_batch.append(sample())
        result_queue.put(zip(*one_batch))


class WarpSampler(object):
    def __init__(self, User, usernum, itemnum, id2user, user2idmap2, num_neg=20,
                 batch_size=64, maxlen=10, n_workers=1):
        self.result_queue = Queue(maxsize=n_workers * 10)
        self.processors = []
        for i in range(n_workers):
            self.processors.append(
                Process(target=sample_function,
                        args=(User, usernum, itemnum, batch_size, maxlen, num_neg,
                              id2user, user2idmap2, self.result_queue,
                              np.random.randint(2e9))))
            self.processors[-1].daemon = True
            self.processors[-1].start()

    def next_batch(self):
        return self.result_queue.get()

    def close(self):
        for p in self.processors:
            p.terminate()
            p.join()
================================================ FILE: code/3_NN/sas_rec.py ================================================
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: juzphy
# datetime: 2020/4/26 3:46 PM
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from sampler2 import WarpSampler
from model2 import Model
import os
from util import *
import numpy as np
import argparse
from config import config


def get_data(now_phase, train_path, test_path, kind=1):
    click_train = pd.DataFrame()
    click_test = pd.DataFrame()
    for c in range(now_phase + 1):
        if kind == 1:
            click_tmp = pd.read_csv(os.path.join(train_path, f'underexpose_train_click-{c}.csv'),
                                    header=None, names=['user_id', 'item_id', 'time'],
                                    converters={'time': np.float64})
            click_test_tmp = pd.read_csv(os.path.join(test_path, f'underexpose_test_click-{c}.csv'),
                                         header=None, names=['user_id', 'item_id', 'time'])
        elif kind == 2:
            click_tmp = pd.read_csv(os.path.join(train_path, 'offline' + f'_train_click-{c}.csv'),
                                    header=None, names=['user_id', 'item_id', 'time'],
                                    converters={'time': np.float64})
            click_test_tmp = pd.read_csv(os.path.join(test_path, 'offline' + f'_test_click-{c}.csv'),
                                         header=None, names=['user_id', 'item_id', 'time'])
        elif kind == 3:
            click_tmp = pd.read_csv(os.path.join(train_path, 'model_1' + f'_train_click-{c}.csv'),
                                    header=None, names=['user_id', 'item_id', 'time'],
                                    converters={'time': np.float64})
            click_test_tmp = pd.read_csv(os.path.join(test_path, 'model_1' + f'_test_click-{c}.csv'),
                                         header=None, names=['user_id', 'item_id', 'time'])
        # click_tmp['user_id2'] = click_tmp['user_id']
        click_tmp['user_id2'] = '{}_'.format(c) + click_tmp['user_id'].astype(str)
        click_tmp['user_id'] = '1_{}_'.format(c) + click_tmp['user_id'].astype(str)
        # click_test_tmp['user_id2'] = click_test_tmp['user_id']
        click_test_tmp['user_id2'] = '{}_'.format(c) + click_test_tmp['user_id'].astype(str)
        click_test_tmp['user_id'] = '0_{}_'.format(c) + click_test_tmp['user_id'].astype(str)
        click_train = click_train.append(click_tmp)
        click_test = click_test.append(click_test_tmp)

    # click_train.drop_duplicates(['item_id', 'time', 'user_id2'], inplace=True)
    all_click = click_train.append(click_test)
    num_items = all_click['item_id'].nunique()
    num_users = all_click['user_id'].nunique()
    num_users2 = all_click['user_id2'].nunique()
    item2idmap = dict(zip(all_click['item_id'].unique(), range(1, 1 + num_items)))
    user2idmap = dict(zip(all_click['user_id'].unique(), range(1, 1 + num_users)))
    user2idmap2 = dict(zip(all_click['user_id2'].unique(), range(1, 1 + num_users2)))
    all_click['map_user'] = all_click['user_id'].map(user2idmap)
    all_click['map_item'] = all_click['item_id'].map(item2idmap)
    item_deg = all_click['map_item'].value_counts().to_dict()

    use_train, use_valid, use_test = {}, {}, {}
    all_click = all_click.sort_values('time').groupby('user_id')['map_item'].apply(list).to_dict()
    for reviewerID, hist in tqdm(all_click.items()):
        is_train = reviewerID.split('_')[0]
        phase = reviewerID.split('_')[1]
        user = user2idmap[reviewerID]
        if is_train == '1':
            # if phase == str(now_phase):
            if phase in ['7', '8', '9']:
                use_train[user] = hist[:-1]
                use_valid[user] = [hist[-1]]
            else:
                use_train[user] = hist
                use_valid[user] = []
        else:
            use_train[user] = hist
            use_valid[user] = []
            # if phase in ['7', '8', '9']:
            use_test[user] = hist

    id2item = dict()
    for tmp_key in item2idmap.keys():
        id2item[item2idmap[tmp_key]] = tmp_key
    id2user = dict()
    for tmp_key in user2idmap.keys():
        id2user[user2idmap[tmp_key]] = tmp_key

    emb = pd.read_csv('item_feat.csv')
    emb['item_id'] = emb['item_id'].map(item2idmap)
    emb = emb.sort_values('item_id', ascending=True).reset_index(drop=True)
    emb = emb[emb.columns[2:]].values

    return use_train, use_valid, num_items, num_users, id2item, id2user, \
        item_deg, emb, use_test, user2idmap2, num_users2
def eval_model(model, sess, train_data, eval_date, item_set, item_deg, idx2user, args, valid_array_):
    res = {}
    answers = {}
    [user, user_array, seqs_array, label_array] = valid_array_
    # eval_date = generate_vail_date(train_data, eval_date, 256)
    for u, seq, label in tqdm(gen(user, user_array, seqs_array, label_array, 32)):
        preds = model.predict(sess, u, seq, item_set)
        arg_sort = np.argsort(preds, -1)[:, ::-1]
        for i in range(len(u)):
            user_idx = u[i][0]
            label_item = label[i][0]
            # user = idx2user[user_idx]
            phase = '4'
            res.setdefault(phase, {})
            answers.setdefault(phase, {})
            _pred_top_50 = item_set[arg_sort[i][:50]]
            res[phase][user_idx] = _pred_top_50.tolist()
            answers[phase][user_idx] = (label_item, item_deg[label_item])
    finally_score, phase_score = evalation(res, answers, None)
    return finally_score, phase_score


def evaluate_each_phase(predictions, answers, recall_num=50):
    list_item_degress = []
    for user_id in answers:
        item_id, item_degree = answers[user_id]
        list_item_degress.append(item_degree)
    list_item_degress.sort()
    median_item_degree = list_item_degress[len(list_item_degress) // 2]

    num_cases_full = 0.0
    ndcg_50_full = 0.0
    ndcg_50_half = 0.0
    num_cases_half = 0.0
    hitrate_50_full = 0.0
    hitrate_50_half = 0.0
    for user_id in answers:
        item_id, item_degree = answers[user_id]
        rank = 0
        while rank < recall_num and predictions[user_id][rank] != item_id:
            rank += 1
        num_cases_full += 1.0
        if rank < recall_num:
            ndcg_50_full += 1.0 / np.log2(rank + 2.0)
            hitrate_50_full += 1.0
        if item_degree <= median_item_degree:
            num_cases_half += 1.0
            if rank < recall_num:
                ndcg_50_half += 1.0 / np.log2(rank + 2.0)
                hitrate_50_half += 1.0
    ndcg_50_full /= num_cases_full
    hitrate_50_full /= num_cases_full
    ndcg_50_half /= num_cases_half
    hitrate_50_half /= num_cases_half
    return np.array([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half], dtype=np.float32)
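# Illustration only: the per-user metric inside evaluate_each_phase. If the
# ground-truth item sits at 0-based rank r < 50, the user contributes
# 1/log2(r + 2) to NDCG@50 and 1 to hitrate@50. Toy values below.
def _demo_ndcg_at_50():
    predictions = [7, 3, 9, 5]   # ranked item ids for one user
    truth = 9
    rank = predictions.index(truth)   # 2
    print(1.0 / np.log2(rank + 2.0))  # NDCG contribution = 0.5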
score = evaluate_each_phase(res[phase], answers[phase], recall_num) print(f"phase: {phase}, hitrate_full:{score[2]}, ndcg_full:{score[0]}, hitrate_half:{score[3]}, ndcg_half:{score[1]}") finally_score += score phase_score[phase] = str(score.tolist()) print(f"phase: all, hitrate_full:{finally_score[2]}, ndcg_full:{finally_score[0]}, hitrate_half:{finally_score[3]}, ndcg_half:{finally_score[1]}") return finally_score, phase_score def generate_vail_date(train, valid, id2user, user2idmap2): user = [] seqs = [] labels = [] for user_idx, label_item in tqdm(valid.items(), leave=False, total=len(valid), desc="[EVAL] >> "): if len(label_item) < 1: continue seq = train[user_idx] seq_len = len(seq) if seq_len == 0: continue if seq_len <= args.maxlen: seq_ = [0] * (args.maxlen - seq_len) + seq else: seq_ = seq[-50:] seqs.append(seq_) u = id2user[user_idx] # u = user2idmap2[u.split('_')[-1]] u = user2idmap2[u[2:]] user.append([u]) labels.append(label_item) user_array = np.array(user) seqs_array = np.array(seqs) label_array = np.array(labels) return user, user_array, seqs_array, label_array def gen(user, user_array, seqs_array, label_array, batch_size): for i in range(len(user)//batch_size): yield (user_array[i*batch_size:(i+1)*batch_size], seqs_array[i*batch_size:(i+1)*batch_size], label_array[i*batch_size:(i+1)*batch_size]) yield (user_array[(i+1)*batch_size:], seqs_array[(i+1)*batch_size:], label_array[(i+1)*batch_size:]) class Args: lr = 0.002 maxlen = 50 hidden_units = 256 num_blocks = 1 dropout_rate = 0.5 num_heads = 2 l2_emb = 0.0 if __name__ == "__main__": now_phase = 9 parser = argparse.ArgumentParser() parser.add_argument("--kind", type=int, default=0) parser.add_argument("--train", type=int, default=0) parser.add_argument("--test", type=int, default=0) parser.add_argument("--valid", type=int, default=0) args = parser.parse_args() kind = int(args.kind) if kind == 1: read_path = config.online_item_file save_path = config.save_path_online model_base_path = 'ckpt' train_path = config.train_path test_path = config.test_path elif kind == 2: read_path = config.offline_item_file save_path = config.save_path_offline model_base_path = 'ckpt2' train_path = config.offline_path test_path = config.offline_path elif kind == 3: read_path = config.model1_item_file save_path = config.save_path_model1 model_base_path = 'ckpt3' train_path = config.model1_path test_path = config.model1_path train, valid, n_items, n_users, id2item, id2user, \ item_deg, emb, use_test, user2idmap2, num_users2 = get_data(now_phase, train_path, test_path, kind) # , base_path='F:\data_kdd', emb = np.concatenate((np.zeros((1,256)), emb), axis=0) / 25 usr_emb = 0 print('Reading data done.') train_flag = args.train valid_flag = args.valid test_flag = args.test test_flag2 = 0 num_neg = 20 batch_size = 256 args = Args() num_batch = len(train) // batch_size num_epochs = 75 item_set = np.arange(1, n_items+1) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True sess = tf.Session(config=config) print(n_items) sampler = WarpSampler(train, n_users, n_items, id2user, user2idmap2, num_neg=num_neg, batch_size=batch_size, maxlen=args.maxlen, n_workers=3) model = Model(num_users2, n_items, args, emb, num_neg, dec_step=num_batch*25, emb_usr=usr_emb) sess.run(tf.initialize_all_variables()) sess.run(tf.assign(model.item_emb_table, model.emb_item)) # sess.run(tf.assign(model.user_emb_table, model.usr_emb)) user, user_array, seqs_array, label_array = generate_vail_date(train, valid, id2user, user2idmap2) 
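# An illustrative (not from the original source) note on the arrays produced above:
# generate_vail_date left-pads each validation history with zeros up to args.maxlen
# (index 0 is the padding id, which is why item ids are mapped to 1..num_items) and
# keeps only the last 50 clicks of longer histories, e.g. with maxlen = 5:
#
#     >>> seq = [3, 8, 2]                # hypothetical 3-click history
#     >>> [0] * (5 - len(seq)) + seq
#     [0, 0, 3, 8, 2]
#     >>> list(range(60))[-50:][:3]      # a 60-click history keeps clicks 10..59
#     [10, 11, 12]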
valid_array = [user, user_array, seqs_array, label_array] idx = np.random.choice(len(user), 5000, replace=False) user2, user_array2, seqs_array2, label_array2= [], [], [], [] for i in range(len(idx)): user2.append(user[idx[i]]) user_array2.append(user_array[idx[i]]) seqs_array2.append(seqs_array[idx[i]]) label_array2.append(label_array[idx[i]]) valid_array2 = [user2, user_array2, seqs_array2, label_array2] saver = tf.train.Saver() ckpt_path = os.path.join(model_base_path, 'model.ckpt') if not os.path.exists(model_base_path): os.mkdir(model_base_path) # saver.restore(sess, ckpt_path) if train_flag: finally_score = [0] best_score = 0 for epoch in range(1, num_epochs + 1): # auc_ = [] loss_ = [] for step in tqdm(range(num_batch), total=num_batch, ncols=70, leave=False, unit='b'): u, seq, pos, neg = sampler.next_batch() loss, _ = sess.run([model.loss, model.train_op], {model.u: u, model.input_seq: seq, model.pos: pos, model.neg: neg, model.is_training: True}) # auc_.append(auc) loss_.append(loss) print('epoch:%d, loss:%.3f' %(epoch, np.mean(loss_))) if epoch % 25 == 0: print("[EVAL] valid...") finally_score, phase_score = eval_model(model, sess, train, valid, item_set, item_deg, id2user, args, valid_array2) if finally_score[0] > best_score: best_score = finally_score[0] save_path = saver.save(sess, ckpt_path) ckpt_path = os.path.join(model_base_path, 'model_last.ckpt') save_path = saver.save(sess, ckpt_path) sampler.close() print("Done!") # sess.run(tf.initialize_all_variables()) if valid_flag: ckpt_path = os.path.join(model_base_path, 'model.ckpt') saver.restore(sess, ckpt_path) finally_score, phase_score = eval_model(model, sess, train, valid, item_set, item_deg, id2user, args, valid_array) if test_flag2: ckpt_path = os.path.join(model_base_path, 'model_last.ckpt') saver.restore(sess, ckpt_path) evaluate2(model, [use_test, n_users, n_items], user2idmap2, args, sess, id2item, id2user) from evaulation import evaluate_ evaluate_('pred_valid.csv', answer_fname='model_1/model_1_debias_track_answer.csv') if test_flag: # resotre model ckpt_path = os.path.join(model_base_path, 'model_last.ckpt') saver.restore(sess, ckpt_path) evaluate5(model, [use_test, n_users, n_items], user2idmap2, args, sess, id2item, id2user, save_path=save_path, read_path=read_path) ================================================ FILE: code/3_NN/util.py ================================================ import sys import copy import random import numpy as np from collections import defaultdict import pandas as pd from tqdm import tqdm def evaluate6(model, dataset,user2idmap2, args, sess, id2item, id2user, save_path='pred_valid.csv', read_path='all/offline.csv'): [train, usernum, itemnum] = copy.deepcopy(dataset) pred = [] item_idx = list(range(1, itemnum + 1)) id2itme_list = [id2item[i] for i in item_idx] df2 = pd.read_csv(read_path) item_map = {v:k for (k,v) in id2item.items()} for u in tqdm(train.keys()): if len(train[u]) < 1: print(u) continue score = [] seq = np.zeros([args.maxlen], dtype=np.int32) idx = args.maxlen - 1 for i in reversed(train[u]): seq[idx] = i idx -= 1 if idx == -1: break u2 = id2user[u] # u2 = user2idmap2[int(u2.split('_')[-1])] u2 = user2idmap2[u2[2:]] predictions = model.predict(sess, [u2], [seq], item_idx) predictions = predictions[0] idx = np.argsort(predictions)[::-1][:500] # tmp_list = [id2itme_list[idx[i]] for i in range(500)] # score = [predictions[idx[i]] for i in range(500)] tmp_list = [] score = [] tmp_df = df2[df2['user_id'] == int(id2user[u].split('_')[-1])]['item_id'] if len(tmp_df)>0: items 
= set(tmp_df.values[0][1:-1].split(','))
            tmp_list_set = set(tmp_list)
            for tmp_item in items:
                tmp_ = int(tmp_item)
                if tmp_ not in tmp_list_set:
                    tmp_idx = item_map[tmp_]
                    tmp_list.append(tmp_)
                    score.append(predictions[tmp_idx - 1])
        pred.append([id2user[u]] + [tmp_list] + [score])
    df = pd.DataFrame(pred)
    df[0] = df[0].apply(lambda x: x.split('_')[-1])
    df.columns = ['user', 'item', 'score']
    df.to_csv(save_path, index=None)
    return df


def evaluate5(model, dataset, user2idmap2, args, sess, id2item, id2user,
              save_path='pred_valid.csv', read_path='all/offline.csv'):
    [train, usernum, itemnum] = copy.deepcopy(dataset)
    pred = []
    item_idx = list(range(1, itemnum + 1))
    id2itme_list = [id2item[i] for i in item_idx]
    df2 = pd.read_csv(read_path)
    df2 = df2.groupby('user_id')['item_id'].apply(list).reset_index()
    item_map = {v: k for (k, v) in id2item.items()}
    for u in tqdm(train.keys()):
        if len(train[u]) < 1:
            print(u)
            continue
        score = []
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1:
                break
        u2 = id2user[u]
        # u2 = user2idmap2[int(u2.split('_')[-1])]
        u2 = user2idmap2[u2[2:]]
        predictions = model.predict(sess, [u2], [seq], item_idx)
        predictions = predictions[0]
        idx = np.argsort(predictions)[::-1][:500]
        tmp_list = [id2itme_list[idx[i]] for i in range(500)]
        score = [predictions[idx[i]] for i in range(500)]
        tmp_df = df2[df2['user_id'] == int(id2user[u].split('_')[-1])]['item_id']
        if len(tmp_df) > 0:
            # Append candidate items from read_path that did not make the model's
            # top 500, so each of them still receives an NN score.
            items = set(tmp_df.values[0])  # [1:-1].split(',')
            tmp_list_set = set(tmp_list)
            for tmp_item in items:
                tmp_ = int(tmp_item)
                if tmp_ not in tmp_list_set:
                    tmp_idx = item_map[tmp_]
                    tmp_list.append(tmp_)
                    score.append(predictions[tmp_idx - 1])
        pred.append([id2user[u]] + [tmp_list] + [score])
    df = pd.DataFrame(pred)
    df[0] = df[0].apply(lambda x: x.split('_')[-1])
    df.columns = ['user', 'item', 'score']
    df.to_csv(save_path, index=None)
    return df


def evaluate4(model, dataset, user2idmap2, args, sess, id2item, id2user, user2idmap3):
    [train, usernum, itemnum] = copy.deepcopy(dataset)
    pred = []
    item_idx = list(range(1, itemnum + 1))
    id2itme_list = [id2item[i] for i in item_idx]
    for u in tqdm(train.keys()):
        if len(train[u]) < 1:
            print(u)
            continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1:
                break
        u2 = id2user[u]
        u3 = user2idmap3[int(u2.split('_')[-1])]
        u2 = user2idmap2[u2[2:]]
        # Restored from the previously commented-out call so that `predictions` is
        # defined; this variant assumes a model whose predict() also takes the
        # second user id (u3).
        predictions = model.predict(sess, [u2], [u3], [seq], item_idx)
        predictions = predictions[0]
        idx = np.argsort(predictions)[::-1][:50]
        tmp_list = [id2itme_list[idx[i]] for i in range(50)]
        pred.append([id2user[u]] + tmp_list)
    df = pd.DataFrame(pred)
    df[0] = df[0].apply(lambda x: x.split('_')[-1])
    df.to_csv('pred_valid.csv', index=None, header=None)
    return df


def evaluate3(model, dataset, args, sess, id2item, id2user, time_array):
    [train, usernum, itemnum] = copy.deepcopy(dataset)
    pred = []
    item_idx = list(range(1, itemnum + 1))
    id2itme_list = [id2item[i] for i in item_idx]
    for u in tqdm(train.keys()):
        if len(train[u]) < 1:
            print(u)
            continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        t = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i, t_ in zip(reversed(train[u]), reversed(time_array[u])):
            seq[idx] = i
            t[idx] = t_
            idx -= 1
            if idx == -1:
                break
        predictions = model.predict(sess, [u], [seq], item_idx, [t])
        predictions = predictions[0]
        idx = np.argsort(predictions)[::-1][:50]
        tmp_list = [id2itme_list[idx[i]] for i in range(50)]
        pred.append([id2user[u]] + tmp_list)
    df = pd.DataFrame(pred)
    df[0] = df[0].apply(lambda x: x.split('_')[-1])
    df.to_csv('pred_valid.csv', index=None, header=None)
    return df


def evaluate2(model, dataset, user2idmap2, args, sess, id2item, id2user, save_path='pred_valid.csv'):
    [train, usernum, itemnum] = copy.deepcopy(dataset)
    pred = []
    item_idx = list(range(1, itemnum + 1))
    id2itme_list = [id2item[i] for i in item_idx]
    for u in tqdm(train.keys()):
        if len(train[u]) < 1:
            print(u)
            continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1:
                break
        u2 = id2user[u]
        # u2 = user2idmap2[int(u2.split('_')[-1])]
        u2 = user2idmap2[u2[2:]]
        predictions = model.predict(sess, [u2], [seq], item_idx)
        predictions = predictions[0]
        idx = np.argsort(predictions)[::-1][:50]
        tmp_list = [id2itme_list[idx[i]] for i in range(50)]
        pred.append([id2user[u]] + tmp_list)
    df = pd.DataFrame(pred)
    df[0] = df[0].apply(lambda x: x.split('_')[-1])
    df.to_csv(save_path, index=None, header=None)
    return df


def evaluate(model, dataset, args, sess, id2item, id2user):
    [train, usernum, itemnum] = copy.deepcopy(dataset)
    pred = []
    item_idx = list(range(1, itemnum + 1))
    id2itme_list = [id2item[i] for i in item_idx]
    for u in tqdm(train.keys()):
        if len(train[u]) < 1:
            print(u)
            continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1:
                break
        predictions = model.predict(sess, [u], [seq], item_idx)
        predictions = predictions[0]
        idx = np.argsort(predictions)[::-1][:50]
        tmp_list = [id2itme_list[idx[i]] for i in range(50)]
        pred.append([id2user[u]] + tmp_list)
    df = pd.DataFrame(pred)
    df[0] = df[0].apply(lambda x: x.split('_')[-1])
    df.to_csv('pred_valid.csv', index=None, header=None)
    return df

================================================
FILE: code/3_Recall/01_Recall-Wu-model1.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle
import time
import gc

# In[2]:

def get_predict(df, pred_col, top_fill, ranknum):
    top_fill = [int(t) for t in top_fill.split(',')]
    scores = [-1 * i for i in range(1, len(top_fill) + 1)]
    ids = list(df['user_id'].unique())
    fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id'])
    fill_df.sort_values('user_id', inplace=True)
    fill_df['item_id'] = top_fill * len(ids)
    fill_df[pred_col] = scores * len(ids)
    df = df.append(fill_df)
    df.sort_values(pred_col, ascending=False, inplace=True)
    df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False)
    df = df[df['rank'] <= ranknum]
    df = df.groupby('user_id')['item_id'].apply(
        lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index()
    return df

# In[3]:

def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num):
    '''
    input: item_sim_list, user_item, uid, 500, 50
    Every item in the user's click history has a list of associated (similar) items;
    merge those associated items and rank them by similarity. The decay factors
    computed below penalise popular candidates (alpha), clicks far from the user's
    most recent one (beta, theta), and candidates whose first recorded click is far
    from the user's latest click (gamma).
    '''
    rank = {}
    interacted_items = user_item_dict[user_id]
    interacted_items = interacted_items[::-1]
    times = times[::-1]
    t0 = times[0]
    for loc, i in enumerate(interacted_items):
        for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1]['sim'], reverse=True)[0:top_k]:
            if j not in interacted_items:
                rank.setdefault(j, {'sim': 0,
                                    'item_cf': 0,
                                    'item_cf_weighted': 0,
                                    'time_diff': np.inf,
                                    'loc_diff':
np.inf, # Some feature generated by recall 'time_diff_recall': np.inf, 'time_diff_recall_1': np.inf, 'loc_diff_recall': np.inf, # Nodesim and Deepsim 'node_sim_max': -1e8, 'node_sim_sum':0, 'deep_sim_max': -1e8, 'deep_sim_sum':0, }) t1 = times[loc] t2 = item_time_dict[j][0] delta_t1 = abs(t0 - t1) * 650000 delta_t2 = abs(t0 - t2) * 650000 alpha = max(0.2, 1 / (1 + item_dict[j])) beta = max(0.5, (0.9 ** loc)) theta = max(0.5, 1 / (1 + delta_t1)) gamma = max(0.5, 1 / (1 + delta_t2)) rank[j]['sim'] += wij['sim'] * (alpha ** 2) * (beta) * (theta ** 2) * gamma rank[j]['item_cf'] += wij['item_cf'] rank[j]['item_cf_weighted'] += wij['item_cf_weighted'] if wij['time_diff'] < rank[j]['time_diff']: rank[j]['time_diff'] = wij['time_diff'] if wij['loc_diff'] < rank[j]['loc_diff']: rank[j]['loc_diff'] = wij['loc_diff'] if delta_t1 < rank[j]['time_diff_recall']: rank[j]['time_diff_recall'] = delta_t1 if delta_t2 < rank[j]['time_diff_recall_1']: rank[j]['time_diff_recall_1'] = delta_t2 if loc < rank[j]['loc_diff_recall']: rank[j]['loc_diff_recall'] = loc if wij['node_sim_max'] > rank[j]['node_sim_max']: rank[j]['node_sim_max'] = wij['node_sim_max'] rank[j]['node_sim_sum'] += wij['node_sim_sum'] / wij['item_cf'] if wij['deep_sim_max'] > rank[j]['deep_sim_max']: rank[j]['deep_sim_max'] = wij['deep_sim_max'] rank[j]['deep_sim_sum'] += wij['deep_sim_sum'] / wij['item_cf'] return sorted(rank.items(), key=lambda d: d[1]['sim'], reverse=True)[:item_num] # In[4]: now_phase = 9 offline = "./user_data/model_1/" header = 'model_1' input_path = './user_data/model_1/new_similarity/' output_path = './user_data/model_1/new_recall/' # In[5]: # recom_item = [] # for c in range(now_phase + 1): # a = time.time() # print('phase:', c) # with open(input_path+'itemCF_new'+str(c)+'.pkl','rb') as f: # item_sim_list = pickle.load(f) # with open(input_path+'user2item_new'+str(c)+'.pkl','rb') as f: # user_item = pickle.load(f) # with open(input_path+'item2cnt_new'+str(c)+'.pkl','rb') as f: # item_dic = pickle.load(f) # with open(input_path+'userTime'+str(c)+'.pkl','rb') as f: # user_time_dict = pickle.load(f) # with open(input_path+'itemTime'+str(c)+'.pkl','rb') as f: # item_time_dict = pickle.load(f) # qtime_test = pd.read_csv(offline + header + '_test_qtime-{}.csv'.format(c), header=None, # names=['user_id', 'item_id', 'time']) # for user in tqdm(qtime_test['user_id'].unique()): # if user in user_time_dict: # times = user_time_dict[user] # rank_item = recommend(item_sim_list, user_item, user, times, item_dic, item_time_dict, 500, 500) # for j in rank_item: # recom_item.append([user, int(j[0])] + list(j[1].values())) # gc.collect() file = open(input_path + 'recom_item.pkl', 'rb') recom_item = pickle.load(file) file.close() # In[6]: for phase in range(now_phase + 1): a = time.time() history_list = [] for i in range(now_phase + 1): click_train = pd.read_csv(offline + header + '_train_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header + '_test_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) all_click = click_train.append(click_test) history_list.append(all_click) # qtime_test = pd.read_csv(offline + 'offline_test_qtime-{}.csv'.format(phase), header=None, # names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header+ '_test_click-{}.csv'.format(phase), header=None, names=['user_id', 'item_id', 'time']) print(click_test['user_id'].nunique()) print('phase:', phase) time_diff = max(history_list[now_phase]['time']) - 
min(history_list[0]['time']) for i in range(phase + 1, now_phase + 1): history_list[i]['time'] = history_list[i]['time'] - time_diff whole_click = pd.DataFrame() for i in range(now_phase + 1): whole_click = whole_click.append(history_list[i]) whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last') whole_click = whole_click.sort_values('time') whole_click = whole_click.reset_index(drop=True) # In[7]: def phase_predict(df, pred_col, top_fill, topk=50): """recom_df, 'sim', top50_click, "click_valid" """ top_fill = [int(t) for t in top_fill.split(',')] top_fill = top_fill[:topk] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df.sort_values("rank", inplace=True) df = df[df["rank"] <= topk] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df # In[8]: # find most popular items top50_click = whole_click['item_id'].value_counts().index[:500].values top50_click = ','.join([str(i) for i in top50_click]) recom_df = pd.DataFrame(recom_item, columns=['user_id', 'item_id', 'sim'] + ['feature_' + str(x) for x in range(len(recom_item[0]) - 3)]) result = phase_predict(recom_df, 'sim', top50_click, 50) result['user_id'] = result['user_id'].astype(int) result.to_csv('Recall_0531.csv', index=False, header=None) # In[9]: import datetime # In[10]: # the higher scores, the better performance def evaluate_each_phase(predictions, answers, rank_num): list_item_degress = [] for user_id in answers: item_id, item_degree = answers[user_id] list_item_degress.append(item_degree) list_item_degress.sort() median_item_degree = list_item_degress[len(list_item_degress) // 2] num_cases_full = 0.0 ndcg_50_full = 0.0 ndcg_50_half = 0.0 num_cases_half = 0.0 hitrate_50_full = 0.0 hitrate_50_half = 0.0 for user_id in answers: item_id, item_degree = answers[user_id] rank = 0 while rank < rank_num and predictions[user_id][rank] != item_id: rank += 1 num_cases_full += 1.0 if rank < rank_num: ndcg_50_full += 1.0 / np.log2(rank + 2.0) hitrate_50_full += 1.0 if item_degree <= median_item_degree: num_cases_half += 1.0 if rank < rank_num: ndcg_50_half += 1.0 / np.log2(rank + 2.0) hitrate_50_half += 1.0 ndcg_50_full /= num_cases_full hitrate_50_full /= num_cases_full ndcg_50_half /= num_cases_half hitrate_50_half /= num_cases_half print([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half]) return np.array([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half], dtype=np.float32) # submit_fname is the path to the file submitted by the participants. # debias_track_answer.csv is the standard answer, which is not released. def evaluate(stdout, submit_fname, answer_fname='debias_track_answer.csv', rank_num=50, current_time=None): schedule_in_unix_time = [ 0, # ........ 
1970-01-01 08:00:00 (T=0) 1586534399, # 2020-04-10 23:59:59 (T=1) 1587139199, # 2020-04-17 23:59:59 (T=2) 1587743999, # 2020-04-24 23:59:59 (T=3) 1588348799, # 2020-05-01 23:59:59 (T=4) 1588953599, # 2020-05-08 23:59:59 (T=5) 1589558399, # 2020-05-15 23:59:59 (T=6) 1590163199, # 2020-05-22 23:59:59 (T=7) #1589558399, 1590767999, # 2020-05-29 23:59:59 (T=8) 1591372799 # .2020-06-05 23:59:59 (T=9) ] assert len(schedule_in_unix_time) == 10 for i in range(1, len(schedule_in_unix_time) - 1): # 604800 == one week assert schedule_in_unix_time[i] + 604800 == schedule_in_unix_time[i + 1] if current_time is None: current_time = int(time.time()) print('current_time:', current_time) print('date_time:', datetime.datetime.fromtimestamp(current_time)) current_phase = 0 while (current_phase < 9) and ( current_time > schedule_in_unix_time[current_phase + 1]): current_phase += 1 print('current_phase:', current_phase) try: answers = [{} for _ in range(10)] with open(answer_fname, 'r') as fin: for line in fin: line = [int(x) for x in line.split(',')] phase_id, user_id, item_id, item_degree = line assert user_id % 11 == phase_id # exactly one test case for each user_id answers[phase_id][user_id] = (item_id, item_degree) except Exception as _: print( 'server-side error: answer file incorrect\n') return -1 try: predictions = {} with open(submit_fname, 'r') as fin: for line in fin: line = line.strip() if line == '': continue line = line.split(',') user_id = int(line[0]) if user_id in predictions: print('submitted duplicate user_ids \n') return -1 item_ids = [int(i) for i in line[1:]] if len(item_ids) != rank_num: print('each row need have 50 items \n') return -1 if len(set(item_ids)) != rank_num: print('each row need have 50 DISTINCT items \n') return -1 predictions[user_id] = item_ids except Exception as _: print('submission not in correct format \n') return -1 scores = np.zeros(4, dtype=np.float32) # The final winning teams will be decided based on phase T=7,8,9 only. # We thus fix the scores to 1.0 for phase 0,1,2,...,6 at the final stage. #if current_phase >= 7: # if at the final stage, i.e., T=7,8,9 # scores += 7.0 # then fix the scores to 1.0 for phase 0,1,2,...,6 #phase_beg = (7 if (current_phase >= 7) else 0) phase_beg = 0 phase_end = current_phase + 1 for phase_id in range(phase_beg, phase_end): for user_id in answers[phase_id]: if user_id not in predictions: print('user_id %d of phase %d not in submission' % (user_id, phase_id)) return -1 try: # We sum the scores from all the phases, instead of averaging them. 
scores += evaluate_each_phase(predictions, answers[phase_id], rank_num) except Exception as _: print('error occurred during evaluation') return -1 return [float(scores[0]),float(scores[0]),float(scores[1]),float(scores[2]),float(scores[3])] # In[11]: recom_df[['user_id','item_id']].to_csv(output_path +'user_item_index.csv', index=False) # In[12]: recom_df.to_csv(output_path + 'recall_0531.csv', index=False) # In[13]: output_path + 'recall_0531.csv' # In[14]: from sys import stdout print(evaluate(stdout,'Recall_0531.csv', answer_fname='./user_data/model_1/model_1_debias_track_answer.csv', rank_num=50)) # current_time: 1590673576 # date_time: 2020-05-28 21:46:16 # current_phase: 6 # [0.07291776530294389, 0.04257302451332752, 0.16795865633074936, 0.10839160839160839] # [0.07522970326234413, 0.047286878349803496, 0.1778875849289685, 0.12471655328798185] # [0.08768431272730617, 0.05220432316374826, 0.2040429564118762, 0.13366960907944514] # [0.08137267931092253, 0.04650284552993235, 0.18584070796460178, 0.10888610763454318] # [0.086082070609559, 0.06099578564127202, 0.20061919504643963, 0.14116251482799524] # [0.08282023366562385, 0.05404211657982558, 0.18724400234055003, 0.1210710128055879] # [0.08625658967639374, 0.05129722585118765, 0.19410745233968804, 0.12543153049482164] # [0.5723633170127869, 0.5723633170127869, 0.3549021780490875, 1.3177005052566528, 0.8633289337158203] # current_time: 1590730998 # date_time: 2020-05-29 13:43:18 # current_phase: 6 # [0.07336197799145278, 0.04333070177814886, 0.17118863049095606, 0.11188811188811189] # [0.07551020515190006, 0.047111743016730066, 0.17974058060531192, 0.12698412698412698] # [0.0877367887009624, 0.052890596296164785, 0.2040429564118762, 0.13619167717528374] # user_id 3 of phase 3 not in submission # In[ ]: # In[ ]: ================================================ FILE: code/3_Recall/01_Recall-Wu-offline.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[7]: import pandas as pd import numpy as np from tqdm import tqdm import os from collections import defaultdict import math import json from sys import stdout import pickle import time import gc # In[8]: def get_predict(df, pred_col, top_fill, ranknum): top_fill = [int(t) for t in top_fill.split(',')] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df = df[df['rank'] <= ranknum] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df # In[9]: def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num): ''' input:item_sim_list, user_item, uid, 500, 50 # 用户历史序列中的所有商品均有关联商品,整合这些关联商品,进行相似性排序 ''' rank = {} interacted_items = user_item_dict[user_id] interacted_items = interacted_items[::-1] times = times[::-1] t0 = times[0] for loc, i in enumerate(interacted_items): for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1]['sim'], reverse=True)[0:top_k]: if j not in interacted_items: rank.setdefault(j, {'sim': 0, 'item_cf': 0, 'item_cf_weighted': 0, 'time_diff': np.inf, 
'loc_diff': np.inf, # Some feature generated by recall 'time_diff_recall': np.inf, 'time_diff_recall_1': np.inf, 'loc_diff_recall': np.inf, # Nodesim and Deepsim 'node_sim_max': -1e8, 'node_sim_sum':0, 'deep_sim_max': -1e8, 'deep_sim_sum':0, }) t1 = times[loc] t2 = item_time_dict[j][0] delta_t1 = abs(t0 - t1) * 650000 delta_t2 = abs(t0 - t2) * 650000 alpha = max(0.2, 1 / (1 + item_dict[j])) beta = max(0.5, (0.9 ** loc)) theta = max(0.5, 1 / (1 + delta_t1)) gamma = max(0.5, 1 / (1 + delta_t2)) rank[j]['sim'] += wij['sim'] * (alpha ** 2) * (beta) * (theta ** 2) * gamma rank[j]['item_cf'] += wij['item_cf'] rank[j]['item_cf_weighted'] += wij['item_cf_weighted'] if wij['time_diff'] < rank[j]['time_diff']: rank[j]['time_diff'] = wij['time_diff'] if wij['loc_diff'] < rank[j]['loc_diff']: rank[j]['loc_diff'] = wij['loc_diff'] if delta_t1 < rank[j]['time_diff_recall']: rank[j]['time_diff_recall'] = delta_t1 if delta_t2 < rank[j]['time_diff_recall_1']: rank[j]['time_diff_recall_1'] = delta_t2 if loc < rank[j]['loc_diff_recall']: rank[j]['loc_diff_recall'] = loc if wij['node_sim_max'] > rank[j]['node_sim_max']: rank[j]['node_sim_max'] = wij['node_sim_max'] rank[j]['node_sim_sum'] += wij['node_sim_sum'] / wij['item_cf'] if wij['deep_sim_max'] > rank[j]['deep_sim_max']: rank[j]['deep_sim_max'] = wij['deep_sim_max'] rank[j]['deep_sim_sum'] += wij['deep_sim_sum'] / wij['item_cf'] return sorted(rank.items(), key=lambda d: d[1]['sim'], reverse=True)[:item_num] # In[10]: now_phase = 9 offline = "./user_data/offline/" header = 'offline' input_path = './user_data/offline/new_similarity/' output_path = './user_data/offline/new_recall/' # In[11]: # recom_item = [] # for c in range(now_phase + 1): # a = time.time() # print('phase:', c) # with open(input_path+'itemCF_new'+str(c)+'.pkl','rb') as f: # item_sim_list = pickle.load(f) # with open(input_path+'user2item_new'+str(c)+'.pkl','rb') as f: # user_item = pickle.load(f) # with open(input_path+'item2cnt_new'+str(c)+'.pkl','rb') as f: # item_dic = pickle.load(f) # with open(input_path+'userTime'+str(c)+'.pkl','rb') as f: # user_time_dict = pickle.load(f) # with open(input_path+'itemTime'+str(c)+'.pkl','rb') as f: # item_time_dict = pickle.load(f) # qtime_test = pd.read_csv(offline + header + '_test_qtime-{}.csv'.format(c), header=None, # names=['user_id', 'item_id', 'time']) # for user in tqdm(qtime_test['user_id'].unique()): # if user in user_time_dict: # times = user_time_dict[user] # rank_item = recommend(item_sim_list, user_item, user, times, item_dic, item_time_dict, 500, 500) # for j in rank_item: # recom_item.append([user, int(j[0])] + list(j[1].values())) # gc.collect() file = open(input_path + 'recom_item.pkl', 'rb') recom_item = pickle.load(file) file.close() # In[12]: for phase in range(now_phase + 1): a = time.time() history_list = [] for i in range(now_phase + 1): click_train = pd.read_csv(offline + header + '_train_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header + '_test_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) all_click = click_train.append(click_test) history_list.append(all_click) # qtime_test = pd.read_csv(offline + 'offline_test_qtime-{}.csv'.format(phase), header=None, # names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header+ '_test_click-{}.csv'.format(phase), header=None, names=['user_id', 'item_id', 'time']) print(click_test['user_id'].nunique()) print('phase:', phase) time_diff = 
max(history_list[now_phase]['time']) - min(history_list[0]['time']) for i in range(phase + 1, now_phase + 1): history_list[i]['time'] = history_list[i]['time'] - time_diff whole_click = pd.DataFrame() for i in range(now_phase + 1): whole_click = whole_click.append(history_list[i]) whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last') whole_click = whole_click.sort_values('time') whole_click = whole_click.reset_index(drop=True) # In[13]: def phase_predict(df, pred_col, top_fill, topk=50): """recom_df, 'sim', top50_click, "click_valid" """ top_fill = [int(t) for t in top_fill.split(',')] top_fill = top_fill[:topk] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df.sort_values("rank", inplace=True) df = df[df["rank"] <= topk] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df # In[14]: # find most popular items top50_click = whole_click['item_id'].value_counts().index[:500].values top50_click = ','.join([str(i) for i in top50_click]) recom_df = pd.DataFrame(recom_item, columns=['user_id', 'item_id', 'sim'] + ['feature_' + str(x) for x in range(len(recom_item[0]) - 3)]) result = phase_predict(recom_df, 'sim', top50_click, 50) result['user_id'] = result['user_id'].astype(int) result.to_csv('Recall_0531.csv', index=False, header=None) # In[15]: import datetime # In[16]: # the higher scores, the better performance def evaluate_each_phase(predictions, answers, rank_num): list_item_degress = [] for user_id in answers: item_id, item_degree = answers[user_id] list_item_degress.append(item_degree) list_item_degress.sort() median_item_degree = list_item_degress[len(list_item_degress) // 2] num_cases_full = 0.0 ndcg_50_full = 0.0 ndcg_50_half = 0.0 num_cases_half = 0.0 hitrate_50_full = 0.0 hitrate_50_half = 0.0 for user_id in answers: item_id, item_degree = answers[user_id] rank = 0 while rank < rank_num and predictions[user_id][rank] != item_id: rank += 1 num_cases_full += 1.0 if rank < rank_num: ndcg_50_full += 1.0 / np.log2(rank + 2.0) hitrate_50_full += 1.0 if item_degree <= median_item_degree: num_cases_half += 1.0 if rank < rank_num: ndcg_50_half += 1.0 / np.log2(rank + 2.0) hitrate_50_half += 1.0 ndcg_50_full /= num_cases_full hitrate_50_full /= num_cases_full ndcg_50_half /= num_cases_half hitrate_50_half /= num_cases_half print([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half]) return np.array([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half], dtype=np.float32) # submit_fname is the path to the file submitted by the participants. # debias_track_answer.csv is the standard answer, which is not released. def evaluate(stdout, submit_fname, answer_fname='debias_track_answer.csv', rank_num=50, current_time=None): schedule_in_unix_time = [ 0, # ........ 
1970-01-01 08:00:00 (T=0) 1586534399, # 2020-04-10 23:59:59 (T=1) 1587139199, # 2020-04-17 23:59:59 (T=2) 1587743999, # 2020-04-24 23:59:59 (T=3) 1588348799, # 2020-05-01 23:59:59 (T=4) 1588953599, # 2020-05-08 23:59:59 (T=5) 1589558399, # 2020-05-15 23:59:59 (T=6) 1590163199, # 2020-05-22 23:59:59 (T=7) #1589558399, 1590767999, # 2020-05-29 23:59:59 (T=8) 1591372799 # .2020-06-05 23:59:59 (T=9) ] assert len(schedule_in_unix_time) == 10 for i in range(1, len(schedule_in_unix_time) - 1): # 604800 == one week assert schedule_in_unix_time[i] + 604800 == schedule_in_unix_time[i + 1] if current_time is None: current_time = int(time.time()) print('current_time:', current_time) print('date_time:', datetime.datetime.fromtimestamp(current_time)) current_phase = 0 while (current_phase < 9) and ( current_time > schedule_in_unix_time[current_phase + 1]): current_phase += 1 print('current_phase:', current_phase) try: answers = [{} for _ in range(10)] with open(answer_fname, 'r') as fin: for line in fin: line = [int(x) for x in line.split(',')] phase_id, user_id, item_id, item_degree = line assert user_id % 11 == phase_id # exactly one test case for each user_id answers[phase_id][user_id] = (item_id, item_degree) except Exception as _: print( 'server-side error: answer file incorrect\n') return -1 try: predictions = {} with open(submit_fname, 'r') as fin: for line in fin: line = line.strip() if line == '': continue line = line.split(',') user_id = int(line[0]) if user_id in predictions: print('submitted duplicate user_ids \n') return -1 item_ids = [int(i) for i in line[1:]] if len(item_ids) != rank_num: print('each row need have 50 items \n') return -1 if len(set(item_ids)) != rank_num: print('each row need have 50 DISTINCT items \n') return -1 predictions[user_id] = item_ids except Exception as _: print('submission not in correct format \n') return -1 scores = np.zeros(4, dtype=np.float32) # The final winning teams will be decided based on phase T=7,8,9 only. # We thus fix the scores to 1.0 for phase 0,1,2,...,6 at the final stage. #if current_phase >= 7: # if at the final stage, i.e., T=7,8,9 # scores += 7.0 # then fix the scores to 1.0 for phase 0,1,2,...,6 #phase_beg = (7 if (current_phase >= 7) else 0) phase_beg = 0 phase_end = current_phase + 1 for phase_id in range(phase_beg, phase_end): for user_id in answers[phase_id]: if user_id not in predictions: print('user_id %d of phase %d not in submission' % (user_id, phase_id)) return -1 try: # We sum the scores from all the phases, instead of averaging them. 
scores += evaluate_each_phase(predictions, answers[phase_id], rank_num) except Exception as _: print('error occurred during evaluation') return -1 return [float(scores[0]),float(scores[0]),float(scores[1]),float(scores[2]),float(scores[3])] # In[17]: recom_df[['user_id','item_id']].to_csv(output_path +'user_item_index.csv', index=False) # In[18]: recom_df.to_csv(output_path + 'recall_0531.csv', index=False) # In[19]: output_path + 'recall_0531.csv' # In[20]: from sys import stdout print(evaluate(stdout,'Recall_0531.csv', answer_fname=offline + header + '_debias_track_answer.csv', rank_num=50)) # current_time: 1590673576 # date_time: 2020-05-28 21:46:16 # current_phase: 6 # [0.07291776530294389, 0.04257302451332752, 0.16795865633074936, 0.10839160839160839] # [0.07522970326234413, 0.047286878349803496, 0.1778875849289685, 0.12471655328798185] # [0.08768431272730617, 0.05220432316374826, 0.2040429564118762, 0.13366960907944514] # [0.08137267931092253, 0.04650284552993235, 0.18584070796460178, 0.10888610763454318] # [0.086082070609559, 0.06099578564127202, 0.20061919504643963, 0.14116251482799524] # [0.08282023366562385, 0.05404211657982558, 0.18724400234055003, 0.1210710128055879] # [0.08625658967639374, 0.05129722585118765, 0.19410745233968804, 0.12543153049482164] # [0.5723633170127869, 0.5723633170127869, 0.3549021780490875, 1.3177005052566528, 0.8633289337158203] # current_time: 1590730998 # date_time: 2020-05-29 13:43:18 # current_phase: 6 # [0.07336197799145278, 0.04333070177814886, 0.17118863049095606, 0.11188811188811189] # [0.07551020515190006, 0.047111743016730066, 0.17974058060531192, 0.12698412698412698] # [0.0877367887009624, 0.052890596296164785, 0.2040429564118762, 0.13619167717528374] # user_id 3 of phase 3 not in submission ================================================ FILE: code/3_Recall/01_Recall-Wu-online.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import numpy as np from tqdm import tqdm import os from collections import defaultdict import math import json from sys import stdout import pickle import time import gc # In[2]: def get_predict(df, pred_col, top_fill, ranknum): top_fill = [int(t) for t in top_fill.split(',')] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df = df[df['rank'] <= ranknum] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df # In[3]: def recommend(sim_item_corr, user_item_dict, user_id, times, item_dict, item_time_dict, top_k, item_num): ''' input:item_sim_list, user_item, uid, 500, 50 # 用户历史序列中的所有商品均有关联商品,整合这些关联商品,进行相似性排序 ''' rank = {} interacted_items = user_item_dict[user_id] interacted_items = interacted_items[::-1] times = times[::-1] t0 = times[0] for loc, i in enumerate(interacted_items): for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1]['sim'], reverse=True)[0:top_k]: if j not in interacted_items: rank.setdefault(j, {'sim': 0, 'item_cf': 0, 'item_cf_weighted': 0, 'time_diff': np.inf, 'loc_diff': np.inf, # Some 
feature generated by recall 'time_diff_recall': np.inf, 'time_diff_recall_1': np.inf, 'loc_diff_recall': np.inf, # Nodesim and Deepsim 'node_sim_max': -1e8, 'node_sim_sum':0, 'deep_sim_max': -1e8, 'deep_sim_sum':0, }) t1 = times[loc] t2 = item_time_dict[j][0] delta_t1 = abs(t0 - t1) * 650000 delta_t2 = abs(t0 - t2) * 650000 alpha = max(0.2, 1 / (1 + item_dict[j])) beta = max(0.5, (0.9 ** loc)) theta = max(0.5, 1 / (1 + delta_t1)) gamma = max(0.5, 1 / (1 + delta_t2)) rank[j]['sim'] += wij['sim'] * (alpha ** 2) * (beta) * (theta ** 2) * gamma rank[j]['item_cf'] += wij['item_cf'] rank[j]['item_cf_weighted'] += wij['item_cf_weighted'] if wij['time_diff'] < rank[j]['time_diff']: rank[j]['time_diff'] = wij['time_diff'] if wij['loc_diff'] < rank[j]['loc_diff']: rank[j]['loc_diff'] = wij['loc_diff'] if delta_t1 < rank[j]['time_diff_recall']: rank[j]['time_diff_recall'] = delta_t1 if delta_t2 < rank[j]['time_diff_recall_1']: rank[j]['time_diff_recall_1'] = delta_t2 if loc < rank[j]['loc_diff_recall']: rank[j]['loc_diff_recall'] = loc if wij['node_sim_max'] > rank[j]['node_sim_max']: rank[j]['node_sim_max'] = wij['node_sim_max'] rank[j]['node_sim_sum'] += wij['node_sim_sum'] / wij['item_cf'] if wij['deep_sim_max'] > rank[j]['deep_sim_max']: rank[j]['deep_sim_max'] = wij['deep_sim_max'] rank[j]['deep_sim_sum'] += wij['deep_sim_sum'] / wij['item_cf'] return sorted(rank.items(), key=lambda d: d[1]['sim'], reverse=True)[:item_num] # In[4]: now_phase = 9 offline = "./user_data/dataset/" header = 'underexpose' input_path = './user_data/dataset/new_similarity/' output_path = './user_data/dataset/new_recall/' # In[5]: # recom_item = [] # for c in range(now_phase + 1): # a = time.time() # print('phase:', c) # with open(input_path+'itemCF_new'+str(c)+'.pkl','rb') as f: # item_sim_list = pickle.load(f) # with open(input_path+'user2item_new'+str(c)+'.pkl','rb') as f: # user_item = pickle.load(f) # with open(input_path+'item2cnt_new'+str(c)+'.pkl','rb') as f: # item_dic = pickle.load(f) # with open(input_path+'userTime'+str(c)+'.pkl','rb') as f: # user_time_dict = pickle.load(f) # with open(input_path+'itemTime'+str(c)+'.pkl','rb') as f: # item_time_dict = pickle.load(f) # qtime_test = pd.read_csv(offline + header + '_test_qtime-{}.csv'.format(c), header=None, # names=['user_id', 'item_id', 'time']) # for user in tqdm(qtime_test['user_id'].unique()): # if user in user_time_dict: # times = user_time_dict[user] # rank_item = recommend(item_sim_list, user_item, user, times, item_dic, item_time_dict, 500, 500) # for j in rank_item: # recom_item.append([user, int(j[0])] + list(j[1].values())) # gc.collect() file = open(input_path + 'recom_item.pkl', 'rb') recom_item = pickle.load(file) file.close() # In[6]: for phase in range(now_phase + 1): a = time.time() history_list = [] for i in range(now_phase + 1): click_train = pd.read_csv(offline + header + '_train_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header + '_test_click-{}.csv'.format(i), header=None, names=['user_id', 'item_id', 'time']) all_click = click_train.append(click_test) history_list.append(all_click) # qtime_test = pd.read_csv(offline + 'offline_test_qtime-{}.csv'.format(phase), header=None, # names=['user_id', 'item_id', 'time']) click_test = pd.read_csv(offline + header+ '_test_click-{}.csv'.format(phase), header=None, names=['user_id', 'item_id', 'time']) print(click_test['user_id'].nunique()) print('phase:', phase) time_diff = max(history_list[now_phase]['time']) - 
min(history_list[0]['time']) for i in range(phase + 1, now_phase + 1): history_list[i]['time'] = history_list[i]['time'] - time_diff whole_click = pd.DataFrame() for i in range(now_phase + 1): whole_click = whole_click.append(history_list[i]) whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last') whole_click = whole_click.sort_values('time') whole_click = whole_click.reset_index(drop=True) # In[7]: def phase_predict(df, pred_col, top_fill, topk=50): """recom_df, 'sim', top50_click, "click_valid" """ top_fill = [int(t) for t in top_fill.split(',')] top_fill = top_fill[:topk] scores = [-1 * i for i in range(1, len(top_fill) + 1)] ids = list(df['user_id'].unique()) fill_df = pd.DataFrame(ids * len(top_fill), columns=['user_id']) fill_df.sort_values('user_id', inplace=True) fill_df['item_id'] = top_fill * len(ids) fill_df[pred_col] = scores * len(ids) df = df.append(fill_df) df.sort_values(pred_col, ascending=False, inplace=True) df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first') df['rank'] = df.groupby('user_id')[pred_col].rank(method='first', ascending=False) df.sort_values("rank", inplace=True) df = df[df["rank"] <= topk] df = df.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x])).str.split(',', expand=True).reset_index() return df # In[8]: # find most popular items top50_click = whole_click['item_id'].value_counts().index[:500].values top50_click = ','.join([str(i) for i in top50_click]) recom_df = pd.DataFrame(recom_item, columns=['user_id', 'item_id', 'sim'] + ['feature_' + str(x) for x in range(len(recom_item[0]) - 3)]) result = phase_predict(recom_df, 'sim', top50_click, 50) result['user_id'] = result['user_id'].astype(int) result.to_csv('Recall_0531.csv', index=False, header=None) # In[9]: import datetime # In[10]: # the higher scores, the better performance def evaluate_each_phase(predictions, answers, rank_num): list_item_degress = [] for user_id in answers: item_id, item_degree = answers[user_id] list_item_degress.append(item_degree) list_item_degress.sort() median_item_degree = list_item_degress[len(list_item_degress) // 2] num_cases_full = 0.0 ndcg_50_full = 0.0 ndcg_50_half = 0.0 num_cases_half = 0.0 hitrate_50_full = 0.0 hitrate_50_half = 0.0 for user_id in answers: item_id, item_degree = answers[user_id] rank = 0 while rank < rank_num and predictions[user_id][rank] != item_id: rank += 1 num_cases_full += 1.0 if rank < rank_num: ndcg_50_full += 1.0 / np.log2(rank + 2.0) hitrate_50_full += 1.0 if item_degree <= median_item_degree: num_cases_half += 1.0 if rank < rank_num: ndcg_50_half += 1.0 / np.log2(rank + 2.0) hitrate_50_half += 1.0 ndcg_50_full /= num_cases_full hitrate_50_full /= num_cases_full ndcg_50_half /= num_cases_half hitrate_50_half /= num_cases_half print([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half]) return np.array([ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half], dtype=np.float32) # submit_fname is the path to the file submitted by the participants. # debias_track_answer.csv is the standard answer, which is not released. def evaluate(stdout, submit_fname, answer_fname='debias_track_answer.csv', rank_num=50, current_time=None): schedule_in_unix_time = [ 0, # ........ 
1970-01-01 08:00:00 (T=0) 1586534399, # 2020-04-10 23:59:59 (T=1) 1587139199, # 2020-04-17 23:59:59 (T=2) 1587743999, # 2020-04-24 23:59:59 (T=3) 1588348799, # 2020-05-01 23:59:59 (T=4) 1588953599, # 2020-05-08 23:59:59 (T=5) 1589558399, # 2020-05-15 23:59:59 (T=6) 1590163199, # 2020-05-22 23:59:59 (T=7) #1589558399, 1590767999, # 2020-05-29 23:59:59 (T=8) 1591372799 # .2020-06-05 23:59:59 (T=9) ] assert len(schedule_in_unix_time) == 10 for i in range(1, len(schedule_in_unix_time) - 1): # 604800 == one week assert schedule_in_unix_time[i] + 604800 == schedule_in_unix_time[i + 1] if current_time is None: current_time = int(time.time()) print('current_time:', current_time) print('date_time:', datetime.datetime.fromtimestamp(current_time)) current_phase = 0 while (current_phase < 9) and ( current_time > schedule_in_unix_time[current_phase + 1]): current_phase += 1 print('current_phase:', current_phase) try: answers = [{} for _ in range(10)] with open(answer_fname, 'r') as fin: for line in fin: line = [int(x) for x in line.split(',')] phase_id, user_id, item_id, item_degree = line assert user_id % 11 == phase_id # exactly one test case for each user_id answers[phase_id][user_id] = (item_id, item_degree) except Exception as _: print( 'server-side error: answer file incorrect\n') return -1 try: predictions = {} with open(submit_fname, 'r') as fin: for line in fin: line = line.strip() if line == '': continue line = line.split(',') user_id = int(line[0]) if user_id in predictions: print('submitted duplicate user_ids \n') return -1 item_ids = [int(i) for i in line[1:]] if len(item_ids) != rank_num: print('each row need have 50 items \n') return -1 if len(set(item_ids)) != rank_num: print('each row need have 50 DISTINCT items \n') return -1 predictions[user_id] = item_ids except Exception as _: print('submission not in correct format \n') return -1 scores = np.zeros(4, dtype=np.float32) # The final winning teams will be decided based on phase T=7,8,9 only. # We thus fix the scores to 1.0 for phase 0,1,2,...,6 at the final stage. #if current_phase >= 7: # if at the final stage, i.e., T=7,8,9 # scores += 7.0 # then fix the scores to 1.0 for phase 0,1,2,...,6 #phase_beg = (7 if (current_phase >= 7) else 0) phase_beg = 0 phase_end = current_phase + 1 for phase_id in range(phase_beg, phase_end): for user_id in answers[phase_id]: if user_id not in predictions: print('user_id %d of phase %d not in submission' % (user_id, phase_id)) return -1 try: # We sum the scores from all the phases, instead of averaging them. 
scores += evaluate_each_phase(predictions, answers[phase_id], rank_num) except Exception as _: print('error occurred during evaluation') return -1 return [float(scores[0]),float(scores[0]),float(scores[1]),float(scores[2]),float(scores[3])] # In[11]: recom_df[['user_id','item_id']].to_csv(output_path + 'user_item_index.csv', index=False) # In[12]: recom_df.to_csv(output_path + 'recall_0531.csv', index=False) # In[13]: output_path + 'recall_0531.csv' # In[14]: from sys import stdout print(evaluate(stdout,'Recall_0531.csv', answer_fname=offline + header + '_debias_track_answer.csv', rank_num=50)) # current_time: 1590673576 # date_time: 2020-05-28 21:46:16 # current_phase: 6 # [0.07291776530294389, 0.04257302451332752, 0.16795865633074936, 0.10839160839160839] # [0.07522970326234413, 0.047286878349803496, 0.1778875849289685, 0.12471655328798185] # [0.08768431272730617, 0.05220432316374826, 0.2040429564118762, 0.13366960907944514] # [0.08137267931092253, 0.04650284552993235, 0.18584070796460178, 0.10888610763454318] # [0.086082070609559, 0.06099578564127202, 0.20061919504643963, 0.14116251482799524] # [0.08282023366562385, 0.05404211657982558, 0.18724400234055003, 0.1210710128055879] # [0.08625658967639374, 0.05129722585118765, 0.19410745233968804, 0.12543153049482164] # [0.5723633170127869, 0.5723633170127869, 0.3549021780490875, 1.3177005052566528, 0.8633289337158203] # current_time: 1590730998 # date_time: 2020-05-29 13:43:18 # current_phase: 6 # [0.07336197799145278, 0.04333070177814886, 0.17118863049095606, 0.11188811188811189] # [0.07551020515190006, 0.047111743016730066, 0.17974058060531192, 0.12698412698412698] # [0.0877367887009624, 0.052890596296164785, 0.2040429564118762, 0.13619167717528374] # user_id 3 of phase 3 not in submission ================================================ FILE: code/4_RankFeature/01_sim_feature_model1.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[7]: import pandas as pd import numpy as np from tqdm import tqdm import os from collections import defaultdict import math import json from sys import stdout import pickle # In[8]: def ReComputeSim(sim_cor,candidate_item_list,interacted_items,item_weight_dict,flag=False): sim_list = [] for j in candidate_item_list: sim_tmp = 0 for loc, i in enumerate(interacted_items): #Just for RA gernerated by offline if i not in sim_cor or j not in sim_cor[i]: continue if i in item_weight_dict: sim_tmp += sim_cor[i][j][0] * (0.7**loc) * item_weight_dict[i] if flag else sim_cor[i][j] * (0.7**loc) * item_weight_dict[i] else: sim_tmp += sim_cor[i][j][0] * (0.7**loc) * 0.5 if flag else sim_cor[i][j] * (0.7**loc) * 0.5 sim_list.append(sim_tmp) return sim_list # In[9]: file_name = 'recall_0531' offline = pd.read_csv('./user_data/model_1/new_recall/' + file_name + '.csv') now_phase = 9 train_path = './user_data/model_1/' test_path = './user_data/model_1/' header = 'model_1' out_path = './user_data/model_1/new_similarity/' recom_item = [] whole_click = pd.DataFrame() user_id_list = [] item_id_list = [] item_sim_list = [] ra_sim_list = [] aa_sim_list = [] cn_sim_list = [] txt_sim_list = [] hdi_sim_list = [] hpi_sim_list = [] lhn1_sim_list = [] for c in range(now_phase + 1): print('phase:', c) click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c)) click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c)) click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c)) click_train['datetime'] = 
pd.to_datetime(click_train['datetime']) click_test['datetime'] = pd.to_datetime(click_test['datetime']) click_query['datetime'] = pd.to_datetime(click_query['datetime']) click_train['timestamp'] = click_train['datetime'].dt.day + ( click_train['datetime'].dt.hour + (click_train['datetime'].dt.minute + click_train['datetime'].dt.second/60)/float(60) )/float(24) click_test['timestamp'] = click_test['datetime'].dt.day + ( click_test['datetime'].dt.hour + (click_test['datetime'].dt.minute + click_test['datetime'].dt.second/60)/float(60) )/float(24) click_query['timestamp'] = click_query['datetime'].dt.day + ( click_query['datetime'].dt.hour + (click_query['datetime'].dt.minute + click_query['datetime'].dt.second/60)/float(60) )/float(24) all_click = click_train.append(click_test) with open(out_path+'user2item_new'+str(c)+'.pkl','rb') as f: user_item_tmp = pickle.load(f) with open(out_path+'CN_P'+str(c)+'_new.pkl','rb') as f: CN_sim_list_new = pickle.load(f) for i, row in click_query.iterrows(): offline_tmp = offline[offline['user_id']==row['user_id']] candidate_item_list = list(offline_tmp['item_id']) time_min = min(all_click['timestamp']) time_max = row['timestamp'] df_tmp = all_click[all_click['user_id']==row['user_id']] df_tmp = df_tmp.reset_index(drop=True) df_tmp['weight'] = 1 - (time_max-df_tmp['timestamp']+0.01) / (time_max-time_min+0.01) item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight'])) interacted_items = user_item_tmp[row['user_id']] interacted_items = interacted_items[::-1] sim_list_tmp = ReComputeSim(CN_sim_list_new,candidate_item_list,interacted_items,item_weight_dict) cn_sim_list += sim_list_tmp item_id_list += candidate_item_list user_id_list += [row['user_id'] for x in candidate_item_list] CN_sim_list_new = [] with open(out_path+'HDI_P'+str(c)+'_new.pkl','rb') as f: HDI_sim_list_new = pickle.load(f) for i, row in click_query.iterrows(): offline_tmp = offline[offline['user_id']==row['user_id']] candidate_item_list = list(offline_tmp['item_id']) time_min = min(all_click['timestamp']) time_max = row['timestamp'] df_tmp = all_click[all_click['user_id']==row['user_id']] df_tmp = df_tmp.reset_index(drop=True) df_tmp['weight'] = 1 - (time_max-df_tmp['timestamp']+0.01) / (time_max-time_min+0.01) item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight'])) interacted_items = user_item_tmp[row['user_id']] interacted_items = interacted_items[::-1] sim_list_tmp = ReComputeSim(HDI_sim_list_new,candidate_item_list,interacted_items,item_weight_dict) hdi_sim_list += sim_list_tmp HDI_sim_list_new = [] with open(out_path+'HPI_P'+str(c)+'_new.pkl','rb') as f: HPI_sim_list_new = pickle.load(f) for i, row in click_query.iterrows(): offline_tmp = offline[offline['user_id']==row['user_id']] candidate_item_list = list(offline_tmp['item_id']) time_min = min(all_click['timestamp']) time_max = row['timestamp'] df_tmp = all_click[all_click['user_id']==row['user_id']] df_tmp = df_tmp.reset_index(drop=True) df_tmp['weight'] = 1 - (time_max-df_tmp['timestamp']+0.01) / (time_max-time_min+0.01) item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight'])) interacted_items = user_item_tmp[row['user_id']] interacted_items = interacted_items[::-1] sim_list_tmp = ReComputeSim(HPI_sim_list_new,candidate_item_list,interacted_items,item_weight_dict) hpi_sim_list += sim_list_tmp HPI_sim_list_new = [] with open(out_path+'LHN1_P'+str(c)+'_new.pkl','rb') as f: LHN1_sim_list_new = pickle.load(f) for i, row in click_query.iterrows(): offline_tmp = offline[offline['user_id']==row['user_id']] 
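# An illustrative (not from the original source) note on the recency weight built just
# below: weight = 1 - (time_max - t + 0.01) / (time_max - time_min + 0.01), so a click at
# the query time gets a weight near 1, the oldest click in the log gets exactly 0, and
# the 0.01 keeps the ratio defined when all timestamps coincide. Items missing from
# item_weight_dict fall back to a weight of 0.5 inside ReComputeSim. For example, with
# time_min = 10 and time_max = 20 (fractional days):
#
#     >>> w = lambda t: 1 - (20 - t + 0.01) / (20 - 10 + 0.01)
#     >>> [round(w(t), 3) for t in (20, 15, 10)]
#     [0.999, 0.5, 0.0]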
candidate_item_list = list(offline_tmp['item_id']) time_min = min(all_click['timestamp']) time_max = row['timestamp'] df_tmp = all_click[all_click['user_id']==row['user_id']] df_tmp = df_tmp.reset_index(drop=True) df_tmp['weight'] = 1 - (time_max-df_tmp['timestamp']+0.01) / (time_max-time_min+0.01) item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight'])) interacted_items = user_item_tmp[row['user_id']] interacted_items = interacted_items[::-1] sim_list_tmp = ReComputeSim(LHN1_sim_list_new,candidate_item_list,interacted_items,item_weight_dict) lhn1_sim_list += sim_list_tmp LHN1_sim_list_new = [] # In[ ]: # In[10]: offline.shape # In[11]: len(lhn1_sim_list) # In[ ]: # In[12]: sim_df = pd.DataFrame() sim_df['user_id'] = user_id_list sim_df['item_id'] = item_id_list sim_df['cn_sim'] = cn_sim_list sim_df['hpi_sim'] = hpi_sim_list sim_df['hdi_sim'] = hdi_sim_list sim_df['lhn1_sim'] = lhn1_sim_list # In[13]: sim_df.shape # In[14]: offline = offline.merge(sim_df,on=['user_id','item_id']) # In[ ]: # In[17]: offline.to_csv('./user_data/model_1/new_recall/'+ file_name + '_addsim.csv',index=False) # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: ================================================ FILE: code/4_RankFeature/01_sim_feature_model1_RA_AA.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[ ]: import pandas as pd import numpy as np from tqdm import tqdm import os from collections import defaultdict import math import json from sys import stdout import pickle # In[ ]: def ReComputeSim(sim_cor,candidate_item_list,interacted_items,item_weight_dict,flag=False): sim_list = [] for j in candidate_item_list: sim_tmp = 0 for loc, i in enumerate(interacted_items): #Just for RA gernerated by offline if i not in sim_cor or j not in sim_cor[i]: continue if i in item_weight_dict: sim_tmp += sim_cor[i][j][0] * (0.7**loc) * item_weight_dict[i] if flag else sim_cor[i][j] * (0.7**loc) * item_weight_dict[i] else: sim_tmp += sim_cor[i][j][0] * (0.7**loc) * 0.5 if flag else sim_cor[i][j] * (0.7**loc) * 0.5 sim_list.append(sim_tmp) return sim_list # In[ ]: file_name = 'recall_0531_addsim' offline = pd.read_csv('./user_data/model_1/new_recall/' + file_name + '.csv') now_phase = 9 train_path = './user_data/model_1/' test_path = './user_data/model_1/' header = 'model_1' out_path = './user_data/model_1/new_similarity/' recom_item = [] whole_click = pd.DataFrame() user_id_list = [] item_id_list = [] ra_sim_list = [] aa_sim_list = [] for c in range(now_phase + 1): print('phase:', c) click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c)) click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c)) click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c)) click_train['datetime'] = pd.to_datetime(click_train['datetime']) click_test['datetime'] = pd.to_datetime(click_test['datetime']) click_query['datetime'] = pd.to_datetime(click_query['datetime']) click_train['timestamp'] = click_train['datetime'].dt.day + ( click_train['datetime'].dt.hour + (click_train['datetime'].dt.minute + click_train['datetime'].dt.second/60)/float(60) )/float(24) click_test['timestamp'] = click_test['datetime'].dt.day + ( click_test['datetime'].dt.hour + (click_test['datetime'].dt.minute + click_test['datetime'].dt.second/60)/float(60) )/float(24) click_query['timestamp'] = click_query['datetime'].dt.day + ( 
================================================
FILE: code/4_RankFeature/01_sim_feature_offline.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle

# In[2]:

def ReComputeSim(sim_cor, candidate_item_list, interacted_items, item_weight_dict, flag=False):
    sim_list = []
    for j in candidate_item_list:
        sim_tmp = 0
        for loc, i in enumerate(interacted_items):
            # `flag` is only for the RA similarity generated offline, which is stored as tuples.
            if i not in sim_cor or j not in sim_cor[i]:
                continue
            if i in item_weight_dict:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * item_weight_dict[i] if flag else sim_cor[i][j] * (0.7**loc) * item_weight_dict[i]
            else:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * 0.5 if flag else sim_cor[i][j] * (0.7**loc) * 0.5
        sim_list.append(sim_tmp)
    return sim_list

# In[3]:

file_name = 'recall_0531'
offline = pd.read_csv('./user_data/offline/new_recall/' + file_name + '.csv')
now_phase = 9
train_path = './user_data/offline/'
test_path = './user_data/offline/'
header = 'offline'
out_path = './user_data/offline/new_similarity/'

recom_item = []
whole_click = pd.DataFrame()
user_id_list = []
item_id_list = []
item_sim_list = []
ra_sim_list = []
aa_sim_list = []
cn_sim_list = []
txt_sim_list = []
hdi_sim_list = []
hpi_sim_list = []
lhn1_sim_list = []

for c in range(now_phase + 1):
    print('phase:', c)
    click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c))
    click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c))
    click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c))

    click_train['datetime'] = pd.to_datetime(click_train['datetime'])
    click_test['datetime'] = pd.to_datetime(click_test['datetime'])
    click_query['datetime'] = pd.to_datetime(click_query['datetime'])

    click_train['timestamp'] = click_train['datetime'].dt.day + (
        click_train['datetime'].dt.hour
        + (click_train['datetime'].dt.minute + click_train['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_test['timestamp'] = click_test['datetime'].dt.day + (
        click_test['datetime'].dt.hour
        + (click_test['datetime'].dt.minute + click_test['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_query['timestamp'] = click_query['datetime'].dt.day + (
        click_query['datetime'].dt.hour
        + (click_query['datetime'].dt.minute + click_query['datetime'].dt.second/60)/float(60)
    )/float(24)

    all_click = click_train.append(click_test)

    with open(out_path + 'user2item_new' + str(c) + '.pkl', 'rb') as f:
        user_item_tmp = pickle.load(f)

    # CN (common-neighbour) second-order similarity.
    with open(out_path + 'CN_P' + str(c) + '_new.pkl', 'rb') as f:
        CN_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(CN_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        cn_sim_list += sim_list_tmp
        item_id_list += candidate_item_list
        user_id_list += [row['user_id'] for x in candidate_item_list]
    CN_sim_list_new = []
    # HDI similarity.
    with open(out_path + 'HDI_P' + str(c) + '_new.pkl', 'rb') as f:
        HDI_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(HDI_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        hdi_sim_list += sim_list_tmp
    HDI_sim_list_new = []

    # HPI similarity.
    with open(out_path + 'HPI_P' + str(c) + '_new.pkl', 'rb') as f:
        HPI_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(HPI_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        hpi_sim_list += sim_list_tmp
    HPI_sim_list_new = []

    # LHN1 similarity.
    with open(out_path + 'LHN1_P' + str(c) + '_new.pkl', 'rb') as f:
        LHN1_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(LHN1_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        lhn1_sim_list += sim_list_tmp
    LHN1_sim_list_new = []

# In[4]:

offline.shape

# In[5]:

len(lhn1_sim_list)

# In[6]:

sim_df = pd.DataFrame()
sim_df['user_id'] = user_id_list
sim_df['item_id'] = item_id_list
sim_df['cn_sim'] = cn_sim_list
sim_df['hpi_sim'] = hpi_sim_list
sim_df['hdi_sim'] = hdi_sim_list
sim_df['lhn1_sim'] = lhn1_sim_list

# In[7]:

sim_df.shape

# In[8]:

offline = offline.merge(sim_df, on=['user_id', 'item_id'])

# In[9]:

offline.to_csv('./user_data/offline/new_recall/' + file_name + '_addsim.csv', index=False)
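------------------------------------------------
NOTE: These scripts were written against an older pandas; DataFrame.append
was removed in pandas 2.0. If the code needs to run on a current pandas, a
drop-in equivalent (toy frames, just to be self-contained):

import pandas as pd

click_train = pd.DataFrame({'user_id': [1], 'item_id': [10], 'time': [0.5]})
click_test = pd.DataFrame({'user_id': [2], 'item_id': [20], 'time': [0.6]})

# Replacement for `all_click = click_train.append(click_test)`.
all_click = pd.concat([click_train, click_test], ignore_index=True)
------------------------------------------------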
================================================
FILE: code/4_RankFeature/01_sim_feature_offline_RA_AA.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle

# In[2]:

def ReComputeSim(sim_cor, candidate_item_list, interacted_items, item_weight_dict, flag=False):
    sim_list = []
    for j in candidate_item_list:
        sim_tmp = 0
        for loc, i in enumerate(interacted_items):
            # `flag` is only for the RA similarity generated offline, which is stored as tuples.
            if i not in sim_cor or j not in sim_cor[i]:
                continue
            if i in item_weight_dict:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * item_weight_dict[i] if flag else sim_cor[i][j] * (0.7**loc) * item_weight_dict[i]
            else:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * 0.5 if flag else sim_cor[i][j] * (0.7**loc) * 0.5
        sim_list.append(sim_tmp)
    return sim_list

# In[3]:

file_name = 'recall_0531_addsim'
offline = pd.read_csv('./user_data/offline/new_recall/' + file_name + '.csv')
now_phase = 9
train_path = './user_data/offline/'
test_path = './user_data/offline/'
header = 'offline'
out_path = './user_data/offline/new_similarity/'

recom_item = []
whole_click = pd.DataFrame()
user_id_list = []
item_id_list = []
ra_sim_list = []
aa_sim_list = []

for c in range(now_phase + 1):
    print('phase:', c)
    click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c))
    click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c))
    click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c))

    click_train['datetime'] = pd.to_datetime(click_train['datetime'])
    click_test['datetime'] = pd.to_datetime(click_test['datetime'])
    click_query['datetime'] = pd.to_datetime(click_query['datetime'])

    click_train['timestamp'] = click_train['datetime'].dt.day + (
        click_train['datetime'].dt.hour
        + (click_train['datetime'].dt.minute + click_train['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_test['timestamp'] = click_test['datetime'].dt.day + (
        click_test['datetime'].dt.hour
        + (click_test['datetime'].dt.minute + click_test['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_query['timestamp'] = click_query['datetime'].dt.day + (
        click_query['datetime'].dt.hour
        + (click_query['datetime'].dt.minute + click_query['datetime'].dt.second/60)/float(60)
    )/float(24)

    all_click = click_train.append(click_test)

    with open(out_path + 'user2item_new' + str(c) + '.pkl', 'rb') as f:
        user_item_tmp = pickle.load(f)

    # RA (resource allocation) similarity.
    with open(out_path + 'RA_P' + str(c) + '_new.pkl', 'rb') as f:
        RA_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(RA_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        ra_sim_list += sim_list_tmp
        item_id_list += candidate_item_list
        user_id_list += [row['user_id'] for x in candidate_item_list]
    RA_sim_list_new = []

    # AA (Adamic-Adar) similarity.
    with open(out_path + 'AA_P' + str(c) + '_new.pkl', 'rb') as f:
        AA_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(AA_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        aa_sim_list += sim_list_tmp
    AA_sim_list_new = []

# In[4]:

offline.shape

# In[5]:

sim_df = pd.DataFrame()
sim_df['user_id'] = user_id_list
sim_df['item_id'] = item_id_list
sim_df['ra_sim'] = ra_sim_list
sim_df['aa_sim'] = aa_sim_list

# In[6]:

sim_df.shape

# In[7]:

offline = offline.merge(sim_df, on=['user_id', 'item_id'])

# In[8]:

offline.to_csv('./user_data/offline/new_recall/' + file_name + '_addAA_RA.csv', index=False)
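------------------------------------------------
NOTE: Each pass over click_query re-filters the full click log per row
(all_click[all_click['user_id'] == ...]), which scans the whole DataFrame for
every query. A hedged optimization sketch, not in the original code: group the
log by user once and look sub-frames up from a dict:

import pandas as pd

all_click = pd.DataFrame({'user_id': [1, 1, 2],
                          'item_id': [10, 11, 12],
                          'timestamp': [0.1, 0.2, 0.3]})

# Build the per-user index once, outside the query loop.
clicks_by_user = {uid: g for uid, g in all_click.groupby('user_id')}

df_tmp = clicks_by_user[1]   # replaces all_click[all_click['user_id'] == 1]
print(len(df_tmp))           # 2
------------------------------------------------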
================================================
FILE: code/4_RankFeature/01_sim_feature_online.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle

# In[2]:

def ReComputeSim(sim_cor, candidate_item_list, interacted_items, item_weight_dict, flag=False):
    sim_list = []
    for j in candidate_item_list:
        sim_tmp = 0
        for loc, i in enumerate(interacted_items):
            # `flag` is only for the RA similarity generated offline, which is stored as tuples.
            if i not in sim_cor or j not in sim_cor[i]:
                continue
            if i in item_weight_dict:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * item_weight_dict[i] if flag else sim_cor[i][j] * (0.7**loc) * item_weight_dict[i]
            else:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * 0.5 if flag else sim_cor[i][j] * (0.7**loc) * 0.5
        sim_list.append(sim_tmp)
    return sim_list

# In[3]:

file_name = 'recall_0531'
offline = pd.read_csv('./user_data/dataset/new_recall/' + file_name + '.csv')
now_phase = 9
train_path = './user_data/dataset/'
test_path = './user_data/dataset/'
header = 'underexpose'
out_path = './user_data/dataset/new_similarity/'

recom_item = []
whole_click = pd.DataFrame()
user_id_list = []
item_id_list = []
item_sim_list = []
ra_sim_list = []
aa_sim_list = []
cn_sim_list = []
txt_sim_list = []
hdi_sim_list = []
hpi_sim_list = []
lhn1_sim_list = []

for c in range(now_phase + 1):
    print('phase:', c)
    click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c))
    click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c))
    click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c))

    click_train['datetime'] = pd.to_datetime(click_train['datetime'])
    click_test['datetime'] = pd.to_datetime(click_test['datetime'])
    click_query['datetime'] = pd.to_datetime(click_query['datetime'])

    click_train['timestamp'] = click_train['datetime'].dt.day + (
        click_train['datetime'].dt.hour
        + (click_train['datetime'].dt.minute + click_train['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_test['timestamp'] = click_test['datetime'].dt.day + (
        click_test['datetime'].dt.hour
        + (click_test['datetime'].dt.minute + click_test['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_query['timestamp'] = click_query['datetime'].dt.day + (
        click_query['datetime'].dt.hour
        + (click_query['datetime'].dt.minute + click_query['datetime'].dt.second/60)/float(60)
    )/float(24)

    all_click = click_train.append(click_test)

    with open(out_path + 'user2item_new' + str(c) + '.pkl', 'rb') as f:
        user_item_tmp = pickle.load(f)

    # CN (common-neighbour) second-order similarity.
    with open(out_path + 'CN_P' + str(c) + '_new.pkl', 'rb') as f:
        CN_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(CN_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        cn_sim_list += sim_list_tmp
        item_id_list += candidate_item_list
        user_id_list += [row['user_id'] for x in candidate_item_list]
    CN_sim_list_new = []
    # HDI similarity.
    with open(out_path + 'HDI_P' + str(c) + '_new.pkl', 'rb') as f:
        HDI_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(HDI_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        hdi_sim_list += sim_list_tmp
    HDI_sim_list_new = []

    # HPI similarity.
    with open(out_path + 'HPI_P' + str(c) + '_new.pkl', 'rb') as f:
        HPI_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(HPI_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        hpi_sim_list += sim_list_tmp
    HPI_sim_list_new = []

    # LHN1 similarity.
    with open(out_path + 'LHN1_P' + str(c) + '_new.pkl', 'rb') as f:
        LHN1_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(LHN1_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        lhn1_sim_list += sim_list_tmp
    LHN1_sim_list_new = []

# In[4]:

offline.shape

# In[5]:

len(lhn1_sim_list)

# In[6]:

sim_df = pd.DataFrame()
sim_df['user_id'] = user_id_list
sim_df['item_id'] = item_id_list
sim_df['cn_sim'] = cn_sim_list
sim_df['hpi_sim'] = hpi_sim_list
sim_df['hdi_sim'] = hdi_sim_list
sim_df['lhn1_sim'] = lhn1_sim_list

# In[7]:

sim_df.shape

# In[8]:

offline = offline.merge(sim_df, on=['user_id', 'item_id'])

# In[9]:

offline.to_csv('./user_data/dataset/new_recall/' + file_name + '_addsim.csv', index=False)

# In[10]:

offline.shape
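------------------------------------------------
NOTE: The feature join relies on the recall table having exactly one row per
(user_id, item_id) pair and on sim_df being built in the same order; the
offline.shape / len(lhn1_sim_list) cells above are informal checks of that.
A hedged sketch that makes the check explicit via pandas' merge validation:

import pandas as pd

offline = pd.DataFrame({'user_id': [1, 1], 'item_id': [10, 11]})
sim_df = pd.DataFrame({'user_id': [1, 1], 'item_id': [10, 11], 'cn_sim': [0.5, 0.2]})

# validate='one_to_one' raises MergeError if either side has duplicate
# (user_id, item_id) pairs, which would otherwise silently duplicate rows.
merged = offline.merge(sim_df, on=['user_id', 'item_id'], validate='one_to_one')
assert len(merged) == len(offline)
------------------------------------------------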
================================================
FILE: code/4_RankFeature/01_sim_feature_online_RA_AA.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import defaultdict
import math
import json
from sys import stdout
import pickle

# In[2]:

def ReComputeSim(sim_cor, candidate_item_list, interacted_items, item_weight_dict, flag=False):
    sim_list = []
    for j in candidate_item_list:
        sim_tmp = 0
        for loc, i in enumerate(interacted_items):
            # `flag` is only for the RA similarity generated offline, which is stored as tuples.
            if i not in sim_cor or j not in sim_cor[i]:
                continue
            if i in item_weight_dict:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * item_weight_dict[i] if flag else sim_cor[i][j] * (0.7**loc) * item_weight_dict[i]
            else:
                sim_tmp += sim_cor[i][j][0] * (0.7**loc) * 0.5 if flag else sim_cor[i][j] * (0.7**loc) * 0.5
        sim_list.append(sim_tmp)
    return sim_list

# In[ ]:

file_name = 'recall_0531_addsim'
offline = pd.read_csv('./user_data/dataset/new_recall/' + file_name + '.csv')
now_phase = 9
train_path = './user_data/dataset/'
test_path = './user_data/dataset/'
header = 'underexpose'
out_path = './user_data/dataset/new_similarity/'

recom_item = []
whole_click = pd.DataFrame()
user_id_list = []
item_id_list = []
ra_sim_list = []
aa_sim_list = []

for c in range(now_phase + 1):
    print('phase:', c)
    click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c))
    click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c))
    click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c))

    click_train['datetime'] = pd.to_datetime(click_train['datetime'])
    click_test['datetime'] = pd.to_datetime(click_test['datetime'])
    click_query['datetime'] = pd.to_datetime(click_query['datetime'])

    click_train['timestamp'] = click_train['datetime'].dt.day + (
        click_train['datetime'].dt.hour
        + (click_train['datetime'].dt.minute + click_train['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_test['timestamp'] = click_test['datetime'].dt.day + (
        click_test['datetime'].dt.hour
        + (click_test['datetime'].dt.minute + click_test['datetime'].dt.second/60)/float(60)
    )/float(24)
    click_query['timestamp'] = click_query['datetime'].dt.day + (
        click_query['datetime'].dt.hour
        + (click_query['datetime'].dt.minute + click_query['datetime'].dt.second/60)/float(60)
    )/float(24)

    all_click = click_train.append(click_test)

    with open(out_path + 'user2item_new' + str(c) + '.pkl', 'rb') as f:
        user_item_tmp = pickle.load(f)

    # RA (resource allocation) similarity.
    with open(out_path + 'RA_P' + str(c) + '_new.pkl', 'rb') as f:
        RA_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(RA_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        ra_sim_list += sim_list_tmp
        item_id_list += candidate_item_list
        user_id_list += [row['user_id'] for x in candidate_item_list]
    RA_sim_list_new = []
    # AA (Adamic-Adar) similarity.
    with open(out_path + 'AA_P' + str(c) + '_new.pkl', 'rb') as f:
        AA_sim_list_new = pickle.load(f)
    for i, row in click_query.iterrows():
        offline_tmp = offline[offline['user_id'] == row['user_id']]
        candidate_item_list = list(offline_tmp['item_id'])
        time_min = min(all_click['timestamp'])
        time_max = row['timestamp']
        df_tmp = all_click[all_click['user_id'] == row['user_id']]
        df_tmp = df_tmp.reset_index(drop=True)
        df_tmp['weight'] = 1 - (time_max - df_tmp['timestamp'] + 0.01) / (time_max - time_min + 0.01)
        item_weight_dict = dict(zip(df_tmp['item_id'], df_tmp['weight']))
        interacted_items = user_item_tmp[row['user_id']]
        interacted_items = interacted_items[::-1]
        sim_list_tmp = ReComputeSim(AA_sim_list_new, candidate_item_list, interacted_items, item_weight_dict)
        aa_sim_list += sim_list_tmp
    AA_sim_list_new = []

# In[ ]:

offline.shape

# In[ ]:

sim_df = pd.DataFrame()
sim_df['user_id'] = user_id_list
sim_df['item_id'] = item_id_list
sim_df['ra_sim'] = ra_sim_list
sim_df['aa_sim'] = aa_sim_list

# In[ ]:

sim_df.shape

# In[ ]:

offline = offline.merge(sim_df, on=['user_id', 'item_id'])

# In[ ]:

offline.to_csv('./user_data/dataset/new_recall/' + file_name + '_addAA_RA.csv', index=False)
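------------------------------------------------
NOTE: The six 01_sim_feature_* scripts are identical except for their
constants: the data directory, the file-name prefix (`header`), and which
similarity pickles they read. A hypothetical refactoring sketch (not in the
repository) showing how one driver could cover all three variants:

# Refactoring sketch: the model_1 / offline / online variants differ only here.
CONFIGS = {
    'model_1': dict(base='./user_data/model_1/', header='model_1'),
    'offline': dict(base='./user_data/offline/', header='offline'),
    'online':  dict(base='./user_data/dataset/', header='underexpose'),
}

def paths_for(variant, phase):
    cfg = CONFIGS[variant]
    return {
        'clicks':  cfg['base'] + '{}_train_click_{}_time.csv'.format(cfg['header'], phase),
        'queries': cfg['base'] + '{}_test_qtime_{}_time.csv'.format(cfg['header'], phase),
        'sims':    cfg['base'] + 'new_similarity/',
    }

print(paths_for('online', 0)['clicks'])  # ./user_data/dataset/underexpose_train_click_0_time.csv
------------------------------------------------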
================================================
FILE: code/4_RankFeature/02_itemtime_feature_model1.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np

# In[2]:

def extractItemCount(df, df_qTime, df_click, intervals, col_name):
    # Count, per item and per time-of-day interval, how often the item was clicked.
    df_click = getTimeInterval(df_click, intervals)
    if 'time_interval' not in df.columns:
        df_qTime = getTimeInterval(df_qTime, intervals)
        df = df.merge(df_qTime[['user_id', 'time_interval']])
    df_click_sta = df_click[['user_id', 'item_id', 'time_interval']].groupby(by=['item_id', 'time_interval'], as_index=False).count()
    df_click_sta.columns = ['item_id', 'time_interval', col_name]
    df = df.merge(df_click_sta, on=['item_id', 'time_interval'], how='left')
    return df

# In[3]:

def getTimeInterval(df, intervals):
    df['hour_minute'] = (df['datetime'].dt.hour + df['datetime'].dt.minute/60)/24
    time_interval_list = np.linspace(0, 1, intervals)
    df['time_interval'] = df['hour_minute'].apply(lambda x: np.where(x
    # [The source dump is truncated here: the rest of this file and the
    #  remaining 4_RankFeature scripts are missing. The text resumes below,
    #  mid-way through the 5_Modeling code.]

offline_train = offline_train.reset_index(drop=True)

# In[49]:

col_sel = [x for x in offline_train.columns if x not in ['user_item_count_max_time', 'user_item_count_min_time',
                                                         'time_interval', 'item_count_4h', 'phrase', 'item_count_6h',
                                                         'is_user_count_climax', 'item_count_2h', 'is_user_count_lowerpoint',
                                                         'item_count_1h']]

# In[50]:

len(col_sel)

# # Offline

# In[51]:

now_phase = 9
train_path = './user_data/offline/'
test_path = './user_data/offline/'
header = 'offline'

item_sim_list = []
item_cnt_list = []
user_item = []
whole_click = pd.DataFrame()
for c in range(7, now_phase + 1):
    print('phase:', c)
    click_train = pd.read_csv(train_path + header + '_train_click_{}_time.csv'.format(c))
    click_test = pd.read_csv(test_path + header + '_test_click_{}_time.csv'.format(c))
    click_query = pd.read_csv(test_path + header + '_test_qtime_{}_time.csv'.format(c))

    all_click = click_train.append(click_test)
    whole_click = whole_click.append(all_click)

whole_click = whole_click.drop_duplicates(subset=['user_id', 'item_id', 'time'], keep='last')
whole_click = whole_click.sort_values('time')
whole_click = whole_click.reset_index(drop=True)

# find most popular items (top 500 by click count, despite the variable name)
top50_click = whole_click['item_id'].value_counts().index[:500].values
top50_click = ','.join([str(i) for i in top50_click])

# In[162]:

model_train = model1_train
#model_train = pd.concat([model1_train,offline_train])
#model_train = model_train.reset_index(drop=True)

model_train_p = model_train[model_train['future_click'] == 1]
model_train_p = model_train_p.reset_index(drop=True)
model_train_n = model_train[model_train['future_click'] == 0]
model_train_n = model_train_n.reset_index(drop=True)

# In[164]:

model_train_p.shape

# In[165]:

online_train = offline_train

# In[166]:

online_train.shape

# In[167]:

import random

def generateDataset(df_n, df_p, random_seed):
    # Draw five negatives per positive under a fixed seed, then concatenate.
    random.seed(random_seed)
    n_index = random.sample(list(range(len(df_n))), len(df_p)*5)
    df_ns = df_n.loc[n_index]
    df = pd.concat([df_ns, df_p])
    df = df.reset_index(drop=True)
    return df

model_train_s_1 = generateDataset(model_train_n, model_train_p, 2020)
model_train_s_2 = generateDataset(model_train_n, model_train_p, 0)
model_train_s_3 = generateDataset(model_train_n, model_train_p, 2019)
model_train_s_4 = generateDataset(model_train_n, model_train_p, 1000)
model_train_s_5 = generateDataset(model_train_n, model_train_p, 3000)
model_train_s_6 = generateDataset(model_train_n, model_train_p, 2021)
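------------------------------------------------
NOTE: generateDataset draws five negatives per positive under a fixed seed;
the six differently-seeded copies (model_train_s_1 ... model_train_s_6) give
the downstream rankers diversified training sets to average over. A minimal
toy run of the same 5:1 sampling:

import random
import pandas as pd

def generate_dataset(df_n, df_p, random_seed):
    # Same 5:1 negative sampling as generateDataset above, on toy data.
    random.seed(random_seed)
    n_index = random.sample(list(range(len(df_n))), len(df_p) * 5)
    return pd.concat([df_n.loc[n_index], df_p]).reset_index(drop=True)

df_p = pd.DataFrame({'label': [1] * 2})          # 2 positives
df_n = pd.DataFrame({'label': [0] * 100})        # plenty of negatives
sample = generate_dataset(df_n, df_p, 2020)
print(sample['label'].value_counts().to_dict())  # {0: 10, 1: 2}
------------------------------------------------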
# In[168]:

def addWeightForDataSet(df, item_degree_median, weight):
    # Upweight samples whose item click count falls below the phase median.
    df['sample_weight'] = df['count']/item_degree_median
    df['sample_weight'] = df['sample_weight'].apply(lambda x: 5 if x < 1 else 1)
    df.loc[(df['count']
    # [The source dump is truncated here; the text resumes below, apparently
    #  in the online counterpart of this modeling script.]

online_train = online_train.reset_index(drop=True)

# In[16]:

import random

def generateDataset(df_n, df_p, random_seed):
    # Draw five negatives per positive under a fixed seed, then concatenate.
    random.seed(random_seed)
    n_index = random.sample(list(range(len(df_n))), len(df_p)*5)
    df_ns = df_n.loc[n_index]
    df = pd.concat([df_ns, df_p])
    df = df.reset_index(drop=True)
    return df

model_train_s_1 = generateDataset(model_train_n, model_train_p, 2020)
model_train_s_2 = generateDataset(model_train_n, model_train_p, 0)
model_train_s_3 = generateDataset(model_train_n, model_train_p, 2019)
model_train_s_4 = generateDataset(model_train_n, model_train_p, 1000)
model_train_s_5 = generateDataset(model_train_n, model_train_p, 3000)
model_train_s_6 = generateDataset(model_train_n, model_train_p, 2021)

# In[17]:

def addWeightForDataSet(df, item_degree_median, weight):
    df['sample_weight'] = df['count']/item_degree_median
    df['sample_weight'] = df['sample_weight'].apply(lambda x: 5 if x < 1 else 1)
    df.loc[(df['count']
    # [the source dump ends here, mid-statement]