[
  {
    "path": "README.md",
    "content": "# Pytorch-BPR\n\nNote that I use the two sub datasets provided by Xiangnan's [repo](https://github.com/hexiangnan/neural_collaborative_filtering/tree/master/Data). Another pytorch NCF implementaion can be found at this [repo](https://github.com/guoyang9/NCF).\n\nI utilized a factor number **32**, and posted the results in the NCF paper and this implementation here. Since there is no specific numbers in their paper, I found this implementation achieved a better performance than the original curve. Moreover, the batch_size is not very sensitive with the final model performance.\n\nModels \t\t\t| MovieLens HR@10 | MovieLens NDCG@10 | Pinterest HR@10 | Pinterest NDCG@10\n------ \t\t\t| --------------- | ----------------- | --------------- | -----------------\npytorch-BPR    \t| 0.700 \t\t  | 0.418             | 0.877 \t\t\t| 0.551\n\n\n## The requirements are as follows:\n\t* python==3.6\n\t* pandas==0.24.2\n\t* numpy==1.16.2\n\t* pytorch==1.0.1\n\t* tensorboardX==1.6 (mainly useful when you want to visulize the loss, see https://github.com/lanpa/tensorboard-pytorch)\n\n## Example to run:\n```\npython main.py --factor_num=16 --lamda=0.001\n```\n"
  },
  {
    "path": "config.py",
    "content": "# dataset name \ndataset = 'ml-1m'\nassert dataset in ['ml-1m', 'pinterest-20']\n\n# paths\nmain_path = '/home/share/guoyangyang/recommendation/NCF-Data/'\n\ntrain_rating = main_path + '{}.train.rating'.format(dataset)\ntest_rating = main_path + '{}.test.rating'.format(dataset)\ntest_negative = main_path + '{}.test.negative'.format(dataset)\n\nmodel_path = './models/'\nBPR_model_path = model_path + 'NeuMF.pth'\n"
  },
  {
    "path": "data_utils.py",
    "content": "import numpy as np \nimport pandas as pd \nimport scipy.sparse as sp\n\nimport torch.utils.data as data\n\nimport config\n\n\ndef load_all(test_num=100):\n\t\"\"\" We load all the three file here to save time in each epoch. \"\"\"\n\ttrain_data = pd.read_csv(\n\t\tconfig.train_rating, \n\t\tsep='\\t', header=None, names=['user', 'item'], \n\t\tusecols=[0, 1], dtype={0: np.int32, 1: np.int32})\n\n\tuser_num = train_data['user'].max() + 1\n\titem_num = train_data['item'].max() + 1\n\n\ttrain_data = train_data.values.tolist()\n\n\t# load ratings as a dok matrix\n\ttrain_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)\n\tfor x in train_data:\n\t\ttrain_mat[x[0], x[1]] = 1.0\n\n\ttest_data = []\n\twith open(config.test_negative, 'r') as fd:\n\t\tline = fd.readline()\n\t\twhile line != None and line != '':\n\t\t\tarr = line.split('\\t')\n\t\t\tu = eval(arr[0])[0]\n\t\t\ttest_data.append([u, eval(arr[0])[1]])\n\t\t\tfor i in arr[1:]:\n\t\t\t\ttest_data.append([u, int(i)])\n\t\t\tline = fd.readline()\n\treturn train_data, test_data, user_num, item_num, train_mat\n\n\nclass BPRData(data.Dataset):\n\tdef __init__(self, features, \n\t\t\t\tnum_item, train_mat=None, num_ng=0, is_training=None):\n\t\tsuper(BPRData, self).__init__()\n\t\t\"\"\" Note that the labels are only useful when training, we thus \n\t\t\tadd them in the ng_sample() function.\n\t\t\"\"\"\n\t\tself.features = features\n\t\tself.num_item = num_item\n\t\tself.train_mat = train_mat\n\t\tself.num_ng = num_ng\n\t\tself.is_training = is_training\n\n\tdef ng_sample(self):\n\t\tassert self.is_training, 'no need to sampling when testing'\n\n\t\tself.features_fill = []\n\t\tfor x in self.features:\n\t\t\tu, i = x[0], x[1]\n\t\t\tfor t in range(self.num_ng):\n\t\t\t\tj = np.random.randint(self.num_item)\n\t\t\t\twhile (u, j) in self.train_mat:\n\t\t\t\t\tj = np.random.randint(self.num_item)\n\t\t\t\tself.features_fill.append([u, i, j])\n\n\tdef __len__(self):\n\t\treturn self.num_ng * len(self.features) if \\\n\t\t\t\tself.is_training else len(self.features)\n\n\tdef __getitem__(self, idx):\n\t\tfeatures = self.features_fill if \\\n\t\t\t\tself.is_training else self.features\n\n\t\tuser = features[idx][0]\n\t\titem_i = features[idx][1]\n\t\titem_j = features[idx][2] if \\\n\t\t\t\tself.is_training else features[idx][1]\n\t\treturn user, item_i, item_j \n\t\t"
  },
  {
    "path": "evaluate.py",
    "content": "import numpy as np\nimport torch\n\n\ndef hit(gt_item, pred_items):\n\tif gt_item in pred_items:\n\t\treturn 1\n\treturn 0\n\n\ndef ndcg(gt_item, pred_items):\n\tif gt_item in pred_items:\n\t\tindex = pred_items.index(gt_item)\n\t\treturn np.reciprocal(np.log2(index+2))\n\treturn 0\n\n\ndef metrics(model, test_loader, top_k):\n\tHR, NDCG = [], []\n\n\tfor user, item_i, item_j in test_loader:\n\t\tuser = user.cuda()\n\t\titem_i = item_i.cuda()\n\t\titem_j = item_j.cuda() # not useful when testing\n\n\t\tprediction_i, prediction_j = model(user, item_i, item_j)\n\t\t_, indices = torch.topk(prediction_i, top_k)\n\t\trecommends = torch.take(\n\t\t\t\titem_i, indices).cpu().numpy().tolist()\n\n\t\tgt_item = item_i[0].item()\n\t\tHR.append(hit(gt_item, recommends))\n\t\tNDCG.append(ndcg(gt_item, recommends))\n\n\treturn np.mean(HR), np.mean(NDCG)\n"
  },
  {
    "path": "main.py",
    "content": "import os\nimport time\nimport argparse\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch.utils.data as data\nimport torch.backends.cudnn as cudnn\nfrom tensorboardX import SummaryWriter\n\nimport model\nimport config\nimport evaluate\nimport data_utils\n\n\nparser = argparse.ArgumentParser()\nparser.add_argument(\"--lr\", \n\ttype=float, \n\tdefault=0.01, \n\thelp=\"learning rate\")\nparser.add_argument(\"--lamda\", \n\ttype=float, \n\tdefault=0.001, \n\thelp=\"model regularization rate\")\nparser.add_argument(\"--batch_size\", \n\ttype=int, \n\tdefault=4096, \n\thelp=\"batch size for training\")\nparser.add_argument(\"--epochs\", \n\ttype=int,\n\tdefault=50,  \n\thelp=\"training epoches\")\nparser.add_argument(\"--top_k\", \n\ttype=int, \n\tdefault=10, \n\thelp=\"compute metrics@top_k\")\nparser.add_argument(\"--factor_num\", \n\ttype=int,\n\tdefault=32, \n\thelp=\"predictive factors numbers in the model\")\nparser.add_argument(\"--num_ng\", \n\ttype=int,\n\tdefault=4, \n\thelp=\"sample negative items for training\")\nparser.add_argument(\"--test_num_ng\", \n\ttype=int,\n\tdefault=99, \n\thelp=\"sample part of negative items for testing\")\nparser.add_argument(\"--out\", \n\tdefault=True,\n\thelp=\"save model or not\")\nparser.add_argument(\"--gpu\", \n\ttype=str,\n\tdefault=\"0\",  \n\thelp=\"gpu card ID\")\nargs = parser.parse_args()\n\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = args.gpu\ncudnn.benchmark = True\n\n\n############################## PREPARE DATASET ##########################\ntrain_data, test_data, user_num ,item_num, train_mat = data_utils.load_all()\n\n# construct the train and test datasets\ntrain_dataset = data_utils.BPRData(\n\t\ttrain_data, item_num, train_mat, args.num_ng, True)\ntest_dataset = data_utils.BPRData(\n\t\ttest_data, item_num, train_mat, 0, False)\ntrain_loader = data.DataLoader(train_dataset,\n\t\tbatch_size=args.batch_size, shuffle=True, num_workers=4)\ntest_loader = data.DataLoader(test_dataset,\n\t\tbatch_size=args.test_num_ng+1, shuffle=False, num_workers=0)\n\n########################### CREATE MODEL #################################\nmodel = model.BPR(user_num, item_num, args.factor_num)\nmodel.cuda()\n\noptimizer = optim.SGD(\n\t\t\tmodel.parameters(), lr=args.lr, weight_decay=args.lamda)\n# writer = SummaryWriter() # for visualization\n\n########################### TRAINING #####################################\ncount, best_hr = 0, 0\nfor epoch in range(args.epochs):\n\tmodel.train() \n\tstart_time = time.time()\n\ttrain_loader.dataset.ng_sample()\n\n\tfor user, item_i, item_j in train_loader:\n\t\tuser = user.cuda()\n\t\titem_i = item_i.cuda()\n\t\titem_j = item_j.cuda()\n\n\t\tmodel.zero_grad()\n\t\tprediction_i, prediction_j = model(user, item_i, item_j)\n\t\tloss = - (prediction_i - prediction_j).sigmoid().log().sum()\n\t\tloss.backward()\n\t\toptimizer.step()\n\t\t# writer.add_scalar('data/loss', loss.item(), count)\n\t\tcount += 1\n\n\tmodel.eval()\n\tHR, NDCG = evaluate.metrics(model, test_loader, args.top_k)\n\n\telapsed_time = time.time() - start_time\n\tprint(\"The time elapse of epoch {:03d}\".format(epoch) + \" is: \" + \n\t\t\ttime.strftime(\"%H: %M: %S\", time.gmtime(elapsed_time)))\n\tprint(\"HR: {:.3f}\\tNDCG: {:.3f}\".format(np.mean(HR), np.mean(NDCG)))\n\n\tif HR > best_hr:\n\t\tbest_hr, best_ndcg, best_epoch = HR, NDCG, epoch\n\t\tif args.out:\n\t\t\tif not os.path.exists(config.model_path):\n\t\t\t\tos.mkdir(config.model_path)\n\t\t\ttorch.save(model, '{}BPR.pt'.format(config.model_path))\n\nprint(\"End. Best epoch {:03d}: HR = {:.3f}, \\\n\tNDCG = {:.3f}\".format(best_epoch, best_hr, best_ndcg))\n"
  },
  {
    "path": "model.py",
    "content": "import torch\nimport torch.nn as nn\n\n\nclass BPR(nn.Module):\n\tdef __init__(self, user_num, item_num, factor_num):\n\t\tsuper(BPR, self).__init__()\n\t\t\"\"\"\n\t\tuser_num: number of users;\n\t\titem_num: number of items;\n\t\tfactor_num: number of predictive factors.\n\t\t\"\"\"\t\t\n\t\tself.embed_user = nn.Embedding(user_num, factor_num)\n\t\tself.embed_item = nn.Embedding(item_num, factor_num)\n\n\t\tnn.init.normal_(self.embed_user.weight, std=0.01)\n\t\tnn.init.normal_(self.embed_item.weight, std=0.01)\n\n\tdef forward(self, user, item_i, item_j):\n\t\tuser = self.embed_user(user)\n\t\titem_i = self.embed_item(item_i)\n\t\titem_j = self.embed_item(item_j)\n\n\t\tprediction_i = (user * item_i).sum(dim=-1)\n\t\tprediction_j = (user * item_j).sum(dim=-1)\n\t\treturn prediction_i, prediction_j\n"
  }
]