Repository: guoyang9/BPR-pytorch
Branch: master
Commit: a21fbcde6ab8
Files: 6
Total size: 8.4 KB
Directory structure:
gitextract_6cyprr6l/
├── README.md
├── config.py
├── data_utils.py
├── evaluate.py
├── main.py
└── model.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
# Pytorch-BPR
Note that I use the two sub-datasets provided by Xiangnan's [repo](https://github.com/hexiangnan/neural_collaborative_filtering/tree/master/Data). Another PyTorch NCF implementation can be found at this [repo](https://github.com/guoyang9/NCF).
I use a factor number of **32** and post the results of this implementation here for comparison with the NCF paper. Since there are no specific numbers in their paper, I can only note that this implementation achieves better performance than the original curve. Moreover, the final model performance is not very sensitive to the batch_size.
Models | MovieLens HR@10 | MovieLens NDCG@10 | Pinterest HR@10 | Pinterest NDCG@10
------ | --------------- | ----------------- | --------------- | -----------------
pytorch-BPR | 0.700 | 0.418 | 0.877 | 0.551
## The requirements are as follows:
* python==3.6
* pandas==0.24.2
* numpy==1.16.2
* pytorch==1.0.1
* tensorboardX==1.6 (mainly useful when you want to visualize the loss, see https://github.com/lanpa/tensorboard-pytorch)
## Example to run:
```
python main.py --factor_num=16 --lamda=0.001
```
================================================
FILE: config.py
================================================
# Name of the dataset to train/evaluate on; must be one of the two
# supported NCF sub-datasets.
dataset = 'ml-1m'
assert dataset in ('ml-1m', 'pinterest-20')

# Root directory holding the '<dataset>.train.rating', '<dataset>.test.rating'
# and '<dataset>.test.negative' files (machine-specific; adjust locally).
main_path = '/home/share/guoyangyang/recommendation/NCF-Data/'

train_rating = f'{main_path}{dataset}.train.rating'
test_rating = f'{main_path}{dataset}.test.rating'
test_negative = f'{main_path}{dataset}.test.negative'

# Directory for saved models.
model_path = './models/'
# NOTE(review): main.py saves to '{}BPR.pt'.format(model_path) and does not
# read this constant; the 'NeuMF.pth' name looks like a leftover -- confirm.
BPR_model_path = f'{model_path}NeuMF.pth'
================================================
FILE: data_utils.py
================================================
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch.utils.data as data
import config
def load_all(test_num=100):
    """Load the training pairs and the test candidates once, up front.

    Reads ``config.train_rating`` (tab-separated; the first two columns are
    used as user and item ids) and ``config.test_negative`` (tab-separated;
    the first field is a ``(user, item)`` literal, the remaining fields are
    integer item ids for that user).

    Args:
        test_num: unused; kept so existing callers keep working.

    Returns:
        A tuple ``(train_data, test_data, user_num, item_num, train_mat)``:
        ``train_data`` is a list of ``[user, item]`` pairs, ``test_data`` is
        a list of ``[user, item]`` pairs in which each test line contributes
        its ``(user, item)`` pair first followed by one pair per remaining
        field, ``user_num``/``item_num`` are the largest training ids plus
        one, and ``train_mat`` is a ``scipy.sparse.dok_matrix`` holding 1.0
        at every training ``(user, item)`` position.
    """
    # Local import: only needed here, to parse the "(user, item)" prefix
    # without executing file contents the way eval() would.
    import ast

    train_data = pd.read_csv(
        config.train_rating,
        sep='\t', header=None, names=['user', 'item'],
        usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

    user_num = train_data['user'].max() + 1
    item_num = train_data['item'].max() + 1

    train_data = train_data.values.tolist()

    # load ratings as a dok matrix
    train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    for user, item in train_data:
        train_mat[user, item] = 1.0

    test_data = []
    with open(config.test_negative, 'r') as fd:
        for line in fd:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split('\t')
            pair = ast.literal_eval(fields[0])
            user = pair[0]
            # The pair from the first field goes first; evaluation relies on
            # this ordering within each block of candidates.
            test_data.append([user, pair[1]])
            test_data.extend([user, int(item)] for item in fields[1:])
    return train_data, test_data, user_num, item_num, train_mat
class BPRData(data.Dataset):
    """Dataset yielding ``(user, item_i, item_j)`` triples for BPR.

    In training mode every ``[user, item]`` pair in ``features`` is expanded
    by ``ng_sample()`` into ``num_ng`` triples whose third element is a
    randomly drawn item ``j`` with ``(user, j)`` not in ``train_mat``.
    Outside training mode the pairs are served as they are and ``item_j``
    simply repeats ``item_i``.
    """

    def __init__(self, features,
                 num_item, train_mat=None, num_ng=0, is_training=None):
        """
        features: sequence of [user, item] pairs;
        num_item: number of items, negatives are drawn from range(num_item);
        train_mat: container supporting ``(user, item) in train_mat``;
        num_ng: number of negatives sampled per training pair;
        is_training: truthy for the training set, falsy for the test set.

        Note that the negative items are only useful when training, we thus
        add them in the ng_sample() function.
        """
        super(BPRData, self).__init__()
        self.features = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        # Filled by ng_sample(); None means "not sampled yet" so that
        # __getitem__ can report the misuse clearly.
        self.features_fill = None

    def ng_sample(self):
        """Re-draw the negative items; call once before every training epoch.

        NOTE: the rejection loop never terminates for a user whose
        (user, j) is in ``train_mat`` for every j in range(num_item).
        """
        assert self.is_training, 'no need to sampling when testing'

        sampled = []
        for x in self.features:
            u, i = x[0], x[1]
            for _ in range(self.num_ng):
                j = np.random.randint(self.num_item)
                # Reject items the user already interacted with.
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                sampled.append([u, i, j])
        self.features_fill = sampled

    def __len__(self):
        # The training length is computed from num_ng rather than from
        # features_fill so it is known before the first ng_sample() call.
        if self.is_training:
            return self.num_ng * len(self.features)
        return len(self.features)

    def __getitem__(self, idx):
        if not self.is_training:
            row = self.features[idx]
            # No negative item at test time; item_i is returned twice.
            return row[0], row[1], row[1]

        if self.features_fill is None:
            raise RuntimeError(
                'ng_sample() must be called before reading training samples')
        row = self.features_fill[idx]
        return row[0], row[1], row[2]
================================================
FILE: evaluate.py
================================================
import numpy as np
import torch
def hit(gt_item, pred_items):
    """Return 1 if ``gt_item`` is among ``pred_items``, otherwise 0."""
    return 1 if gt_item in pred_items else 0
def ndcg(gt_item, pred_items):
    """Return 1 / log2(rank + 2) for the 0-based rank of ``gt_item``.

    ``pred_items`` is the ranked recommendation list; 0 is returned when
    ``gt_item`` is not in it.
    """
    if gt_item not in pred_items:
        return 0
    rank = pred_items.index(gt_item)
    return np.reciprocal(np.log2(rank + 2))
def metrics(model, test_loader, top_k):
    """Compute the mean HR@top_k and NDCG@top_k over ``test_loader``.

    Each batch is scored as a whole with ``model``; the ``top_k`` highest
    scoring items form the recommendation list, and the first item of the
    batch (``item_i[0]``) is taken as the ground-truth item.

    Args:
        model: callable returning ``(prediction_i, prediction_j)`` for
            ``(user, item_i, item_j)`` tensors.
        test_loader: iterable of ``(user, item_i, item_j)`` tensor batches.
        top_k: length of the recommendation list.

    Returns:
        ``(mean hit ratio, mean NDCG)`` over all batches.
    """
    # Follow the model's own device so evaluation also works on CPU; fall
    # back to the original CUDA behaviour when it cannot be inferred.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda')

    HR, NDCG = [], []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time without changing the scores.
    with torch.no_grad():
        for user, item_i, item_j in test_loader:
            user = user.to(device)
            item_i = item_i.to(device)
            item_j = item_j.to(device)  # not useful when testing

            prediction_i, _ = model(user, item_i, item_j)
            _, indices = torch.topk(prediction_i, top_k)
            recommends = torch.take(
                item_i, indices).cpu().numpy().tolist()

            gt_item = item_i[0].item()
            HR.append(hit(gt_item, recommends))
            NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)
================================================
FILE: main.py
================================================
import os
import time
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from tensorboardX import SummaryWriter
import model
import config
import evaluate
import data_utils
def _str2bool(value):
    """Parse a command-line boolean (``true``/``false``, ``1``/``0``, ...).

    Without this, argparse stores ``--out False`` as the non-empty string
    'False', which is truthy, so the model was saved regardless.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was before.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


# Command-line options controlling training, evaluation and output.
parser = argparse.ArgumentParser()
parser.add_argument("--lr",
                    type=float,
                    default=0.01,
                    help="learning rate")
parser.add_argument("--lamda",
                    type=float,
                    default=0.001,
                    help="model regularization rate")
parser.add_argument("--batch_size",
                    type=int,
                    default=4096,
                    help="batch size for training")
parser.add_argument("--epochs",
                    type=int,
                    default=50,
                    help="training epoches")
parser.add_argument("--top_k",
                    type=int,
                    default=10,
                    help="compute metrics@top_k")
parser.add_argument("--factor_num",
                    type=int,
                    default=32,
                    help="predictive factors numbers in the model")
parser.add_argument("--num_ng",
                    type=int,
                    default=4,
                    help="sample negative items for training")
parser.add_argument("--test_num_ng",
                    type=int,
                    default=99,
                    help="sample part of negative items for testing")
parser.add_argument("--out",
                    type=_str2bool,
                    default=True,
                    help="save model or not")
parser.add_argument("--gpu",
                    type=str,
                    default="0",
                    help="gpu card ID")
args = parser.parse_args()

# Restrict the visible GPUs to the requested card.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
cudnn.benchmark = True
############################## PREPARE DATASET ##########################
(train_data, test_data,
 user_num, item_num, train_mat) = data_utils.load_all()

# Wrap the raw pairs in datasets: the training set samples `num_ng`
# negatives per pair, the test set serves its pairs unchanged.
train_dataset = data_utils.BPRData(
    features=train_data, num_item=item_num, train_mat=train_mat,
    num_ng=args.num_ng, is_training=True)
test_dataset = data_utils.BPRData(
    features=test_data, num_item=item_num, train_mat=train_mat,
    num_ng=0, is_training=False)

train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size, shuffle=True, num_workers=4)
# One test batch holds test_num_ng + 1 rows and order is preserved.
test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=args.test_num_ng + 1, shuffle=False, num_workers=0)
########################### CREATE MODEL #################################
# NOTE: this rebinds `model` from the imported module to the network
# instance; the module is not reachable under that name afterwards.
model = model.BPR(user_num, item_num, args.factor_num).cuda()

# Plain SGD; `lamda` is applied as weight decay.
optimizer = optim.SGD(
    params=model.parameters(), lr=args.lr, weight_decay=args.lamda)
# writer = SummaryWriter() # for visualization
########################### TRAINING #####################################
count, best_hr = 0, 0
# Initialised up front so the final report cannot hit a NameError when no
# epoch improves on best_hr (or when --epochs is 0).
best_ndcg, best_epoch = 0, None
for epoch in range(args.epochs):
    model.train()
    start_time = time.time()
    # Draw fresh negative items for this epoch.
    train_loader.dataset.ng_sample()

    for user, item_i, item_j in train_loader:
        user = user.cuda()
        item_i = item_i.cuda()
        item_j = item_j.cuda()

        model.zero_grad()
        prediction_i, prediction_j = model(user, item_i, item_j)
        # BPR loss: -sum(log(sigmoid(x_ui - x_uj))). logsigmoid is the
        # numerically stable form of sigmoid().log(), which yields -inf
        # once the sigmoid underflows to 0.
        loss = -nn.functional.logsigmoid(prediction_i - prediction_j).sum()
        loss.backward()
        optimizer.step()
        # writer.add_scalar('data/loss', loss.item(), count)
        count += 1

    model.eval()
    HR, NDCG = evaluate.metrics(model, test_loader, args.top_k)

    elapsed_time = time.time() - start_time
    print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
          time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
    # HR and NDCG are already means, so they are printed directly.
    print("HR: {:.3f}\tNDCG: {:.3f}".format(HR, NDCG))

    if HR > best_hr:
        best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
        if args.out:
            # exist_ok avoids the check-then-create race of
            # os.path.exists() followed by os.mkdir().
            os.makedirs(config.model_path, exist_ok=True)
            torch.save(model, '{}BPR.pt'.format(config.model_path))

if best_epoch is None:
    print("End. No epoch improved on the initial HR of "
          "{:.3f}.".format(best_hr))
else:
    print("End. Best epoch {:03d}: HR = {:.3f}, "
          "NDCG = {:.3f}".format(best_epoch, best_hr, best_ndcg))
================================================
FILE: model.py
================================================
import torch
import torch.nn as nn
class BPR(nn.Module):
    """Matrix-factorisation scorer: score(u, i) = <embed_user[u], embed_item[i]>."""

    def __init__(self, user_num, item_num, factor_num):
        """
        user_num: number of users;
        item_num: number of items;
        factor_num: number of predictive factors.
        """
        super().__init__()
        self.embed_user = nn.Embedding(user_num, factor_num)
        self.embed_item = nn.Embedding(item_num, factor_num)

        # Re-initialise both tables (users first, then items) with a small
        # normal distribution.
        for table in (self.embed_user, self.embed_item):
            nn.init.normal_(table.weight, std=0.01)

    def forward(self, user, item_i, item_j):
        """Return the scores of ``user`` for ``item_i`` and for ``item_j``.

        All three arguments are index tensors; each score is the sum over
        the last dimension of the element-wise product of the user and item
        embeddings.
        """
        user_vec = self.embed_user(user)
        vec_i = self.embed_item(item_i)
        vec_j = self.embed_item(item_j)

        score_i = torch.sum(user_vec * vec_i, dim=-1)
        score_j = torch.sum(user_vec * vec_j, dim=-1)
        return score_i, score_j
gitextract_6cyprr6l/ ├── README.md ├── config.py ├── data_utils.py ├── evaluate.py ├── main.py └── model.py
SYMBOL INDEX (12 symbols across 3 files)
FILE: data_utils.py
function load_all (line 10) | def load_all(test_num=100):
class BPRData (line 40) | class BPRData(data.Dataset):
method __init__ (line 41) | def __init__(self, features,
method ng_sample (line 53) | def ng_sample(self):
method __len__ (line 65) | def __len__(self):
method __getitem__ (line 69) | def __getitem__(self, idx):
FILE: evaluate.py
function hit (line 5) | def hit(gt_item, pred_items):
function ndcg (line 11) | def ndcg(gt_item, pred_items):
function metrics (line 18) | def metrics(model, test_loader, top_k):
FILE: model.py
class BPR (line 5) | class BPR(nn.Module):
method __init__ (line 6) | def __init__(self, user_num, item_num, factor_num):
method forward (line 19) | def forward(self, user, item_i, item_j):
Condensed preview — 6 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (10K chars).
[
{
"path": "README.md",
"chars": 1120,
"preview": "# Pytorch-BPR\n\nNote that I use the two sub datasets provided by Xiangnan's [repo](https://github.com/hexiangnan/neural_c"
},
{
"path": "config.py",
"chars": 402,
"preview": "# dataset name \ndataset = 'ml-1m'\nassert dataset in ['ml-1m', 'pinterest-20']\n\n# paths\nmain_path = '/home/share/guoyangy"
},
{
"path": "data_utils.py",
"chars": 2140,
"preview": "import numpy as np \nimport pandas as pd \nimport scipy.sparse as sp\n\nimport torch.utils.data as data\n\nimport config\n\n\ndef"
},
{
"path": "evaluate.py",
"chars": 783,
"preview": "import numpy as np\nimport torch\n\n\ndef hit(gt_item, pred_items):\n\tif gt_item in pred_items:\n\t\treturn 1\n\treturn 0\n\n\ndef nd"
},
{
"path": "main.py",
"chars": 3412,
"preview": "import os\nimport time\nimport argparse\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim"
},
{
"path": "model.py",
"chars": 744,
"preview": "import torch\nimport torch.nn as nn\n\n\nclass BPR(nn.Module):\n\tdef __init__(self, user_num, item_num, factor_num):\n\t\tsuper("
}
]
About this extraction
This page contains the full source code of the guoyang9/BPR-pytorch GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 6 files (8.4 KB), approximately 2.5k tokens, and a symbol index with 12 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.