Full Code of guoyang9/BPR-pytorch for AI

master a21fbcde6ab8 cached

6 files

8.4 KB

2.5k tokens

12 symbols

1 requests

Download .txt

Repository: guoyang9/BPR-pytorch
Branch: master
Commit: a21fbcde6ab8
Files: 6
Total size: 8.4 KB

Directory structure:
gitextract_6cyprr6l/

├── README.md
├── config.py
├── data_utils.py
├── evaluate.py
├── main.py
└── model.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
# Pytorch-BPR

Note that I use the two sub datasets provided by Xiangnan's [repo](https://github.com/hexiangnan/neural_collaborative_filtering/tree/master/Data). Another pytorch NCF implementaion can be found at this [repo](https://github.com/guoyang9/NCF).

I utilized a factor number **32**, and posted the results in the NCF paper and this implementation here. Since there is no specific numbers in their paper, I found this implementation achieved a better performance than the original curve. Moreover, the batch_size is not very sensitive with the final model performance.

Models 			| MovieLens HR@10 | MovieLens NDCG@10 | Pinterest HR@10 | Pinterest NDCG@10
------ 			| --------------- | ----------------- | --------------- | -----------------
pytorch-BPR    	| 0.700 		  | 0.418             | 0.877 			| 0.551


## The requirements are as follows:
	* python==3.6
	* pandas==0.24.2
	* numpy==1.16.2
	* pytorch==1.0.1
	* tensorboardX==1.6 (mainly useful when you want to visulize the loss, see https://github.com/lanpa/tensorboard-pytorch)

## Example to run:
```
python main.py --factor_num=16 --lamda=0.001
```


================================================
FILE: config.py
================================================
# dataset name 
dataset = 'ml-1m'
assert dataset in ['ml-1m', 'pinterest-20']

# paths
main_path = '/home/share/guoyangyang/recommendation/NCF-Data/'

train_rating = main_path + '{}.train.rating'.format(dataset)
test_rating = main_path + '{}.test.rating'.format(dataset)
test_negative = main_path + '{}.test.negative'.format(dataset)

model_path = './models/'
BPR_model_path = model_path + 'NeuMF.pth'


================================================
FILE: data_utils.py
================================================
import numpy as np 
import pandas as pd 
import scipy.sparse as sp

import torch.utils.data as data

import config


def load_all(test_num=100):
	""" We load all the three file here to save time in each epoch. """
	train_data = pd.read_csv(
		config.train_rating, 
		sep='\t', header=None, names=['user', 'item'], 
		usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

	user_num = train_data['user'].max() + 1
	item_num = train_data['item'].max() + 1

	train_data = train_data.values.tolist()

	# load ratings as a dok matrix
	train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
	for x in train_data:
		train_mat[x[0], x[1]] = 1.0

	test_data = []
	with open(config.test_negative, 'r') as fd:
		line = fd.readline()
		while line != None and line != '':
			arr = line.split('\t')
			u = eval(arr[0])[0]
			test_data.append([u, eval(arr[0])[1]])
			for i in arr[1:]:
				test_data.append([u, int(i)])
			line = fd.readline()
	return train_data, test_data, user_num, item_num, train_mat


class BPRData(data.Dataset):
	def __init__(self, features, 
				num_item, train_mat=None, num_ng=0, is_training=None):
		super(BPRData, self).__init__()
		""" Note that the labels are only useful when training, we thus 
			add them in the ng_sample() function.
		"""
		self.features = features
		self.num_item = num_item
		self.train_mat = train_mat
		self.num_ng = num_ng
		self.is_training = is_training

	def ng_sample(self):
		assert self.is_training, 'no need to sampling when testing'

		self.features_fill = []
		for x in self.features:
			u, i = x[0], x[1]
			for t in range(self.num_ng):
				j = np.random.randint(self.num_item)
				while (u, j) in self.train_mat:
					j = np.random.randint(self.num_item)
				self.features_fill.append([u, i, j])

	def __len__(self):
		return self.num_ng * len(self.features) if \
				self.is_training else len(self.features)

	def __getitem__(self, idx):
		features = self.features_fill if \
				self.is_training else self.features

		user = features[idx][0]
		item_i = features[idx][1]
		item_j = features[idx][2] if \
				self.is_training else features[idx][1]
		return user, item_i, item_j 
		

================================================
FILE: evaluate.py
================================================
import numpy as np
import torch


def hit(gt_item, pred_items):
	if gt_item in pred_items:
		return 1
	return 0


def ndcg(gt_item, pred_items):
	if gt_item in pred_items:
		index = pred_items.index(gt_item)
		return np.reciprocal(np.log2(index+2))
	return 0


def metrics(model, test_loader, top_k):
	HR, NDCG = [], []

	for user, item_i, item_j in test_loader:
		user = user.cuda()
		item_i = item_i.cuda()
		item_j = item_j.cuda() # not useful when testing

		prediction_i, prediction_j = model(user, item_i, item_j)
		_, indices = torch.topk(prediction_i, top_k)
		recommends = torch.take(
				item_i, indices).cpu().numpy().tolist()

		gt_item = item_i[0].item()
		HR.append(hit(gt_item, recommends))
		NDCG.append(ndcg(gt_item, recommends))

	return np.mean(HR), np.mean(NDCG)


================================================
FILE: main.py
================================================
import os
import time
import argparse
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from tensorboardX import SummaryWriter

import model
import config
import evaluate
import data_utils


parser = argparse.ArgumentParser()
parser.add_argument("--lr", 
	type=float, 
	default=0.01, 
	help="learning rate")
parser.add_argument("--lamda", 
	type=float, 
	default=0.001, 
	help="model regularization rate")
parser.add_argument("--batch_size", 
	type=int, 
	default=4096, 
	help="batch size for training")
parser.add_argument("--epochs", 
	type=int,
	default=50,  
	help="training epoches")
parser.add_argument("--top_k", 
	type=int, 
	default=10, 
	help="compute metrics@top_k")
parser.add_argument("--factor_num", 
	type=int,
	default=32, 
	help="predictive factors numbers in the model")
parser.add_argument("--num_ng", 
	type=int,
	default=4, 
	help="sample negative items for training")
parser.add_argument("--test_num_ng", 
	type=int,
	default=99, 
	help="sample part of negative items for testing")
parser.add_argument("--out", 
	default=True,
	help="save model or not")
parser.add_argument("--gpu", 
	type=str,
	default="0",  
	help="gpu card ID")
args = parser.parse_args()

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
cudnn.benchmark = True


############################## PREPARE DATASET ##########################
train_data, test_data, user_num ,item_num, train_mat = data_utils.load_all()

# construct the train and test datasets
train_dataset = data_utils.BPRData(
		train_data, item_num, train_mat, args.num_ng, True)
test_dataset = data_utils.BPRData(
		test_data, item_num, train_mat, 0, False)
train_loader = data.DataLoader(train_dataset,
		batch_size=args.batch_size, shuffle=True, num_workers=4)
test_loader = data.DataLoader(test_dataset,
		batch_size=args.test_num_ng+1, shuffle=False, num_workers=0)

########################### CREATE MODEL #################################
model = model.BPR(user_num, item_num, args.factor_num)
model.cuda()

optimizer = optim.SGD(
			model.parameters(), lr=args.lr, weight_decay=args.lamda)
# writer = SummaryWriter() # for visualization

########################### TRAINING #####################################
count, best_hr = 0, 0
for epoch in range(args.epochs):
	model.train() 
	start_time = time.time()
	train_loader.dataset.ng_sample()

	for user, item_i, item_j in train_loader:
		user = user.cuda()
		item_i = item_i.cuda()
		item_j = item_j.cuda()

		model.zero_grad()
		prediction_i, prediction_j = model(user, item_i, item_j)
		loss = - (prediction_i - prediction_j).sigmoid().log().sum()
		loss.backward()
		optimizer.step()
		# writer.add_scalar('data/loss', loss.item(), count)
		count += 1

	model.eval()
	HR, NDCG = evaluate.metrics(model, test_loader, args.top_k)

	elapsed_time = time.time() - start_time
	print("The time elapse of epoch {:03d}".format(epoch) + " is: " + 
			time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
	print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

	if HR > best_hr:
		best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
		if args.out:
			if not os.path.exists(config.model_path):
				os.mkdir(config.model_path)
			torch.save(model, '{}BPR.pt'.format(config.model_path))

print("End. Best epoch {:03d}: HR = {:.3f}, \
	NDCG = {:.3f}".format(best_epoch, best_hr, best_ndcg))


================================================
FILE: model.py
================================================
import torch
import torch.nn as nn


class BPR(nn.Module):
	def __init__(self, user_num, item_num, factor_num):
		super(BPR, self).__init__()
		"""
		user_num: number of users;
		item_num: number of items;
		factor_num: number of predictive factors.
		"""		
		self.embed_user = nn.Embedding(user_num, factor_num)
		self.embed_item = nn.Embedding(item_num, factor_num)

		nn.init.normal_(self.embed_user.weight, std=0.01)
		nn.init.normal_(self.embed_item.weight, std=0.01)

	def forward(self, user, item_i, item_j):
		user = self.embed_user(user)
		item_i = self.embed_item(item_i)
		item_j = self.embed_item(item_j)

		prediction_i = (user * item_i).sum(dim=-1)
		prediction_j = (user * item_j).sum(dim=-1)
		return prediction_i, prediction_j

Download .txt

gitextract_6cyprr6l/

├── README.md
├── config.py
├── data_utils.py
├── evaluate.py
├── main.py
└── model.py

Download .txt

SYMBOL INDEX (12 symbols across 3 files)

FILE: data_utils.py
  function load_all (line 10) | def load_all(test_num=100):
  class BPRData (line 40) | class BPRData(data.Dataset):
    method __init__ (line 41) | def __init__(self, features,
    method ng_sample (line 53) | def ng_sample(self):
    method __len__ (line 65) | def __len__(self):
    method __getitem__ (line 69) | def __getitem__(self, idx):

FILE: evaluate.py
  function hit (line 5) | def hit(gt_item, pred_items):
  function ndcg (line 11) | def ndcg(gt_item, pred_items):
  function metrics (line 18) | def metrics(model, test_loader, top_k):

FILE: model.py
  class BPR (line 5) | class BPR(nn.Module):
    method __init__ (line 6) | def __init__(self, user_num, item_num, factor_num):
    method forward (line 19) | def forward(self, user, item_i, item_j):

Download .json

Condensed preview — 6 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (10K chars).

[
  {
    "path": "README.md",
    "chars": 1120,
    "preview": "# Pytorch-BPR\n\nNote that I use the two sub datasets provided by Xiangnan's [repo](https://github.com/hexiangnan/neural_c"
  },
  {
    "path": "config.py",
    "chars": 402,
    "preview": "# dataset name \ndataset = 'ml-1m'\nassert dataset in ['ml-1m', 'pinterest-20']\n\n# paths\nmain_path = '/home/share/guoyangy"
  },
  {
    "path": "data_utils.py",
    "chars": 2140,
    "preview": "import numpy as np \nimport pandas as pd \nimport scipy.sparse as sp\n\nimport torch.utils.data as data\n\nimport config\n\n\ndef"
  },
  {
    "path": "evaluate.py",
    "chars": 783,
    "preview": "import numpy as np\nimport torch\n\n\ndef hit(gt_item, pred_items):\n\tif gt_item in pred_items:\n\t\treturn 1\n\treturn 0\n\n\ndef nd"
  },
  {
    "path": "main.py",
    "chars": 3412,
    "preview": "import os\nimport time\nimport argparse\nimport numpy as np\n\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim"
  },
  {
    "path": "model.py",
    "chars": 744,
    "preview": "import torch\nimport torch.nn as nn\n\n\nclass BPR(nn.Module):\n\tdef __init__(self, user_num, item_num, factor_num):\n\t\tsuper("
  }
]

About this extraction

This page contains the full source code of the guoyang9/BPR-pytorch GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 6 files (8.4 KB), approximately 2.5k tokens, and a symbol index with 12 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo