# Repository: snowkylin/line
# Branch: master
# Commit: 4cdfa7ada958
# Files: 6
# Total size: 9.7 KB
#
# Directory structure:
# gitextract_scq2ywej/
# ├── .gitignore
# ├── data/
# │   └── co-authorship_graph.pkl
# ├── line.py
# ├── model.py
# ├── readme.md
# └── utils.py
#
# ================================================
# FILE CONTENTS
# ================================================
#
# ================================================
# FILE: .gitignore
# ================================================
# .idea
# __pycache__
# playground.py
#
# ================================================
# FILE: line.py
# ================================================
import tensorflow as tf
import numpy as np
import argparse
from model import LINEModel
from utils import DBLPDataLoader
import pickle
import time


def main():
    """Parse the command-line options and dispatch to ``train`` or ``test``."""
    parser = argparse.ArgumentParser()
    # Every numeric option carries an explicit ``type``: without it a value
    # passed on the command line stays a ``str`` and the shape arithmetic in
    # the model (``batch_size * (K + 1)``) breaks.
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=128,
                        help='number of positive edges per batch')
    parser.add_argument('--K', type=int, default=5,
                        help='number of negative samples drawn per positive edge')
    parser.add_argument('--proximity', default='second-order',
                        choices=['first-order', 'second-order'],
                        help='first-order or second-order')
    parser.add_argument('--learning_rate', type=float, default=0.025)
    parser.add_argument('--mode', default='train', choices=['train', 'test'])
    parser.add_argument('--num_batches', type=int, default=300000)
    # Not read anywhere in this file; kept so existing command lines still parse.
    parser.add_argument('--total_graph', default=True)
    parser.add_argument('--graph_file', default='data/co-authorship_graph.pkl')
    args = parser.parse_args()
    if args.mode == 'train':
        train(args)
    elif args.mode == 'test':
        test(args)


def train(args):
    """Train a LINE embedding and periodically write it to disk.

    Every 100th batch is only evaluated (its loss is printed together with
    the accumulated sampling/training times); all other batches run one
    optimisation step.  Every 1000 batches, and after the last batch, the
    row-normalised embedding is pickled to ``data/embedding_<proximity>.pkl``.

    Args:
        args: the namespace produced by ``main``.  ``num_of_nodes`` is added
            to it here because the model reads it from ``args``.
    """
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    # The learning rate decays linearly with the batch index but never drops
    # below this floor.
    min_learning_rate = args.learning_rate * 0.0001
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label,
                         model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                learning_rate = max(args.learning_rate * (1 - b / args.num_batches),
                                    min_learning_rate)
            else:
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
            if b % 1000 == 0 or b == (args.num_batches - 1):
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                # ``with`` closes the checkpoint file; the bare ``open`` used
                # before leaked one handle per checkpoint.
                with open('data/embedding_%s.pkl' % suffix, 'wb') as f:
                    pickle.dump(data_loader.embedding_mapping(normalized_embedding), f)


def test(args):
    """Placeholder for ``--mode test``; nothing is implemented yet."""
    pass


if __name__ == '__main__':
    main()

# ================================================
# FILE: model.py
# ================================================
import tensorflow as tf


class LINEModel:
    """TensorFlow 1.x graph for LINE with first- or second-order proximity.

    Reads ``batch_size``, ``K``, ``num_of_nodes``, ``embedding_dim`` and
    ``proximity`` from ``args``.  The graph is fed with three vectors of
    length ``batch_size * (K + 1)``: node indices ``u_i`` and ``u_j`` and a
    ``label`` that multiplies the inner product inside the log-sigmoid loss,
    plus a scalar ``learning_rate``.

    Raises:
        ValueError: if ``args.proximity`` is neither ``'first-order'`` nor
            ``'second-order'``.
    """

    def __init__(self, args):
        batch_shape = [args.batch_size * (args.K + 1)]
        self.u_i = tf.placeholder(name='u_i', dtype=tf.int32, shape=batch_shape)
        self.u_j = tf.placeholder(name='u_j', dtype=tf.int32, shape=batch_shape)
        self.label = tf.placeholder(name='label', dtype=tf.float32, shape=batch_shape)
        self.embedding = tf.get_variable('target_embedding', [args.num_of_nodes, args.embedding_dim],
                                         initializer=tf.random_uniform_initializer(minval=-1., maxval=1.))
        # Rows are selected by multiplying a one-hot matrix with the table.
        # NOTE(review): tf.nn.embedding_lookup would select the same rows
        # without building the one-hot matrix, but it yields sparse gradients,
        # which changes how the optimizer updates untouched rows - confirm
        # before switching.
        self.u_i_embedding = tf.matmul(tf.one_hot(self.u_i, depth=args.num_of_nodes), self.embedding)
        if args.proximity == 'first-order':
            # First order: both endpoints share the same embedding table.
            self.u_j_embedding = tf.matmul(tf.one_hot(self.u_j, depth=args.num_of_nodes), self.embedding)
        elif args.proximity == 'second-order':
            # Second order: u_j is looked up in a separate context table.
            self.context_embedding = tf.get_variable('context_embedding',
                                                     [args.num_of_nodes, args.embedding_dim],
                                                     initializer=tf.random_uniform_initializer(minval=-1.,
                                                                                               maxval=1.))
            self.u_j_embedding = tf.matmul(tf.one_hot(self.u_j, depth=args.num_of_nodes),
                                           self.context_embedding)
        else:
            # Previously fell through and failed later with an AttributeError
            # on ``u_j_embedding``.
            raise ValueError("proximity must be 'first-order' or 'second-order', got %r"
                             % (args.proximity,))

        self.inner_product = tf.reduce_sum(self.u_i_embedding * self.u_j_embedding, axis=1)
        self.loss = -tf.reduce_mean(tf.log_sigmoid(self.label * self.inner_product))
        self.learning_rate = tf.placeholder(name='learning_rate', dtype=tf.float32)
        # tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
        # is a drop-in alternative to RMSProp here.
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

# ================================================
# FILE: readme.md
# ================================================
# # LINE in TensorFlow
#
# TensorFlow implementation of the paper _LINE: Large-scale Information
# Network Embedding_ by Jian Tang, et al.
#
# See [my slides](Network_Embedding_with_TensorFlow.pdf) from GDG DevFest 2017
# for more detail about LINE and TensorFlow.
#
# Note: the code shown in the slides is pseudocode; mini-batching and negative
# sampling are omitted there.
#
# ## Prerequisites
#
# * Python 3.6
# * TensorFlow 1.3.0
# * Networkx
# * NumPy
#
# ## Setup
#
# * Prepare a network using networkx. Edge weights are read from the `weight`
#   edge attribute. Write the graph to a file with
#   [nx.write_gpickle](https://networkx.github.io/documentation/stable/reference/readwrite/generated/networkx.readwrite.gpickle.write_gpickle.html)
#   (removed in NetworkX 3.0; on newer versions `pickle.dump` the graph
#   instead).
# * Put the network file in the `data` folder.
# * Run `python line.py --graph_file data/graph.pkl` to start training, where
#   `graph.pkl` is the name of your network file. The path is opened as given,
#   so it must include the `data/` prefix.
# * The embedding will be stored in `data/embedding_XXX-order.pkl`. You can
#   load it with `pickle.load()` in Python.
#
# ## References
#
# - Tang, Jian, et al. "[Line: Large-scale information network embedding.](https://dl.acm.org/citation.cfm?id=2741093)"
#   _Proceedings of the 24th International Conference on World Wide Web.
#   International World Wide Web Conferences Steering Committee_, 2015.
#
# ================================================
# FILE: utils.py
# ================================================
import pickle

import numpy as np


def _read_graph(graph_file):
    """Load the pickled networkx graph stored at ``graph_file``.

    ``nx.read_gpickle`` is used when the installed NetworkX still provides it
    (it was removed in NetworkX 3.0); otherwise the file is unpickled
    directly.

    NOTE: unpickling executes arbitrary code from the file - only load graph
    files you trust.
    """
    # Imported here rather than at module level: networkx is only needed to
    # read the graph file, and the samplers below work without it.
    import networkx as nx

    reader = getattr(nx, 'read_gpickle', None)
    if reader is not None:
        return reader(graph_file)
    with open(graph_file, 'rb') as fh:
        return pickle.load(fh)


class DBLPDataLoader:
    """Samples training batches (positive edges plus negatives) from a graph.

    Edges are drawn proportionally to their ``weight`` attribute (1.0 when an
    edge has none) and negative nodes proportionally to weighted degree
    raised to the power 0.75.
    """

    # 'atlas' is the spelling used by the default arguments of fetch_batch;
    # it is accepted as a synonym of 'alias' so existing callers keep working.
    _ALIAS_NAMES = ('alias', 'atlas')

    def __init__(self, graph_file):
        """Read the graph and precompute the sampling tables.

        Args:
            graph_file: path of a pickled networkx graph.

        Raises:
            ValueError: if the graph has no edges or no positive weights
                (raised by ``AliasSampling``).
        """
        self.g = _read_graph(graph_file)
        self.num_of_nodes = self.g.number_of_nodes()
        self.num_of_edges = self.g.number_of_edges()
        self.edges_raw = self.g.edges(data=True)
        self.nodes_raw = self.g.nodes(data=True)

        # float64 throughout: a distribution normalised in float32 can miss
        # the sum-to-one tolerance that np.random.choice(p=...) enforces.
        self.edge_distribution = np.array(
            [attr.get('weight', 1.0) for _, _, attr in self.edges_raw], dtype=np.float64)
        self.edge_distribution /= np.sum(self.edge_distribution)
        self.edge_sampling = AliasSampling(prob=self.edge_distribution)

        self.node_negative_distribution = np.power(
            np.array([self.g.degree(node, weight='weight') for node, _ in self.nodes_raw],
                     dtype=np.float64), 0.75)
        self.node_negative_distribution /= np.sum(self.node_negative_distribution)
        self.node_sampling = AliasSampling(prob=self.node_negative_distribution)

        # Dense integer ids for the nodes, and the reverse lookup.
        self.node_index = {}
        self.node_index_reversed = {}
        for index, (node, _) in enumerate(self.nodes_raw):
            self.node_index[node] = index
            self.node_index_reversed[index] = node
        self.edges = [(self.node_index[u], self.node_index[v]) for u, v, _ in self.edges_raw]

    def _sample_edge_indices(self, batch_size, method):
        """Return ``batch_size`` edge indices drawn with ``method`` as a list of ints."""
        if method == 'numpy':
            drawn = np.random.choice(self.num_of_edges, size=batch_size, p=self.edge_distribution)
        elif method in self._ALIAS_NAMES:
            drawn = self.edge_sampling.sampling(batch_size)
        elif method == 'uniform':
            drawn = np.random.randint(0, self.num_of_edges, size=batch_size)
        else:
            raise ValueError("unknown edge_sampling method: %r" % (method,))
        # AliasSampling.sampling(1) returns a bare scalar; atleast_1d makes a
        # batch of one iterable like any other batch.
        return [int(index) for index in np.atleast_1d(drawn)]

    def _node_sampler(self, method):
        """Return a zero-argument callable that draws one negative node index."""
        if method == 'numpy':
            return lambda: int(np.random.choice(self.num_of_nodes, p=self.node_negative_distribution))
        if method in self._ALIAS_NAMES:
            return self.node_sampling.sampling
        if method == 'uniform':
            return lambda: int(np.random.randint(0, self.num_of_nodes))
        raise ValueError("unknown node_sampling method: %r" % (method,))

    def fetch_batch(self, batch_size=16, K=10, edge_sampling='atlas', node_sampling='atlas'):
        """Sample ``batch_size`` positive edges, each followed by ``K`` negatives.

        Args:
            batch_size: number of positive edges to draw.
            K: number of negative nodes drawn per positive edge.
            edge_sampling: ``'alias'`` (or the legacy spelling ``'atlas'``),
                ``'numpy'`` or ``'uniform'``.
            node_sampling: same choices, used for the negative nodes.

        Returns:
            Three parallel lists ``(u_i, u_j, label)`` of length
            ``batch_size * (K + 1)``; ``label`` is 1 for a sampled edge and
            -1 for a negative pair.

        Raises:
            ValueError: if either sampling method name is unknown.
        """
        edge_batch_index = self._sample_edge_indices(batch_size, edge_sampling)
        draw_node = self._node_sampler(node_sampling)
        undirected = not self.g.is_directed()

        u_i = []
        u_j = []
        label = []
        for edge_index in edge_batch_index:
            source, target = self.edges[edge_index]
            if undirected and np.random.rand() > 0.5:
                # important: second-order proximity is for directed edge, so
                # an undirected edge is used in a random direction.
                source, target = target, source
            u_i.append(source)
            u_j.append(target)
            label.append(1)
            source_node = self.node_index_reversed[source]
            for _ in range(K):
                # Redraw until the pair (source, negative) is not a real
                # edge.  The check follows the direction of the emitted
                # pair, which matters for directed graphs.
                while True:
                    negative_node = draw_node()
                    if not self.g.has_edge(source_node, self.node_index_reversed[negative_node]):
                        break
                u_i.append(source)
                u_j.append(negative_node)
                label.append(-1)
        return u_i, u_j, label

    def embedding_mapping(self, embedding):
        """Return ``{node: embedding row}`` for every node of the graph."""
        return {node: embedding[self.node_index[node]] for node, _ in self.nodes_raw}


class AliasSampling:
    """O(1) sampling from a discrete distribution with the alias method.

    Reference: https://en.wikipedia.org/wiki/Alias_method
    """

    def __init__(self, prob):
        """Build the probability table ``U`` and the alias table ``K``.

        Args:
            prob: one-dimensional sequence of non-negative weights; it is
                normalised here, so it does not have to sum to one.

        Raises:
            ValueError: if ``prob`` is empty, not one-dimensional, contains a
                negative entry, or has no positive finite total.
        """
        weights = np.asarray(prob, dtype=np.float64)
        if weights.ndim != 1 or weights.size == 0:
            raise ValueError("prob must be a non-empty one-dimensional sequence")
        total = weights.sum()
        if not np.isfinite(total) or total <= 0 or np.any(weights < 0):
            raise ValueError("prob must be non-negative with a positive finite sum")

        self.n = int(weights.size)
        self.U = weights * (self.n / total)
        # Each column starts as its own alias, so a column left over by
        # rounding error in the loop below still returns itself.
        self.K = np.arange(self.n)

        overfull = [i for i in range(self.n) if self.U[i] > 1]
        underfull = [i for i in range(self.n) if self.U[i] < 1]
        while overfull and underfull:
            i, j = overfull.pop(), underfull.pop()
            # Column j is topped up with mass taken from column i.
            self.K[j] = i
            self.U[i] = self.U[i] - (1 - self.U[j])
            if self.U[i] > 1:
                overfull.append(i)
            elif self.U[i] < 1:
                underfull.append(i)

    def sampling(self, n=1):
        """Draw ``n`` indices.

        Returns:
            A single ``int`` when ``n == 1``, otherwise a list of ``n`` ints.
        """
        scaled = self.n * np.random.rand(n)
        columns = np.minimum(np.floor(scaled).astype(np.int64), self.n - 1)
        fraction = scaled - columns
        picks = np.where(fraction < self.U[columns], columns, self.K[columns])
        if n == 1:
            return int(picks[0])
        return picks.tolist()