Full Code of snowkylin/line for AI

master 4cdfa7ada958 cached
6 files
9.7 KB
2.4k tokens
12 symbols
1 requests
Download .txt
Repository: snowkylin/line
Branch: master
Commit: 4cdfa7ada958
Files: 6
Total size: 9.7 KB

Directory structure:
gitextract_scq2ywej/

├── .gitignore
├── data/
│   └── co-authorship_graph.pkl
├── line.py
├── model.py
├── readme.md
└── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.idea
__pycache__
playground.py


================================================
FILE: line.py
================================================
import tensorflow as tf
import numpy as np
import argparse
from model import LINEModel
from utils import DBLPDataLoader
import pickle
import time


def parse_args(argv=None):
    """Build and parse the command-line options for LINE training.

    Args:
        argv: optional list of argument strings (defaults to sys.argv[1:]).

    Returns:
        argparse.Namespace with the parsed, correctly-typed options.
    """
    def str2bool(s):
        # argparse's type=bool is broken ('False' is a truthy string);
        # parse boolean flags explicitly.
        return str(s).lower() in ('true', '1', 'yes')

    parser = argparse.ArgumentParser()
    # type= is required on numeric options: without it, any value given on
    # the command line stays a string and breaks downstream arithmetic
    # (e.g. range(args.num_batches), batch_size * (K + 1)).
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--K', type=int, default=5,
                        help='number of negative samples per positive edge')
    parser.add_argument('--proximity', default='second-order', help='first-order or second-order')
    parser.add_argument('--learning_rate', type=float, default=0.025)
    parser.add_argument('--mode', default='train', help='train or test')
    parser.add_argument('--num_batches', type=int, default=300000)
    # NOTE(review): total_graph is not used in the visible code — confirm
    # whether it can be dropped.
    parser.add_argument('--total_graph', type=str2bool, default=True)
    parser.add_argument('--graph_file', default='data/co-authorship_graph.pkl')
    return parser.parse_args(argv)


def main():
    """Entry point: parse CLI options and dispatch to train() or test()."""
    args = parse_args()
    if args.mode == 'train':
        train(args)
    elif args.mode == 'test':
        test(args)


def train(args):
    """Train LINE embeddings with mini-batch negative sampling.

    Runs for args.num_batches batches. Every 100th batch only reports the
    loss (no gradient step); every 1000th batch (and the final batch) the
    current embeddings are L2-normalized per row and pickled to
    data/embedding_<proximity>.pkl.
    """
    data_loader = DBLPDataLoader(graph_file=args.graph_file)
    suffix = args.proximity
    # LINEModel reads num_of_nodes off the shared args namespace.
    args.num_of_nodes = data_loader.num_of_nodes
    model = LINEModel(args)
    with tf.Session() as sess:
        print(args)
        print('batches\tloss\tsampling time\ttraining_time\tdatetime')
        tf.global_variables_initializer().run()
        # NOTE(review): captured but never used afterwards.
        initial_embedding = sess.run(model.embedding)
        learning_rate = args.learning_rate
        sampling_time, training_time = 0, 0
        for b in range(args.num_batches):
            t1 = time.time()
            u_i, u_j, label = data_loader.fetch_batch(batch_size=args.batch_size, K=args.K)
            feed_dict = {model.u_i: u_i, model.u_j: u_j, model.label: label, model.learning_rate: learning_rate}
            t2 = time.time()
            sampling_time += t2 - t1
            if b % 100 != 0:
                sess.run(model.train_op, feed_dict=feed_dict)
                training_time += time.time() - t2
                # Linear decay of the learning rate over the run, floored at
                # 0.01% of the initial rate.
                if learning_rate > args.learning_rate * 0.0001:
                    learning_rate = args.learning_rate * (1 - b / args.num_batches)
                else:
                    learning_rate = args.learning_rate * 0.0001
            else:
                # Logging batch: compute and print the loss only.
                # NOTE(review): no gradient step is taken on these batches,
                # so 1 in 100 sampled batches is never trained on — confirm
                # this is intended.
                loss = sess.run(model.loss, feed_dict=feed_dict)
                print('%d\t%f\t%0.2f\t%0.2f\t%s' % (b, loss, sampling_time, training_time,
                                                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                sampling_time, training_time = 0, 0
            if b % 1000 == 0 or b == (args.num_batches - 1):
                # Periodic checkpoint: row-normalize the embedding matrix and
                # persist it as a {node_label: vector} mapping.
                embedding = sess.run(model.embedding)
                normalized_embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
                pickle.dump(data_loader.embedding_mapping(normalized_embedding),
                            open('data/embedding_%s.pkl' % suffix, 'wb'))


def test(args):
    """Evaluation mode placeholder — not implemented yet (TODO)."""
    pass

# Script entry point.
if __name__ == '__main__':
    main()

================================================
FILE: model.py
================================================
import tensorflow as tf


class LINEModel:
    """TensorFlow (1.x) graph for LINE network embeddings.

    Builds placeholders for a flattened mini-batch of (u_i, u_j, label)
    triples — label is +1 for an observed edge and -1 for a negative
    sample — and minimizes the negative-sampling objective

        -mean(log sigmoid(label * <emb(u_i), emb(u_j)>))

    For 'first-order' proximity both endpoints use the target embedding;
    for 'second-order' u_j is looked up in a separate context embedding.
    """

    def __init__(self, args):
        # args must carry: batch_size, K, num_of_nodes, embedding_dim,
        # and proximity ('first-order' or 'second-order').
        batch = args.batch_size * (args.K + 1)  # 1 positive + K negatives per edge
        self.u_i = tf.placeholder(name='u_i', dtype=tf.int32, shape=[batch])
        self.u_j = tf.placeholder(name='u_j', dtype=tf.int32, shape=[batch])
        self.label = tf.placeholder(name='label', dtype=tf.float32, shape=[batch])
        self.embedding = tf.get_variable('target_embedding', [args.num_of_nodes, args.embedding_dim],
                                         initializer=tf.random_uniform_initializer(minval=-1., maxval=1.))
        # embedding_lookup gathers the needed rows directly; the previous
        # one_hot + matmul formulation materialized a dense
        # [batch, num_of_nodes] matrix — O(batch * num_of_nodes) time and
        # memory for the same result.
        self.u_i_embedding = tf.nn.embedding_lookup(self.embedding, self.u_i)
        if args.proximity == 'first-order':
            self.u_j_embedding = tf.nn.embedding_lookup(self.embedding, self.u_j)
        elif args.proximity == 'second-order':
            self.context_embedding = tf.get_variable('context_embedding', [args.num_of_nodes, args.embedding_dim],
                                                     initializer=tf.random_uniform_initializer(minval=-1., maxval=1.))
            self.u_j_embedding = tf.nn.embedding_lookup(self.context_embedding, self.u_j)
        else:
            # Fail fast: previously an unknown proximity left u_j_embedding
            # undefined and surfaced later as an opaque AttributeError.
            raise ValueError("proximity must be 'first-order' or 'second-order', got %r" % args.proximity)

        self.inner_product = tf.reduce_sum(self.u_i_embedding * self.u_j_embedding, axis=1)
        self.loss = -tf.reduce_mean(tf.log_sigmoid(self.label * self.inner_product))
        self.learning_rate = tf.placeholder(name='learning_rate', dtype=tf.float32)
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)




================================================
FILE: readme.md
================================================
# LINE in TensorFlow

TensorFlow implementation of paper _LINE: Large-scale Information Network Embedding_ by Jian Tang, et al.

You can see [my slides](Network_Embedding_with_TensorFlow.pdf) from GDG DevFest 2017 for more detail about LINE and TensorFlow. Note: the code shown in the slides is pseudocode; minibatching and negative sampling are omitted there.

## Prerequisites

* Python 3.6
* TensorFlow 1.3.0
* Networkx
* NumPy

## Setup

* Prepare a network using networkx. Write the graph to a file by [nx.write_gpickle](https://networkx.github.io/documentation/stable/reference/readwrite/generated/networkx.readwrite.gpickle.write_gpickle.html).
* Put the network file in `data` folder.
* Run `python line.py --graph_file graph.pkl` to start training, where `graph.pkl` is the name of your network file.
* Embedding will be stored in `data/embedding_XXX-order.pkl`. You can load it by `pickle.load()` in python.

## References

- Tang, Jian, et al. "[Line: Large-scale information network embedding.](https://dl.acm.org/citation.cfm?id=2741093)" _Proceedings of the 24th International Conference on World Wide Web. International World Wide Web Conferences Steering Committee_, 2015.


================================================
FILE: utils.py
================================================
import networkx as nx
import numpy as np


class DBLPDataLoader:
    """Load a weighted networkx graph and serve LINE training batches.

    Edges are drawn proportionally to their 'weight' attribute and negative
    nodes from the (weighted-degree)^0.75 distribution, both via the alias
    method.
    """
    def __init__(self, graph_file):
        # NOTE(review): nx.read_gpickle was removed in networkx 3.0 — this
        # code assumes networkx 1.x/2.x (README pins TF 1.3-era versions).
        self.g = nx.read_gpickle(graph_file)
        self.num_of_nodes = self.g.number_of_nodes()
        self.num_of_edges = self.g.number_of_edges()
        self.edges_raw = self.g.edges(data=True)
        self.nodes_raw = self.g.nodes(data=True)

        # Edge-sampling distribution proportional to edge weight; every edge
        # is assumed to carry a 'weight' attribute.
        self.edge_distribution = np.array([attr['weight'] for _, _, attr in self.edges_raw], dtype=np.float32)
        self.edge_distribution /= np.sum(self.edge_distribution)
        self.edge_sampling = AliasSampling(prob=self.edge_distribution)
        # Negative-sampling distribution: weighted degree raised to 3/4, as
        # in the LINE / word2vec papers.
        self.node_negative_distribution = np.power(
            np.array([self.g.degree(node, weight='weight') for node, _ in self.nodes_raw], dtype=np.float32), 0.75)
        self.node_negative_distribution /= np.sum(self.node_negative_distribution)
        self.node_sampling = AliasSampling(prob=self.node_negative_distribution)

        # Map arbitrary node labels to contiguous integer indices (and back).
        self.node_index = {}
        self.node_index_reversed = {}
        for index, (node, _) in enumerate(self.nodes_raw):
            self.node_index[node] = index
            self.node_index_reversed[index] = node
        self.edges = [(self.node_index[u], self.node_index[v]) for u, v, _ in self.edges_raw]

    def fetch_batch(self, batch_size=16, K=10, edge_sampling='atlas', node_sampling='atlas'):
        """Sample a training batch as three parallel lists (u_i, u_j, label).

        For each of batch_size sampled edges, emits one positive pair
        (label +1) followed by K negative pairs (label -1), so each list has
        batch_size * (K + 1) entries.

        NOTE(review): 'atlas' is presumably a typo for 'alias' (it selects
        the AliasSampling path); kept as-is since it is the public default.
        """
        if edge_sampling == 'numpy':
            edge_batch_index = np.random.choice(self.num_of_edges, size=batch_size, p=self.edge_distribution)
        elif edge_sampling == 'atlas':
            edge_batch_index = self.edge_sampling.sampling(batch_size)
        elif edge_sampling == 'uniform':
            edge_batch_index = np.random.randint(0, self.num_of_edges, size=batch_size)
        u_i = []
        u_j = []
        label = []
        for edge_index in edge_batch_index:
            edge = self.edges[edge_index]
            # Exact class check: nx.DiGraph subclasses nx.Graph, so
            # isinstance() would also match directed graphs — only truly
            # undirected graphs get a random orientation here.
            if self.g.__class__ == nx.Graph:
                if np.random.rand() > 0.5:      # important: second-order proximity is for directed edge
                    edge = (edge[1], edge[0])
            u_i.append(edge[0])
            u_j.append(edge[1])
            label.append(1)
            for i in range(K):
                # Rejection-sample a negative node that has no edge to the
                # source. NOTE(review): the check is has_edge(negative,
                # source); for a directed graph the (source, negative)
                # direction is not checked — confirm this is intended.
                while True:
                    if node_sampling == 'numpy':
                        negative_node = np.random.choice(self.num_of_nodes, p=self.node_negative_distribution)
                    elif node_sampling == 'atlas':
                        negative_node = self.node_sampling.sampling()
                    elif node_sampling == 'uniform':
                        negative_node = np.random.randint(0, self.num_of_nodes)
                    if not self.g.has_edge(self.node_index_reversed[negative_node], self.node_index_reversed[edge[0]]):
                        break
                u_i.append(edge[0])
                u_j.append(negative_node)
                label.append(-1)
        return u_i, u_j, label

    def embedding_mapping(self, embedding):
        """Map the embedding matrix back to {original_node_label: vector}."""
        return {node: embedding[self.node_index[node]] for node, _ in self.nodes_raw}


class AliasSampling:
    """O(1) discrete sampling via Walker's alias method.

    Reference: https://en.wikipedia.org/wiki/Alias_method
    """

    def __init__(self, prob):
        self.n = len(prob)
        # Scale the probabilities so the average bucket height is exactly 1.
        self.U = np.array(prob) * self.n
        # K[b] is the alias bucket used when bucket b's own mass is exceeded.
        self.K = list(range(self.n))
        # Partition buckets into stacks above and below height 1.
        overfull = [b for b, height in enumerate(self.U) if height > 1]
        underfull = [b for b, height in enumerate(self.U) if height < 1]
        # Repeatedly top up an underfull bucket from a donor's excess.
        while overfull and underfull:
            donor, receiver = overfull.pop(), underfull.pop()
            self.K[receiver] = donor
            self.U[donor] -= 1 - self.U[receiver]
            if self.U[donor] > 1:
                overfull.append(donor)
            elif self.U[donor] < 1:
                underfull.append(donor)

    def sampling(self, n=1):
        """Draw n samples; returns a scalar when n == 1, else a list."""
        x = np.random.rand(n)
        bucket = np.floor(self.n * x)
        frac = self.n * x - bucket
        bucket = bucket.astype(np.int32)
        # Keep the bucket itself below its threshold, otherwise its alias.
        draws = [bucket[k] if frac[k] < self.U[bucket[k]] else self.K[bucket[k]]
                 for k in range(n)]
        return draws[0] if n == 1 else draws

Download .txt
gitextract_scq2ywej/

├── .gitignore
├── data/
│   └── co-authorship_graph.pkl
├── line.py
├── model.py
├── readme.md
└── utils.py
Download .txt
SYMBOL INDEX (12 symbols across 3 files)

FILE: line.py
  function main (line 10) | def main():
  function train (line 28) | def train(args):
  function test (line 65) | def test(args):

FILE: model.py
  class LINEModel (line 4) | class LINEModel:
    method __init__ (line 5) | def __init__(self, args):

FILE: utils.py
  class DBLPDataLoader (line 5) | class DBLPDataLoader:
    method __init__ (line 6) | def __init__(self, graph_file):
    method fetch_batch (line 28) | def fetch_batch(self, batch_size=16, K=10, edge_sampling='atlas', node...
    method embedding_mapping (line 61) | def embedding_mapping(self, embedding):
  class AliasSampling (line 65) | class AliasSampling:
    method __init__ (line 69) | def __init__(self, prob):
    method sampling (line 88) | def sampling(self, n=1):
Condensed preview — 6 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (10K chars).
[
  {
    "path": ".gitignore",
    "chars": 32,
    "preview": ".idea\n__pycache__\nplayground.py\n"
  },
  {
    "path": "line.py",
    "chars": 2860,
    "preview": "import tensorflow as tf\nimport numpy as np\nimport argparse\nfrom model import LINEModel\nfrom utils import DBLPDataLoader\n"
  },
  {
    "path": "model.py",
    "chars": 1744,
    "preview": "import tensorflow as tf\n\n\nclass LINEModel:\n    def __init__(self, args):\n        self.u_i = tf.placeholder(name='u_i', d"
  },
  {
    "path": "readme.md",
    "chars": 1176,
    "preview": "# LINE in TensorFlow\n\nTensorFlow implementation of paper _LINE: Large-scale Information Network Embedding_ by Jian Tang,"
  },
  {
    "path": "utils.py",
    "chars": 4102,
    "preview": "import networkx as nx\nimport numpy as np\n\n\nclass DBLPDataLoader:\n    def __init__(self, graph_file):\n        self.g = nx"
  }
]

// ... and 1 more files (download for full content)

About this extraction

This page contains the full source code of the snowkylin/line GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 6 files (9.7 KB), approximately 2.4k tokens, and a symbol index with 12 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!