[
  {
    "path": ".classpath",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<classpath>\n\t<classpathentry kind=\"src\" path=\"src\"/>\n\t<classpathentry kind=\"lib\" path=\"lib/happy.coding.utils.jar\"/>\n\t<classpathentry kind=\"lib\" path=\"lib/json-simple.jar\"/>\n\t<classpathentry kind=\"con\" path=\"org.eclipse.jdt.launching.JRE_CONTAINER\"/>\n\t<classpathentry kind=\"output\" path=\"bin\"/>\n</classpath>\n"
  },
  {
    "path": ".project",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<projectDescription>\n\t<name>sigir16-eals</name>\n\t<comment></comment>\n\t<projects>\n\t</projects>\n\t<buildSpec>\n\t\t<buildCommand>\n\t\t\t<name>org.eclipse.jdt.core.javabuilder</name>\n\t\t\t<arguments>\n\t\t\t</arguments>\n\t\t</buildCommand>\n\t</buildSpec>\n\t<natures>\n\t\t<nature>org.eclipse.jdt.core.javanature</nature>\n\t</natures>\n</projectDescription>\n"
  },
  {
    "path": "README.md",
    "content": "# sigir16-eals\nExperiment code for the SIGIR'16 paper \"Fast Matrix Factorization for Online Recommendation with Implicit Feedback\".\n\n"
  },
  {
    "path": "data/README.md",
    "content": "The Amazon dataset is too large to include here. Please email xiangnanhe@gmail.com to request the data if needed.\n"
  },
  {
    "path": "src/algorithms/ItemKNN.java",
    "content": "package algorithms;\n\nimport java.util.ArrayList;\nimport java.util.HashMap;\nimport java.util.HashSet;\nimport java.util.Map;\n\nimport utils.CommonUtils;\nimport data_structure.Rating;\nimport data_structure.SparseMatrix;\nimport data_structure.SparseVector;\n\n/**\n * Implement ItemKNN method for topK recommendation, as described in:\n * Collaborative filtering for implicit feedback datasets. \n * By Yifan Hu , Yehuda Koren , Chris Volinsky.\n * In IEEE ICDM'2008.\n * \n * @author xiangnanhe\n *\n */\npublic class ItemKNN extends TopKRecommender {\n\n\t/** Similarity matrix of item-item . */\n\tpublic SparseMatrix similarity;\n\n\t/** K neighbors to consider for each item */\n\tprivate int K = 0;\n\t\n\t/** Cache the L2 length for each item. */\n\tdouble[] lengths;\n\t\n\tpublic ItemKNN(SparseMatrix trainMatrix, ArrayList<Rating> testRatings, \n\t\t\tint topK, int threadNum, int K) {\n\t\tsuper(trainMatrix, testRatings, topK, threadNum);\n\t\tthis.K = K;\n\t\tthis.similarity = new SparseMatrix(itemCount, itemCount);\n\t}\n\n\tpublic void buildModel() {\n\t\t// The length cache\n\t\tlengths = new double[itemCount];\n\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\tlengths[i] = Math.sqrt(trainMatrix.getColRef(i).squareSum());\n\t\t}\n\t\t\n\t\t// Run model multi-threads splitted by items.\n\t\tItemKNNThread[] threads = new ItemKNNThread[threadNum];\n\t\tfor (int t = 0; t < threadNum; t ++) {\n\t\t\tint startItem = (itemCount / threadNum) * t;\n\t\t\tint endItem = (t == threadNum-1) ? 
itemCount : \n\t\t\t\t(itemCount / threadNum) * (t + 1);\n\t\t\tthreads[t] = new ItemKNNThread(this, startItem, endItem);\n\t\t\tthreads[t].start();\n\t\t}\n\t\t\n\t\t// Wait until all threads are finished.\n\t\tfor (int t = 0; t < threads.length; t++) { \n\t\t  try {\n\t\t\t\tthreads[t].join();\n\t\t\t} catch (InterruptedException e) {\n\t\t\t\tSystem.err.println(\"InterruptException was caught: \" + e.getMessage());\n\t\t\t}\n\t\t}\n\t}\n\t\n\tprotected void buildModel_items(int startItem, int endItem) {\n\t\t// Build the similarity matrix for selected items.\n\t\tfor (int i = startItem; i < endItem; i ++) {\n\t\t\tHashMap<Integer, Double> map_item_score = new HashMap<Integer, Double>();\n\t\t\tfor (int j = 0; j < itemCount & j != i; j ++) {\n\t\t\t\t// Cosine similarity\n\t\t\t\tdouble score = trainMatrix.getColRef(i).innerProduct(trainMatrix.getColRef(j));\n\t\t\t\tif (score != 0) {\n\t\t\t\t\tscore /= (lengths[i] * lengths[j]);\n\t\t\t\t\tmap_item_score.put(j, score);\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (K <= 0) {  // All neighbors\n\t\t\t\tfor (int j : map_item_score.keySet()) {\n\t\t\t\t\tsimilarity.setValue(i, j, map_item_score.get(j));\n\t\t\t\t}\n\t\t\t} else {  // Only K nearest neighbors\n\t\t\t\tfor (int j : CommonUtils.TopKeysByValue(map_item_score, K, null)) {\n\t\t\t\t\tsimilarity.setValue(i, j, map_item_score.get(j));\n\t\t\t\t}\n\t\t\t} // end if\n\t\t} // end for\n\t}\n\t\n\tpublic double predict(int u, int i) {\n\t\treturn trainMatrix.getRowRef(u).innerProduct(similarity.getRowRef(i));\n\t}\n\n\t@Override\n\tpublic void updateModel(int u, int i) {\n\t\t// TODO Implement SIGMOD15 paper\n\t\t\n\t}\n}\n\n// Thread for building model for ItemKNN.\nclass ItemKNNThread extends Thread {\n\tItemKNN model;\n\tint startItem;\n\tint endItem;\n\n\tpublic ItemKNNThread(ItemKNN model, int startItem, int endItem) {\n\t\tthis.model = model;\n\t\tthis.startItem = startItem;\n\t\tthis.endItem = endItem;\n\t}\n\t\n\tpublic void run() 
{\n\t\tmodel.buildModel_items(startItem, endItem);\n\t}\n}\n"
  },
  {
    "path": "src/algorithms/ItemPopularity.java",
    "content": "package algorithms;\r\n\r\nimport java.util.ArrayList;\r\nimport java.util.HashMap;\r\n\r\nimport data_structure.Rating;\r\nimport data_structure.SparseMatrix;\r\n\r\npublic class ItemPopularity extends TopKRecommender {\r\n\r\n\tdouble[] item_popularity;\r\n\tpublic ItemPopularity(SparseMatrix trainMatrix, ArrayList<Rating> testRatings, \r\n\t\t\tint topK, int threadNum) {\r\n\t\tsuper(trainMatrix, testRatings, topK, threadNum);\r\n\t\titem_popularity = new double[itemCount];\r\n\t}\r\n\t\r\n\tpublic void buildModel() {\r\n\t\tfor (int i = 0; i < itemCount; i++) {\r\n\t\t\t// Measure popularity by number of reviews received.\r\n\t\t\titem_popularity[i] = trainMatrix.getColRef(i).itemCount();\r\n\t\t}\r\n\t}\r\n\t\r\n\tpublic double predict(int u, int i) {\r\n\t\treturn item_popularity[i];\r\n\t}\r\n\r\n\t@Override\r\n\tpublic void updateModel(int u, int i) {\r\n\t\ttrainMatrix.setValue(u, i, 1);\r\n\t\titem_popularity[i] += 1;\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/algorithms/MF_ALS.java",
    "content": "package algorithms;\n\nimport data_structure.Rating;\nimport data_structure.SparseMatrix;\nimport data_structure.DenseVector;\nimport data_structure.DenseMatrix;\nimport data_structure.Pair;\nimport data_structure.SparseVector;\nimport happy.coding.math.Randoms;\n\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Random;\n\nimport utils.Printer;\n\n/**\n * ALS algorithm of the ICDM'09 paper:\n * Yifan Hu etc. Collaborative Filtering for Implicit Feedback Datasets. \n * @author xiangnanhe\n */\npublic class MF_ALS extends TopKRecommender {\n\t/** Model priors to set. */\n\tint factors = 10; \t// number of latent factors.\n\tint maxIter = 100; \t// maximum iterations.\n\tdouble w0 = 0.01;\t// weight for 0s\n\tdouble reg = 0.01; \t// regularization parameters\n  double init_mean = 0;  // Gaussian mean for init V\n  double init_stdev = 0.01; // Gaussian std-dev for init V\n\t\n  /** Model parameters to learn */\n  DenseMatrix U;\t// latent vectors for users\n  DenseMatrix V;\t// latent vectors for items\n  \n  /** Caches */\n\tDenseMatrix SU;\n\tDenseMatrix SV;\n\t\n\tboolean showProgress;\n\tboolean showLoss;\n\t\n\tpublic MF_ALS(SparseMatrix trainMatrix, ArrayList<Rating> testRatings, \n\t\t\tint topK, int threadNum, int factors, int maxIter, double w0, double reg, \n\t\t\tdouble init_mean, double init_stdev, boolean showProgress, boolean showLoss) {\n\t\tsuper(trainMatrix, testRatings, topK, threadNum);\n\t\tthis.factors = factors;\n\t\tthis.maxIter = maxIter;\n\t\tthis.w0 = w0 / itemCount;\n\t\tthis.reg = reg;\n\t\tthis.init_mean = init_mean;\n\t\tthis.init_stdev = init_stdev;\n\t\tthis.showProgress = showProgress;\n\t\tthis.showLoss = showLoss;\n\t\tthis.initialize();\n\t}\n\t\n\t//remove\n\tpublic void setUV(DenseMatrix U, DenseMatrix V) {\n\t\tthis.U = U.clone();\n\t\tthis.V = V.clone();\n\t\tSU = U.transpose().mult(U);\n\t\tSV = V.transpose().mult(V);\n\t}\n\t\n\tprivate void initialize() {\n\t\tU = new 
DenseMatrix(userCount, factors);\n\t\tV = new DenseMatrix(itemCount, factors);\n\t\tU.init(init_mean, init_stdev);\n\t\tV.init(init_mean, init_stdev);\n\t\t\n\t\tSU = U.transpose().mult(U);\n\t\tSV = V.transpose().mult(V);\n\t}\n\t\n\t// Implement the ALS algorithm of the ICDM'09 paper\n\tpublic void buildModel() {\n\t\tSystem.out.println(\"Run for MF_ALS\");\n\t\t\n\t\tdouble loss_pre = Double.MAX_VALUE;\n\t\tfor (int iter = 0; iter < maxIter; iter ++) {\n\t\t\tLong start = System.currentTimeMillis();\n\t\t\t\n\t\t\t// Update user factors\n\t\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\t\tupdate_user(u);\n\t\t\t}\n\t\t\t\n\t\t\t// Update item factors\n\t\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\t\tupdate_item(i);\n\t\t\t}\n\t\t\t\n\t\t\t// Show progress\n\t\t\tif (showProgress)\n\t\t\t\tshowProgress(iter, start, testRatings);\n\t\t\t// Show loss\n\t\t\tif (showLoss)\n\t\t\t\tloss_pre = showLoss(iter, start, loss_pre);\n\t\t\t\n\t\t}\n\t}\n\t\n\t// Run model for one iteration\n\tpublic void runOneIteration() {\n\t\t// Update user latent vectors\n\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\tupdate_user(u);\n\t\t}\n\t\t\n\t\t// Update item latent vectors\n\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\tupdate_item(i);\n\t\t}\n\t}\n\t\n\tprivate void update_user(int u) {\n\t\tArrayList<Integer> itemList = trainMatrix.getRowRef(u).indexList();\n\t\t// Get matrix Au\n\t\tDenseMatrix Au = SU.scale(w0);\n\t\tfor (int k1 = 0; k1 < factors; k1 ++) {\n\t\t\tfor (int k2 = 0; k2 < factors; k2 ++) {\n\t\t\t\tfor (int i : itemList)\n\t\t\t\t\tAu.add(k1, k2, V.get(i, k1) * V.get(i, k2) * (1 - w0));\n\t\t\t}\n\t\t} \n\t\t// Get vector du\n\t\tDenseVector du = new DenseVector(factors);\n\t\tfor (int k = 0; k < factors; k ++) {\n\t\t\tfor (int i : itemList)\n\t\t\t\tdu.add(k, V.get(i, k) * trainMatrix.getValue(u, i));\n\t\t}\n\t\t// Matrix inversion to get the new embedding\n\t\tfor (int k = 0; k < factors; k ++) { // consider the regularizer\n\t\t\tAu.add(k, k, 
reg);\n\t\t}\n\t\tDenseVector newVector = Au.inv().mult(du);\n\t\t\n\t\t// Update the SU cache\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = SU.get(f, k) - U.get(u, f) * U.get(u, k)\n\t\t\t\t\t\t+ newVector.get(f) * newVector.get(k);\n\t\t\t\tSU.set(f, k, val);\n\t\t\t\tSU.set(k, f, val);\n\t\t\t}\n\t\t}\n\t\t// Update parameters\n\t\tfor (int k = 0; k < factors; k ++) {\n\t\t\tU.set(u, k, newVector.get(k));\n\t\t}\n\t}\n\t\n\tprivate void update_item(int i) {\n\t\tArrayList<Integer> userList = trainMatrix.getColRef(i).indexList();\n\t\t// Get matrix Ai\n\t\tDenseMatrix Ai = SV.scale(w0);\n\t\tfor (int k1 = 0; k1 < factors; k1 ++) {\n\t\t\tfor (int k2 = 0; k2 < factors; k2 ++) {\n\t\t\t\tfor (int u : userList)\n\t\t\t\t\tAi.add(k1, k2, U.get(u, k1) * U.get(u, k2) * (1 - w0));\n\t\t\t}\n\t\t}\n\t\t// Get vector di\n\t\tDenseVector di = new DenseVector(factors);\n\t\tfor (int k = 0; k < factors; k ++) {\n\t\t\tfor (int u : userList)\n\t\t\t\tdi.add(k, U.get(u, k) * trainMatrix.getValue(u, i));\n\t\t}\n\t\t// Matrix inversion to get the new embedding\n\t\tfor (int k = 0; k < factors; k ++) { // consider the regularizer\n\t\t\tAi.add(k, k, reg);\n\t\t}\n\t\tDenseVector newVector = Ai.inv().mult(di);\n\t\t\n\t\t// Update the SV cache\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = SV.get(f, k) - V.get(i, f) * V.get(i, k)\n\t\t\t\t\t\t+ newVector.get(f) * newVector.get(k);\n\t\t\t\tSV.set(f, k, val);\n\t\t\t\tSV.set(k, f, val);\n\t\t\t}\n\t\t}\n\t\t\n\t\t// Update parameters\n\t\tfor (int k = 0; k < factors; k ++) {\n\t\t\tV.set(i, k, newVector.get(k));\n\t\t}\n\t}\n\t\n\tpublic double showLoss(int iter, long start, double loss_pre) {\n\t\tlong start1 = System.currentTimeMillis();\n\t\tdouble loss_cur = loss();\n\t\tString symbol = loss_pre >= loss_cur ? 
\"-\" : \"+\";\n\t\tSystem.out.printf(\"Iter=%d [%s]\\t [%s]loss: %.4f [%s]\\n\", iter, \n\t\t\t\tPrinter.printTime(start1 - start), symbol, loss_cur, \n\t\t\t\tPrinter.printTime(System.currentTimeMillis() - start1));\n\t\treturn loss_cur;\n\t}\n\t\n\t// Fast way to calculate the loss function\n\tpublic double loss() {\n\t\t// Init the SV cache for fast calculation\n\t\tDenseMatrix SV = new DenseMatrix(factors, factors);\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = 0;\n\t\t\t\tfor (int i = 0; i < itemCount; i ++)\n\t\t\t\t\tval += V.get(i, f) * V.get(i, k);\n\t\t\t\tSV.set(f, k, val);\n\t\t\t\tSV.set(k, f, val);\n\t\t\t}\n\t\t}\n\t\t\n\t\tdouble L = reg * (U.squaredSum() + V.squaredSum());\n\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\tdouble l = 0;\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList()) {\n\t\t\t\tl += Math.pow(trainMatrix.getValue(u, i) - predict(u, i), 2);\n\t\t\t}\n\t\t\tl *= (1 - w0);\n\t\t\tl += w0 * SV.mult(U.row(u, false)).inner(U.row(u, false));\n\t\t\tL += l;\n\t\t}\n\t\t\n\t\treturn L;\n\t}\n\t\n\t@Override\n\tpublic double predict(int u, int i) {\n\t\treturn U.row(u, false).inner(V.row(i, false));\n\t}\n\n\t@Override\n\tpublic void updateModel(int u, int i) {\n\t\ttrainMatrix.setValue(u, i, 1);\n\t\t\n\t\tfor (int iter = 0; iter < maxIterOnline; iter ++) {\n\t\t\tupdate_user(u);\n\t\t\t\n\t\t\tupdate_item(i);\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "src/algorithms/MF_CD.java",
    "content": "package algorithms;\n\nimport data_structure.Rating;\nimport data_structure.SparseMatrix;\nimport data_structure.DenseVector;\nimport data_structure.DenseMatrix;\nimport data_structure.Pair;\nimport data_structure.SparseVector;\nimport happy.coding.math.Randoms;\n\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Random;\n\nimport utils.Printer;\n\n/**\n * Coordinate descent algorithm of the KDD'15 paper:\n * Robin Devooght etc. Dynamic Matrix Factorization with Priors on Unknown Values.\n * @author xiangnanhe\n */\npublic class MF_CD extends TopKRecommender {\n\t/** Model priors to set. */\n\tint factors = 10; \t// number of latent factors.\n\tint maxIter = 100; \t// maximum iterations.\n\tdouble w0 = 0.01;\t// weight for 0s\n\tdouble reg = 0.01; \t// regularization parameters\n  double init_mean = 0;  // Gaussian mean for init V\n  double init_stdev = 0.01; // Gaussian std-dev for init V\n\t\n  /** Priors for line search */\n\tint LSMaxIter\t= 10;  // max iteration of the line search. Default is 10\n\tdouble Alpha = 0.3; //\tparameter of line search. In the range (0, 0.5).\n\tdouble Beta\t= 0.3; //\tparameter of line search. 
In the range (0, 1.0).\n  \n  /** Model parameters to learn */\n  public DenseMatrix U;\t// latent vectors for users\n  public DenseMatrix V;\t// latent vectors for items\n\t\n  /** Caches */\n  DenseMatrix SU;\n  DenseMatrix SV;\n  \n  boolean showProgress;\n  boolean showLoss;\n  \n  // weight for each positive instance in trainMatrix\n  SparseMatrix W; \n  \n  // weight of new instance in online learning\n  public double w_new = 1;\n  \n\tpublic MF_CD(SparseMatrix trainMatrix, ArrayList<Rating> testRatings, \n\t\t\tint topK, int threadNum, int factors, int maxIter, double w0, double reg,\n\t\t\tdouble init_mean, double init_stdev, boolean showProgress, boolean showLoss) {\n\t\tsuper(trainMatrix, testRatings, topK, threadNum);\n\t\tthis.factors = factors;\n\t\tthis.maxIter = maxIter;\n\t\tthis.w0 = w0 / itemCount;\n\t\tthis.reg = reg;\n\t\tthis.init_mean = init_mean;\n\t\tthis.init_stdev = init_stdev;\n\t\tthis.showProgress = showProgress;\n\t\tthis.showLoss = showLoss;\n\t\tthis.initialize();\n\t\t\n\t\t// By default, the weight for positive instance is uniformly 1.\n\t\tW = new SparseMatrix(userCount, itemCount);\n\t\tfor (int u = 0; u < userCount; u ++)\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList())\n\t\t\t\tW.setValue(u, i, 1);\n\t}\n\t\n\tprivate void initialize() {\n\t\tU = new DenseMatrix(userCount, factors);\n\t\tV = new DenseMatrix(itemCount, factors);\n\t\tU.init(init_mean, init_stdev);\n\t\tV.init(init_mean, init_stdev);\n\t\t\n\t\tSU = U.transpose().mult(U);\n\t\tSV = V.transpose().mult(V);\n\t}\n\t\n\tpublic void setTrain(SparseMatrix trainMatrix) {\n\t\tthis.trainMatrix = new SparseMatrix(trainMatrix);\n\t\tW = new SparseMatrix(userCount, itemCount);\n\t\tfor (int u = 0; u < userCount; u ++)\n\t\t\tfor (int i : this.trainMatrix.getRowRef(u).indexList())\n\t\t\t\tW.setValue(u, i, 1);\n\t}\n\t\n\tpublic void setLSpriors(int LSMaxIter, double Alpha, double Beta) {\n\t\tthis.LSMaxIter = LSMaxIter;\n\t\tthis.Alpha = Alpha;\n\t\tthis.Beta = 
Beta;\n\t}\n\t\n\t// remove\n\tpublic void setUV(DenseMatrix U, DenseMatrix V) {\n\t\tthis.U = U.clone();\n\t\tthis.V = V.clone();\n\t\tSU = U.transpose().mult(U);\n\t\tSV = V.transpose().mult(V);\n\t}\n\t\n\t/**\n\t * Implement the CD algorithm of the KDD'15 papers\n\t */\n\tpublic void buildModel() {\n\t\t//System.out.println(\"Run for MF_CD.\");\n\t\t\n\t\tArrayList<Integer> shuffle_list = new ArrayList<Integer>();\n\t\tfor (int i = 0; i < itemCount + userCount; i ++)\n\t\t\tshuffle_list.add(i);\n\t\t\n\t\tdouble loss_pre = Double.MAX_VALUE;\n\t\tfor (int iter = 0; iter < maxIter; iter ++) {\n\t\t\tLong start = System.currentTimeMillis();\n\t\t\tCollections.shuffle(shuffle_list);\n\t\t\t\n\t\t\tfor (int index : shuffle_list) {\t\t\t\t\n\t\t\t\tif (index >= userCount)  // for an item\n\t\t\t\t\tupdate_item(index - userCount);\n\t\t\t\telse   // for a user\n\t\t\t\t\tupdate_user(index);\n\t\t\t}\n\t\t\t\n\t\t\t// Show progress\n\t\t\tif (showProgress)\n\t\t\t\tshowProgress(iter, start, testRatings);\n\t\t\t// Show loss\n\t\t\tif (showLoss)\n\t\t\t\tloss_pre = showLoss(iter, start, loss_pre);\n\n\t\t}  // end for iter\n\t}\n\t\n\t// Run model for one iteration\n\tpublic void runOneIteration() {\n\t\tArrayList<Integer> shuffle_list = new ArrayList<Integer>();\n\t\tfor (int i = 0; i < itemCount + userCount; i ++)\n\t\t\tshuffle_list.add(i);\n\t\tCollections.shuffle(shuffle_list);\n\t\t\n\t\tfor (int index : shuffle_list) {\t\t\t\t\n\t\t\tif (index >= userCount)  // for an item\n\t\t\t\tupdate_item(index - userCount);\n\t\t\telse   // for a user\n\t\t\t\tupdate_user(index);\n\t\t}\n\t}\n\t\n\t// Line search (book, Convex Optimization) for the best step size.\n\tprivate double linesearch(int index, DenseVector embedding, \n\t\t\tDenseVector gradient, int LSMaxIter, double Alpha, double Beta) {\n\t\tdouble step_size = 1.0;\n\t\tdouble init_error = error_row(index, embedding);\n\t\t\n\t\tfor (int iter = 0; iter < LSMaxIter; iter ++) {\n\t\t\t// Build new features (ie 
embedding) with current step size\n\t\t\tDenseVector newEmbedding = embedding.minus(gradient.scale(step_size));\n\t\t\t\n\t\t\t// Check if new features are good enough. If not reduce step size\n\t\t\tdouble new_error = error_row(index, newEmbedding);\n\t\t\tif (new_error > init_error - Alpha * step_size * gradient.squaredSum())\n\t\t\t\tstep_size *= Beta;\n\t\t\telse\n\t\t\t\tbreak;\n\t\t\t\n\t\t\t// Too many iterations, return step_size = 0\n\t\t\tif (iter == LSMaxIter - 1) {\n\t\t\t\tstep_size = 0;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\t\n\t\treturn step_size;\n\t}\n\t\n\tprivate double error_row(int index, DenseVector embedding) {\n\t\tdouble err = 0;\n\t\tif (index >= userCount) {  // for an item\n\t\t\tint i = index - userCount;\n\t\t\tfor (int u : trainMatrix.getColRef(i).indexList()) {\n\t\t\t\tdouble prediction = U.row(u, false).inner(embedding);\n\t\t\t\terr += W.getValue(u, i) * Math.pow(trainMatrix.getValue(u, i) - prediction, 2);\n\t\t\t}\n\t\t\terr *= (1 - w0);\n\t\t\terr += w0 * SU.mult(embedding).inner(embedding);\n\t\t\terr += reg * embedding.squaredSum();\n\t\t\t\n\t\t} else {  // for a user\n\t\t\tint u = index;\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList()) {\n\t\t\t\tdouble prediction = V.row(i, false).inner(embedding);\n\t\t\t\terr += W.getValue(u, i) * Math.pow(trainMatrix.getValue(u, i) - prediction, 2);\n\t\t\t}\n\t\t\terr *= (1 - w0);\n\t\t\terr += w0 * SV.mult(embedding).inner(embedding);\n\t\t\terr += reg * embedding.squaredSum();\n\t\t}\n\t\treturn err;\n\t}\n\t\n\tprivate void update_user(int u) {\n\t\tDenseVector embedding = U.row(u, false);\n\t\t// Calculate the gradient\n\t\tDenseVector gradient = SV.mult(embedding).scale(w0);\n\t\tfor (int i : trainMatrix.getRowRef(u).indexList()) {\n\t\t\tdouble mul = W.getValue(u, i) * (predict(u, i) * (1 - w0) - trainMatrix.getValue(u, i));\n\t\t\tgradient.selfAdd(V.row(i, false).scale(mul));\n\t\t}\n\t\tgradient.selfAdd(embedding.scale(reg));  // with regularizer\n\t\t\n\t\t// Line search 
for learning rate\n\t\tdouble lr = linesearch(u, embedding, gradient, LSMaxIter, Alpha, Beta);\n\t\t\n\t\t// Update S cache before updating parameters\n\t\tDenseVector new_embedding = embedding.minus(gradient.scale(lr));\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = SU.get(f, k) - embedding.get(f) * embedding.get(k)\n\t\t\t\t\t\t+ new_embedding.get(f) * new_embedding.get(k);\n\t\t\t\tSU.set(f, k, val);\n\t\t\t\tSU.set(k, f, val);\n\t\t\t}\n\t\t}\n\t\t\n\t\t// Parameter update\n\t\tfor (int f = 0; f < factors; f ++)\n\t\t\tembedding.set(f, new_embedding.get(f));\n\t}\n\t\n\tprivate void update_item(int i) {\n\t\tDenseVector embedding = V.row(i, false);\n\t\t// Calculate the gradient\n\t\tDenseVector gradient = SU.mult(embedding).scale(w0);\n\t\tfor (int u : trainMatrix.getColRef(i).indexList()) {\n\t\t\tdouble mul = W.getValue(u, i) * (predict(u, i) * (1 - w0) - trainMatrix.getValue(u, i));\n\t\t\tgradient.selfAdd(U.row(u, false).scale(mul));\n\t\t}\n\t\tgradient.selfAdd(embedding.scale(reg));  // with regularizer\n\t\t\n\t\t// Line search for learning rate\n\t\tdouble lr = linesearch(userCount + i, embedding, gradient, LSMaxIter, Alpha, Beta);\n\t\t\n\t\t// Update SV cache\n\t\tDenseVector new_embedding = embedding.minus(gradient.scale(lr));\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = SV.get(f, k) - embedding.get(f) * embedding.get(k)\n\t\t\t\t\t\t+ new_embedding.get(f) * new_embedding.get(k);\n\t\t\t\tSV.set(f, k, val);\n\t\t\t\tSV.set(k, f, val);\n\t\t\t}\n\t\t}\n\t\t\n\t\t// Parameter update\n\t\tfor (int f = 0; f < factors; f ++)\n\t\t\tembedding.set(f, new_embedding.get(f));\n\t}\n\t\n\tpublic double showLoss(int iter, long start, double loss_pre) {\n\t\tlong start1 = System.currentTimeMillis();\n\t\tdouble loss_cur = loss();\n\t\tString symbol = loss_pre >= loss_cur ? 
\"-\" : \"+\";\n\t\tSystem.out.printf(\"Iter=%d [%s]\\t [%s]loss: %.4f [%s]\\n\", iter, \n\t\t\t\tPrinter.printTime(start1 - start), symbol, loss_cur, \n\t\t\t\tPrinter.printTime(System.currentTimeMillis() - start1));\n\t\treturn loss_cur;\n\t}\n\t\n\t// Fast way to calculate the loss function\n\tpublic double loss() {\n\t\tdouble L = reg * (U.squaredSum() + V.squaredSum());\n\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\tdouble l = 0;\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList()) {\n\t\t\t\tl +=  W.getValue(u, i) * Math.pow(trainMatrix.getValue(u, i) - predict(u, i), 2);\n\t\t\t}\n\t\t\tl *= (1 - w0);\n\t\t\tl += w0 * SV.mult(U.row(u, false)).inner(U.row(u, false));\n\t\t\tL += l;\n\t\t}\n\t\t\n\t\treturn L;\n\t}\n\t\n\t@Override\n\tpublic double predict(int u, int i) {\n\t\treturn U.row(u, false).inner(V.row(i, false));\n\t}\n\n\t@Override\n\tpublic void updateModel(int u, int i) {\n\t\ttrainMatrix.setValue(u, i, 1);\n\t\tW.setValue(u, i, w_new);\n\t\t\n\t\tfor (int iter = 0; iter < maxIterOnline; iter ++) {\n\t\t\tupdate_user(u);\n\t\t\tupdate_item(i);\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "src/algorithms/MF_fastALS.java",
    "content": "package algorithms;\n\nimport data_structure.Rating;\nimport data_structure.SparseMatrix;\nimport data_structure.DenseVector;\nimport data_structure.DenseMatrix;\nimport data_structure.Pair;\nimport data_structure.SparseVector;\nimport happy.coding.math.Randoms;\n\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Random;\nimport java.util.HashMap;\n\nimport utils.Printer;\n\n/**\n * Fast ALS for weighted matrix factorization (with imputation)\n * @author xiangnanhe\n */\npublic class MF_fastALS extends TopKRecommender {\n\t/** Model priors to set. */\n\tint factors = 10; \t// number of latent factors.\n\tint maxIter = 500; \t// maximum iterations.\n\tdouble reg = 0.01; \t// regularization parameters\n\tdouble w0 = 1;\n  double init_mean = 0;  // Gaussian mean for init V\n  double init_stdev = 0.01; // Gaussian std-dev for init V\n\t\n  /** Model parameters to learn */\n  public DenseMatrix U;\t// latent vectors for users\n  public DenseMatrix V;\t// latent vectors for items\n\t\n  /** Caches */\n  DenseMatrix SU;\n  DenseMatrix SV;\n  double[] prediction_users, prediction_items;\n  double[] rating_users, rating_items;\n  double[] w_users, w_items;\n  \n  boolean showProgress;\n  boolean showLoss;\n  \n  // weight for each positive instance in trainMatrix\n  SparseMatrix W; \n  \n  // weight for negative instances on item i.\n  double[] Wi;\n  \n  // weight of new instance in online learning\n  public double w_new = 1;\n  \n\tpublic MF_fastALS(SparseMatrix trainMatrix, ArrayList<Rating> testRatings, \n\t\t\tint topK, int threadNum, int factors, int maxIter, double w0, double alpha, double reg, \n\t\t\tdouble init_mean, double init_stdev, boolean showProgress, boolean showLoss) {\n\t\tsuper(trainMatrix, testRatings, topK, threadNum);\n\t\tthis.factors = factors;\n\t\tthis.maxIter = maxIter;\n\t\tthis.w0 = w0;\n\t\tthis.reg = reg;\n\t\tthis.init_mean = init_mean;\n\t\tthis.init_stdev = init_stdev;\n\t\tthis.showLoss = 
showLoss;\n\t\tthis.showProgress = showProgress;\n\t\t\n\t\t// Set the Wi as a decay function w0 * pi ^ alpha\n\t\tdouble sum = 0, Z = 0;\n\t\tdouble[] p = new double[itemCount];\n\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\tp[i] = trainMatrix.getColRef(i).itemCount();\n\t\t\tsum += p[i];\n\t\t}\n\t\t// convert p[i] to probability \n\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\tp[i] /= sum;\n\t\t\tp[i] = Math.pow(p[i], alpha);\n\t\t\tZ += p[i];\n\t\t}\n\t\t// assign weight\n\t\tWi = new double[itemCount];\n\t\tfor (int i = 0; i < itemCount; i ++)\n\t\t\tWi[i] = w0 * p[i] / Z;\n\t\t\n\t\t// By default, the weight for positive instance is uniformly 1.\n\t\tW = new SparseMatrix(userCount, itemCount);\n\t\tfor (int u = 0; u < userCount; u ++)\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList())\n\t\t\t\tW.setValue(u, i, 1);\n\t\t\n\t\t// Init caches\n\t\tprediction_users = new double[userCount];\n\t\tprediction_items = new double[itemCount];\n\t\trating_users = new double[userCount];\n\t\trating_items = new double[itemCount];\n\t\tw_users = new double[userCount];\n\t\tw_items = new double[itemCount];\n\t\t\n\t\t// Init model parameters\n\t\tU = new DenseMatrix(userCount, factors);\n\t\tV = new DenseMatrix(itemCount, factors);\n\t\tU.init(init_mean, init_stdev);\n\t\tV.init(init_mean, init_stdev);\n\t\tinitS();\n\t}\n\t\n\tpublic void setTrain(SparseMatrix trainMatrix) {\n\t\tthis.trainMatrix = new SparseMatrix(trainMatrix);\n\t\tW = new SparseMatrix(userCount, itemCount);\n\t\tfor (int u = 0; u < userCount; u ++)\n\t\t\tfor (int i : this.trainMatrix.getRowRef(u).indexList())\n\t\t\t\tW.setValue(u, i, 1);\n\t}\n\t\n\t// Init SU and SV\n\tprivate void initS() {\n\t\tSU = U.transpose().mult(U);\n\t\t// Init SV as V^T Wi V\n\t\tSV = new DenseMatrix(factors, factors);\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = 0;\n\t\t\t\tfor (int i = 0; i < itemCount; i ++) \n\t\t\t\t\tval += V.get(i, f) * V.get(i, 
k) * Wi[i];\n\t\t\t\tSV.set(f, k, val);\n\t\t\t\tSV.set(k, f, val);\n\t\t\t}\n\t\t}\n\t}\n\t\n\t//remove\n\tpublic void setUV(DenseMatrix U, DenseMatrix V) {\n\t\tthis.U = U.clone();\n\t\tthis.V = V.clone();\n\t\tinitS();\n\t}\n\n\tpublic void buildModel() {\t\t\n\t\t//System.out.println(\"Run for FastALS. \");\n\t\tdouble loss_pre = Double.MAX_VALUE;\n\t\tfor (int iter = 0; iter < maxIter; iter ++) {\n\t\t\tLong start = System.currentTimeMillis();\n\t\t\t\n\t\t\t// Update user latent vectors\n\t\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\t\tupdate_user(u);\n\t\t\t}\n\t\t\t\n\t\t\t// Update item latent vectors\n\t\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\t\tupdate_item(i);\n\t\t\t}\n\t\t\t\n\t\t\t// Show progress\n\t\t\tif (showProgress)\n\t\t\t\tshowProgress(iter, start, testRatings);\n\t\t\t// Show loss\n\t\t\tif (showLoss)\n\t\t\t\tloss_pre = showLoss(iter, start, loss_pre);\n\t\t\t\n\t\t} // end for iter\n\t\t\n\t}\n\t\n\t// Run model for one iteration\n\tpublic void runOneIteration() {\n\t\t// Update user latent vectors\n\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\tupdate_user(u);\n\t\t}\n\t\t\n\t\t// Update item latent vectors\n\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\tupdate_item(i);\n\t\t}\n\t}\n\t\n\tprotected void update_user(int u) {\n\t\tArrayList<Integer> itemList = trainMatrix.getRowRef(u).indexList();\n\t\tif (itemList.size() == 0)\t\treturn;\t// user has no ratings\n\t\t// prediction cache for the user\n\t\tfor (int i : itemList) {\n\t\t\tprediction_items[i] = predict(u, i);\n\t\t\trating_items[i] = trainMatrix.getValue(u, i);\n\t\t\tw_items[i] = W.getValue(u, i);\n\t\t}\n\t\t\n\t\tDenseVector oldVector = U.row(u);\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tdouble numer = 0, denom = 0;\n\t\t\t// O(K) complexity for the negative part\n\t\t\tfor (int k = 0; k < factors; k ++) {\n\t\t\t\tif (k != f)\n\t\t\t\t\tnumer -= U.get(u, k) * SV.get(f, k);\n\t\t\t}\n\t\t\t//numer *= w0;\n\t\t\t\n\t\t\t// O(Nu) complexity for the 
positive part\n\t\t\tfor (int i : itemList) {\n\t\t\t\tprediction_items[i] -= U.get(u, f) * V.get(i, f);\n\t\t\t\tnumer +=  (w_items[i]*rating_items[i] - (w_items[i]-Wi[i]) * prediction_items[i]) * V.get(i, f);\n\t\t\t\tdenom += (w_items[i]-Wi[i]) * V.get(i, f) * V.get(i, f);\n\t\t\t}\n\t\t\tdenom += SV.get(f, f) + reg;\n\t\t\t\n\t\t\t// Parameter Update\n\t\t\tU.set(u, f, numer / denom);\n\t\t\t\n\t\t\t// Update the prediction cache\n\t\t\tfor (int i : itemList) \n\t\t\t\tprediction_items[i] += U.get(u, f) * V.get(i, f);\n\t\t} // end for f\n\t\t\n\t\t// Update the SU cache\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = SU.get(f, k) - oldVector.get(f) * oldVector.get(k)\n\t\t\t\t\t\t+ U.get(u, f) * U.get(u, k);\n\t\t\t\tSU.set(f, k, val);\n\t\t\t\tSU.set(k, f, val);\n\t\t\t}\n\t\t} // end for f\n\t}\n\t\n\tprotected void update_item(int i) {\n\t\tArrayList<Integer> userList = trainMatrix.getColRef(i).indexList();\n\t\tif (userList.size() == 0)\t\treturn; // item has no ratings. 
\n\t\t// prediction cache for the item\n\t\tfor (int u : userList) {\n\t\t\tprediction_users[u] = predict(u, i);\n\t\t\trating_users[u] = trainMatrix.getValue(u, i);\n\t\t\tw_users[u] = W.getValue(u, i);\n\t\t}\n\t\t\n\t\tDenseVector oldVector = V.row(i);\n\t\tfor (int f = 0; f < factors; f++) {\n\t\t\t// O(K) complexity for the w0 part\n\t\t\tdouble numer = 0, denom = 0;\n\t\t\tfor (int k = 0; k < factors;  k ++) {\n\t\t\t\tif (k != f)\n\t\t\t\t\tnumer -= V.get(i, k) * SU.get(f, k);\n\t\t\t}\n\t\t\tnumer *= Wi[i];\n\t\t\t\n\t\t\t// O(Ni) complexity for the positive ratings part\n\t\t\tfor (int u : userList) {\n\t\t\t\tprediction_users[u] -= U.get(u, f) * V.get(i, f);\n\t\t\t\tnumer += (w_users[u]*rating_users[u] - (w_users[u]-Wi[i]) * prediction_users[u]) * U.get(u, f);\n\t\t\t\tdenom += (w_users[u]-Wi[i]) * U.get(u, f) * U.get(u, f);\n\t\t\t}\n\t\t\tdenom += Wi[i] * SU.get(f, f) + reg;\n\t\t\t\n\t\t\t// Parameter update\n\t\t\tV.set(i, f, numer / denom);\n\t\t\t// Update the prediction cache for the item\n\t\t\tfor (int u : userList)\n\t\t\t\tprediction_users[u] += U.get(u, f) * V.get(i, f);\n\t\t} // end for f\n\t\t\n\t\t// Update the SV cache\n\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\tdouble val = SV.get(f, k) - oldVector.get(f) * oldVector.get(k) * Wi[i]\n\t\t\t\t\t\t+ V.get(i, f) * V.get(i, k) * Wi[i];\n\t\t\t\tSV.set(f, k, val);\n\t\t\t\tSV.set(k, f, val);\n\t\t\t}\n\t\t}\n\t}\n\t\n\tpublic double showLoss(int iter, long start, double loss_pre) {\n\t\tlong start1 = System.currentTimeMillis();\n\t\tdouble loss_cur = loss();\n\t\tString symbol = loss_pre >= loss_cur ? 
\"-\" : \"+\";\n\t\tSystem.out.printf(\"Iter=%d [%s]\\t [%s]loss: %.4f [%s]\\n\", iter, \n\t\t\t\tPrinter.printTime(start1 - start), symbol, loss_cur, \n\t\t\t\tPrinter.printTime(System.currentTimeMillis() - start1));\n\t\treturn loss_cur;\n\t}\n\t\n\t// Fast way to calculate the loss function\n\tpublic double loss() {\n\t\tdouble L = reg * (U.squaredSum() + V.squaredSum());\n\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\tdouble l = 0;\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList()) {\n\t\t\t\tdouble pred = predict(u, i);\n\t\t\t\tl += W.getValue(u, i) * Math.pow(trainMatrix.getValue(u, i) - pred, 2);\n\t\t\t\tl -= Wi[i] * Math.pow(pred, 2);\n\t\t\t}\n\t\t\tl += SV.mult(U.row(u, false)).inner(U.row(u, false));\n\t\t\tL += l;\n\t\t}\n\t\t\n\t\treturn L;\n\t}\n\t\n\t@Override\n\tpublic double predict(int u, int i) {\n\t\treturn U.row(u, false).inner(V.row(i, false));\n\t}\n\n\t@Override\n\tpublic void updateModel(int u, int i) {\n\t\ttrainMatrix.setValue(u, i, 1);\n\t\tW.setValue(u, i, w_new);\n\t\tif (Wi[i] == 0) { // an new item\n\t\t\tWi[i] = w0 / itemCount;\n\t\t\t// Update the SV cache\n\t\t\tfor (int f = 0; f < factors; f ++) {\n\t\t\t\tfor (int k = 0; k <= f; k ++) {\n\t\t\t\t\tdouble val = SV.get(f, k) + V.get(i, f) * V.get(i, k) * Wi[i];\n\t\t\t\t\tSV.set(f, k, val);\n\t\t\t\t\tSV.set(k, f, val);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t\n\t\tfor (int iter = 0; iter < maxIterOnline; iter ++) {\n\t\t\tupdate_user(u);\n\t\t\t\n\t\t\tupdate_item(i);\n\t\t}\n\t}\n\t\n/*\t// Raw way to calculate the loss function\n\tpublic double loss() {\n\t\tdouble L = reg * (U.squaredSum() + V.squaredSum());\n\t\tfor (int u = 0; u < userCount; u ++) {\n\t\t\tdouble l = 0;\n\t\t\tfor (int i : trainMatrix.getRowRef(u).indexList()) {\n\t\t\t\tl += Math.pow(trainMatrix.getValue(u, i) - predict(u, i), 2);\n\t\t\t}\n\t\t\tl *= (1 - w0);\n\t\t\tfor (int i = 0; i < itemCount; i ++) {\n\t\t\t\tl += w0 * Math.pow(predict(u, i), 2);\n\t\t\t}\n\t\t\tL += l;\n\t\t}\n\t\treturn L;\n\t} 
*/\n}\n\n"
  },
  {
    "path": "src/algorithms/MFbpr.java",
    "content": "package algorithms;\n\nimport data_structure.Rating;\nimport data_structure.SparseMatrix;\nimport data_structure.DenseVector;\nimport data_structure.DenseMatrix;\nimport data_structure.Pair;\n\nimport java.util.ArrayList;\nimport java.util.Collections;\nimport java.util.Random;\n\nimport utils.Printer;\n\n/**\n * Implement the standard matrix factorization model, optimized by BPR loss.\n * Rendle, Steffen, et al. \"BPR: Bayesian personalized ranking from implicit feedback.\" \n * Proc. of UAI 2009.\n * \n * Adaptive learning rate see the KDD'11 paper \n * Large-Scale Matrix Factorization with Distributed Stochastic Gradient Descent\n * @author xiangnanhe\n *\n */\npublic class MFbpr extends TopKRecommender {\n\t/** Model priors to set. */\n\tint factors = 10; \t// number of latent factors.\n\tint maxIter = 100; \t// maximum iterations.\n\tdouble lr = 0.01; \t\t// Learning rate\n\tboolean adaptive = false; \t// Whether to use adaptive learning rate \n\tdouble reg = 0.01; \t// regularization parameters\n  double init_mean = 0;  // Gaussian mean for init V\n  double init_stdev = 0.1; // Gaussian std-dev for init V\n  // Dynamic Negative Sampling [Zhang et al. SIGIR 2013]: sample X negatives and use the one with maximum predicted value as the true negative.\n  double num_dns = 1;\t// number of dynamic negative samples. 
\n\t\n  /** Model parameters to learn */\n  public DenseMatrix U;\t// latent vectors for users\n  public DenseMatrix V;\t// latent vectors for items\n  \n  boolean showProgress;\n  public String onlineMode = \"u\";\n  \n  Random rand = new Random();\n\tpublic MFbpr(SparseMatrix trainMatrix, ArrayList<Rating> testRatings,\n\t\t\tint topK, int threadNum, int factors, int maxIter, double lr, boolean adaptive, double reg, \n\t\t\tdouble init_mean, double init_stdev, int num_dns, boolean showProgress) {\n\t\tsuper(trainMatrix, testRatings, topK, threadNum);\n\t\tthis.factors = factors;\n\t\tthis.maxIter = maxIter;\n\t\tthis.lr = lr;\n\t\tthis.adaptive = adaptive;\n\t\tthis.reg = reg;\n\t\tthis.init_mean = init_mean;\n\t\tthis.init_stdev = init_stdev;\n\t\tthis.num_dns = num_dns;\n\t\tthis.showProgress = showProgress;\n\t\t\n\t\t// Init model parameters\n\t\tU = new DenseMatrix(userCount, factors);\n\t\tV = new DenseMatrix(itemCount, factors);\n\t\tU.init(init_mean, init_stdev);\n\t\tV.init(init_mean, init_stdev);\n\t}\n\t\n\t//remove\n\tpublic void setUV(DenseMatrix U, DenseMatrix V) {\n\t\tthis.U = U.clone();\n\t\tthis.V = V.clone();\n\t}\n\t\n\tpublic void buildModel() {\t\n\t\tint nonzeros = trainMatrix.itemCount();\n\t\tdouble hr_prev = 0;\n\t\tfor (int iter = 0; iter < maxIter; iter ++) {\n\t\t\tLong start = System.currentTimeMillis();\n\t\t\trand = new Random();\n\t\t\t\n\t\t\t// Each training epoch\n\t\t\tfor (int s = 0; s < nonzeros; s ++) { \n\t\t\t\t// sample a user\n\t\t\t\tint u = rand.nextInt(userCount); \n\t\t\t\tArrayList<Integer> itemList = trainMatrix.getRowRef(u).indexList();\n\t\t\t\tif (itemList.size() == 0)\tcontinue;\n\t\t\t\t// sample a positive item\n\t\t\t\tint i = itemList.get(rand.nextInt(itemList.size())); \n\t\t\t\t\n\t\t\t\t// One SGD step update\n\t\t\t\tupdate_ui(u, i);\n\t\t\t}\n\t\t\n\t\t\t// Show progress per 10 epochs\n\t\t\tif (showProgress && iter%10 == 0)\n\t\t\t\tshowProgress(iter, start, testRatings);\n\t\t\t\n\t\t\t// Adjust the 
learning rate\n\t\t\tif (adaptive) {\n\t\t\t\tif (!showProgress)\tevaluate(testRatings);\n\t\t\t\tdouble hr = ndcgs.mean();\n\t\t\t\tlr = hr > hr_prev ? lr * 1.05 : lr * 0.5;\n\t\t\t\thr_prev = hr;\n\t\t\t}\n\t\t} // end for iter\n\t}\n\t\n\tpublic void runOneIteration() {\n\t\tint nonzeros = trainMatrix.itemCount();\n\t\trand = new Random();\n\t\t// Each training epoch\n\t\tfor (int s = 0; s < nonzeros; s ++) { \n\t\t\t// sample a user\n\t\t\tint u = rand.nextInt(userCount); \n\t\t\tArrayList<Integer> itemList = trainMatrix.getRowRef(u).indexList();\n\t\t\tif (itemList.size() == 0)\tcontinue;\n\t\t\t// sample a positive item\n\t\t\tint i = itemList.get(rand.nextInt(itemList.size())); \n\t\t\t\n\t\t\t// One SGD step update\n\t\t\tupdate_ui(u, i);\n\t\t}\n\t}\n\t\n\t//One SGD step for a positive instance.\n\tprivate void update_ui(int u, int i) {\n\t\t// Dynamic negative sampling\t\t\n\t\t// sample a negative item\n\t\tint s = rand.nextInt(itemCount);\n\t\twhile (trainMatrix.getValue(u, s) != 0) {\n\t\t\ts = rand.nextInt(itemCount);\n\t\t}\n\t\tint j = s;\t// record the negative example with the largest predict value\n\t\tfor (int k = 1; k < this.num_dns; k ++) {\n\t\t\t// sample another negative item\n\t\t\ts = rand.nextInt(itemCount);\n\t\t\twhile (trainMatrix.getValue(u, s) != 0) {\n\t\t\t\ts = rand.nextInt(itemCount);\n\t\t\t}\n\t\t\tif (predict(u, s) > predict(u, j)) {\n\t\t\t\tj = s;\n\t\t\t}\n\t\t}\n\t\t\n\t\t// BPR update rules\n\t\tdouble y_pos = predict(u, i);  // target value of positive instance\n    double y_neg = predict(u, j);  // target value of negative instance\n    double mult = -partial_loss(y_pos - y_neg);\n    \n    for (int f = 0; f < factors; f ++) {\n    \tdouble grad_u = V.get(i, f) - V.get(j, f);\n    \tU.add(u, f, -lr * (mult * grad_u + reg * U.get(u, f)));\n    \t\n    \tdouble grad = U.get(u, f);\n    \tV.add(i, f, -lr * (mult * grad + reg * V.get(i, f)));\n    \tV.add(j, f, -lr * (-mult * grad + reg * V.get(j, f)));\n    
}\n\t}\n\t\n\t@Override\n\tpublic double predict(int u, int i) {\n\t\treturn U.row(u, false).inner(V.row(i, false));\n\t}\n\t\n  // Partial of the ln sigmoid function used by BPR.\n  private double partial_loss(double x) {\n    double exp_x = Math.exp(-x);\n    return exp_x / (1 + exp_x);\n  }\n\n  // Implement the Recsys08 method: Steffen Rendle, Lars Schmidt-Thieme,\n  // \"Online-Updating Regularized Kernel Matrix Factorization Models\"\n\tpublic void updateModel(int u, int item) {\n\t\ttrainMatrix.setValue(u, item, 1);\n\t\trand = new Random();\n\t\t\n\t\t// user retrain\n\t\tArrayList<Integer> itemList = trainMatrix.getRowRef(u).indexList();\n\t\tfor (int iter = 0; iter < maxIterOnline; iter ++) {\n\t\t\tCollections.shuffle(itemList);\n\t\t\t\n\t\t\tfor (int s = 0; s < itemList.size(); s ++) {\n\t\t\t\t// retrain for the user or for the (user, item) pair\n\t\t\t\tint i = onlineMode.equalsIgnoreCase(\"u\") ? itemList.get(s) : item;\n\t\t\t\t// One SGD step update\n\t\t\t\tupdate_ui(u, i);\n\t\t\t}\n\t\t}\n\t\t\n\t}\n}\n"
  },
  {
    "path": "src/algorithms/TopKRecommender.java",
    "content": "package algorithms;\r\n\r\nimport java.util.ArrayList;\r\nimport java.util.Arrays;\r\nimport java.util.HashMap;\r\nimport java.util.HashSet;\r\nimport java.util.List;\r\n\r\nimport utils.CommonUtils;\r\nimport utils.Printer;\r\nimport data_structure.DenseVector;\r\nimport data_structure.Rating;\r\nimport data_structure.SparseMatrix;\r\nimport data_structure.DenseMatrix;\r\nimport utils.TopKPriorityQueue;\r\n\r\nimport java.util.Map;\r\n\r\n/**\r\n * This is an abstract class for topK recommender systems.\r\n * Define some variables to use, and member functions to implement by a topK recommender.\r\n * \r\n * @author HeXiangnan\r\n * @since 2014.12.03\r\n */\r\npublic abstract class TopKRecommender {\r\n\t/** The number of users. */\r\n\tpublic int userCount;\r\n\t/** The number of items. */\r\n\tpublic int itemCount;\r\n\t/** Rating matrix of training set. Users by Items.*/\r\n\tpublic SparseMatrix trainMatrix;\r\n\t/** Test ratings. For showing progress only. */\r\n\tpublic ArrayList<Rating> testRatings;\r\n\t\r\n\t/** Position to cutoff. 
*/\r\n\tpublic int topK = 100;\r\n\t/** Number of threads to run the model (if multi-thread implementation).*/\r\n\tpublic int threadNum = 1;\r\n\t\r\n\t/** Evaluation for each user (offline eval) or test instance (online eval).*/\r\n\tpublic DenseVector hits;\r\n\tpublic DenseVector ndcgs;\r\n\tpublic DenseVector precs;\r\n\tpublic int maxIterOnline = 1;\r\n\t\r\n\tpublic boolean ignoreTrain = false; // ignore train items when generating topK list\r\n\tpublic TopKRecommender() {};\r\n\t\r\n\tpublic TopKRecommender(SparseMatrix trainMatrix, \r\n\t\t\tArrayList<Rating> testRatings, int topK, int threadNum) {\r\n\t\tthis.trainMatrix = new SparseMatrix(trainMatrix);\r\n\t\tthis.testRatings = new ArrayList<Rating>(testRatings);\r\n\t\tthis.topK = topK;\r\n\t\tthis.threadNum = threadNum;\r\n\t\t\r\n\t\tthis.userCount = trainMatrix.length()[0];\r\n\t\tthis.itemCount = trainMatrix.length()[1];\r\n\t}\r\n\t\r\n\t/**\r\n\t * Get the prediction score of user u on item i. To be overridden. \r\n\t */\r\n\tpublic abstract double predict(int u, int i);\r\n\t\r\n\t/**\r\n\t * Build the model.\r\n\t */\r\n\tpublic abstract void buildModel();\r\n\t\r\n\t/**\r\n\t * Update the model with a new observation. \r\n\t */\r\n\tpublic abstract void updateModel(int u, int i);\r\n\t\r\n\t/**\r\n\t * Show progress (evaluation) with current model parameters. 
\r\n\t * @iter\tCurrent iteration\r\n\t * @start\tStarting time of the iteration\r\n\t * @testMatrix\tFor evaluation purpose\r\n\t */\r\n\tpublic void showProgress(int iter, long start, ArrayList<Rating> testRatings) {\r\n\t\tlong end_iter = System.currentTimeMillis();\r\n\t\tif (userCount == testRatings.size())  // leave-1-out eval\r\n\t\t\t evaluate(testRatings);\r\n\t\telse\t// global split\r\n\t\t\t evaluateOnline(testRatings, 100);\r\n\t\tlong end_eval = System.currentTimeMillis();\r\n\t\t\r\n\t\tSystem.out.printf(\"Iter=%d[%s] <loss, hr, ndcg, prec>:\\t %.4f\\t %.4f\\t %.4f\\t %.4f\\t [%s]\\n\",\r\n\t\t\t\titer, Printer.printTime(end_iter - start), loss(),\r\n\t\t\t\thits.mean(), ndcgs.mean(), precs.mean(), Printer.printTime(end_eval - end_iter));\r\n\t}\r\n\t\r\n\t/**\r\n\t * Online evaluation (global split) by simulating the testing stream. \r\n\t * @param ratings Test ratings that are sorted by time (old -> recent).\r\n\t * @param interval Print evaluation result per X iteration. 
\r\n\t */\r\n\tpublic void evaluateOnline(ArrayList<Rating> testRatings, int interval) {\r\n\t\tint testCount = testRatings.size();\r\n\t\thits = new DenseVector(testCount);\r\n\t\tndcgs = new DenseVector(testCount);\r\n\t\tprecs = new DenseVector(testCount);\r\n\t\t\r\n\t\t// break down the results by number of user ratings of the test pair\r\n\t\tint intervals = 10;\r\n\t\tint[] counts = new int[intervals + 1];\r\n\t\tdouble[] hits_r = new double[intervals + 1];\r\n\t\tdouble[] ndcgs_r = new double[intervals + 1];\r\n\t\tdouble[] precs_r = new double[intervals + 1];\r\n\t\t\r\n\t\tLong updateTime = (long) 0;\r\n\t\tfor (int i = 0; i < testCount; i ++) {\r\n\t\t\t// Check performance per interval:\r\n\t\t\tif (i > 0 && interval > 0 && i % interval == 0) {\r\n\t\t\t\tSystem.out.printf(\"%d: <hr, ndcg, prec> =\\t %.4f\\t %.4f\\t %.4f\\n\", \r\n\t\t\t\t\t\ti, hits.sum() / i, ndcgs.sum() / i, precs.sum() / i);\r\n\t\t\t}\r\n\t\t\t// Evaluate model of the current test rating:\r\n\t\t\tRating rating = testRatings.get(i);\r\n\t\t\tdouble[] res = this.evaluate_for_user(rating.userId, rating.itemId);\r\n\t\t\thits.set(i, res[0]);\r\n\t\t\tndcgs.set(i, res[1]);\r\n\t\t\tprecs.set(i, res[2]);\r\n\t\t\t\r\n\t\t\t// statistics for break down\r\n\t\t\tint r = trainMatrix.getRowRef(rating.userId).itemCount();\r\n\t\t\tr =  r> intervals ? 
intervals : r;\r\n\t\t\tcounts[r] += 1;\r\n\t\t\thits_r[r] += res[0];\r\n\t\t\tndcgs_r[r] += res[1];\r\n\t\t\tprecs_r[r] += res[2];\r\n\t\t\t\r\n\t\t\t// Update the model\r\n\t\t\tLong start = System.currentTimeMillis();\r\n\t\t\tupdateModel(rating.userId, rating.itemId);\r\n\t\t\tupdateTime += (System.currentTimeMillis() - start);\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.println(\"Break down the results by number of user ratings for the test pair.\");\r\n\t\tSystem.out.printf(\"#Rating\\t Percentage\\t HR\\t NDCG\\t MAP\\n\");\r\n\t\tfor (int i = 0; i <= intervals; i ++) {\r\n\t\t\tSystem.out.printf(\"%d\\t %.2f%%\\t %.4f\\t %.4f\\t %.4f \\n\", \r\n\t\t\t\t\ti, (double)counts[i] / testCount * 100, \r\n\t\t\t\t\thits_r[i] / counts[i], ndcgs_r[i] / counts[i], precs_r[i] / counts[i]);\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.printf(\"Avg model update time per instance: %.2f ms\\n\", (float)updateTime/testCount);\r\n\t}\r\n\t\r\n\tprotected ArrayList<Integer> threadSplit(int total, int threadNum, int t) {\r\n\t\tArrayList<Integer> res = new ArrayList<Integer>();\r\n\t\tint start = (total / threadNum) * t;\r\n\t\tint end = (t == threadNum-1) ? 
total : \r\n\t\t\t(total / threadNum) * (t + 1);\r\n\t\tfor (int i = start; i < end; i ++)\r\n\t\t\tres.add(i);\r\n\t\treturn res;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Offline evaluation (leave-1-out) for each user.\r\n\t * @param topK position to cutoff\r\n\t * @param testMatrix\r\n\t * @throws InterruptedException \r\n\t */\r\n\tpublic void evaluate(ArrayList<Rating> testRatings) {\r\n\t\tassert userCount == testRatings.size();\r\n\t\tfor (int u = 0; u < userCount; u ++)\r\n\t\t\tassert u == testRatings.get(u).userId;\r\n\t\t\r\n\t\thits = new DenseVector(userCount);\r\n\t\tndcgs = new DenseVector(userCount);\r\n\t\tprecs = new DenseVector(userCount);\r\n\t\t\r\n\t\t// Run the evaluation multi-threads splitted by users\r\n\t\tEvaluationThread[] threads = new EvaluationThread[threadNum];\r\n\t\tfor (int t = 0; t < threadNum; t ++) {\r\n\t\t\tArrayList<Integer> users = threadSplit(userCount, threadNum, t);\r\n\t\t\tthreads[t] = new EvaluationThread(this, testRatings, users);\r\n\t\t\tthreads[t].start();\r\n\t\t}\r\n\t\t\r\n\t\t// Wait until all threads are finished.\r\n\t\tfor (int t = 0; t < threads.length; t++) { \r\n\t\t  try {\r\n\t\t\t\tthreads[t].join();\r\n\t\t\t} catch (InterruptedException e) {\r\n\t\t\t\tSystem.err.println(\"InterruptException was caught: \" + e.getMessage());\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Evaluation for a specific user with given GT item.\r\n\t * @return:\r\n\t * \t result[0]: hit ratio\r\n\t * \t result[1]: ndcg\r\n\t * \t result[2]: precision\r\n\t */\r\n\tprotected double[] evaluate_for_user(int u, int gtItem) {\r\n\t\tdouble[] result = new double[3];\r\n\t\tHashMap<Integer, Double> map_item_score = new HashMap<Integer, Double>();\r\n\t\t// Get the score of the test item first.\r\n\t\tdouble maxScore = predict(u, gtItem);\r\n\t\t\r\n\t\t// Early stopping if there are topK items larger than maxScore.\r\n\t\tint countLarger = 0;\r\n\t\tfor (int i = 0; i < itemCount; i++) {\r\n\t\t\tdouble score = predict(u, 
i);\r\n\t\t\tmap_item_score.put(i, score);\r\n\t\t\t\r\n\t\t\tif (score > maxScore)\tcountLarger ++;\r\n\t\t\tif (countLarger > topK)\treturn result;\t// early stopping\r\n\t\t}\r\n\t\t\r\n\t\t// Selecting topK items (does not exclude train items).\r\n\t\tArrayList<Integer> rankList = ignoreTrain ? \r\n\t\t\t\tCommonUtils.TopKeysByValue(map_item_score, topK, trainMatrix.getRowRef(u).indexList()) : \r\n\t\t\t\tCommonUtils.TopKeysByValue(map_item_score, topK, null);\r\n\t\tresult[0] = getHitRatio(rankList, gtItem);\r\n\t\tresult[1] = getNDCG(rankList, gtItem);\r\n\t\tresult[2] = getPrecision(rankList, gtItem);\r\n\t\t\r\n\t\treturn result;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Compute Hit Ratio.\r\n\t * @param rankList  A list of ranked item IDs\r\n\t * @param gtItem The ground truth item. \r\n\t * @return Hit ratio.\r\n\t */\r\n\tpublic double getHitRatio(List<Integer> rankList, int gtItem) {\r\n\t\tfor (int item : rankList) {\r\n\t\t\tif (item == gtItem)\treturn 1;\r\n\t\t}\r\n\t\treturn 0;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Compute NDCG of a list of ranked items.\r\n\t * See http://recsyswiki.com/wiki/Discounted_Cumulative_Gain\r\n\t * @param rankList  a list of ranked item IDs\r\n\t * @param gtItem The ground truth item. 
\r\n\t * @return  NDCG.\r\n\t */\r\n\tpublic double getNDCG(List<Integer> rankList, int gtItem) {\r\n\t\tfor (int i = 0; i < rankList.size(); i++) {\r\n\t\t\tint item = rankList.get(i);\r\n\t\t\tif (item == gtItem)\r\n\t\t\t\treturn Math.log(2) / Math.log(i+2);\r\n\t\t}\r\n\t\treturn 0;\r\n\t}\r\n\t\r\n\tpublic double getPrecision(List<Integer> rankList, int gtItem) {\r\n\t\tfor (int i = 0; i < rankList.size(); i++) {\r\n\t\t\tint item = rankList.get(i);\r\n\t\t\tif (item == gtItem)\r\n\t\t\t\treturn 1.0 / (i + 1);\r\n\t\t}\r\n\t\treturn 0;\r\n\t}\r\n\t\r\n\t// remove\r\n\tpublic void runOneIteration() {}\r\n\t\r\n\t// remove\r\n\tpublic double loss() {return 0;}\r\n\t\r\n\t// remove\r\n\tpublic void setUV(DenseMatrix U, DenseMatrix V) {};\r\n}\r\n\r\n// Thread for running the offline evaluation.\r\nclass EvaluationThread extends Thread {\r\n\tTopKRecommender model;\r\n\tArrayList<Rating> testRatings;\r\n\tArrayList<Integer> users;\r\n\r\n\tpublic EvaluationThread(TopKRecommender model, ArrayList<Rating> testRatings, \r\n\t\t\tArrayList<Integer> users) {\r\n\t\tthis.model = model;\r\n\t\tthis.testRatings = testRatings;\r\n\t\tthis.users = users;\r\n\t}\r\n\t\r\n\tpublic void run() {\r\n\t\tfor (int u : users) {\r\n\t\t\tdouble[] res = model.evaluate_for_user(u, testRatings.get(u).itemId);\r\n\t\t\tmodel.hits.set(u, res[0]);\r\n\t\t\tmodel.ndcgs.set(u, res[1]);\r\n\t\t\tmodel.precs.set(u, res[2]);\r\n\t\t}\r\n\t}\r\n}"
  },
  {
    "path": "src/data_structure/DataMap.java",
    "content": "package data_structure;\r\nimport java.util.HashMap;\r\nimport java.util.Iterator;\r\nimport java.io.Serializable;\r\n\r\n/**\r\n * This is a class implementing HashMap-based data map.\r\n * This data structure is used for implementing sparse vector and matrix.\r\n * \r\n * @author Joonseok Lee\r\n * @since 2012. 4. 20\r\n * @version 1.1\r\n */\r\npublic class DataMap<Key extends Comparable<Key>, Val> implements Iterable<Key>, Serializable {\r\n\tprivate static final long serialVersionUID = 8001;\r\n\t\r\n\t/** Key-value mapping structure */\r\n\tprivate HashMap<Key, Val> map;\r\n\r\n\t/*========================================\r\n\t * Constructors\r\n\t *========================================*/\r\n\t/** Basic constructor without specifying the capacity. */\r\n\tpublic DataMap() {\r\n\t\tmap = new HashMap<Key, Val>();\r\n\t}\r\n\t\r\n\t/**\r\n\t * A constructor specifying the capacity.\r\n\t * BE CAREFUL TO USE THIS! Never set the capacity too larger than actually needed.\r\n\t * It will waste the memory space, reducing performance of your program.\r\n\t */\r\n\tpublic DataMap(int capacity) {\r\n\t\tmap = new HashMap<Key, Val>(capacity);\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Getter/Setter\r\n\t *========================================*/\r\n\t/**\r\n\t * Get a data value by the given key.\r\n\t * \r\n\t * @param key The key to search.\r\n\t * @return The data value associated with the given key.\r\n\t */\r\n\tpublic Val get(Key key) {\r\n\t\treturn map.get(key);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a data value with the given key.\r\n\t * \r\n\t * @param key The key to set.\r\n\t * @param value The data value associated with the given key.\r\n\t */\r\n\tpublic void put(Key key, Val value) {\r\n\t\tif (value == null) {\r\n\t\t\tmap.remove(key);\r\n\t\t}\r\n\t\telse {\r\n\t\t\tmap.put(key, value);\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Remove a data element with the given key.\r\n\t * \r\n\t * @param key The key to 
remove.\r\n\t * @return The data value deleted with the given key.\r\n\t */\r\n\tpublic Val remove(Key key) {\r\n\t\treturn map.remove(key);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Check whether the map has a specific key inside it.\r\n\t * \r\n\t * @param key The key to search.\r\n\t * @return true if the map has the given key, false otherwise.\r\n\t */\r\n\tpublic boolean contains(Key key) {\r\n\t\treturn map.containsKey(key);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Get an iterator for the map.\r\n\t * \r\n\t * @return The Iterator instance for the map.\r\n\t */\r\n\t@Override\r\n\tpublic Iterator<Key> iterator() {\r\n\t\treturn map.keySet().iterator();\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Properties\r\n\t *========================================*/\r\n\t/**\r\n\t * Count the number of elements in the map.\r\n\t * \r\n\t * @return The number of items in the map.\r\n\t */\r\n\tpublic int itemCount() {\r\n\t\treturn map.size();\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/data_structure/DenseMatrix.java",
    "content": "// Copyright (C) 2014 Guibing Guo\r\n//\r\n// This file is part of LibRec.\r\n//\r\n// LibRec is free software: you can redistribute it and/or modify\r\n// it under the terms of the GNU General Public License as published by\r\n// the Free Software Foundation, either version 3 of the License, or\r\n// (at your option) any later version.\r\n//\r\n// LibRec is distributed in the hope that it will be useful,\r\n// but WITHOUT ANY WARRANTY; without even the implied warranty of\r\n// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r\n// GNU General Public License for more details.\r\n//\r\n// You should have received a copy of the GNU General Public License\r\n// along with LibRec. If not, see <http://www.gnu.org/licenses/>.\r\n//\r\n\r\npackage data_structure;\r\n\r\nimport happy.coding.io.Strings;\r\nimport happy.coding.math.Randoms;\r\n\r\nimport java.io.Serializable;\r\nimport java.util.Arrays;\r\n\r\n/**\r\n * Data Structure: dense matrix <br>\r\n * \r\n * A big reason that we do not adopt original DenseMatrix from M4J libraray is\r\n * because the latter using one-dimensional array to store data, which will\r\n * often cause OutOfMemory exception due to the limit of maximum length of a\r\n * one-dimensional Java array.\r\n * \r\n * @author guoguibing\r\n * \r\n */\r\npublic class DenseMatrix implements Serializable {\r\n\r\n\tprivate static final long serialVersionUID = -2069621030647530185L;\r\n\r\n\t// dimension\r\n\tprotected int numRows, numColumns;\r\n\t// read data\r\n\tprotected double[][] data;\r\n\r\n\t/**\r\n\t * Construct a dense matrix with specified dimensions\r\n\t * \r\n\t * @param numRows\r\n\t *            number of rows\r\n\t * @param numColumns\r\n\t *            number of columns\r\n\t */\r\n\tpublic DenseMatrix(int numRows, int numColumns) {\r\n\t\tthis.numRows = numRows;\r\n\t\tthis.numColumns = numColumns;\r\n\r\n\t\tdata = new double[numRows][numColumns];\r\n\t}\r\n\r\n\t/**\r\n\t * Construct a dense matrix by 
copying data from a given 2D array\r\n\t * \r\n\t * @param array\r\n\t *            data array\r\n\t */\r\n\tpublic DenseMatrix(double[][] array) {\r\n\t\tthis(array.length, array[0].length);\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tdata[i][j] = array[i][j];\r\n\t}\r\n\r\n\t/**\r\n\t * Construct a dense matrix by copying data from a given matrix\r\n\t * \r\n\t * @param mat\r\n\t *            input matrix\r\n\t */\r\n\tpublic DenseMatrix(DenseMatrix mat) {\r\n\t\tthis(mat.data);\r\n\t}\r\n\r\n\t/**\r\n\t * Make a deep copy of current matrix\r\n\t */\r\n\tpublic DenseMatrix clone() {\r\n\t\treturn new DenseMatrix(this);\r\n\t}\r\n\r\n\t/**\r\n\t * Construct an identity matrix\r\n\t * \r\n\t * @param dim\r\n\t *            dimension\r\n\t * @return an identity matrix\r\n\t */\r\n\tpublic static DenseMatrix eye(int dim) {\r\n\t\tDenseMatrix mat = new DenseMatrix(dim, dim);\r\n\t\tfor (int i = 0; i < mat.numRows; i++)\r\n\t\t\tmat.set(i, i, 1.0);\r\n\r\n\t\treturn mat;\r\n\t}\r\n\r\n\t/**\r\n\t * Initialize a dense matrix with small Gaussian values <br/>\r\n\t * \r\n\t * <strong>NOTE:</strong> small initial values make it easier to train a\r\n\t * model; otherwise a very small learning rate may be needed (especially\r\n\t * when the number of factors is large) which can cause bad performance.\r\n\t */\r\n\tpublic void init(double mean, double sigma) {\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tdata[i][j] = Randoms.gaussian(mean, sigma);\r\n\t}\r\n\r\n\t/**\r\n\t * initialize a dense matrix with small random values in (0, range)\r\n\t */\r\n\tpublic void init(double range) {\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tdata[i][j] = Randoms.uniform(0, range);\r\n\t}\r\n\r\n\t/**\r\n\t * initialize a dense matrix with small random values in (0, 1)\r\n\t */\r\n\tpublic void init() 
{\r\n\t\tinit(1.0);\r\n\t}\r\n\r\n\t/**\r\n\t * @return number of rows\r\n\t */\r\n\tpublic int numRows() {\r\n\t\treturn numRows;\r\n\t}\r\n\r\n\t/**\r\n\t * @return number of columns\r\n\t */\r\n\tpublic int numColumns() {\r\n\t\treturn numColumns;\r\n\t}\r\n\r\n\t/**\r\n\t * @param rowId\r\n\t *            row id\r\n\t * @return a copy of row data as a dense vector\r\n\t */\r\n\tpublic DenseVector row(int rowId) {\r\n\t\treturn row(rowId, true);\r\n\t}\r\n\r\n\t/**\r\n\t * \r\n\t * @param rowId\r\n\t *            row id\r\n\t * @param deep\r\n\t *            whether to copy data or only shallow copy for executing\r\n\t *            speedup purpose\r\n\t * @return a vector of a specific row\r\n\t */\r\n\tpublic DenseVector row(int rowId, boolean deep) {\r\n\t\treturn new DenseVector(data[rowId], deep);\r\n\t}\r\n\r\n\t/**\r\n\t * @param column\r\n\t *            column id\r\n\t * @return a copy of column data as a dense vector\r\n\t */\r\n\tpublic DenseVector column(int column) {\r\n\t\tDenseVector vec = new DenseVector(numRows);\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tvec.set(i, data[i][column]);\r\n\r\n\t\treturn vec;\r\n\t}\r\n\r\n\t/**\r\n\t * Compute mean of a column of the current matrix\r\n\t * \r\n\t * @param column\r\n\t *            column id\r\n\t * @return mean of a column of the current matrix\r\n\t */\r\n\tpublic double columnMean(int column) {\r\n\t\tdouble sum = 0.0;\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tsum += data[i][column];\r\n\r\n\t\treturn sum / numRows;\r\n\t}\r\n\r\n\t/**\r\n\t * @return squared sum of all elements of the matrix.\r\n\t */\r\n\tpublic double squaredSum() {\r\n\t\tdouble res = 0;\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tres += data[i][j] * data[i][j];\r\n\r\n\t\treturn res;\r\n\t}\r\n\t\r\n\t/**\r\n\t * @return the matrix norm-2\r\n\t */\r\n\tpublic double norm() {\r\n\t\tdouble res = 0;\r\n\r\n\t\tfor (int i = 0; i < numRows; 
i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tres += data[i][j] * data[i][j];\r\n\r\n\t\treturn Math.sqrt(res);\r\n\t}\r\n\r\n\t/**\r\n\t * row x row of two matrix\r\n\t * \r\n\t * @param m\r\n\t *            the first matrix\r\n\t * @param mrow\r\n\t *            row of the first matrix\r\n\t * @param n\r\n\t *            the second matrix\r\n\t * @param nrow\r\n\t *            row of the second matrix\r\n\t * @return inner product of two row vectors\r\n\t */\r\n\tpublic static double rowMult(DenseMatrix m, int mrow, DenseMatrix n, int nrow) {\r\n\t\tassert m.numColumns == n.numColumns;\r\n\r\n\t\tdouble res = 0;\r\n\t\tfor (int j = 0, k = m.numColumns; j < k; j++)\r\n\t\t\tres += m.get(mrow, j) * n.get(nrow, j);\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * column x column of two matrix\r\n\t * \r\n\t * @param m\r\n\t *            the first matrix\r\n\t * @param mcol\r\n\t *            column of the first matrix\r\n\t * @param n\r\n\t *            the second matrix\r\n\t * @param ncol\r\n\t *            column of the second matrix\r\n\t * @return inner product of two column vectors\r\n\t */\r\n\tpublic static double colMult(DenseMatrix m, int mcol, DenseMatrix n, int ncol) {\r\n\t\tassert m.numRows == n.numRows;\r\n\r\n\t\tdouble res = 0;\r\n\t\tfor (int j = 0, k = m.numRows; j < k; j++)\r\n\t\t\tres += m.get(j, mcol) * n.get(j, ncol);\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * dot product of row x col between two matrices\r\n\t * \r\n\t * @param m\r\n\t *            the first matrix\r\n\t * @param mrow\r\n\t *            row id of the first matrix\r\n\t * @param n\r\n\t *            the second matrix\r\n\t * @param ncol\r\n\t *            column id of the second matrix\r\n\t * @return dot product of row of the first matrix and column of the second\r\n\t *         matrix\r\n\t */\r\n\tpublic static double product(DenseMatrix m, int mrow, DenseMatrix n, int ncol) {\r\n\t\tassert m.numColumns == n.numRows;\r\n\r\n\t\tdouble res = 
0;\r\n\t\tfor (int j = 0; j < m.numColumns; j++)\r\n\t\t\tres += m.get(mrow, j) * n.get(j, ncol);\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * Matrix multiplication with a dense matrix\r\n\t * \r\n\t * @param mat\r\n\t *            a dense matrix\r\n\t * @return a dense matrix with results of matrix multiplication\r\n\t */\r\n\tpublic DenseMatrix mult(DenseMatrix mat) {\r\n\t\tassert this.numColumns == mat.numRows;\r\n\r\n\t\tDenseMatrix res = new DenseMatrix(this.numRows, mat.numColumns);\r\n\t\tfor (int i = 0; i < res.numRows; i++) {\r\n\t\t\tfor (int j = 0; j < res.numColumns; j++) {\r\n\r\n\t\t\t\tdouble product = 0;\r\n\t\t\t\tfor (int k = 0; k < this.numColumns; k++)\r\n\t\t\t\t\tproduct += data[i][k] * mat.data[k][j];\r\n\r\n\t\t\t\tres.set(i, j, product);\r\n\t\t\t}\r\n\t\t}\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * Do {@code matrix x vector} between current matrix and a given vector\r\n\t * \r\n\t * @return a dense vector with the results of {@code matrix x vector}\r\n\t */\r\n\tpublic DenseVector mult(DenseVector vec) {\r\n\t\tassert this.numColumns == vec.size;\r\n\r\n\t\tDenseVector res = new DenseVector(this.numRows);\r\n\t\tfor (int i = 0; i < this.numRows; i++)\r\n\t\t\tres.set(i, row(i, false).inner(vec));\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * Get the value at entry [row, column]\r\n\t */\r\n\tpublic double get(int row, int column) {\r\n\t\treturn data[row][column];\r\n\t}\r\n\r\n\t/**\r\n\t * Set a value to entry [row, column]\r\n\t */\r\n\tpublic void set(int row, int column, double val) {\r\n\t\tdata[row][column] = val;\r\n\t}\r\n\r\n\t/**\r\n\t * Add a value to entry [row, column]\r\n\t */\r\n\tpublic void add(int row, int column, double val) {\r\n\t\tdata[row][column] += val;\r\n\t}\r\n\r\n\t/**\r\n\t * @return a new matrix by scaling the current matrix\r\n\t */\r\n\tpublic DenseMatrix scale(double val) {\r\n\t\tDenseMatrix mat = new DenseMatrix(numRows, numColumns);\r\n\t\tfor (int i = 0; i < numRows; 
i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tmat.data[i][j] = this.data[i][j] * val;\r\n\r\n\t\treturn mat;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scaling on the current matrix\r\n\t */\r\n\tpublic void selfScale(double val) {\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tthis.data[i][j] = this.data[i][j] * val;\r\n\t}\r\n\r\n\t/**\r\n\t * Do {@code A + B} matrix operation\r\n\t * \r\n\t * @return a matrix with results of {@code C = A + B}\r\n\t */\r\n\tpublic DenseMatrix add(DenseMatrix mat) {\r\n\t\tassert numRows == mat.numRows;\r\n\t\tassert numColumns == mat.numColumns;\r\n\r\n\t\tDenseMatrix res = new DenseMatrix(numRows, numColumns);\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tres.data[i][j] = data[i][j] + mat.data[i][j];\r\n\r\n\t\treturn res;\r\n\t}\r\n\t\r\n\tpublic void selfAdd(DenseMatrix mat) {\r\n\t\tassert numRows == mat.numRows;\r\n\t\tassert numColumns == mat.numColumns;\r\n\t\t\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tthis.data[i][j] += mat.data[i][j];\r\n\t}\r\n\t\r\n\r\n\t/**\r\n\t * Do {@code A + c} matrix operation, where {@code c} is a constant. 
Each\r\n\t * entry will be increased by {@code c}\r\n\t * \r\n\t * @return a new matrix with results of {@code C = A + c}\r\n\t */\r\n\tpublic DenseMatrix add(double val) {\r\n\r\n\t\tDenseMatrix res = new DenseMatrix(numRows, numColumns);\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tres.data[i][j] = data[i][j] + val;\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * Do {@code A - B} matrix operation\r\n\t * \r\n\t * @return a matrix with results of {@code C = A - B}\r\n\t */\r\n\tpublic DenseMatrix minus(DenseMatrix mat) {\r\n\t\tassert numRows == mat.numRows;\r\n\t\tassert numColumns == mat.numColumns;\r\n\r\n\t\tDenseMatrix res = new DenseMatrix(numRows, numColumns);\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tres.data[i][j] = data[i][j] - mat.data[i][j];\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * Do {@code A - c} matrix operation, where {@code c} is a constant. Each\r\n\t * entry will be decreased by {@code c}\r\n\t * \r\n\t * @return a new matrix with results of {@code C = A - c}\r\n\t */\r\n\tpublic DenseMatrix minus(double val) {\r\n\r\n\t\tDenseMatrix res = new DenseMatrix(numRows, numColumns);\r\n\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\t\tres.data[i][j] = data[i][j] - val;\r\n\r\n\t\treturn res;\r\n\t}\r\n\r\n\t/**\r\n\t * @return the Cholesky decomposition of the current matrix\r\n\t */\r\n\tpublic DenseMatrix cholesky() {\r\n\t\tif (this.numRows != this.numColumns)\r\n\t\t\tthrow new RuntimeException(\"Matrix is not square\");\r\n\r\n\t\tint n = numRows;\r\n\t\tDenseMatrix L = new DenseMatrix(n, n);\r\n\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\tfor (int j = 0; j <= i; j++) {\r\n\t\t\t\tdouble sum = 0.0;\r\n\t\t\t\tfor (int k = 0; k < j; k++)\r\n\t\t\t\t\tsum += L.get(i, k) * L.get(j, k);\r\n\r\n\t\t\t\tdouble val = i == j ? 
Math.sqrt(data[i][i] - sum) : (data[i][j] - sum) / L.get(j, j);\r\n\t\t\t\tL.set(i, j, val);\r\n\t\t\t}\r\n\t\t\tif (Double.isNaN(L.get(i, i)))\r\n\t\t\t\treturn null;\r\n\t\t}\r\n\r\n\t\treturn L.transpose();\r\n\t}\r\n\r\n\t/**\r\n\t * @return a transposed matrix of current matrix\r\n\t */\r\n\tpublic DenseMatrix transpose() {\r\n\t\tDenseMatrix mat = new DenseMatrix(numColumns, numRows);\r\n\r\n\t\tfor (int i = 0; i < mat.numRows; i++)\r\n\t\t\tfor (int j = 0; j < mat.numColumns; j++)\r\n\t\t\t\tmat.set(i, j, this.data[j][i]);\r\n\r\n\t\treturn mat;\r\n\t}\r\n\r\n\t/**\r\n\t * @return a covariance matrix of the current matrix\r\n\t */\r\n\tpublic DenseMatrix cov() {\r\n\t\tDenseMatrix mat = new DenseMatrix(numColumns, numColumns);\r\n\r\n\t\tfor (int i = 0; i < numColumns; i++) {\r\n\t\t\tDenseVector xi = this.column(i);\r\n\t\t\txi = xi.minus(xi.mean());\r\n\r\n\t\t\tmat.set(i, i, xi.inner(xi) / (xi.size - 1));\r\n\r\n\t\t\tfor (int j = i + 1; j < numColumns; j++) {\r\n\t\t\t\tDenseVector yi = this.column(j);\r\n\t\t\t\tdouble val = xi.inner(yi.minus(yi.mean())) / (xi.size - 1);\r\n\r\n\t\t\t\tmat.set(i, j, val);\r\n\t\t\t\tmat.set(j, i, val);\r\n\t\t\t}\r\n\t\t}\r\n\r\n\t\treturn mat;\r\n\t}\r\n\r\n\t/**\r\n\t * Compute the inverse of a matrix by LU decomposition\r\n\t * \r\n\t * @return the inverse matrix of current matrix\r\n\t * @deprecated use {@code inv} instead which is slightly faster\r\n\t */\r\n\tpublic DenseMatrix inverse() {\r\n\t\tif (numRows != numColumns)\r\n\t\t\tthrow new RuntimeException(\"Only square matrix can do inversion\");\r\n\r\n\t\tint n = numRows;\r\n\t\tDenseMatrix mat = new DenseMatrix(this);\r\n\r\n\t\tif (n == 1) {\r\n\t\t\tmat.set(0, 0, 1.0 / mat.get(0, 0));\r\n\t\t\treturn mat;\r\n\t\t}\r\n\r\n\t\tint row[] = new int[n];\r\n\t\tint col[] = new int[n];\r\n\t\tdouble temp[] = new double[n];\r\n\t\tint hold, I_pivot, J_pivot;\r\n\t\tdouble pivot, abs_pivot;\r\n\r\n\t\t// set up row and column interchange vectors\r\n\t\tfor (int k = 
0; k < n; k++) {\r\n\t\t\trow[k] = k;\r\n\t\t\tcol[k] = k;\r\n\t\t}\r\n\t\t// begin main reduction loop\r\n\t\tfor (int k = 0; k < n; k++) {\r\n\t\t\t// find largest element for pivot\r\n\t\t\tpivot = mat.get(row[k], col[k]);\r\n\t\t\tI_pivot = k;\r\n\t\t\tJ_pivot = k;\r\n\t\t\tfor (int i = k; i < n; i++) {\r\n\t\t\t\tfor (int j = k; j < n; j++) {\r\n\t\t\t\t\tabs_pivot = Math.abs(pivot);\r\n\t\t\t\t\tif (Math.abs(mat.get(row[i], col[j])) > abs_pivot) {\r\n\t\t\t\t\t\tI_pivot = i;\r\n\t\t\t\t\t\tJ_pivot = j;\r\n\t\t\t\t\t\tpivot = mat.get(row[i], col[j]);\r\n\t\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (Math.abs(pivot) < 1.0E-10)\r\n\t\t\t\tthrow new RuntimeException(\"Matrix is singular !\");\r\n\r\n\t\t\thold = row[k];\r\n\t\t\trow[k] = row[I_pivot];\r\n\t\t\trow[I_pivot] = hold;\r\n\t\t\thold = col[k];\r\n\t\t\tcol[k] = col[J_pivot];\r\n\t\t\tcol[J_pivot] = hold;\r\n\r\n\t\t\t// reduce about pivot\r\n\t\t\tmat.set(row[k], col[k], 1.0 / pivot);\r\n\t\t\tfor (int j = 0; j < n; j++) {\r\n\t\t\t\tif (j != k) {\r\n\t\t\t\t\tmat.set(row[k], col[j], mat.get(row[k], col[j]) * mat.get(row[k], col[k]));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\t// inner reduction loop\r\n\t\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\t\tif (k != i) {\r\n\t\t\t\t\tfor (int j = 0; j < n; j++) {\r\n\t\t\t\t\t\tif (k != j) {\r\n\r\n\t\t\t\t\t\t\tdouble val = mat.get(row[i], col[j]) - mat.get(row[i], col[k]) * mat.get(row[k], col[j]);\r\n\t\t\t\t\t\t\tmat.set(row[i], col[j], val);\r\n\t\t\t\t\t\t}\r\n\t\t\t\t\t}\r\n\t\t\t\t\tmat.set(row[i], col[k], -mat.get(row[i], col[k]) * mat.get(row[k], col[k]));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t// end main reduction loop\r\n\r\n\t\t// unscramble rows\r\n\t\tfor (int j = 0; j < n; j++) {\r\n\t\t\tfor (int i = 0; i < n; i++)\r\n\t\t\t\ttemp[col[i]] = mat.get(row[i], j);\r\n\r\n\t\t\tfor (int i = 0; i < n; i++)\r\n\t\t\t\tmat.set(i, j, temp[i]);\r\n\r\n\t\t}\r\n\r\n\t\t// unscramble columns\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\tfor (int j = 0; j < n; 
j++)\r\n\t\t\t\ttemp[row[j]] = mat.get(i, col[j]);\r\n\r\n\t\t\tfor (int j = 0; j < n; j++)\r\n\t\t\t\tmat.set(i, j, temp[j]);\r\n\t\t}\r\n\r\n\t\treturn mat;\r\n\t}\r\n\r\n\t/**\r\n\t * NOTE: this implementation (adopted from PREA package) is slightly faster\r\n\t * than {@code inverse}, especailly when {@code numRows} is large.\r\n\t * \r\n\t * @return the inverse matrix of current matrix\r\n\t */\r\n\tpublic DenseMatrix inv() {\r\n\t\tif (this.numRows != this.numColumns)\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\r\n\t\tint n = this.numRows;\r\n\t\tDenseMatrix mat = DenseMatrix.eye(n);\r\n\r\n\t\tif (n == 1) {\r\n\t\t\tmat.set(0, 0, 1 / this.get(0, 0));\r\n\t\t\treturn mat;\r\n\t\t}\r\n\r\n\t\tDenseMatrix b = new DenseMatrix(this);\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\t// find pivot:\r\n\t\t\tdouble mag = 0;\r\n\t\t\tint pivot = -1;\r\n\r\n\t\t\tfor (int j = i; j < n; j++) {\r\n\t\t\t\tdouble mag2 = Math.abs(b.get(j, i));\r\n\t\t\t\tif (mag2 > mag) {\r\n\t\t\t\t\tmag = mag2;\r\n\t\t\t\t\tpivot = j;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\r\n\t\t\t// no pivot (error):\r\n\t\t\tif (pivot == -1 || mag == 0)\r\n\t\t\t\treturn mat;\r\n\r\n\t\t\t// move pivot row into position:\r\n\t\t\tif (pivot != i) {\r\n\t\t\t\tdouble temp;\r\n\t\t\t\tfor (int j = i; j < n; j++) {\r\n\t\t\t\t\ttemp = b.get(i, j);\r\n\t\t\t\t\tb.set(i, j, b.get(pivot, j));\r\n\t\t\t\t\tb.set(pivot, j, temp);\r\n\t\t\t\t}\r\n\r\n\t\t\t\tfor (int j = 0; j < n; j++) {\r\n\t\t\t\t\ttemp = mat.get(i, j);\r\n\t\t\t\t\tmat.set(i, j, mat.get(pivot, j));\r\n\t\t\t\t\tmat.set(pivot, j, temp);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\r\n\t\t\t// normalize pivot row:\r\n\t\t\tmag = b.get(i, i);\r\n\t\t\tfor (int j = i; j < n; j++)\r\n\t\t\t\tb.set(i, j, b.get(i, j) / mag);\r\n\r\n\t\t\tfor (int j = 0; j < n; j++)\r\n\t\t\t\tmat.set(i, j, mat.get(i, j) / mag);\r\n\r\n\t\t\t// eliminate pivot row component from other rows:\r\n\t\t\tfor (int k = 0; k < n; k++) {\r\n\t\t\t\tif (k == 
i)\r\n\t\t\t\t\tcontinue;\r\n\r\n\t\t\t\tdouble mag2 = b.get(k, i);\r\n\r\n\t\t\t\tfor (int j = i; j < n; j++)\r\n\t\t\t\t\tb.set(k, j, b.get(k, j) - mag2 * b.get(i, j));\r\n\r\n\t\t\t\tfor (int j = 0; j < n; j++)\r\n\t\t\t\t\tmat.set(k, j, mat.get(k, j) - mag2 * mat.get(i, j));\r\n\t\t\t}\r\n\t\t}\r\n\r\n\t\treturn mat;\r\n\t}\r\n\r\n\t/**\r\n\t * set one value to a specific row\r\n\t * \r\n\t * @param row\r\n\t *            row id\r\n\t * @param val\r\n\t *            value to be set\r\n\t */\r\n\tpublic void setRow(int row, double val) {\r\n\t\tArrays.fill(data[row], val);\r\n\t}\r\n\r\n\t/**\r\n\t * set values of one dense vector to a specific row\r\n\t * \r\n\t * @param row\r\n\t *            row id\r\n\t * @param vals\r\n\t *            values of a dense vector\r\n\t */\r\n\tpublic void setRow(int row, DenseVector vals) {\r\n\t\tfor (int j = 0; j < numColumns; j++)\r\n\t\t\tdata[row][j] = vals.data[j];\r\n\t}\r\n\r\n\t/**\r\n\t * clear and reset all entries to 0\r\n\t */\r\n\tpublic void clear() {\r\n\t\tfor (int i = 0; i < numRows; i++)\r\n\t\t\tsetRow(i, 0.0);\r\n\t}\r\n\r\n\t@Override\r\n\tpublic String toString() {\r\n\t\treturn Strings.toString(data);\r\n\t}\r\n\r\n}\r\n"
  },
  {
    "path": "src/data_structure/DenseVector.java",
    "content": "// Copyright (C) 2014 Guibing Guo\n//\n// This file is part of LibRec.\n//\n// LibRec is free software: you can redistribute it and/or modify\n// it under the terms of the GNU General Public License as published by\n// the Free Software Foundation, either version 3 of the License, or\n// (at your option) any later version.\n//\n// LibRec is distributed in the hope that it will be useful,\n// but WITHOUT ANY WARRANTY; without even the implied warranty of\n// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n// GNU General Public License for more details.\n//\n// You should have received a copy of the GNU General Public License\n// along with LibRec. If not, see <http://www.gnu.org/licenses/>.\n//\n\npackage data_structure;\n\nimport happy.coding.io.Strings;\nimport happy.coding.math.Randoms;\nimport happy.coding.math.Stats;\n\nimport java.io.Serializable;\n\n/**\n * Data Structure: dense vector\n * \n * @author guoguibing\n * \n */\npublic class DenseVector implements Serializable {\n\n\tprivate static final long serialVersionUID = -2930574547913792430L;\n\n\tprotected int size;\n\tprotected double[] data;\n\n\t/**\n\t * Construct a dense vector with a specific size\n\t * \n\t * @param size\n\t *            the size of vector\n\t */\n\tpublic DenseVector(int size) {\n\t\tthis.size = size;\n\t\tdata = new double[size];\n\t}\n\n\t/**\n\t * Construct a dense vector by deeply copying data from a given array\n\t */\n\tpublic DenseVector(double[] array) {\n\t\tthis(array, true);\n\t}\n\n\t/**\n\t * Construct a dense vector by copying data from a given array\n\t * \n\t * @param array\n\t *            a given data array\n\t * @param deep\n\t *            whether to deep copy array data\n\t */\n\tpublic DenseVector(double[] array, boolean deep) {\n\t\tthis.size = array.length;\n\t\tif (deep) {\n\t\t\tdata = new double[array.length];\n\t\t\tfor (int i = 0; i < size; i++)\n\t\t\t\tdata[i] = array[i];\n\t\t} else {\n\t\t\tdata = 
array;\n\t\t}\n\t}\n\n\t/**\n\t * Construct a dense vector by deeply copying data from a given vector\n\t */\n\tpublic DenseVector(DenseVector vec) {\n\t\tthis(vec.data);\n\t}\n\n\t/**\n\t * Make a deep copy of current vector\n\t */\n\tpublic DenseVector clone() {\n\t\treturn new DenseVector(this);\n\t}\n\t\n\tpublic int size() {\n\t\treturn this.size;\n\t}\n\t\n\t/**\n\t * Initialize a dense vector with Gaussian values\n\t */\n\tpublic void init(double mean, double sigma) {\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tdata[i] = Randoms.gaussian(mean, sigma);\n\t}\n\n\t/**\n\t * Initialize a dense vector with uniform values in (0, 1)\n\t */\n\tpublic void init() {\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tdata[i] = Randoms.uniform();\n\t}\n\n\t/**\n\t * Initialize a dense vector with uniform values in (0, range)\n\t */\n\tpublic void init(double range) {\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tdata[i] = Randoms.uniform(0, range);\n\t}\n\n\t/**\n\t * Get a value at entry [index]\n\t */\n\tpublic double get(int idx) {\n\t\treturn data[idx];\n\t}\n\n\t/**\n\t * @return vector's data\n\t */\n\tpublic double[] getData() {\n\t\treturn data;\n\t}\n\n\t/**\n\t * @return mean of current vector\n\t */\n\tpublic double mean() {\n\t\treturn Stats.mean(data);\n\t}\n\t\n\t/**\n\t * @return summation of entries\n\t */\n\tpublic double sum(){\n\t\treturn Stats.sum(data);\n\t}\n\t\n\t/**\n\t * @return squared summation of entries\n\t */\n\tpublic double squaredSum(){\n\t\tdouble sum = 0;\n\t\tfor (int i = 0; i < data.length; i ++) {\n\t\t\tsum += data[i] * data[i];\n\t\t}\n\t\treturn sum;\n\t}\n\t\n\t/**\n\t * Set a value to entry [index]\n\t */\n\tpublic void set(int idx, double val) {\n\t\tdata[idx] = val;\n\t}\n\n\t/**\n\t * Set a value to all entries\n\t */\n\tpublic void setAll(double val) {\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tdata[i] = val;\n\t}\n\n\t/**\n\t * Add a value to entry [index]\n\t */\n\tpublic void add(int idx, double val) {\n\t\tdata[idx] += 
val;\n\t}\n\n\t/**\n\t * Subtract a value from entry [index]\n\t */\n\tpublic void minus(int idx, double val) {\n\t\tdata[idx] -= val;\n\t}\n\n\t/**\n\t * @return a dense vector by adding a value to all entries of current vector\n\t */\n\tpublic DenseVector add(double val) {\n\t\tDenseVector result = new DenseVector(size);\n\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tresult.data[i] = this.data[i] + val;\n\n\t\treturn result;\n\t}\n\n\t/**\n\t * @return a dense vector by subtracting a value from all entries of current vector\n\t */\n\tpublic DenseVector minus(double val) {\n\n\t\tDenseVector result = new DenseVector(size);\n\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tresult.data[i] = this.data[i] - val;\n\n\t\treturn result;\n\t}\n\n\t/**\n\t * @return a dense vector by scaling a value to all entries of current vector\n\t */\n\tpublic DenseVector scale(double val) {\n\n\t\tDenseVector result = new DenseVector(size);\n\t\tfor (int i = 0; i < size; i++)\n\t\t\tresult.data[i] = this.data[i] * val;\n\n\t\treturn result;\n\t}\n\t\n\tpublic void selfScale(double val) {\n\t\tfor (int i = 0; i < size; i ++)\n\t\t\tthis.data[i] = this.data[i] * val;\n\t}\n\n\t/**\n\t * Do vector operation: {@code a + b}\n\t * \n\t * @return a dense vector with results of {@code c = a + b}\n\t */\n\tpublic DenseVector add(DenseVector vec) {\n\t\tassert size == vec.size;\n\n\t\tDenseVector result = new DenseVector(size);\n\t\tfor (int i = 0; i < result.size; i++)\n\t\t\tresult.data[i] = this.data[i] + vec.data[i];\n\n\t\treturn result;\n\t}\n\t\n\t/**\n\t * Vector add operation to itself.\n\t */\n\tpublic void selfAdd(DenseVector vec) {\n\t\tassert size == vec.size;\n\t\t\n\t\tfor (int i = 0; i < size; i ++) \n\t\t\tthis.data[i] = this.data[i] + vec.data[i];\n\t}\n\n\t/**\n\t * Do vector operation: {@code a - b}\n\t * \n\t * @return a dense vector with results of {@code c = a - b}\n\t */\n\tpublic DenseVector minus(DenseVector vec) {\n\t\tassert size == vec.size;\n\n\t\tDenseVector result = 
new DenseVector(size);\n\t\tfor (int i = 0; i < vec.size; i++)\n\t\t\tresult.data[i] = this.data[i] - vec.data[i];\n\n\t\treturn result;\n\t}\n\n\t/**\n\t * Do vector operation: {@code a^t * b}\n\t * \n\t * @return the inner product of two vectors\n\t */\n\tpublic double inner(DenseVector vec) {\n\t\tassert size == vec.size;\n\n\t\tdouble result = 0;\n\t\tfor (int i = 0; i < vec.size; i++)\n\t\t\tresult += get(i) * vec.get(i);\n\n\t\treturn result;\n\t}\n\n\t/**\n\t * Do vector operation: {@code a * b^t}\n\t * \n\t * @return the outer product of two vectors\n\t */\n\tpublic DenseMatrix outer(DenseVector vec) {\n\t\tDenseMatrix mat = new DenseMatrix(this.size, vec.size);\n\n\t\tfor (int i = 0; i < mat.numRows; i++)\n\t\t\tfor (int j = 0; j < mat.numColumns; j++)\n\t\t\t\tmat.set(i, j, get(i) * vec.get(j));\n\n\t\treturn mat;\n\t}\n\n\t@Override\n\tpublic String toString() {\n\t\treturn Strings.toString(data);\n\t}\n\n}\n"
  },
  {
    "path": "src/data_structure/Pair.java",
    "content": "package data_structure;\n\nimport java.util.Objects;\n\npublic class Pair<F, S> {\n  public final F first;\n  public final S second;\n\n  public Pair(F first, S second) {\n      this.first = first;\n      this.second = second;\n  }\n\n  @Override\n  public boolean equals(Object o) {\n      if (!(o instanceof Pair)) {\n          return false;\n      }\n      Pair<?, ?> p = (Pair<?, ?>) o;\n      return Objects.equals(p.first, first) && Objects.equals(p.second, second);\n  }\n\n  @Override\n  public int hashCode() {\n      return (first == null ? 0 : first.hashCode()) ^ \n      \t\t(second == null ? 0 : second.hashCode());\n  }\n\n  public static <A, B> Pair <A, B> create(A a, B b) {\n      return new Pair<A, B>(a, b);\n  }\n}"
  },
  {
    "path": "src/data_structure/Rating.java",
    "content": "package data_structure;\r\n\r\npublic class Rating {\r\n\tpublic int userId; // user id, starts from 0\r\n\tpublic int itemId; // item id, starts from 0\r\n\tpublic float score;\r\n\tpublic long timestamp;\r\n\t\r\n\tpublic Rating(int userId, int itemId, float score, long timestamp) {\r\n\t\tthis.userId = userId;\r\n\t\tthis.itemId = itemId;\r\n\t\tthis.score = score;\r\n\t\tthis.timestamp = timestamp;\r\n\t}\r\n\t\r\n\tpublic Rating(String line) {\r\n\t\tString[] arr = line.split(\"\\t\");\r\n\t\tuserId = Integer.parseInt(arr[0]);\r\n\t\titemId = Integer.parseInt(arr[1]);\r\n\t\tscore = Float.parseFloat(arr[2]);\r\n\t\tif (arr.length > 3)\ttimestamp = Long.parseLong(arr[3]);\r\n\t}\r\n\t\r\n\tpublic String toString() {\r\n\t\treturn \"<\" + userId + \",\" + itemId + \",\" + score + \",\" + timestamp + \">\";\r\n\t}\r\n}"
  },
  {
    "path": "src/data_structure/SparseMatrix.java",
    "content": "package data_structure;\r\n\r\nimport java.io.Serializable;\r\nimport java.util.ArrayList;\r\nimport data_structure.Pair;\r\n\r\n/**\r\n * This class implements sparse matrix, containing empty values for most space.\r\n * \r\n * @author Joonseok Lee\r\n * @since 2012. 4. 20\r\n * @version 1.1\r\n */\r\npublic class SparseMatrix implements Serializable{\r\n\tprivate static final long serialVersionUID = 8003;\r\n\t\r\n\t/** The number of rows. */\r\n\tprivate int M;\r\n\t/** The number of columns. */\r\n\tprivate int N;\r\n\t/** The array of row references. */\r\n\tprivate SparseVector[] rows;\r\n\t/** The array of column references. */\r\n\tprivate SparseVector[] cols;\r\n\r\n\t/*========================================\r\n\t * Constructors\r\n\t *========================================*/\r\n\t/**\r\n\t * Construct an empty sparse matrix, with a given size.\r\n\t * \r\n\t * @param m The number of rows.\r\n\t * @param n The number of columns.\r\n\t */\r\n\tpublic SparseMatrix(int m, int n) {\r\n\t\tthis.M = m;\r\n\t\tthis.N = n;\r\n\t\trows = new SparseVector[M];\r\n\t\tcols = new SparseVector[N];\r\n\t\t\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\trows[i] = new SparseVector(N);\r\n\t\t}\r\n\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\tcols[j] = new SparseVector(M);\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Construct an empty sparse matrix, with data copied from another sparse matrix.\r\n\t * \r\n\t * @param sm The matrix having data being copied.\r\n\t */\r\n\tpublic SparseMatrix(SparseMatrix sm) {\r\n\t\tthis.M = sm.M;\r\n\t\tthis.N = sm.N;\r\n\t\trows = new SparseVector[M];\r\n\t\tcols = new SparseVector[N];\r\n\t\t\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\trows[i] = sm.getRow(i);\r\n\t\t}\r\n\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\tcols[j] = sm.getCol(j);\r\n\t\t}\r\n\t}\r\n\r\n\t/*========================================\r\n\t * Getter/Setter\r\n\t *========================================*/\r\n\t/**\r\n\t * Retrieve a stored value from the 
given index.\r\n\t * \r\n\t * @param i The row index to retrieve.\r\n\t * @param j The column index to retrieve.\r\n\t * @return The value stored at the given index.\r\n\t */\r\n\tpublic double getValue(int i, int j) {\r\n\t\treturn rows[i].getValue(j);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a new value at the given index.\r\n\t * \r\n\t * @param i The row index to store new value.\r\n\t * @param j The column index to store new value.\r\n\t * @param value The value to store.\r\n\t */\r\n\tpublic void setValue(int i, int j, double value) {\r\n\t\tif (value == 0.0) {\r\n\t\t\trows[i].remove(j);\r\n\t\t\tcols[j].remove(i);\r\n\t\t}\r\n\t\telse {\r\n\t\t\trows[i].setValue(j, value);\r\n\t\t\tcols[j].setValue(i, value);\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a new row vector at the given row index.\r\n\t * @param i The row index to store new vector\r\n\t * @param newVector \r\n\t */\r\n\tpublic void setRowVector(int i, SparseVector newVector) {\r\n\t\tif (newVector.length() != this.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\tif (i < 0 || i >= this.M)\r\n\t\t\tthrow new RuntimeException(\"Wrong input row index.\");\r\n\t\t// Clear the values of the current rowVector.\r\n\t\tif (rows[i].indexList() != null) {\r\n\t\t\tfor (int j : rows[i].indexList()) {\r\n\t\t\t\tthis.setValue(i, j, 0);\r\n\t\t\t}\r\n\t\t}\r\n\t\t// Set the new vector.\r\n\t\tif (newVector.indexList() != null) {\r\n\t\t\tfor (int j : newVector.indexList()) {\r\n\t\t\t\tthis.setValue(i, j, newVector.getValue(j));\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a new row vector with non-negative constraint at the given row index.\r\n\t * If the value is negative, set it as 0.\r\n\t * \r\n\t * @param i The row index to store new vector\r\n\t * @param newVector \r\n\t */\r\n\tpublic void setRowVectorNonnegative(int i, SparseVector newVector) {\r\n\t\tif (newVector.length() != this.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\tif (i < 0 || i 
>= this.M)\r\n\t\t\tthrow new RuntimeException(\"Wrong input row index.\");\r\n\t\t// Clear the values of the current rowVector.\r\n\t\tif (rows[i].indexList() != null) {\r\n\t\t\tfor (int j : rows[i].indexList()) {\r\n\t\t\t\tthis.setValue(i, j, 0);\r\n\t\t\t}\r\n\t\t}\r\n\t\t// Set the new vector with nonnegative constraint.\r\n\t\tif (newVector.indexList() != null) {\r\n\t\t\tfor (int j : newVector.indexList()) {\r\n\t\t\t\tdouble value = newVector.getValue(j);\r\n\t\t\t\tthis.setValue(i, j, value > 0 ? value : 0);\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a new col vector at the given col index.\r\n\t */\r\n\tpublic void setColVector(int j, SparseVector newVector) {\r\n\t\tif (newVector.length() != this.M)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\tif (j < 0 || j >= this.N)\r\n\t\t\tthrow new RuntimeException(\"Wrong input column index.\");\r\n\t\t// Clear the values of the current colVector\r\n\t\tif (cols[j].indexList() != null) {\r\n\t\t\tfor (int i : cols[j].indexList()) {\r\n\t\t\t\tthis.setValue(i, j, 0);\r\n\t\t\t}\r\n\t\t}\r\n\t\t// Set the new vector.\r\n\t\tif (newVector.indexList() != null) {\r\n\t\t\tfor (int i : newVector.indexList()) {\r\n\t\t\t\tthis.setValue(i, j, newVector.getValue(i));\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a new size of the matrix.\r\n\t * \r\n\t * @param m The new row count.\r\n\t * @param n The new column count.\r\n\t */\r\n\tpublic void setSize(int m, int n) {\r\n\t\tthis.M = m;\r\n\t\tthis.N = n;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return a reference of a given row.\r\n\t * Make sure to use this method only for read-only purpose.\r\n\t * \r\n\t * @param index The row index to retrieve.\r\n\t * @return A reference to the designated row.\r\n\t */\r\n\tpublic SparseVector getRowRef(int index) {\r\n\t\treturn rows[index];\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return a copy of a given row.\r\n\t * Use this if you do not want to affect to original data.\r\n\t * \r\n\t * @param index 
The row index to retrieve.\r\n\t * @return A copy of the designated row.\r\n\t */\r\n\tpublic SparseVector getRow(int index) {\r\n\t\tSparseVector newVector = this.rows[index].copy();\r\n\t\t\r\n\t\treturn newVector;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return a reference of a given column.\r\n\t * Make sure to use this method only for read-only purpose.\r\n\t * \r\n\t * @param index The column index to retrieve.\r\n\t * @return A reference to the designated column.\r\n\t */\r\n\tpublic SparseVector getColRef(int index) {\r\n\t\treturn cols[index];\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return a copy of a given column.\r\n\t * Use this if you do not want to affect the original data.\r\n\t * \r\n\t * @param index The column index to retrieve.\r\n\t * @return A copy of the designated column.\r\n\t */\r\n\tpublic SparseVector getCol(int index) {\r\n\t\tSparseVector newVector = this.cols[index].copy();\r\n\t\t\r\n\t\treturn newVector;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Calculate average value for each row.\r\n\t * \r\n\t * @param default_value The default average of a row if it has no values.\r\n\t * @return A SparseVector that each value denotes the average of the row vector.\r\n\t **/\r\n\tpublic SparseVector getRowAverage(double defalut_value) {\r\n\t\tSparseVector rowAverage = new SparseVector(this.M);\r\n\t\tfor (int u = 0; u < this.M; u++) {\r\n\t\t\tSparseVector v = this.getRowRef(u);\r\n\t\t\tdouble avg = v.average();\r\n\t\t\tif (Double.isNaN(avg)) { // no rate is available: fall back to the given default value.\r\n\t\t\t\tavg = defalut_value;\r\n\t\t\t}\r\n\t\t\trowAverage.setValue(u, avg);\r\n\t\t}\r\n\t\treturn rowAverage;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Calculate average value for each column.\r\n\t * \r\n\t * @param default_value The default average of a column if it has no values.\r\n\t * @return A SparseVector that each value denotes the average of the column vector.\r\n\t */\r\n\tpublic SparseVector getColumnAverage(double defalut_value) {\r\n\t\tSparseVector columnAverage = new 
SparseVector(this.N);\r\n\t\tfor (int i = 0; i < this.N; i++) {\r\n\t\t\tSparseVector j = this.getColRef(i);\r\n\t\t\tdouble avg = j.average();\r\n\t\t\tif (Double.isNaN(avg)) { // no rate is available: set it as median value.\r\n\t\t\t\tavg = defalut_value;\r\n\t\t\t}\r\n\t\t\tcolumnAverage.setValue(i, avg);\r\n\t\t}\r\n\t\treturn columnAverage;\r\n\t}\r\n\r\n\t/*========================================\r\n\t * Properties\r\n\t *========================================*/\r\n\t/**\r\n\t * Capacity of this matrix.\r\n\t * \r\n\t * @return An array containing the length of this matrix.\r\n\t * Index 0 contains row count, while index 1 column count.\r\n\t */\r\n\tpublic int[] length() {\r\n\t\tint[] lengthArray = new int[2];\r\n\t\t\r\n\t\tlengthArray[0] = this.M;\r\n\t\tlengthArray[1] = this.N;\r\n\t\t\r\n\t\treturn lengthArray;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Size of this matrix, M * N\r\n\t */\r\n\tpublic int size() {\r\n\t\treturn M * N;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Actual number of items in the matrix.\r\n\t * \r\n\t * @return The number of items in the matrix.\r\n\t */\r\n\tpublic int itemCount() { \r\n\t\tint sum = 0;\r\n\t\t\r\n\t\tif (M > N) {\r\n\t\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\t\tsum += rows[i].itemCount();\r\n\t\t\t}\r\n\t\t}\r\n\t\telse {\r\n\t\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\t\tsum += cols[j].itemCount();\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Number of non-zero elements in the matrix.\r\n\t * \r\n\t * @return The number of non-zero elements in the matrix.\r\n\t */\r\n\tpublic int nonZeroCount() {\r\n\t\tint sum = 0;\r\n\t\tif (M > N) {\r\n\t\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\t\tsum += rows[i].nonZeroCount();\r\n\t\t\t}\r\n\t\t}\r\n\t\telse {\r\n\t\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\t\tsum += cols[j].nonZeroCount();\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\t\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return items in the diagonal in vector form.\r\n\t * \r\n\t * @return Diagonal vector from 
the matrix.\r\n\t */\r\n\tpublic SparseVector diagonal() {\r\n\t\tSparseVector v = new SparseVector(Math.min(this.M, this.N));\r\n\t\t\r\n\t\tfor (int i = 0; i < Math.min(this.M, this.N); i++) {\r\n\t\t\tdouble value = this.getValue(i, i);\r\n\t\t\tif (value > 0.0) {\r\n\t\t\t\tv.setValue(i, value);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn v;\r\n\t}\r\n\t\r\n\t/**\r\n\t * The value of maximum element in the matrix.\r\n\t * \r\n\t * @return The maximum value.\r\n\t */\r\n\tpublic double max() {\r\n\t\tdouble curr = Double.MIN_VALUE;\r\n\t\t\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tSparseVector v = this.getRowRef(i);\r\n\t\t\tif (v.itemCount() > 0) {\r\n\t\t\t\tdouble rowMax = v.max();\r\n\t\t\t\tif (v.max() > curr) {\r\n\t\t\t\t\tcurr = rowMax;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn curr;\r\n\t}\r\n\t\r\n\t/**\r\n\t * The value of minimum element in the matrix.\r\n\t * \r\n\t * @return The minimum value.\r\n\t */\r\n\tpublic double min() {\r\n\t\tdouble curr = Double.MAX_VALUE;\r\n\t\t\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tSparseVector v = this.getRowRef(i);\r\n\t\t\tif (v.itemCount() > 0) {\r\n\t\t\t\tdouble rowMin = v.min();\r\n\t\t\t\tif (v.min() < curr) {\r\n\t\t\t\t\tcurr = rowMin;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn curr;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Sum of every element. It ignores non-existing values.\r\n\t * \r\n\t * @return The sum of all elements.\r\n\t */\r\n\tpublic double sum() {\r\n\t\tdouble sum = 0.0;\r\n\t\t\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tSparseVector v = this.getRowRef(i);\r\n\t\t\tsum += v.sum();\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Square sum of all elements. 
It ignores non-existing values.\r\n\t * \r\n\t * @return The square sum of all elements\r\n\t */\r\n\tpublic double squareSum() {\r\n\t\tdouble sum = 0.0;\r\n\t\t\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tSparseVector v = this.getRowRef(i);\r\n\t\t\tsum += v.squareSum();\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t/**\r\n\t * Average of every element. It ignores non-existing values.\r\n\t * \r\n\t * @return The average value.\r\n\t */\r\n\tpublic double average() {\r\n\t\treturn this.sum() / this.itemCount();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Variance of every element. It ignores non-existing values.\r\n\t * \r\n\t * @return The variance value.\r\n\t */\r\n\tpublic double variance() {\r\n\t\tdouble avg = this.average();\r\n\t\tdouble sum = 0.0;\r\n\t\t\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tArrayList<Integer> itemList = this.getRowRef(i).indexList();\r\n\t\t\tfor (int j : itemList) {\r\n\t\t\t\tsum += Math.pow(this.getValue(i, j) - avg, 2);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn sum / this.itemCount();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Standard Deviation of every element. 
It ignores non-existing values.\r\n\t * \r\n\t * @return The standard deviation value.\r\n\t */\r\n\tpublic double stdev() {\r\n\t\treturn Math.sqrt(this.variance());\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return the (non-zero) index pairs.\r\n\t * @return The list of (row, column) index pairs.\r\n\t */\r\n\tpublic ArrayList<Pair<Integer, Integer>> indexPairs() {\r\n\t\tArrayList<Pair<Integer, Integer>> pairs = new ArrayList<Pair<Integer, Integer>>();\r\n\t\tfor (int i = 0; i < M; i ++) {\r\n\t\t\tfor (int j : rows[i].indexList()) {\r\n\t\t\t\tpairs.add(new Pair<Integer, Integer>(i, j));\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn pairs;\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Matrix operations\r\n\t *========================================*/\r\n\t/**\r\n\t * Scalar multiplication (aX).\r\n\t * \r\n\t * @param alpha The scalar value to be multiplied to this matrix.\r\n\t * @return The resulting matrix after scaling.\r\n\t */\r\n\tpublic SparseMatrix scale(double alpha) {\r\n\t\tSparseMatrix A = new SparseMatrix(this.M, this.N);\r\n\t\t\r\n\t\tfor (int i = 0; i < A.M; i++) {\r\n\t\t\tA.rows[i] = this.getRowRef(i).scale(alpha);\r\n\t\t}\r\n\t\tfor (int j = 0; j < A.N; j++) {\r\n\t\t\tA.cols[j] = this.getColRef(j).scale(alpha);\r\n\t\t}\r\n\t\t\r\n\t\treturn A;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scalar multiplication (aX) on the matrix itself.\r\n\t * This is used for minimizing memory usage.\r\n\t * \r\n\t * @param alpha The scalar value to be multiplied to this matrix.\r\n\t */\r\n\tpublic SparseMatrix selfScale(double alpha) {\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tArrayList<Integer> itemList = this.getRowRef(i).indexList();\r\n\t\t\tfor (int j : itemList) {\r\n\t\t\t\tthis.setValue(i, j, this.getValue(i, j) * alpha);\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scalar addition.\r\n\t * @param alpha The scalar value to be added to this matrix.\r\n\t * @return The resulting matrix after addition.\r\n\t */\r\n\tpublic SparseMatrix add(double alpha) 
{\r\n\t\tSparseMatrix A = new SparseMatrix(this.M, this.N);\r\n\t\t\r\n\t\tfor (int i = 0; i < A.M; i++) {\r\n\t\t\tA.rows[i] = this.getRowRef(i).add(alpha);\r\n\t\t}\r\n\t\tfor (int j = 0; j < A.N; j++) {\r\n\t\t\tA.cols[j] = this.getColRef(j).add(alpha);\r\n\t\t}\r\n\t\t\r\n\t\treturn A;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scalar addition on the matrix itself.\r\n\t * @param alpha The scalar value to be added to this matrix.\r\n\t */\r\n\tpublic void selfAdd(double alpha) {\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tArrayList<Integer> itemList = this.getRowRef(i).indexList();\r\n\t\t\tfor (int j : itemList) {\r\n\t\t\t\tthis.setValue(i, j, this.getValue(i, j) + alpha);\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Exponential of a given constant.\r\n\t * \r\n\t * @param alpha The exponent.\r\n\t * @return The resulting exponential matrix.\r\n\t */\r\n\tpublic SparseMatrix exp(double alpha) {\r\n\t\tfor (int i = 0; i < this.M; i++) {\r\n\t\t\tSparseVector b = this.getRowRef(i);\r\n\t\t\tArrayList<Integer> indexList = b.indexList();\r\n\t\t\t\r\n\t\t\tfor (int j : indexList) {\r\n\t\t\t\tthis.setValue(i, j, Math.pow(alpha, this.getValue(i, j)));\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * The transpose of the matrix.\r\n\t * This is simply implemented by interchanging row and column each other. 
\r\n\t * \r\n\t * @return The transpose of the matrix.\r\n\t */\r\n\tpublic SparseMatrix transpose() {\r\n\t\tSparseMatrix A = new SparseMatrix(this.N, this.M);\r\n\t\t\r\n\t\tA.cols = this.rows;\r\n\t\tA.rows = this.cols;\r\n\t\t\r\n\t\treturn A;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Matrix-vector product (b = Ax)\r\n\t * \r\n\t * @param x The vector to be multiplied to this matrix.\r\n\t * @throws RuntimeException when dimensions disagree\r\n\t * @return The resulting vector after multiplication.\r\n\t */\r\n\tpublic SparseVector times(SparseVector x) {\r\n\t\tif (N != x.length())\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\r\n\t\tSparseMatrix A = this;\r\n\t\tSparseVector b = new SparseVector(M);\r\n\t\t\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tb.setValue(i, A.rows[i].innerProduct(x));\r\n\t\t}\r\n\t\t\r\n\t\treturn b;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Matrix-matrix product (C = AB)\r\n\t * \r\n\t * @param B The matrix to be multiplied to this matrix.\r\n\t * @throws RuntimeException when dimensions disagree\r\n\t * @return The resulting matrix after multiplication.\r\n\t */\r\n\tpublic SparseMatrix times(SparseMatrix B) {\r\n\t\t// original implementation\r\n\t\tif (N != (B.length())[0])\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\r\n\t\tSparseMatrix A = this;\r\n\t\tSparseMatrix C = new SparseMatrix(M, (B.length())[1]);\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tfor (int j = 0; j < (B.length())[1]; j++) {\r\n\t\t\t\tSparseVector x = A.getRowRef(i);\r\n\t\t\t\tSparseVector y = B.getColRef(j);\r\n\t\t\t\t\r\n\t\t\t\tif (x != null && y != null)\r\n\t\t\t\t\tC.setValue(i, j, x.innerProduct(y));\r\n\t\t\t\telse\r\n\t\t\t\t\tC.setValue(i, j, 0.0);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Element-wise matrix product (C_ij = A_ij * B_ij)\r\n\t * @param B\r\n\t * @return\r\n\t */\r\n\tpublic SparseMatrix dotTimes(SparseMatrix B) {\r\n\t\tif (M != B.M || N != B.N) {\r\n\t\t\tthrow new 
RuntimeException(\"dotTimes: Matrices are not of the same size!\");\r\n\t\t}\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i ++) {\r\n\t\t\tArrayList<Integer> wordList = this.rows[i].indexList();\r\n\t\t\tfor (int j : wordList) {\r\n\t\t\t\tdouble A_ij = getValue(i, j);\r\n\t\t\t\tdouble B_ij = B.getValue(i, j);\r\n\t\t\t\tif (A_ij != 0 && B_ij != 0) {\r\n\t\t\t\t\tC.setValue(i, j, A_ij * B_ij);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Element-wise matrix division (C_ij = A_ij / B_ij)\r\n\t * It ignores 0 elements.\r\n\t * @param B\r\n\t * @return\r\n\t */\r\n\tpublic SparseMatrix dotDivide(SparseMatrix B) {\r\n\t\tif (M != B.M || N != B.N) {\r\n\t\t\tthrow new RuntimeException(\"dotDivide: Matrices are not of the same size!\");\r\n\t\t}\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i ++) {\r\n\t\t\tArrayList<Integer> wordList = this.rows[i].indexList();\r\n\t\t\tfor (int j : wordList) {\r\n\t\t\t\tdouble A_ij = getValue(i, j);\r\n\t\t\t\tdouble B_ij = B.getValue(i, j);\r\n\t\t\t\tif (A_ij != 0 && B_ij != 0) {\r\n\t\t\t\t\tC.setValue(i, j, A_ij / B_ij);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\t\t\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/** TF-IDF term weighting on an itemWords Matrix (row denotes item, column denotes word, each value is an integer).\r\n\t *  \r\n\t * @return TF-IDF term weighted matrix.\r\n\t */\r\n\tpublic SparseMatrix tfidf() {\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) { // row represents a doc\r\n\t\t\tArrayList<Integer> wordList = rows[i].indexList();\r\n\t\t\tfor (int j : wordList) { // col represents a word\r\n\t\t\t\tif (this.getValue(i, j) != 0) {\r\n\t\t\t\t\tdouble TF = 1 + log2(getValue(i, j));\r\n\t\t\t\t\tdouble IDF = log2((double)M / cols[j].itemCount());\r\n\t\t\t\t\tC.setValue(i, j, TF * IDF);\r\n\t\t\t\t}\r\n\t\t\t\t\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * IDF term weighting on 
an itemWords matrix.\r\n\t * @return\r\n\t */\r\n\tpublic SparseMatrix idf() {\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) { // row represents a doc\r\n\t\t\tArrayList<Integer> wordList = rows[i].indexList();\r\n\t\t\tfor (int j : wordList) { // col represent a word\r\n\t\t\t\tif (this.getValue(i, j) != 0) {\r\n\t\t\t\t\tdouble TF = 1;\r\n\t\t\t\t\tdouble IDF = log2((double)M / cols[j].itemCount());\r\n\t\t\t\t\tC.setValue(i, j, TF * IDF);\r\n\t\t\t\t}\r\n\t\t\t\t\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * TF term weighting on an itemWords Matrix (row denotes item, column denotes word, each value is an integer).\r\n\t * @return TF term weighted matrix.\r\n\t */\r\n\tpublic SparseMatrix tf() {\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) { // row represents a doc\r\n\t\t\tArrayList<Integer> wordList = rows[i].indexList();\r\n\t\t\tfor (int j : wordList) { // col represent a word\r\n\t\t\t\tif (this.getValue(i, j) != 0) {\r\n\t\t\t\t\tdouble TF = 1 + log2(getValue(i, j));\r\n\t\t\t\t\tC.setValue(i, j, TF);\r\n\t\t\t\t}\r\n\t\t\t\t\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\tpublic SparseMatrix log2() {\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tArrayList<Integer> indexList = this.getRowRef(i).indexList();\r\n\t\t\tfor (int j : indexList) {\r\n\t\t\t\tC.setValue(i, j, 1 + log2(this.getValue(i, j)));\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\tprivate double log2(double n) {\r\n\t\treturn Math.log(n) / Math.log(2);\r\n\t}\r\n\t\r\n\t\r\n\t/** Convert a non-negative matrix to a row stochastic matrix (i.e. 
sum of a row is 1).\r\n\t *  It ignores 0 row vector.\r\n\t *  \r\n\t * @return Row stochastic matrix\r\n\t */\r\n\tpublic SparseMatrix rowStochastic() {\r\n\t\tSparseMatrix C = new SparseMatrix(M, N); \r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tdouble sum  = rows[i].sum();\r\n\t\t\tif (sum != 0) {\r\n\t\t\t\tfor (int j : this.rows[i].indexList()) {\r\n\t\t\t\t\tC.setValue(i, j, getValue(i, j) / sum);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Apply L2 norm on each row vector.\r\n\t * It ignores 0 row vector.\r\n\t * @return\r\n\t */\r\n\tpublic SparseMatrix rowL2Norm() {\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tdouble squareSum = rows[i].squareSum();\r\n\t\t\tif (squareSum != 0) {\r\n\t\t\t\tdouble l2_norm = Math.sqrt(squareSum);\r\n\t\t\t\tfor (int j : rows[i].indexList()) {\r\n\t\t\t\t\tC.setValue(i, j, getValue(i, j) / l2_norm);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/** Convert a non-negative matrix to a column stochastic matrix (i.e. 
sum of a column is 1).\r\n\t *  It ignores 0 column vector.\r\n\t *  \r\n\t * @return Column stochastic matrix.\r\n\t */\r\n\tpublic SparseMatrix colStochastic() {\r\n\t\tSparseMatrix C = new SparseMatrix(this.M, this.N);\r\n\t\tfor (int j = 0; j < this.N; j++) {\r\n\t\t\tdouble sum = this.cols[j].sum();\r\n\t\t\tif (sum != 0) {\r\n\t\t\t\tfor (int i : this.cols[j].indexList()) {\r\n\t\t\t\t\tC.setValue(i, j, this.getValue(i, j) / sum);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn C;\r\n\t}\r\n\r\n\t/**\r\n\t * Matrix-matrix product (A = AB), without using extra memory.\r\n\t * \r\n\t * @param B The matrix to be multiplied to this matrix.\r\n\t * @throws RuntimeException when dimensions disagree\r\n\t */\r\n\tpublic void selfTimes(SparseMatrix B) {\r\n\t\t// original implementation\r\n\t\tif (N != (B.length())[0])\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tSparseVector tmp = new SparseVector(N);\r\n\t\t\tfor (int j = 0; j < (B.length())[1]; j++) {\r\n\t\t\t\tSparseVector x = this.getRowRef(i);\r\n\t\t\t\tSparseVector y = B.getColRef(j);\r\n\t\t\t\t\r\n\t\t\t\tif (x != null && y != null)\r\n\t\t\t\t\ttmp.setValue(j, x.innerProduct(y));\r\n\t\t\t\telse\r\n\t\t\t\t\ttmp.setValue(j, 0.0);\r\n\t\t\t}\r\n\t\t\t\r\n\t\t\tfor (int j = 0; j < (B.length())[1]; j++) {\r\n\t\t\t\tthis.setValue(i, j, tmp.getValue(j));\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\r\n\t/**\r\n\t * Matrix-matrix sum (C = A + B)\r\n\t * \r\n\t * @param B The matrix to be added to this matrix.\r\n\t * @throws RuntimeException when dimensions disagree\r\n\t * @return The resulting matrix after summation.\r\n\t */\r\n\tpublic SparseMatrix plus(SparseMatrix B) {\r\n\t\tSparseMatrix A = this;\r\n\t\tif (A.M != B.M || A.N != B.N)\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tC.rows[i] = 
A.rows[i].plus(B.rows[i]);\r\n\t\t}\r\n\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\tC.cols[j] = A.cols[j].plus(B.cols[j]);\r\n\t\t}\r\n\t\t\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Matrix-matrix subtraction (C = A - B)\r\n\t * \r\n\t * @param B The matrix to be subtracted from this matrix.\r\n\t * @throws RuntimeException when dimensions disagree\r\n\t * @return The resulting matrix after subtraction.\r\n\t */\r\n\tpublic SparseMatrix minus(SparseMatrix B) {\r\n\t\tSparseMatrix A = this;\r\n\t\tif (A.M != B.M || A.N != B.N)\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\r\n\t\tSparseMatrix C = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i++) {\r\n\t\t\tC.rows[i] = A.rows[i].minus(B.rows[i]);\r\n\t\t}\r\n\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\tC.cols[j] = A.cols[j].minus(B.cols[j]);\r\n\t\t}\r\n\t\t\r\n\t\treturn C;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Generate an identity matrix with the given size.\r\n\t * \r\n\t * @param n The size of requested identity matrix.\r\n\t * @return An identity matrix with the size of n by n. 
\r\n\t */\r\n\tpublic static SparseMatrix makeIdentity(int n) {\r\n\t\tSparseMatrix m = new SparseMatrix(n, n);\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\tm.setValue(i, i, 1.0);\r\n\t\t}\r\n\t\t\r\n\t\treturn m;\r\n\t}\r\n\t\t\r\n\t/**\r\n\t * Generate a uniform matrix with the given size.\r\n\t * The sum of each row is 1.\r\n\t * @param m\r\n\t * @param n\r\n\t * @return\r\n\t */\r\n\tpublic static SparseMatrix makeUniform(int M, int N) {\r\n\t\tSparseMatrix m = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i ++) {\r\n\t\t\tfor (int j = 0; j < N; j++) {\r\n\t\t\t\tm.setValue(i, j, 1.0 / N);\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn m;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Generate a random matrix with the given size and sparseRate.\r\n\t * Each entry is in the range [0,1]\r\n\t * @param M\r\n\t * @param N\r\n\t * @param sparseRate\r\n\t * @return\r\n\t */\r\n\tpublic static SparseMatrix makeRandom(int M, int N, double sparseRate) {\r\n\t\tif (sparseRate <=0 || sparseRate >1) {\r\n\t\t\tthrow new RuntimeException(\"SparseRate input error!\");\r\n\t\t}\r\n\t\t\r\n\t\tSparseMatrix m = new SparseMatrix(M, N);\r\n\t\tfor (int i = 0; i < M; i ++) {\r\n\t\t\tfor (int j = 0; j < N; j ++) {\r\n\t\t\t\tdouble random = Math.random();\r\n\t\t\t\tif (random < sparseRate) {\r\n\t\t\t\t\tm.setValue(i, j, Math.random());\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn m;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Calculate inverse matrix.\r\n\t * \r\n\t * @throws RuntimeException when dimensions disagree.\r\n\t * @return The inverse of current matrix.\r\n\t */\r\n\tpublic SparseMatrix inverse() {\r\n\t\tif (this.M != this.N)\r\n\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\r\n\t\tSparseMatrix original = this;\r\n\t\tSparseMatrix newMatrix = makeIdentity(this.M);\r\n\t\t\r\n\t\tint n = this.M;\r\n\t\t\r\n\t\tif (n == 1) {\r\n\t\t\tnewMatrix.setValue(0, 0, 1 / original.getValue(0, 0));\r\n\t\t\treturn newMatrix;\r\n\t\t}\r\n\r\n\t\tSparseMatrix b = new 
SparseMatrix(original);\r\n\t\t\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\t// find pivot:\r\n\t\t\tdouble mag = 0;\r\n\t\t\tint pivot = -1;\r\n\r\n\t\t\tfor (int j = i; j < n; j++) {\r\n\t\t\t\tdouble mag2 = Math.abs(b.getValue(j, i));\r\n\t\t\t\tif (mag2 > mag) {\r\n\t\t\t\t\tmag = mag2;\r\n\t\t\t\t\tpivot = j;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\r\n\t\t\t// no pivot (error):\r\n\t\t\tif (pivot == -1 || mag == 0) {\r\n\t\t\t\treturn newMatrix;\r\n\t\t\t}\r\n\r\n\t\t\t// move pivot row into position:\r\n\t\t\tif (pivot != i) {\r\n\t\t\t\tdouble temp;\r\n\t\t\t\tfor (int j = i; j < n; j++) {\r\n\t\t\t\t\ttemp = b.getValue(i, j);\r\n\t\t\t\t\tb.setValue(i, j, b.getValue(pivot, j));\r\n\t\t\t\t\tb.setValue(pivot, j, temp);\r\n\t\t\t\t}\r\n\r\n\t\t\t\tfor (int j = 0; j < n; j++) {\r\n\t\t\t\t\ttemp = newMatrix.getValue(i, j);\r\n\t\t\t\t\tnewMatrix.setValue(i, j, newMatrix.getValue(pivot, j));\r\n\t\t\t\t\tnewMatrix.setValue(pivot, j, temp);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\r\n\t\t\t// normalize pivot row:\r\n\t\t\tmag = b.getValue(i, i);\r\n\t\t\tfor (int j = i; j < n; j ++) {\r\n\t\t\t\tb.setValue(i, j, b.getValue(i, j) / mag);\r\n\t\t\t}\r\n\t\t\tfor (int j = 0; j < n; j ++) {\r\n\t\t\t\tnewMatrix.setValue(i, j, newMatrix.getValue(i, j) / mag);\r\n\t\t\t}\r\n\r\n\t\t\t// eliminate pivot row component from other rows:\r\n\t\t\tfor (int k = 0; k < n; k ++) {\r\n\t\t\t\tif (k == i)\r\n\t\t\t\t\tcontinue;\r\n\t\t\t\t\r\n\t\t\t\tdouble mag2 = b.getValue(k, i);\r\n\r\n\t\t\t\tfor (int j = i; j < n; j ++) {\r\n\t\t\t\t\tb.setValue(k, j, b.getValue(k, j) - mag2 * b.getValue(i, j));\r\n\t\t\t\t}\r\n\t\t\t\tfor (int j = 0; j < n; j ++) {\r\n\t\t\t\t\tnewMatrix.setValue(k, j, newMatrix.getValue(k, j) - mag2 * newMatrix.getValue(i, j));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn newMatrix;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Calculate Cholesky decomposition of the matrix.\r\n\t * \r\n\t * @throws RuntimeException when matrix is not square.\r\n\t * @return The Cholesky decomposition 
result.\r\n\t */\r\n\tpublic SparseMatrix cholesky() {\r\n\t\tif (this.M != this.N)\r\n\t\t\tthrow new RuntimeException(\"Matrix is not square\");\r\n\t\t\r\n\t\tSparseMatrix A = this;\r\n\t\t\r\n\t\tint n = A.M;\r\n\t\tSparseMatrix L = new SparseMatrix(n, n);\r\n\r\n\t\tfor (int i = 0; i < n; i++)  {\r\n\t\t\tfor (int j = 0; j <= i; j++) {\r\n\t\t\t\tdouble sum = 0.0;\r\n\t\t\t\tfor (int k = 0; k < j; k++) {\r\n\t\t\t\t\tsum += L.getValue(i, k) * L.getValue(j, k);\r\n\t\t\t\t}\r\n\t\t\t\tif (i == j) {\r\n\t\t\t\t\tL.setValue(i, i, Math.sqrt(A.getValue(i, i) - sum));\r\n\t\t\t\t}\r\n\t\t\t\telse {\r\n\t\t\t\t\tL.setValue(i, j, 1.0 / L.getValue(j, j) * (A.getValue(i, j) - sum));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (Double.isNaN(L.getValue(i, i))) {\r\n\t\t\t\t//throw new RuntimeException(\"Matrix not positive definite: (\" + i + \", \" + i + \")\");\r\n\t\t\t\treturn null;\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn L.transpose();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Generate a covariance matrix of the current matrix.\r\n\t * \r\n\t * @return The covariance matrix of the current matrix.\r\n\t */\r\n\tpublic SparseMatrix covariance() {\r\n\t\tint columnSize = this.N;\r\n\t\tSparseMatrix cov = new SparseMatrix(columnSize, columnSize);\r\n\t\t\r\n\t\tfor (int i = 0; i < columnSize; i++) {\r\n\t\t\tfor (int j = i; j < columnSize; j++) {\r\n\t\t\t\tSparseVector data1 = this.getCol(i);\r\n\t\t\t\tSparseVector data2 = this.getCol(j);\r\n\t\t\t\tdouble avg1 = data1.average();\r\n\t\t\t\tdouble avg2 = data2.average();\r\n\t\t\t\t\r\n\t\t\t\tdouble value = data1.sub(avg1).innerProduct(data2.sub(avg2)) / (data1.length()-1);\r\n\t\t\t\tcov.setValue(i, j, value);\r\n\t\t\t\tcov.setValue(j, i, value);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn cov;\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Matrix operations (partial)\r\n\t *========================================*/\r\n\t/**\r\n\t * Scalar Multiplication only with indices in indexList.\r\n\t * \r\n\t * @param 
alpha The scalar to be multiplied to this matrix.\r\n\t * @param indexList The list of indices to which scaling is applied.\r\n\t * @return The resulting matrix after scaling.\r\n\t */\r\n\tpublic SparseMatrix partScale(double alpha, int[] indexList) {\r\n\t\tif (indexList != null) {\r\n\t\t\tfor (int i : indexList) {\r\n\t\t\t\tfor (int j : indexList) {\r\n\t\t\t\t\tthis.setValue(i, j, this.getValue(i, j) * alpha);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Matrix summation (A = A + B) only with indices in indexList.\r\n\t * \r\n\t * @param B The matrix to be added to this matrix.\r\n\t * @param indexList The list of indices to which summation is applied.\r\n\t * @throws RuntimeException when dimensions disagree.\r\n\t * @return The resulting matrix after summation.\r\n\t */\r\n\tpublic SparseMatrix partPlus(SparseMatrix B, int[] indexList) {\r\n\t\tif (indexList != null) {\r\n\t\t\tif (this.M != B.M || this.N != B.N)\r\n\t\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\t\r\n\t\t\tfor (int i : indexList) {\r\n\t\t\t\tthis.rows[i].partPlus(B.rows[i], indexList);\r\n\t\t\t}\r\n\t\t\tfor (int j : indexList) {\r\n\t\t\t\tthis.cols[j].partPlus(B.cols[j], indexList);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Matrix subtraction (A = A - B) only with indices in indexList.\r\n\t * \r\n\t * @param B The matrix to be subtracted from this matrix.\r\n\t * @param indexList The list of indices to which subtraction is applied.\r\n\t * @throws RuntimeException when dimensions disagree.\r\n\t * @return The resulting matrix after subtraction.\r\n\t */\r\n\tpublic SparseMatrix partMinus(SparseMatrix B, int[] indexList) {\r\n\t\tif (indexList != null) {\r\n\t\t\tif (this.M != B.M || this.N != B.N)\r\n\t\t\t\tthrow new RuntimeException(\"Dimensions disagree\");\r\n\t\t\t\r\n\t\t\tfor (int i : indexList) {\r\n\t\t\t\tthis.rows[i].partMinus(B.rows[i], indexList);\r\n\t\t\t}\r\n\t\t\tfor (int j : 
indexList) {\r\n\t\t\t\tthis.cols[j].partMinus(B.cols[j], indexList);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Matrix-vector product (b = Ax) only with indices in indexList.\r\n\t * \r\n\t * @param x The vector to be multiplied to this matrix.\r\n\t * @param indexList The list of indices to be applied multiplication.\r\n\t * @return The resulting vector after multiplication.\r\n\t */\r\n\tpublic SparseVector partTimes(SparseVector x, int[] indexList) {\r\n\t\tif (indexList == null)\r\n\t\t\treturn x;\r\n\t\t\r\n\t\tSparseVector b = new SparseVector(M);\r\n\t\t\r\n\t\tfor (int i : indexList) {\r\n\t\t\tb.setValue(i, this.rows[i].partInnerProduct(x, indexList));\r\n\t\t}\r\n\t\t\r\n\t\treturn b;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert the matrix to a printable string.\r\n\t * \r\n\t * @return The resulted string in the form of \"(1, 2: 5.0) (2, 4: 4.5)\"\r\n\t */\r\n\t@Override\r\n\tpublic String toString() {\r\n        String s = \"\";\r\n        \r\n        for (int i = 0; i < this.M; i++) {\r\n        \tSparseVector row = this.getRowRef(i);\r\n        \tif (row.itemCount() == 0)\tcontinue;\r\n        \tfor (int j : row.indexList()) {\r\n        \t\ts += \"(\" + i + \", \" + j + \": \" + this.getValue(i, j) + \") \";\r\n        \t}\r\n        \ts += \"\\r\\n\";\r\n        }\r\n        \r\n        return s;\r\n    }\r\n}\r\n"
  },
  {
    "path": "src/data_structure/SparseVector.java",
    "content": "package data_structure;\r\n\r\nimport java.io.Serializable;\r\nimport java.util.ArrayList;\r\nimport java.util.HashMap;\r\nimport java.util.HashSet;\r\n\r\nimport utils.CommonUtils;\r\n/**\r\n * This class implements sparse vector, containing empty values for most space.\r\n * \r\n * @author Joonseok Lee\r\n * @since 2012. 4. 20\r\n * @version 1.1\r\n */\r\npublic class SparseVector implements Serializable{\r\n\tprivate static final long serialVersionUID = 8002;\r\n\t\r\n\t/** The length (maximum number of items to be stored) of sparse vector. */\r\n\tprivate int N;\r\n\t/** Data map for <index, value> pairs. */\r\n\tprivate DataMap<Integer, Double> map;\r\n\r\n\t/*========================================\r\n\t * Constructors\r\n\t *========================================*/\r\n\t/**\r\n\t * Construct an empty sparse vector, with capacity 0.\r\n\t * Capacity can be reset with setLength method later.\r\n\t */\r\n\tpublic SparseVector() {\r\n\t\tthis.N = 0;\r\n\t\tthis.map = new DataMap<Integer, Double>();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Construct a new sparse vector with size n.\r\n\t * \r\n\t * @param n The capacity of new sparse vector.\r\n\t */\r\n\tpublic SparseVector(int n) {\r\n\t\tthis.N = n;\r\n\t\tthis.map = new DataMap<Integer, Double>();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Construct an empty sparse vector, with data copied from another sparse vector.\r\n\t * \r\n\t * @param sv The vector having data being copied.\r\n\t */\r\n\tpublic SparseVector(SparseVector sv) {\r\n\t\tthis.N = sv.N;\r\n\t\tthis.map = new DataMap<Integer, Double>();\r\n\t\t\r\n\t\tfor (int i = 0; i < N; i++) {\r\n\t\t\tthis.setValue(i, sv.getValue(i)); \r\n\t\t}\r\n\t}\r\n\r\n\t/*========================================\r\n\t * Getter/Setter\r\n\t *========================================*/\r\n\t/**\r\n\t * Set a new value at the given index.\r\n\t * \r\n\t * @param i The index to store new value.\r\n\t * @param value The value to store.\r\n\t */\r\n\tpublic void setValue(int 
i, double value) {\r\n\t\tif (value == 0.0)\r\n\t\t\tmap.remove(i);\r\n\t\telse\r\n\t\t\tmap.put(i, value);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set the values of current vector as newVector\r\n\t * @param newVector\r\n\t */\r\n\tpublic void setVector(SparseVector newVector) {\r\n\t\tif (this.length() != newVector.length()) {\r\n\t\t\tthrow new RuntimeException(\"Vector length disagrees.\");\r\n\t\t}\r\n\t\tArrayList<Integer> indexList = this.indexList();\r\n\t\tfor (int i : indexList) \r\n\t\t\tthis.setValue(i, 0);\r\n\t\t\r\n\t\tindexList = newVector.indexList();\r\n\t\tfor (int i : indexList)\r\n\t\t\tthis.setValue(i, newVector.getValue(i));\r\n\t}\r\n\t\r\n\t/**\r\n\t * Retrieve a stored value from the given index.\r\n\t * \r\n\t * @param i The index to retrieve.\r\n\t * @return The value stored at the given index.\r\n\t */\r\n\tpublic double getValue(int i) {\r\n\t\tif (map.contains(i))\r\n\t\t\treturn map.get(i);\r\n\t\telse\r\n\t\t\treturn 0.0;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Delete a value stored at the given index.\r\n\t * \r\n\t * @param i The index to delete the value in it.\r\n\t */\r\n\tpublic void remove(int i) {\r\n\t\tif (map.contains(i))\r\n\t\t\tmap.remove(i);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Copy the whole sparse vector and make a clone.\r\n\t * \r\n\t * @return A clone of the current sparse vector, containing same values.\r\n\t */\r\n\tpublic SparseVector copy() {\r\n\t\tSparseVector newVector = new SparseVector(this.N);\r\n\t\t\r\n\t\tfor (int i : this.map) {\r\n\t\t\tnewVector.setValue(i, this.getValue(i));\r\n\t\t}\r\n\t\t\r\n\t\treturn newVector;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Get an Arraylist of existing indices.\r\n\t * @return An arraylist of integer, contain indices with valid items.\r\n\t */\r\n\tpublic ArrayList<Integer> indexList() {\r\n\t\tif (this.itemCount() == 0)\r\n\t\t\treturn new ArrayList<Integer>();\r\n\t\t\r\n\t\tArrayList<Integer> result = new ArrayList<Integer>();\r\n\t\tfor (int i : this.map) 
{\r\n\t\t\tresult.add(i);\r\n\t\t}\r\n\t\t\r\n\t\treturn result;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Get a HashSet of existing indices.\r\n\t * @return A hashset of integer, contain indices with valid items.\r\n\t */\r\n\tpublic HashSet<Integer> indexSet() {\r\n\t\tif (this.itemCount() == 0)\r\n\t\t\treturn new HashSet<Integer>();\r\n\t\t\r\n\t\tHashSet<Integer> result = new HashSet<Integer>();\r\n\t\tfor (int i : this.map) {\r\n\t\t\tresult.add(i);\r\n\t\t}\r\n\t\t\r\n\t\treturn result;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set a same value to every element.\r\n\t * \r\n\t * @param value The value to assign to every element.\r\n\t */\r\n\tpublic void initialize(double value) {\r\n\t\tfor (int i = 0; i < this.N; i++) {\r\n\t\t\tthis.setValue(i, value);\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Set same value to given indices.\r\n\t * \r\n\t * @param index The list of indices, which will be assigned the new value.\r\n\t * @param value The new value to be assigned.\r\n\t */\r\n\tpublic void initialize(int[] index, double value) {\r\n\t\tfor (int i = 0; i < index.length; i++) {\r\n\t\t\tthis.setValue(index[i], value);\r\n\t\t}\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Properties\r\n\t *========================================*/\r\n\t/**\r\n\t * Capacity of this vector.\r\n\t * \r\n\t * @return The length of sparse vector\r\n\t */\r\n\tpublic int length() {\r\n\t\treturn N;\r\n\t}\r\n\r\n\t/**\r\n\t * Actual number of items in the vector.\r\n\t * \r\n\t * @return The number of items in the vector.\r\n\t */\r\n\tpublic int itemCount() {\r\n\t\treturn map.itemCount();\t\t\r\n\t}\r\n\t\r\n\t/**\r\n\t * Number of non-zero elements in the vector.\r\n\t * \r\n\t * @return The number of non-zero elements in the vector.\r\n\t */\r\n\tpublic int nonZeroCount() {\r\n\t\tint count = 0;\r\n\t\tfor (int i : map) {\r\n\t\t\tif (map.get(i) != 0)\r\n\t\t\t\tcount ++;\r\n\t\t}\r\n\t\treturn count;\r\n\t}\r\n\t\r\n\t\r\n\t/**\r\n\t * Set a new capacity of the vector.\r\n\t * 
\r\n\t * @param n The new capacity value.\r\n\t */\r\n\tpublic void setLength(int n) {\r\n\t\tthis.N = n;\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Unary Vector operations\r\n\t *========================================*/\r\n\t/**\r\n\t * Scalar addition operator.\r\n\t * \r\n\t * @param alpha The scalar value to be added to the original vector.\r\n\t * @return The resulting vector, added by alpha.\r\n\t */\r\n\tpublic SparseVector add(double alpha) {\r\n\t\tSparseVector a = this;\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\t\r\n\t\tfor (int i : a.map) {\r\n\t\t\tc.setValue(i, alpha + a.getValue(i));\r\n\t\t}\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scalar subtraction operator.\r\n\t * \r\n\t * @param alpha The scalar value to be subtracted from the original vector.\r\n\t * @return The resulting vector, subtracted by alpha.\r\n\t */\r\n\tpublic SparseVector sub(double alpha) {\r\n\t\tSparseVector a = this;\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\t\r\n\t\tfor (int i : a.map) {\r\n\t\t\tc.setValue(i, a.getValue(i) - alpha);\r\n\t\t}\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scalar multiplication operator.\r\n\t * \r\n\t * @param alpha The scalar value to be multiplied to the original vector.\r\n\t * @return The resulting vector, multiplied by alpha.\r\n\t */\r\n\tpublic SparseVector scale(double alpha) {\r\n\t\tSparseVector a = this;\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\t\r\n\t\tif (alpha == 0)\r\n\t\t\treturn c;\r\n\t\tfor (int i : a.map) {\r\n\t\t\tc.setValue(i, alpha * a.getValue(i));\r\n\t\t}\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scale multiplication operator on vector itself.\r\n\t * @param alpha\r\n\t * @return\r\n\t */\r\n\tpublic SparseVector selfScale(double alpha) {\r\n\t\tSparseVector a = this;\r\n\t\t\r\n\t\tfor (int i : a.map) {\r\n\t\t\ta.setValue(i, alpha * a.getValue(i));\r\n\t\t}\r\n\t\treturn a;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Scalar power 
operator.\r\n\t * \r\n\t * @param alpha The scalar value to be powered to the original vector.\r\n\t * @return The resulting vector, powered by alpha.\r\n\t */\r\n\tpublic SparseVector power(double alpha) {\r\n\t\tSparseVector a = this;\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\t\r\n\t\tfor (int i : a.map) {\r\n\t\t\tc.setValue(i, Math.pow(a.getValue(i), alpha));\r\n\t\t}\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Exponential of a given constant.\r\n\t * \r\n\t * @param alpha The exponent.\r\n\t * @return The resulting exponential vector.\r\n\t */\r\n\tpublic SparseVector exp(double alpha) {\r\n\t\tSparseVector a = this;\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\t\r\n\t\tfor (int i : a.map) {\r\n\t\t\tc.setValue(i, Math.pow(alpha, a.getValue(i)));\r\n\t\t}\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\tpublic SparseVector log2() {\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\t\r\n\t\tfor (int i : this.map) {\r\n\t\t\tc.setValue(i, 1 + log2(this.getValue(i)));\r\n\t\t}\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\tprivate double log2(double n) {\r\n\t\treturn Math.log(n) / Math.log(2);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Return a uniform vector of size n.\r\n\t * @param n\r\n\t */\r\n\tpublic static SparseVector makeUniform(int n) {\r\n\t\tSparseVector v = new SparseVector(n);\r\n\t\tdouble val = 1.0 / n;\r\n\t\tfor (int i = 0; i < n; i++) {\r\n\t\t\tv.setValue(i, val);\r\n\t\t}\r\n\t\treturn v;\r\n \t}\r\n\t\r\n\t/**\r\n\t * Randomly generate a vector of dimension m. 
Each value is in the range [0,1]\r\n\t * @param m\r\n\t * @return\r\n\t */\r\n\tpublic static SparseVector makeRandom(int m) {\r\n\t\tSparseVector a = new SparseVector(m);\r\n\t\tfor (int i = 0; i < m; i++) {\r\n\t\t\ta.setValue(i, Math.random());\r\n\t\t}\r\n\t\treturn a;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Calculate cosine similarity of two sparse vectors.\r\n\t * @param a\r\n\t * @param b\r\n\t * @return\r\n\t */\r\n\tpublic static double cosineSimilarity(SparseVector a, SparseVector b) {\r\n\t\tif (a.itemCount() == 0 || b.itemCount() == 0)\r\n\t\t\treturn 0;\r\n\t\t\r\n\t\tdouble innerProduct = a.innerProduct(b);\r\n\t\treturn innerProduct == 0 ? 0 :\r\n\t\t\tinnerProduct / (Math.sqrt(a.squareSum()) * Math.sqrt(b.squareSum()));\r\n\t}\r\n\r\n\t\r\n\t/**\r\n\t * 2-norm of the vector.\r\n\t * \r\n\t * @return 2-norm value of the vector.\r\n\t */\r\n\tpublic double norm() {\r\n\t\tSparseVector a = this;\r\n\t\treturn Math.sqrt(a.innerProduct(a));\r\n\t}\r\n\t\r\n\t/**\r\n\t * L1 norm (sum of elements is 1) of the vector.\r\n\t * @return L1-norm of the vector. 
\r\n\t */\r\n\tpublic SparseVector L1_norm() {\r\n\t\tdouble sum = this.sum();\r\n\t\treturn this.scale(1.0 / sum);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Sum of every element in the vector.\r\n\t * \r\n\t * @return Sum value of every element.\r\n\t */\r\n\tpublic double sum() {\r\n\t\tSparseVector a = this;\r\n\t\t\r\n\t\tdouble sum = 0.0;\r\n\t\tfor (int i : a.map) {\r\n\t\t\tsum += a.getValue(i);\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Square sum of all elements in the vector.\r\n\t * \r\n\t * @return Square sum of all elements.\r\n\t */\r\n\tpublic double squareSum() {\r\n\t\treturn this.innerProduct(this);\r\n\t}\r\n\t\r\n\t/**\r\n\t * The value of maximum element in the vector.\r\n\t * \r\n\t * @return Maximum value in the vector.\r\n\t */\r\n\tpublic double max() {\r\n\t\tSparseVector a = this;\r\n\t\t\r\n\t\tdouble curr = Double.MIN_VALUE;\r\n\t\tfor (int i : a.map) {\r\n\t\t\tif (a.getValue(i) > curr) {\r\n\t\t\t\tcurr = a.getValue(i);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn curr;\r\n\t}\r\n\t\r\n\t/**\r\n\t * The value of minimum element in the vector.\r\n\t * \r\n\t * @return Minimum value in the vector.\r\n\t */\r\n\tpublic double min() {\r\n\t\tSparseVector a = this;\r\n\t\t\r\n\t\tdouble curr = Double.MAX_VALUE;\r\n\t\tfor (int i : a.map) {\r\n\t\t\tif (a.getValue(i) < curr) {\r\n\t\t\t\tcurr = a.getValue(i);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn curr;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Sum of absolute value of every element in the vector.\r\n\t * \r\n\t * @return Sum of absolute value of every element.\r\n\t */\r\n\tpublic double absoluteSum() {\r\n\t\tSparseVector a = this;\r\n\t\t\r\n\t\tdouble sum = 0.0;\r\n\t\tfor (int i : a.map) {\r\n\t\t\tsum += Math.abs(a.getValue(i));\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Average of every element. 
It ignores non-existing values.\r\n\t * \r\n\t * @return The average value.\r\n\t */\r\n\tpublic double average() {\r\n\t\tSparseVector a = this;\r\n\t\t\r\n\t\treturn a.sum() / (double) a.itemCount();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Variance of every element. It ignores non-existing values.\r\n\t * \r\n\t * @return The variance value.\r\n\t */\r\n\tpublic double variance() {\r\n\t\tdouble avg = this.average();\r\n\t\tdouble sum = 0.0;\r\n\t\t\r\n\t\tfor (int i : this.map) {\r\n\t\t\tsum += Math.pow(this.getValue(i) - avg, 2);\r\n\t\t}\r\n\t\t\r\n\t\treturn sum / this.itemCount();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Standard Deviation of every element. It ignores non-existing values.\r\n\t * \r\n\t * @return The standard deviation value.\r\n\t */\r\n\tpublic double stdev() {\r\n\t\treturn Math.sqrt(this.variance());\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Binary Vector operations\r\n\t *========================================*/\r\n\t/**\r\n\t * Vector sum (a + b)\r\n\t * \r\n\t * @param b The vector to be added to this vector.\r\n\t * @return The resulting vector after summation.\r\n\t */\r\n\tpublic SparseVector plus(SparseVector b) {\r\n\t\tSparseVector a = this;\r\n\t\tif (a.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\tfor (int i : a.map)\r\n\t\t\tc.setValue(i, a.getValue(i));  // c = a\r\n\t\tfor (int i : b.map)\r\n\t\t\tc.setValue(i, b.getValue(i) + c.getValue(i)); // c = c + b\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Vector sum on itself (a + b)\r\n\t * @param b\r\n\t * @return\r\n\t */\r\n\tpublic SparseVector selfPlus(SparseVector b) {\r\n\t\tSparseVector a = this;\r\n\t\tif (a.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tfor (int i : b.map) {\r\n\t\t\ta.setValue(i, a.getValue(i) + b.getValue(i));\r\n\t\t}\r\n\t\treturn a;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Vector subtraction (a - b)\r\n\t * 
\r\n\t * @param b The vector to be subtracted from this vector.\r\n\t * @return The resulting vector after subtraction.\r\n\t */\r\n\tpublic SparseVector minus(SparseVector b) {\r\n\t\tSparseVector a = this;\r\n\t\tif (a.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\tfor (int i : a.map)\r\n\t\t\tc.setValue(i, a.getValue(i));  // c = a\r\n\t\tfor (int i : b.map)\r\n\t\t\tc.setValue(i, c.getValue(i) - b.getValue(i)); // c = c - b\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Vector subtraction on itself (a - b)\r\n\t * @param b\r\n\t * @return\r\n\t */\r\n\tpublic SparseVector selfMinus(SparseVector b) {\r\n\t\tSparseVector a = this;\r\n\t\tif (a.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tfor (int i : b.map) {\r\n\t\t\ta.setValue(i, a.getValue(i) - b.getValue(i));\r\n\t\t}\r\n\t\treturn a;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Vector subtraction (a - b), for only existing values.\r\n\t * The resulting vector can have a non-zero value only if both vectors have a value at the index.\r\n\t * \r\n\t * @param b The vector to be subtracted from this vector.\r\n\t * @return The resulting vector after subtraction.\r\n\t */\r\n\tpublic SparseVector commonMinus(SparseVector b) {\r\n\t\tSparseVector a = this;\r\n//\t\tif (a.N != b.N)\r\n//\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\tif (a.itemCount() <= b.itemCount()) {\r\n\t\t\tfor (int i : a.map) {\r\n\t\t\t\tif (b.map.contains(i)) c.setValue(i, a.getValue(i) - b.getValue(i));\r\n\t\t\t}\r\n\t\t}\r\n\t\telse {\r\n\t\t\tfor (int i : b.map) {\r\n\t\t\t\tif (a.map.contains(i)) c.setValue(i, a.getValue(i) - b.getValue(i));\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Inner product of two vectors.\r\n\t * \r\n\t * @param b The vector to be inner-producted with this vector.\r\n\t * @return 
The inner-product value.\r\n\t */\r\n\tpublic double innerProduct(SparseVector b) {\r\n\t\tSparseVector a = this;\r\n\t\tdouble sum = 0.0;\r\n\t\t\r\n\t\tif (a.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\t// iterate over the vector with the fewer items\r\n\t\tif (a.itemCount() <= b.itemCount()) {\r\n\t\t\tfor (int i : a.map) {\r\n\t\t\t\tif (b.map.contains(i)) sum += a.getValue(i) * b.getValue(i);\r\n\t\t\t}\r\n\t\t}\r\n\t\telse {\r\n\t\t\tfor (int i : b.map) {\r\n\t\t\t\tif (a.map.contains(i)) sum += a.getValue(i) * b.getValue(i);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Outer product of two vectors.\r\n\t * \r\n\t * @param b The vector to be outer-producted with this vector.\r\n\t * @return The resulting outer-product matrix. \r\n\t */\r\n\tpublic SparseMatrix outerProduct(SparseVector b) {\r\n\t\tSparseMatrix A = new SparseMatrix(this.N, b.N);\r\n\t\t\r\n\t\tfor (int i = 0; i < this.N; i++) {\r\n\t\t\tfor (int j = 0; j < b.N; j++) {\r\n\t\t\t\tA.setValue(i, j, this.getValue(i) * b.getValue(j));\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn A;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Dot product of two vectors (c_i = a_i * b_i)\r\n\t * @param b\r\n\t * @return The resulting doc-product vector.\r\n\t */\r\n\tpublic SparseVector dotProduct(SparseVector b) {\r\n\t\tif (N != b.N)\r\n\t\t\tthrow new RuntimeException(\"dotProduct Error - Vector lengths disagree\");\r\n\t\t\r\n\t\tSparseVector c = new SparseVector(N);\r\n\t\tfor (int i : map) {\r\n\t\t\tif (getValue(i) != 0 && b.getValue(i)!= 0)\r\n\t\t\t\tc.setValue(i, getValue(i) * b.getValue(i));\r\n\t\t}\r\n\t\treturn c;\r\n\t}\r\n\t\r\n\t/*========================================\r\n\t * Binary Vector operations (partial)\r\n\t *========================================*/\r\n\t/**\r\n\t * Vector sum (a + b) for indices only in the given indices.\r\n\t * \r\n\t * @param b The vector to be added to this vector.\r\n\t * @param indexList The list of 
indices to be applied summation.\r\n\t * @return The resulting vector after summation.\r\n\t */\r\n\tpublic SparseVector partPlus(SparseVector b, int[] indexList) {\r\n\t\tif (indexList == null)\r\n\t\t\treturn this;\r\n\t\t\r\n\t\tif (this.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tfor (int i : indexList)\r\n\t\t\tthis.setValue(i, this.getValue(i) + b.getValue(i)); // c = c + b\r\n\t\t\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Vector subtraction (a - b) for indices only in the given indices.\r\n\t * \r\n\t * @param b The vector to be subtracted from this vector.\r\n\t * @param indexList The list of indices to be applied subtraction.\r\n\t * @return The resulting vector after subtraction.\r\n\t */\r\n\tpublic SparseVector partMinus(SparseVector b, int[] indexList) {\r\n\t\tif (indexList == null)\r\n\t\t\treturn this;\r\n\t\t\r\n\t\tif (this.N != b.N)\r\n\t\t\tthrow new RuntimeException(\"Vector lengths disagree\");\r\n\t\t\r\n\t\tfor (int i : indexList)\r\n\t\t\tthis.setValue(i, this.getValue(i) - b.getValue(i)); // c = c - b\r\n\t\t\r\n\t\treturn this;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Inner-product for indices only in the given indices.\r\n\t * \r\n\t * @param b The vector to be inner-producted with this vector.\r\n\t * @param indexList The list of indices to be applied inner-product.\r\n\t * @return The inner-product value.\r\n\t */\r\n\tpublic double partInnerProduct(SparseVector b, int[] indexList) {\r\n\t\tdouble sum = 0.0;\r\n\t\t\r\n\t\tif (indexList != null) {\r\n\t\t\tfor (int i : indexList) {\r\n\t\t\t\tsum += this.getValue(i) * b.getValue(i);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn sum;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Outer-product for indices only in the given indices.\r\n\t * \r\n\t * @param b The vector to be outer-producted with this vector.\r\n\t * @param indexList The list of indices to be applied outer-product.\r\n\t * @return The outer-product value.\r\n\t */\r\n\tpublic SparseMatrix 
partOuterProduct(SparseVector b, int[] indexList) {\r\n\t\tif (indexList == null)\r\n\t\t\treturn null;\r\n\t\t\r\n\t\tSparseMatrix A = new SparseMatrix(b.length(), b.length());\r\n\t\t\r\n\t\tfor (int i : indexList) {\r\n\t\t\tfor (int j : indexList) {\r\n\t\t\t\tA.setValue(i, j, this.getValue(i) * b.getValue(j));\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treturn A;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Get the topK indices with largest values. \r\n\t * @param topK\r\n\t * @param ignoreIndices Indices to ignore. \r\n\t * @return\r\n\t */\r\n\tpublic ArrayList<Integer> topIndicesByValue(int topK, ArrayList<Integer> ignoreIndices) {\r\n\t\tHashMap<Integer, Double> hashmap = new HashMap<Integer, Double>();\r\n\t\tfor (int j : this.indexList()) {\r\n\t\t\thashmap.put(j, this.getValue(j));\r\n\t\t}\r\n\t\treturn CommonUtils.TopKeysByValue(hashmap, topK, ignoreIndices);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert the vector to a printable string.\r\n\t * \r\n\t * @return The resulted string in the form of \"(1: 5.0) (2: 4.5)\"\r\n\t */\r\n\t@Override\r\n\tpublic String toString() {\r\n        String s = \"\";\r\n        for (int i : this.map) {\r\n        \ts += String.format(\"(%d:\\t%.6f) \", i, map.get(i));\r\n            // s += \"(\" + i + \": \" + map.get(i) + \") \";\r\n        }\r\n        return s;\r\n    }\r\n\t\r\n\tpublic String KeysToString() {\r\n\t\tString s = \"[\";\r\n\t\tfor (int i : this.map) {\r\n\t\t\ts += i + \", \";\r\n\t\t}\r\n\t\ts += \"]\";\r\n\t\treturn s;\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/main/main.java",
    "content": "package main;\r\n\r\nimport java.io.BufferedReader;\r\nimport java.io.FileInputStream;\r\nimport java.io.FileNotFoundException;\r\nimport java.io.FileOutputStream;\r\nimport java.io.IOException;\r\nimport java.io.InputStreamReader;\r\nimport java.io.PrintWriter;\r\nimport java.util.Arrays;\r\nimport java.util.Collection;\r\nimport java.util.Collections;\r\nimport java.util.Comparator;\r\nimport java.util.HashMap;\r\nimport java.util.HashSet;\r\nimport java.util.List;\r\nimport java.util.NavigableMap;\r\nimport java.util.SortedSet;\r\nimport java.util.TreeSet;\r\n\r\nimport algorithms.*;\r\nimport utils.DatasetUtil;\r\nimport data_structure.DenseVector;\r\nimport data_structure.Rating;\r\nimport data_structure.SparseMatrix;\r\nimport data_structure.SparseVector;\r\nimport utils.Printer;\r\nimport utils.CommonUtils;\r\n\r\nimport java.util.ArrayList;\r\n\r\n/**\r\n * This is an abstract class for evaluating topK recommender systems (i.e. main functions.).\r\n * Define some variables to use, and member functions to load data.\r\n * \r\n * @author HeXiangnan\r\n * @since 2014.12.16\r\n */\r\n\r\npublic abstract class main {\r\n\r\n\t/** Rating matrix for training. */ \r\n\tpublic static SparseMatrix trainMatrix;\r\n\t\r\n\t/** Test ratings (sorted by time for global split). */\r\n\tpublic static ArrayList<Rating> testRatings;\r\n\t\r\n\tpublic static int topK = 100;\r\n\tpublic static int threadNum = 10;\r\n\t\r\n\tpublic static int userCount;\r\n\tpublic static int itemCount;\r\n\t\r\n\tpublic static void ReadRatings_GlobalSplit(String ratingFile, double testRatio)\r\n\t\t\tthrows IOException {\r\n\t\tuserCount = itemCount = 0;\r\n\t\tSystem.out.println(\"Global splitting with testRatio \" + testRatio);\r\n\t\t// Step 1. 
Construct data structure for sorting.\r\n\t\tSystem.out.print(\"Read ratings and sort.\");\r\n\t\tlong startTime = System.currentTimeMillis();\r\n\t\tArrayList<Rating> ratings = new ArrayList<Rating>();\r\n\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\tnew InputStreamReader(new FileInputStream(ratingFile)));\r\n\t\tString line;\r\n\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\tRating rating = new Rating(line);\r\n\t\t\tratings.add(rating);\r\n\t\t\tuserCount = Math.max(userCount, rating.userId);\r\n\t\t\titemCount = Math.max(itemCount, rating.itemId);\r\n\t\t}\r\n\t\treader.close();\r\n\t\tuserCount ++;\r\n\t\titemCount ++;\r\n\t\t\r\n\t\t// Step 2. Sort the ratings by time (small->large).\r\n\t\tComparator<Rating> c = new Comparator<Rating>() {\r\n\t\t\tpublic int compare(Rating o1, Rating o2) {\r\n\t\t\t\tif (o1.timestamp - o2.timestamp > 0)\treturn 1;\r\n\t\t\t\telse if (o1.timestamp - o2.timestamp < 0)\treturn -1;\r\n\t\t\t\telse return 0;\r\n\t\t\t}\r\n\t\t};\r\n\t\tCollections.sort(ratings, c);\r\n\t\tSystem.out.printf(\"[%s]\\n\", Printer.printTime(\r\n\t\t\t\tSystem.currentTimeMillis() - startTime));\r\n\t\t\r\n\t\t// Step 3. 
Generate trainMatrix and testStream\r\n\t\tSystem.out.printf(\"Generate trainMatrix and testStream.\");\r\n\t\tstartTime = System.currentTimeMillis();\r\n\t\ttrainMatrix = new SparseMatrix(userCount, itemCount);\r\n\t\ttestRatings = new ArrayList<Rating>();\r\n\t\t\r\n\t\tint testCount = (int) (ratings.size() * testRatio);\r\n\t\tint count = 0;\r\n\t\tfor (Rating rating : ratings) {\r\n\t\t\tif (count < ratings.size() - testCount) {  // train\r\n\t\t\t\ttrainMatrix.setValue(rating.userId, rating.itemId, 1);\r\n\t\t\t} else {  // test\r\n\t\t\t\ttestRatings.add(rating);\r\n\t\t\t}\r\n\t\t\tcount ++;\r\n\t\t}\r\n\t\t// Count number of new users/items/ratings in the test data\r\n\t\tHashSet<Integer> newUsers = new HashSet<Integer>();\r\n\t\tint newRatings = 0;\r\n\t\tfor (int u = 0; u < userCount; u ++) {\r\n\t\t\tif (trainMatrix.getRowRef(u).itemCount() == 0)\tnewUsers.add(u);\r\n\t\t}\r\n\t\tfor (Rating rating : testRatings) {\r\n\t\t\tif (newUsers.contains(rating.userId))\tnewRatings ++;\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.printf(\"[%s]\\n\", Printer.printTime(\r\n\t\t\t\tSystem.currentTimeMillis() - startTime));\r\n\t\t\r\n\t\t// Print some basic statistics of the dataset.\r\n\t\tSystem.out.println (\"Data\\t\" + ratingFile);\r\n\t\tSystem.out.println (\"#Users\\t\" + userCount + \", #newUser: \" + newUsers.size());\r\n\t\tSystem.out.println (\"#Items\\t\" + itemCount);\r\n\t\tSystem.out.printf(\"#Ratings\\t %d (train), %d(test), %d(#newTestRatings)\\n\", \r\n\t\t\t\ttrainMatrix.itemCount(),  testRatings.size(), newRatings);\r\n\t}\r\n\t\r\n\t/**\r\n\t *  Each line of .rating file is: userID\\t itemID\\t score\\t timestamp.\r\n\t *  userID starts from 0 to num_user-1\r\n\t *  The items of each user is sorted by time (small->large).\r\n\t */\t\r\n\tpublic static void ReadRatings_HoldOneOut(String ratingFile) throws IOException {\r\n\t\t\tuserCount = itemCount = 0;\r\n\t\t\tSystem.out.println(\"HoldOne out splitting.\");\r\n\t\t\t// Step 1. 
Construct data structure for sorting.\r\n\t\t\tSystem.out.print(\"Sort items for each user.\");\r\n\t\t\tlong startTime = System.currentTimeMillis();\r\n\t\t\tArrayList<ArrayList<Rating>> user_ratings = new ArrayList<ArrayList<Rating>>();\r\n\t\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\t\tnew InputStreamReader(new FileInputStream(ratingFile)));\r\n\t\t\tString line;\r\n\t\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\t\tRating rating = new Rating(line);\r\n\t\t\t\tif (user_ratings.size() - 1 < rating.userId) { // create a new user\r\n\t\t\t\t\tuser_ratings.add(new ArrayList<Rating>());\r\n\t\t\t\t}\r\n\t\t\t\tuser_ratings.get(rating.userId).add(rating);\r\n\t\t\t\tuserCount = Math.max(userCount, rating.userId);\r\n\t\t\t\titemCount = Math.max(itemCount, rating.itemId);\r\n\t\t\t}\r\n\t\t\treader.close();\r\n\t\t\tuserCount ++;\r\n\t\t\titemCount ++;\r\n\t\t\tassert userCount == user_ratings.size();\r\n\t\t\t\r\n\t\t\t// Step 2. Sort the ratings of each user by time (small->large).\r\n\t\t\tComparator<Rating> c = new Comparator<Rating>() {\r\n\t\t\t\tpublic int compare(Rating o1, Rating o2) {\r\n\t\t\t\t\tif (o1.timestamp - o2.timestamp > 0)\treturn 1;\r\n\t\t\t\t\telse if (o1.timestamp - o2.timestamp < 0)\treturn -1;\r\n\t\t\t\t\telse return 0;\r\n\t\t\t\t}\r\n\t\t\t};\r\n\t\t\tfor (int u = 0;  u < userCount; u ++) {\r\n\t\t\t\tCollections.sort(user_ratings.get(u), c);\r\n\t\t\t}\r\n\t\t\tSystem.out.printf(\"[%s]\\n\", Printer.printTime(\r\n\t\t\t\t\tSystem.currentTimeMillis() - startTime));\r\n\t\t\t\r\n\t\t\t// Step 3. Generated splitted matrices (implicit 0/1 settings). 
\r\n\t\t\tSystem.out.printf(\"Generate rating matrices.\");\r\n\t\t\tstartTime = System.currentTimeMillis();\r\n\t\t\ttrainMatrix = new SparseMatrix(userCount, itemCount);\r\n\t\t\ttestRatings = new ArrayList<Rating>();\r\n\t\t\tfor (int u = 0; u < userCount; u ++) {\r\n\t\t\t\tArrayList<Rating> ratings = user_ratings.get(u);\r\n\t\t\t\tfor (int i = ratings.size() - 1; i >= 0; i --) {\r\n\t\t\t\t\tint userId = ratings.get(i).userId;\r\n\t\t\t\t\tint itemId = ratings.get(i).itemId;\r\n\t\t\t\t\tif (i == ratings.size() - 1) { // test\r\n\t\t\t\t\t\ttestRatings.add(ratings.get(i));\r\n\t\t\t\t\t} else { // train\r\n\t\t\t\t\t\ttrainMatrix.setValue(userId, itemId, 1);\r\n\t\t\t\t\t} \r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tSystem.out.printf(\"[%s]\\n\", Printer.printTime(\r\n\t\t\t\t\tSystem.currentTimeMillis() - startTime));\r\n\t\t\t\r\n\t\t\t// Print some basic statistics of the dataset.\r\n\t\t\tSystem.out.println (\"Data\\t\" + ratingFile);\r\n\t\t\tSystem.out.println (\"#Users\\t\" + userCount);\r\n\t\t\tSystem.out.println (\"#Items\\t\" + itemCount);\r\n\t\t\tSystem.out.printf(\"#Ratings\\t %d (train), %d(test)\\n\", \r\n\t\t\t\t\ttrainMatrix.itemCount(), testRatings.size());\r\n\t\t}\r\n\t\r\n\t/**\r\n\t * Generate a smaller dataset. 
\r\n\t * @param threshold\r\n\t * @throws IOException \r\n\t */\r\n\tpublic static void FilterRatingsWithThreshold(String ratingFile, \r\n\t\t\tint userThreshold, int itemThreshold) throws IOException {\r\n\t\tArrayList<ArrayList<Rating>> user_ratings = new ArrayList<ArrayList<Rating>>();\r\n\t\tSystem.out.println(\"Filter dataset with #user/item >= \" + itemThreshold + \r\n\t\t\t\t\" and #item/user >= \" + userThreshold);\r\n\t\t\r\n\t\t// Read user ratings.\r\n\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\tnew InputStreamReader(new FileInputStream(ratingFile)));\r\n\t\tHashMap<Integer, Integer> map_item_count = new HashMap<Integer, Integer>();\r\n\t\tString line;\r\n\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\tRating rating = new Rating(line);\r\n\t\t\tif (user_ratings.size() - 1 < rating.userId) { // create a new user\r\n\t\t\t\tuser_ratings.add(new ArrayList<Rating>());\r\n\t\t\t}\r\n\t\t\tuser_ratings.get(rating.userId).add(rating);\r\n\t\t\tif (!map_item_count.containsKey(rating.itemId)) {\r\n\t\t\t\tmap_item_count.put(rating.itemId, 0);\r\n\t\t\t}\r\n\t\t\tmap_item_count.put(rating.itemId, map_item_count.get(rating.itemId) + 1);\r\n\t\t}\r\n\t\treader.close();\r\n\t\t\r\n\t\t// User filtering & item filtering\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(\r\n\t\t\t\tratingFile + \"_i\" + itemThreshold + \"_u\" + userThreshold));\r\n\t\tHashMap<String, Integer> map_user_id = new HashMap<String, Integer>();\r\n\t\tHashMap<String, Integer> map_item_id = new HashMap<String, Integer>();\r\n\t\tint count = 0;\r\n\t\t\r\n\t\tfor (int u = 0; u < user_ratings.size(); u ++) {\r\n\t\t\tArrayList<Rating> ratings = user_ratings.get(u);\r\n\t\t\tint count_u = 0;\r\n\t\t\tfor (Rating rating : ratings) {\r\n\t\t\t\t// item filtering\r\n\t\t\t\tif (map_item_count.get(rating.itemId) < itemThreshold)\tcontinue; \r\n\t\t\t\tcount_u ++;\r\n\t\t\t}\r\n\t\t\t// user filtering\r\n\t\t\tif (count_u < userThreshold)\tcontinue;  \r\n\t\t\t// 
write to files\r\n\t\t\tfor (Rating rating: ratings) {\r\n\t\t\t\tif (map_item_count.get(rating.itemId) < itemThreshold)\tcontinue;\r\n\t\t\t\t// Old item id and user id\r\n\t\t\t\tString item = \"\" + rating.itemId;\r\n\t\t\t\tString user = \"\" + rating.userId;\r\n\t\t\t\tif (!map_item_id.containsKey(item))\t{\r\n\t\t\t\t\tmap_item_id.put(item, map_item_id.size());\r\n\t\t\t\t}\r\n\t\t\t\tif (!map_user_id.containsKey(user)) {\r\n\t\t\t\t\tmap_user_id.put(user, map_user_id.size());\r\n\t\t\t\t}\r\n\t\t\t\t// New item id and user id\r\n\t\t\t\tint userId = map_user_id.get(user);\r\n\t\t\t\tint itemId = map_item_id.get(item);\r\n\t\t\t\twriter.println(userId + \"\\t\" + itemId + \"\\t\" + rating.score + \"\\t\" + rating.timestamp);\r\n\t\t\t\tcount ++;\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.printf(\"After filtering: #user:%d, #item:%d, #rating:%d \\n\", \r\n\t\t\t\tmap_user_id.size(), map_item_id.size(), count);\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t// Get some statistics about the dataset, e.g. 
user distribution on items\r\n\tpublic static void DatasetStatistics(String ratingFile) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\tnew InputStreamReader(new FileInputStream(ratingFile)));\r\n\t\t\r\n\t\t// Read user ratings\r\n\t\tint ratingCount = 0;\r\n\t\tArrayList<ArrayList<Rating>> user_ratings = new ArrayList<ArrayList<Rating>>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tRating rating = new Rating(line);\r\n\t\t\tratingCount ++;\r\n\t\t\tif (user_ratings.size() - 1 < rating.userId) { // create a new user\r\n\t\t\t\tuser_ratings.add(new ArrayList<Rating>());\r\n\t\t\t}\r\n\t\t\tuser_ratings.get(rating.userId).add(rating);\r\n\t\t}\r\n\t\tSystem.out.println(\"#Ratings in total: \" + ratingCount);\r\n\t\t\r\n\t\t// user distribution on items\r\n\t\tHashMap<Integer, Integer> map_count_users = new HashMap<Integer, Integer>();\r\n\t\tfor (ArrayList<Rating> ratings : user_ratings) {\r\n\t\t\tint count = ratings.size();\r\n\t\t\tif (!map_count_users.containsKey(ratings.size())) {\r\n\t\t\t\tmap_count_users.put(count, 0);\r\n\t\t\t}\r\n\t\t\tmap_count_users.put(count, map_count_users.get(count) + 1);\r\n\t\t}\r\n\t\tList<Integer> sortedKeys=new ArrayList<Integer>(map_count_users.keySet());\r\n\t\tCollections.sort(sortedKeys);\r\n\t\tSystem.out.println(\"#rating\\t#users (percentage)\");\r\n\t\tfor (int count : sortedKeys) {\r\n\t\t\tint users = map_count_users.get(count);\r\n\t\t\tSystem.out.printf(\"%d\\t %d (%.2f%%)\\n\", count, users, \r\n\t\t\t\t\t(double)users / user_ratings.size() * 100 );\r\n\t\t}\r\n\t\treader.close();\r\n\t\t\r\n\t\t// Read item ratings\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(ratingFile)));\r\n\t\tArrayList<ArrayList<Rating>> item_ratings = new ArrayList<ArrayList<Rating>>();\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tRating rating = new Rating(line);\r\n\t\t\tif (item_ratings.size() - 1 < rating.itemId) { // 
create a new item\r\n\t\t\t\titem_ratings.add(new ArrayList<Rating>());\r\n\t\t\t}\r\n\t\t\titem_ratings.get(rating.itemId).add(rating);\r\n\t\t}\r\n\t\t\r\n\t\t// item distribution on users\r\n\t\tHashMap<Integer, Integer> map_count_items = new HashMap<Integer, Integer>();\r\n\t\tfor (ArrayList<Rating> ratings : item_ratings) {\r\n\t\t\tint count = ratings.size();\r\n\t\t\tif (!map_count_items.containsKey(ratings.size())) {\r\n\t\t\t\tmap_count_items.put(count, 0);\r\n\t\t\t}\r\n\t\t\tmap_count_items.put(count, map_count_items.get(count) + 1);\r\n\t\t}\r\n\t\tsortedKeys=new ArrayList<Integer>(map_count_items.keySet());\r\n\t\tCollections.sort(sortedKeys);\r\n\t\tSystem.out.println(\"#rating\\t#items (percentage)\");\r\n\t\tfor (int count : sortedKeys) {\r\n\t\t\tint items = map_count_items.get(count);\r\n\t\t\tSystem.out.printf(\"%d\\t %d (%.2f%%)\\n\", count, items, \r\n\t\t\t\t\t(double)items / item_ratings.size() * 100 );\r\n\t\t}\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t// Convert the movie-len-10M input(.dat) file to rating file.\r\n\tpublic static void convertMLDatToRating(String ml_file) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\tnew InputStreamReader(new FileInputStream(ml_file)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(ml_file + \".rating\"));\r\n\t\t\r\n\t\tint ratingCount = 0;\r\n\t\tString splitter = \"::\";\r\n\t\tHashMap<String, Integer> map_item_id = new HashMap<String, Integer>(); // id starts from 0\r\n\t\tHashMap<String, Integer> map_user_id = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString[] arr = line.split(splitter);\r\n\t\t\tif (!map_user_id.containsKey(arr[0]))\r\n\t\t\t\tmap_user_id.put(arr[0], map_user_id.size());\r\n\t\t\tif (!map_item_id.containsKey(arr[1]))\r\n\t\t\t\tmap_item_id.put(arr[1], map_item_id.size());\r\n\t\t\t\r\n\t\t\tint userId = map_user_id.get(arr[0]);\r\n\t\t\tint itemId = 
map_item_id.get(arr[1]);\r\n\t\t\twriter.println(userId + \"\\t\" + itemId + \"\\t\" + arr[2] + \"\\t\" + arr[3]);\r\n\t\t\tratingCount ++;\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.println(\"Converted \" + ml_file + \" to .rating file\");\r\n\t\tSystem.out.printf(\"#rating:%d, #user:%d, #item:%d \\n\", \r\n\t\t\t\tratingCount, map_user_id.size(), map_item_id.size());\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\r\n\t// Convert the amazon review dataset (.vote) file to rating file.\r\n\tpublic static void convertVoteToRating(String vote_file) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\tnew InputStreamReader(new FileInputStream(vote_file)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(vote_file + \".rating\"));\r\n\t\t\r\n\t\tint ratingCount = 0;\r\n\t\tString splitter = \" \";\r\n\t\tHashMap<String, Integer> map_item_id = new HashMap<String, Integer>(); // id starts from 0\r\n\t\tHashMap<String, Integer> map_user_id = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString[] arr = line.split(splitter);\r\n\t\t\tif (!map_user_id.containsKey(arr[0]))\r\n\t\t\t\tmap_user_id.put(arr[0], map_user_id.size());\r\n\t\t\tif (!map_item_id.containsKey(arr[1]))\r\n\t\t\t\tmap_item_id.put(arr[1], map_item_id.size());\r\n\t\t\t\r\n\t\t\tint userId = map_user_id.get(arr[0]);\r\n\t\t\tint itemId = map_item_id.get(arr[1]);\r\n\t\t\twriter.println(userId + \"\\t\" + itemId + \"\\t\" + arr[2] + \"\\t\" + arr[3]);\r\n\t\t\tratingCount ++;\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.println(\"Converted \" + vote_file + \" to .rating file\");\r\n\t\tSystem.out.printf(\"#rating:%d, #user:%d, #item:%d \\n\", \r\n\t\t\t\tratingCount, map_user_id.size(), map_item_id.size());\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t// Deduplicate the rating file by averaging the ratings for a (u,i) pair\r\n\t// Note: after deduplication, timestamp is 
removed.\r\n\tpublic static void deduplicate(String ratingFile) throws IOException {\r\n\t\t// Read user ratings.\r\n\t\tBufferedReader reader = new BufferedReader(\r\n\t\t\t\tnew InputStreamReader(new FileInputStream(ratingFile)));\r\n\t\tint ratingCount = 0;\r\n\t\tArrayList<ArrayList<Rating>> user_ratings = new ArrayList<ArrayList<Rating>>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tRating rating = new Rating(line);\r\n\t\t\tratingCount ++;\r\n\t\t\tif (user_ratings.size() - 1 < rating.userId) { // create a new user\r\n\t\t\t\tuser_ratings.add(new ArrayList<Rating>());\r\n\t\t\t}\r\n\t\t\tuser_ratings.get(rating.userId).add(rating);\r\n\t\t}\r\n\t\tSystem.out.println(\"#Ratings in total: \" + ratingCount);\r\n\t\treader.close();\r\n\t\t\r\n\t\t// Deduplicate and Writing to file\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(ratingFile + \".deduplicate\"));\r\n\t\tratingCount = 0;\r\n\t\tfor (int u = 0; u < user_ratings.size(); u ++) {\r\n\t\t\tArrayList<Rating> ratings = user_ratings.get(u);\r\n\t\t\tHashMap<Integer, Double> map_item_score = new HashMap<Integer, Double>();\r\n\t\t\tHashMap<Integer, Integer> map_item_count = new HashMap<Integer, Integer>();\r\n\t\t\tfor (Rating rating: ratings) {\r\n\t\t\t\tif (!map_item_score.containsKey(rating.itemId))\t{\r\n\t\t\t\t\tmap_item_score.put(rating.itemId, 0.0);\r\n\t\t\t\t\tmap_item_count.put(rating.itemId, 0);\r\n\t\t\t\t}\r\n\t\t\t\tmap_item_score.put(rating.itemId, map_item_score.get(rating.itemId) + rating.score);\r\n\t\t\t\tmap_item_count.put(rating.itemId, map_item_count.get(rating.itemId) + 1);\r\n\t\t\t}\r\n\t\t\tfor (int i : map_item_score.keySet()) {\r\n\t\t\t\tdouble score = map_item_score.get(i) / map_item_count.get(i);\r\n\t\t\t\twriter.printf(\"%d\\t%d\\t%.1f\\n\", u+1, i+1, score);\r\n\t\t\t\tratingCount ++;\r\n\t\t\t}\r\n\t\t}\r\n\t\twriter.close();\r\n\t\tSystem.out.println(\"#After dedepulicate, #ratings: \" + 
ratingCount);\r\n\t}\r\n\t\r\n\tpublic static void main(String[] args) throws IOException {\r\n\t\tString dataset =\"hanwang-data/amazon_books_filter.rating\";\r\n\t\tdeduplicate(dataset);\r\n\t\t\r\n\t\t//String dataset = \"data/yelp.rating\";\r\n\t\t//ReadRatings_HoldOneOut(\"data/yelp.rating\");\r\n\t\t\r\n\t\t//FilterRatingsWithThreshold(dataset, 10, 10);\r\n\t\t//DatasetStatistics(dataset);\r\n\t\t\r\n\t\t//convertVoteToRating(dataset);\r\n\t\t//FilterRatingsWithThreshold(dataset, 10, 10);\r\n\t}\r\n\t\r\n\t// Evaluate the model\r\n\tpublic static double[] evaluate_model(TopKRecommender model, String name) {\r\n\t\tlong start = System.currentTimeMillis();\r\n\t\tmodel.buildModel();\r\n\t\tmodel.evaluate(testRatings);\r\n\t\t\r\n\t\tdouble[] res = new double[3];\r\n\t\tres[0] = model.hits.mean();\r\n\t\tres[1] = model.ndcgs.mean();\r\n\t\tres[2] = model.precs.mean();\r\n\t\tSystem.out.printf(\"%s\\t <hr, ndcg, prec>:\\t %.4f\\t %.4f\\t %.4f [%s]\\n\", \r\n\t\t\t\tname, res[0], res[1], res[2],\r\n\t\t\t\tPrinter.printTime(System.currentTimeMillis() - start));\r\n\t\treturn res;\r\n\t}\r\n\t\r\n\t// Evaluate the model by online protocol\r\n\tpublic static void evaluate_model_online(TopKRecommender model, String name, int interval) {\r\n\t\tlong start = System.currentTimeMillis();\r\n\t\tmodel.evaluateOnline(testRatings, interval);\r\n\t\tSystem.out.printf(\"%s\\t <hr, ndcg, prec>:\\t %.4f\\t %.4f\\t %.4f [%s]\\n\", \r\n\t\t\t\tname, model.hits.mean(), model.ndcgs.mean(), model.precs.mean(),\r\n\t\t\t\tPrinter.printTime(System.currentTimeMillis() - start));\r\n\t}\r\n}\r\n\r\nclass ModelThread extends Thread {\r\n\tTopKRecommender model;\r\n\t\r\n\tpublic ModelThread(TopKRecommender model) {\r\n\t\tthis.model = model;\r\n\t}\r\n\t\r\n\tpublic void run() {\r\n\t\tmodel.runOneIteration();\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/main/main_MF.java",
    "content": "package main;\r\n\r\nimport java.io.IOException;\r\n\r\nimport data_structure.DenseMatrix;\r\nimport utils.Printer;\r\nimport algorithms.MF_fastALS;\r\nimport algorithms.MF_ALS;\r\nimport algorithms.MF_CD;\r\nimport algorithms.ItemPopularity;\r\n\r\npublic class main_MF extends main {\r\n\tpublic static void main(String argv[]) throws IOException {\r\n\t\tString dataset_name = \"yelp\";\r\n\t\tString method = \"FastALS\";\r\n\t\tdouble w0 = 10;\r\n\t\tboolean showProgress = false;\r\n\t\tboolean showLoss = true;\r\n\t\tint factors = 64;\r\n\t\tint maxIter = 500;\r\n\t\tdouble reg = 0.01;\r\n\t\tdouble alpha = 0.75;\r\n\t\t\r\n\t\tif (argv.length > 0) {\r\n\t\t\tdataset_name = argv[0];\r\n\t\t\tmethod = argv[1];\r\n\t\t\tw0 = Double.parseDouble(argv[2]);\r\n\t\t\tshowProgress = Boolean.parseBoolean(argv[3]);\r\n\t\t\tshowLoss = Boolean.parseBoolean(argv[4]);\r\n\t\t\tfactors = Integer.parseInt(argv[5]);\r\n\t\t\tmaxIter = Integer.parseInt(argv[6]);\r\n\t\t\treg = Double.parseDouble(argv[7]);\r\n\t\t\tif (argv.length > 8) alpha = Double.parseDouble(argv[8]);\r\n\t\t}\r\n\t\t//ReadRatings_GlobalSplit(\"data/\" + dataset_name + \".rating\", 0.1);\r\n\t\tReadRatings_HoldOneOut(\"data/\" + dataset_name + \".rating\");\r\n\t\t\r\n\t\tSystem.out.printf(\"%s: showProgress=%s, factors=%d, maxIter=%d, reg=%f, w0=%.2f, alpha=%.2f\\n\",\r\n\t\t\t\tmethod, showProgress, factors, maxIter, reg, w0, alpha);\r\n\t\tSystem.out.println(\"====================================================\");\r\n\t\t\r\n\t\tItemPopularity popularity = new ItemPopularity(trainMatrix, testRatings, topK, threadNum);\r\n\t\tevaluate_model(popularity, \"Popularity\");\r\n\t\t\r\n\t\tdouble init_mean = 0;\r\n\t\tdouble init_stdev = 0.01;\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"fastals\")) {\r\n\t\t\tMF_fastALS fals = new MF_fastALS(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, alpha, reg, init_mean, init_stdev, showProgress, 
showLoss);\r\n\t\t\tevaluate_model(fals, \"MF_fastALS\");\r\n\t\t}\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"als\")) {\r\n\t\t\tMF_ALS als = new MF_ALS(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tevaluate_model(als, \"MF_ALS\");\r\n\t\t}\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"cd\")) {\r\n\t\t\tMF_CD cd = new MF_CD(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tevaluate_model(cd, \"MF_CD\");\r\n\t\t}\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"all\")) {\r\n\t\t\tDenseMatrix U = new DenseMatrix(userCount, factors);\r\n\t\t\tDenseMatrix V = new DenseMatrix(itemCount, factors);\r\n\t\t\tU.init(init_mean, init_stdev);\r\n\t\t\tV.init(init_mean, init_stdev);\r\n\t\t\t\r\n\t\t\tMF_fastALS fals = new MF_fastALS(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, alpha, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tfals.setUV(U, V);\r\n\t\t\tevaluate_model(fals, \"MF_fastALS\");\r\n\t\t\t\r\n\t\t\tMF_ALS als = new MF_ALS(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tals.setUV(U, V);\r\n\t\t\tevaluate_model(als, \"MF_ALS\");\r\n\t\t\t\r\n\t\t\tMF_CD cd = new MF_CD(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tcd.setUV(U, V);\r\n\t\t\tevaluate_model(cd, \"MF_CD\");\r\n\t\t}\r\n\t\r\n\t} // end main\r\n}\r\n"
  },
  {
    "path": "src/main/main_bpr.java",
    "content": "package main;\n\nimport java.io.IOException;\n\nimport utils.Printer;\nimport algorithms.MFbpr;\nimport algorithms.ItemPopularity;\nimport algorithms.TopKRecommender;\nimport data_structure.Rating;\n\nimport java.util.ArrayList;\n\npublic class main_bpr extends main {\n\tpublic static void main(String argv[]) throws IOException {\n\t\tString dataset_name = \"yelp\";\n\t\tint factors = 16;\n\t\tdouble lr = 0.01;\n\t\tdouble reg = 0.01;\n\t\tint num_dns = 1; // number of dynamic negative samples [Zhang Weinan et al. SIGIR 2013]\n\t\tint maxIter = 1000;\n\t\tdouble init_mean = 0;\n\t\tdouble init_stdev = 0.01;\n\t\t\n\t\tif (argv.length > 0) {\n\t\t\tdataset_name = argv[0];\n\t\t\tfactors = Integer.parseInt(argv[1]);\n\t\t\tlr = Double.parseDouble(argv[2]);\n\t\t\treg = Double.parseDouble(argv[3]);\n\t\t}\n\t\tReadRatings_HoldOneOut(\"data/\" + dataset_name + \".rating\");\n\t\ttopK = 100;\n\t\t\n\t\tSystem.out.printf(\"BPR with factors=%d, lr=%.4f, reg=%.4f, num_dns=%d\\n\", \n\t\t\t\tfactors, lr, reg, num_dns);\n\t\tSystem.out.println(\"====================================================\");\n\t\t\n\t\tItemPopularity pop = new ItemPopularity(trainMatrix, testRatings, topK, threadNum);\n\t\tevaluate_model(pop, \"Popularity\");\n\t\t\n\t\tMFbpr bpr = new MFbpr(trainMatrix, testRatings, topK, threadNum, \n\t\t\t\tfactors, maxIter, lr, false, reg, init_mean, init_stdev, num_dns, true);\n\t\tevaluate_model(bpr, \"BPR\");\n\t\t\n\t} // end main\n}\n"
  },
  {
    "path": "src/main/main_online.java",
    "content": "package main;\r\n\r\nimport java.io.IOException;\r\n\r\nimport data_structure.DenseMatrix;\r\nimport utils.Printer;\r\nimport algorithms.MF_fastALS;\r\nimport algorithms.MF_ALS;\r\nimport algorithms.MF_CD;\r\nimport algorithms.ItemPopularity;\r\nimport algorithms.MFbpr;\r\n\r\npublic class main_online extends main {\r\n\tpublic static void main(String argv[]) throws IOException {\r\n\t\tString dataset_name = \"yelp\";\r\n\t\tString method = \"FastALS\";\r\n\t\tint interval = 1000;\r\n\t\tdouble w0 = 512;\r\n\t\tint factors = 64;\r\n\t\tint maxIter = 50;\r\n\t\tint maxIterOnline = 1;\r\n\t\tdouble alpha = 0.4;\r\n\t\tString onlineMode = \"ui\";\r\n\t\tdouble w_new = 1;\r\n\t\t\r\n\t\tif (argv.length > 0) {\r\n\t\t\tdataset_name = argv[0];\r\n\t\t\tmethod = argv[1];\r\n\t\t\tinterval = Integer.parseInt(argv[2]);\r\n\t\t\tw0 = Double.parseDouble(argv[3]);\r\n\t\t\tfactors = Integer.parseInt(argv[4]);\r\n\t\t\tmaxIter = Integer.parseInt(argv[5]);\r\n\t\t\tmaxIterOnline = Integer.parseInt(argv[6]);\r\n\t\t\talpha = Double.parseDouble(argv[7]);\r\n\t\t\tif (argv.length >= 9)\tonlineMode = argv[8];\r\n\t\t\tif (argv.length >= 10) w_new = Double.parseDouble(argv[9]);\r\n\t\t}\r\n\t\tReadRatings_GlobalSplit(\"data/\" + dataset_name + \".rating\", 0.1);\r\n\t\t\r\n\t\tSystem.out.printf(\"Online evaluation for %s: factors=%d, maxIter=%d, maxInterOnline=%d, interval=%d, onlineMode(bpr only)=%s\\n\",\r\n\t\t\t\tmethod, factors, maxIter, maxIterOnline, interval, onlineMode);\r\n\t\tSystem.out.println(\"====================================================\");\r\n\t\t\r\n\t\tItemPopularity popularity = new ItemPopularity(trainMatrix, testRatings, topK, threadNum);\r\n\t\tevaluate_model_online(popularity, \"Popularity\", interval);\r\n\t\t\r\n\t\tdouble init_mean = 0;\r\n\t\tdouble init_stdev = 0.01;\r\n\t\tdouble reg = 0.01;\r\n\t\tboolean showProgress = false;\r\n\t\tboolean showLoss = false;\r\n\t\t\r\n\t\t// Remove ALS is not suitable for online 
learning.\r\n\t\tif (method.equalsIgnoreCase(\"als\")) {\r\n\t\t\tMF_ALS als = new MF_ALS(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tals.buildModel();\r\n\t\t\tals.maxIterOnline = maxIterOnline;\r\n\t\t\tevaluate_model_online(als, \"MF_ALS\", interval);\r\n\t\t}\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"fastals\")) {\r\n\t\t\tMF_fastALS fals = new MF_fastALS(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, alpha, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tfals.w_new = w_new;\r\n\t\t\tfals.buildModel();\r\n\t\t\tfals.maxIterOnline = maxIterOnline;\r\n\t\t\tevaluate_model_online(fals, \"MF_fastALS\", interval);\r\n\t\t}\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"cd\")) {\r\n\t\t\tMF_CD cd = new MF_CD(trainMatrix, testRatings, topK, threadNum,\r\n\t\t\t\t\tfactors, maxIter, w0, reg, init_mean, init_stdev, showProgress, showLoss);\r\n\t\t\tcd.w_new = w_new;\r\n\t\t\tcd.buildModel();\r\n\t\t\tcd.maxIterOnline = maxIterOnline;\r\n\t\t\tevaluate_model_online(cd, \"MF_CD\", interval);\r\n\t\t}\r\n\t\t\r\n\t\tif (method.equalsIgnoreCase(\"bpr\")) {\r\n\t\t\tMFbpr bpr = new MFbpr(trainMatrix, testRatings, topK, threadNum, \r\n\t\t\t\t\tfactors, maxIter, 0.01, false, reg, init_mean, init_stdev, 1, showProgress);\r\n\t\t\tbpr.onlineMode = onlineMode;\r\n\t\t\tbpr.buildModel();\r\n\t\t\tbpr.maxIterOnline = maxIterOnline;\r\n\t\t\tevaluate_model_online(bpr, \"BPR\", interval);\r\n\t\t}\r\n\t\r\n\t} // end main\r\n}\r\n"
  },
  {
    "path": "src/utils/CommonUtils.java",
    "content": "package utils;\r\n\r\nimport java.io.IOException;\r\nimport java.util.ArrayList;\r\nimport java.util.Collections;\r\nimport java.util.Comparator;\r\nimport java.util.HashSet;\r\nimport java.util.List;\r\nimport java.util.Map;\r\nimport java.util.PriorityQueue;\r\nimport java.util.HashMap;\r\nimport java.util.Random;\r\npublic class CommonUtils {\r\n\t\r\n\t/**\r\n\t * Sort the HashMap<K, V> by its values, from Large->Small.\r\n\t * @return List<Map.Entry<K, V>> with sorted entries.\r\n\t */\r\n\tpublic static<K, V extends Comparable<? super V>> List<Map.Entry<K, V>> SortMapByValue(Map<K, V> map) {\r\n\t\tList<Map.Entry<K, V>> infoIds = new ArrayList<Map.Entry<K, V>>(map.entrySet()); \r\n\t\tComparator<Map.Entry<K, V>> c = new Comparator<Map.Entry<K, V>>() { \r\n\t\t\tpublic int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) { \r\n\t\t\t\treturn o2.getValue().compareTo(o1.getValue());\r\n\t\t\t}};\r\n\t\tCollections.sort(infoIds, c); \r\n\t\treturn infoIds;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Get the topK keys (by its value) of a map. Does not consider the keys which are in ignoreKeys.\r\n\t * @param map\r\n\t * @return\r\n\t */\r\n\tpublic static<K, V extends Comparable<? 
super V>> ArrayList<K> TopKeysByValue(Map<K, V> map, \r\n\t\t\tint topK, ArrayList<K> ignoreKeys) {\r\n\t\tHashSet<K> ignoreSet;\r\n\t\tif (ignoreKeys == null) {\r\n\t\t\tignoreSet = new HashSet<K>();\r\n\t\t} else {\r\n\t\t\tignoreSet = new HashSet<K> (ignoreKeys);\r\n\t\t}\r\n\t\t\r\n\t\tTopKPriorityQueue<K, V> topQueue = new TopKPriorityQueue<K, V>(topK);\r\n\t\tfor (Map.Entry<K, V> entry : map.entrySet()) {\r\n\t\t\tif (!ignoreSet.contains(entry.getKey())) {\r\n\t\t\t\ttopQueue.add(entry);\r\n\t\t\t}\r\n\t\t}\r\n\t\tArrayList<K> topKeys = new ArrayList<K>();\r\n\t\tfor (Map.Entry<K, V> entry : topQueue.sortedList()) {\r\n\t\t\ttopKeys.add(entry.getKey());\r\n\t\t}\r\n\t\treturn topKeys;\r\n\t\t/*\r\n\t\t//Another implementation that first sorting.\r\n\t\tList<Map.Entry<K, V>> topEntities = SortMapByValue(map);\r\n\t\tArrayList<K> topKeys = new ArrayList<K>();\r\n\t\tfor (Map.Entry<K, V> entity : topEntities) {\r\n\t\t\tif (topKeys.size() >= topK)\tbreak;\r\n\t\t\tif (!ignoreSet.contains(entity.getKey())) {\r\n\t\t\t\ttopKeys.add(entity.getKey());\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn topKeys; */\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert an int[] to ArrayList<Integer>\r\n\t */\r\n\tpublic static ArrayList<Integer> ArrayToArraylist(int[] array) {\r\n\t\tif (array == null) {\r\n\t\t\treturn new ArrayList<Integer>();\r\n\t\t}\r\n\t\tArrayList<Integer> list = new ArrayList<Integer>(array.length);\r\n\t\tfor (int val : array) {\r\n\t\t\tlist.add(val);\r\n\t\t}\r\n\t\treturn list;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Count number of matches of findStr in str.\r\n\t * @param str\r\n\t * @param findStr\r\n\t * @return\r\n\t */\r\n\tpublic static int CountMatchesInString(String str, String findStr) {\r\n\t\tint lastIndex = 0;\r\n\t\tint count = 0;\r\n\t\twhile(lastIndex != -1) {\r\n\t       lastIndex = str.indexOf(findStr,lastIndex);\r\n\r\n\t       if( lastIndex != -1) {\r\n\t             count ++;\r\n\t             lastIndex+=findStr.length();\r\n\t       }\r\n\t\t}\r\n\t\treturn 
count;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert a string to k-gram set.\r\n\t * @param str\r\n\t * @param k Number of consecutive words in each gram.\r\n\t */\r\n\tpublic static ArrayList<String> StringToGramSet(String str, int k) {\r\n\t\tArrayList<String> grams = new ArrayList<String>();\r\n\t\tString[] words = str.split(\" \");\r\n\t\tfor(int i = 0; i <= words.length-k; i ++) {\r\n\t\t\tString gram = words[i];\r\n\t\t\tfor (int j = 1; j < k; j++) {\r\n\t\t\t\tgram += \" \" + words[i+j];\r\n\t\t\t}\r\n\t\t\tgrams.add(gram.trim());\r\n\t\t}\r\n\t\treturn grams;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Randomly shuffle an int array.\r\n\t * @param array\r\n\t */\r\n\tpublic static void ShuffleArray(int[] array)\r\n\t{\r\n\t    int index, temp;\r\n\t    Random random = new Random();\r\n\t    for (int i = array.length - 1; i > 0; i--)\r\n\t    {\r\n\t        index = random.nextInt(i + 1);\r\n\t        temp = array[index];\r\n\t        array[index] = array[i];\r\n\t        array[i] = temp;\r\n\t    }\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/utils/DatasetUtil.java",
    "content": "package utils;\r\n\r\nimport java.io.BufferedReader;\r\nimport java.io.FileInputStream;\r\nimport java.io.FileNotFoundException;\r\nimport java.io.FileOutputStream;\r\nimport java.io.IOException;\r\nimport java.io.InputStreamReader;\r\nimport java.io.PrintWriter;\r\nimport java.util.ArrayList;\r\nimport java.util.Arrays;\r\nimport java.util.Collections;\r\nimport java.util.Comparator;\r\nimport java.util.HashMap;\r\nimport java.util.HashSet;\r\nimport java.util.LinkedList;\r\nimport java.util.List;\r\nimport java.util.Map;\r\nimport java.util.Map.Entry;\r\nimport java.util.Set;\r\n\r\nimport org.json.simple.JSONObject;\r\nimport org.json.simple.parser.ParseException;\r\nimport org.json.simple.parser.JSONParser;\r\n\r\nimport java.text.DateFormat;\r\nimport java.text.SimpleDateFormat;\r\n\r\nimport data_structure.SparseMatrix;\r\nimport utils.StopwordsFilter;\r\nimport data_structure.SparseVector;\r\n/**\r\n * Represent each review.\r\n * @author HeXiangnan\r\n *\r\n */\r\nclass Vote {\r\n\tpublic String user;\r\n\tpublic String item;\r\n\tpublic double rating;\r\n\tpublic Integer time;\r\n\tpublic int wordCount;\r\n\tpublic String review;\r\n\t\r\n\tpublic Vote(String user, String item, double rating, int time, int wordCount, String review) {\r\n\t\tthis.user = user;\r\n\t\tthis.item = item;\r\n\t\tthis.rating = rating;\r\n\t\tthis.time = time;\r\n\t\tthis.wordCount = wordCount;\r\n\t\tthis.review = review;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Sort votes by the review time, small (old) -> large (recent)\r\n\t * @param votes\r\n\t * @return\r\n\t */\r\n\tpublic static void sortByTime(ArrayList<Vote> votes) {\r\n\t\tComparator<Vote> comparator = new Comparator<Vote> () {\r\n\t\t\tpublic int compare(Vote vote0, Vote vote1) {\r\n\t\t\t\treturn vote0.time.compareTo(vote1.time);\r\n\t\t\t}\r\n\t\t};\r\n\t\tCollections.sort(votes, comparator);\r\n\t}\r\n\t\r\n\t@Override\r\n\tpublic String toString() {\r\n\t\tString line = String.format(\"%s %s %.1f %d %d %s\", 
user, item, rating, time, wordCount, review);\r\n\t\treturn line;\r\n\t}\r\n}\r\n\r\npublic class DatasetUtil {\r\n\r\n\tprivate BufferedReader reader;\r\n\t\r\n\tpublic DatasetUtil() {\r\n\t}\r\n\t/*==============================================================================================\r\n\t * Process datasets, e.g. converting to .votes file, \r\n\t * splitting(train, test, validation) and filtering dataset.\r\n\t *==============================================================================================*/\r\n\t/**\r\n\t * Convert the original Amazon datasets into votes file (originally provided by HFT, Recsys'13 paper)\r\n\t * Input file format example:\r\n\t * \t\tamazon_datasets/arts.txt\r\n\t * Output file format:\r\n\t * \t  \tA list of quadruple of form (userID, itemID, rating, time), followed by #words of the review, \r\n\t *    \tfollowed by the words themselves (lower-cased).\r\n\t *    \tSee example of amazon_datasets/arts.votes\r\n\t * @param inputfileDir Directory of input dataset.\r\n\t * @param dataset Dataset name.\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void ConvertTxtToVotesFile(String inputfileDir, String dataset) \r\n\t\t\tthrows IOException {\r\n\t\tString inputfileName = inputfileDir + dataset + \".txt\";\r\n\t\tString outputfileName = inputfileDir + dataset + \".votes\";\r\n\t\tSystem.out.println(\"\\nConverting to .votes file: \" + inputfileName);\r\n\t\t\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfileName)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfileName));\r\n\t\t\r\n\t\tString line;\r\n\t\tString productId=\"\", userId=\"\", rating=\"\", time=\"\";\r\n\t\tint count = 0;\r\n\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\tif (line.contains(\":\")) {\r\n\t\t\t\tString[] segments = line.split(\":\");\r\n\t\t\t\tString linename = segments[0].trim();\r\n\t\t\t\tif (linename.equals(\"product/productId\")) {\r\n\t\t\t\t\tproductId = 
segments[1].trim();\r\n\t\t\t\t}\r\n\t\t\t\tif (linename.equals(\"review/userId\")) {\r\n\t\t\t\t\tuserId = segments[1].trim();\r\n\t\t\t\t}\r\n\t\t\t\tif (linename.equals(\"review/score\")) {\r\n\t\t\t\t\trating = segments[1].trim();\r\n\t\t\t\t}\r\n\t\t\t\tif (linename.equals(\"review/time\")) {\r\n\t\t\t\t\ttime = segments[1].trim();\r\n\t\t\t\t}\r\n\t\t\t\tif (linename.equals(\"review/text\")) {\r\n\t\t\t\t\tString review_text = segments[1].trim();\r\n\t\t\t\t\tString parse_review_text = \"\";\r\n\t\t\t\t\t/*String[] review_words = parseSentence(review_text);\r\n\t\t\t\t\tfor (String review_word : review_words) {\r\n\t\t\t\t\t\treview_word = review_word.toLowerCase();\r\n\t\t\t\t\t\tparse_review_text = parse_review_text + review_word + \" \";\r\n\t\t\t\t\t}*/\r\n\t\t\t\t\t// Output to the votes file.\r\n\t\t\t\t\twriter.println(userId + \" \" + productId + \" \" + rating + \" \" + \r\n\t\t\t\t\t\t\ttime + \" \" + review_text.split(\" \").length + \" \" + review_text);\r\n\t\t\t\t\tproductId = userId = rating = time = \"\";\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (count++ % 10000 == 0)\r\n\t\t\t\tSystem.out.print(\".\");\r\n\t\t}\r\n\t\t\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert the original Yelp Challenge datasets into votes file.\r\n\t * Input file format example:\r\n\t * \t\tyelp_datasets/yelp_reviews_220K.json\r\n\t * Output file format:\r\n\t * \t  \tA list of quadruple of form (userID, itemID, rating, time), followed by #words of the review, \r\n\t *    \tfollowed by the words themselves (lower-cased).\r\n\t *    \tSee example of amazon_datasets/arts.votes\r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @throws IOException \r\n\t * @throws ParseException \r\n\t * @throws java.text.ParseException \r\n\t */\r\n\tpublic void ConvertJsonToVotesFile(String inputfileDir, String dataset) throws IOException, ParseException, java.text.ParseException {\r\n\t\tString inputfileName = inputfileDir + dataset + 
\".json\";\r\n\t\tString outputfileName = inputfileDir + dataset + \".votes\";\r\n\t\tSystem.out.println(\"\\nConverting to .votes file: \" + inputfileName);\r\n\t\t\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfileName)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfileName));\r\n\t\t\r\n\t\tString line;\r\n\t\tJSONParser parser=new JSONParser();\r\n\t\tint count = 0;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tJSONObject obj = (JSONObject) parser.parse(line);\r\n\t\t\tString user_id = (String) obj.get(\"user_id\");\r\n\t\t\tString business_id = (String) obj.get(\"business_id\");\r\n\t\t\tString score = (Long) obj.get(\"stars\") + \".0\";\r\n\t\t\t// Parse time to unix time.\r\n\t\t\tString date = (String) obj.get(\"date\");\r\n\t\t\tString time = date.replace(\"-\", \"\") + \"0800\"; \r\n\t\t\tDateFormat dfm = new SimpleDateFormat(\"yyyyMMddHHmm\");\r\n\t\t\tLong unixtime = dfm.parse(time).getTime() / 1000;\r\n\t\t\tString review_text = (String) obj.get(\"text\");\r\n\t\t\treview_text = review_text.replace(\"|\", \" \").replace(\"\\n\", \" \");\r\n\t\t\t\r\n\t\t\t// Parse review words.\r\n\t\t\tString[] review_words = parseSentence((String) obj.get(\"text\"));\r\n\t\t\tString parse_review_text = \"\";\r\n\t\t\tfor (String review_word : review_words) {\r\n\t\t\t\tparse_review_text = parse_review_text + review_word.toLowerCase() + \" \";\r\n\t\t\t}\r\n\t\t\t// Output to the .votes file.\r\n\t\t\twriter.println(user_id + \" \" + business_id + \" \" + score + \" \" + \r\n\t\t\t\t\tunixtime + \" \" + review_words.length + \" \" + parse_review_text);\r\n\t\t\t//writer.println(user_id + \"|\" + business_id + \"|\" + score + \"|\" + \r\n\t\t\t\t//unixtime + \"|\" + review_text);\r\n\t\t\tif (count++ % 10000 == 0)\r\n\t\t\t\tSystem.out.print(\".\");\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.println(\"#reviews: \" + 
count);\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert the original Yelp Challenge datasets into .raw file for lexicon construction.\r\n\t * The .raw data is used by the tool thuir-sentires.jar.\r\n\t * The format is <DOC>review_text</DOC>\r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @throws IOException\r\n\t * @throws ParseException\r\n\t * @throws java.text.ParseException\r\n\t */\r\n\tpublic void ConvertJsonToRawFile(String inputfileDir, String dataset) throws IOException, ParseException, java.text.ParseException {\r\n\t\tString inputfileName = inputfileDir + dataset + \".json\";\r\n\t\tString outputfileName = inputfileDir + dataset + \".raw\";\r\n\t\tSystem.out.println(\"\\nConverting to .raw file: \" + inputfileName);\r\n\t\t\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfileName)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfileName));\r\n\t\t\r\n\t\tString line;\r\n\t\tJSONParser parser=new JSONParser();\r\n\t\tint count = 0;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tJSONObject obj = (JSONObject) parser.parse(line);\r\n\t\t\t// Parse review words.\r\n\t\t\tString review = (String) obj.get(\"text\");\r\n\t\t\t// Output to the .raw file.\r\n\t\t\twriter.println(\"<DOC>\");\r\n\t\t\twriter.println(review);\r\n\t\t\twriter.println(\"</DOC>\");\r\n\t\t\tif (count++ % 10000 == 0)\r\n\t\t\t\tSystem.out.print(\".\");\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.println(\"\\nGenerated .raw file\" + outputfileName);\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Format of .rating file:\r\n\t * Each line is: \tuser_id\\t item_id\\t ratingScore\r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @throws IOException\r\n\t */\r\n\tpublic void ConvertVotesToRatingFile(String inputfileDir, String dataset) throws IOException {\r\n\t\tString inputfileName = inputfileDir + dataset + \".votes\";\r\n\t\tString 
outputfileName = inputfileDir + dataset + \".rating\";\r\n\t\tSystem.out.println(\"\\nConverting .votes to .rating file: \" + inputfileName);\r\n\t\t\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfileName)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfileName));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString user = parseVotesLine(line).user;\r\n\t\t\tString item = parseVotesLine(line).item;\r\n\t\t\tdouble rating = parseVotesLine(line).rating;\r\n\t\t\t// Output to the .rating file.\r\n\t\t\twriter.printf(\"%s\\t%s\\t%f\\n\",user,item,rating);\r\n\t\t}\r\n\t\tSystem.out.println(\"Generated .rating file\" + outputfileName);\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\tpublic void ConvertVotesToRawFile(String inputfileDir, String dataset) throws IOException {\r\n\t\tString inputfileName = inputfileDir + dataset + \".votes\";\r\n\t\tString outputfileName = inputfileDir + dataset + \".raw\";\r\n\t\tSystem.out.println(\"\\nConverting .votes to .raw file: \" + inputfileName);\r\n\t\t\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfileName)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfileName));\r\n\t\tString line;\r\n\t\tint count = 0;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString review = parseVotesLine(line).review;\r\n\t\t\t// Output to the .raw file.\r\n\t\t\twriter.println(\"<DOC>\");\r\n\t\t\twriter.println(review);\r\n\t\t\twriter.println(\"</DOC>\");\r\n\t\t\tif (count++ % 10000 == 0)\r\n\t\t\t\tSystem.out.print(\".\");\r\n\t\t}\r\n\t\tSystem.out.println(\"\\nGenerated .raw file\" + outputfileName);\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t\r\n\tpublic void ConvertTxtToRawFile(String inputfileDir, String dataset) throws IOException {\r\n\t\tString inputfileName = inputfileDir + dataset + \".txt\";\r\n\t\tString outputfileName = 
inputfileDir + dataset + \".raw\";\r\n\t\tSystem.out.println(\"\\nConverting to .raw file: \" + inputfileName);\r\n\t\t\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfileName)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfileName));\r\n\t\t\r\n\t\tString line;\r\n\t\tint count = 0;\r\n\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\tif (line.contains(\":\")) {\r\n\t\t\t\tString[] segments = line.split(\":\");\r\n\t\t\t\tString linename = segments[0].trim();\r\n\t\t\t\tif (linename.equals(\"review/text\")) {\r\n\t\t\t\t\tString review = segments[1].trim();\r\n\t\t\t\t\t// Output to the raw file.\r\n\t\t\t\t\twriter.println(\"<DOC>\");\r\n\t\t\t\t\twriter.println(review);\r\n\t\t\t\t\twriter.println(\"</DOC>\");\r\n\t\t\t\t}\r\n\t\t\t\tif (count++ % 10000 == 0)\r\n\t\t\t\t\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * If a user has rated an item multiple times, using the recent one.\r\n\t * @param inputDir\r\n\t * @param dataset\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void RemoveDuplicateInVotesFile(String inputDir, String dataset) throws IOException {\r\n\t\tString inputFile = inputDir + dataset +\".votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));\r\n\t\tString outputFile = inputDir + dataset + \".votes.noDuplicate\";\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputFile));\r\n\t\t\r\n\t\t// Build map, where key is userID_itemID\r\n\t\tHashMap<String, ArrayList<Vote>> map = new HashMap<String, ArrayList<Vote>>();\r\n\t\tString line;\r\n\t\tint count = 0;\r\n\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\tString key = vote.user + \"_\" + vote.item;\r\n\t\t\tif (!map.containsKey(key)) {\r\n\t\t\t\tmap.put(key, new 
ArrayList<Vote>());\r\n\t\t\t}\r\n\t\t\tmap.get(key).add(vote);\r\n\t\t\tcount ++;\r\n\t\t}\r\n\t\t\r\n\t\t// Write file.\r\n\t\tfor (Entry<String, ArrayList<Vote>> it : map.entrySet()) {\r\n\t\t\tArrayList<Vote> votes = it.getValue();\r\n\t\t\tif(it.getValue().size() > 1) { // write the latest vote.\r\n\t\t\t\tVote.sortByTime(votes);\r\n\t\t\t}\r\n\t\t\twriter.println(votes.get(votes.size() - 1).toString());\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.printf(\"Before removing duplicates, #lines: %d, after: %d\\n\", count, map.size());\r\n\t\tSystem.out.printf(\"Generated file: %s\\n\", outputFile);\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * \r\n\t * @param inputfileDir\r\n\t * @param K Number of test items to holdout for each user.\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void SplitVotesFileRandomAllButK(String inputfileDir, String dataset, int K) throws IOException {\r\n\t\tString inputfile = inputfileDir+\"all/\" + dataset + \".votes\";\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tSystem.out.printf(\"Spliting .votes file %s randomly All-But-%d\\n\", dataset, K);\r\n\t\t\r\n\t\t// Step 1: Build votes dictionary of each user.\r\n\t\tHashMap<String, ArrayList<Vote>> user_votes = new HashMap<String, ArrayList<Vote>>();\r\n\t\tString line;\r\n\t\tint numReviews = 0;\r\n\t\twhile ((line = reader.readLine()) != null ) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\tif (vote != null) {\r\n\t\t\t\tif (!user_votes.containsKey(vote.user)) {\r\n\t\t\t\t\tuser_votes.put(vote.user, new ArrayList<Vote>());\r\n\t\t\t\t}\r\n\t\t\t\tuser_votes.get(vote.user).add(vote);\r\n\t\t\t\tnumReviews ++;\r\n\t\t\t\tif (numReviews % 10000 == 0)\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t}\r\n\t\t//System.out.print(\"\\n\\t #reviews: \" + numReviews + \", #users: \" + user_votes.size());\r\n\t\treader.close();\r\n\t\t\r\n\t\t// Step 2: Write the train/valid/test 
file.\r\n\t\t//System.out.print(\"\\n  2nd Step: Writing train/validation/split files.\");\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tString outputfileTrain = inputfileDir + \"train\\\\\" + dataset + \".votes\";\r\n\t\tString outputfileValid = inputfileDir + \"validation\\\\\" + dataset + \".votes\";\r\n\t\tString outputfileTest =  inputfileDir + \"test\\\\\" + dataset + \".votes\";\r\n\t\tPrintWriter writerTrain = new PrintWriter (new FileOutputStream(outputfileTrain));\r\n\t\tPrintWriter writerValid = new PrintWriter (new FileOutputStream(outputfileValid));\r\n\t\tPrintWriter writerTest  = new PrintWriter (new FileOutputStream(outputfileTest));\r\n\t\t\r\n\t\tint numTrain = 0, numValid = 0, numTest = 0;\r\n\t\tfor (String user : user_votes.keySet()) {\r\n\t\t\tArrayList<Vote> votes = user_votes.get(user);\r\n\t\t\tHashSet<Integer> samples = new HashSet<Integer>();\r\n\t\t\t// Generate for test set and valid set first.\r\n\t\t\twhile (true) {\r\n\t\t\t\tif (samples.size() >= 2*K)\tbreak;\r\n\t\t\t\tint sample = (int) (votes.size() * Math.random());\r\n\t\t\t\tif (!samples.contains(sample)) { \r\n\t\t\t\t\tsamples.add(sample);\r\n\t\t\t\t\tif (samples.size() <= K) { // add to test.\r\n\t\t\t\t\t\twriterTest.println(votes.get(sample));\r\n\t\t\t\t\t\tnumTest ++;\r\n\t\t\t\t\t} else { // add to valid.\r\n\t\t\t\t\t\twriterValid.println(votes.get(sample));\r\n\t\t\t\t\t\tnumValid ++;\r\n\t\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\t// Add the remaining into training.\r\n\t\t\tfor (int i = 0; i < votes.size(); i++) {\r\n\t\t\t\tif (!samples.contains(i)) {\r\n\t\t\t\t\twriterTrain.println(votes.get(i));\r\n\t\t\t\t\tnumTrain ++;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\t//System.out.print(\"\\n\\t #train: \" + numTrain + \", #valid: \" + numValid + \", #test: \" + 
numTest);\r\n\t\treader.close();\r\n\t\twriterTrain.close();\r\n\t\twriterValid.close();\r\n\t\twriterTest.close();\r\n\t\t\r\n\t\t/*System.out.print(\"\\n Write splitted files into: \\n\");\r\n\t\tSystem.out.println(outputfileTrain);\r\n\t\tSystem.out.println(outputfileValid);\r\n\t\tSystem.out.println(outputfileTest);*/\r\n\t}\r\n\t\r\n\t/**\r\n\t * \r\n\t * @param inputfileDir\r\n\t * @param K Number of test/validation items to holdout for each user. \r\n\t * @throws IOException \r\n\t */\r\n\tpublic void SplitVotesFileByTimeAllButK(String inputfileDir, String dataset, int K) throws IOException {\r\n\t\tString inputfile = inputfileDir + dataset + \".votes\";\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tSystem.out.printf(\"Spliting .votes file %s by time All-But-%d\\n\", dataset, K);\r\n\t\t\r\n\t\t// Step 1: Build votes dictionary of each user.\r\n\t\tHashMap<String, ArrayList<Vote>> user_votes = new HashMap<String, ArrayList<Vote>>();\r\n\t\tString line;\r\n\t\tint numReviews = 0;\r\n\t\twhile ((line = reader.readLine()) != null ) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\tif (vote != null) {\r\n\t\t\t\tif (!user_votes.containsKey(vote.user)) {\r\n\t\t\t\t\tuser_votes.put(vote.user, new ArrayList<Vote>());\r\n\t\t\t\t}\r\n\t\t\t\tuser_votes.get(vote.user).add(vote);\r\n\t\t\t\tnumReviews ++;\r\n\t\t\t\tif (numReviews % 10000 == 0)\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t}\r\n\t\t//System.out.print(\"\\n\\t #reviews: \" + numReviews + \", #users: \" + user_votes.size());\r\n\t\treader.close();\r\n\t\t\r\n\t\t// Step 2: Sort each user's votes.\r\n\t\t//System.out.print(\"\\n  2nd Step: Sort each user's votes.\");\r\n\t\tfor (String user : user_votes.keySet()) {\r\n\t\t\tVote.sortByTime(user_votes.get(user));\r\n\t\t}\r\n\t\t\r\n\t\t// Step 3: Write the train/valid/test file.\r\n\t\t//System.out.print(\"\\n  3rd Step: Writing train/validation/split files.\");\r\n\t\treader = new 
BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tString outputfileTrain = inputfileDir + \"train/\" + dataset + \".votes\";\r\n\t\tString outputfileValid = inputfileDir + \"validation/\" + dataset + \".votes\";\r\n\t\tString outputfileTest =  inputfileDir + \"test/\" + dataset + \".votes\";\r\n\t\tPrintWriter writerTrain = new PrintWriter (new FileOutputStream(outputfileTrain));\r\n\t\tPrintWriter writerValid = new PrintWriter (new FileOutputStream(outputfileValid));\r\n\t\tPrintWriter writerTest  = new PrintWriter (new FileOutputStream(outputfileTest));\r\n\t\t\r\n\t\tint numTrain = 0, numValid = 0, numTest = 0;\r\n\t\tfor (String user : user_votes.keySet()) {\r\n\t\t\tArrayList<Vote> votes = user_votes.get(user);\r\n\t\t\tint trainCount = votes.size() - 2 * K;\r\n\t\t\tint validCount = K;\r\n\t\t\tint testCount = K;\r\n\t\t\tfor (int i = 0; i < votes.size(); i++) {\r\n\t\t\t\tif (i < trainCount)\t{\r\n\t\t\t\t\twriterTrain.println(votes.get(i));\r\n\t\t\t\t} else if (i < trainCount + validCount) {\r\n\t\t\t\t\twriterValid.println(votes.get(i));\r\n\t\t\t\t} else {\r\n\t\t\t\t\twriterTest.println(votes.get(i));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tnumTrain += trainCount;\r\n\t\t\tnumValid += validCount;\r\n\t\t\tnumTest  += testCount;\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.print(\"\\n\\t #train: \" + numTrain + \", #valid: \" + numValid + \", #test: \" + numTest);\r\n\t\treader.close();\r\n\t\twriterTrain.close();\r\n\t\twriterValid.close();\r\n\t\twriterTest.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Split the .vote Review dataset by reviewing time on each user basis.\r\n\t * For each user, first select the oldest reviews as train, then randomly split valid and test. \r\n\t * If a user's review number (N) is less than 10, split as <N-2, 1, 1> for <train, valid, test)\r\n\t * Output three files : train/dataset.votes, validation/dataset.votes, test/dataset.votes. \r\n\t * Three steps:\r\n\t * \t1. Build votes dictionary of each user.\r\n\t *  2. 
Sort each user's votes.\r\n\t *  3. Write the train/valid/test file.\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void SplitVotesFileByTimePerUser(String inputfileDir, String dataset, \r\n\t\t\tdouble trainRatio, double validRatio, double testRatio) throws IOException {\r\n\t\tif (trainRatio + validRatio + testRatio != 1.0) {\r\n\t\t\tSystem.out.println(\"Error - Sum of all train,valid,test ratios are not 1, can not split!\");\r\n\t\t\treturn ;\r\n\t\t}\r\n\t\t\r\n\t\tString inputfile = inputfileDir+\"all/\" + dataset + \".votes\";\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tSystem.out.print(\"\\nSpliting .votes file (by review time per user): \" + inputfile);\r\n\t\t\r\n\t\t// Step 1: Build votes dictionary of each user.\r\n\t\tHashMap<String, ArrayList<Vote>> user_votes = new HashMap<String, ArrayList<Vote>>();\r\n\t\tString line;\r\n\t\tint numReviews = 0;\r\n\t\twhile ((line = reader.readLine()) != null ) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\tif (vote != null) {\r\n\t\t\t\tif (!user_votes.containsKey(vote.user)) {\r\n\t\t\t\t\tuser_votes.put(vote.user, new ArrayList<Vote>());\r\n\t\t\t\t}\r\n\t\t\t\tuser_votes.get(vote.user).add(vote);\r\n\t\t\t\tnumReviews ++;\r\n\t\t\t\tif (numReviews % 10000 == 0)\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t}\r\n\t\tSystem.out.print(\"\\n\\t #reviews: \" + numReviews + \", #users: \" + user_votes.size());\r\n\t\treader.close();\r\n\t\t\r\n\t\t// Step 2: Sort each user's votes.\r\n\t\tSystem.out.print(\"\\n  2nd Step: Sort each user's votes.\");\r\n\t\tfor (String user : user_votes.keySet()) {\r\n\t\t\tVote.sortByTime(user_votes.get(user));\r\n\t\t}\r\n\t\t\r\n\t\t// Step 3: Write the train/valid/test file.\r\n\t\tSystem.out.print(\"\\n  3rd Step: Writing train/validation/split files.\");\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tString outputfileTrain = inputfileDir + \"train/\" + 
dataset + \".votes\";\r\n\t\tString outputfileValid = inputfileDir + \"validation/\" + dataset + \".votes\";\r\n\t\tString outputfileTest =  inputfileDir + \"test/\" + dataset + \".votes\";\r\n\t\tPrintWriter writerTrain = new PrintWriter (new FileOutputStream(outputfileTrain));\r\n\t\tPrintWriter writerValid = new PrintWriter (new FileOutputStream(outputfileValid));\r\n\t\tPrintWriter writerTest  = new PrintWriter (new FileOutputStream(outputfileTest));\r\n\t\t\r\n\t\tint numTrain = 0, numValid = 0, numTest = 0;\r\n\t\tfor (String user : user_votes.keySet()) {\r\n\t\t\tArrayList<Vote> votes = user_votes.get(user);\r\n\t\t\tint trainCount, validCount, testCount;\r\n\t\t\tif (votes.size() < 3) {\r\n\t\t\t\ttrainCount = votes.size();\r\n\t\t\t\tvalidCount = 0;\r\n\t\t\t\ttestCount = 0;\r\n\t\t\t}\r\n\t\t\tif (votes.size() < 10) {\r\n\t\t\t\ttrainCount = votes.size() - 2;\r\n\t\t\t\tvalidCount = 1;\r\n\t\t\t\ttestCount = 1;\r\n\t\t\t} else {\r\n\t\t\t\ttestCount = (int) (votes.size() * testRatio);\r\n\t\t\t\tvalidCount = (int) (votes.size() * validRatio);\r\n\t\t\t\ttrainCount = votes.size() - testCount - validCount;\r\n\t\t\t}\r\n\t\t\t\r\n\t\t\tfor (int i = 0; i < votes.size(); i++) {\r\n\t\t\t\tif (i < trainCount)\t{\r\n\t\t\t\t\twriterTrain.println(votes.get(i));\r\n\t\t\t\t} else {\r\n\t\t\t\t\tif (i < trainCount + validCount)\twriterValid.println(votes.get(i));\r\n\t\t\t\t\telse \twriterTest.println(votes.get(i));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tnumTrain += trainCount;\r\n\t\t\tnumValid += validCount;\r\n\t\t\tnumTest  += testCount;\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.print(\"\\n\\t #train: \" + numTrain + \", #valid: \" + numValid + \", #test: \" + numTest);\r\n\t\treader.close();\r\n\t\twriterTrain.close();\r\n\t\twriterValid.close();\r\n\t\twriterTest.close();\r\n\t\t\r\n\t\tSystem.out.print(\"\\n Write splitted files into: 
\\n\");\r\n\t\tSystem.out.println(outputfileTrain);\r\n\t\tSystem.out.println(outputfileValid);\r\n\t\tSystem.out.println(outputfileTest);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Only retain users whose number of reviews is not in the range of [min_reviews, max_reviews]\r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @param min_reviews\r\n\t * @param max_reviews\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void FilterVotesFileByUsers(String inputfileDir, String dataset, int min_reviews, int max_reviews) throws IOException {\r\n\t\tString inputfile = inputfileDir + dataset  + \".votes\";\r\n\t\tString outputfile= inputfileDir + dataset + \"_u\" + min_reviews + \"_\" + max_reviews + \".votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\t\r\n\t\tSystem.out.printf(\"Filtering reviews with range [%d, %d] reviews/user for %s \\n\", \r\n\t\t\t\tmin_reviews, max_reviews, dataset);\r\n\t\t// Step 1: count how many reviews per user.\r\n\t\tHashMap<String, Integer> map_user_count = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null ) {\r\n\t\t\tString user_id = line.split(\" \")[0];\r\n\t\t\tif (!map_user_count.containsKey(user_id)) {\r\n\t\t\t\tmap_user_count.put(user_id, 0);\r\n\t\t\t}\r\n\t\t\tmap_user_count.put(user_id, map_user_count.get(user_id) + 1);\r\n\t\t}\r\n\t\treader.close();\r\n\t\tSystem.out.println(\"Before filtering, #users: \" + map_user_count.size());\r\n\t\t\r\n\t\t// Step 2: output the new filtered file.\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfile));\r\n\t\twhile ((line = reader.readLine())!= null) {\r\n\t\t\tString user_id = line.split(\" \")[0];\r\n\t\t\tif (map_user_count.containsKey(user_id) && map_user_count.get(user_id) >= min_reviews &&\r\n\t\t\t\t\tmap_user_count.get(user_id) <= max_reviews) 
{\r\n\t\t\t\twriter.println(line);\r\n\t\t\t} else {\r\n\t\t\t\tmap_user_count.remove(user_id);\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t\tSystem.out.println(\"After filtering, #users: \" + map_user_count.size());\r\n\t\tSystem.out.println(\"Write the filtered file in: \" + outputfile);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Filter a user if his/her number of reviews is less than the input threshold min_reviews.\r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @param min_reviews\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void FilterVotesFileByUsers(String inputfileDir, String dataset, int min_reviews) throws IOException {\r\n\t\tString inputfile = inputfileDir + dataset  + \".votes\";\r\n\t\tString outputfile= inputfileDir + dataset + \"_u\" + min_reviews + \".votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\t\r\n\t\tSystem.out.println(\"Filtering \" + inputfile + \" with min_reviews per user: \" + min_reviews);\r\n\t\t// Step 1: count how many reviews per user.\r\n\t\tHashMap<String, Integer> map_user_count = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null ) {\r\n\t\t\tString user_id = line.split(\" \")[0];\r\n\t\t\tif (!map_user_count.containsKey(user_id)) {\r\n\t\t\t\tmap_user_count.put(user_id, 0);\r\n\t\t\t}\r\n\t\t\tmap_user_count.put(user_id, map_user_count.get(user_id) + 1);\r\n\t\t}\r\n\t\treader.close();\r\n\t\tSystem.out.println(\"Before filtering, #users: \" + map_user_count.size());\r\n\t\t\r\n\t\t// Step 2: output the new filtered file.\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfile));\r\n\t\twhile ((line = reader.readLine())!= null) {\r\n\t\t\tString user_id = line.split(\" \")[0];\r\n\t\t\tif (map_user_count.containsKey(user_id) && map_user_count.get(user_id) >= min_reviews) 
{\r\n\t\t\t\twriter.println(line);\r\n\t\t\t} else {\r\n\t\t\t\tmap_user_count.remove(user_id);\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t\tSystem.out.println(\"After filtering, #users: \" + map_user_count.size());\r\n\t\tSystem.out.println(\"Write the filtered file in: \" + outputfile);\r\n\t}\r\n\r\n\t/**\r\n\t * Filter an item if its number of reviews is less than the input threshold min_reviews.\r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @param min_reivews\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void FilterVotesFileByItems(String inputfileDir, String dataset, int min_reviews) throws IOException {\r\n\t\tString inputfile = inputfileDir + dataset + \".votes\";\r\n\t\tString outputfile= inputfileDir + dataset + \"_i\" + min_reviews + \".votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\t\r\n\t\tSystem.out.println(\"Filtering \" + inputfile + \" with min_reviews per item: \" + min_reviews);\r\n\t\t// Step 1: count how many reviews per item.\r\n\t\tHashMap<String, Integer> map_item_count = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null ) {\r\n\t\t\tString item_id = line.split(\" \")[1];\r\n\t\t\tif (!map_item_count.containsKey(item_id)) {\r\n\t\t\t\tmap_item_count.put(item_id, 0);\r\n\t\t\t}\r\n\t\t\tmap_item_count.put(item_id, map_item_count.get(item_id) + 1);\r\n\t\t}\r\n\t\treader.close();\r\n\t\tSystem.out.println(\"Before filtering, #item: \" + map_item_count.size());\r\n\t\t\r\n\t\t// Step 2: output the new filtered file.\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfile));\r\n\t\twhile ((line = reader.readLine())!= null) {\r\n\t\t\tString item_id = line.split(\" \")[1];\r\n\t\t\tif (map_item_count.containsKey(item_id) && map_item_count.get(item_id) >= min_reviews) 
{\r\n\t\t\t\twriter.println(line);\r\n\t\t\t} else {\r\n\t\t\t\tmap_item_count.remove(item_id);\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t\twriter.close();\r\n\t\tSystem.out.println(\"After filtering, #items: \" + map_item_count.size());\r\n\t\tSystem.out.println(\"Write the filtered file in: \" + outputfile);\r\n\t}\r\n\t\r\n\t/**\r\n\t * Check the user overlap of two votes datasets\r\n\t * @param dir\r\n\t * @param dataset1\r\n\t * @param dataset2\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void checkOverlapUsers(String dir, String dataset1, String dataset2) throws IOException {\r\n\t\tString file1 = dir + dataset1 + \".votes\";\r\n\t\tString file2 = dir + dataset2 + \".votes\";\r\n\t\tint userIndex = 0; // the index of user in votes file.\r\n\t\t\r\n\t\t// Read users of dataset1\r\n\t\tHashSet<String> users1 = new HashSet<String>();\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(file1)));\r\n\t\tString line;\r\n\t\tint count=0;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tif (count++ % 100000 == 0)\tSystem.out.print(\".\");\r\n\t\t\tString user = line.split(\" \")[userIndex];\r\n\t\t\tusers1.add(user);\r\n\t\t}\r\n\t\treader.close();\r\n\t\tSystem.out.println(\"\");\r\n\t\t\r\n\t\t// Read users of dataset2\r\n\t\tHashSet<String> users2 = new HashSet<String>();\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(file2)));\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tif (count++ % 100000 == 0)\tSystem.out.print(\".\");\r\n\t\t\tString user = line.split(\" \")[userIndex];\r\n\t\t\tusers2.add(user);\r\n\t\t}\r\n\t\tSystem.out.println(\"\");\r\n\r\n\t\tHashSet<String> intersection = new HashSet<String>(users1);\r\n\t\tintersection.retainAll(users2);\r\n\t\tSystem.out.printf(\"#overlap users of <%s, %s>: %d \\t %.2f%%, %.2f%%\\n\", \r\n\t\t\t\tdataset1, dataset2, intersection.size(), intersection.size()/ (users1.size()/100.0), \r\n\t\t\t\tintersection.size()/ 
(users2.size()/100.0));\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Only retain top occurrence words of a review. \r\n\t * @param inputfileDir\r\n\t * @param dataset\r\n\t * @param maxWords The number of (top occurrence) words in the word dictionary.\r\n\t * @throws IOException\r\n\t * @throws LangDetectException \r\n\t */\r\n\tpublic void FilterVotesReviewsByWords(String inputfileDir, String dataset, int maxWords) \r\n\t\t\tthrows IOException {\r\n\t\tString inputfile = inputfileDir + dataset + \".votes\";\r\n\t\tString outputfile = inputfileDir + dataset + \"_w\" + maxWords/1000 + \"k.votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\t\r\n\t\tSystem.out.print(\"\\nFiltering reviews by words: \" + dataset);\r\n\t\t// Step 1: Build word dictionary.\r\n\t\tHashMap<String, Integer> map_word_id = new HashMap<String, Integer>();\r\n\t\tthis.buildWordsDictionary(inputfile, map_word_id, maxWords);\r\n\t\t\r\n\t\t// Step 2: Write the filtered file.\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputfile));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString[] arr = line.split(\" \");\r\n\t\t\tString filtered_review_text = \"\";\r\n\t\t\tint wordcount = 0;\r\n\t\t\tfor (int i = 5; i < arr.length; i++) {\r\n\t\t\t\tString word = arr[i];\r\n\t\t\t\tif (map_word_id.containsKey(word)) {\r\n\t\t\t\t\twordcount ++;\r\n\t\t\t\t\tfiltered_review_text = filtered_review_text + word + \" \";\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\twriter.printf(\"%s %s %s %s %d %s\\n\", \r\n\t\t\t\t\tarr[0], arr[1], arr[2], arr[3], wordcount, filtered_review_text);\r\n\t\t}\r\n\t\t\r\n\t\tSystem.out.println(\"\\nWrite the filtered file in: \" + outputfile);\r\n\t\twriter.close();\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Write a matrix into file. \r\n\t * Format of each line: row_id [non-zero entryCount]: (col1, val1), (col2, val2) ... 
\r\n\t * @param matrix\r\n\t * @param filename\r\n\t * @throws IOException\r\n\t */\r\n\tpublic static void writeMatrixToFile(SparseMatrix matrix, String filename) throws IOException {\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(filename));\r\n\t\tint rowCount = matrix.length()[0];\r\n\t\tfor (int i = 1; i < rowCount; i++) {\r\n\t\t\tArrayList<Integer> indexList = matrix.getRowRef(i).indexList();\r\n\t\t\tString line;\r\n\t\t\tif (indexList.size() == 0) {\r\n\t\t\t\tline = String.format(\"%d [0]:\\t\", i);\r\n\t\t\t} else {\r\n\t\t\t\tline = String.format(\"%d [%d]:\\t\", i, indexList.size());\r\n\t\t\t\tfor (int j : indexList) {\r\n\t\t\t\t\tline += String.format(\"(%d, %.4f)\\t\", j, matrix.getValue(i, j));\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\twriter.println(line);\r\n\t\t}\r\n\t\twriter.close();\r\n\t}\r\n\t\r\n\t\r\n\t/**\r\n\t * Process the .lexicon file (generate by thuir-sentires.rar tool), and generate feature set.\r\n\t * Select top features by descending order of number of opinions.\r\n\t * \r\n\t * @param lexiconFile\r\n\t * @param aspectRatio Percentage of top aspects to read. 
\r\n\t * @return\r\n\t * @throws IOException\r\n\t */\r\n\tstatic public HashMap<String, HashSet<String>> loadFeaturesFromLexiconFile(String lexiconFile, \r\n\t\t\tdouble aspectRatio) throws IOException {\r\n\t\tHashMap<String, HashSet<String>> map_feature_opinion = \r\n\t\t\t\tnew HashMap<String, HashSet<String>>();\r\n\t\t\r\n\t\t// System.out.println(\"Loading features from lexicon file: \" + lexiconFile);\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFile)));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString feature_opinion = line.split(\"\\t\")[1];\r\n\t\t\tString feature = feature_opinion.split(\"\\\\|\")[0].replaceAll(\"!\", \"\").trim();\r\n\t\t\tString opinion = feature_opinion.split(\"\\\\|\")[1].trim();\r\n\t\t\tif (!map_feature_opinion.containsKey(feature)) {\r\n\t\t\t\tmap_feature_opinion.put(feature, new HashSet<String>());\r\n\t\t\t}\r\n\t\t\tmap_feature_opinion.get(feature).add(opinion);\r\n\t\t}\r\n\t\tSystem.out.println(\"Feature count in total: \" + map_feature_opinion.size()); \r\n\t\t\r\n\t\t// Select features by descending order of number of opinions.\r\n\t\tint aspectNum = (int) (map_feature_opinion.size() * aspectRatio);\r\n\t\tHashMap<String, Integer> map_feature_count = new HashMap<String, Integer>();\r\n\t\tfor (Map.Entry<String, HashSet<String>> entry : map_feature_opinion.entrySet()) {\r\n\t\t\tmap_feature_count.put(entry.getKey(), entry.getValue().size());\r\n\t\t}\r\n\t\tHashSet<String> topFeatures = new HashSet<String>(\r\n\t\t\t\tCommonUtils.TopKeysByValue(map_feature_count, aspectNum, null));\r\n\t\tSet<String> featureSet = new HashSet<String>(map_feature_opinion.keySet());\r\n\t\tfor (String feature : featureSet) {\r\n\t\t\tif (!topFeatures.contains(feature)) {\r\n\t\t\t\tmap_feature_opinion.remove(feature);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treader.close();\r\n\t\t// System.out.println(\"# of features loaded: \" + 
map_feature_opinion.size());\r\n\t\treturn map_feature_opinion;\r\n\t}\r\n\t\r\n\tstatic public HashSet<String> loadFeaturesFromFeatureFile(String featureFile) \r\n\t\t\tthrows IOException {\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(featureFile)));\r\n\t\tHashSet<String> features = new HashSet<String>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString[] arr = line.trim().split(\"\\t\");\r\n\t\t\tif (arr != null && arr.length > 1) {\r\n\t\t\t\tfeatures.add(arr[0]);\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t\treturn features;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Load positive Features.\r\n\t * @param lexiconFile\r\n\t * @return\r\n\t * @throws IOException\r\n\t */\r\n\tstatic public HashMap<String, HashSet<String>> loadPosFeaturesFromLexiconFile(String lexiconFile, \r\n\t\t\tdouble aspectRatio) throws IOException {\r\n\t\tHashMap<String, HashSet<String>> map_feature_opinion = \r\n\t\t\t\tnew HashMap<String, HashSet<String>>();\r\n\t\t\r\n\t\t// System.out.println(\"Loading features from lexicon file: \" + lexiconFile);\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFile)));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tif (!line.contains(\"[1]\"))\tcontinue; // Only consider positive FO pairs.\r\n\t\t\tString feature_opinion = line.split(\"\\t\")[1];\r\n\t\t\tString feature = feature_opinion.split(\"\\\\|\")[0].replaceAll(\"!\", \"\").trim();\r\n\t\t\tString opinion = feature_opinion.split(\"\\\\|\")[1].trim();\r\n\t\t\tif (!map_feature_opinion.containsKey(feature)) {\r\n\t\t\t\tmap_feature_opinion.put(feature, new HashSet<String>());\r\n\t\t\t}\r\n\t\t\tmap_feature_opinion.get(feature).add(opinion);\r\n\t\t}\r\n\t\tSystem.out.printf(\"Feature count in total: %d. 
\", map_feature_opinion.size()); \r\n\t\t\r\n\t\t// Select features by descending order of number of opinions.\r\n\t\tint aspectNum = (int) (map_feature_opinion.size() * aspectRatio);\r\n\t\tHashMap<String, Integer> map_feature_count = new HashMap<String, Integer>();\r\n\t\tfor (Map.Entry<String, HashSet<String>> entry : map_feature_opinion.entrySet()) {\r\n\t\t\tmap_feature_count.put(entry.getKey(), entry.getValue().size());\r\n\t\t}\r\n\t\tHashSet<String> topFeatures = new HashSet<String>(\r\n\t\t\t\tCommonUtils.TopKeysByValue(map_feature_count, aspectNum, null));\r\n\t\tSet<String> featureSet = new HashSet<String>(map_feature_opinion.keySet());\r\n\t\tfor (String feature : featureSet) {\r\n\t\t\tif (!topFeatures.contains(feature)) {\r\n\t\t\t\tmap_feature_opinion.remove(feature);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\t// Count number of F-O pairs.\r\n\t\tint count = 0;\r\n\t\tfor (String feature : map_feature_opinion.keySet()) {\r\n\t\t\tcount += map_feature_opinion.get(feature).size();\r\n\t\t}\r\n\t\tSystem.out.printf(\"Positive F-O pairs: %d. 
\", count);\r\n\r\n\t\treader.close();\r\n\t\treturn map_feature_opinion;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Filter FO pairs that do not occur in the training votes file.\r\n\t */\r\n\tstatic public HashMap<String, ArrayList<String>> filterFOpairs(\r\n\t\t\tHashMap<String, ArrayList<String>> feature_opinions, String votesFile) throws IOException {\r\n\t\tHashSet<String> features = new HashSet<String>(feature_opinions.keySet());\r\n\t\tHashMap<String, ArrayList<String>> filteredFO = new HashMap<String, ArrayList<String>>();\r\n\t\tfor (String feature : feature_opinions.keySet()) {\r\n\t\t\tfilteredFO.put(feature, new ArrayList<String>());\r\n\t\t}\r\n\t\t\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(votesFile)));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\t// Find all features occurred in the review.\r\n\t\t\tHashSet<String> find_features = findFeaturesFromReview(features, vote.review);\r\n\t\t\tfor (String feature : find_features) {\r\n\t\t\t\t// Find all opinions occurred in the review.\r\n\t\t\t\tHashSet<String> opinions = new HashSet<String>(feature_opinions.get(feature));\r\n\t\t\t\topinions = findFeaturesFromReview(opinions, vote.review);\r\n\t\t\t\tfor (String opinion : opinions) {\r\n\t\t\t\t\tfilteredFO.get(feature).add(opinion);\r\n\t\t\t\t\tfeature_opinions.get(feature).remove(opinion);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t\t// Count number of F-O pairs.\r\n\t\tint count = 0;\r\n\t\tfor (String feature : filteredFO.keySet()) {\r\n\t\t\tcount += filteredFO.get(feature).size();\r\n\t\t}\r\n\t\tSystem.out.println(\"Filtered F-O pairs: \" + count);\r\n\t\t\r\n\t\treturn filteredFO;\r\n\t}\r\n\t\r\n\t/*==============================================================================================\r\n\t * Private and protected functions.\r\n\t 
*==============================================================================================*/\r\n\t\r\n\t/** Build itemWordsMatrix and userWordsMatrix based on the input user, item and word dictionary.\r\n\t * \r\n\t * @param trainFileName\r\n\t * @param itemWordsMatrix\r\n\t * @param userWordsMatrix\r\n\t * @param map_item_id Dictionary of all items (id starts from 1)\r\n\t * @param map_user_id Dictionary of all users (id starts from 1)\r\n\t * @param map_word_id Dictionary of all words (id starts from 1)\r\n\t * @throws IOException \r\n\t */\r\n\tpublic void buildWordsMatrix(String fileName, SparseMatrix itemWordsMatrix, \r\n\t\t\tSparseMatrix userWordsMatrix, HashMap<String, Integer> map_item_id, \r\n\t\t\tHashMap<String, Integer> map_user_id, HashMap<String, Integer> map_word_id) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString[] arr = line.split(\" \");\r\n\t\t\tif (arr.length >= 4) {\r\n\t\t\t\t// Extract item, user and review words.\r\n\t\t\t\tint userID = map_user_id.get(arr[0]);\r\n\t\t\t\tint itemID = map_item_id.get(arr[1]);\r\n\t\t\t\tfor (int i = 5; i < arr.length; i++) {\r\n\t\t\t\t\tString word = arr[i].trim();\r\n\t\t\t\t\tif (map_word_id.containsKey(word)) {\r\n\t\t\t\t\t\tint wordID = map_word_id.get(word);\r\n\t\t\t\t\t\tuserWordsMatrix.setValue(userID, wordID, userWordsMatrix.getValue(userID, wordID) + 1);\r\n\t\t\t\t\t\titemWordsMatrix.setValue(itemID, wordID, itemWordsMatrix.getValue(itemID, wordID) + 1);\r\n\t\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t}\r\n\r\n\t/**\r\n\t * Build words dictionary (from votes file).\r\n\t * Only consider English reviews.\r\n\t * \r\n\t * @param fileName\r\n\t * @param map_word_id Save the results of word dictionary.\r\n\t * @param maxWords The maximum words in the dictionary (select top words). 
To disable the function, set it as 0.\r\n\t * @throws IOException\r\n\t * @throws LangDetectException \r\n\t */\r\n\tpublic void buildWordsDictionary(String fileName, HashMap<String, Integer> map_word_id, \r\n\t\t\tint maxWords) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));\r\n\t\t\r\n\t\t// map from word to its number of occurrence.\r\n\t\tHashMap<String, Integer> map_word_count = new HashMap<String, Integer>();\r\n\t\tStopwordsFilter.init(\"lib/stopwords.txt\");\r\n\t\t\r\n\t\tString line;\r\n\t\tint linecount=0;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\tif (vote!=null) {\r\n\t\t\t\t// Process review words.\r\n\t\t\t\tString review_text = vote.review.trim();\r\n\t\t\t\t// if (!LanguageDetector.isEnglish(review_text))  continue; // Filter nonEnglish reviews.\r\n\t\t\t\tfor (String word : review_text.split(\" \")) {\r\n\t\t\t\t\tif (StopwordsFilter.isStopword(word))\tcontinue; // Filter stopwords.\r\n\t\t\t\t\tif (word.matches(\".*\\\\d+.*\"))\tcontinue;// Filter word that contains digit.\r\n\t\t\t\t\tif (!map_word_count.containsKey(word)) {\r\n\t\t\t\t\t\tmap_word_count.put(word, 0);\r\n\t\t\t\t\t}\r\n\t\t\t\t\tmap_word_count.put(word, map_word_count.get(word) + 1);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (linecount % 10000 == 0) {\r\n\t\t\t\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t\tlinecount++;\r\n\t\t}\r\n\t\t\r\n\t\t// System.out.print(\"\\nBefore filtering, dictionary_size: \" + map_word_count.size() +\", after filtering: \" + maxWords);\r\n\t\t\r\n\t\t// Use the most frequent maxWords as the word dictionary.\r\n\t\tList<Map.Entry<String, Integer>> sortedMap;\r\n\t\tif (maxWords > 0) {\r\n\t\t\tsortedMap = mostFrequentEntries(map_word_count, maxWords);\r\n\t\t} else {\r\n\t\t\tsortedMap = CommonUtils.SortMapByValue(map_word_count);\r\n\t\t}\r\n\t\t\r\n\t\t// Words are sorted by its number of occurrence. 
\r\n\t\tfor (Map.Entry<String, Integer> entity : sortedMap) {\r\n\t\t\tmap_word_id.put(entity.getKey(), map_word_id.size());\r\n\t\t}\t\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Build aspects matrix of the input .votes file, given the user/item/aspect dictionary.\r\n\t * In this function, an aspect represents a F-O pair.\r\n\t * @param votesFile\r\n\t * @param map_user_id\r\n\t * @param map_item_id\r\n\t * @param map_aspect_id\r\n\t * @param itemAspect\r\n\t * @param userAspect\r\n\t * @throws IOException\r\n\t */\r\n\tstatic public void buildAspectsMatrix_FO(String votesFile, HashMap<String, Integer> map_user_id, \r\n\t\t\t HashMap<String, Integer> map_item_id,  HashMap<String, Integer> map_aspect_id, \r\n\t\t\t SparseMatrix itemAspect, SparseMatrix userAspect, \r\n\t\t\t HashMap<String, HashSet<String>> map_feature_opinions) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(votesFile)));\r\n\t\tHashSet<String> features = new HashSet<String>(map_feature_opinions.keySet());\r\n\t\tString line;\r\n\t\twhile((line = reader.readLine()) != null) {\r\n\t\t\tVote vote = parseVotesLine(line);\r\n\t\t\tint userId = map_user_id.get(vote.user);\r\n\t\t\tint itemId = map_item_id.get(vote.item);\r\n\t\t\tHashSet<String> find_features = findFeaturesFromReview(features, vote.review);\r\n\t\t\tfor (String feature : find_features) {\r\n\t\t\t\tHashSet<String> find_opinions = findOpinionsFromReview(map_feature_opinions.get(feature), vote.review);\r\n\t\t\t\tfor (String opinion: find_opinions) {\r\n\t\t\t\t\tString aspect = feature + \"|\" + opinion;\r\n\t\t\t\t\tint aspectId = map_aspect_id.get(aspect);\r\n\t\t\t\t\titemAspect.setValue(itemId, aspectId, itemAspect.getValue(itemId, aspectId) + 1);\r\n\t\t\t\t\tuserAspect.setValue(userId, aspectId, userAspect.getValue(userId, aspectId) + 1);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Find features from the 
review.\r\n\t * @param features\r\n\t * @param review\r\n\t * @return\r\n\t */\r\n\tprivate static HashSet<String> findFeaturesFromReview(HashSet<String> features, String review) {\r\n\t\t// A feature may contain at most 3 words.\r\n\t\tHashSet<String> grams = new HashSet<String>();\r\n\t\tgrams.addAll(CommonUtils.StringToGramSet(review, 1));\r\n\t\tgrams.addAll(CommonUtils.StringToGramSet(review, 2));\r\n\t\tgrams.addAll(CommonUtils.StringToGramSet(review, 3));\r\n\t\t\r\n\t\tgrams.retainAll(features);\r\n\t\treturn grams;\r\n\t}\r\n\t\r\n\tprivate static HashSet<String> findOpinionsFromReview(HashSet<String> opinions, String review) {\r\n\t\tHashSet<String> grams = new HashSet<String>();\r\n\t\tgrams.addAll(CommonUtils.StringToGramSet(review, 1));\r\n\t\t\r\n\t\tgrams.retainAll(opinions);\r\n\t\treturn grams;\t\t\r\n\t}\r\n\t\r\n\tprotected void statReviewsPerItem(String inputfileDir, String dataset) throws IOException {\r\n\t\tString inputfile = inputfileDir + dataset + \".votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\t\r\n\t\t// Build <item, count> dictionary.\r\n\t\tSystem.out.print(dataset);\r\n\t\tHashMap<String, Integer> map_item_count = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString item_id = line.split(\" \") [1];\r\n\t\t\tif (!map_item_count.containsKey(item_id)) {\r\n\t\t\t\tmap_item_count.put(item_id, 0);\r\n\t\t\t\tif (map_item_count.size() % 10000 == 0)\r\n\t\t\t\t\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t\tmap_item_count.put(item_id, map_item_count.get(item_id) + 1);\r\n\t\t}\r\n\t\t\r\n\t\t// Revert the dictionary to <count, number of items>\r\n\t\tHashMap<Integer, Integer> map_count_items = new HashMap<Integer, Integer>();\r\n\t\tfor (String item : map_item_count.keySet()) {\r\n\t\t\tint count = map_item_count.get(item);\r\n\t\t\tif (!map_count_items.containsKey(count)) \r\n\t\t\t\tmap_count_items.put(count, 
0);\r\n\t\t\tmap_count_items.put(count, map_count_items.get(count) + 1);\r\n\t\t}\r\n\t\t\r\n\t\t// Print map_count_items statistics.\r\n\t\tint itemCount = map_item_count.size();\r\n\t\tSystem.out.println(\"\\nAll Items: \" + itemCount);\r\n\t\tSystem.out.println(\"#Reviews: \\t #Items: \\t Percentage\");\r\n\t\tArrayList<Integer> counts = new ArrayList<Integer>(map_count_items.keySet());\r\n\t\tCollections.sort(counts);\r\n\t\tint count10 = 0;\r\n\t\tfor (Integer count : counts) {\r\n\t\t\tint items = map_count_items.get(count);\r\n\t\t\tif (count < 10)\r\n\t\t\t\tSystem.out.printf (\"%d \\t\\t %d \\t\\t %.4f \\n\", count, items, (double)items / itemCount * 100);\r\n\t\t\telse \r\n\t\t\t\tcount10 += items;\r\n\t\t}\r\n\t\tSystem.out.printf (\">=10 \\t\\t %d \\t\\t %.4f \\n\", count10, (double)count10 / itemCount * 100);\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\tprotected void statReviewsPerUser(String inputfileDir, String dataset) throws IOException {\r\n\t\tString inputfile = inputfileDir + dataset + \".votes\";\r\n\t\treader = new BufferedReader(new InputStreamReader(new FileInputStream(inputfile)));\r\n\t\t\r\n\t\t// Build <user, count> dictionary.\r\n\t\tSystem.out.print(dataset);\r\n\t\tHashMap<String, Integer> map_user_count = new HashMap<String, Integer>();\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString user_id = line.split(\" \") [0];\r\n\t\t\tif (!map_user_count.containsKey(user_id)) {\r\n\t\t\t\tmap_user_count.put(user_id, 0);\r\n\t\t\t\tif (map_user_count.size() % 10000 == 0)\r\n\t\t\t\t\tSystem.out.print(\".\");\r\n\t\t\t}\r\n\t\t\tmap_user_count.put(user_id, map_user_count.get(user_id) + 1);\r\n\t\t}\r\n\t\t\r\n\t\t// Revert the dictionary to <count, number of users>\r\n\t\tHashMap<Integer, Integer> map_count_users = new HashMap<Integer, Integer>();\r\n\t\tfor (String user : map_user_count.keySet()) {\r\n\t\t\tint count = map_user_count.get(user);\r\n\t\t\tif (!map_count_users.containsKey(count)) 
\r\n\t\t\t\tmap_count_users.put(count, 0);\r\n\t\t\tmap_count_users.put(count, map_count_users.get(count) + 1);\r\n\t\t}\r\n\t\t\r\n\t\t// Print map_count_users statistics.\r\n\t\tint userCount = map_user_count.size();\r\n\t\tSystem.out.println(\"\\nAll Users: \" + userCount);\r\n\t\tSystem.out.println(\"#Reviews: \\t #Users: \\t Percentage\");\r\n\t\tArrayList<Integer> counts = new ArrayList<Integer>(map_count_users.keySet());\r\n\t\tCollections.sort(counts);\r\n\t\tint count10 = 0;\r\n\t\tfor (Integer count : counts) {\r\n\t\t\tint users = map_count_users.get(count);\r\n\t\t\tif (count < 10)\r\n\t\t\t\tSystem.out.printf (\"%d \\t\\t %d \\t\\t %.4f \\n\", count, users, (double)users / userCount * 100);\r\n\t\t\telse \r\n\t\t\t\tcount10 += users;\r\n\t\t}\r\n\t\tSystem.out.printf (\">=10 \\t\\t %d \\t\\t %.4f \\n\", count10, (double)count10 / userCount * 100);\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * Select the top entries (according to the weight) of a map.\r\n\t * @param map_feature_weight A map from feature to its weight.\r\n\t * @param maxEntities Maximum entries of the map to select.\r\n\t * \r\n\t * @return Sorted top entries (by its weight)\r\n\t */\r\n\tprivate List<Map.Entry<String, Integer>> mostFrequentEntries(HashMap<String, Integer >map_feature_weight, int maxEntities) {\r\n\t\tList<Map.Entry<String, Integer>> sortedEntities = CommonUtils.SortMapByValue(map_feature_weight);\r\n\t\tList<Map.Entry<String, Integer>> topEntities = new ArrayList<Map.Entry<String, Integer>>();\r\n\t\tint count = 0;\r\n\t\tfor (Map.Entry<String, Integer> entity : sortedEntities) {\r\n\t\t\ttopEntities.add(entity);\r\n\t\t\t// Output the top words and their weight.\r\n\t\t\t// System.out.println(\"\"+ count++ +\"\\t\"+ entity.getKey() + \"\\t\" + entity.getValue());\r\n\t\t\tif (topEntities.size() >= maxEntities)\r\n\t\t\t\tbreak;\r\n\t\t}\r\n\t\treturn topEntities;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Revert an ID_Map (value is an unique ID)\r\n\t * @param map_feature_id 
Map from feature name to its ID.\r\n\t */\r\n\tstatic public HashMap<Integer, String> revertIDMap(HashMap<String, Integer> map_feature_id) {\r\n\t\tHashMap<Integer, String> map_id_feature = new HashMap<Integer, String>();\r\n\t\tfor (Map.Entry<String, Integer> entry : map_feature_id.entrySet()) {\r\n\t\t\tmap_id_feature.put(entry.getValue(), entry.getKey());\r\n\t\t}\r\n\t\treturn map_id_feature;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Parse each line in votes file.\r\n\t * @param line A line in votes file\r\n\t * @return A Vote object. If it is not a valid votes line, return null.\r\n\t */\r\n\tprivate static Vote parseVotesLine(String line) {\r\n\t\tString[] arr = line.split(\" \");\r\n\t\tif (arr.length > 3) {\r\n\t\t\tString user = arr[0];\r\n\t\t\tString item = arr[1];\r\n\t\t\tdouble score = Double.parseDouble(arr[2]);\r\n\t\t\tint time  = Integer.parseInt(arr[3]);\r\n\t\t\tint wordCount = Integer.parseInt(arr[4]);\r\n\t\t\tString review = \"\";\r\n\t\t\tfor (int i = 5; i < arr.length; i++) {\r\n\t\t\t\treview = review + arr[i] + \" \";\r\n\t\t\t}\r\n\t\t\treturn new Vote(user, item, score, time, wordCount, review);\r\n\t\t}\r\n\t\treturn null;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Parse a sentence to words. 
\r\n\t * @param sentence Input sentence to parse\r\n\t * @return A String array containing English words in the sentence.\r\n\t */\r\n\tprivate static String[] parseSentence(String sentence) {\r\n\t\tString[] words = sentence.split(\"\\\\s+\");\r\n\t\tfor (int i = 0; i < words.length; i++) {\r\n\t\t    // Check for a non-word character.\r\n\t\t    words[i] = words[i].replaceAll(\"[^\\\\w]\", \"\");\r\n\t\t}\r\n\t\treturn words;\r\n\t}\r\n\t\r\n\t/**\r\n\t * Convert a string to word HashMap, where key is word and value is the frequency of the word.\r\n\t * @param str\r\n\t * @return\r\n\t */\r\n\tprivate static HashMap<String, Integer> stringToSet(String str) {\r\n\t\tString[] words = str.split(\" \");\r\n\t\tHashMap<String, Integer> map = new HashMap<String, Integer>();\r\n\t\tfor (String word : words) {\r\n\t\t\tif (!map.containsKey(word)) {\r\n\t\t\t\tmap.put(word, 0);\r\n\t\t\t}\r\n\t\t\tmap.put(word, map.get(word) + 1);\r\n\t\t}\r\n\t\treturn map;\r\n\t}\r\n\t\r\n\t\r\n\tprivate static void replaceFileWithKeyword(String inputFile, String oldWord, String outputFile, \r\n\t\t\tString newWord) throws IOException {\r\n\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));\r\n\t\tPrintWriter writer = new PrintWriter (new FileOutputStream(outputFile));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine())!= null) {\r\n\t\t\tString newLine = line.replace(oldWord, newWord);\r\n\t\t\twriter.println(newLine);\r\n\t\t}\r\n\t\t\t\r\n\t\twriter.close();\r\n\t\treader.close();\r\n\t}\r\n\t\r\n\t/**\r\n\t * @param args\r\n\t * @throws IOException \r\n\t * @throws ParseException \r\n\t * @throws java.text.ParseException \r\n\t * @throws LangDetectException \r\n\t */\r\n\tpublic static void main(String[] args) throws IOException, ParseException, java.text.ParseException {\r\n\r\n\t\tDatasetUtil util = new DatasetUtil();\r\n\t\tString Dir = \"/Users/xiangnanhe/Workspace/yelp-challenge/\";\r\n\t\tint thres = 
50;\r\n\t\t//util.ConvertJsonToVotesFile(\"/Users/xiangnanhe/Workspace/yelp-challenge/all/\", \"yelp\");\r\n\t\t//util.RemoveDuplicateInVotesFile(Dir + \"all/\", \"yelp\");\r\n\t\t//util.FilterVotesReviewsByWords(\"/Users/xiangnanhe/Workspace/yelp-challenge/\", \"yelp_1M_u3\", 20000);\r\n\t\t\r\n\t\t\r\n\t\tutil.FilterVotesFileByUsers(Dir +\"all/\", \"yelp\", thres);\r\n\t\tutil.SplitVotesFileByTimePerUser(Dir,  \"yelp_u\" + thres, 0.6, 0.2, 0.2);\r\n\t\t\r\n\t\tutil.ConvertVotesToRatingFile(Dir + \"train/\", \"yelp_u\" + thres);\r\n\t    util.ConvertVotesToRatingFile(Dir + \"test/\", \"yelp_u\" + thres);\r\n\t    util.ConvertVotesToRatingFile(Dir + \"validation/\", \"yelp_u\" + thres); \r\n\r\n\t\tSystem.out.println(\"end\");\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/utils/Printer.java",
    "content": "package utils;\r\n\r\n/**\r\n * This is a class containing printing functions\r\n * in human-readable format from various kinds of data.\r\n * \r\n * @author Joonseok Lee \r\n * @author Mingxuan Sun \r\n * @since 2012. 4. 20\r\n * @version 1.1\r\n */\r\npublic class Printer {\r\n\t/**\r\n\t * Print each element in a double array.\r\n\t * \r\n\t * @param A The array to print\r\n\t */\r\n\tpublic static void printArray(double[] A) {\r\n\t\tfor(int i = 0; i < A.length; i++){\r\n\t\t\tSystem.out.print(A[i] + \"\\t\");\r\n\t\t}\r\n\t}\r\n\r\n\t/**\r\n\t * Print each element in an integer array\r\n\t * \r\n\t * @param A The array to print\r\n\t */\r\n\tpublic static void printArray(int[] A) {\r\n\t\tfor(int i = 0; i < A.length; i++){\r\n\t\t\tSystem.out.print(A[i] + \"\\t\");\r\n\t\t}\r\n\t}\r\n\r\n\t/**\r\n\t * Print each element in a 2-D double matrix.\r\n\t * \r\n\t * @param A The array to print\r\n\t */\r\n\tpublic static void printArray(double[][] A) {\r\n\t\tfor(int i = 0; i < A.length; i++){\r\n\t\t\tprintArray(A[i]);\r\n\t\t\tSystem.out.println();\r\n\t\t}\r\n\t}\r\n\r\n\t/**\r\n\t * Print each element in a 3-D double matrix.\r\n\t * \r\n\t * @param A The array to print\r\n\t */\r\n\tpublic static void printArray(double[][][] A) {\r\n\t\tfor(int i = 0; i < A.length; i++){\r\n\t\t\tprintArray(A[i]);\r\n\t\t\tSystem.out.println();\r\n\t\t}\r\n\t}\r\n\r\n\t/**\r\n\t * Convert time in milliseconds to human-readable format.\r\n\t * \r\n\t * @param msType The time in milliseconds\r\n\t * @return a human-readable string version of the time\r\n\t */\r\n\tpublic static String printTime(long msType) {\r\n\t\tlong original = msType;\r\n\t\tint ms = (int) (msType % 1000);\r\n\r\n\t\toriginal = original / 1000;\r\n\t\tint sec = (int) (original % 60);\r\n\r\n\t\toriginal = original / 60;\r\n\t\tint min = (int) (original % 60);\r\n\r\n\t\toriginal = original / 60;\r\n\t\tint hr = (int) (original % 24);\r\n\r\n\t\toriginal = original / 24;\r\n\t\tint day = (int) 
original;\r\n\r\n\t\tif (day > 1) {\r\n\t\t\treturn String.format(\"%d days, %02d:%02d:%02d.%03d\", day, hr, min, sec, ms);\r\n\t\t}\r\n\t\telse if (day > 0) {\r\n\t\t\treturn String.format(\"%d day, %02d:%02d:%02d.%03d\", day, hr, min, sec, ms);\r\n\t\t}\r\n\t\telse {\r\n\t\t\treturn String.format(\"%02d:%02d:%02d.%03d\", hr, min, sec, ms);\r\n\t\t}\r\n\t}\r\n}\r\n"
  },
  {
    "path": "src/utils/SortMapExample.java",
    "content": "package utils;\n\nimport java.util.Comparator;\nimport java.util.HashMap;\nimport java.util.Iterator;\nimport java.util.Map;\nimport java.util.Map.Entry;\nimport java.util.TreeMap;\n\n\npublic class SortMapExample {\n\n\t/**\n\t * The main method.\n\t *\n\t * @param args the arguments\n\t */\n\tpublic static void main(String[] args) {\n\t\t\n\t\tTreeMap<Integer, Double> tmap = new TreeMap<Integer, Double> ();\n\t\ttmap.put(1, 2.0);\n\t\ttmap.put(1, 3.0);\n\t\ttmap.put(1, 1.0);\n\t\ttmap.put(3, 2.0);\n\t\ttmap.put(4, 0.0);\n\t\t\n\t\tIterator<Entry<Integer, Double>> iter = tmap.entrySet().iterator();\n\t\twhile(iter.hasNext()) {\n\t\t\tMap.Entry<Integer, Double> entry = (Map.Entry<Integer, Double>)iter.next();\n\t\t\tSystem.out.printf(\"next : %s - %s\\n\", entry.getKey(), entry.getValue());\n\t\t}\n\t}\n\n}"
  },
  {
    "path": "src/utils/StopwordsFilter.java",
    "content": "package utils;\r\nimport java.io.BufferedReader;\r\nimport java.io.FileInputStream;\r\nimport java.io.FileNotFoundException;\r\nimport java.io.IOException;\r\nimport java.io.InputStreamReader;\r\nimport java.util.*;\r\n\r\n\r\npublic class StopwordsFilter {\r\n\r\n\tprivate static HashSet<String> stopwords = new HashSet<String>();\r\n\tprivate static boolean isInitialized = false;\r\n\t\r\n\tpublic static void init(String stopwordsFile) throws IOException {\r\n\t\tif (!isInitialized) {\r\n\t\t\tBufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(stopwordsFile)));\r\n\t\t\t\r\n\t\t\tString line;\r\n\t\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\t\tString stopword = line.trim().toLowerCase();\r\n\t\t\t\tstopwords.add(stopword);\r\n\t\t\t\tstopword = stopword.replaceAll(\"[^\\\\w]\", \"\"); // Also add the variant with non-word characters stripped.\r\n\t\t\t\tstopwords.add(stopword);\r\n\t\t\t}\r\n\t\t\tstopwords.add(\"\\t\");\r\n\t\t\treader.close();\r\n\t\t\tisInitialized = true;\r\n\t\t}\r\n\t}\r\n\t\r\n\t/**\r\n\t * Check whether the input word is a stopword.\r\n\t * @param word The word to check; it is lower-cased before the lookup.\r\n\t * @return true if the lower-cased word is in the stopword set, false otherwise.\r\n\t */\r\n\tpublic static boolean isStopword(String word) {\r\n\t\treturn stopwords.contains(word.toLowerCase());\r\n\t}\r\n\t\r\n\tpublic static void main(String[] args) throws IOException {\r\n\t\t// Manual check: load the stopword list and look up one word.\r\n\t\tStopwordsFilter.init(\"lib/stopwords.txt\");\r\n\t\t\r\n\t\tSystem.out.println(isStopword(\"the\"));\r\n\t}\r\n\r\n}\r\n"
  },
  {
    "path": "src/utils/TopKPriorityQueue.java",
    "content": "package utils;\r\n\r\nimport java.io.IOException;\r\nimport java.util.ArrayList;\r\nimport java.util.Collections;\r\nimport java.util.Comparator;\r\nimport java.util.HashMap;\r\nimport java.util.Map;\r\nimport java.util.PriorityQueue;\r\nimport java.util.AbstractMap;\r\n\r\nimport data_structure.DenseVector;\r\n\r\n/**\r\n * Uses a PriorityQueue as a min-heap to select the top-K maximum entries\r\n *  (by value) of a map.\r\n * \r\n * @author HeXiangnan\r\n */\r\npublic class TopKPriorityQueue<K, V extends Comparable<? super V>> {\r\n\tpublic PriorityQueue<Map.Entry<K, V>> queue;\r\n\tprivate int K;  // Maximum size of the heap.\r\n\t\r\n\tprivate Comparator<Map.Entry<K, V>> c = new Comparator<Map.Entry<K, V>>() {\r\n\t\tpublic int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {\r\n\t\t\treturn o1.getValue().compareTo(o2.getValue());\r\n\t\t}\r\n\t};\r\n\t\r\n\tpublic TopKPriorityQueue(int maxSize) {\r\n\t\tif (maxSize <= 0) {\r\n\t\t\tthrow new IllegalArgumentException();\r\n\t\t}\r\n\t\tthis.K = maxSize;\r\n\t\tthis.queue = new PriorityQueue<Map.Entry<K, V>>(maxSize, c);\r\n\t}\r\n\r\n\tpublic void add(Map.Entry<K, V> e) {\r\n\t\tif (queue.size() < K) { // The queue is not full.\r\n\t\t\tqueue.add(e);\r\n\t\t} else { // The queue is full.\r\n\t\t\tMap.Entry<K, V> peek = queue.peek(); // Peek at the heap head, i.e. the smallest value currently kept.\r\n\t\t\tif (c.compare(e, peek) > 0) { \r\n\t\t\t\tqueue.poll();\r\n\t\t\t\tqueue.add(e);\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\t\r\n\tpublic ArrayList<Map.Entry<K, V>> toList() {\r\n\t\treturn new ArrayList<Map.Entry<K, V>>(queue);\r\n\t}\r\n\r\n\tpublic ArrayList<Map.Entry<K, V>> sortedList() {\r\n\t\tArrayList<Map.Entry<K, V>> list = new ArrayList<Map.Entry<K, V>>(queue); \r\n\t\tCollections.sort(list, c.reversed()); \r\n\t\treturn list;\r\n\t}\r\n\t\r\n\tArrayList<Integer> maxPoolingIndices(DenseVector vec, int maxPooling) {\r\n\t\tArrayList<Integer> indexList = new ArrayList<Integer>();\r\n\t\tTopKPriorityQueue<Integer, Double> q = new TopKPriorityQueue<Integer, Double>(maxPooling);\r\n\t\tfor (int i = 0; i < vec.size(); i ++) {\r\n\t\t\tq.add(new AbstractMap.SimpleEntry<Integer, Double>(i, vec.get(i)));\r\n\t\t}\r\n\t\tfor (Map.Entry<Integer, Double> e : q.toList()) {\r\n\t\t\tindexList.add(e.getKey());\r\n\t\t}\r\n\t\treturn indexList;\r\n\t}\r\n\t\r\n\tpublic static void main(String[] args) throws IOException {\r\n\t\t//Test topK selection.\r\n\t\tTopKPriorityQueue<Integer, Double> q = new TopKPriorityQueue(3);\r\n\t\tHashMap<Integer, Double> map = new HashMap<Integer, Double>();\r\n\t\tmap.put(1, 1.0);\r\n\t\tmap.put(2, 2.0);\r\n\t\tmap.put(3, 4.0);\r\n\t\tmap.put(4, 3.0);\r\n\t\t\r\n\t\tfor (Map.Entry<Integer, Double> e : map.entrySet()) {\r\n\t\t\tq.add(e);\r\n\t\t}\r\n\t\tq.add(new AbstractMap.SimpleEntry<Integer, Double>(6, 5.0));\r\n\t\tfor (Map.Entry<Integer, Double> e : q.sortedList()) {\r\n\t\t\tSystem.out.println(e.getKey() +\": \" + e.getValue());\r\n\t\t}\r\n\t\t\r\n\t\t// Test Printer.printArray and CommonUtils.ArrayToArraylist (maxPoolingIndices itself is not called here).\r\n\t\tint[] array = {3, 1, 5, 6, 2};\r\n\t\t\r\n\t\tdouble[] arr = new double[10];\r\n\t\tPrinter.printArray(arr);\r\n\t\t\r\n\t\tfor (int i : CommonUtils.ArrayToArraylist(array)) {\r\n\t\t\tSystem.out.print(i + \" \");\r\n\t\t}\r\n\t}\r\n}"
  }
]