[
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 Nishant Nikhil\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Deep-Semantic-Similarity-Model-PyTorch\nImplementation of C-DSSM(Microsoft Research Paper) described [here](http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf). A random data generator is included in the code, you can play with it or use your own data.\n\nKeras model: [airalcorn2/Deep-Semantic-Similarity-Model](https://github.com/airalcorn2/Deep-Semantic-Similarity-Model).\n\nCorresponding blog post is at: [Medium](https://medium.com/towards-data-science/pytorch-first-program-and-walk-through-ceb739134ab9)\n"
  },
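  {
    "path": "smoke_test.py",
    "content": "# smoke_test.py -- an illustrative usage sketch, not part of the original repository.\n# It shows one forward pass through CDSSM on random data; the file name and the\n# shapes below are assumptions, not anything prescribed by the paper or the repo.\n# Note: cdssm.py keeps its random-data training demo behind an __main__ guard, so\n# this import is side-effect free. With the full WORD_DEPTH (90,000) the model is\n# large; set WORD_DEPTH = 1000 in cdssm.py for a lighter run.\nimport numpy as np\nimport torch\n\nfrom cdssm import CDSSM, J, WORD_DEPTH\n\nmodel = CDSSM()\n\n# One random query (5 words), one positive and J negative documents (10 words each),\n# all in the (batch, num_words, WORD_DEPTH) layout the model expects.\nq = torch.from_numpy(np.random.rand(1, 5, WORD_DEPTH)).float()\npos = torch.from_numpy(np.random.rand(1, 10, WORD_DEPTH)).float()\nnegs = [torch.from_numpy(np.random.rand(1, 10, WORD_DEPTH)).float() for _ in range(J)]\n\n# Forward pass: one gamma-scaled similarity logit per candidate document,\n# with the positive document at index 0.\nlogits = model(q, pos, negs).view(-1)\nassert logits.shape[0] == J + 1\nprint('logits:', logits)\nprint('P(D+|Q):', torch.softmax(logits, dim=0)[0].item())\n"
  },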
  {
    "path": "cdssm.py",
    "content": "# Nishant Nikhil (i.nishantnikhil@gmail.com)\n# An implementation of the Deep Semantic Similarity Model (DSSM) found in [1].\n# [1] Shen, Y., He, X., Gao, J., Deng, L., and Mesnil, G. 2014. A latent semantic model\n#         with convolutional-pooling structure for information retrieval. In CIKM, pp. 101-110.\n#         http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf\n# [2] http://research.microsoft.com/en-us/projects/dssm/\n# [3] http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf\n\nimport torch \nimport torch.nn as nn\nimport torchvision.datasets as dsets\nimport torchvision.transforms as transforms\nfrom torch.autograd import Variable\nimport torch.nn.functional as F\n\n\nLETTER_GRAM_SIZE = 3 # See section 3.2.\nWINDOW_SIZE = 3 # See section 3.2.\nTOTAL_LETTER_GRAMS = int(3 * 1e4) # Determined from data. See section 3.2.\nWORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS # See equation (1).\n# Uncomment it, if testing\n# WORD_DEPTH = 1000\nK = 300 # Dimensionality of the max-pooling layer. See section 3.4.\nL = 128 # Dimensionality of latent semantic space. See section 3.5.\nJ = 4 # Number of random unclicked documents serving as negative examples for a query. See section 4.\nFILTER_LENGTH = 1 # We only consider one time step for convolutions.\n\n\ndef kmax_pooling(x, dim, k):\n    index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]\n    return x.gather(dim, index)\n\nclass CDSSM(nn.Module):\n    def __init__(self):\n        super(CDSSM, self).__init__()\n        # layers for query\n        self.query_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)\n        self.query_sem = nn.Linear(K, L)\n        # layers for docs\n        self.doc_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)\n        self.doc_sem = nn.Linear(K, L)\n        # learning gamma\n        self.learn_gamma = nn.Conv1d(1, 1, 1)\n    def forward(self, q, pos, negs):\n        # Query model. The paper uses separate neural nets for queries and documents (see section 5.2).\n        # To make it compatible with Conv layer we reshape it to: (batch_size, WORD_DEPTH, query_len)\n        q = q.transpose(1,2)\n        # In this step, we transform each word vector with WORD_DEPTH dimensions into its\n        # convolved representation with K dimensions. K is the number of kernels/filters\n        # being used in the operation. Essentially, the operation is taking the dot product\n        # of a single weight matrix (W_c) with each of the word vectors (l_t) from the\n        # query matrix (l_Q), adding a bias vector (b_c), and then applying the tanh activation.\n        # That is, h_Q = tanh(W_c • l_Q + b_c). Note: the paper does not include bias units.\n        q_c = F.tanh(self.query_conv(q))\n        # Next, we apply a max-pooling layer to the convolved query matrix.\n        q_k = kmax_pooling(q_c, 2, 1)\n        q_k = q_k.transpose(1,2)\n        # In this step, we generate the semantic vector represenation of the query. This\n        # is a standard neural network dense layer, i.e., y = tanh(W_s • v + b_s). 
Again,\n        # the paper does not include bias units.\n        q_s = F.tanh(self.query_sem(q_k))\n        q_s = q_s.resize(L)\n        # # The document equivalent of the above query model for positive document\n        pos = pos.transpose(1,2)\n        pos_c = F.tanh(self.doc_conv(pos))\n        pos_k = kmax_pooling(pos_c, 2, 1)\n        pos_k = pos_k.transpose(1,2)\n        pos_s = F.tanh(self.doc_sem(pos_k))\n        pos_s = pos_s.resize(L)\n        # # The document equivalent of the above query model for negative documents\n        negs = [neg.transpose(1,2) for neg in negs]\n        neg_cs = [F.tanh(self.doc_conv(neg)) for neg in negs]\n        neg_ks = [kmax_pooling(neg_c, 2, 1) for neg_c in neg_cs]\n        neg_ks = [neg_k.transpose(1,2) for neg_k in neg_ks]\n        neg_ss = [F.tanh(self.doc_sem(neg_k)) for neg_k in neg_ks]\n        neg_ss = [neg_s.resize(L) for neg_s in neg_ss]\n        # Now let us calculates the cosine similarity between the semantic representations of\n        # a queries and documents\n        # dots[0] is the dot-product for positive document, this is necessary to remember\n        # because we set the target label accordingly\n        dots = [q_s.dot(pos_s)]\n        dots = dots + [q_s.dot(neg_s) for neg_s in neg_ss]\n        # dots is a list as of now, lets convert it to torch variable\n        dots = torch.stack(dots)\n        # In this step, we multiply each dot product value by gamma. In the paper, gamma is\n        # described as a smoothing factor for the softmax function, and it's set empirically\n        # on a held-out data set. We're going to learn gamma's value by pretending it's\n        # a single 1 x 1 kernel.\n        with_gamma = self.learn_gamma(dots.resize(J+1, 1, 1))\n        # You can use the softmax function to calculate P(D+|Q), but here we return the logits for the CrossEntropyLoss\n        # prob = F.softmax(with_gamma)\n        return with_gamma\n\nmodel = CDSSM()\n\n# Build a random data set.\nimport numpy as np\nsample_size = 10\nl_Qs = []\npos_l_Ds = []\n\n(query_len, doc_len) = (5, 100)\n\nfor i in range(sample_size):\n    query_len = np.random.randint(1, 10)\n    l_Q = np.random.rand(1, query_len, WORD_DEPTH)\n    l_Qs.append(l_Q)\n    \n    doc_len = np.random.randint(50, 500)\n    l_D = np.random.rand(1, doc_len, WORD_DEPTH)\n    pos_l_Ds.append(l_D)\n\nneg_l_Ds = [[] for j in range(J)]\nfor i in range(sample_size):\n    possibilities = list(range(sample_size))\n    possibilities.remove(i)\n    negatives = np.random.choice(possibilities, J, replace = False)\n    for j in range(J):\n        negative = negatives[j]\n        neg_l_Ds[j].append(pos_l_Ds[negative])\n\n# Till now, we have made a complete numpy dataset\n# Now let's convert the numpy variables to torch Variable\n\nfor i in range(len(l_Qs)):\n    l_Qs[i] = Variable(torch.from_numpy(l_Qs[i]).float())\n    pos_l_Ds[i] = Variable(torch.from_numpy(pos_l_Ds[i]).float())\n    for j in range(J):\n        neg_l_Ds[j][i] = Variable(torch.from_numpy(neg_l_Ds[j][i]).float())\n\n\n# Loss and optimizer\ncriterion = torch.nn.CrossEntropyLoss()\noptimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)\n\n# output variable, remember the cosine similarity with positive doc was at 0th index\ny = np.ndarray(1)\n# CrossEntropyLoss expects only the index as a long tensor\ny[0] = 0\ny = Variable(torch.from_numpy(y).long())\n\nfor i in range(sample_size):\n    y_pred = model(l_Qs[i], pos_l_Ds[i], [neg_l_Ds[j][i] for j in range(J)])\n    loss = criterion(y_pred.resize(1,J+1), 
y)\n    print (i, loss.data[0])\n    optimizer.zero_grad()\n    loss.backward()\n    optimizer.step()\n"
  }
]