[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# Jupyter Notebook\n.ipynb_checkpoints\n"
  },
  {
    "path": ".travis.yml",
    "content": "language: python\npython:\n  - \"3.6\"\n  - \"3.7-dev\"\n# command to install dependencies\ninstall:\n  - pip install black==19.10b\n  - pip install -q -r requirements.txt\n  - python setup.py install\n# command to run tests\nscript:\n  - black --fast --check ptdec/.\n  - black --fast --check examples/.\n  - black --fast --check tests/.\n  - pytest --cov=ptdec tests\n  - python examples/mnist/mnist.py --cuda=False --batch-size=48 --pretrain-epochs=1 --finetune-epochs=1 --testing-mode=True\nafter_success:\n  - codecov\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2018 Vladimir Lukiyanov\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# pt-dec\n[![Build Status](https://travis-ci.org/vlukiyanov/pt-dec.svg?branch=master)](https://travis-ci.org/vlukiyanov/pt-dec) [![codecov](https://codecov.io/gh/vlukiyanov/pt-dec/branch/master/graph/badge.svg)](https://codecov.io/gh/vlukiyanov/pt-dec)\n[![Codacy Badge](https://api.codacy.com/project/badge/Grade/5877a6b3baa342c3bb2d8f4a4c94f8dd)](https://app.codacy.com/app/vlukiyanov/pt-dec?utm_source=github.com&utm_medium=referral&utm_content=vlukiyanov/pt-dec&utm_campaign=Badge_Grade_Settings)\n\nPyTorch implementation of a version of the Deep Embedded Clustering (DEC) algorithm. Compatible with PyTorch 1.0.0 and Python 3.6 or 3.7 with or without CUDA.\n\nThis follows (*or attempts to; note this implementation is unofficial*) the algorithm described in \"Unsupervised Deep Embedding for Clustering Analysis\" of Junyuan Xie, Ross Girshick, Ali Farhadi (<https://arxiv.org/abs/1511.06335>).\n\n## Examples\n\nAn example using MNIST data can be found in the `examples/mnist/mnist.py` which achieves around 85% accuracy.\n\nHere is an example [confusion matrix](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html), true labels on y-axis and predicted labels on the x-axis.\n\n![Alt text](confusion.png)\n\n## Usage\n\nThis is distributed as a Python package `ptdec` and can be installed with `python setup.py install` after installing `ptsdae` from https://github.com/vlukiyanov/pt-sdae. The PyTorch `nn.Module` class representing the DEC is `DEC` in `ptdec.dec`, while the `train` function from `ptdec.model` is used to train DEC.\n\n## Other implementations of DEC\n\n*   Original Caffe: <https://github.com/piiswrong/dec>\n*   PyTorch: <https://github.com/CharlesNord/DEC-pytorch> and <https://github.com/eelxpeng/dec-pytorch>\n*   Keras: <https://github.com/XifengGuo/DEC-keras> and <https://github.com/fferroni/DEC-Keras>\n*   MXNet: <https://github.com/apache/incubator-mxnet/blob/master/example/deep-embedded-clustering/dec.py>\n*   Chainer: <https://github.com/ymym3412/DeepEmbeddedClustering>\n"
  },
  {
    "path": "examples/mnist/mnist.py",
    "content": "import click\nimport numpy as np\nimport seaborn as sns\nfrom sklearn.metrics import confusion_matrix\nfrom torch.optim import SGD\nfrom torch.optim.lr_scheduler import StepLR\nimport torch\nfrom torch.utils.data import Dataset\nfrom torchvision import transforms\nfrom torchvision.datasets import MNIST\nfrom tensorboardX import SummaryWriter\nimport uuid\n\nfrom ptdec.dec import DEC\nfrom ptdec.model import train, predict\nfrom ptsdae.sdae import StackedDenoisingAutoEncoder\nimport ptsdae.model as ae\nfrom ptdec.utils import cluster_accuracy\n\n\nclass CachedMNIST(Dataset):\n    def __init__(self, train, cuda, testing_mode=False):\n        img_transform = transforms.Compose([transforms.Lambda(self._transformation)])\n        self.ds = MNIST(\"./data\", download=True, train=train, transform=img_transform)\n        self.cuda = cuda\n        self.testing_mode = testing_mode\n        self._cache = dict()\n\n    @staticmethod\n    def _transformation(img):\n        return (\n            torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())).float()\n            * 0.02\n        )\n\n    def __getitem__(self, index: int) -> torch.Tensor:\n        if index not in self._cache:\n            self._cache[index] = list(self.ds[index])\n            if self.cuda:\n                self._cache[index][0] = self._cache[index][0].cuda(non_blocking=True)\n                self._cache[index][1] = torch.tensor(\n                    self._cache[index][1], dtype=torch.long\n                ).cuda(non_blocking=True)\n        return self._cache[index]\n\n    def __len__(self) -> int:\n        return 128 if self.testing_mode else len(self.ds)\n\n\n@click.command()\n@click.option(\n    \"--cuda\", help=\"whether to use CUDA (default False).\", type=bool, default=False\n)\n@click.option(\n    \"--batch-size\", help=\"training batch size (default 256).\", type=int, default=256\n)\n@click.option(\n    \"--pretrain-epochs\",\n    help=\"number of pretraining epochs (default 300).\",\n    type=int,\n    default=300,\n)\n@click.option(\n    \"--finetune-epochs\",\n    help=\"number of finetune epochs (default 500).\",\n    type=int,\n    default=500,\n)\n@click.option(\n    \"--testing-mode\",\n    help=\"whether to run in testing mode (default False).\",\n    type=bool,\n    default=False,\n)\ndef main(cuda, batch_size, pretrain_epochs, finetune_epochs, testing_mode):\n    writer = SummaryWriter()  # create the TensorBoard object\n    # callback function to call during training, uses writer from the scope\n\n    def training_callback(epoch, lr, loss, validation_loss):\n        writer.add_scalars(\n            \"data/autoencoder\",\n            {\"lr\": lr, \"loss\": loss, \"validation_loss\": validation_loss,},\n            epoch,\n        )\n\n    ds_train = CachedMNIST(\n        train=True, cuda=cuda, testing_mode=testing_mode\n    )  # training dataset\n    ds_val = CachedMNIST(\n        train=False, cuda=cuda, testing_mode=testing_mode\n    )  # evaluation dataset\n    autoencoder = StackedDenoisingAutoEncoder(\n        [28 * 28, 500, 500, 2000, 10], final_activation=None\n    )\n    if cuda:\n        autoencoder.cuda()\n    print(\"Pretraining stage.\")\n    ae.pretrain(\n        ds_train,\n        autoencoder,\n        cuda=cuda,\n        validation=ds_val,\n        epochs=pretrain_epochs,\n        batch_size=batch_size,\n        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),\n        scheduler=lambda x: StepLR(x, 100, gamma=0.1),\n        corruption=0.2,\n    )\n    print(\"Training stage.\")\n    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)\n    ae.train(\n        ds_train,\n        autoencoder,\n        cuda=cuda,\n        validation=ds_val,\n        epochs=finetune_epochs,\n        batch_size=batch_size,\n        optimizer=ae_optimizer,\n        scheduler=StepLR(ae_optimizer, 100, gamma=0.1),\n        corruption=0.2,\n        update_callback=training_callback,\n    )\n    print(\"DEC stage.\")\n    model = DEC(cluster_number=10, hidden_dimension=10, encoder=autoencoder.encoder)\n    if cuda:\n        model.cuda()\n    dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)\n    train(\n        dataset=ds_train,\n        model=model,\n        epochs=100,\n        batch_size=256,\n        optimizer=dec_optimizer,\n        stopping_delta=0.000001,\n        cuda=cuda,\n    )\n    predicted, actual = predict(\n        ds_train, model, 1024, silent=True, return_actual=True, cuda=cuda\n    )\n    actual = actual.cpu().numpy()\n    predicted = predicted.cpu().numpy()\n    reassignment, accuracy = cluster_accuracy(actual, predicted)\n    print(\"Final DEC accuracy: %s\" % accuracy)\n    if not testing_mode:\n        predicted_reassigned = [\n            reassignment[item] for item in predicted\n        ]  # TODO numpify\n        confusion = confusion_matrix(actual, predicted_reassigned)\n        normalised_confusion = (\n            confusion.astype(\"float\") / confusion.sum(axis=1)[:, np.newaxis]\n        )\n        confusion_id = uuid.uuid4().hex\n        sns.heatmap(normalised_confusion).get_figure().savefig(\n            \"confusion_%s.png\" % confusion_id\n        )\n        print(\"Writing out confusion diagram with UUID: %s\" % confusion_id)\n        writer.close()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "ptdec/__init__.py",
    "content": ""
  },
  {
    "path": "ptdec/cluster.py",
    "content": "import torch\nimport torch.nn as nn\nfrom torch.nn import Parameter\nfrom typing import Optional\n\n\nclass ClusterAssignment(nn.Module):\n    def __init__(\n        self,\n        cluster_number: int,\n        embedding_dimension: int,\n        alpha: float = 1.0,\n        cluster_centers: Optional[torch.Tensor] = None,\n    ) -> None:\n        \"\"\"\n        Module to handle the soft assignment, for a description see in 3.1.1. in Xie/Girshick/Farhadi,\n        where the Student's t-distribution is used measure similarity between feature vector and each\n        cluster centroid.\n\n        :param cluster_number: number of clusters\n        :param embedding_dimension: embedding dimension of feature vectors\n        :param alpha: parameter representing the degrees of freedom in the t-distribution, default 1.0\n        :param cluster_centers: clusters centers to initialise, if None then use Xavier uniform\n        \"\"\"\n        super(ClusterAssignment, self).__init__()\n        self.embedding_dimension = embedding_dimension\n        self.cluster_number = cluster_number\n        self.alpha = alpha\n        if cluster_centers is None:\n            initial_cluster_centers = torch.zeros(\n                self.cluster_number, self.embedding_dimension, dtype=torch.float\n            )\n            nn.init.xavier_uniform_(initial_cluster_centers)\n        else:\n            initial_cluster_centers = cluster_centers\n        self.cluster_centers = Parameter(initial_cluster_centers)\n\n    def forward(self, batch: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Compute the soft assignment for a batch of feature vectors, returning a batch of assignments\n        for each cluster.\n\n        :param batch: FloatTensor of [batch size, embedding dimension]\n        :return: FloatTensor [batch size, number of clusters]\n        \"\"\"\n        norm_squared = torch.sum((batch.unsqueeze(1) - self.cluster_centers) ** 2, 2)\n        numerator = 1.0 / (1.0 + (norm_squared / self.alpha))\n        power = float(self.alpha + 1) / 2\n        numerator = numerator ** power\n        return numerator / torch.sum(numerator, dim=1, keepdim=True)\n"
  },
  {
    "path": "ptdec/dec.py",
    "content": "import torch\nimport torch.nn as nn\n\nfrom ptdec.cluster import ClusterAssignment\n\n\nclass DEC(nn.Module):\n    def __init__(\n        self,\n        cluster_number: int,\n        hidden_dimension: int,\n        encoder: torch.nn.Module,\n        alpha: float = 1.0,\n    ):\n        \"\"\"\n        Module which holds all the moving parts of the DEC algorithm, as described in\n        Xie/Girshick/Farhadi; this includes the AutoEncoder stage and the ClusterAssignment stage.\n\n        :param cluster_number: number of clusters\n        :param hidden_dimension: hidden dimension, output of the encoder\n        :param encoder: encoder to use\n        :param alpha: parameter representing the degrees of freedom in the t-distribution, default 1.0\n        \"\"\"\n        super(DEC, self).__init__()\n        self.encoder = encoder\n        self.hidden_dimension = hidden_dimension\n        self.cluster_number = cluster_number\n        self.alpha = alpha\n        self.assignment = ClusterAssignment(\n            cluster_number, self.hidden_dimension, alpha\n        )\n\n    def forward(self, batch: torch.Tensor) -> torch.Tensor:\n        \"\"\"\n        Compute the cluster assignment using the ClusterAssignment after running the batch\n        through the encoder part of the associated AutoEncoder module.\n\n        :param batch: [batch size, embedding dimension] FloatTensor\n        :return: [batch size, number of clusters] FloatTensor\n        \"\"\"\n        return self.assignment(self.encoder(batch))\n"
  },
  {
    "path": "ptdec/model.py",
    "content": "import numpy as np\nfrom sklearn.cluster import KMeans\nimport torch\nimport torch.nn as nn\nfrom torch.utils.data.dataloader import DataLoader, default_collate\nfrom typing import Tuple, Callable, Optional, Union\nfrom tqdm import tqdm\n\nfrom ptdec.utils import target_distribution, cluster_accuracy\n\n\ndef train(\n    dataset: torch.utils.data.Dataset,\n    model: torch.nn.Module,\n    epochs: int,\n    batch_size: int,\n    optimizer: torch.optim.Optimizer,\n    stopping_delta: Optional[float] = None,\n    collate_fn=default_collate,\n    cuda: bool = True,\n    sampler: Optional[torch.utils.data.sampler.Sampler] = None,\n    silent: bool = False,\n    update_freq: int = 10,\n    evaluate_batch_size: int = 1024,\n    update_callback: Optional[Callable[[float, float], None]] = None,\n    epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None,\n) -> None:\n    \"\"\"\n    Train the DEC model given a dataset, a model instance and various configuration parameters.\n\n    :param dataset: instance of Dataset to use for training\n    :param model: instance of DEC model to train\n    :param epochs: number of training epochs\n    :param batch_size: size of the batch to train with\n    :param optimizer: instance of optimizer to use\n    :param stopping_delta: label delta as a proportion to use for stopping, None to disable, default None\n    :param collate_fn: function to merge a list of samples into mini-batch\n    :param cuda: whether to use CUDA, defaults to True\n    :param sampler: optional sampler to use in the DataLoader, defaults to None\n    :param silent: set to True to prevent printing out summary statistics, defaults to False\n    :param update_freq: frequency of batches with which to update counter, None disables, default 10\n    :param evaluate_batch_size: batch size for evaluation stage, default 1024\n    :param update_callback: optional function of accuracy and loss to update, default None\n    :param epoch_callback: optional function of epoch and model, default None\n    :return: None\n    \"\"\"\n    static_dataloader = DataLoader(\n        dataset,\n        batch_size=batch_size,\n        collate_fn=collate_fn,\n        pin_memory=False,\n        sampler=sampler,\n        shuffle=False,\n    )\n    train_dataloader = DataLoader(\n        dataset,\n        batch_size=batch_size,\n        collate_fn=collate_fn,\n        sampler=sampler,\n        shuffle=True,\n    )\n    data_iterator = tqdm(\n        static_dataloader,\n        leave=True,\n        unit=\"batch\",\n        postfix={\n            \"epo\": -1,\n            \"acc\": \"%.4f\" % 0.0,\n            \"lss\": \"%.8f\" % 0.0,\n            \"dlb\": \"%.4f\" % -1,\n        },\n        disable=silent,\n    )\n    kmeans = KMeans(n_clusters=model.cluster_number, n_init=20)\n    model.train()\n    features = []\n    actual = []\n    # form initial cluster centres\n    for index, batch in enumerate(data_iterator):\n        if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2:\n            batch, value = batch  # if we have a prediction label, separate it to actual\n            actual.append(value)\n        if cuda:\n            batch = batch.cuda(non_blocking=True)\n        features.append(model.encoder(batch).detach().cpu())\n    actual = torch.cat(actual).long()\n    predicted = kmeans.fit_predict(torch.cat(features).numpy())\n    predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long)\n    _, accuracy = cluster_accuracy(predicted, actual.cpu().numpy())\n    cluster_centers = torch.tensor(\n        kmeans.cluster_centers_, dtype=torch.float, requires_grad=True\n    )\n    if cuda:\n        cluster_centers = cluster_centers.cuda(non_blocking=True)\n    with torch.no_grad():\n        # initialise the cluster centers\n        model.state_dict()[\"assignment.cluster_centers\"].copy_(cluster_centers)\n    loss_function = nn.KLDivLoss(size_average=False)\n    delta_label = None\n    for epoch in range(epochs):\n        features = []\n        data_iterator = tqdm(\n            train_dataloader,\n            leave=True,\n            unit=\"batch\",\n            postfix={\n                \"epo\": epoch,\n                \"acc\": \"%.4f\" % (accuracy or 0.0),\n                \"lss\": \"%.8f\" % 0.0,\n                \"dlb\": \"%.4f\" % (delta_label or 0.0),\n            },\n            disable=silent,\n        )\n        model.train()\n        for index, batch in enumerate(data_iterator):\n            if (isinstance(batch, tuple) or isinstance(batch, list)) and len(\n                batch\n            ) == 2:\n                batch, _ = batch  # if we have a prediction label, strip it away\n            if cuda:\n                batch = batch.cuda(non_blocking=True)\n            output = model(batch)\n            target = target_distribution(output).detach()\n            loss = loss_function(output.log(), target) / output.shape[0]\n            data_iterator.set_postfix(\n                epo=epoch,\n                acc=\"%.4f\" % (accuracy or 0.0),\n                lss=\"%.8f\" % float(loss.item()),\n                dlb=\"%.4f\" % (delta_label or 0.0),\n            )\n            optimizer.zero_grad()\n            loss.backward()\n            optimizer.step(closure=None)\n            features.append(model.encoder(batch).detach().cpu())\n            if update_freq is not None and index % update_freq == 0:\n                loss_value = float(loss.item())\n                data_iterator.set_postfix(\n                    epo=epoch,\n                    acc=\"%.4f\" % (accuracy or 0.0),\n                    lss=\"%.8f\" % loss_value,\n                    dlb=\"%.4f\" % (delta_label or 0.0),\n                )\n                if update_callback is not None:\n                    update_callback(accuracy, loss_value, delta_label)\n        predicted, actual = predict(\n            dataset,\n            model,\n            batch_size=evaluate_batch_size,\n            collate_fn=collate_fn,\n            silent=True,\n            return_actual=True,\n            cuda=cuda,\n        )\n        delta_label = (\n            float((predicted != predicted_previous).float().sum().item())\n            / predicted_previous.shape[0]\n        )\n        if stopping_delta is not None and delta_label < stopping_delta:\n            print(\n                'Early stopping as label delta \"%1.5f\" less than \"%1.5f\".'\n                % (delta_label, stopping_delta)\n            )\n            break\n        predicted_previous = predicted\n        _, accuracy = cluster_accuracy(predicted.cpu().numpy(), actual.cpu().numpy())\n        data_iterator.set_postfix(\n            epo=epoch,\n            acc=\"%.4f\" % (accuracy or 0.0),\n            lss=\"%.8f\" % 0.0,\n            dlb=\"%.4f\" % (delta_label or 0.0),\n        )\n        if epoch_callback is not None:\n            epoch_callback(epoch, model)\n\n\ndef predict(\n    dataset: torch.utils.data.Dataset,\n    model: torch.nn.Module,\n    batch_size: int = 1024,\n    collate_fn=default_collate,\n    cuda: bool = True,\n    silent: bool = False,\n    return_actual: bool = False,\n) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:\n    \"\"\"\n    Predict clusters for a dataset given a DEC model instance and various configuration parameters.\n\n    :param dataset: instance of Dataset to use for training\n    :param model: instance of DEC model to predict\n    :param batch_size: size of the batch to predict with, default 1024\n    :param collate_fn: function to merge a list of samples into mini-batch\n    :param cuda: whether CUDA is used, defaults to True\n    :param silent: set to True to prevent printing out summary statistics, defaults to False\n    :param return_actual: return actual values, if present in the Dataset\n    :return: tuple of prediction and actual if return_actual is True otherwise prediction\n    \"\"\"\n    dataloader = DataLoader(\n        dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False\n    )\n    data_iterator = tqdm(dataloader, leave=True, unit=\"batch\", disable=silent,)\n    features = []\n    actual = []\n    model.eval()\n    for batch in data_iterator:\n        if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2:\n            batch, value = batch  # unpack if we have a prediction label\n            if return_actual:\n                actual.append(value)\n        elif return_actual:\n            raise ValueError(\n                \"Dataset has no actual value to unpack, but return_actual is set.\"\n            )\n        if cuda:\n            batch = batch.cuda(non_blocking=True)\n        features.append(\n            model(batch).detach().cpu()\n        )  # move to the CPU to prevent out of memory on the GPU\n    if return_actual:\n        return torch.cat(features).max(1)[1], torch.cat(actual).long()\n    else:\n        return torch.cat(features).max(1)[1]\n"
  },
  {
    "path": "ptdec/utils.py",
    "content": "import numpy as np\nimport torch\nfrom typing import Optional\nfrom scipy.optimize import linear_sum_assignment\n\n\ndef cluster_accuracy(y_true, y_predicted, cluster_number: Optional[int] = None):\n    \"\"\"\n    Calculate clustering accuracy after using the linear_sum_assignment function in SciPy to\n    determine reassignments.\n\n    :param y_true: list of true cluster numbers, an integer array 0-indexed\n    :param y_predicted: list  of predicted cluster numbers, an integer array 0-indexed\n    :param cluster_number: number of clusters, if None then calculated from input\n    :return: reassignment dictionary, clustering accuracy\n    \"\"\"\n    if cluster_number is None:\n        cluster_number = (\n            max(y_predicted.max(), y_true.max()) + 1\n        )  # assume labels are 0-indexed\n    count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64)\n    for i in range(y_predicted.size):\n        count_matrix[y_predicted[i], y_true[i]] += 1\n\n    row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix)\n    reassignment = dict(zip(row_ind, col_ind))\n    accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size\n    return reassignment, accuracy\n\n\ndef target_distribution(batch: torch.Tensor) -> torch.Tensor:\n    \"\"\"\n    Compute the target distribution p_ij, given the batch (q_ij), as in 3.1.3 Equation 3 of\n    Xie/Girshick/Farhadi; this is used the KL-divergence loss function.\n\n    :param batch: [batch size, number of clusters] Tensor of dtype float\n    :return: [batch size, number of clusters] Tensor of dtype float\n    \"\"\"\n    weight = (batch ** 2) / torch.sum(batch, 0)\n    return (weight.t() / torch.sum(weight, 1)).t()\n"
  },
  {
    "path": "requirements.txt",
    "content": "numpy>=1.13.3\ntorch>=0.4.0\nscipy>=1.0.0\npandas>=0.21.0\nvisdom>=0.1.05\nclick>=6.7\nxlrd>=1.0.0\ncytoolz>=0.9.0.1\ntqdm>=4.11.2\nscikit-learn>=0.19.1\nflake8>=3.6.0\ntensorboardX>=1.2\nsetuptools>=40.2.0\ntorchvision>=0.2.1\nseaborn>=0.9.0\npytest>=3.8.0\npytest-cov>=2.6.0\ncodecov>=2.0.15\n-e git://github.com/vlukiyanov/pt-sdae.git#egg=ptsdae\n"
  },
  {
    "path": "setup.cfg",
    "content": "[aliases]\ntest=pytest\n"
  },
  {
    "path": "setup.py",
    "content": "from setuptools import setup\n\n\nsetup(\n    name=\"ptdec\",\n    version=\"1.0\",\n    description=\"PyTorch implementation of DEC.\",\n    author=\"Vladimir Lukiyanov\",\n    author_email=\"vladimir.lukiyanov@me.com\",\n    url=\"https://github.com/vlukiyanov/pt-dec/\",\n    download_url=\"\",\n    license=\"MIT\",\n    setup_requires=[\"pytest-runner\"],\n    tests_require=[\"pytest\"],\n    install_requires=[\n        \"numpy>=1.13.3\",\n        \"torch>=0.4.0\",\n        \"scipy>=1.0.0\",\n        \"pandas>=0.21.0\",\n        \"visdom>=0.1.05\",\n        \"click>=6.7\",\n        \"xlrd>=1.0.0\",\n        \"cytoolz>=0.9.0.1\",\n        \"tqdm>=4.11.2\",\n        \"scikit-learn>=0.19.1\",\n        \"ptsdae>=1.0.0\",\n    ],\n    packages=[\"ptdec\"],\n)\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/test_cluster.py",
    "content": "import torch\nfrom unittest import TestCase\n\nfrom ptdec.cluster import ClusterAssignment\n\n\nclass TestClusterAssignment(TestCase):\n    @classmethod\n    def setUpClass(cls):\n        cls.ca = ClusterAssignment(\n            cluster_number=2,\n            embedding_dimension=2,\n            cluster_centers=torch.Tensor([[-1, -1], [1, 1]]).float(),\n        )\n\n    def test_forward(self):\n        \"\"\"\n        Basic test to check that the calculation is equivalent to the one in the paper.\n        \"\"\"\n        test_tensor = torch.Tensor([-2, -2]).float().unsqueeze(0)\n        den = float(1) / 3 + float(1) / 19\n        gold = torch.Tensor([(float(1) / 3) / den, (float(1) / 19) / den])\n        output = self.ca(test_tensor).data\n        self.assertAlmostEqual((gold - output).numpy()[0][0], 0.0)\n        self.assertAlmostEqual((gold - output).numpy()[0][1], 0.0)\n"
  },
  {
    "path": "tests/test_dec.py",
    "content": "import torch\nfrom torch.autograd import Variable\nfrom unittest import TestCase\n\nfrom ptsdae.sdae import StackedDenoisingAutoEncoder\nfrom ptdec.dec import DEC\n\n\nclass TestAutoEncoder(TestCase):\n    @classmethod\n    def setUpClass(cls):\n        cls.ae = StackedDenoisingAutoEncoder([100, 50, 5])\n        cls.dec = DEC(2, 5, cls.ae.encoder)\n\n    def test_dimension(self):\n        \"\"\"\n        Basic tests that check that given an input tensor the output and encoded tensors are of the\n        expected size.\n        \"\"\"\n        input_tensor = Variable(torch.Tensor(1, 100).fill_(1.0))\n        output_tensor = self.dec(input_tensor)\n        self.assertEqual(tuple(output_tensor.size()), (1, 2))\n"
  },
  {
    "path": "tests/test_model.py",
    "content": "from ptdec.model import predict, train\nimport torch\nfrom torch.utils.data import TensorDataset\nfrom unittest.mock import MagicMock, Mock\n\n\ndef test_train_with_prediction():\n    model = Mock()\n    model.return_value = torch.zeros(100, 100).requires_grad_()\n    model.cluster_number = 10\n    model.encoder.return_value = torch.zeros(100, 100)\n    model.state_dict.return_value = MagicMock()\n    optimizer = Mock()\n    dataset = TensorDataset(torch.zeros(100, 100), torch.zeros(100, 1))\n    train(\n        dataset=dataset,\n        model=model,\n        epochs=1,\n        batch_size=100,\n        optimizer=optimizer,\n        cuda=False,\n    )\n    assert model.call_count == 2\n\n\ndef test_predict():\n    autoencoder = Mock()\n    autoencoder.return_value = torch.zeros(10, 100)\n    dataset = TensorDataset(torch.zeros(100, 100), torch.zeros(100, 1))\n    output = predict(dataset, autoencoder, batch_size=10, cuda=False)\n    assert autoencoder.call_count == 10\n    assert output.shape == (100,)\n"
  },
  {
    "path": "tests/test_utils.py",
    "content": "import numpy as np\nimport torch\nfrom unittest import TestCase\n\nfrom ptdec.utils import cluster_accuracy, target_distribution\n\n\nclass TestClusterAccuracy(TestCase):\n    def test_basic(self):\n        \"\"\"\n        Basic test to check that the calculation is sensible.\n        \"\"\"\n        true_value1 = np.array([1, 2, 1, 2, 0, 0], dtype=np.int64)\n        pred_value1 = np.array([2, 1, 2, 1, 0, 0], dtype=np.int64)\n        self.assertAlmostEqual(cluster_accuracy(true_value1, pred_value1)[1], 1.0)\n        self.assertAlmostEqual(cluster_accuracy(true_value1, pred_value1, 3)[1], 1.0)\n        self.assertDictEqual(\n            cluster_accuracy(true_value1, pred_value1)[0], {0: 0, 1: 2, 2: 1}\n        )\n        true_value2 = np.array([1, 1, 1, 1, 1, 1], dtype=np.int64)\n        pred_value2 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int64)\n        self.assertAlmostEqual(cluster_accuracy(true_value2, pred_value2)[1], 1.0 / 6.0)\n        self.assertAlmostEqual(\n            cluster_accuracy(true_value2, pred_value2, 6)[1], 1.0 / 6.0\n        )\n        true_value3 = np.array([1, 3, 1, 3, 0, 2], dtype=np.int64)\n        pred_value3 = np.array([2, 1, 2, 1, 3, 0], dtype=np.int64)\n        self.assertDictEqual(\n            cluster_accuracy(true_value3, pred_value3)[0], {2: 1, 1: 3, 3: 0, 0: 2}\n        )\n\n\nclass TestTargetDistribution(TestCase):\n    def test_basic(self):\n        \"\"\"\n        Basic test to check that the calculation is sensible and conforms to the formula.\n        \"\"\"\n        test_tensor = torch.Tensor([[0.5, 0.5], [0.0, 1.0]])\n        output = target_distribution(test_tensor)\n        self.assertAlmostEqual(tuple(output[0]), (0.75, 0.25))\n        self.assertAlmostEqual(tuple(output[1]), (0.0, 1.0))\n"
  }
]