Repository: vlukiyanov/pt-dec Branch: master Commit: 11b30553858c Files: 18 Total size: 27.6 KB Directory structure: gitextract_rmfv6y78/ ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── examples/ │ └── mnist/ │ └── mnist.py ├── ptdec/ │ ├── __init__.py │ ├── cluster.py │ ├── dec.py │ ├── model.py │ └── utils.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests/ ├── __init__.py ├── test_cluster.py ├── test_dec.py ├── test_model.py └── test_utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # Jupyter Notebook .ipynb_checkpoints ================================================ FILE: .travis.yml ================================================ language: python python: - "3.6" - "3.7-dev" # command to install dependencies install: - pip install black==19.10b - pip install -q -r requirements.txt - python setup.py install # command to run tests script: - black --fast --check ptdec/. - black --fast --check examples/. - black --fast --check tests/. - pytest --cov=ptdec tests - python examples/mnist/mnist.py --cuda=False --batch-size=48 --pretrain-epochs=1 --finetune-epochs=1 --testing-mode=True after_success: - codecov ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 Vladimir Lukiyanov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # pt-dec [![Build Status](https://travis-ci.org/vlukiyanov/pt-dec.svg?branch=master)](https://travis-ci.org/vlukiyanov/pt-dec) [![codecov](https://codecov.io/gh/vlukiyanov/pt-dec/branch/master/graph/badge.svg)](https://codecov.io/gh/vlukiyanov/pt-dec) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/5877a6b3baa342c3bb2d8f4a4c94f8dd)](https://app.codacy.com/app/vlukiyanov/pt-dec?utm_source=github.com&utm_medium=referral&utm_content=vlukiyanov/pt-dec&utm_campaign=Badge_Grade_Settings) PyTorch implementation of a version of the Deep Embedded Clustering (DEC) algorithm. Compatible with PyTorch 1.0.0 and Python 3.6 or 3.7 with or without CUDA. This follows (*or attempts to; note this implementation is unofficial*) the algorithm described in "Unsupervised Deep Embedding for Clustering Analysis" of Junyuan Xie, Ross Girshick, Ali Farhadi (). ## Examples An example using MNIST data can be found in the `examples/mnist/mnist.py` which achieves around 85% accuracy. Here is an example [confusion matrix](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html), true labels on y-axis and predicted labels on the x-axis. ![Alt text](confusion.png) ## Usage This is distributed as a Python package `ptdec` and can be installed with `python setup.py install` after installing `ptsdae` from https://github.com/vlukiyanov/pt-sdae. The PyTorch `nn.Module` class representing the DEC is `DEC` in `ptdec.dec`, while the `train` function from `ptdec.model` is used to train DEC. ## Other implementations of DEC * Original Caffe: * PyTorch: and * Keras: and * MXNet: * Chainer: ================================================ FILE: examples/mnist/mnist.py ================================================ import click import numpy as np import seaborn as sns from sklearn.metrics import confusion_matrix from torch.optim import SGD from torch.optim.lr_scheduler import StepLR import torch from torch.utils.data import Dataset from torchvision import transforms from torchvision.datasets import MNIST from tensorboardX import SummaryWriter import uuid from ptdec.dec import DEC from ptdec.model import train, predict from ptsdae.sdae import StackedDenoisingAutoEncoder import ptsdae.model as ae from ptdec.utils import cluster_accuracy class CachedMNIST(Dataset): def __init__(self, train, cuda, testing_mode=False): img_transform = transforms.Compose([transforms.Lambda(self._transformation)]) self.ds = MNIST("./data", download=True, train=train, transform=img_transform) self.cuda = cuda self.testing_mode = testing_mode self._cache = dict() @staticmethod def _transformation(img): return ( torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())).float() * 0.02 ) def __getitem__(self, index: int) -> torch.Tensor: if index not in self._cache: self._cache[index] = list(self.ds[index]) if self.cuda: self._cache[index][0] = self._cache[index][0].cuda(non_blocking=True) self._cache[index][1] = torch.tensor( self._cache[index][1], dtype=torch.long ).cuda(non_blocking=True) return self._cache[index] def __len__(self) -> int: return 128 if self.testing_mode else len(self.ds) @click.command() @click.option( "--cuda", help="whether to use CUDA (default False).", type=bool, default=False ) @click.option( "--batch-size", help="training batch size (default 256).", type=int, default=256 ) @click.option( "--pretrain-epochs", help="number of pretraining epochs (default 300).", type=int, default=300, ) @click.option( "--finetune-epochs", help="number of finetune epochs (default 500).", type=int, default=500, ) @click.option( "--testing-mode", help="whether to run in testing mode (default False).", type=bool, default=False, ) def main(cuda, batch_size, pretrain_epochs, finetune_epochs, testing_mode): writer = SummaryWriter() # create the TensorBoard object # callback function to call during training, uses writer from the scope def training_callback(epoch, lr, loss, validation_loss): writer.add_scalars( "data/autoencoder", {"lr": lr, "loss": loss, "validation_loss": validation_loss,}, epoch, ) ds_train = CachedMNIST( train=True, cuda=cuda, testing_mode=testing_mode ) # training dataset ds_val = CachedMNIST( train=False, cuda=cuda, testing_mode=testing_mode ) # evaluation dataset autoencoder = StackedDenoisingAutoEncoder( [28 * 28, 500, 500, 2000, 10], final_activation=None ) if cuda: autoencoder.cuda() print("Pretraining stage.") ae.pretrain( ds_train, autoencoder, cuda=cuda, validation=ds_val, epochs=pretrain_epochs, batch_size=batch_size, optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9), scheduler=lambda x: StepLR(x, 100, gamma=0.1), corruption=0.2, ) print("Training stage.") ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9) ae.train( ds_train, autoencoder, cuda=cuda, validation=ds_val, epochs=finetune_epochs, batch_size=batch_size, optimizer=ae_optimizer, scheduler=StepLR(ae_optimizer, 100, gamma=0.1), corruption=0.2, update_callback=training_callback, ) print("DEC stage.") model = DEC(cluster_number=10, hidden_dimension=10, encoder=autoencoder.encoder) if cuda: model.cuda() dec_optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) train( dataset=ds_train, model=model, epochs=100, batch_size=256, optimizer=dec_optimizer, stopping_delta=0.000001, cuda=cuda, ) predicted, actual = predict( ds_train, model, 1024, silent=True, return_actual=True, cuda=cuda ) actual = actual.cpu().numpy() predicted = predicted.cpu().numpy() reassignment, accuracy = cluster_accuracy(actual, predicted) print("Final DEC accuracy: %s" % accuracy) if not testing_mode: predicted_reassigned = [ reassignment[item] for item in predicted ] # TODO numpify confusion = confusion_matrix(actual, predicted_reassigned) normalised_confusion = ( confusion.astype("float") / confusion.sum(axis=1)[:, np.newaxis] ) confusion_id = uuid.uuid4().hex sns.heatmap(normalised_confusion).get_figure().savefig( "confusion_%s.png" % confusion_id ) print("Writing out confusion diagram with UUID: %s" % confusion_id) writer.close() if __name__ == "__main__": main() ================================================ FILE: ptdec/__init__.py ================================================ ================================================ FILE: ptdec/cluster.py ================================================ import torch import torch.nn as nn from torch.nn import Parameter from typing import Optional class ClusterAssignment(nn.Module): def __init__( self, cluster_number: int, embedding_dimension: int, alpha: float = 1.0, cluster_centers: Optional[torch.Tensor] = None, ) -> None: """ Module to handle the soft assignment, for a description see in 3.1.1. in Xie/Girshick/Farhadi, where the Student's t-distribution is used measure similarity between feature vector and each cluster centroid. :param cluster_number: number of clusters :param embedding_dimension: embedding dimension of feature vectors :param alpha: parameter representing the degrees of freedom in the t-distribution, default 1.0 :param cluster_centers: clusters centers to initialise, if None then use Xavier uniform """ super(ClusterAssignment, self).__init__() self.embedding_dimension = embedding_dimension self.cluster_number = cluster_number self.alpha = alpha if cluster_centers is None: initial_cluster_centers = torch.zeros( self.cluster_number, self.embedding_dimension, dtype=torch.float ) nn.init.xavier_uniform_(initial_cluster_centers) else: initial_cluster_centers = cluster_centers self.cluster_centers = Parameter(initial_cluster_centers) def forward(self, batch: torch.Tensor) -> torch.Tensor: """ Compute the soft assignment for a batch of feature vectors, returning a batch of assignments for each cluster. :param batch: FloatTensor of [batch size, embedding dimension] :return: FloatTensor [batch size, number of clusters] """ norm_squared = torch.sum((batch.unsqueeze(1) - self.cluster_centers) ** 2, 2) numerator = 1.0 / (1.0 + (norm_squared / self.alpha)) power = float(self.alpha + 1) / 2 numerator = numerator ** power return numerator / torch.sum(numerator, dim=1, keepdim=True) ================================================ FILE: ptdec/dec.py ================================================ import torch import torch.nn as nn from ptdec.cluster import ClusterAssignment class DEC(nn.Module): def __init__( self, cluster_number: int, hidden_dimension: int, encoder: torch.nn.Module, alpha: float = 1.0, ): """ Module which holds all the moving parts of the DEC algorithm, as described in Xie/Girshick/Farhadi; this includes the AutoEncoder stage and the ClusterAssignment stage. :param cluster_number: number of clusters :param hidden_dimension: hidden dimension, output of the encoder :param encoder: encoder to use :param alpha: parameter representing the degrees of freedom in the t-distribution, default 1.0 """ super(DEC, self).__init__() self.encoder = encoder self.hidden_dimension = hidden_dimension self.cluster_number = cluster_number self.alpha = alpha self.assignment = ClusterAssignment( cluster_number, self.hidden_dimension, alpha ) def forward(self, batch: torch.Tensor) -> torch.Tensor: """ Compute the cluster assignment using the ClusterAssignment after running the batch through the encoder part of the associated AutoEncoder module. :param batch: [batch size, embedding dimension] FloatTensor :return: [batch size, number of clusters] FloatTensor """ return self.assignment(self.encoder(batch)) ================================================ FILE: ptdec/model.py ================================================ import numpy as np from sklearn.cluster import KMeans import torch import torch.nn as nn from torch.utils.data.dataloader import DataLoader, default_collate from typing import Tuple, Callable, Optional, Union from tqdm import tqdm from ptdec.utils import target_distribution, cluster_accuracy def train( dataset: torch.utils.data.Dataset, model: torch.nn.Module, epochs: int, batch_size: int, optimizer: torch.optim.Optimizer, stopping_delta: Optional[float] = None, collate_fn=default_collate, cuda: bool = True, sampler: Optional[torch.utils.data.sampler.Sampler] = None, silent: bool = False, update_freq: int = 10, evaluate_batch_size: int = 1024, update_callback: Optional[Callable[[float, float], None]] = None, epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None, ) -> None: """ Train the DEC model given a dataset, a model instance and various configuration parameters. :param dataset: instance of Dataset to use for training :param model: instance of DEC model to train :param epochs: number of training epochs :param batch_size: size of the batch to train with :param optimizer: instance of optimizer to use :param stopping_delta: label delta as a proportion to use for stopping, None to disable, default None :param collate_fn: function to merge a list of samples into mini-batch :param cuda: whether to use CUDA, defaults to True :param sampler: optional sampler to use in the DataLoader, defaults to None :param silent: set to True to prevent printing out summary statistics, defaults to False :param update_freq: frequency of batches with which to update counter, None disables, default 10 :param evaluate_batch_size: batch size for evaluation stage, default 1024 :param update_callback: optional function of accuracy and loss to update, default None :param epoch_callback: optional function of epoch and model, default None :return: None """ static_dataloader = DataLoader( dataset, batch_size=batch_size, collate_fn=collate_fn, pin_memory=False, sampler=sampler, shuffle=False, ) train_dataloader = DataLoader( dataset, batch_size=batch_size, collate_fn=collate_fn, sampler=sampler, shuffle=True, ) data_iterator = tqdm( static_dataloader, leave=True, unit="batch", postfix={ "epo": -1, "acc": "%.4f" % 0.0, "lss": "%.8f" % 0.0, "dlb": "%.4f" % -1, }, disable=silent, ) kmeans = KMeans(n_clusters=model.cluster_number, n_init=20) model.train() features = [] actual = [] # form initial cluster centres for index, batch in enumerate(data_iterator): if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2: batch, value = batch # if we have a prediction label, separate it to actual actual.append(value) if cuda: batch = batch.cuda(non_blocking=True) features.append(model.encoder(batch).detach().cpu()) actual = torch.cat(actual).long() predicted = kmeans.fit_predict(torch.cat(features).numpy()) predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long) _, accuracy = cluster_accuracy(predicted, actual.cpu().numpy()) cluster_centers = torch.tensor( kmeans.cluster_centers_, dtype=torch.float, requires_grad=True ) if cuda: cluster_centers = cluster_centers.cuda(non_blocking=True) with torch.no_grad(): # initialise the cluster centers model.state_dict()["assignment.cluster_centers"].copy_(cluster_centers) loss_function = nn.KLDivLoss(size_average=False) delta_label = None for epoch in range(epochs): features = [] data_iterator = tqdm( train_dataloader, leave=True, unit="batch", postfix={ "epo": epoch, "acc": "%.4f" % (accuracy or 0.0), "lss": "%.8f" % 0.0, "dlb": "%.4f" % (delta_label or 0.0), }, disable=silent, ) model.train() for index, batch in enumerate(data_iterator): if (isinstance(batch, tuple) or isinstance(batch, list)) and len( batch ) == 2: batch, _ = batch # if we have a prediction label, strip it away if cuda: batch = batch.cuda(non_blocking=True) output = model(batch) target = target_distribution(output).detach() loss = loss_function(output.log(), target) / output.shape[0] data_iterator.set_postfix( epo=epoch, acc="%.4f" % (accuracy or 0.0), lss="%.8f" % float(loss.item()), dlb="%.4f" % (delta_label or 0.0), ) optimizer.zero_grad() loss.backward() optimizer.step(closure=None) features.append(model.encoder(batch).detach().cpu()) if update_freq is not None and index % update_freq == 0: loss_value = float(loss.item()) data_iterator.set_postfix( epo=epoch, acc="%.4f" % (accuracy or 0.0), lss="%.8f" % loss_value, dlb="%.4f" % (delta_label or 0.0), ) if update_callback is not None: update_callback(accuracy, loss_value, delta_label) predicted, actual = predict( dataset, model, batch_size=evaluate_batch_size, collate_fn=collate_fn, silent=True, return_actual=True, cuda=cuda, ) delta_label = ( float((predicted != predicted_previous).float().sum().item()) / predicted_previous.shape[0] ) if stopping_delta is not None and delta_label < stopping_delta: print( 'Early stopping as label delta "%1.5f" less than "%1.5f".' % (delta_label, stopping_delta) ) break predicted_previous = predicted _, accuracy = cluster_accuracy(predicted.cpu().numpy(), actual.cpu().numpy()) data_iterator.set_postfix( epo=epoch, acc="%.4f" % (accuracy or 0.0), lss="%.8f" % 0.0, dlb="%.4f" % (delta_label or 0.0), ) if epoch_callback is not None: epoch_callback(epoch, model) def predict( dataset: torch.utils.data.Dataset, model: torch.nn.Module, batch_size: int = 1024, collate_fn=default_collate, cuda: bool = True, silent: bool = False, return_actual: bool = False, ) -> Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: """ Predict clusters for a dataset given a DEC model instance and various configuration parameters. :param dataset: instance of Dataset to use for training :param model: instance of DEC model to predict :param batch_size: size of the batch to predict with, default 1024 :param collate_fn: function to merge a list of samples into mini-batch :param cuda: whether CUDA is used, defaults to True :param silent: set to True to prevent printing out summary statistics, defaults to False :param return_actual: return actual values, if present in the Dataset :return: tuple of prediction and actual if return_actual is True otherwise prediction """ dataloader = DataLoader( dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False ) data_iterator = tqdm(dataloader, leave=True, unit="batch", disable=silent,) features = [] actual = [] model.eval() for batch in data_iterator: if (isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2: batch, value = batch # unpack if we have a prediction label if return_actual: actual.append(value) elif return_actual: raise ValueError( "Dataset has no actual value to unpack, but return_actual is set." ) if cuda: batch = batch.cuda(non_blocking=True) features.append( model(batch).detach().cpu() ) # move to the CPU to prevent out of memory on the GPU if return_actual: return torch.cat(features).max(1)[1], torch.cat(actual).long() else: return torch.cat(features).max(1)[1] ================================================ FILE: ptdec/utils.py ================================================ import numpy as np import torch from typing import Optional from scipy.optimize import linear_sum_assignment def cluster_accuracy(y_true, y_predicted, cluster_number: Optional[int] = None): """ Calculate clustering accuracy after using the linear_sum_assignment function in SciPy to determine reassignments. :param y_true: list of true cluster numbers, an integer array 0-indexed :param y_predicted: list of predicted cluster numbers, an integer array 0-indexed :param cluster_number: number of clusters, if None then calculated from input :return: reassignment dictionary, clustering accuracy """ if cluster_number is None: cluster_number = ( max(y_predicted.max(), y_true.max()) + 1 ) # assume labels are 0-indexed count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64) for i in range(y_predicted.size): count_matrix[y_predicted[i], y_true[i]] += 1 row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix) reassignment = dict(zip(row_ind, col_ind)) accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size return reassignment, accuracy def target_distribution(batch: torch.Tensor) -> torch.Tensor: """ Compute the target distribution p_ij, given the batch (q_ij), as in 3.1.3 Equation 3 of Xie/Girshick/Farhadi; this is used the KL-divergence loss function. :param batch: [batch size, number of clusters] Tensor of dtype float :return: [batch size, number of clusters] Tensor of dtype float """ weight = (batch ** 2) / torch.sum(batch, 0) return (weight.t() / torch.sum(weight, 1)).t() ================================================ FILE: requirements.txt ================================================ numpy>=1.13.3 torch>=0.4.0 scipy>=1.0.0 pandas>=0.21.0 visdom>=0.1.05 click>=6.7 xlrd>=1.0.0 cytoolz>=0.9.0.1 tqdm>=4.11.2 scikit-learn>=0.19.1 flake8>=3.6.0 tensorboardX>=1.2 setuptools>=40.2.0 torchvision>=0.2.1 seaborn>=0.9.0 pytest>=3.8.0 pytest-cov>=2.6.0 codecov>=2.0.15 -e git://github.com/vlukiyanov/pt-sdae.git#egg=ptsdae ================================================ FILE: setup.cfg ================================================ [aliases] test=pytest ================================================ FILE: setup.py ================================================ from setuptools import setup setup( name="ptdec", version="1.0", description="PyTorch implementation of DEC.", author="Vladimir Lukiyanov", author_email="vladimir.lukiyanov@me.com", url="https://github.com/vlukiyanov/pt-dec/", download_url="", license="MIT", setup_requires=["pytest-runner"], tests_require=["pytest"], install_requires=[ "numpy>=1.13.3", "torch>=0.4.0", "scipy>=1.0.0", "pandas>=0.21.0", "visdom>=0.1.05", "click>=6.7", "xlrd>=1.0.0", "cytoolz>=0.9.0.1", "tqdm>=4.11.2", "scikit-learn>=0.19.1", "ptsdae>=1.0.0", ], packages=["ptdec"], ) ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/test_cluster.py ================================================ import torch from unittest import TestCase from ptdec.cluster import ClusterAssignment class TestClusterAssignment(TestCase): @classmethod def setUpClass(cls): cls.ca = ClusterAssignment( cluster_number=2, embedding_dimension=2, cluster_centers=torch.Tensor([[-1, -1], [1, 1]]).float(), ) def test_forward(self): """ Basic test to check that the calculation is equivalent to the one in the paper. """ test_tensor = torch.Tensor([-2, -2]).float().unsqueeze(0) den = float(1) / 3 + float(1) / 19 gold = torch.Tensor([(float(1) / 3) / den, (float(1) / 19) / den]) output = self.ca(test_tensor).data self.assertAlmostEqual((gold - output).numpy()[0][0], 0.0) self.assertAlmostEqual((gold - output).numpy()[0][1], 0.0) ================================================ FILE: tests/test_dec.py ================================================ import torch from torch.autograd import Variable from unittest import TestCase from ptsdae.sdae import StackedDenoisingAutoEncoder from ptdec.dec import DEC class TestAutoEncoder(TestCase): @classmethod def setUpClass(cls): cls.ae = StackedDenoisingAutoEncoder([100, 50, 5]) cls.dec = DEC(2, 5, cls.ae.encoder) def test_dimension(self): """ Basic tests that check that given an input tensor the output and encoded tensors are of the expected size. """ input_tensor = Variable(torch.Tensor(1, 100).fill_(1.0)) output_tensor = self.dec(input_tensor) self.assertEqual(tuple(output_tensor.size()), (1, 2)) ================================================ FILE: tests/test_model.py ================================================ from ptdec.model import predict, train import torch from torch.utils.data import TensorDataset from unittest.mock import MagicMock, Mock def test_train_with_prediction(): model = Mock() model.return_value = torch.zeros(100, 100).requires_grad_() model.cluster_number = 10 model.encoder.return_value = torch.zeros(100, 100) model.state_dict.return_value = MagicMock() optimizer = Mock() dataset = TensorDataset(torch.zeros(100, 100), torch.zeros(100, 1)) train( dataset=dataset, model=model, epochs=1, batch_size=100, optimizer=optimizer, cuda=False, ) assert model.call_count == 2 def test_predict(): autoencoder = Mock() autoencoder.return_value = torch.zeros(10, 100) dataset = TensorDataset(torch.zeros(100, 100), torch.zeros(100, 1)) output = predict(dataset, autoencoder, batch_size=10, cuda=False) assert autoencoder.call_count == 10 assert output.shape == (100,) ================================================ FILE: tests/test_utils.py ================================================ import numpy as np import torch from unittest import TestCase from ptdec.utils import cluster_accuracy, target_distribution class TestClusterAccuracy(TestCase): def test_basic(self): """ Basic test to check that the calculation is sensible. """ true_value1 = np.array([1, 2, 1, 2, 0, 0], dtype=np.int64) pred_value1 = np.array([2, 1, 2, 1, 0, 0], dtype=np.int64) self.assertAlmostEqual(cluster_accuracy(true_value1, pred_value1)[1], 1.0) self.assertAlmostEqual(cluster_accuracy(true_value1, pred_value1, 3)[1], 1.0) self.assertDictEqual( cluster_accuracy(true_value1, pred_value1)[0], {0: 0, 1: 2, 2: 1} ) true_value2 = np.array([1, 1, 1, 1, 1, 1], dtype=np.int64) pred_value2 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int64) self.assertAlmostEqual(cluster_accuracy(true_value2, pred_value2)[1], 1.0 / 6.0) self.assertAlmostEqual( cluster_accuracy(true_value2, pred_value2, 6)[1], 1.0 / 6.0 ) true_value3 = np.array([1, 3, 1, 3, 0, 2], dtype=np.int64) pred_value3 = np.array([2, 1, 2, 1, 3, 0], dtype=np.int64) self.assertDictEqual( cluster_accuracy(true_value3, pred_value3)[0], {2: 1, 1: 3, 3: 0, 0: 2} ) class TestTargetDistribution(TestCase): def test_basic(self): """ Basic test to check that the calculation is sensible and conforms to the formula. """ test_tensor = torch.Tensor([[0.5, 0.5], [0.0, 1.0]]) output = target_distribution(test_tensor) self.assertAlmostEqual(tuple(output[0]), (0.75, 0.25)) self.assertAlmostEqual(tuple(output[1]), (0.0, 1.0))