[
  {
    "path": "README.md",
    "content": "Simple tool to split a multi-label COCO annotation dataset while preserving class distributions among train and test sets.\n\nThe code is an updated version of the original [akarazniewicz/cocosplit](https://github.com/akarazniewicz/cocosplit) repo that adds the ability to split multi-class data while preserving class distributions.\n\n\n## Installation\n\n``cocosplit`` requires Python 3 and a basic set of dependencies.\n\nSpecifically, in addition to the requirements of the original repo, ``scikit-multilearn`` is required; it is included in the ``requirements.txt`` file.\n\n```\npip install -r requirements.txt\n```\n\n\n## Usage\n\nUsage is the same as in the original repo, with an added argument (``--multi-class``) to preserve class distributions.\nThe argument is optional to ensure backward compatibility.\n\n```\n$ python cocosplit.py -h\nusage: cocosplit.py [-h] -s SPLIT [--having-annotations] [--multi-class]\n                    coco_annotations train test\n\nSplits COCO annotations file into training and test sets.\n\npositional arguments:\n  coco_annotations      Path to COCO annotations file.\n  train                 Where to store COCO training annotations\n  test                  Where to store COCO test annotations\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -s SPLIT              A percentage of a split; a number in (0, 1)\n  --having-annotations  Ignore all images without annotations. Keep only these\n                        with at least one annotation\n  --multi-class         Split a multi-class dataset while preserving class\n                        distributions in train and test sets\n```\n\n## Running\n\n```\n$ python cocosplit.py --having-annotations --multi-class -s 0.8 /path/to/your/coco_annotations.json train.json test.json\n```\n\nThis will split ``coco_annotations.json`` into ``train.json`` and ``test.json`` with an 80%/20% ratio, respectively. It will skip all\nimages (``--having-annotations``) without annotations.\n"
  },
  {
    "path": "cocosplit.py",
    "content": "import json\nimport argparse\nimport funcy\nfrom sklearn.model_selection import train_test_split\nfrom skmultilearn.model_selection import iterative_train_test_split\nimport numpy as np\n\n\ndef save_coco(file, info, licenses, images, annotations, categories):\n    with open(file, 'wt', encoding='UTF-8') as coco:\n        json.dump({ 'info': info, 'licenses': licenses, 'images': images, \n            'annotations': annotations, 'categories': categories}, coco, indent=2, sort_keys=True)\n\ndef filter_annotations(annotations, images):\n    image_ids = funcy.lmap(lambda i: int(i['id']), images)\n    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)\n\n\ndef filter_images(images, annotations):\n\n    annotation_ids = funcy.lmap(lambda i: int(i['image_id']), annotations)\n\n    return funcy.lfilter(lambda a: int(a['id']) in annotation_ids, images)\n\n\nparser = argparse.ArgumentParser(description='Splits COCO annotations file into training and test sets.')\nparser.add_argument('annotations', metavar='coco_annotations', type=str,\n                    help='Path to COCO annotations file.')\nparser.add_argument('train', type=str, help='Where to store COCO training annotations')\nparser.add_argument('test', type=str, help='Where to store COCO test annotations')\nparser.add_argument('-s', dest='split', type=float, required=True,\n                    help=\"A percentage of a split; a number in (0, 1)\")\nparser.add_argument('--having-annotations', dest='having_annotations', action='store_true',\n                    help='Ignore all images without annotations. 
Keep only these with at least one annotation')\n\nparser.add_argument('--multi-class', dest='multi_class', action='store_true',\n                    help='Split a multi-class dataset while preserving class distributions in train and test sets')\n\nargs = parser.parse_args()\n\ndef main(args):\n\n    with open(args.annotations, 'rt', encoding='UTF-8') as annotations:\n        coco = json.load(annotations)\n        info = coco['info']\n        licenses = coco['licenses']\n        images = coco['images']\n        annotations = coco['annotations']\n        categories = coco['categories']\n\n        number_of_images = len(images)\n\n        images_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)\n\n        if args.having_annotations:\n            images = funcy.lremove(lambda i: i['id'] not in images_with_annotations, images)\n\n\n        if args.multi_class:\n\n            annotation_categories = funcy.lmap(lambda a: int(a['category_id']), annotations)\n\n            #bottle neck 1\n            #remove classes that has only one sample, because it can't be split into the training and testing sets\n            annotation_categories =  funcy.lremove(lambda i: annotation_categories.count(i) <=1  , annotation_categories)\n\n            annotations =  funcy.lremove(lambda i: i['category_id'] not in annotation_categories  , annotations)\n\n\n            X_train, y_train, X_test, y_test = iterative_train_test_split(np.array([annotations]).T,np.array([ annotation_categories]).T, test_size = 1-args.split)\n\n            save_coco(args.train, info, licenses, filter_images(images, X_train.reshape(-1)), X_train.reshape(-1).tolist(), categories)\n            save_coco(args.test, info, licenses,  filter_images(images, X_test.reshape(-1)), X_test.reshape(-1).tolist(), categories)\n\n            print(\"Saved {} entries in {} and {} in {}\".format(len(X_train), args.train, len(X_test), args.test))\n            \n        else:\n\n            X_train, X_test = 
train_test_split(images, train_size=args.split)\n\n            anns_train = filter_annotations(annotations, X_train)\n            anns_test=filter_annotations(annotations, X_test)\n\n            save_coco(args.train, info, licenses, X_train, anns_train, categories)\n            save_coco(args.test, info, licenses, X_test, anns_test, categories)\n\n            print(\"Saved {} entries in {} and {} in {}\".format(len(anns_train), args.train, len(anns_test), args.test))\n            \n\n\nif __name__ == \"__main__\":\n    main(args)"
  },
  {
    "path": "requirements.txt",
    "content": "sklearn\nfuncy\nargparse\nscikit-multilearn\n"
  }
]