Repository: akarazniewicz/cocosplit Branch: master Commit: 4c8ddccee94f Files: 3 Total size: 5.9 KB Directory structure: gitextract_von68ux0/ ├── README.md ├── cocosplit.py └── requirements.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ Simple tool to split a multi-label COCO annotation dataset while preserving class distributions among train and test sets. The code is an updated version of the original [akarazniewicz/cocosplit](https://github.com/akarazniewicz/cocosplit) repo, to which the functionality of splitting multi-class data while preserving distributions has been added. ## Installation ``cocosplit`` requires Python 3 and a basic set of dependencies: specifically, in addition to the requirements of the original repo, ``scikit-multilearn`` is required; it is included in the requirements.txt file ``` pip install -r requirements.txt ``` ## Usage The same as the original repo, with an added argument (``--multi-class``) to preserve class distributions. The argument is optional to ensure backward compatibility ``` $ python cocosplit.py -h usage: cocosplit.py [-h] -s SPLIT [--having-annotations] [--multi-class] coco_annotations train test Splits COCO annotations file into training and test sets. positional arguments: coco_annotations Path to COCO annotations file. train Where to store COCO training annotations test Where to store COCO test annotations optional arguments: -h, --help show this help message and exit -s SPLIT A percentage of a split; a number in (0, 1) --having-annotations Ignore all images without annotations. 
# --- README.md (continued from the previous line of the repository dump) ---
#                         Keep only these with at least one annotation
#   --multi-class         Split a multi-class dataset while preserving class
#                         distributions in train and test sets
# ```
#
# # Running
#
# ```
# $ python cocosplit.py --having-annotations --multi-class -s 0.8 /path/to/your/coco_annotations.json train.json test.json
# ```
#
# will split ``coco_annotation.json`` into ``train.json`` and ``test.json``
# with ratio 80%/20% respectively. It will skip all images
# (``--having-annotations``) without annotations.
#
# ================================================
# FILE: cocosplit.py
# ================================================
"""Split a COCO annotations file into training and test annotation files."""

import json
import argparse
from collections import Counter

import funcy
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np


def save_coco(file, info, licenses, images, annotations, categories):
    """Write the given COCO sections to ``file`` as indented, key-sorted JSON.

    Args:
        file: Path of the JSON file to create (overwritten if it exists).
        info: Value stored under the ``info`` key.
        licenses: Value stored under the ``licenses`` key.
        images: Value stored under the ``images`` key.
        annotations: Value stored under the ``annotations`` key.
        categories: Value stored under the ``categories`` key.
    """
    with open(file, 'wt', encoding='UTF-8') as coco:
        json.dump({'info': info,
                   'licenses': licenses,
                   'images': images,
                   'annotations': annotations,
                   'categories': categories},
                  coco, indent=2, sort_keys=True)


def filter_annotations(annotations, images):
    """Return the annotations whose ``image_id`` matches the ``id`` of an image.

    Ids are compared after conversion with ``int``. The input order of
    ``annotations`` is preserved.
    """
    # A set gives O(1) membership tests; a list made this quadratic.
    image_ids = {int(image['id']) for image in images}
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)


def filter_images(images, annotations):
    """Return the images whose ``id`` is the ``image_id`` of some annotation.

    Ids are compared after conversion with ``int``. The input order of
    ``images`` is preserved.
    """
    # A set gives O(1) membership tests; a list made this quadratic.
    annotated_ids = {int(annotation['image_id']) for annotation in annotations}
    return funcy.lfilter(lambda i: int(i['id']) in annotated_ids, images)


parser = argparse.ArgumentParser(description='Splits COCO annotations file into training and test sets.')
parser.add_argument('annotations', metavar='coco_annotations', type=str,
                    help='Path to COCO annotations file.')
parser.add_argument('train', type=str, help='Where to store COCO training annotations')
parser.add_argument('test', type=str, help='Where to store COCO test annotations')
parser.add_argument('-s', dest='split', type=float, required=True,
                    help="A percentage of a split; a number in (0, 1)")
parser.add_argument('--having-annotations', dest='having_annotations', action='store_true',
                    help='Ignore all images without annotations. Keep only these with at least one annotation')
parser.add_argument('--multi-class', dest='multi_class', action='store_true',
                    help='Split a multi-class dataset while preserving class distributions in train and test sets')


def _stratified_annotation_split(annotations, test_size):
    """Split annotations into (train, test) lists, stratified by category.

    Annotations whose category occurs only once are dropped, because a single
    sample cannot be placed in both the training and the test set.
    """
    category_ids = [int(a['category_id']) for a in annotations]
    # Count every category once instead of calling list.count() per element.
    occurrences = Counter(category_ids)
    kept = [(annotation, category)
            for annotation, category in zip(annotations, category_ids)
            if occurrences[category] > 1]
    kept_annotations = [annotation for annotation, _ in kept]

    # One indicator column per remaining category.
    # NOTE(review): scikit-multilearn's iterative stratification is documented
    # as taking a label indicator matrix; a single column holding raw category
    # ids would presumably be read as one label only - confirm against its docs.
    labels = sorted({category for _, category in kept})
    column_of = {category: index for index, category in enumerate(labels)}
    indicator = np.zeros((len(kept), len(labels)), dtype=int)
    for row, (_, category) in enumerate(kept):
        indicator[row, column_of[category]] = 1

    # The splitter indexes X as a 2-D array, so wrap the dicts in one column.
    samples = np.array([kept_annotations]).T
    X_train, _, X_test, _ = iterative_train_test_split(samples, indicator, test_size=test_size)
    return X_train.reshape(-1).tolist(), X_test.reshape(-1).tolist()


def main(args):
    """Read the COCO file named by ``args`` and write the train/test splits.

    Args:
        args: Parsed command-line arguments providing ``annotations``,
            ``train``, ``test``, ``split``, ``having_annotations`` and
            ``multi_class``.
    """
    with open(args.annotations, 'rt', encoding='UTF-8') as annotations_file:
        coco = json.load(annotations_file)

    # 'info' and 'licenses' are copied through untouched; tolerate files that
    # omit them instead of failing with a KeyError.
    info = coco.get('info', {})
    licenses = coco.get('licenses', [])
    images = coco['images']
    annotations = coco['annotations']
    categories = coco['categories']

    if args.having_annotations:
        # Normalise both sides with int(), as the filter helpers do.
        annotated_image_ids = {int(a['image_id']) for a in annotations}
        images = funcy.lfilter(lambda i: int(i['id']) in annotated_image_ids, images)

    if args.multi_class:
        anns_train, anns_test = _stratified_annotation_split(annotations, 1 - args.split)

        save_coco(args.train, info, licenses, filter_images(images, anns_train), anns_train, categories)
        save_coco(args.test, info, licenses, filter_images(images, anns_test), anns_test, categories)

        print("Saved {} entries in {} and {} in {}".format(len(anns_train), args.train, len(anns_test), args.test))
    else:
        X_train, X_test = train_test_split(images, train_size=args.split)

        anns_train = filter_annotations(annotations, X_train)
        anns_test = filter_annotations(annotations, X_test)

        save_coco(args.train, info, licenses, X_train, anns_train, categories)
        save_coco(args.test, info, licenses, X_test, anns_test, categories)

        print("Saved {} entries in {} and {} in {}".format(len(anns_train), args.train, len(anns_test), args.test))


if __name__ == "__main__":
    # Parse the command line only when run as a script, so that importing this
    # module does not consume sys.argv.
    args = parser.parse_args()
    main(args)

# ================================================
# FILE: requirements.txt
# ================================================
# sklearn
# funcy
# argparse
# scikit-multilearn