[
  {
    "path": ".gitignore",
    "content": ".vscode/\n\n# local stuff\ninput/\n*.csv\n*.bin\n*.pkl\n*.h5\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2022 Abhishek Thakur\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE."
  },
  {
    "path": "README.md",
    "content": "# Long text token classification using LongFormer\n\nThe data comes from: https://www.kaggle.com/c/feedback-prize-2021/\n\nTo train the model for 5 folds, you can run:\n\n    python train.py --fold 0 --model allenai/longformer-large-4096 --lr 1e-5 --epochs 10 --max_len 1536 --batch_size 4 --valid_batch_size 4\n    python train.py --fold 1 --model allenai/longformer-large-4096 --lr 1e-5 --epochs 10 --max_len 1536 --batch_size 4 --valid_batch_size 4\n    python train.py --fold 2 --model allenai/longformer-large-4096 --lr 1e-5 --epochs 10 --max_len 1536 --batch_size 4 --valid_batch_size 4\n    python train.py --fold 3 --model allenai/longformer-large-4096 --lr 1e-5 --epochs 10 --max_len 1536 --batch_size 4 --valid_batch_size 4\n    python train.py --fold 4 --model allenai/longformer-large-4096 --lr 1e-5 --epochs 10 --max_len 1536 --batch_size 4 --valid_batch_size 4\n\nNote that you need to have a `kfold` column in the training data.\n"
  },
  {
    "path": "train.py",
    "content": "import argparse\nimport os\nimport random\nimport warnings\n\nimport numpy as np\nimport pandas as pd\nimport tez\nimport torch\nimport torch.nn as nn\nfrom sklearn import metrics\nfrom torch.nn import functional as F\nfrom transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup\n\nfrom utils import EarlyStopping, prepare_training_data, target_id_map\n\nwarnings.filterwarnings(\"ignore\")\n\n\ndef seed_everything(seed: int):\n    random.seed(seed)\n    os.environ[\"PYTHONHASHSEED\"] = str(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)\n    torch.backends.cudnn.deterministic = True\n    torch.backends.cudnn.benchmark = True\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--fold\", type=int, required=True)\n    parser.add_argument(\"--model\", type=str, required=True)\n    parser.add_argument(\"--lr\", type=float, required=True)\n    parser.add_argument(\"--output\", type=str, default=\"../model\", required=False)\n    parser.add_argument(\"--input\", type=str, default=\"../input\", required=False)\n    parser.add_argument(\"--max_len\", type=int, default=1024, required=False)\n    parser.add_argument(\"--batch_size\", type=int, default=8, required=False)\n    parser.add_argument(\"--valid_batch_size\", type=int, default=8, required=False)\n    parser.add_argument(\"--epochs\", type=int, default=20, required=False)\n    parser.add_argument(\"--accumulation_steps\", type=int, default=1, required=False)\n    return parser.parse_args()\n\n\nclass FeedbackDataset:\n    def __init__(self, samples, max_len, tokenizer):\n        self.samples = samples\n        self.max_len = max_len\n        self.tokenizer = tokenizer\n        self.length = len(samples)\n\n    def __len__(self):\n        return self.length\n\n    def __getitem__(self, idx):\n        input_ids = self.samples[idx][\"input_ids\"]\n        input_labels = 
self.samples[idx][\"input_labels\"]\n        input_labels = [target_id_map[x] for x in input_labels]\n        other_label_id = target_id_map[\"O\"]\n        padding_label_id = target_id_map[\"PAD\"]\n        # print(input_ids)\n        # print(input_labels)\n\n        # add start token id to the input_ids\n        input_ids = [self.tokenizer.cls_token_id] + input_ids\n        input_labels = [other_label_id] + input_labels\n\n        if len(input_ids) > self.max_len - 1:\n            input_ids = input_ids[: self.max_len - 1]\n            input_labels = input_labels[: self.max_len - 1]\n\n        # add end token id to the input_ids\n        input_ids = input_ids + [self.tokenizer.sep_token_id]\n        input_labels = input_labels + [other_label_id]\n\n        attention_mask = [1] * len(input_ids)\n\n        padding_length = self.max_len - len(input_ids)\n        if padding_length > 0:\n            if self.tokenizer.padding_side == \"right\":\n                input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length\n                input_labels = input_labels + [padding_label_id] * padding_length\n                attention_mask = attention_mask + [0] * padding_length\n            else:\n                input_ids = [self.tokenizer.pad_token_id] * padding_length + input_ids\n                input_labels = [padding_label_id] * padding_length + input_labels\n                attention_mask = [0] * padding_length + attention_mask\n\n        return {\n            \"ids\": torch.tensor(input_ids, dtype=torch.long),\n            \"mask\": torch.tensor(attention_mask, dtype=torch.long),\n            \"targets\": torch.tensor(input_labels, dtype=torch.long),\n        }\n\n\nclass FeedbackModel(tez.Model):\n    def __init__(self, model_name, num_train_steps, learning_rate, num_labels, steps_per_epoch):\n        super().__init__()\n        self.learning_rate = learning_rate\n        self.model_name = model_name\n        self.num_train_steps = num_train_steps\n        
self.num_labels = num_labels\n        self.steps_per_epoch = steps_per_epoch\n        self.step_scheduler_after = \"batch\"\n\n        hidden_dropout_prob: float = 0.1\n        layer_norm_eps: float = 1e-7\n\n        config = AutoConfig.from_pretrained(model_name)\n\n        config.update(\n            {\n                \"output_hidden_states\": True,\n                \"hidden_dropout_prob\": hidden_dropout_prob,\n                \"layer_norm_eps\": layer_norm_eps,\n                \"add_pooling_layer\": False,\n                \"num_labels\": self.num_labels,\n            }\n        )\n        self.transformer = AutoModel.from_pretrained(model_name, config=config)\n        self.dropout = nn.Dropout(config.hidden_dropout_prob)\n        self.dropout1 = nn.Dropout(0.1)\n        self.dropout2 = nn.Dropout(0.2)\n        self.dropout3 = nn.Dropout(0.3)\n        self.dropout4 = nn.Dropout(0.4)\n        self.dropout5 = nn.Dropout(0.5)\n        self.output = nn.Linear(config.hidden_size, self.num_labels)\n\n    def fetch_optimizer(self):\n        param_optimizer = list(self.named_parameters())\n        no_decay = [\"bias\", \"LayerNorm.bias\"]\n        optimizer_parameters = [\n            {\n                \"params\": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n                \"weight_decay\": 0.01,\n            },\n            {\n                \"params\": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n                \"weight_decay\": 0.0,\n            },\n        ]\n        opt = AdamW(optimizer_parameters, lr=self.learning_rate)\n        return opt\n\n    def fetch_scheduler(self):\n        sch = get_cosine_schedule_with_warmup(\n            self.optimizer,\n            num_warmup_steps=int(0.1 * self.num_train_steps),\n            num_training_steps=self.num_train_steps,\n            num_cycles=1,\n            last_epoch=-1,\n        )\n        return sch\n\n    def loss(self, outputs, targets, attention_mask):\n 
       loss_fct = nn.CrossEntropyLoss()\n\n        active_loss = attention_mask.view(-1) == 1\n        active_logits = outputs.view(-1, self.num_labels)\n        true_labels = targets.view(-1)\n        outputs = active_logits.argmax(dim=-1)\n        idxs = np.where(active_loss.cpu().numpy() == 1)[0]\n        active_logits = active_logits[idxs]\n        true_labels = true_labels[idxs].to(torch.long)\n\n        loss = loss_fct(active_logits, true_labels)\n        return loss\n\n    def monitor_metrics(self, outputs, targets, attention_mask):\n        active_loss = (attention_mask.view(-1) == 1).cpu().numpy()\n        active_logits = outputs.view(-1, self.num_labels)\n        true_labels = targets.view(-1).cpu().numpy()\n        outputs = active_logits.argmax(dim=-1).cpu().numpy()\n        idxs = np.where(active_loss == 1)[0]\n        f1_score = metrics.f1_score(true_labels[idxs], outputs[idxs], average=\"macro\")\n        return {\"f1\": f1_score}\n\n    def forward(self, ids, mask, token_type_ids=None, targets=None):\n\n        if token_type_ids:\n            transformer_out = self.transformer(ids, mask, token_type_ids)\n        else:\n            transformer_out = self.transformer(ids, mask)\n        sequence_output = transformer_out.last_hidden_state\n        sequence_output = self.dropout(sequence_output)\n\n        logits1 = self.output(self.dropout1(sequence_output))\n        logits2 = self.output(self.dropout2(sequence_output))\n        logits3 = self.output(self.dropout3(sequence_output))\n        logits4 = self.output(self.dropout4(sequence_output))\n        logits5 = self.output(self.dropout5(sequence_output))\n\n        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5\n        logits = torch.softmax(logits, dim=-1)\n        loss = 0\n\n        if targets is not None:\n            loss1 = self.loss(logits1, targets, attention_mask=mask)\n            loss2 = self.loss(logits2, targets, attention_mask=mask)\n            loss3 = 
self.loss(logits3, targets, attention_mask=mask)\n            loss4 = self.loss(logits4, targets, attention_mask=mask)\n            loss5 = self.loss(logits5, targets, attention_mask=mask)\n            loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5\n            f1_1 = self.monitor_metrics(logits1, targets, attention_mask=mask)[\"f1\"]\n            f1_2 = self.monitor_metrics(logits2, targets, attention_mask=mask)[\"f1\"]\n            f1_3 = self.monitor_metrics(logits3, targets, attention_mask=mask)[\"f1\"]\n            f1_4 = self.monitor_metrics(logits4, targets, attention_mask=mask)[\"f1\"]\n            f1_5 = self.monitor_metrics(logits5, targets, attention_mask=mask)[\"f1\"]\n            f1 = (f1_1 + f1_2 + f1_3 + f1_4 + f1_5) / 5\n            metric = {\"f1\": f1}\n            return logits, loss, metric\n\n        return logits, loss, {}\n\n\nif __name__ == \"__main__\":\n    NUM_JOBS = 12\n    args = parse_args()\n    seed_everything(42)\n    os.makedirs(args.output, exist_ok=True)\n    df = pd.read_csv(os.path.join(args.input, \"train_folds.csv\"))\n\n    train_df = df[df[\"kfold\"] != args.fold].reset_index(drop=True)\n    valid_df = df[df[\"kfold\"] == args.fold].reset_index(drop=True)\n\n    tokenizer = AutoTokenizer.from_pretrained(args.model)\n    training_samples = prepare_training_data(train_df, tokenizer, args, num_jobs=NUM_JOBS)\n    valid_samples = prepare_training_data(valid_df, tokenizer, args, num_jobs=NUM_JOBS)\n\n    train_dataset = FeedbackDataset(training_samples, args.max_len, tokenizer)\n\n    num_train_steps = int(len(train_dataset) / args.batch_size / args.accumulation_steps * args.epochs)\n    print(num_train_steps)\n\n    model = FeedbackModel(\n        model_name=args.model,\n        num_train_steps=num_train_steps,\n        learning_rate=args.lr,\n        num_labels=len(target_id_map) - 1,\n        steps_per_epoch=len(train_dataset) / args.batch_size,\n    )\n\n    es = EarlyStopping(\n        
model_path=os.path.join(args.output, f\"model_{args.fold}.bin\"),\n        valid_df=valid_df,\n        valid_samples=valid_samples,\n        batch_size=args.valid_batch_size,\n        patience=5,\n        mode=\"max\",\n        delta=0.001,\n        save_weights_only=True,\n        tokenizer=tokenizer,\n    )\n\n    model.fit(\n        train_dataset,\n        train_bs=args.batch_size,\n        device=\"cuda\",\n        epochs=args.epochs,\n        callbacks=[es],\n        fp16=True,\n        accumulation_steps=args.accumulation_steps,\n    )\n\n"
  },
  {
    "path": "utils.py",
    "content": "import copy\nimport os\n\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom joblib import Parallel, delayed\nfrom tez import enums\nfrom tez.callbacks import Callback\nfrom tqdm import tqdm\n\ntarget_id_map = {\n    \"B-Lead\": 0,\n    \"I-Lead\": 1,\n    \"B-Position\": 2,\n    \"I-Position\": 3,\n    \"B-Evidence\": 4,\n    \"I-Evidence\": 5,\n    \"B-Claim\": 6,\n    \"I-Claim\": 7,\n    \"B-Concluding Statement\": 8,\n    \"I-Concluding Statement\": 9,\n    \"B-Counterclaim\": 10,\n    \"I-Counterclaim\": 11,\n    \"B-Rebuttal\": 12,\n    \"I-Rebuttal\": 13,\n    \"O\": 14,\n    \"PAD\": -100,\n}\n\n\nid_target_map = {v: k for k, v in target_id_map.items()}\n\n\ndef _prepare_training_data_helper(args, tokenizer, df, train_ids):\n    training_samples = []\n    for idx in tqdm(train_ids):\n        filename = os.path.join(args.input, \"train\", idx + \".txt\")\n        with open(filename, \"r\") as f:\n            text = f.read()\n\n        encoded_text = tokenizer.encode_plus(\n            text,\n            add_special_tokens=False,\n            return_offsets_mapping=True,\n        )\n        input_ids = encoded_text[\"input_ids\"]\n        input_labels = copy.deepcopy(input_ids)\n        offset_mapping = encoded_text[\"offset_mapping\"]\n\n        for k in range(len(input_labels)):\n            input_labels[k] = \"O\"\n\n        sample = {\n            \"id\": idx,\n            \"input_ids\": input_ids,\n            \"text\": text,\n            \"offset_mapping\": offset_mapping,\n        }\n\n        temp_df = df[df[\"id\"] == idx]\n        for _, row in temp_df.iterrows():\n            text_labels = [0] * len(text)\n            discourse_start = int(row[\"discourse_start\"])\n            discourse_end = int(row[\"discourse_end\"])\n            prediction_label = row[\"discourse_type\"]\n            text_labels[discourse_start:discourse_end] = [1] * (discourse_end - discourse_start)\n            target_idx = []\n            for 
map_idx, (offset1, offset2) in enumerate(encoded_text[\"offset_mapping\"]):\n                if sum(text_labels[offset1:offset2]) > 0:\n                    if len(text[offset1:offset2].split()) > 0:\n                        target_idx.append(map_idx)\n\n            targets_start = target_idx[0]\n            targets_end = target_idx[-1]\n            pred_start = \"B-\" + prediction_label\n            pred_end = \"I-\" + prediction_label\n            input_labels[targets_start] = pred_start\n            input_labels[targets_start + 1 : targets_end + 1] = [pred_end] * (targets_end - targets_start)\n\n        sample[\"input_ids\"] = input_ids\n        sample[\"input_labels\"] = input_labels\n        training_samples.append(sample)\n    return training_samples\n\n\ndef prepare_training_data(df, tokenizer, args, num_jobs):\n    training_samples = []\n    train_ids = df[\"id\"].unique()\n\n    train_ids_splits = np.array_split(train_ids, num_jobs)\n\n    results = Parallel(n_jobs=num_jobs, backend=\"multiprocessing\")(\n        delayed(_prepare_training_data_helper)(args, tokenizer, df, idx) for idx in train_ids_splits\n    )\n    for result in results:\n        training_samples.extend(result)\n\n    return training_samples\n\n\ndef calc_overlap(row):\n    \"\"\"\n    Calculates the overlap between prediction and\n    ground truth and overlap percentages used for determining\n    true positives.\n    \"\"\"\n    set_pred = set(row.predictionstring_pred.split(\" \"))\n    set_gt = set(row.predictionstring_gt.split(\" \"))\n    # Length of each and intersection\n    len_gt = len(set_gt)\n    len_pred = len(set_pred)\n    inter = len(set_gt.intersection(set_pred))\n    overlap_1 = inter / len_gt\n    overlap_2 = inter / len_pred\n    return [overlap_1, overlap_2]\n\n\ndef score_feedback_comp_micro(pred_df, gt_df):\n    \"\"\"\n    A function that scores for the kaggle\n        Student Writing Competition\n\n    Uses the steps in the evaluation page here:\n        
https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation\n    This code is from Rob Mulla's Kaggle kernel.\n    \"\"\"\n    gt_df = gt_df[[\"id\", \"discourse_type\", \"predictionstring\"]].reset_index(drop=True).copy()\n    pred_df = pred_df[[\"id\", \"class\", \"predictionstring\"]].reset_index(drop=True).copy()\n    pred_df[\"pred_id\"] = pred_df.index\n    gt_df[\"gt_id\"] = gt_df.index\n    # Step 1. all ground truths and predictions for a given class are compared.\n    joined = pred_df.merge(\n        gt_df,\n        left_on=[\"id\", \"class\"],\n        right_on=[\"id\", \"discourse_type\"],\n        how=\"outer\",\n        suffixes=(\"_pred\", \"_gt\"),\n    )\n    joined[\"predictionstring_gt\"] = joined[\"predictionstring_gt\"].fillna(\" \")\n    joined[\"predictionstring_pred\"] = joined[\"predictionstring_pred\"].fillna(\" \")\n\n    joined[\"overlaps\"] = joined.apply(calc_overlap, axis=1)\n\n    # 2. If the overlap between the ground truth and prediction is >= 0.5,\n    # and the overlap between the prediction and the ground truth >= 0.5,\n    # the prediction is a match and considered a true positive.\n    # If multiple matches exist, the match with the highest pair of overlaps is taken.\n    joined[\"overlap1\"] = joined[\"overlaps\"].apply(lambda x: eval(str(x))[0])\n    joined[\"overlap2\"] = joined[\"overlaps\"].apply(lambda x: eval(str(x))[1])\n\n    joined[\"potential_TP\"] = (joined[\"overlap1\"] >= 0.5) & (joined[\"overlap2\"] >= 0.5)\n    joined[\"max_overlap\"] = joined[[\"overlap1\", \"overlap2\"]].max(axis=1)\n    tp_pred_ids = (\n        joined.query(\"potential_TP\")\n        .sort_values(\"max_overlap\", ascending=False)\n        .groupby([\"id\", \"predictionstring_gt\"])\n        .first()[\"pred_id\"]\n        .values\n    )\n\n    # 3. 
Any unmatched ground truths are false negatives\n    # and any unmatched predictions are false positives.\n    fp_pred_ids = [p for p in joined[\"pred_id\"].unique() if p not in tp_pred_ids]\n\n    matched_gt_ids = joined.query(\"potential_TP\")[\"gt_id\"].unique()\n    unmatched_gt_ids = [c for c in joined[\"gt_id\"].unique() if c not in matched_gt_ids]\n\n    # Get numbers of each type\n    TP = len(tp_pred_ids)\n    FP = len(fp_pred_ids)\n    FN = len(unmatched_gt_ids)\n    # calc microf1\n    my_f1_score = TP / (TP + 0.5 * (FP + FN))\n    return my_f1_score\n\n\ndef score_feedback_comp(pred_df, gt_df, return_class_scores=False):\n    class_scores = {}\n    pred_df = pred_df[[\"id\", \"class\", \"predictionstring\"]].reset_index(drop=True).copy()\n    for discourse_type, gt_subset in gt_df.groupby(\"discourse_type\"):\n        pred_subset = pred_df.loc[pred_df[\"class\"] == discourse_type].reset_index(drop=True).copy()\n        class_score = score_feedback_comp_micro(pred_subset, gt_subset)\n        class_scores[discourse_type] = class_score\n    f1 = np.mean([v for v in class_scores.values()])\n    if return_class_scores:\n        return f1, class_scores\n    return f1\n\n\nclass FeedbackDatasetValid:\n    def __init__(self, samples, max_len, tokenizer):\n        self.samples = samples\n        self.max_len = max_len\n        self.tokenizer = tokenizer\n        self.length = len(samples)\n\n    def __len__(self):\n        return self.length\n\n    def __getitem__(self, idx):\n        input_ids = self.samples[idx][\"input_ids\"]\n        input_ids = [self.tokenizer.cls_token_id] + input_ids\n\n        if len(input_ids) > self.max_len - 1:\n            input_ids = input_ids[: self.max_len - 1]\n\n        # add end token id to the input_ids\n        input_ids = input_ids + [self.tokenizer.sep_token_id]\n        attention_mask = [1] * len(input_ids)\n\n        return {\n            \"ids\": input_ids,\n            \"mask\": attention_mask,\n        }\n\n\nclass 
Collate:\n    def __init__(self, tokenizer):\n        self.tokenizer = tokenizer\n\n    def __call__(self, batch):\n        output = dict()\n        output[\"ids\"] = [sample[\"ids\"] for sample in batch]\n        output[\"mask\"] = [sample[\"mask\"] for sample in batch]\n\n        # calculate max token length of this batch\n        batch_max = max([len(ids) for ids in output[\"ids\"]])\n\n        # add padding\n        if self.tokenizer.padding_side == \"right\":\n            output[\"ids\"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output[\"ids\"]]\n            output[\"mask\"] = [s + (batch_max - len(s)) * [0] for s in output[\"mask\"]]\n        else:\n            output[\"ids\"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output[\"ids\"]]\n            output[\"mask\"] = [(batch_max - len(s)) * [0] + s for s in output[\"mask\"]]\n\n        # convert to tensors\n        output[\"ids\"] = torch.tensor(output[\"ids\"], dtype=torch.long)\n        output[\"mask\"] = torch.tensor(output[\"mask\"], dtype=torch.long)\n\n        return output\n\n\nclass EarlyStopping(Callback):\n    def __init__(\n        self,\n        model_path,\n        valid_df,\n        valid_samples,\n        batch_size,\n        tokenizer,\n        patience=5,\n        mode=\"max\",\n        delta=0.001,\n        save_weights_only=True,\n    ):\n        self.patience = patience\n        self.counter = 0\n        self.mode = mode\n        self.best_score = None\n        self.early_stop = False\n        self.delta = delta\n        self.save_weights_only = save_weights_only\n        self.model_path = model_path\n        self.valid_samples = valid_samples\n        self.batch_size = batch_size\n        self.valid_df = valid_df\n        self.tokenizer = tokenizer\n\n        if self.mode == \"min\":\n            self.val_score = np.Inf\n        else:\n            self.val_score = -np.Inf\n\n    def on_epoch_end(self, model):\n        model.eval()\n     
   valid_dataset = FeedbackDatasetValid(self.valid_samples, 4096, self.tokenizer)\n        collate = Collate(self.tokenizer)\n\n        preds_iter = model.predict(\n            valid_dataset,\n            batch_size=self.batch_size,\n            n_jobs=-1,\n            collate_fn=collate,\n        )\n\n        final_preds = []\n        final_scores = []\n        for preds in preds_iter:\n            pred_class = np.argmax(preds, axis=2)\n            pred_scrs = np.max(preds, axis=2)\n            for pred, pred_scr in zip(pred_class, pred_scrs):\n                final_preds.append(pred.tolist())\n                final_scores.append(pred_scr.tolist())\n\n        for j in range(len(self.valid_samples)):\n            tt = [id_target_map[p] for p in final_preds[j][1:]]\n            tt_score = final_scores[j][1:]\n            self.valid_samples[j][\"preds\"] = tt\n            self.valid_samples[j][\"pred_scores\"] = tt_score\n\n        submission = []\n        min_thresh = {\n            \"Lead\": 9,\n            \"Position\": 5,\n            \"Evidence\": 14,\n            \"Claim\": 3,\n            \"Concluding Statement\": 11,\n            \"Counterclaim\": 6,\n            \"Rebuttal\": 4,\n        }\n        proba_thresh = {\n            \"Lead\": 0.7,\n            \"Position\": 0.55,\n            \"Evidence\": 0.65,\n            \"Claim\": 0.55,\n            \"Concluding Statement\": 0.7,\n            \"Counterclaim\": 0.5,\n            \"Rebuttal\": 0.55,\n        }\n\n        for _, sample in enumerate(self.valid_samples):\n            preds = sample[\"preds\"]\n            offset_mapping = sample[\"offset_mapping\"]\n            sample_id = sample[\"id\"]\n            sample_text = sample[\"text\"]\n            sample_pred_scores = sample[\"pred_scores\"]\n\n            # pad preds to same length as offset_mapping\n            if len(preds) < len(offset_mapping):\n                preds = preds + [\"O\"] * (len(offset_mapping) - len(preds))\n                
sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))\n\n            idx = 0\n            phrase_preds = []\n            while idx < len(offset_mapping):\n                start, _ = offset_mapping[idx]\n                if preds[idx] != \"O\":\n                    label = preds[idx][2:]\n                else:\n                    label = \"O\"\n                phrase_scores = []\n                phrase_scores.append(sample_pred_scores[idx])\n                idx += 1\n                while idx < len(offset_mapping):\n                    if label == \"O\":\n                        matching_label = \"O\"\n                    else:\n                        matching_label = f\"I-{label}\"\n                    if preds[idx] == matching_label:\n                        _, end = offset_mapping[idx]\n                        phrase_scores.append(sample_pred_scores[idx])\n                        idx += 1\n                    else:\n                        break\n                if \"end\" in locals():\n                    phrase = sample_text[start:end]\n                    phrase_preds.append((phrase, start, end, label, phrase_scores))\n\n            temp_df = []\n            for phrase_idx, (phrase, start, end, label, phrase_scores) in enumerate(phrase_preds):\n                word_start = len(sample_text[:start].split())\n                word_end = word_start + len(sample_text[start:end].split())\n                word_end = min(word_end, len(sample_text.split()))\n                ps = \" \".join([str(x) for x in range(word_start, word_end)])\n                if label != \"O\":\n                    if sum(phrase_scores) / len(phrase_scores) >= proba_thresh[label]:\n                        temp_df.append((sample_id, label, ps))\n\n            temp_df = pd.DataFrame(temp_df, columns=[\"id\", \"class\", \"predictionstring\"])\n\n            submission.append(temp_df)\n\n        submission = pd.concat(submission).reset_index(drop=True)\n  
      submission[\"len\"] = submission.predictionstring.apply(lambda x: len(x.split()))\n\n        def threshold(df):\n            df = df.copy()\n            for key, value in min_thresh.items():\n                index = df.loc[df[\"class\"] == key].query(f\"len<{value}\").index\n                df.drop(index, inplace=True)\n            return df\n\n        submission = threshold(submission)\n\n        # drop len\n        submission = submission.drop(columns=[\"len\"])\n\n        scr = score_feedback_comp(submission, self.valid_df, return_class_scores=True)\n        print(scr)\n        model.train()\n\n        epoch_score = scr[0]\n        if self.mode == \"min\":\n            score = -1.0 * epoch_score\n        else:\n            score = np.copy(epoch_score)\n\n        if self.best_score is None:\n            self.best_score = score\n            self.save_checkpoint(epoch_score, model)\n        elif score < self.best_score + self.delta:\n            self.counter += 1\n            print(\"EarlyStopping counter: {} out of {}\".format(self.counter, self.patience))\n            if self.counter >= self.patience:\n                model.model_state = enums.ModelState.END\n        else:\n            self.best_score = score\n            self.save_checkpoint(epoch_score, model)\n            self.counter = 0\n\n    def save_checkpoint(self, epoch_score, model):\n        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:\n            print(\"Validation score improved ({} --> {}). Saving model!\".format(self.val_score, epoch_score))\n            model.save(self.model_path, weights_only=self.save_weights_only)\n        self.val_score = epoch_score\n\n"
  }
]