[
  {
    "path": ".coveragerc",
    "content": "[report]\nexclude_lines =\n    pragma: no cover\n    if TYPE_CHECKING:\n    raise NotImplementedError\n    if __name__ == \"__main__\":"
  },
  {
    "path": ".gitignore",
    "content": ".dbx/\nvenv/\n.venv/\n.idea/\n.python-version\n__pycache__/\nhtmlcov/\n.pytest_cache/\n.coverage\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "We happily welcome contributions to Dolly. We use GitHub Issues to track community-reported issues and GitHub Pull Requests for accepting changes."
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2023 Databricks, Inc.\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "NOTICE",
    "content": "Dolly\r\n\r\nCopyright (2023) Databricks, Inc.\r\n\r\nThis Software includes software developed at Databricks (https://www.databricks.com/) and its use is subject to the included LICENSE file.\r\n"
  },
  {
    "path": "README.md",
    "content": "# Dolly\n\nDatabricks’ [Dolly](https://huggingface.co/databricks/dolly-v2-12b) is an instruction-following large language model trained on the Databricks machine learning platform\nthat is licensed for commercial use. Based on `pythia-12b`, Dolly is trained on ~15k instruction/response fine tuning records\n[`databricks-dolly-15k`](https://huggingface.co/datasets/databricks/databricks-dolly-15k) generated\nby Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation,\ninformation extraction, open QA and summarization. `dolly-v2-12b` is not a state-of-the-art model, but does exhibit surprisingly\nhigh quality instruction following behavior not characteristic of the foundation model on which it is based.\n\nDatabricks is committed to ensuring that every organization and individual benefits from the transformative power of artificial intelligence. The Dolly model family represents our first steps along this journey, and we’re excited to share this technology with the world.\n\nThe model is available on Hugging Face as [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b).\n\n## Model Overview\n\n`dolly-v2-12b` is a 12 billion parameter causal language model created by [Databricks](https://databricks.com/) that is derived from\n[EleutherAI’s](https://www.eleuther.ai/) [Pythia-12b](https://huggingface.co/EleutherAI/pythia-12b) and fine-tuned\non a [~15K record instruction corpus](https://github.com/databrickslabs/dolly/tree/master/data) generated by Databricks employees and released under a permissive license (CC-BY-SA).\n\n\n## Known Limitations\n\n### Performance Limitations\n**`dolly-v2-12b` is not a state-of-the-art generative language model** and, though quantitative benchmarking is ongoing, is not designed to perform\ncompetitively with more modern model architectures or models subject to larger pretraining corpuses.\n\nThe Dolly model family is under 
active development, and so any list of shortcomings is unlikely to be exhaustive, but we include known limitations and misfires here as a means to document and share our preliminary findings with the community.\nIn particular, `dolly-v2-12b` struggles with: syntactically complex prompts, programming problems, mathematical operations, factual errors,\ndates and times, open-ended question answering, hallucination, enumerating lists of specific length, stylistic mimicry, having a sense of humor, etc.\nMoreover, we find that `dolly-v2-12b` does not have some capabilities, such as well-formatted letter writing, present in the original model.\n\n### Dataset Limitations\nLike all language models, `dolly-v2-12b` reflects the content and limitations of its training corpuses.\n\n- **The Pile**: Pythia’s pre-training corpus contains content mostly collected from the public internet, and like most web-scale datasets,\nit contains content many users would find objectionable. As such, the model is likely to reflect these shortcomings, potentially overtly\nin the case it is explicitly asked to produce objectionable content, and sometimes subtly, as in the case of biased or harmful implicit\nassociations.\n\n- **`databricks-dolly-15k`**: The training data on which `dolly-v2-12b` is instruction tuned represents natural language instructions generated\nby Databricks employees during a period spanning March and April 2023 and includes passages from Wikipedia as reference passages\nfor instruction categories like closed QA and summarization. To our knowledge it does not contain obscenity, intellectual property or\npersonally identifying information about non-public figures, but it may contain typos and factual errors.\nThe dataset may also reflect biases found in Wikipedia. 
Finally, the dataset likely reflects\nthe interests and semantic choices of Databricks employees, a demographic which is not representative of the global population at large.\n\nDatabricks is committed to ongoing research and development efforts to develop helpful, honest and harmless AI technologies that\nmaximize the potential of all individuals and organizations.\n\n## Getting Started with Response Generation\n\nIf you'd like to simply test the model without training, the model is available on Hugging Face as [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b).\n\nTo use the model with the `transformers` library on a machine with A100 GPUs:\n\n```\nfrom transformers import pipeline\nimport torch\n\ninstruct_pipeline = pipeline(model=\"databricks/dolly-v2-12b\", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map=\"auto\")\n```\n\nYou can then use the pipeline to answer instructions:\n\n```\ninstruct_pipeline(\"Explain to me the difference between nuclear fission and fusion.\")\n```\n\n### Generating on Other Instances\n\nA100 instance types are not available in all cloud regions, or can be hard to provision. Inference is possible on other GPU instance types.\n\n#### A10 GPUs\n\nThe 6.9B and 2.8B param models should work as-is.\n\nTo generate using the 12B param model on A10s (ex: `g5.4xlarge`, 1 x A10 24GB), it's necessary to load and run generation using 8-bit weights, which impacts the results slightly:\n\n- Also install `bitsandbytes`\n- Add `model_kwargs={'load_in_8bit': True}` to the `pipeline()` command shown above\n\n#### V100 GPUs\n\nWhen using V100s (ex: `p3.2xlarge`, 1 x V100 16GB, `NC6s_v3`), in all cases, set `torch_dtype=torch.float16` in `pipeline()` instead.\n\nOtherwise, follow the steps above. 
The 12B param model may not function well in 8-bit on V100s.\n\n## Getting Started with Training\n\n- Add the `dolly` repo to Databricks (under Repos click Add Repo, enter `https://github.com/databrickslabs/dolly.git`, then click Create Repo).\n- Start a `13.x ML (includes Apache Spark 3.4.0, GPU, Scala 2.12)` or later single-node cluster with node type having 8 A100 GPUs (e.g. `Standard_ND96asr_v4` or `p4d.24xlarge`). Note that these instance types may not be available in all regions, or may be difficult to provision. In Databricks, note that you must select the GPU runtime first, and unselect \"Use Photon\", for these instance types to appear (where supported).\n- Open the `train_dolly` notebook in the Repo (which is the `train_dolly.py` file in the Github `dolly` repo), attach to your GPU cluster, and run all cells.  When training finishes, the notebook will save the model under `/dbfs/dolly_training`.\n\n### Training on Other Instances\n\nA100 instance types are not available in all cloud regions, or can be hard to provision. Training is possible on other GPU instance types, \nfor smaller Dolly model sizes, and with small modifications to reduce memory usage. These modifications are not optimal, but are simple to make. \n\nSelect your GPU family type from the `gpu_family` widget, enter the number of GPUs available in the `num_gpus` widget, and then run the rest of the code. 
\nA number of different options will be set for you to train the model for one of the following GPU types:\n- A100 (default)\n- A10 \n- V100\n\nDetails of the different configurations are below.\n\n#### A100 GPUs\n\nA100 GPUs are preferred for training all model sizes, and are the only GPUs that can train the 12B param model in a reasonable amount of time.\nAs such, this is the default configuration, as set in the `a100_config.json` deepspeed config file.\n\n#### A10 GPUs\n\nTraining the 12B param model is not recommended on A10s.\n\nTo train the 6.9B param model on A10 instances (ex: `g5.24xlarge`, 4 x A10 24GB; `Standard_NV72ads_A10_v5`, 2 x A10),\nsimply select `a10` from the `gpu_family` widget and enter the number of GPUs available in the `num_gpus` widget, then run the rest of the code. \nThis will use the `a10_config.json` deepspeed config file, which makes the following changes:\n\n- `per-device-train-batch-size` and `per-device-eval-batch-size` are set to 3 in the `train_dolly.py` invocation of `deepspeed`\n- Within the `\"zero_optimization\"` section of the deepspeed config, we have added:\n  ```\n  \"offload_optimizer\": {\n    \"device\": \"cpu\",\n    \"pin_memory\": true\n  },\n  ```\n\n#### V100 GPUs\n\nTo run on V100 instances with 32GB of GPU memory (ex: `p3dn.24xlarge` or `Standard_ND40rs_v2`), \nsimply select `v100` from the `gpu_family` widget and enter the number of GPUs available in the `num_gpus` widget, and then run the rest of the code. \nThis will use the `v100_config.json` deepspeed config file, which makes the following changes:\n\n- It makes the changes described above for A10s\n- It enables fp16 floating point format\n- It sets the `per-device-train-batch-size` and `per-device-eval-batch-size` to 3\n  \nYou may be able to slightly increase the batch size with 32GB instances, compared to what works above for 24GB A10s.\n\n## Running Unit Tests Locally\n\n```\npyenv local 3.8.13\npython -m venv .venv\n. 
.venv/bin/activate\npip install -r requirements_dev.txt\n./run_pytest.sh\n```\n\n## Citation\n\n```\n@online{DatabricksBlog2023DollyV2,\n    author    = {Mike Conover and Matt Hayes and Ankit Mathur and Jianwei Xie and Jun Wan and Sam Shah and Ali Ghodsi and Patrick Wendell and Matei Zaharia and Reynold Xin},\n    title     = {Free Dolly: Introducing the World's First Truly Open Instruction-Tuned LLM},\n    year      = {2023},\n    url       = {https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm},\n    urldate   = {2023-06-30}\n}\n```\n"
  },
  {
    "path": "__init__.py",
    "content": ""
  },
  {
    "path": "config/a100_config.json",
    "content": "{\n    \"fp16\": {\n      \"enabled\": false\n    },\n    \"bf16\": {\n      \"enabled\": true\n    },\n    \"optimizer\": {\n      \"type\": \"AdamW\",\n      \"params\": {\n        \"lr\": \"auto\",\n        \"betas\": \"auto\",\n        \"eps\": \"auto\",\n        \"weight_decay\": \"auto\"\n      }\n    },\n    \"scheduler\": {\n      \"type\": \"WarmupLR\",\n      \"params\": {\n        \"warmup_min_lr\": \"auto\",\n        \"warmup_max_lr\": \"auto\",\n        \"warmup_num_steps\": \"auto\"\n      }\n    },\n    \"zero_optimization\": {\n      \"stage\": 3,\n      \"overlap_comm\": true,\n      \"contiguous_gradients\": true,\n      \"sub_group_size\": 1e9,\n      \"reduce_bucket_size\": \"auto\",\n      \"stage3_prefetch_bucket_size\": \"auto\",\n      \"stage3_param_persistence_threshold\": \"auto\",\n      \"stage3_max_live_parameters\": 1e9,\n      \"stage3_max_reuse_distance\": 1e9,\n      \"stage3_gather_16bit_weights_on_model_save\": true\n    },\n    \"gradient_accumulation_steps\": \"auto\",\n    \"gradient_clipping\": \"auto\",\n    \"steps_per_print\": 2000,\n    \"train_batch_size\": \"auto\",\n    \"train_micro_batch_size_per_gpu\": \"auto\",\n    \"wall_clock_breakdown\": false\n  }"
  },
  {
    "path": "config/a10_config.json",
    "content": "{\n    \"fp16\": {\n      \"enabled\": false\n    },\n    \"bf16\": {\n      \"enabled\": true\n    },\n    \"optimizer\": {\n      \"type\": \"AdamW\",\n      \"params\": {\n        \"lr\": \"auto\",\n        \"betas\": \"auto\",\n        \"eps\": \"auto\",\n        \"weight_decay\": \"auto\"\n      }\n    },\n    \"scheduler\": {\n      \"type\": \"WarmupLR\",\n      \"params\": {\n        \"warmup_min_lr\": \"auto\",\n        \"warmup_max_lr\": \"auto\",\n        \"warmup_num_steps\": \"auto\"\n      }\n    },\n    \"zero_optimization\": {\n      \"stage\": 3,\n      \"overlap_comm\": true,\n      \"contiguous_gradients\": true,\n      \"sub_group_size\": 1e9,\n      \"reduce_bucket_size\": \"auto\",\n      \"stage3_prefetch_bucket_size\": \"auto\",\n      \"stage3_param_persistence_threshold\": \"auto\",\n      \"stage3_max_live_parameters\": 1e9,\n      \"stage3_max_reuse_distance\": 1e9,\n      \"stage3_gather_16bit_weights_on_model_save\": true,\n      \"offload_optimizer\": {\n        \"device\": \"cpu\",\n        \"pin_memory\": true\n      }\n    },\n    \"gradient_accumulation_steps\": \"auto\",\n    \"gradient_clipping\": \"auto\",\n    \"steps_per_print\": 2000,\n    \"train_batch_size\": \"auto\",\n    \"train_micro_batch_size_per_gpu\": \"auto\",\n    \"wall_clock_breakdown\": false\n  }"
  },
  {
    "path": "config/v100_config.json",
    "content": "{\n    \"fp16\": {\n      \"enabled\": true\n    },\n    \"bf16\": {\n      \"enabled\": false\n    },\n    \"optimizer\": {\n      \"type\": \"AdamW\",\n      \"params\": {\n        \"lr\": \"auto\",\n        \"betas\": \"auto\",\n        \"eps\": \"auto\",\n        \"weight_decay\": \"auto\"\n      }\n    },\n    \"scheduler\": {\n      \"type\": \"WarmupLR\",\n      \"params\": {\n        \"warmup_min_lr\": \"auto\",\n        \"warmup_max_lr\": \"auto\",\n        \"warmup_num_steps\": \"auto\"\n      }\n    },\n    \"zero_optimization\": {\n      \"stage\": 3,\n      \"overlap_comm\": true,\n      \"contiguous_gradients\": true,\n      \"sub_group_size\": 1e9,\n      \"reduce_bucket_size\": \"auto\",\n      \"stage3_prefetch_bucket_size\": \"auto\",\n      \"stage3_param_persistence_threshold\": \"auto\",\n      \"stage3_max_live_parameters\": 1e9,\n      \"stage3_max_reuse_distance\": 1e9,\n      \"stage3_gather_16bit_weights_on_model_save\": true,\n      \"offload_optimizer\": {\n        \"device\": \"cpu\",\n        \"pin_memory\": true\n      }\n    },\n    \"gradient_accumulation_steps\": \"auto\",\n    \"gradient_clipping\": \"auto\",\n    \"steps_per_print\": 2000,\n    \"train_batch_size\": \"auto\",\n    \"train_micro_batch_size_per_gpu\": \"auto\",\n    \"wall_clock_breakdown\": false\n  }"
  },
  {
    "path": "data/README.md",
    "content": "# Moved to Hugging Face!\n\nThe `databricks-dolly-15k` dataset is now \n[hosted on Hugging Face](https://huggingface.co/datasets/databricks/databricks-dolly-15k).\n\nPlease simply use `datasets` to load `databricks/databricks-dolly-15k`.\n\nThe data file is directly accessible at \nhttps://huggingface.co/datasets/databricks/databricks-dolly-15k/blob/main/databricks-dolly-15k.jsonl\n"
  },
  {
    "path": "examples/generation.py",
    "content": "# Databricks notebook source\n# MAGIC %md\n# MAGIC ## Generation Example\n# MAGIC\n# MAGIC This takes a pretrained Dolly model, either from Hugging Face or from a local path, and runs generation with it\n# MAGIC using the code from this repo.\n# MAGIC\n# MAGIC The model to load for generation is controlled by `input_model`.  The default options are the pretrained\n# MAGIC Dolly models shared on Hugging Face.  Alternatively, the path to a local model that has been trained using the\n# MAGIC `train_dolly` notebook can also be used.\n\n# COMMAND ----------\n\n# MAGIC %pip install -r ../requirements.txt\n\n# COMMAND ----------\n\n# MAGIC %load_ext autoreload\n# MAGIC %autoreload 2\n\ndefault_model = \"databricks/dolly-v2-3b\"\n\nsuggested_models = [\n    \"databricks/dolly-v1-6b\",\n    \"databricks/dolly-v2-3b\",\n    \"databricks/dolly-v2-7b\",\n    \"databricks/dolly-v2-12b\",\n]\n\ndbutils.widgets.combobox(\"input_model\", default_model, suggested_models, \"input_model\")\n\n# COMMAND ----------\n\nfrom training.generate import generate_response, load_model_tokenizer_for_generate\n\ninput_model = dbutils.widgets.get(\"input_model\")\n\nmodel, tokenizer = load_model_tokenizer_for_generate(input_model)\n\n# COMMAND ----------\n\n# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html\ninstructions = [\n    \"Explain to me the difference between nuclear fission and fusion.\",\n    \"Give me a list of 5 science fiction books I should read next.\",\n]\n\n# Use the model to generate responses for each of the instructions above.\nfor instruction in instructions:\n    response = generate_response(instruction, model=model, tokenizer=tokenizer)\n    if response:\n        print(f\"Instruction: {instruction}\\n\\n{response}\\n\\n-----------\\n\")\n"
  },
  {
    "path": "examples/langchain.py",
    "content": "# Databricks notebook source\n# MAGIC %md\n# MAGIC ## Langchain Example\n# MAGIC\n# MAGIC This takes a pretrained Dolly model, either from Hugging Face or from a local path, and uses langchain\n# MAGIC to run generation.\n# MAGIC\n# MAGIC The model to load for generation is controlled by `input_model`.  The default options are the pretrained\n# MAGIC Dolly models shared on Hugging Face.  Alternatively, the path to a local model that has been trained using the\n# MAGIC `train_dolly` notebook can also be used.\n\n# COMMAND ----------\n\n# MAGIC %pip install -r ../requirements.txt\n\n# COMMAND ----------\n\n# MAGIC %load_ext autoreload\n# MAGIC %autoreload 2\n\n# COMMAND ----------\n\ndefault_model = \"databricks/dolly-v2-3b\"\n\nsuggested_models = [\n    \"databricks/dolly-v1-6b\",\n    \"databricks/dolly-v2-3b\",\n    \"databricks/dolly-v2-7b\",\n    \"databricks/dolly-v2-12b\",\n]\n\ndbutils.widgets.combobox(\"input_model\", default_model, suggested_models, \"input_model\")\n\n# COMMAND ----------\n\nfrom training.generate import InstructionTextGenerationPipeline, load_model_tokenizer_for_generate\n\ninput_model = dbutils.widgets.get(\"input_model\")\n\nmodel, tokenizer = load_model_tokenizer_for_generate(input_model)\n\n# COMMAND ----------\n\nfrom langchain import PromptTemplate, LLMChain\nfrom langchain.llms import HuggingFacePipeline\n\n# template for an instruction with no input\nprompt = PromptTemplate(\n    input_variables=[\"instruction\"],\n    template=\"{instruction}\")\n\n# template for an instruction with input\nprompt_with_context = PromptTemplate(\n    input_variables=[\"instruction\", \"context\"],\n    template=\"{instruction}\\n\\nInput:\\n{context}\")\n\nhf_pipeline = HuggingFacePipeline(\n    pipeline=InstructionTextGenerationPipeline(\n        # Return the full text, because this is what the HuggingFacePipeline expects.\n        model=model, tokenizer=tokenizer, return_full_text=True, task=\"text-generation\"))\n\nllm_chain = 
LLMChain(llm=hf_pipeline, prompt=prompt)\nllm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)\n\n# COMMAND ----------\n\n# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html\ninstructions = [\n    \"Explain to me the difference between nuclear fission and fusion.\",\n    \"Give me a list of 5 science fiction books I should read next.\",\n]\n\n# Use the model to generate responses for each of the instructions above.\nfor instruction in instructions:\n    response = llm_chain.predict(instruction=instruction)\n    print(f\"Instruction: {instruction}\\n\\n{response}\\n\\n-----------\\n\")\n\n# COMMAND ----------\n\ncontext = (\n    \"\"\"George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman, \"\"\"\n    \"\"\"and Founding Father who served as the first president of the United States from 1789 to 1797. Appointed by \"\"\"\n    \"\"\"the Continental Congress as commander of the Continental Army, Washington led Patriot forces to victory in \"\"\"\n    \"\"\"the American Revolutionary War and served as president of the Constitutional Convention of 1787, which \"\"\"\n    \"\"\"created and ratified the Constitution of the United States and the American federal government. Washington \"\"\"\n    \"\"\"has been called the \"Father of his Country\" for his manifold leadership in the nation's founding.\"\"\"\n)\n\ninstruction = \"When did George Washinton serve as president of the Constitutional Convention?\"\n\nresponse = llm_context_chain.predict(instruction=instruction, context=context)\nprint(f\"Instruction: {instruction}\\n\\nContext:\\n{context}\\n\\nResponse:\\n{response}\\n\\n-----------\\n\")\n"
  },
  {
    "path": "examples/pipeline.py",
    "content": "# Databricks notebook source\n# MAGIC %md\n# MAGIC ## Pipeline Example\n# MAGIC\n# MAGIC This takes a pretrained Dolly model, either from Hugging face or from a local path, and uses the pipeline from\n# MAGIC this repo to perform generation.\n# MAGIC\n# MAGIC The model to load for generation is controlled by `input_model`.  The default options are the pretrained\n# MAGIC Dolly models shared on Hugging Face.  Alternatively, the path to a local model that has been trained using the\n# MAGIC `train_dolly` notebook can also be used.\n\n# COMMAND ----------\n\n# MAGIC %pip install -r ../requirements.txt\n\n# COMMAND ----------\n\n# MAGIC %load_ext autoreload\n# MAGIC %autoreload 2\n\ndefault_model = \"databricks/dolly-v2-3b\"\n\nsuggested_models = [\n    \"databricks/dolly-v1-6b\",\n    \"databricks/dolly-v2-3b\",\n    \"databricks/dolly-v2-7b\",\n    \"databricks/dolly-v2-12b\",\n]\n\ndbutils.widgets.combobox(\"input_model\", default_model, suggested_models, \"input_model\")\n\n# COMMAND ----------\n\nfrom training.generate import InstructionTextGenerationPipeline, load_model_tokenizer_for_generate\n\ninput_model = dbutils.widgets.get(\"input_model\")\n\nmodel, tokenizer = load_model_tokenizer_for_generate(input_model)\n\n# COMMAND ----------\n\ngeneration_pipeline = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)\n\n# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html\ninstructions = [\n    \"Explain to me the difference between nuclear fission and fusion.\",\n    \"Give me a list of 5 science fiction books I should read next.\",\n]\n\n# Use the model to generate responses for each of the instructions above.\nfor instruction in instructions:\n    results = generation_pipeline(instruction, num_return_sequences=2)\n\n    print(f\"Instruction: {instruction}\\n\")\n    for i, res in enumerate(results, 1):\n        text = res[\"generated_text\"]\n        print(f\"Sample 
#{i}:\\n{text}\\n\")\n    print(\"-----------\\n\")\n"
  },
  {
    "path": "pytest.ini",
    "content": "[pytest]\ntestpaths =\n  test\nlog_level = INFO"
  },
  {
    "path": "requirements.txt",
    "content": "accelerate>=0.16.0,<1\nclick>=8.0.4,<9\ndatasets>=2.10.0,<3\ndeepspeed>=0.8.3,<0.9\ntransformers[torch]>=4.28.1,<5\nlangchain>=0.0.139\ntorch>=1.13.1,<2\n"
  },
  {
    "path": "requirements_dev.txt",
    "content": "-r requirements.txt\npytest-cov==3.0.0\npytest==7.1.2\ntorchvision==0.14.1\n"
  },
  {
    "path": "run_pytest.sh",
    "content": "#!/bin/bash\n\nset -e\n\npython -m pytest \\\n--cov=dolly \\\n--cov-report=html \\\n--cov-report=term \\\n--cov-branch \\\n$@"
  },
  {
    "path": "test/__init__.py",
    "content": ""
  },
  {
    "path": "test/test_trainer.py",
    "content": "from training.trainer import load_tokenizer, load_training_dataset\n\ndef test_tokenizer():\n    \"\"\"Make sure we can encode and decode with the tokenizer\"\"\"\n    tokenizer = load_tokenizer()\n    assert tokenizer.decode(tokenizer.encode(\"Hello Dolly!\")) == \"Hello Dolly!\"\n\ndef test_load_training_dataset():\n    \"\"\"Make sure we can load the training dataset and it has records\"\"\"\n    dataset = load_training_dataset()\n    assert dataset.num_rows > 50_000\n"
  },
  {
    "path": "train_dolly.py",
    "content": "# Databricks notebook source\n# MAGIC %md\n# MAGIC ## Train Dolly\n# MAGIC\n# MAGIC This fine-tunes EleutherAI Pythia models\n# MAGIC (e.g. [pythia-2.8b](https://huggingface.co/EleutherAI/pythia-2.8b),\n# MAGIC [pythia-6.9b](https://huggingface.co/EleutherAI/pythia-6.9b), or\n# MAGIC [pythia-12b](https://huggingface.co/EleutherAI/pythia-12b)) on\n# MAGIC the [databricks-dolly-15k](https://github.com/databrickslabs/dolly/tree/master/data) dataset.\n# MAGIC\n# MAGIC ```\n# MAGIC   Licensed under the Apache License, Version 2.0 (the \"License\");\n# MAGIC   you may not use this file except in compliance with the License.\n# MAGIC   You may obtain a copy of the License at\n# MAGIC\n# MAGIC       http://www.apache.org/licenses/LICENSE-2.0\n# MAGIC\n# MAGIC   Unless required by applicable law or agreed to in writing, software\n# MAGIC   distributed under the License is distributed on an \"AS IS\" BASIS,\n# MAGIC   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# MAGIC   See the License for the specific language governing permissions and\n# MAGIC   limitations under the License.\n# MAGIC ```\n# MAGIC\n# MAGIC The EleutherAI Pythia models are [Apache 2.0 licensed](https://huggingface.co/EleutherAI/gpt-j-6B) and\n# MAGIC the [databricks-dolly-15k](https://github.com/databrickslabs/dolly/tree/master/data) is licensed under the terms\n# MAGIC of [Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/legalcode),\n# MAGIC which means it can be used for either academic or commercial purposes.\n\n# COMMAND ----------\n\n# MAGIC %md\n# MAGIC Install these additional NVIDIA libraries for Databricks Runtime 13.x+ ML:\n\n# COMMAND ----------\n\n# MAGIC !wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb -O /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \\\n# MAGIC   wget 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb -O /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \\\n# MAGIC   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb -O /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \\\n# MAGIC   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb -O /tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \\\n# MAGIC   dpkg -i /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \\\n# MAGIC   dpkg -i /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \\\n# MAGIC   dpkg -i /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \\\n# MAGIC   dpkg -i /tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb\n\n# COMMAND ----------\n\n# MAGIC %pip install -r requirements.txt\n\n# COMMAND ----------\n\n# MAGIC %load_ext autoreload\n# MAGIC %autoreload 2\n\n# COMMAND ----------\n\nimport logging\n\nlogging.basicConfig(\n    format=\"%(asctime)s %(levelname)s [%(name)s] %(message)s\", level=logging.INFO, datefmt=\"%Y-%m-%d %H:%M:%S\"\n)\nlogging.getLogger(\"py4j\").setLevel(logging.WARNING)\nlogging.getLogger(\"sh.command\").setLevel(logging.ERROR)\n\n# COMMAND ----------\n\nimport os\nimport re\nfrom datetime import datetime\nfrom training.consts import DEFAULT_INPUT_MODEL, SUGGESTED_INPUT_MODELS\nfrom training.trainer import load_training_dataset, load_tokenizer\n\ndbutils.widgets.combobox(\"input_model\", DEFAULT_INPUT_MODEL, SUGGESTED_INPUT_MODELS, \"input_model\")\ndbutils.widgets.text(\"num_gpus\", \"\", \"num_gpus\")\ndbutils.widgets.text(\"local_training_root\", \"\", \"local_training_root\")\ndbutils.widgets.text(\"dbfs_output_root\", \"\", \"dbfs_output_root\")\ndbutils.widgets.text(\"experiment_id\", \"\", \"experiment_id\")\ndbutils.widgets.combobox(\"gpu_family\", \"a100\", [\"v100\", \"a10\", \"a100\"])\n\n# COMMAND ----------\n\ntimestamp 
= datetime.now().strftime(\"%Y-%m-%dT%H:%M:%S\")\nmodel_name = \"dolly\"\n\nexperiment_id = dbutils.widgets.get(\"experiment_id\")\ninput_model = dbutils.widgets.get(\"input_model\")\n\nif experiment_id:\n    experiment_id = re.sub(r\"\\s+\", \"_\", experiment_id.strip())\n    model_name = f\"{model_name}__{experiment_id}\"\n\ncheckpoint_dir_name = f\"{model_name}__{timestamp}\"\n\ndolly_training_dir_name = \"dolly_training\"\n\n# Use the local training root path if it was provided.  Otherwise try to find a sensible default.\nlocal_training_root = dbutils.widgets.get(\"local_training_root\")\nif not local_training_root:\n    # Use preferred path when working in a Databricks cluster if it exists.\n    if os.path.exists(\"/local_disk0\"):\n        local_training_root = os.path.join(\"/local_disk0\", dolly_training_dir_name)\n    # Otherwise use the home directory.\n    else:\n        local_training_root = os.path.join(os.path.expanduser('~'), dolly_training_dir_name)\n\ndbfs_output_root = dbutils.widgets.get(\"dbfs_output_root\")\nif not dbfs_output_root:\n    dbfs_output_root = f\"/dbfs/{dolly_training_dir_name}\"\n\nos.makedirs(local_training_root, exist_ok=True)\nos.makedirs(dbfs_output_root, exist_ok=True)\n\nlocal_output_dir = os.path.join(local_training_root, checkpoint_dir_name)\ndbfs_output_dir = os.path.join(dbfs_output_root, checkpoint_dir_name)\ntensorboard_display_dir = f\"{local_output_dir}/runs\"\n\nprint(f\"Local Output Dir: {local_output_dir}\")\nprint(f\"DBFS Output Dir: {dbfs_output_dir}\")\nprint(f\"Tensorboard Display Dir: {tensorboard_display_dir}\")\n\n# pick an appropriate config file\ngpu_family = dbutils.widgets.get(\"gpu_family\")\nconfig_file_name = f\"{gpu_family}_config.json\"\ndeepspeed_config = os.path.join(os.getcwd(), \"config\", config_file_name)\nprint(f\"Deepspeed config file: {deepspeed_config}\")\n\n# configure the batch_size\nbatch_size = 3\nif gpu_family == \"a10\":\n    batch_size = 4\nelif gpu_family == \"a100\":\n    
batch_size = 6\n\n# configure num_gpus, if specified\nnum_gpus_flag = \"\"\nnum_gpus = dbutils.widgets.get(\"num_gpus\")\nif num_gpus:\n    num_gpus = int(num_gpus)\n    num_gpus_flag = f\"--num_gpus={num_gpus}\"\n\nif gpu_family == \"v100\":\n    bf16_flag = \"--bf16 false\"\nelse:\n    bf16_flag = \"--bf16 true\"\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n\n# COMMAND ----------\n\n# MAGIC %load_ext tensorboard\n# MAGIC %tensorboard --logdir '{tensorboard_display_dir}'\n\n# COMMAND ----------\n\n!deepspeed {num_gpus_flag} \\\n    --module training.trainer \\\n    --input-model {input_model} \\\n    --deepspeed {deepspeed_config} \\\n    --epochs 2 \\\n    --local-output-dir {local_output_dir} \\\n    --dbfs-output-dir {dbfs_output_dir} \\\n    --per-device-train-batch-size {batch_size} \\\n    --per-device-eval-batch-size {batch_size} \\\n    --logging-steps 10 \\\n    --save-steps 200 \\\n    --save-total-limit 20 \\\n    --eval-steps 50 \\\n    --warmup-steps 50 \\\n    --test-size 200 \\\n    --lr 5e-6 \\\n    {bf16_flag}\n\n# COMMAND ----------\n\nfrom training.generate import generate_response, load_model_tokenizer_for_generate\n\nmodel, tokenizer = load_model_tokenizer_for_generate(dbfs_output_dir)\n\n# COMMAND ----------\n\n# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html\ninstructions = [\n    \"Write a love letter to Edgar Allan Poe.\",\n    \"Write a tweet announcing Dolly, a large language model from Databricks.\",\n    \"I'm selling my Nikon D-750, write a short blurb for my ad.\",\n    \"Explain to me the difference between nuclear fission and fusion.\",\n    \"Give me a list of 5 science fiction books I should read next.\",\n]\n\n# set some additional pipeline args\npipeline_kwargs = {'torch_dtype': \"auto\"}\nif gpu_family == \"v100\":\n    pipeline_kwargs['torch_dtype'] = \"float16\"\nelif gpu_family == \"a10\" or gpu_family == \"a100\":\n    
pipeline_kwargs['torch_dtype'] = \"bfloat16\"\n\n# Use the model to generate responses for each of the instructions above.\nfor instruction in instructions:\n    response = generate_response(instruction, model=model, tokenizer=tokenizer, **pipeline_kwargs)\n    if response:\n        print(f\"Instruction: {instruction}\\n\\n{response}\\n\\n-----------\\n\")\n\n# COMMAND ----------\n\n\n"
  },
  {
    "path": "training/__init__.py",
    "content": ""
  },
  {
    "path": "training/consts.py",
    "content": "DEFAULT_INPUT_MODEL = \"EleutherAI/pythia-6.9b\"\nSUGGESTED_INPUT_MODELS = [\n    \"EleutherAI/pythia-2.8b\",\n    \"EleutherAI/pythia-6.9b\",\n    \"EleutherAI/pythia-12b\",\n    \"EleutherAI/gpt-j-6B\",\n    \"databricks/dolly-v2-3b\",\n    \"databricks/dolly-v2-7b\",\n    \"databricks/dolly-v2-12b\"\n]\nDEFAULT_TRAINING_DATASET = \"databricks/databricks-dolly-15k\"\nINTRO_BLURB = (\n    \"Below is an instruction that describes a task. Write a response that appropriately completes the request.\"\n)\nINSTRUCTION_KEY = \"### Instruction:\"\nINPUT_KEY = \"Input:\"\nRESPONSE_KEY = \"### Response:\"\nEND_KEY = \"### End\"\nRESPONSE_KEY_NL = f\"{RESPONSE_KEY}\\n\"\nDEFAULT_SEED = 42\n\n# This is a training prompt that does not contain an input string.  The instruction by itself has enough information\n# to respond.  For example, the instruction might ask for the year a historic figure was born.\nPROMPT_NO_INPUT_FORMAT = \"\"\"{intro}\n\n{instruction_key}\n{instruction}\n\n{response_key}\n{response}\n\n{end_key}\"\"\".format(\n    intro=INTRO_BLURB,\n    instruction_key=INSTRUCTION_KEY,\n    instruction=\"{instruction}\",\n    response_key=RESPONSE_KEY,\n    response=\"{response}\",\n    end_key=END_KEY,\n)\n\n# This is a training prompt that contains an input string that serves as context for the instruction.  For example,\n# the input might be a passage from Wikipedia and the intruction is to extract some information from it.\nPROMPT_WITH_INPUT_FORMAT = \"\"\"{intro}\n\n{instruction_key}\n{instruction}\n\n{input_key}\n{input}\n\n{response_key}\n{response}\n\n{end_key}\"\"\".format(\n    intro=INTRO_BLURB,\n    instruction_key=INSTRUCTION_KEY,\n    instruction=\"{instruction}\",\n    input_key=INPUT_KEY,\n    input=\"{input}\",\n    response_key=RESPONSE_KEY,\n    response=\"{response}\",\n    end_key=END_KEY,\n)\n\n# This is the prompt that is used for generating responses using an already trained model.  
It ends with the response\n# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).\nPROMPT_FOR_GENERATION_FORMAT = \"\"\"{intro}\n\n{instruction_key}\n{instruction}\n\n{response_key}\n\"\"\".format(\n    intro=INTRO_BLURB,\n    instruction_key=INSTRUCTION_KEY,\n    instruction=\"{instruction}\",\n    response_key=RESPONSE_KEY,\n)\n"
  },
  {
    "path": "training/generate.py",
    "content": "import logging\nimport re\nfrom typing import List, Tuple\nimport torch\n\nimport numpy as np\nfrom transformers import (\n    AutoModelForCausalLM,\n    AutoTokenizer,\n    Pipeline,\n    PreTrainedModel,\n    PreTrainedTokenizer,\n)\n\nfrom transformers.utils import is_tf_available\n\nif is_tf_available():\n    import tensorflow as tf\n\nfrom .consts import END_KEY, PROMPT_FOR_GENERATION_FORMAT, RESPONSE_KEY\n\nlogger = logging.getLogger(__name__)\n\n\ndef load_model_tokenizer_for_generate(\n    pretrained_model_name_or_path: str,\n) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:\n    \"\"\"Loads the model and tokenizer so that it can be used for generating responses.\n\n    Args:\n        pretrained_model_name_or_path (str): name or path for model\n\n    Returns:\n        Tuple[PreTrainedModel, PreTrainedTokenizer]: model and tokenizer\n    \"\"\"\n    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, padding_side=\"left\")\n    model = AutoModelForCausalLM.from_pretrained(\n        pretrained_model_name_or_path, device_map=\"auto\", torch_dtype=torch.bfloat16, trust_remote_code=True\n    )\n    return model, tokenizer\n\n\ndef get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:\n    \"\"\"Gets the token ID for a given string that has been added to the tokenizer as a special token.\n\n    When training, we configure the tokenizer so that the sequences like \"### Instruction:\" and \"### End\" are\n    treated specially and converted to a single, new token.  
This retrieves the token ID each of these keys map to.\n\n    Args:\n        tokenizer (PreTrainedTokenizer): the tokenizer\n        key (str): the key to convert to a single token\n\n    Raises:\n        ValueError: if more than one ID was generated\n\n    Returns:\n        int: the token ID for the given key\n    \"\"\"\n    token_ids = tokenizer.encode(key)\n    if len(token_ids) > 1:\n        raise ValueError(f\"Expected only a single token for '{key}' but found {token_ids}\")\n    return token_ids[0]\n\n\nclass InstructionTextGenerationPipeline(Pipeline):\n    def __init__(\n        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs\n    ):\n        \"\"\"Initialize the pipeline\n\n        Args:\n            do_sample (bool, optional): Whether or not to use sampling. Defaults to True.\n            max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 256.\n            top_p (float, optional): If set to float < 1, only the smallest set of most probable tokens with\n                probabilities that add up to top_p or higher are kept for generation. Defaults to 0.92.\n            top_k (int, optional): The number of highest probability vocabulary tokens to keep for top-k-filtering.\n                Defaults to 0.\n        \"\"\"\n        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k,\n                         **kwargs)\n\n    def _sanitize_parameters(self,\n                             return_full_text: bool = None,\n                             **generate_kwargs):\n        preprocess_params = {}\n\n        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may\n        # append a newline to yield a single token.  
find whatever token is configured for the response key.\n        tokenizer_response_key = next(\n            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None\n        )\n\n        response_key_token_id = None\n        end_key_token_id = None\n        if tokenizer_response_key:\n            try:\n                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)\n                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)\n\n                # Ensure generation stops once it generates \"### End\"\n                generate_kwargs[\"eos_token_id\"] = end_key_token_id\n            except ValueError:\n                pass\n\n        forward_params = generate_kwargs\n        postprocess_params = {\n            \"response_key_token_id\": response_key_token_id,\n            \"end_key_token_id\": end_key_token_id\n        }\n\n        if return_full_text is not None:\n            postprocess_params[\"return_full_text\"] = return_full_text\n\n        return preprocess_params, forward_params, postprocess_params\n\n    def preprocess(self, instruction_text, **generate_kwargs):\n        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)\n        inputs = self.tokenizer(\n            prompt_text,\n            return_tensors=\"pt\",\n        )\n        inputs[\"prompt_text\"] = prompt_text\n        inputs[\"instruction_text\"] = instruction_text\n        return inputs\n\n    def _forward(self, model_inputs, **generate_kwargs):\n        input_ids = model_inputs[\"input_ids\"]\n        attention_mask = model_inputs.get(\"attention_mask\", None)\n\n        if input_ids.shape[1] == 0:\n            input_ids = None\n            attention_mask = None\n            in_b = 1\n        else:\n            in_b = input_ids.shape[0]\n\n        generated_sequence = self.model.generate(\n            input_ids=input_ids.to(self.model.device),\n            
attention_mask=attention_mask.to(self.model.device),\n            pad_token_id=self.tokenizer.pad_token_id,\n            **generate_kwargs,\n        )\n\n        out_b = generated_sequence.shape[0]\n        if self.framework == \"pt\":\n            generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])\n        elif self.framework == \"tf\":\n            generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))\n\n        instruction_text = model_inputs.pop(\"instruction_text\")\n        return {\"generated_sequence\": generated_sequence, \"input_ids\": input_ids, \"instruction_text\": instruction_text}\n\n    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_full_text: bool = False):\n\n        generated_sequence = model_outputs[\"generated_sequence\"][0]\n        instruction_text = model_outputs[\"instruction_text\"]\n\n        generated_sequence: List[List[int]] = generated_sequence.numpy().tolist()\n        records = []\n        for sequence in generated_sequence:\n\n            # The response will be set to this variable if we can identify it.\n            decoded = None\n\n            # If we have token IDs for the response and end, then we can find the tokens and only decode between them.\n            if response_key_token_id and end_key_token_id:\n                # Find where \"### Response:\" is first found in the generated tokens.  Considering this is part of the\n                # prompt, we should definitely find it.  
We will return the tokens found after this token.\n                try:\n                    response_pos = sequence.index(response_key_token_id)\n                except ValueError:\n                    logger.warn(f\"Could not find response key {response_key_token_id} in: {sequence}\")\n                    response_pos = None\n\n                if response_pos:\n                    # Next find where \"### End\" is located.  The model has been trained to end its responses with this\n                    # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find\n                    # this token, as the response could be truncated.  If we don't find it then just return everything\n                    # to the end.  Note that even though we set eos_token_id, we still see this token at the end.\n                    try:\n                        end_pos = sequence.index(end_key_token_id)\n                    except ValueError:\n                        end_pos = None\n\n                    decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()\n\n            if not decoded:\n                # Otherwise we'll decode everything and use a regex to find the response and end.\n\n                fully_decoded = self.tokenizer.decode(sequence)\n\n                # The response appears after \"### Response:\".  The model has been trained to append \"### End\" at the\n                # end.\n                m = re.search(r\"#+\\s*Response:\\s*(.+?)#+\\s*End\", fully_decoded, flags=re.DOTALL)\n\n                if m:\n                    decoded = m.group(1).strip()\n                else:\n                    # The model might not generate the \"### End\" sequence before reaching the max tokens.  
In this case,\n                    # return everything after \"### Response:\".\n                    m = re.search(r\"#+\\s*Response:\\s*(.+)\", fully_decoded, flags=re.DOTALL)\n                    if m:\n                        decoded = m.group(1).strip()\n                    else:\n                        logger.warn(f\"Failed to find response in:\\n{fully_decoded}\")\n\n            # If the full text is requested, then append the decoded text to the original instruction.\n            # This technically isn't the full text, as we format the instruction in the prompt the model has been\n            # trained on, but to the client it will appear to be the full text.\n            if return_full_text:\n                decoded = f\"{instruction_text}\\n{decoded}\"\n\n            rec = {\"generated_text\": decoded}\n\n            records.append(rec)\n\n        return records\n\n\ndef generate_response(\n    instruction: str,\n    *,\n    model: PreTrainedModel,\n    tokenizer: PreTrainedTokenizer,\n    **kwargs,\n) -> str:\n    \"\"\"Given an instruction, uses the model and tokenizer to generate a response.  This formats the instruction in\n    the instruction format that the model was fine-tuned on.\n\n    Args:\n        instruction (str): _description_\n        model (PreTrainedModel): the model to use\n        tokenizer (PreTrainedTokenizer): the tokenizer to use\n\n    Returns:\n        str: response\n    \"\"\"\n\n    generation_pipeline = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer, **kwargs)\n    return generation_pipeline(instruction)[0][\"generated_text\"]\n"
  },
  {
    "path": "training/trainer.py",
    "content": "# Copyright 2023 Databricks, Inc.\n\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n\n#     http://www.apache.org/licenses/LICENSE-2.0\n\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nfrom functools import partial\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Tuple, Union\n\nimport click\nimport numpy as np\nfrom datasets import Dataset, load_dataset\nfrom transformers import (\n    AutoModelForCausalLM,\n    AutoTokenizer,\n    DataCollatorForLanguageModeling,\n    PreTrainedTokenizer,\n    Trainer,\n    TrainingArguments,\n    set_seed,\n)\n\nfrom .consts import (\n    DEFAULT_INPUT_MODEL,\n    DEFAULT_SEED,\n    PROMPT_WITH_INPUT_FORMAT,\n    PROMPT_NO_INPUT_FORMAT,\n    END_KEY,\n    INSTRUCTION_KEY,\n    RESPONSE_KEY_NL,\n    DEFAULT_TRAINING_DATASET,\n)\n\nlogger = logging.getLogger(__name__)\nROOT_PATH = Path(__file__).parent.parent\n\n\nclass DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):\n    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:\n        batch = super().torch_call(examples)\n\n        # The prompt ends with the response key plus a newline.  We encode this and then try to find it in the\n        # sequence of tokens.  
This should just be a single token.\n        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)\n\n        labels = batch[\"labels\"].clone()\n\n        for i in range(len(examples)):\n\n            response_token_ids_start_idx = None\n            for idx in np.where(batch[\"labels\"][i] == response_token_ids[0])[0]:\n                response_token_ids_start_idx = idx\n                break\n\n            if response_token_ids_start_idx is None:\n                raise RuntimeError(\n                    f'Could not find response key {response_token_ids} in token IDs {batch[\"labels\"][i]}'\n                )\n\n            response_token_ids_end_idx = response_token_ids_start_idx + 1\n\n            # Make pytorch loss function ignore all tokens up through the end of the response key\n            labels[i, :response_token_ids_end_idx] = -100\n\n        batch[\"labels\"] = labels\n\n        return batch\n\n\ndef preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int) -> dict:\n    return tokenizer(\n        batch[\"text\"],\n        max_length=max_length,\n        truncation=True,\n    )\n\n\ndef load_training_dataset(path_or_dataset: str = DEFAULT_TRAINING_DATASET) -> Dataset:\n    logger.info(f\"Loading dataset from {path_or_dataset}\")\n    dataset = load_dataset(path_or_dataset)[\"train\"]\n    logger.info(\"Found %d rows\", dataset.num_rows)\n\n    def _add_text(rec):\n        instruction = rec[\"instruction\"]\n        response = rec[\"response\"]\n        context = rec.get(\"context\")\n\n        if not instruction:\n            raise ValueError(f\"Expected an instruction in: {rec}\")\n\n        if not response:\n            raise ValueError(f\"Expected a response in: {rec}\")\n\n        # For some instructions there is an input that goes along with the instruction, providing context for the\n        # instruction.  
For example, the input might be a passage from Wikipedia and the instruction says to extract\n        # some piece of information from it.  The response is that information to extract.  In other cases there is\n        # no input.  For example, the instruction might be open QA such as asking what year some historic figure was\n        # born.\n        if context:\n            rec[\"text\"] = PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, response=response, input=context)\n        else:\n            rec[\"text\"] = PROMPT_NO_INPUT_FORMAT.format(instruction=instruction, response=response)\n        return rec\n\n    dataset = dataset.map(_add_text)\n\n    return dataset\n\n\ndef load_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL) -> PreTrainedTokenizer:\n    logger.info(f\"Loading tokenizer for {pretrained_model_name_or_path}\")\n    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)\n    tokenizer.pad_token = tokenizer.eos_token\n    tokenizer.add_special_tokens({\"additional_special_tokens\": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]})\n    return tokenizer\n\n\ndef load_model(\n    pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False\n) -> AutoModelForCausalLM:\n    logger.info(f\"Loading model for {pretrained_model_name_or_path}\")\n    model = AutoModelForCausalLM.from_pretrained(\n        pretrained_model_name_or_path, trust_remote_code=True, use_cache=False if gradient_checkpointing else True\n    )\n    return model\n\n\ndef get_model_tokenizer(\n    pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False\n) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]:\n    tokenizer = load_tokenizer(pretrained_model_name_or_path)\n    model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)\n    model.resize_token_embeddings(len(tokenizer))\n\n    return model, tokenizer\n\n\ndef 
preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed=DEFAULT_SEED, training_dataset: str = DEFAULT_TRAINING_DATASET) -> Dataset:\n    \"\"\"Loads the training dataset and tokenizes it so it is ready for training.\n\n    Args:\n        tokenizer (AutoTokenizer): Tokenizer tied to the model.\n        max_length (int): Maximum number of tokens to emit from tokenizer.\n\n    Returns:\n        Dataset: HuggingFace dataset\n    \"\"\"\n\n    dataset = load_training_dataset(training_dataset)\n\n    logger.info(\"Preprocessing dataset\")\n    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)\n    dataset = dataset.map(\n        _preprocessing_function,\n        batched=True,\n        remove_columns=[\"instruction\", \"context\", \"response\", \"text\", \"category\"],\n    )\n\n    # Make sure we don't have any truncated records, as this would mean the end keyword is missing.\n    logger.info(\"Processed dataset has %d rows\", dataset.num_rows)\n    dataset = dataset.filter(lambda rec: len(rec[\"input_ids\"]) < max_length)\n    logger.info(\"Processed dataset has %d rows after filtering for truncated records\", dataset.num_rows)\n\n    logger.info(\"Shuffling dataset\")\n    dataset = dataset.shuffle(seed=seed)\n\n    logger.info(\"Done preprocessing\")\n\n    return dataset\n\n\ndef train(\n    *,\n    input_model: str,\n    local_output_dir: str,\n    dbfs_output_dir: str,\n    epochs: int,\n    per_device_train_batch_size: int,\n    per_device_eval_batch_size: int,\n    lr: float,\n    seed: int,\n    deepspeed: str,\n    gradient_checkpointing: bool,\n    local_rank: str,\n    bf16: bool,\n    logging_steps: int,\n    save_steps: int,\n    eval_steps: int,\n    test_size: Union[float, int],\n    save_total_limit: int,\n    warmup_steps: int,\n    training_dataset: str = DEFAULT_TRAINING_DATASET,\n):\n    set_seed(seed)\n\n    model, tokenizer = get_model_tokenizer(\n        
pretrained_model_name_or_path=input_model, gradient_checkpointing=gradient_checkpointing\n    )\n\n    # Use the same max length that the model supports.  Fall back to 1024 if the setting can't be found.\n    # The configuration for the length can be stored under different names depending on the model.  Here we attempt\n    # a few possible names we've encountered.\n    conf = model.config\n    max_length = None\n    for length_setting in [\"n_positions\", \"max_position_embeddings\", \"seq_length\"]:\n        max_length = getattr(model.config, length_setting, None)\n        if max_length:\n            logger.info(f\"Found max lenth: {max_length}\")\n            break\n    if not max_length:\n        max_length = 1024\n        logger.info(f\"Using default max length: {max_length}\")\n\n    processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed, training_dataset=training_dataset)\n\n    split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)\n\n    logger.info(\"Train data size: %d\", split_dataset[\"train\"].num_rows)\n    logger.info(\"Test data size: %d\", split_dataset[\"test\"].num_rows)\n\n    data_collator = DataCollatorForCompletionOnlyLM(\n        tokenizer=tokenizer, mlm=False, return_tensors=\"pt\", pad_to_multiple_of=8\n    )\n\n    # enable fp16 if not bf16\n    fp16 = not bf16\n\n    if not dbfs_output_dir:\n        logger.warn(\"Will NOT save to DBFS\")\n\n    training_args = TrainingArguments(\n        output_dir=local_output_dir,\n        per_device_train_batch_size=per_device_train_batch_size,\n        per_device_eval_batch_size=per_device_eval_batch_size,\n        fp16=fp16,\n        bf16=bf16,\n        learning_rate=lr,\n        num_train_epochs=epochs,\n        deepspeed=deepspeed,\n        gradient_checkpointing=gradient_checkpointing,\n        logging_dir=f\"{local_output_dir}/runs\",\n        logging_strategy=\"steps\",\n        logging_steps=logging_steps,\n        
evaluation_strategy=\"steps\",\n        eval_steps=eval_steps,\n        save_strategy=\"steps\",\n        save_steps=save_steps,\n        save_total_limit=save_total_limit,\n        load_best_model_at_end=False,\n        report_to=\"tensorboard\",\n        disable_tqdm=True,\n        remove_unused_columns=False,\n        local_rank=local_rank,\n        warmup_steps=warmup_steps,\n    )\n\n    logger.info(\"Instantiating Trainer\")\n\n    trainer = Trainer(\n        model=model,\n        tokenizer=tokenizer,\n        args=training_args,\n        train_dataset=split_dataset[\"train\"],\n        eval_dataset=split_dataset[\"test\"],\n        data_collator=data_collator,\n    )\n\n    logger.info(\"Training\")\n    trainer.train()\n\n    logger.info(f\"Saving Model to {local_output_dir}\")\n    trainer.save_model(output_dir=local_output_dir)\n\n    if dbfs_output_dir:\n        logger.info(f\"Saving Model to {dbfs_output_dir}\")\n        trainer.save_model(output_dir=dbfs_output_dir)\n\n    logger.info(\"Done.\")\n\n\n@click.command()\n@click.option(\"--input-model\", type=str, help=\"Input model to fine tune\", default=DEFAULT_INPUT_MODEL)\n@click.option(\"--local-output-dir\", type=str, help=\"Write directly to this local path\", required=True)\n@click.option(\"--dbfs-output-dir\", type=str, help=\"Sync data to this path on DBFS\")\n@click.option(\"--epochs\", type=int, default=3, help=\"Number of epochs to train for.\")\n@click.option(\"--per-device-train-batch-size\", type=int, default=8, help=\"Batch size to use for training.\")\n@click.option(\"--per-device-eval-batch-size\", type=int, default=8, help=\"Batch size to use for evaluation.\")\n@click.option(\n    \"--test-size\", type=int, default=1000, help=\"Number of test records for evaluation, or ratio of test records.\"\n)\n@click.option(\"--warmup-steps\", type=int, default=None, help=\"Number of steps to warm up to learning rate\")\n@click.option(\"--logging-steps\", type=int, default=10, help=\"How often to 
log\")\n@click.option(\"--eval-steps\", type=int, default=50, help=\"How often to run evaluation on test records\")\n@click.option(\"--save-steps\", type=int, default=400, help=\"How often to checkpoint the model\")\n@click.option(\"--save-total-limit\", type=int, default=10, help=\"Maximum number of checkpoints to keep on disk\")\n@click.option(\"--lr\", type=float, default=1e-5, help=\"Learning rate to use for training.\")\n@click.option(\"--seed\", type=int, default=DEFAULT_SEED, help=\"Seed to use for training.\")\n@click.option(\"--deepspeed\", type=str, default=None, help=\"Path to deepspeed config file.\")\n@click.option(\"--training-dataset\", type=str, default=DEFAULT_TRAINING_DATASET, help=\"Path to dataset for training\")\n@click.option(\n    \"--gradient-checkpointing/--no-gradient-checkpointing\",\n    is_flag=True,\n    default=True,\n    help=\"Use gradient checkpointing?\",\n)\n@click.option(\n    \"--local_rank\",\n    type=str,\n    default=True,\n    help=\"Provided by deepspeed to identify which instance this process is when performing multi-GPU training.\",\n)\n@click.option(\"--bf16\", type=bool, default=None, help=\"Whether to use bf16 (preferred on A100's).\")\ndef main(**kwargs):\n    train(**kwargs)\n\n\nif __name__ == \"__main__\":\n    logging.basicConfig(\n        format=\"%(asctime)s %(levelname)s [%(name)s] %(message)s\", level=logging.INFO, datefmt=\"%Y-%m-%d %H:%M:%S\"\n    )\n    try:\n        main()\n    except Exception:\n        logger.exception(\"main failed\")\n        raise\n"
  }
]