[
  {
    "path": "AGENTS.md",
    "content": "# Agent Instructions\n\nThis repository contains a single Python script for reading Kindle clippings. When modifying or adding code, keep the following points in mind:\n\n- Follow PEP8 style conventions for all Python code.\n- Maintain compatibility with both Python 2 and Python 3.\n- Before committing, run `python -m py_compile $(git ls-files '*.py')` to ensure there are no syntax errors.\n- Keep documentation in Markdown format.\n- Write concise commit messages summarizing your changes.\n\n"
  },
  {
    "path": "KindleClippings.py",
    "content": "from __future__ import print_function\nimport re\nimport io\nimport os\nimport argparse\nfrom fpdf import FPDF\nimport docx\n\ndef remove_chars(s, end_directory=\"\"):\n    \"\"\"\n    This is a utility function that removes special characters from the string, so that it can\n    become a valid filename.\n    :param s: input string\n    :return: the input string, stripped of special characters\n    \"\"\"\n    # Replace colons with a hyphen so \"A: B\" becomes \"A - B\"\n    s = re.sub(\" *: *\", \" - \", s)\n    # Remove question marks or ampersands\n    s = s.replace(\"?\", \"\").replace(\"&\", \"and\")\n    # Replace ( ) with a hyphen so \"this (text)\" becomes \"this - text\"\n    s = re.sub(r\"\\((.+?)\\)\", r\"- \\1\", s)\n    # Delete filename chars tht are not alphanumeric or ; , _ -\n    s = re.sub(r\"[^a-zA-Z\\d\\s\\w;,_-]+\", \"\", s)\n    # Trim off anything that isn't a word at the start & end\n    s = re.sub(r\"^\\W+|\\W+$\", \"\", s)\n\n    max_length = 245 - len(end_directory)  # max file size limited to 255.\n    s = s[:max_length]\n    return s\n\n\ndef insert_line_break_in_pdf(pdf_file: FPDF, num_breaks: int = 1) -> FPDF:\n    \"\"\"\n    Inserts a line break in a pdf for num_breaks times\n    \"\"\"\n    while num_breaks != 0:\n        pdf_file.multi_cell(0, 5, \"\", 0)\n        num_breaks -= 1\n\n    return pdf_file\n\ndef insert_bar_separator_in_pdf(pdf_file: FPDF):\n    \"\"\"\n    Inserts a bar separator in a pdf, useful to separate highlights\n    \"\"\"\n    pdf_file = insert_line_break_in_pdf(pdf_file)\n    pdf_file.set_draw_color(191, 191, 191)\n    pdf_file.line(40, pdf_file.y, 150, pdf_file.y)\n    pdf_file = insert_line_break_in_pdf(pdf_file)\n\n    return pdf_file\n\n\ndef prepare_pdf_document(highlights: str, include_clip_meta = False, title: str = \"Your Notes And Highlights\") -> FPDF:\n    \"\"\"\n    Will create pdf document from the notes\n\n    :param highlights:\n    :return: FPDF\n    \"\"\"\n    pdf_file = 
FPDF()\n    pdf_file.add_page()\n    pdf_file.add_font(\"lisboa\", '', 'media/Lisboa.ttf', uni=True)\n    pdf_file.set_font(\"lisboa\", '', 22)\n    pdf_file.set_margins(25, 40, 25)\n    pdf_file = insert_line_break_in_pdf(pdf_file, 3)\n    pdf_file.multi_cell(0, 5, title, align=\"C\")\n    pdf_file = insert_line_break_in_pdf(pdf_file, 2)\n    \n    meta_regex_pattern = r\"(Your.*\\| Added on)\"\n    for highlight_line in highlights:\n        # create multi-cell pdf object and add text to it\n        if re.search(meta_regex_pattern, highlight_line):\n            pdf_file.set_font(\"lisboa\", '', 11)\n            pdf_file.set_text_color(77, 77, 77)\n            pdf_file.multi_cell(0, 5, highlight_line, 0)\n            pdf_file = insert_bar_separator_in_pdf(pdf_file)\n        elif len(highlight_line) < 10:\n            if not include_clip_meta and highlight_line == \"...\":\n                pdf_file = insert_bar_separator_in_pdf(pdf_file)\n            else:\n                continue\n        else:\n            pdf_file.set_font(\"lisboa\", '', 15)\n            pdf_file.set_text_color(0, 0, 0)\n            pdf_file.multi_cell(0, 5, highlight_line, 0)\n\n    return pdf_file\n\n\ndef convert_to_format(path, file_name, format, include_clip_meta=False):\n    \"\"\"\n    Will get text file and will convert to specified output\n\n    :param path:\n    :param file_name:\n    :param format:\n    :return: name of the file created\n    \"\"\"\n    output_file_name = file_name[0:-4] + \".\" + format\n    txt_path = os.path.join(path, file_name)\n    with open(txt_path, \"r+\", encoding=\"utf8\") as txt_file:\n\n        paragraph = txt_file.read().split(\"\\n\")\n        if format == \"pdf\":\n            pdf_file = prepare_pdf_document(paragraph, include_clip_meta, file_name[:-4])\n            pdf_path = os.path.join(path, output_file_name)\n            pdf_file.output(pdf_path)\n\n        elif format == \"docx\":\n            docx_file = docx.Document()\n            
docx_file.add_heading(file_name[0:-4], 0)\n\n            for para in paragraph:\n                # add a paragraph and store the object in a variable\n                docx_file.add_paragraph(para)\n            docx_path = os.path.join(path, output_file_name)\n            docx_file.save(docx_path)\n\n    return output_file_name\n\n\ndef create_file_by_type(end_directory, format, include_clip_meta=False):\n    \"\"\"\n    Will iterate over all text files and will convert and create file with specified format\n    Currently Only pdf and docx are supported\n\n    :param end_directory:\n    :param format:\n    :return: list of output filenames\n    \"\"\"\n    output_files = []\n\n    # get files in end_directory\n    files = [\n        f for f in os.listdir(end_directory)\n        if os.path.isfile(os.path.join(end_directory, f))\n    ]\n\n    for file in files:\n        if file[-3:] == \"txt\":\n            output_files.append(\n                convert_to_format(end_directory, file, format, include_clip_meta)\n            )\n\n    return output_files\n\n\ndef parse_clippings(source_file, end_directory, encoding=\"utf-8\", format=\"txt\", include_clip_meta=False):\n    \"\"\"\n    Each clipping always consists of 5 lines:\n    - title line\n    - clipping info/metadata\n    - a blank line\n    - clipping text\n    - a divider made up of equals signs\n    Thus we can parse the clippings, and organise them by book.\n\n    :param end_directory: the output directory where all of organised highlights will go\n    :type end_directory: str\n    :return: organises kindle highlights by book .\n    :param format: output file format. 
Only \"pdf\" or \"docx\" create additional files.\n                    Any other value is treated as \"txt\".\n    \"\"\"\n\n    # Check that the source file (on the kindle) exists\n    if not os.path.isfile(source_file):\n        raise IOError(\"ERROR: cannot find \" + source_file)\n\n    # Create the output directory if it doesn't exist\n    if not os.path.exists(end_directory):\n        os.makedirs(end_directory, exist_ok=True)\n\n    # This will keep track of the titles that we have already processed\n    output_files = set()\n    title = \"\"\n\n    # Open clippings textfile and read data in lines\n    with io.open(source_file, \"r\", encoding=encoding, errors=\"ignore\") as f:\n        # Individual highlights within clippings are separated by ==========\n        for highlight in f.read().split(\"==========\"):\n            # For each highlight, we split it into the lines\n            lines = highlight.split(\"\\n\")[1:]\n            # Don't try to write if there is no body text\n            # A valid clipping should have at least 4 lines:\n            # title, metadata, blank line and body text\n            if len(lines) < 4 or lines[3].strip() == \"\":\n                continue\n            # Set title and trim the hex character\n            title = lines[0]\n            if title[0] == \"\\ufeff\":\n                title = title[1:]\n\n            # Remove characters and create path\n            outfile_name = remove_chars(title, end_directory) + \".txt\"\n            path = os.path.join(end_directory, outfile_name)\n\n            # If we haven't seen title yet, set mode to write. 
Else, set to append.\n            if outfile_name not in output_files and outfile_name not in os.listdir(end_directory):\n                mode = \"w\"\n                output_files.add(outfile_name)\n                current_text = \"\"\n            else:\n                # If the title exists, read it as text so that we won't append duplicates\n                mode = \"a\"\n                with io.open(path, \"r\", encoding=encoding, errors=\"ignore\") as textfile:\n                    current_text = textfile.read()\n\n            clipping_text = lines[3]\n            clip_meta = lines[1]\n\n            with io.open(path, mode, encoding=encoding, errors=\"ignore\") as outfile:\n                # Write out the the clippings text if it's not already there\n                if clipping_text not in current_text:\n                    outfile.write(clipping_text + \"\\n\")\n                    if include_clip_meta:\n                        outfile.write(clip_meta + \"\\n\")\n                    outfile.write(\"\\n...\\n\\n\")\n\n    # create additional files based on requested format\n    if format in [\"pdf\", \"docx\"]:\n        formatted_out_files = create_file_by_type(end_directory, format, include_clip_meta)\n        output_files.update(formatted_out_files)\n    elif format != \"txt\":\n        print(\"Invalid format specified. 
Defaulting to txt.\")\n        format = \"txt\"\n\n    print(\"\\nExported titles:\\n\")\n    for i in output_files:\n        print(i)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        description=\"Extract kindle clippings into a folder with nice text files\"\n    )\n    parser.add_argument(\"-source\", type=str, default=\"/Volumes/Kindle\")\n    parser.add_argument(\"-destination\", type=str, default=\"./\")\n    parser.add_argument(\"-encoding\", type=str, default=\"utf8\")\n    parser.add_argument(\"-format\", type=str, default=\"txt\")\n    parser.add_argument(\"-include_clip_meta\", type=bool, default=False)\n    args = parser.parse_args()\n\n    if args.source.endswith(\".txt\"):\n        source_file = args.source\n    else:\n        source_file = os.path.join(args.source, \"My Clippings.txt\")\n\n    destination = os.path.join(args.destination, \"KindleClippings\")\n\n    parse_clippings(source_file, destination, args.encoding, args.format, args.include_clip_meta)\n"
  },
  {
    "path": "LICENSE.txt",
    "content": "MIT License\n\nCopyright (c) 2017 Robert Andrew Martin\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "<p align=\"center\">\n    <img width=60% src=\"https://github.com/robertmartin8/KindleClippings/blob/master/media/logo.png\">\n</p>\n\n<!-- buttons -->\n<p align=\"center\">\n    <a href=\"https://www.python.org/\">\n        <img src=\"https://img.shields.io/badge/python-v2,3-blue.svg\"\n            alt=\"python\"></a> &nbsp;\n    <!-- <a href=\"https://pypi.org/project/PyPortfolioOpt/\">\n        <img src=\"https://img.shields.io/badge/pypi-v0.1.0rc1-brightgreen.svg?style=flat-square\"\n            alt=\"python\"></a> &nbsp; -->\n    <a href=\"https://opensource.org/licenses/MIT\">\n        <img src=\"https://img.shields.io/badge/license-MIT-blue.svg\"\n            alt=\"MIT license\"></a> &nbsp;\n    <a href=\"https://github.com/robertmartin8/PyPortfolioOpt/graphs/commit-activity\">\n        <img src=\"https://img.shields.io/badge/Maintained%3F-yes-blue.svg\"\n            alt=\"maintained\"></a> &nbsp;\n</p>\n\nOne of the many great things about kindles is that you can highlight parts of your book to go back to later. However, it is perhaps surprising that there is no good way of aggregating the highlights (even per book).\n\nKindleClippings is a utility born out of personal need, which fetches any highlights that you have made on your kindle, and organises them into plain text files per book. It is run from the command line using:\n\nYou can use the `-format` option to generate additional files in that format.\nOnly `pdf` and `docx` are supported. 
Any other value will fall back to plain\ntext output.\n\n```bash\npython KindleClippings.py\n```\n\nThe result is a new folder with individual text files per book:\n\n<p align=\"center\">\n    <img width=60% src=\"https://github.com/robertmartin8/KindleClippings/blob/master/media/screenshot.png\">\n</p>\n\nIn my workflow, I then copy these into Evernote, but the whole point is that you are now free to do whatever you want.\n\n## Background\n\nWhen you make highlights or add bookmarks on your kindle, they are stored in a text file on the kindle called `My Clippings.txt`. This has a regular format, which means that it can be parsed:\n\n```txt\n==========\nThe Selfish Gene: 30th Anniversary Edition (Richard Dawkins)\n- Your Highlight on page 92 | location 1406-1407 | Added on Saturday, 26 March 2016 14:59:39\n\nPerhaps consciousness arises when the brain's simulation of the world becomes so complete that it must include a model of itself.(4)\n\n==========\nFahrenheit 451 (Ray Bradbury)\n- Your Bookmark at location 346 | Added on Saturday, 26 March 2016 15:46:21\n\n\n==========\nFahrenheit 451 (Ray Bradbury)\n- Your Highlight at location 784-785 | Added on Saturday, 26 March 2016 18:37:26\n\nWho knows who might be the target of the well-read man?\n==========\n```\n\n## Prerequisites\n\nThe only requirement for this project is to have python (either python 2 or python 3) installed on your system. For users on macOS, you don't have to worry about this because it is already installed. On Windows, python can be installed following the instructions [here](http://docs.python-guide.org/en/latest/starting/install3/win/).\n\nPDF and DOCX conversion requires additional libraries, which can be installed with the following pip command:\n```bash\npip install -r requirements.txt\n```\n\n\n## Basic usage\n\nIt is recommended that you download the `KindleClippings.py` and place it either in your home directory or the desktop. Connect your kindle, and make sure it exists in your filesystem. 
Then, open up your terminal/shell.\n\nIf you're on a mac, you *might* just be able to run\n\n```bash\npython KindleClippings.py\n```\n\nHowever, most users will need to specify the path to the kindle and optionally the path to the destination. By default, the script will create a folder called `KindleClippings` in the current directory, and place the resulting text files there (though this likely only works on mac). For example:\n\n```bash\npython KindleClippings.py -source /Volumes/Kindle/\n```\n\nOn Windows, this might look something like:\n\n```bash\npython KindleClippings.py -source C:\\Kindle -destination \\\n```\n\nIf the parsing is successful, the script will print all of the exported titles:\n\n```txt\nExported titles:\n\nTo Kill a Mockingbird - Harper Lee.txt\nA Clockwork Orange - 50th Anniversary Edition - Anthony Burgess.txt\nThe Road - Cormac McCarthy.txt\nFahrenheit 451 - Ray Bradbury.txt\nHeart of Darkness - Joseph Conrad.txt\nThe Meaning of It All - Richard P Feynman.txt\nThe Selfish Gene - 30th Anniversary Edition - Richard Dawkins.txt\n```\n\nUse the `-format` flag to also create a PDF or DOCX\nversion of each exported text file:\n\n```bash\npython KindleClippings.py -source C:\\Kindle -format pdf\n```\nIf any other value is passed to `-format`, the script simply exports text files.\n\n\n## About\n\nI originally forked [`firewood`](https://github.com/sebpearce/firewood), but I realised that my fork was fundamentally different to firewood – to the extent that it has become a different solution.\n\nIf you play around with `firewood` enough, you'll find that sometimes it can just completely break.\nThis is because firewood relies on the regular order of the `My Clippings.txt` file from the kindle. For the most part, this is a fair assumption. 
However, I have found that very occasionally, kindle will insert an extra blank line that will prevent the whole program from functioning.\n\nMy solution does require regularity, but it is a lot more robust to irregularity. We first split the text file into individual highlights, then proceed from there.\n\nSometimes when you make a highlight on kindle, then delete it, it still gets stored into clippings. So if you make a wrong highlight and redo it, you'll end up with multiple very similar highlights. I haven't yet decided whether this is worth fixing, but in my workflow it's not very important.\n"
  },
  {
    "path": "requirements.txt",
    "content": "FPDF\npython-docx\n"
  },
  {
    "path": "tests/My Clippings.txt",
    "content": "==========\nExample Title: The Beginning (John Doe)\n- Your Highlight on page 12 | location 123-124 | Added on Monday, 1 January 2020 12:00:00\n\nThis is a highlight text.\n==========\nAnother Book? & Something & Else (Jane Smith)\n- Your Highlight at location 200 | Added on Tuesday, 2 February 2021 13:00:00\n\nInteresting highlight & note?\n======="
  },
  {
    "path": "tests/conftest.py",
    "content": "import sys\nimport types\nimport pytest\n\n@pytest.fixture(autouse=True)\ndef stub_dependencies(monkeypatch):\n    fpdf_mod = types.ModuleType('fpdf')\n    class DummyFPDF:\n        def add_page(self):\n            pass\n        def add_font(self, *args, **kwargs):\n            pass\n        def set_font(self, *args, **kwargs):\n            pass\n        def set_margins(self, *args, **kwargs):\n            pass\n        def multi_cell(self, *args, **kwargs):\n            pass\n        def set_draw_color(self, *args, **kwargs):\n            pass\n        def line(self, *args, **kwargs):\n            pass\n        def output(self, *args, **kwargs):\n            pass\n        y = 0\n    fpdf_mod.FPDF = DummyFPDF\n    monkeypatch.setitem(sys.modules, 'fpdf', fpdf_mod)\n\n    docx_mod = types.ModuleType('docx')\n    class DummyDocxDocument:\n        def add_heading(self, *args, **kwargs):\n            pass\n        def add_paragraph(self, *args, **kwargs):\n            pass\n        def save(self, *args, **kwargs):\n            pass\n    docx_mod.Document = DummyDocxDocument\n    monkeypatch.setitem(sys.modules, 'docx', docx_mod)\n\n    import KindleClippings\n    monkeypatch.setattr(KindleClippings, 'args', types.SimpleNamespace(format='txt'), raising=False)\n"
  },
  {
    "path": "tests/test_parse_clippings.py",
    "content": "import os\nfrom pathlib import Path\nimport types\n\nimport KindleClippings\nfrom KindleClippings import parse_clippings, remove_chars\n\n\ndef test_parse_clippings_creates_files(tmp_path):\n    clippings = Path(__file__).with_name(\"My Clippings.txt\")\n    out_dir = tmp_path / \"out\"\n    parse_clippings(str(clippings), str(out_dir), format=\"txt\")\n\n    expected_files = [\n        remove_chars(\"Example Title: The Beginning (John Doe)\") + \".txt\",\n        remove_chars(\"Another Book? & Something & Else (Jane Smith)\") + \".txt\",\n    ]\n    assert sorted(os.listdir(out_dir)) == sorted(expected_files)\n\n    contents1 = (out_dir / expected_files[0]).read_text(encoding=\"utf8\")\n    assert \"This is a highlight text.\" in contents1\n\n    contents2 = (out_dir / expected_files[1]).read_text(encoding=\"utf8\")\n    assert \"Interesting highlight & note?\" in contents2\n"
  },
  {
    "path": "tests/test_remove_chars.py",
    "content": "from KindleClippings import remove_chars\n\n\ndef test_colon_replacement():\n    assert remove_chars('Title: Subtitle') == 'Title - Subtitle'\n    assert remove_chars('A:B') == 'A - B'\n    assert remove_chars('Multi:part:colon') == 'Multi - part - colon'\n    assert remove_chars('Title:Subtitle : Another') == 'Title - Subtitle - Another'\n\n\ndef test_remove_question_and_ampersand():\n    assert remove_chars('Where & When?') == 'Where and When'\n    assert remove_chars('Q? & A?') == 'Q and A'\n    assert remove_chars('This & That & Those') == 'This and That and Those'\n    assert remove_chars('??What?') == 'What'\n    assert remove_chars(' weird??? &?? ') == 'weird and'\n\n\ndef test_trim_invalid_characters():\n    assert remove_chars('???Title???') == 'Title'\n    assert remove_chars('!@#My Book!!!') == 'My Book'\n    assert remove_chars('Book (Edition)') == 'Book - Edition'\n\ndef test_length_limit():\n    long_title = 'A' * 300\n    assert len(remove_chars(long_title)) == 245\n"
  }
]