Repository: robertmartin8/KindleClippings
Branch: master
Commit: 2bf50b836b25
Files: 9
Total size: 19.3 KB

Directory structure:
gitextract_gqxv6t7_/

├── AGENTS.md
├── KindleClippings.py
├── LICENSE.txt
├── README.md
├── requirements.txt
└── tests/
    ├── My Clippings.txt
    ├── conftest.py
    ├── test_parse_clippings.py
    └── test_remove_chars.py

================================================
FILE CONTENTS
================================================

================================================
FILE: AGENTS.md
================================================
# Agent Instructions

This repository contains a single Python script for reading Kindle clippings. When modifying or adding code, keep the following points in mind:

- Follow PEP8 style conventions for all Python code.
- Maintain compatibility with both Python 2 and Python 3.
- Before committing, run `python -m py_compile $(git ls-files '*.py')` to ensure there are no syntax errors.
- Keep documentation in Markdown format.
- Write concise commit messages summarizing your changes.


================================================
FILE: KindleClippings.py
================================================
from __future__ import print_function
import re
import io
import os
import argparse
from fpdf import FPDF
import docx

def remove_chars(s, end_directory=""):
    """
    This is a utility function that removes special characters from the string, so that it can
    become a valid filename.

    :param s: input string (typically a book title)
    :param end_directory: directory the file will be written to; its length is subtracted
        from the 245-character budget so that the combined path stays short
    :return: the input string, stripped of special characters, truncated to the length
        budget, and with no non-word characters at either end
    """
    # Replace colons with a hyphen so "A: B" becomes "A - B"
    s = re.sub(" *: *", " - ", s)
    # Remove question marks and spell out ampersands
    s = s.replace("?", "").replace("&", "and")
    # Replace ( ) with a hyphen so "this (text)" becomes "this - text"
    s = re.sub(r"\((.+?)\)", r"- \1", s)
    # Delete filename chars that are not alphanumeric, whitespace or ; , _ -
    s = re.sub(r"[^a-zA-Z\d\s\w;,_-]+", "", s)
    # Trim off anything that isn't a word character at the start & end
    s = re.sub(r"^\W+|\W+$", "", s)

    # Filenames are limited to 255 characters; leave room for the extension.
    max_length = 245 - len(end_directory)
    s = s[:max_length]
    # Truncation can cut the title right after a space or hyphen, so trim the
    # end again to avoid filenames with trailing whitespace/punctuation.
    return re.sub(r"\W+$", "", s)


def insert_line_break_in_pdf(pdf_file: FPDF, num_breaks: int = 1) -> FPDF:
    """
    Inserts num_breaks line breaks in a pdf by writing empty cells.

    :param pdf_file: the document to append the empty cells to
    :param num_breaks: how many empty cells to write; zero or a negative
        number writes nothing
    :return: the same pdf_file object, so calls can be chained
    """
    # A counted loop (rather than counting down until exactly zero) makes a
    # negative count a no-op instead of an endless loop.
    for _ in range(num_breaks):
        pdf_file.multi_cell(0, 5, "", 0)

    return pdf_file

def insert_bar_separator_in_pdf(pdf_file: FPDF):
    """
    Draws a light grey horizontal bar, with one line break before and one
    after it. Used to visually separate highlights.

    :param pdf_file: the document to draw on
    :return: the document returned by the trailing line-break call
    """
    document = insert_line_break_in_pdf(pdf_file)

    # The bar is drawn at the current vertical position of the document.
    document.set_draw_color(191, 191, 191)
    document.line(40, document.y, 150, document.y)

    return insert_line_break_in_pdf(document)


def prepare_pdf_document(highlights: list, include_clip_meta: bool = False, title: str = "Your Notes And Highlights") -> FPDF:
    """
    Will create pdf document from the notes

    :param highlights: the lines of an exported text file, one string per line
    :param include_clip_meta: whether the text file was exported with clipping
        metadata lines; when it was, the separator bar is drawn after each
        metadata line and the "..." marker lines are skipped
    :param title: heading printed centred at the top of the first page
    :return: FPDF
    """
    # Look for the bundled font next to this script so the export does not
    # depend on the current working directory; fall back to the cwd-relative
    # path if it is not there.
    font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "media", "Lisboa.ttf")
    if not os.path.isfile(font_path):
        font_path = 'media/Lisboa.ttf'

    pdf_file = FPDF()
    pdf_file.add_page()
    pdf_file.add_font("lisboa", '', font_path, uni=True)
    pdf_file.set_font("lisboa", '', 22)
    pdf_file.set_margins(25, 40, 25)
    pdf_file = insert_line_break_in_pdf(pdf_file, 3)
    pdf_file.multi_cell(0, 5, title, align="C")
    pdf_file = insert_line_break_in_pdf(pdf_file, 2)

    meta_regex = re.compile(r"(Your.*\| Added on)")
    for highlight_line in highlights:
        stripped_line = highlight_line.strip()
        if meta_regex.search(highlight_line):
            # Clipping metadata: smaller grey text, followed by the separator
            pdf_file.set_font("lisboa", '', 11)
            pdf_file.set_text_color(77, 77, 77)
            pdf_file.multi_cell(0, 5, highlight_line, 0)
            pdf_file = insert_bar_separator_in_pdf(pdf_file)
        elif stripped_line == "...":
            # Marker line between highlights. With metadata included the bar
            # has already been drawn after the metadata line.
            if not include_clip_meta:
                pdf_file = insert_bar_separator_in_pdf(pdf_file)
        elif stripped_line:
            # Highlight text. Only blank lines are skipped, so short
            # highlights are kept too.
            pdf_file.set_font("lisboa", '', 15)
            pdf_file.set_text_color(0, 0, 0)
            pdf_file.multi_cell(0, 5, highlight_line, 0)

    return pdf_file


def convert_to_format(path, file_name, format, include_clip_meta=False):
    """
    Will get text file and will convert to specified output

    :param path: directory containing the text file; the converted file is
        written to the same directory
    :param file_name: name of the text file inside path
    :param format: "pdf" or "docx"; for any other value nothing is written
    :param include_clip_meta: passed through to the pdf renderer
    :return: name of the output file (the text file's name with its extension
        replaced by format)
    """
    if file_name.endswith(".txt"):
        base_name = file_name[:-len(".txt")]
    else:
        # Do not chop four arbitrary characters off names without ".txt"
        base_name = os.path.splitext(file_name)[0]
    output_file_name = base_name + "." + format

    # The source is only read, so open it read-only and close it before
    # the conversion starts.
    txt_path = os.path.join(path, file_name)
    with io.open(txt_path, "r", encoding="utf8") as txt_file:
        paragraphs = txt_file.read().split("\n")

    output_path = os.path.join(path, output_file_name)
    if format == "pdf":
        pdf_file = prepare_pdf_document(paragraphs, include_clip_meta, base_name)
        pdf_file.output(output_path)
    elif format == "docx":
        docx_file = docx.Document()
        docx_file.add_heading(base_name, 0)
        # One docx paragraph per line of the text file
        for para in paragraphs:
            docx_file.add_paragraph(para)
        docx_file.save(output_path)

    return output_file_name


def create_file_by_type(end_directory, format, include_clip_meta=False):
    """
    Will iterate over all text files and will convert and create file with specified format
    Currently Only pdf and docx are supported

    :param end_directory: directory holding the exported ".txt" files
    :param format: target format, passed to convert_to_format
    :param include_clip_meta: passed to convert_to_format
    :return: list of output filenames
    """
    # Only regular files whose name ends in ".txt" are converted; a name that
    # merely ends in the letters "txt" is not a text export.
    return [
        convert_to_format(end_directory, name, format, include_clip_meta)
        for name in os.listdir(end_directory)
        if name.endswith(".txt")
        and os.path.isfile(os.path.join(end_directory, name))
    ]


def parse_clippings(source_file, end_directory, encoding="utf-8", format="txt", include_clip_meta=False):
    """
    Each clipping always consists of 5 lines:
    - title line
    - clipping info/metadata
    - a blank line
    - clipping text
    - a divider made up of equals signs
    Thus we can parse the clippings, and organise them by book.

    :param source_file: path to the clippings text file
    :type source_file: str
    :param end_directory: the output directory where all of organised highlights will go
    :type end_directory: str
    :param encoding: encoding used to read the clippings and to read/write the
                    per-book text files
    :param format: output file format. Only "pdf" or "docx" create additional files.
                    Any other value is treated as "txt".
    :param include_clip_meta: if True, the clipping metadata line is written
                    after each highlight
    :return: None. Writes one text file per book and prints the exported names.
    :raises IOError: if source_file does not exist
    """

    # Check that the source file (on the kindle) exists
    if not os.path.isfile(source_file):
        raise IOError("ERROR: cannot find " + source_file)

    # Create the output directory if it doesn't exist
    if not os.path.exists(end_directory):
        os.makedirs(end_directory, exist_ok=True)

    # Names of the text files created during this run
    output_files = set()

    # Individual highlights within clippings are separated by ==========
    with io.open(source_file, "r", encoding=encoding, errors="ignore") as f:
        highlights = f.read().split("==========")

    for highlight in highlights:
        # A clipping that follows a divider starts with the newline that ended
        # the divider line, whereas a clipping at the very top of the file
        # starts directly with its title. Dropping leading newlines handles
        # both (and stray extra blank lines) without losing the title.
        lines = highlight.lstrip("\n").split("\n")
        # Don't try to write if there is no body text
        # A valid clipping should have at least 4 lines:
        # title, metadata, blank line and body text
        if len(lines) < 4 or lines[3].strip() == "":
            continue

        # Trim the byte-order mark; lstrip is safe on an empty title
        title = lines[0].lstrip("\ufeff")
        clip_meta = lines[1]
        clipping_text = lines[3]

        # Remove characters and create path
        outfile_name = remove_chars(title, end_directory) + ".txt"
        path = os.path.join(end_directory, outfile_name)

        if os.path.exists(path):
            # The book already has a file (from this run or an earlier one):
            # read it so that we won't append duplicates
            mode = "a"
            with io.open(path, "r", encoding=encoding, errors="ignore") as textfile:
                current_text = textfile.read()
        else:
            mode = "w"
            output_files.add(outfile_name)
            current_text = ""

        # Write out the clipping text if it's not already there
        if clipping_text in current_text:
            continue
        with io.open(path, mode, encoding=encoding, errors="ignore") as outfile:
            outfile.write(clipping_text + "\n")
            if include_clip_meta:
                outfile.write(clip_meta + "\n")
            outfile.write("\n...\n\n")

    # create additional files based on requested format
    if format in ["pdf", "docx"]:
        formatted_out_files = create_file_by_type(end_directory, format, include_clip_meta)
        output_files.update(formatted_out_files)
    elif format != "txt":
        print("Invalid format specified. Defaulting to txt.")

    print("\nExported titles:\n")
    for name in sorted(output_files):
        print(name)


def _str_to_bool(value):
    """
    Convert a command-line string such as "True", "false", "1" or "no" to a bool.

    argparse's ``type=bool`` would call ``bool()`` on the raw string, which is
    True for every non-empty string, including "False".
    """
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


def build_arg_parser():
    """Create the command-line parser for the script."""
    parser = argparse.ArgumentParser(
        description="Extract kindle clippings into a folder with nice text files"
    )
    parser.add_argument("-source", type=str, default="/Volumes/Kindle")
    parser.add_argument("-destination", type=str, default="./")
    parser.add_argument("-encoding", type=str, default="utf8")
    parser.add_argument("-format", type=str, default="txt")
    # Accepts "-include_clip_meta True/False" as well as the bare flag
    parser.add_argument(
        "-include_clip_meta", type=_str_to_bool, nargs="?", const=True, default=False
    )
    return parser


def main(argv=None):
    """
    Parse the command line and export the clippings.

    :param argv: argument list to parse; None means sys.argv[1:]
    """
    args = build_arg_parser().parse_args(argv)

    # -source may be either the clippings file itself or the folder holding it
    if args.source.endswith(".txt"):
        source_file = args.source
    else:
        source_file = os.path.join(args.source, "My Clippings.txt")

    destination = os.path.join(args.destination, "KindleClippings")

    parse_clippings(source_file, destination, args.encoding, args.format, args.include_clip_meta)


if __name__ == "__main__":
    main()


================================================
FILE: LICENSE.txt
================================================
MIT License

Copyright (c) 2017 Robert Andrew Martin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
<p align="center">
    <img width=60% src="https://github.com/robertmartin8/KindleClippings/blob/master/media/logo.png">
</p>

<!-- buttons -->
<p align="center">
    <a href="https://www.python.org/">
        <img src="https://img.shields.io/badge/python-v2,3-blue.svg"
            alt="python"></a> &nbsp;
    <!-- <a href="https://pypi.org/project/PyPortfolioOpt/">
        <img src="https://img.shields.io/badge/pypi-v0.1.0rc1-brightgreen.svg?style=flat-square"
            alt="python"></a> &nbsp; -->
    <a href="https://opensource.org/licenses/MIT">
        <img src="https://img.shields.io/badge/license-MIT-blue.svg"
            alt="MIT license"></a> &nbsp;
    <a href="https://github.com/robertmartin8/PyPortfolioOpt/graphs/commit-activity">
        <img src="https://img.shields.io/badge/Maintained%3F-yes-blue.svg"
            alt="maintained"></a> &nbsp;
</p>

One of the many great things about kindles is that you can highlight parts of your book to go back to later. However, it is perhaps surprising that there is no good way of aggregating the highlights (even per book).

KindleClippings is a utility born out of personal need, which fetches any highlights that you have made on your kindle, and organises them into plain text files per book. It is run from the command line, as shown below.

You can use the `-format` option to generate additional files in that format.
Only `pdf` and `docx` are supported. Any other value will fall back to plain
text output.

```bash
python KindleClippings.py
```

The result is a new folder with individual text files per book:

<p align="center">
    <img width=60% src="https://github.com/robertmartin8/KindleClippings/blob/master/media/screenshot.png">
</p>

In my workflow, I then copy these into Evernote, but the whole point is that you are now free to do whatever you want.

## Background

When you make highlights or add bookmarks on your kindle, they are stored to a text file on the kindle called `My Clippings.txt`. This has a regular format, which means that it can be parsed:

```txt
==========
The Selfish Gene: 30th Anniversary Edition (Richard Dawkins)
- Your Highlight on page 92 | location 1406-1407 | Added on Saturday, 26 March 2016 14:59:39

Perhaps consciousness arises when the brain's simulation of the world becomes so complete that it must include a model of itself.(4)

==========
Fahrenheit 451 (Ray Bradbury)
- Your Bookmark at location 346 | Added on Saturday, 26 March 2016 15:46:21


==========
Fahrenheit 451 (Ray Bradbury)
- Your Highlight at location 784-785 | Added on Saturday, 26 March 2016 18:37:26

Who knows who might be the target of the well-read man?
==========
```

## Prerequisites

The only requirement for this project is to have python (either python 2 or python 3) installed on your system. For users on macOS, you don't have to worry about this because it is already installed. On Windows, python can be installed following the instructions [here](http://docs.python-guide.org/en/latest/starting/install3/win/).

PDF and DOCX conversion requires additional libraries. They can be installed with the pip command below:
```bash
pip install -r requirements.txt
```


## Basic usage

It is recommended that you download the `KindleClippings.py` and place it either in your home directory or the desktop. Connect your kindle, and make sure it exists in your filesystem. Then, open up your terminal/shell.

If you're on a mac, you *might* just be able to run

```bash
python KindleClippings.py
```

However, most users will need to specify the path to the kindle and optionally the path to the destination. By default, the script will create a folder called `KindleClippings` in the current directory, and place the resulting text files there (though this likely only works on mac). For example

```bash
python KindleClippings.py -source /Volumes/Kindle/
```

On windows, this might look something like:

```bash
python KindleClippings.py -source C:\Kindle -destination \
```

If the parsing is successful, the script will print all of the exported titles

```txt
Exported titles:

To Kill a Mockingbird - Harper Lee.txt
A Clockwork Orange - 50th Anniversary Edition - Anthony Burgess.txt
The Road - Cormac McCarthy.txt
Fahrenheit 451 - Ray Bradbury.txt
Heart of Darkness - Joseph Conrad.txt
The Meaning of It All - Richard P Feynman.txt
The Selfish Gene - 30th Anniversary Edition - Richard Dawkins.txt
```

Use the `-format` flag to also create a PDF or DOCX version of each exported
text file:

```bash
python KindleClippings.py -source C:\Kindle -format pdf
```
If any other value is passed to `-format`, the script simply exports text files.


## About

I originally forked [`firewood`](https://github.com/sebpearce/firewood), but I realised that my fork was fundamentally different to firewood – to the extent that it has become a different solution.

If you play around with `firewood` enough, you'll find that sometimes it can just completely break.
This is because firewood relies on the regular order of the `My Clippings.txt` file from the kindle. For the most part, this is a fair assumption. However, I have found that very occasionally, kindle will insert an extra blank line that will prevent the whole program from functioning.

My solution does require regularity, but it is a lot more robust to irregularity. We first split the text file into individual highlights, then proceed from there.

Sometimes when you make a highlight on kindle, then delete it, it still gets stored into clippings. So if you make a wrong highlight and redo it, you'll end up with multiple very similar highlights. I haven't yet decided whether this is worth fixing, but in my workflow it's not very important.


================================================
FILE: requirements.txt
================================================
FPDF
python-docx


================================================
FILE: tests/My Clippings.txt
================================================
==========
Example Title: The Beginning (John Doe)
- Your Highlight on page 12 | location 123-124 | Added on Monday, 1 January 2020 12:00:00

This is a highlight text.
==========
Another Book? & Something & Else (Jane Smith)
- Your Highlight at location 200 | Added on Tuesday, 2 February 2021 13:00:00

Interesting highlight & note?
=======

================================================
FILE: tests/conftest.py
================================================
import sys
import types
import pytest

@pytest.fixture(autouse=True)
def stub_dependencies(monkeypatch):
    """
    Replace the fpdf and docx dependencies with do-nothing stubs for every test.

    The stubs are registered in sys.modules and also bound directly onto the
    KindleClippings module, because the test modules import KindleClippings
    at module level and it may therefore already hold references to the
    real libraries by the time this fixture runs.
    """
    fpdf_mod = types.ModuleType('fpdf')
    class DummyFPDF:
        # current vertical position, read by the separator-drawing code
        y = 0
        def add_page(self):
            pass
        def add_font(self, *args, **kwargs):
            pass
        def set_font(self, *args, **kwargs):
            pass
        def set_margins(self, *args, **kwargs):
            pass
        def set_text_color(self, *args, **kwargs):
            pass
        def multi_cell(self, *args, **kwargs):
            pass
        def set_draw_color(self, *args, **kwargs):
            pass
        def line(self, *args, **kwargs):
            pass
        def output(self, *args, **kwargs):
            pass
    fpdf_mod.FPDF = DummyFPDF
    monkeypatch.setitem(sys.modules, 'fpdf', fpdf_mod)

    docx_mod = types.ModuleType('docx')
    class DummyDocxDocument:
        def add_heading(self, *args, **kwargs):
            pass
        def add_paragraph(self, *args, **kwargs):
            pass
        def save(self, *args, **kwargs):
            pass
    docx_mod.Document = DummyDocxDocument
    monkeypatch.setitem(sys.modules, 'docx', docx_mod)

    import KindleClippings
    # Rebind the names the module looks up at call time so the stubs are
    # used even if the module was imported before the fixture ran.
    monkeypatch.setattr(KindleClippings, 'FPDF', DummyFPDF, raising=False)
    monkeypatch.setattr(KindleClippings, 'docx', docx_mod, raising=False)
    monkeypatch.setattr(KindleClippings, 'args', types.SimpleNamespace(format='txt'), raising=False)


================================================
FILE: tests/test_parse_clippings.py
================================================
import os
from pathlib import Path
import types

import KindleClippings
from KindleClippings import parse_clippings, remove_chars


def test_parse_clippings_creates_files(tmp_path):
    """Parsing the sample clippings writes one text file per book."""
    source = Path(__file__).with_name("My Clippings.txt")
    destination = tmp_path / "out"
    parse_clippings(str(source), str(destination), format="txt")

    # (expected file name, text that must appear in that file)
    expectations = [
        (
            remove_chars("Example Title: The Beginning (John Doe)") + ".txt",
            "This is a highlight text.",
        ),
        (
            remove_chars("Another Book? & Something & Else (Jane Smith)") + ".txt",
            "Interesting highlight & note?",
        ),
    ]

    expected_names = sorted(name for name, _ in expectations)
    assert sorted(os.listdir(destination)) == expected_names

    for name, snippet in expectations:
        written = (destination / name).read_text(encoding="utf8")
        assert snippet in written


================================================
FILE: tests/test_remove_chars.py
================================================
from KindleClippings import remove_chars


def test_colon_replacement():
    """Colons, with or without surrounding spaces, become ' - '."""
    cases = [
        ('Title: Subtitle', 'Title - Subtitle'),
        ('A:B', 'A - B'),
        ('Multi:part:colon', 'Multi - part - colon'),
        ('Title:Subtitle : Another', 'Title - Subtitle - Another'),
    ]
    for raw, expected in cases:
        assert remove_chars(raw) == expected


def test_remove_question_and_ampersand():
    """Question marks are removed and ampersands are spelled out as 'and'."""
    cases = [
        ('Where & When?', 'Where and When'),
        ('Q? & A?', 'Q and A'),
        ('This & That & Those', 'This and That and Those'),
        ('??What?', 'What'),
        (' weird??? &?? ', 'weird and'),
    ]
    for raw, expected in cases:
        assert remove_chars(raw) == expected


def test_trim_invalid_characters():
    """Invalid characters are dropped and parentheses become a hyphen."""
    cases = [
        ('???Title???', 'Title'),
        ('!@#My Book!!!', 'My Book'),
        ('Book (Edition)', 'Book - Edition'),
    ]
    for raw, expected in cases:
        assert remove_chars(raw) == expected

def test_length_limit():
    """A title longer than the limit is cut down to 245 characters."""
    oversized_title = 'A' * 300
    cleaned = remove_chars(oversized_title)
    assert len(cleaned) == 245