Repository: robertmartin8/KindleClippings Branch: master Commit: 2bf50b836b25 Files: 9 Total size: 19.3 KB Directory structure: gitextract_gqxv6t7_/ ├── AGENTS.md ├── KindleClippings.py ├── LICENSE.txt ├── README.md ├── requirements.txt └── tests/ ├── My Clippings.txt ├── conftest.py ├── test_parse_clippings.py └── test_remove_chars.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: AGENTS.md ================================================ # Agent Instructions This repository contains a single Python script for reading Kindle clippings. When modifying or adding code, keep the following points in mind: - Follow PEP8 style conventions for all Python code. - Maintain compatibility with both Python 2 and Python 3. - Before committing, run `python -m py_compile $(git ls-files '*.py')` to ensure there are no syntax errors. - Keep documentation in Markdown format. - Write concise commit messages summarizing your changes. ================================================ FILE: KindleClippings.py ================================================ from __future__ import print_function import re import io import os import argparse from fpdf import FPDF import docx def remove_chars(s, end_directory=""): """ This is a utility function that removes special characters from the string, so that it can become a valid filename. 
:param s: input string :return: the input string, stripped of special characters """ # Replace colons with a hyphen so "A: B" becomes "A - B" s = re.sub(" *: *", " - ", s) # Remove question marks or ampersands s = s.replace("?", "").replace("&", "and") # Replace ( ) with a hyphen so "this (text)" becomes "this - text" s = re.sub(r"\((.+?)\)", r"- \1", s) # Delete filename chars tht are not alphanumeric or ; , _ - s = re.sub(r"[^a-zA-Z\d\s\w;,_-]+", "", s) # Trim off anything that isn't a word at the start & end s = re.sub(r"^\W+|\W+$", "", s) max_length = 245 - len(end_directory) # max file size limited to 255. s = s[:max_length] return s def insert_line_break_in_pdf(pdf_file: FPDF, num_breaks: int = 1) -> FPDF: """ Inserts a line break in a pdf for num_breaks times """ while num_breaks != 0: pdf_file.multi_cell(0, 5, "", 0) num_breaks -= 1 return pdf_file def insert_bar_separator_in_pdf(pdf_file: FPDF): """ Inserts a bar separator in a pdf, useful to separate highlights """ pdf_file = insert_line_break_in_pdf(pdf_file) pdf_file.set_draw_color(191, 191, 191) pdf_file.line(40, pdf_file.y, 150, pdf_file.y) pdf_file = insert_line_break_in_pdf(pdf_file) return pdf_file def prepare_pdf_document(highlights: str, include_clip_meta = False, title: str = "Your Notes And Highlights") -> FPDF: """ Will create pdf document from the notes :param highlights: :return: FPDF """ pdf_file = FPDF() pdf_file.add_page() pdf_file.add_font("lisboa", '', 'media/Lisboa.ttf', uni=True) pdf_file.set_font("lisboa", '', 22) pdf_file.set_margins(25, 40, 25) pdf_file = insert_line_break_in_pdf(pdf_file, 3) pdf_file.multi_cell(0, 5, title, align="C") pdf_file = insert_line_break_in_pdf(pdf_file, 2) meta_regex_pattern = r"(Your.*\| Added on)" for highlight_line in highlights: # create multi-cell pdf object and add text to it if re.search(meta_regex_pattern, highlight_line): pdf_file.set_font("lisboa", '', 11) pdf_file.set_text_color(77, 77, 77) pdf_file.multi_cell(0, 5, highlight_line, 0) pdf_file 
= insert_bar_separator_in_pdf(pdf_file) elif len(highlight_line) < 10: if not include_clip_meta and highlight_line == "...": pdf_file = insert_bar_separator_in_pdf(pdf_file) else: continue else: pdf_file.set_font("lisboa", '', 15) pdf_file.set_text_color(0, 0, 0) pdf_file.multi_cell(0, 5, highlight_line, 0) return pdf_file def convert_to_format(path, file_name, format, include_clip_meta=False): """ Will get text file and will convert to specified output :param path: :param file_name: :param format: :return: name of the file created """ output_file_name = file_name[0:-4] + "." + format txt_path = os.path.join(path, file_name) with open(txt_path, "r+", encoding="utf8") as txt_file: paragraph = txt_file.read().split("\n") if format == "pdf": pdf_file = prepare_pdf_document(paragraph, include_clip_meta, file_name[:-4]) pdf_path = os.path.join(path, output_file_name) pdf_file.output(pdf_path) elif format == "docx": docx_file = docx.Document() docx_file.add_heading(file_name[0:-4], 0) for para in paragraph: # add a paragraph and store the object in a variable docx_file.add_paragraph(para) docx_path = os.path.join(path, output_file_name) docx_file.save(docx_path) return output_file_name def create_file_by_type(end_directory, format, include_clip_meta=False): """ Will iterate over all text files and will convert and create file with specified format Currently Only pdf and docx are supported :param end_directory: :param format: :return: list of output filenames """ output_files = [] # get files in end_directory files = [ f for f in os.listdir(end_directory) if os.path.isfile(os.path.join(end_directory, f)) ] for file in files: if file[-3:] == "txt": output_files.append( convert_to_format(end_directory, file, format, include_clip_meta) ) return output_files def parse_clippings(source_file, end_directory, encoding="utf-8", format="txt", include_clip_meta=False): """ Each clipping always consists of 5 lines: - title line - clipping info/metadata - a blank line - clipping text 
- a divider made up of equals signs Thus we can parse the clippings, and organise them by book. :param end_directory: the output directory where all of organised highlights will go :type end_directory: str :return: organises kindle highlights by book . :param format: output file format. Only "pdf" or "docx" create additional files. Any other value is treated as "txt". """ # Check that the source file (on the kindle) exists if not os.path.isfile(source_file): raise IOError("ERROR: cannot find " + source_file) # Create the output directory if it doesn't exist if not os.path.exists(end_directory): os.makedirs(end_directory, exist_ok=True) # This will keep track of the titles that we have already processed output_files = set() title = "" # Open clippings textfile and read data in lines with io.open(source_file, "r", encoding=encoding, errors="ignore") as f: # Individual highlights within clippings are separated by ========== for highlight in f.read().split("=========="): # For each highlight, we split it into the lines lines = highlight.split("\n")[1:] # Don't try to write if there is no body text # A valid clipping should have at least 4 lines: # title, metadata, blank line and body text if len(lines) < 4 or lines[3].strip() == "": continue # Set title and trim the hex character title = lines[0] if title[0] == "\ufeff": title = title[1:] # Remove characters and create path outfile_name = remove_chars(title, end_directory) + ".txt" path = os.path.join(end_directory, outfile_name) # If we haven't seen title yet, set mode to write. Else, set to append. 
if outfile_name not in output_files and outfile_name not in os.listdir(end_directory): mode = "w" output_files.add(outfile_name) current_text = "" else: # If the title exists, read it as text so that we won't append duplicates mode = "a" with io.open(path, "r", encoding=encoding, errors="ignore") as textfile: current_text = textfile.read() clipping_text = lines[3] clip_meta = lines[1] with io.open(path, mode, encoding=encoding, errors="ignore") as outfile: # Write out the the clippings text if it's not already there if clipping_text not in current_text: outfile.write(clipping_text + "\n") if include_clip_meta: outfile.write(clip_meta + "\n") outfile.write("\n...\n\n") # create additional files based on requested format if format in ["pdf", "docx"]: formatted_out_files = create_file_by_type(end_directory, format, include_clip_meta) output_files.update(formatted_out_files) elif format != "txt": print("Invalid format specified. Defaulting to txt.") format = "txt" print("\nExported titles:\n") for i in output_files: print(i) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Extract kindle clippings into a folder with nice text files" ) parser.add_argument("-source", type=str, default="/Volumes/Kindle") parser.add_argument("-destination", type=str, default="./") parser.add_argument("-encoding", type=str, default="utf8") parser.add_argument("-format", type=str, default="txt") parser.add_argument("-include_clip_meta", type=bool, default=False) args = parser.parse_args() if args.source.endswith(".txt"): source_file = args.source else: source_file = os.path.join(args.source, "My Clippings.txt") destination = os.path.join(args.destination, "KindleClippings") parse_clippings(source_file, destination, args.encoding, args.format, args.include_clip_meta) ================================================ FILE: LICENSE.txt ================================================ MIT License Copyright (c) 2017 Robert Andrew Martin Permission is hereby granted, free 
of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

python   MIT license   maintained  

One of the many great things about kindles is that you can highlight parts of your book to go back to later. However, it is perhaps surprising that there is no good way of aggregating the highlights (even per book). KindleClippings is a utility born out of personal need, which fetches any highlights that you have made on your kindle, and organises them into plain text files per book. It is run from the command line as shown below. You can use the `-format` option to generate additional files in that format; only `pdf` and `docx` are supported, and any other value will fall back to plain text output. ```bash python KindleClippings.py ``` The result is a new folder with individual text files per book:

In my workflow, I then copy these into Evernote, but the whole point is that you are now free to do whatever you want. ## Background When you make highlights or add bookmarks on your kindle, they are stored in a text file on the kindle called `My Clippings.txt`. This has a regular format, which means that it can be parsed: ```txt ========== The Selfish Gene: 30th Anniversary Edition (Richard Dawkins) - Your Highlight on page 92 | location 1406-1407 | Added on Saturday, 26 March 2016 14:59:39 Perhaps consciousness arises when the brain's simulation of the world becomes so complete that it must include a model of itself.(4) ========== Fahrenheit 451 (Ray Bradbury) - Your Bookmark at location 346 | Added on Saturday, 26 March 2016 15:46:21 ========== Fahrenheit 451 (Ray Bradbury) - Your Highlight at location 784-785 | Added on Saturday, 26 March 2016 18:37:26 Who knows who might be the target of the well-read man? ========== ``` ## Prerequisites The only requirement for this project is to have python (either python 2 or python 3) installed on your system. For users on macOS, you don't have to worry about this because it is already installed. On Windows, python can be installed following the instructions [here](http://docs.python-guide.org/en/latest/starting/install3/win/). PDF and DOCX conversion additionally requires the `FPDF` and `python-docx` libraries listed in `requirements.txt`. They can be installed using the pip command below: ```bash pip install -r requirements.txt ``` ## Basic usage It is recommended that you download `KindleClippings.py` and place it either in your home directory or the desktop. Connect your kindle, and make sure it exists in your filesystem. Then, open up your terminal/shell. If you're on a mac, you *might* just be able to run ```bash python KindleClippings.py ``` However, most users will need to specify the path to the kindle and optionally the path to the destination. 
By default, the script will create a folder called `KindleClippings` in the current directory, and place the resulting text files there (though this likely only works on mac). For example ```bash python KindleClippings.py -source /Volumes/Kindle/ ``` On windows, this might look something like: ```bash python KindleClippings.py -source C:\Kindle -destination \ ``` If the parsing is successful, the script will print all of the exported titles ```txt Exported titles: To Kill a Mockingbird - Harper Lee.txt A Clockwork Orange - 50th Anniversary Edition - Anthony Burgess.txt The Road - Cormac McCarthy.txt Fahrenheit 451 - Ray Bradbury.txt Heart of Darkness - Joseph Conrad.txt The Meaning of It All - Richard P Feynman.txt The Selfish Gene - 30th Anniversary Edition - Richard Dawkins.txt ``` Use the `-format` flag after exporting text files to also create a PDF or DOCX version: ```bash python KindleClippings.py -source C:\Kindle -format pdf ``` If any other value is passed to `-format`, the script simply exports text files. ## About I originally forked [`firewood`](https://github.com/sebpearce/firewood), but I realised that my fork was fundamentally different to firewood – to the extent that it has become a different solution. If you play around with `firewood` enough, you'll find that sometimes it can just completely break. This is because firewood relies on the regular order of the `My Clippings.txt` file from the kindle. For the most part, this is a fair assumption. However, I have found that very occasionally, kindle will insert an extra blank line that will prevent the whole program from functioning. My solution does require regularity, but it is a lot more robust to irregularity. We first split the text file into individual highlights, then proceed from there. Sometimes when you make a highlight on kindle, then delete it, it still gets stored into clippings. So if you make a wrong highlight and redo it, you'll end up with multiple very similar highlights. 
I haven't yet decided whether this is worth fixing, but in my workflow it's not very important. ================================================ FILE: requirements.txt ================================================ FPDF python-docx ================================================ FILE: tests/My Clippings.txt ================================================ ========== Example Title: The Beginning (John Doe) - Your Highlight on page 12 | location 123-124 | Added on Monday, 1 January 2020 12:00:00 This is a highlight text. ========== Another Book? & Something & Else (Jane Smith) - Your Highlight at location 200 | Added on Tuesday, 2 February 2021 13:00:00 Interesting highlight & note? ======= ================================================ FILE: tests/conftest.py ================================================
import sys
import types
import pytest


@pytest.fixture(autouse=True)
def stub_dependencies(monkeypatch):
    """
    Register stand-in ``fpdf`` and ``docx`` modules in ``sys.modules`` for
    every test, then import ``KindleClippings`` and give it an ``args``
    attribute.

    NOTE(review): the test modules import ``KindleClippings`` at module
    level, which presumably happens before this fixture runs - confirm
    whether the real ``fpdf``/``docx`` packages are still needed to collect
    the tests.
    """
    fpdf_mod = types.ModuleType('fpdf')

    class DummyFPDF:
        # No-op versions of the FPDF methods that KindleClippings calls
        def add_page(self):
            pass

        def add_font(self, *args, **kwargs):
            pass

        def set_font(self, *args, **kwargs):
            pass

        def set_margins(self, *args, **kwargs):
            pass

        def multi_cell(self, *args, **kwargs):
            pass

        def set_draw_color(self, *args, **kwargs):
            pass

        def line(self, *args, **kwargs):
            pass

        def output(self, *args, **kwargs):
            pass

        # KindleClippings reads ``pdf_file.y`` when drawing the separator
        y = 0

    fpdf_mod.FPDF = DummyFPDF
    monkeypatch.setitem(sys.modules, 'fpdf', fpdf_mod)

    docx_mod = types.ModuleType('docx')

    class DummyDocxDocument:
        # No-op versions of the docx.Document methods KindleClippings calls
        def add_heading(self, *args, **kwargs):
            pass

        def add_paragraph(self, *args, **kwargs):
            pass

        def save(self, *args, **kwargs):
            pass

    docx_mod.Document = DummyDocxDocument
    monkeypatch.setitem(sys.modules, 'docx', docx_mod)

    # Imported only after the stubs are registered
    import KindleClippings
    # raising=False: the attribute is created even if the module has no
    # ``args`` of its own
    monkeypatch.setattr(KindleClippings, 'args', types.SimpleNamespace(format='txt'), raising=False)

================================================ FILE: tests/test_parse_clippings.py ================================================
import os
from pathlib import Path
import types

import KindleClippings
from KindleClippings import parse_clippings, remove_chars


def test_parse_clippings_creates_files(tmp_path):
    """Parsing the sample clippings yields one text file per book title."""
    # The sample clippings file sits next to this test module
    clippings = Path(__file__).with_name("My Clippings.txt")
    out_dir = tmp_path / "out"

    parse_clippings(str(clippings), str(out_dir), format="txt")

    # File names are derived from the titles in the same way the parser
    # derives them (here without the directory-length adjustment)
    expected_files = [
        remove_chars("Example Title: The Beginning (John Doe)") + ".txt",
        remove_chars("Another Book? & Something & Else (Jane Smith)") + ".txt",
    ]
    assert sorted(os.listdir(out_dir)) == sorted(expected_files)

    contents1 = (out_dir / expected_files[0]).read_text(encoding="utf8")
    assert "This is a highlight text." in contents1

    contents2 = (out_dir / expected_files[1]).read_text(encoding="utf8")
    assert "Interesting highlight & note?" in contents2

================================================ FILE: tests/test_remove_chars.py ================================================
from KindleClippings import remove_chars


def test_colon_replacement():
    """Colons, with or without surrounding spaces, become " - "."""
    assert remove_chars('Title: Subtitle') == 'Title - Subtitle'
    assert remove_chars('A:B') == 'A - B'
    assert remove_chars('Multi:part:colon') == 'Multi - part - colon'
    assert remove_chars('Title:Subtitle : Another') == 'Title - Subtitle - Another'


def test_remove_question_and_ampersand():
    """Question marks are removed and "&" is spelled out as "and"."""
    assert remove_chars('Where & When?') == 'Where and When'
    assert remove_chars('Q? & A?') == 'Q and A'
    assert remove_chars('This & That & Those') == 'This and That and Those'
    assert remove_chars('??What?') == 'What'
    assert remove_chars(' weird??? &?? ') == 'weird and'


def test_trim_invalid_characters():
    """Leading/trailing non-word characters go; parentheses become "- "."""
    assert remove_chars('???Title???') == 'Title'
    assert remove_chars('!@#My Book!!!') == 'My Book'
    assert remove_chars('Book (Edition)') == 'Book - Edition'


def test_length_limit():
    """With no end_directory the result is capped at 245 characters."""
    long_title = 'A' * 300
    assert len(remove_chars(long_title)) == 245