Repository: HAKSOAT/EpubToPdf
Branch: master
Commit: 213a88d1e43d
Files: 7
Total size: 6.8 KB
Directory structure:
gitextract_dgir6t81/
├── .gitignore
├── README.md
├── getpy.py
├── main.py
├── manage.py
├── pdfpy.py
└── requirements.txt
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
__pycache__/
*.epub
*.pdf
venv/*
================================================
FILE: README.md
================================================
# EpubToPdf
This program converts epub documents to pdf documents.
**Installation**
Clone this repository to your machine.
**Requirements**
To install the requirements, run the following command:
```pip install -r requirements.txt```
After this, install _wkhtmltopdf_, which is a dependency of the _pdfkit_ module.
The wkhtmltopdf installer can be downloaded [here](https://wkhtmltopdf.org/downloads.html).
On Windows, make sure the folder containing the wkhtmltopdf executable is added to the PATH environment variable.
For Debian/Ubuntu users:
```$ sudo apt-get install wkhtmltopdf```
should do the trick.
If you get a `QXcbConnection: Could not connect to display` error, check [this issue](https://github.com/JazzCore/python-pdfkit/issues/82).
**Usage**
Copy the epub file into the cloned repository folder.
Run the main.py file, adding the name of the epub file as a commandline argument.
As shown below:
```python main.py epub-file-name```
================================================
FILE: getpy.py
================================================
from bs4 import BeautifulSoup as bs
import os
import re
import ntpath
class GetEngine(object):
    """
    This class contains the methods needed to get the files,
    to help make the pdf file.

    The class contains the following methods:
        get_all()    --- Which reads the .opf manifest and stores the path
                         of every manifest entry found on disk in `files`.
        get_html()   --- Which gets the html file names.
        get_pdf()    --- Which gets the pdf file names.
        get_css()    --- Which gets the css file names.
        get_images() --- Which gets the image file names.

    To create an instance of this object, pass in the name of the directory
    that stores all the extracted files from the epub file.

    get_html(), get_css() and get_images() filter the `files` list, so
    get_all() has to run before them; get_pdf() reads `html_files`, so
    get_html() has to run before it. Every getter appends to its list.
    """

    # Extensions are compared against the lower-cased path, so a file named
    # "COVER.JPG" or "Chapter1.HTML" is picked up as well.
    _HTML_EXTENSIONS = (".xhtml", ".html", ".htm")
    _CSS_EXTENSIONS = (".css",)
    _IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif")

    def __init__(self, directory):
        self.html_files = []
        self.css_files = []
        self.image_files = []
        self.directory = directory
        self.files = []
        self.pdf_files = []

    def _matching(self, extensions):
        """Return the entries of `files` ending with one of `extensions`
        (a tuple of lower-case suffixes), ignoring letter case."""
        return [path for path in self.files
                if path.lower().endswith(extensions)]

    def get_html(self):
        """Append every html/xhtml path in `files` to `html_files`."""
        self.html_files.extend(self._matching(self._HTML_EXTENSIONS))

    def get_pdf(self):
        """
        Append one pdf file name per entry of `html_files` to `pdf_files`.

        A name is "<n>.pdf" where n is the position of the first occurrence
        of the html file in `html_files` (the same number list.index()
        returns), so a repeated entry gets the same pdf name again.
        """
        first_position = {}
        for position, name in enumerate(self.html_files):
            # setdefault keeps the earliest position, like list.index()
            first_position.setdefault(name, position)
        self.pdf_files.extend(
            "{}.pdf".format(first_position[name]) for name in self.html_files
        )

    def get_css(self):
        """Append every css path in `files` to `css_files`."""
        self.css_files.extend(self._matching(self._CSS_EXTENSIONS))

    def get_images(self):
        """Append every png/jpg/jpeg/gif path in `files` to `image_files`."""
        self.image_files.extend(self._matching(self._IMAGE_EXTENSIONS))

    def _walk_directory(self):
        """Return (opf_path, directory_paths) for `self.directory`.

        opf_path is the first ".opf" file met during the walk (None when
        there is none); directory_paths lists every directory visited.
        """
        opf_path = None
        directory_paths = []
        for root, _dirs, files in os.walk(self.directory):
            # Every directory is recorded, even after the opf file has been
            # found, because manifest entries may live in any of them.
            directory_paths.append(root)
            if opf_path:
                continue
            for each in files:
                if each.endswith(".opf"):
                    opf_path = os.path.join(root, each)
                    break
        return opf_path, directory_paths

    def get_all(self):
        """
        Fill `files` with the on-disk path of every manifest entry.

        The manifest is read from the .opf file found under `self.directory`.
        When no .opf file exists the method returns without touching `files`.
        A path is stored only once, even if several manifest entries resolve
        to it.
        """
        opf_path, directory_paths = self._walk_directory()
        if not opf_path:
            return

        # Read bytes and let the parser work out the encoding from the XML
        # declaration instead of decoding with the platform default.
        with open(opf_path, "rb") as opf_file:
            xml_content = opf_file.read()

        xml_tree = bs(xml_content, features="xml")
        file_names = xml_tree.package.manifest.findAll('item')

        # Gets the name of all the documents in order
        # from the opf file, then saves the file name with its path
        # The file path in the opf file can't be relied upon
        # Hence, the need to extract file name and get its path
        seen = set(self.files)
        for item in file_names:
            file_path_match = re.match(r'.+\.[a-zA-Z]+', item.get('href', ''))
            if not file_path_match:
                continue
            file_name = ntpath.basename(file_path_match.group())
            for path in directory_paths:
                filepath = os.path.join(path, file_name)
                # Entries sharing a base name would otherwise add the same
                # path once per entry.
                if filepath in seen or not os.path.exists(filepath):
                    continue
                seen.add(filepath)
                self.files.append(filepath)
================================================
FILE: main.py
================================================
import sys
from manage import FileManager
from getpy import GetEngine
from pdfpy import PdfEngine
def process():
    """
    Convert the epub file named on the command line to a pdf file.

    The first command-line argument is the epub file. The file is renamed
    to a zip archive, extracted, converted section by section and merged
    into one pdf. A usage hint is printed when the argument is missing, and
    a message is printed when the argument does not end with ".epub"; in
    both cases nothing else happens.

    Whatever happens during the conversion, the archive is renamed back to
    its epub name, and the extraction directory is removed once it has been
    created, so a failed run does not leave the book renamed on disk.
    """
    if len(sys.argv) < 2:
        print("Usage: python main.py epub-file-name")
        return

    epub_file = sys.argv[1]
    if not epub_file.endswith(".epub"):
        print("File is not an epub file")
        return

    print('--- Epub to PDF conversion started')
    manager = FileManager(epub_file)
    manager.epub_to_zip()
    try:
        manager.get_directory()
        manager.extract_zip()
        try:
            engine = GetEngine(manager.directory)
            engine.get_all()
            engine.get_html()
            engine.get_pdf()
            engine.get_css()
            engine.get_images()

            pdf = PdfEngine(engine.html_files, engine.css_files,
                            engine.pdf_files, manager.directory)
            pdf.convert()
            pdf.combine()
            pdf.del_pdf()
        finally:
            # Only reached after extract_zip() succeeded, so the directory
            # exists and can be removed.
            manager.del_directory()
    finally:
        # Give the book its original name back even when a step failed.
        manager.zip_to_epub()
    print('--- Epub to PDF conversion successful')


if __name__ == "__main__":
    process()
================================================
FILE: manage.py
================================================
import os
import zipfile
import shutil
class FileManager(object):
    """
    This class is used for file interactions.

    It has the following methods:
        epub_to_zip()   --- Which renames the epub file to a zip file
        extract_zip()   --- Which extracts the content of the zip file
        get_directory() --- Which gets the directory name where content of
                            zip file is extracted
        zip_to_epub()   --- Which renames the zip file back to epub
        del_directory() --- Which deletes the directory where zip files
                            were extracted

    get_directory() has to be called before extract_zip() and
    del_directory(), because it sets the `directory` attribute they use.
    """

    def __init__(self, epub_file):
        self.epub_file = epub_file
        self.zip_file = "{}.zip".format(self._strip_epub_suffix(epub_file))
        self.directory = ""

    @staticmethod
    def _strip_epub_suffix(name):
        """Return `name` without a trailing ".epub"; other names are
        returned unchanged. Only the suffix is removed, so ".epub" inside
        the name (as in "a.epub.v2.epub") is kept."""
        suffix = ".epub"
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    def epub_to_zip(self):
        """Rename the epub file on disk to the zip file name."""
        os.rename(self.epub_file, self.zip_file)

    def extract_zip(self):
        """Extract the whole zip file into `self.directory`."""
        # The with-block closes the archive even when extraction fails.
        with zipfile.ZipFile(self.zip_file) as archive:
            archive.extractall(self.directory)

    def get_directory(self):
        """Set `self.directory` to the epub name without its ".epub" suffix
        and without any "(", ")" or space characters."""
        # NOTE(review): presumably these characters are dropped because they
        # are awkward in paths handed to external tools -- confirm.
        directory = self._strip_epub_suffix(self.epub_file)
        for unwanted in "() ":
            directory = directory.replace(unwanted, "")
        self.directory = directory

    def zip_to_epub(self):
        """Rename the zip file on disk back to the epub file name."""
        os.rename(self.zip_file, self.epub_file)

    def del_directory(self):
        """Delete `self.directory` and everything inside it."""
        shutil.rmtree(self.directory)
================================================
FILE: pdfpy.py
================================================
import pdfkit
import os
from PyPDF2 import PdfFileMerger
from PyPDF2.utils import PdfReadError
class PdfEngine(object):
    """
    This class carries operations on pdf files.

    It has the following methods:
        convert() --- Which converts each of the markup file
                      passed in to pdf. Markup file should be html
        combine() --- Which merges all of the pdf files created by
                      the convert method, creating a new file.
        del_pdf() --- Which deletes all the pdf files created by
                      the convert method.

    markup_files and pdf_files are parallel lists: the markup file at a
    position is converted into the pdf file name at the same position.
    style_files is stored but not used by the methods of this class.
    directory gives the name of the combined file, "<directory>.pdf".
    """

    def __init__(self, markup_files, style_files, pdf_files, directory):
        self.markup_files = markup_files
        self.style_files = style_files
        self.pdf_files = pdf_files
        self.directory = directory

    def convert(self):
        """Convert every markup file to the pdf file paired with it."""
        # "quiet" prevents the conversion process from showing terminal
        # updates. The options never change, so they are built once.
        options = {"enable-local-file-access": None, "quiet": ""}
        # Write to the names in pdf_files, which are the names combine()
        # and del_pdf() read afterwards.
        for markup, pdf in zip(self.markup_files, self.pdf_files):
            pdfkit.from_file(markup, pdf, options=options)
        print('--- Sections converted to pdf')

    def combine(self):
        """
        Merge the files in `pdf_files`, in order, into "<directory>.pdf".

        A section that raises PdfReadError is reported and left out, so one
        bad section does not abort the whole merge.
        """
        merger = PdfFileMerger()
        try:
            for pdf in self.pdf_files:
                try:
                    merger.append(pdf, import_bookmarks=False)
                except PdfReadError as error:
                    print('--- Skipped unreadable section {}: {}'.format(
                        pdf, error))
            merger.write("{}.pdf".format(self.directory))
            print('--- Sections combined together in a single pdf file')
        finally:
            # Close the merger whether or not writing succeeded.
            merger.close()

    def del_pdf(self):
        """Delete every file named in `pdf_files` that exists on disk."""
        for each in self.pdf_files:
            # A name can be listed twice or may never have been written;
            # removing a missing file would raise.
            if os.path.exists(each):
                os.remove(each)
        print('--- Individual pdf files deleted from directory')
================================================
FILE: requirements.txt
================================================
beautifulsoup4==4.5.3
lxml==4.6.3
pdfkit==0.6.1
PyPDF2==1.26.0
gitextract_dgir6t81/ ├── .gitignore ├── README.md ├── getpy.py ├── main.py ├── manage.py ├── pdfpy.py └── requirements.txt
SYMBOL INDEX (20 symbols across 4 files)
FILE: getpy.py
class GetEngine (line 7) | class GetEngine(object):
method __init__ (line 31) | def __init__(self, directory):
method get_html (line 39) | def get_html(self):
method get_pdf (line 45) | def get_pdf(self):
method get_css (line 50) | def get_css(self):
method get_images (line 56) | def get_images(self):
method get_all (line 62) | def get_all(self):
FILE: main.py
function process (line 6) | def process():
FILE: manage.py
class FileManager (line 6) | class FileManager(object):
method __init__ (line 32) | def __init__(self, epub_file):
method epub_to_zip (line 38) | def epub_to_zip(self):
method extract_zip (line 42) | def extract_zip(self):
method get_directory (line 47) | def get_directory(self):
method zip_to_epub (line 53) | def zip_to_epub(self):
method del_directory (line 57) | def del_directory(self):
FILE: pdfpy.py
class PdfEngine (line 7) | class PdfEngine(object):
method __init__ (line 25) | def __init__(self, markup_files, style_files, pdf_files, directory):
method convert (line 31) | def convert(self):
method combine (line 41) | def combine(self):
method del_pdf (line 57) | def del_pdf(self):
Condensed preview — 7 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (8K chars).
[
{
"path": ".gitignore",
"chars": 33,
"preview": "__pycache__/\n*.epub\n*.pdf\nvenv/*\n"
},
{
"path": "README.md",
"chars": 936,
"preview": "# EpubToPdf\n\nThis program converts epub documents to pdf documents.\n\n\n**Installation**\n\nClone this repository to your ma"
},
{
"path": "getpy.py",
"chars": 2380,
"preview": "from bs4 import BeautifulSoup as bs\nimport os\nimport re\nimport ntpath\n\n\nclass GetEngine(object):\n\n\t\"\"\"\n\t\t\n\t\tThis class c"
},
{
"path": "main.py",
"chars": 779,
"preview": "import sys\nfrom manage import FileManager\nfrom getpy import GetEngine\nfrom pdfpy import PdfEngine\n\ndef process():\n\n\tif s"
},
{
"path": "manage.py",
"chars": 1305,
"preview": "import os\nimport zipfile\nimport shutil\n\n\nclass FileManager(object):\n\n\n\t\"\"\"\n\t\t\n\t\tThis class is used for file interactions"
},
{
"path": "pdfpy.py",
"chars": 1449,
"preview": "import pdfkit\nimport os\nfrom PyPDF2 import PdfFileMerger\nfrom PyPDF2.utils import PdfReadError\n\n\nclass PdfEngine(object)"
},
{
"path": "requirements.txt",
"chars": 63,
"preview": "beautifulsoup4==4.5.3\nlxml==4.6.3\npdfkit==0.6.1\nPyPDF2==1.26.0\n"
}
]
About this extraction
This page contains the full source code of the HAKSOAT/EpubToPdf GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 7 files (6.8 KB), approximately 1.9k tokens, and a symbol index with 20 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.