Repository: HAKSOAT/EpubToPdf Branch: master Commit: 213a88d1e43d Files: 7 Total size: 6.8 KB Directory structure: gitextract_dgir6t81/ ├── .gitignore ├── README.md ├── getpy.py ├── main.py ├── manage.py ├── pdfpy.py └── requirements.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ __pycache__/ *.epub *.pdf venv/* ================================================ FILE: README.md ================================================ # EpubToPdf This program converts epub documents to pdf documents. **Installation** Clone this repository to your machine. **Requirements** To install the requirements, run the following command: ```pip install -r requirements.txt``` After this, you install the _wkhtmltopdf_ which is a dependency of the _pdfkit_ module. wkhtmltopdf file can be downloaded [here](https://wkhtmltopdf.org/downloads.html) Make sure to add wkhtlmtopdf as an executable in Windows environment paths. For Debian/Ubuntu users: ```$ sudo apt-get install wkhtmltopdf``` should do the trick. If you get a `QXcbConnection: Could not connect to display` error, check [this issue](https://github.com/JazzCore/python-pdfkit/issues/82). **Usage** Copy the epub file inside the repository folder when forked. Run the main.py file, adding the name of the epub file as a commandline argument. As shown below: ```python main.py epub-file-name``` ================================================ FILE: getpy.py ================================================ from bs4 import BeautifulSoup as bs import os import re import ntpath class GetEngine(object): """ This class contains the methods needed to get the files, to help make the pdf file. The class contains the following methods: get_html() --- Which gets the html file names. get_pdf() --- Which gets the pdf file names. get_css() --- Which gets the css file names. get_images() --- Which gets the image file names. To create an instance of this object, pass in the name of the directory that stores all the extracted files from the epub file. """ def __init__(self, directory): self.html_files = [] self.css_files = [] self.image_files = [] self.directory = directory self.files = [] self.pdf_files = [] def get_html(self): for file in self.files: if file.endswith(".xhtml") or file.endswith(".html"): self.html_files.append(file) def get_pdf(self): for file in self.html_files: self.pdf_files.append("{}.pdf".format(self.html_files.index(file))) def get_css(self): for file in self.files: if file.endswith(".css"): self.css_files.append(file) def get_images(self): for file in self.files: if file.endswith((".png", ".jpg", ".gif")): self.image_files.append(file) def get_all(self): file = None directory_paths = [] for root, dirs, files in os.walk(self.directory): #This traverses the directory passed in as an argument, #returns the current directory, the sub directories and all the files directory_paths.append(root) if file: continue for each in files: if each.endswith(".opf"): file = os.path.join(root, each) continue if not file: return xml_content = open(file, "r").read() xml_tree = bs(xml_content, features = "xml") file_names = xml_tree.package.manifest.findAll('item') # Gets the name of all the documents in order # from the opf file, then saves the file name with its path # The file path in the opf file can't be relied upon # Hence, the need to extract file name and get its path for file in file_names: file_path_match = re.match(r'.+\.[a-zA-Z]+', file.get('href', '')) if not file_path_match: continue file_name = ntpath.basename(file_path_match.group()) for path in directory_paths: filepath = path + '/' + file_name if os.path.exists(filepath): self.files.append(filepath) ================================================ FILE: main.py ================================================ import sys from manage import FileManager from getpy import GetEngine from pdfpy import PdfEngine def process(): if sys.argv[1].endswith(".epub"): print('--- Epub to PDF conversion started') epub_file = sys.argv[1] file = FileManager(epub_file) file.epub_to_zip() file.get_directory() file.extract_zip() engine = GetEngine(file.directory) engine.get_all() engine.get_html() engine.get_pdf() engine.get_css() engine.get_images() pdf = PdfEngine(engine.html_files, engine.css_files, engine.pdf_files, file.directory) pdf.convert() pdf.combine() pdf.del_pdf() file.zip_to_epub() file.del_directory() print('--- Epub to PDF conversion successful') else: print("File is not an epub file") if __name__ == "__main__": process() ================================================ FILE: manage.py ================================================ import os import zipfile import shutil class FileManager(object): """ This class is used for file interactions. It has the following methods: epub_to_zip() --- Which converts the epub file to a zip file extract_zip() --- Which extracts the content of the zip file get_directory() --- Which gets the directory name where content of zip file was extracted zip_to_epub() --- Which converts the zip file back to epub del_directory() --- Which deletes the directory where zip files were extracted del_pdf() --- Which deletes the pdf files created by """ def __init__(self, epub_file): self.epub_file = epub_file self.zip_file = "{}.zip".format(epub_file.split(".epub")[0]) self.directory = "" def epub_to_zip(self): os.rename(self.epub_file, self.zip_file) def extract_zip(self): extracted_files = zipfile.ZipFile(self.zip_file) extracted_files.extractall(self.directory) extracted_files.close() def get_directory(self): minus_open_paren = self.epub_file.split(".epub")[0].replace("(", "") minus_close_paren = minus_open_paren.replace(")", "") self.directory = minus_close_paren.replace(" ", "") def zip_to_epub(self): os.rename(self.zip_file, self.epub_file) def del_directory(self): shutil.rmtree(self.directory) ================================================ FILE: pdfpy.py ================================================ import pdfkit import os from PyPDF2 import PdfFileMerger from PyPDF2.utils import PdfReadError class PdfEngine(object): """ This class carries operations on pdf files. It has the following methods: convert() --- Which converts each of the markup file passed in to pdf. Markup file should be html combine() --- Which merges all of the pdf files created by the convert method, creating a new file. del_pdf() --- Which deletes all the pdf files created by the convert method. """ def __init__(self, markup_files, style_files, pdf_files, directory): self.markup_files = markup_files self.style_files = style_files self.pdf_files = pdf_files self.directory = directory def convert(self): for each in self.markup_files: # Prevent conversion process from showing terminal updates options = {"enable-local-file-access": None, "quiet": ""} pdfkit.from_file(each, "{}.pdf".format(self.markup_files.index(each)), options=options) print('--- Sections converted to pdf') def combine(self): merger = PdfFileMerger() for pdf in self.pdf_files: try: merger.append(pdf, import_bookmarks=False) except PdfReadError: pass merger.write("{}.pdf".format(self.directory)) print('--- Sections combined together in a single pdf file') merger.close() def del_pdf(self): for each in self.pdf_files: os.remove(each) print('--- Individual pdf files deleted from directory') ================================================ FILE: requirements.txt ================================================ beautifulsoup4==4.5.3 lxml==4.6.3 pdfkit==0.6.1 PyPDF2==1.26.0