Repository: HAKSOAT/EpubToPdf
Branch: master
Commit: 213a88d1e43d
Files: 7
Total size: 6.8 KB

Directory structure:
gitextract_dgir6t81/

├── .gitignore
├── README.md
├── getpy.py
├── main.py
├── manage.py
├── pdfpy.py
└── requirements.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
__pycache__/
*.epub
*.pdf
venv/*


================================================
FILE: README.md
================================================
# EpubToPdf

This program converts epub documents to pdf documents.


**Installation**

Clone this repository to your machine.


**Requirements**

To install the requirements, run the following command:

```pip install -r requirements.txt```

After this, install _wkhtmltopdf_, which is a dependency of the _pdfkit_ module.


wkhtmltopdf can be downloaded [here](https://wkhtmltopdf.org/downloads.html).

On Windows, make sure to add the folder containing the wkhtmltopdf executable to the PATH environment variable.

For Debian/Ubuntu users:


```$ sudo apt-get install wkhtmltopdf```

should do the trick.

If you get a `QXcbConnection: Could not connect to display` error, check [this issue](https://github.com/JazzCore/python-pdfkit/issues/82).


**Usage**

Copy the epub file into the cloned repository folder.

Run the main.py file, passing the name of the epub file as a command-line argument.

As shown below:

```python main.py epub-file-name```


================================================
FILE: getpy.py
================================================
from bs4 import BeautifulSoup as bs
import os
import re
import ntpath


class GetEngine(object):

	"""

		This class contains the methods needed to get the files,
		to help make the pdf file.

		The class contains the following methods:

		get_all() --- Which reads the opf manifest and records the path of
		every listed file that is found in the extracted directory.

		get_html() --- Which gets the html file names.

		get_pdf() --- Which gets the pdf file names.

		get_css() --- Which gets the css file names.

		get_images() --- Which gets the image file names.

		get_all() has to run first, because get_html(), get_css() and
		get_images() only filter the list it builds. get_pdf() in turn
		needs the result of get_html().

		To create an instance of this object, pass in the name of the directory
		that stores all the extracted files from the epub file.


	"""

	def __init__(self, directory):
		self.html_files = []
		self.css_files = []
		self.image_files = []
		self.directory = directory
		self.files = []
		self.pdf_files = []

	def get_html(self):
		"""
			Appends every .xhtml/.html path in self.files to self.html_files.
		"""

		for file in self.files:
			if file.endswith((".xhtml", ".html")):
				self.html_files.append(file)

	def get_pdf(self):
		"""
			Appends one pdf file name per entry of self.html_files
			to self.pdf_files. The name is the position of the entry:
			"0.pdf", "1.pdf", ...
		"""

		# enumerate() numbers the entries by position. Looking the position
		# up with list.index() is quadratic and hands the same name to
		# entries that compare equal.
		for position, _ in enumerate(self.html_files):
			self.pdf_files.append("{}.pdf".format(position))

	def get_css(self):
		"""
			Appends every .css path in self.files to self.css_files.
		"""

		for file in self.files:
			if file.endswith(".css"):
				self.css_files.append(file)

	def get_images(self):
		"""
			Appends every .png/.jpg/.gif path in self.files
			to self.image_files.
		"""

		for file in self.files:
			if file.endswith((".png", ".jpg", ".gif")):
				self.image_files.append(file)

	def get_all(self):
		"""
			Fills self.files with the paths of the files listed in the
			manifest of the opf file, in manifest order.

			Returns None. Leaves self.files untouched when no opf file is
			found under self.directory or when the opf file has no
			package/manifest element.
		"""
		opf_path = None
		directory_paths = []
		for root, dirs, files in os.walk(self.directory):
			#This traverses the directory passed in as an argument,
			#returns the current directory, the sub directories and all the files
			directory_paths.append(root)
			if opf_path:
				# The opf file is already known, the walk only goes on
				# to collect the remaining directory paths
				continue
			for each in files:
				if each.endswith(".opf"):
					opf_path = os.path.join(root, each)
		if not opf_path:
			return

		# The raw bytes are handed to the parser so that the encoding
		# declared in the xml prolog is used instead of the locale default.
		# The with block closes the file even when reading fails.
		with open(opf_path, "rb") as opf_file:
			xml_tree = bs(opf_file.read(), features = "xml")

		package = xml_tree.package
		manifest = package.manifest if package is not None else None
		if manifest is None:
			return

		# Gets the name of all the documents in order
		# from the opf file, then saves the file name with its path
		# The file path in the opf file can't be relied upon
		# Hence, the need to extract file name and get its path

		# A file name that exists in more than one directory is matched
		# by every manifest item carrying that name, the set keeps each
		# path from being recorded more than once.
		recorded = set(self.files)

		for item in manifest.find_all('item'):
			file_path_match = re.match(r'.+\.[a-zA-Z]+', item.get('href', ''))
			if not file_path_match:
				continue
			# ntpath.basename() splits on both "/" and "\"
			file_name = ntpath.basename(file_path_match.group())
			for path in directory_paths:
				filepath = os.path.join(path, file_name)
				if filepath in recorded:
					continue
				if os.path.isfile(filepath):
					recorded.add(filepath)
					self.files.append(filepath)


================================================
FILE: main.py
================================================
import sys
from manage import FileManager
from getpy import GetEngine
from pdfpy import PdfEngine

def process():
	"""
		Converts the epub file named on the command line to a pdf file.

		The file name is read from sys.argv[1]. A usage line is printed
		when the argument is missing, and a notice is printed when the
		name does not end with ".epub". Nothing else is done in either case.

		The epub file is renamed to a zip file while the conversion runs.
		The rename is undone and the extraction directory is deleted
		even when the conversion fails, the error is then re-raised.
	"""

	if len(sys.argv) < 2:
		print("Usage: python main.py epub-file-name")
		return

	epub_file = sys.argv[1]

	if not epub_file.endswith(".epub"):
		print("File is not an epub file")
		return

	print('--- Epub to PDF conversion started')

	manager = FileManager(epub_file)
	manager.epub_to_zip()
	try:
		manager.get_directory()
		manager.extract_zip()
		try:
			engine = GetEngine(manager.directory)
			engine.get_all()
			engine.get_html()
			engine.get_pdf()
			engine.get_css()
			engine.get_images()
			pdf = PdfEngine(engine.html_files, engine.css_files,
							engine.pdf_files, manager.directory)
			pdf.convert()
			pdf.combine()
			pdf.del_pdf()
		finally:
			# The directory exists from here on, remove it on any outcome
			manager.del_directory()
	finally:
		# Give the user's file its .epub name back on any outcome
		manager.zip_to_epub()

	print('--- Epub to PDF conversion successful')


# Start the conversion only when this file is run as a script,
# importing the module does not trigger it.
if __name__ == "__main__":
	process()


================================================
FILE: manage.py
================================================
import os
import zipfile
import shutil


class FileManager(object):


	"""

		This class is used for file interactions.

		It has the following methods:

		epub_to_zip() --- Which converts the epub file to a zip file

		extract_zip() --- Which extracts the content of the zip file

		get_directory() --- Which gets the directory name where content of
		 					zip file was extracted

		zip_to_epub() --- Which converts the zip file back to epub

		del_directory() --- Which deletes the directory where zip files
							were extracted

		To create an instance of this object, pass in the name
		of the epub file.


	"""

	def __init__(self, epub_file):
		self.epub_file = epub_file
		self.zip_file = "{}.zip".format(self._strip_epub_suffix(epub_file))
		self.directory = ""


	@staticmethod
	def _strip_epub_suffix(name):
		"""
			Returns name without a trailing ".epub".

			Only the suffix is removed. A ".epub" elsewhere in the name is
			kept, and a name without the suffix is returned unchanged.
		"""
		suffix = ".epub"
		if name.endswith(suffix):
			return name[:-len(suffix)]
		return name


	def epub_to_zip(self):
		"""
			Renames the epub file to self.zip_file.

			Raises OSError when self.zip_file already exists, since the
			rename would otherwise replace that file on some platforms.
		"""
		if os.path.exists(self.zip_file):
			raise OSError("{} already exists".format(self.zip_file))
		os.rename(self.epub_file, self.zip_file)


	def extract_zip(self):
		"""
			Extracts the content of self.zip_file into self.directory.
		"""
		# The with block closes the archive even when the extraction fails
		with zipfile.ZipFile(self.zip_file) as extracted_files:
			extracted_files.extractall(self.directory)

	def get_directory(self):
		"""
			Sets self.directory to the epub file name without the ".epub"
			suffix and without parentheses and spaces.
		"""
		name = self._strip_epub_suffix(self.epub_file)
		for unwanted in ("(", ")", " "):
			name = name.replace(unwanted, "")
		self.directory = name


	def zip_to_epub(self):
		"""
			Renames self.zip_file back to the epub file name.
		"""
		os.rename(self.zip_file, self.epub_file)


	def del_directory(self):
		"""
			Deletes self.directory together with everything inside it.
		"""
		shutil.rmtree(self.directory)


================================================
FILE: pdfpy.py
================================================
import pdfkit
import os
from PyPDF2 import PdfFileMerger
from PyPDF2.utils import PdfReadError


class PdfEngine(object):

	"""
		This class carries operations on pdf files.

		It has the following methods:

		convert() --- Which converts each of the markup file
		passed in to pdf. Markup file should be html

		combine() --- Which merges all of the pdf files created by
		the convert method, creating a new file.

		del_pdf() --- Which deletes all the pdf files created by
		the convert method.

		To create an instance of this object, pass in the markup file
		names, the style file names, the pdf file names (one per markup
		file, in the same order) and the name that the combined pdf file
		is written to, without the ".pdf" extension.

	"""

	def __init__(self, markup_files, style_files, pdf_files, directory):
		self.markup_files = markup_files
		self.style_files = style_files
		self.pdf_files = pdf_files
		self.directory = directory

	def convert(self):
		"""
			Converts every markup file to the pdf file at the same
			position in self.pdf_files.
		"""

		# Prevent conversion process from showing terminal updates
		options = {"enable-local-file-access": None, "quiet": ""}

		# The output names are taken from self.pdf_files, these are the
		# names that combine() and del_pdf() work with afterwards
		for markup_file, pdf_file in zip(self.markup_files, self.pdf_files):
			pdfkit.from_file(markup_file, pdf_file, options=options)

		print('--- Sections converted to pdf')

	def combine(self):
		"""
			Merges the files in self.pdf_files, in order, into
			"<self.directory>.pdf".

			A section that can not be read as pdf is left out of the
			combined file and reported on the terminal.
		"""

		merger = PdfFileMerger()

		try:
			for pdf in self.pdf_files:
				try:
					merger.append(pdf, import_bookmarks=False)
				except PdfReadError:
					print('--- Skipped unreadable section: {}'.format(pdf))

			merger.write("{}.pdf".format(self.directory))
		finally:
			# Closed on any outcome, so that no file stays open
			# when del_pdf() removes the sections
			merger.close()

		print('--- Sections combined together in a single pdf file')

	def del_pdf(self):
		"""
			Deletes the files in self.pdf_files.

			A name that is listed twice or that has no file on disk
			is skipped instead of raising an error.
		"""
		for each in self.pdf_files:
			if os.path.exists(each):
				os.remove(each)
		print('--- Individual pdf files deleted from directory')


================================================
FILE: requirements.txt
================================================
beautifulsoup4==4.5.3
lxml==4.6.3
pdfkit==0.6.1
PyPDF2==1.26.0