[
  {
    "path": ".gitignore",
    "content": "__pycache__/\n*.epub\n*.pdf\nvenv/*\n"
  },
  {
    "path": "README.md",
    "content": "# EpubToPdf\n\nThis program converts epub documents to pdf documents.\n\n\n**Installation**\n\nClone this repository to your machine.\n\n\n**Requirements**\n\nTo install the requirements, run the following command:\n\n```pip install -r requirements.txt```\n\nAfter this, you install the _wkhtmltopdf_ which is a dependency of the _pdfkit_ module.\n\n\nwkhtmltopdf file can be downloaded [here](https://wkhtmltopdf.org/downloads.html)\n\nMake sure to add wkhtlmtopdf as an executable in Windows environment paths.\n\nFor Debian/Ubuntu users:\n\n\n```$ sudo apt-get install wkhtmltopdf```\n\nshould do the trick.\n\nIf you get a `QXcbConnection: Could not connect to display` error, check [this issue](https://github.com/JazzCore/python-pdfkit/issues/82).\n\n\n\n**Usage**\n\nCopy the epub file inside the repository folder when forked.\n\nRun the main.py file, adding the name of the epub file as a commandline argument.\n\nAs shown below:\n\n```python main.py epub-file-name```\n"
  },
  {
    "path": "getpy.py",
    "content": "from bs4 import BeautifulSoup as bs\nimport os\nimport re\nimport ntpath\n\n\nclass GetEngine(object):\n\n\t\"\"\"\n\t\t\n\t\tThis class contains the methods needed to get the files,\n\t\tto help make the pdf file.\n\n\t\tThe class contains the following methods:\n\n\t\tget_html() --- Which gets the html file names.\n\n\t\tget_pdf() --- Which gets the pdf file names.\n\n\t\tget_css() --- Which gets the css file names.\n\n\t\tget_images() --- Which gets the image file names.\n\n\n\t\tTo create an instance of this object, pass in the name of the directory\n\t\tthat stores all the extracted files from the epub file.\n\n\n\t\"\"\"\n\n\tdef __init__(self, directory):\n\t\tself.html_files = []\n\t\tself.css_files = []\n\t\tself.image_files = []\n\t\tself.directory = directory\n\t\tself.files = []\n\t\tself.pdf_files = []\n\n\tdef get_html(self):\n\n\t\tfor file in self.files:\n\t\t\tif file.endswith(\".xhtml\") or file.endswith(\".html\"):\n\t\t\t\tself.html_files.append(file)\n\n\tdef get_pdf(self):\n\n\t\tfor file in self.html_files:\n\t\t\tself.pdf_files.append(\"{}.pdf\".format(self.html_files.index(file)))\n\n\tdef get_css(self):\n\n\t\tfor file in self.files:\n\t\t\tif file.endswith(\".css\"):\n\t\t\t\tself.css_files.append(file)\n\n\tdef get_images(self):\n\n\t\tfor file in self.files:\n\t\t\tif file.endswith((\".png\", \".jpg\", \".gif\")):\n\t\t\t\tself.image_files.append(file)\n\n\tdef get_all(self):\n\t\tfile = None\n\t\tdirectory_paths = []\n\t\tfor root, dirs, files in os.walk(self.directory):\n\t\t\t#This traverses the directory passed in as an argument,\n\t\t\t#returns the current directory, the sub directories and all the files\n\t\t\tdirectory_paths.append(root)\n\t\t\tif file:\n\t\t\t\tcontinue\n\t\t\tfor each in files:\n\t\t\t\tif each.endswith(\".opf\"):\n\t\t\t\t\tfile = os.path.join(root, each)\n\t\t\t\t\tcontinue\n\t\tif not file:\n\t\t\treturn\n\n\t\txml_content = open(file, \"r\").read()\n\n\t\txml_tree = bs(xml_content, features = \"xml\")\n\n\t\tfile_names = xml_tree.package.manifest.findAll('item')\n\n\t\t# Gets the name of all the documents in order\n\t\t# from the opf file, then saves the file name with its path\n\t\t# The file path in the opf file can't be relied upon\n\t\t# Hence, the need to extract file name and get its path\n\n\t\tfor file in file_names:\n\t\t\tfile_path_match = re.match(r'.+\\.[a-zA-Z]+', file.get('href', ''))\n\t\t\tif not file_path_match:\n\t\t\t\tcontinue\n\t\t\tfile_name = ntpath.basename(file_path_match.group())\n\t\t\tfor path in directory_paths:\n\t\t\t\tfilepath = path + '/' + file_name\n\t\t\t\tif os.path.exists(filepath):\n\t\t\t\t\tself.files.append(filepath)\n"
  },
  {
    "path": "main.py",
    "content": "import sys\nfrom manage import FileManager\nfrom getpy import GetEngine\nfrom pdfpy import PdfEngine\n\ndef process():\n\n\tif sys.argv[1].endswith(\".epub\"):\n\n\t\tprint('--- Epub to PDF conversion started')\n\n\t\tepub_file = sys.argv[1]\n\t\tfile = FileManager(epub_file)\n\t\tfile.epub_to_zip()\n\t\tfile.get_directory()\n\t\tfile.extract_zip()\n\t\tengine = GetEngine(file.directory)\n\t\tengine.get_all()\n\t\tengine.get_html()\n\t\tengine.get_pdf()\n\t\tengine.get_css()\n\t\tengine.get_images()\n\t\tpdf = PdfEngine(engine.html_files, engine.css_files,\n\t\t\t\t\t\tengine.pdf_files, file.directory)\n\t\tpdf.convert()\n\t\tpdf.combine()\n\t\tpdf.del_pdf()\n\t\tfile.zip_to_epub()\n\t\tfile.del_directory()\n\n\t\tprint('--- Epub to PDF conversion successful')\n\n\telse:\n\n\t\tprint(\"File is not an epub file\")\n\n\nif __name__ == \"__main__\":\n\tprocess()\n"
  },
  {
    "path": "manage.py",
    "content": "import os\nimport zipfile\nimport shutil\n\n\nclass FileManager(object):\n\n\n\t\"\"\"\n\t\t\n\t\tThis class is used for file interactions.\n\n\t\tIt has the following methods:\n\n\t\tepub_to_zip() --- Which converts the epub file to a zip file\n\n\t\textract_zip() --- Which extracts the content of the zip file\n\n\t\tget_directory() --- Which gets the directory name where content of\n\t\t \t\t\t\t\tzip file was extracted\n\n\t\tzip_to_epub() --- Which converts the zip file back to epub\n\n\t\tdel_directory() --- Which deletes the directory where zip files\n\t\t\t\t\t\t\twere extracted\n\n\t\tdel_pdf() --- Which deletes the pdf files created by \n\n\n\t\"\"\"\n\n\tdef __init__(self, epub_file):\n\t\tself.epub_file = epub_file\n\t\tself.zip_file = \"{}.zip\".format(epub_file.split(\".epub\")[0])\n\t\tself.directory = \"\"\n\n\n\tdef epub_to_zip(self):\n\t\tos.rename(self.epub_file, self.zip_file)\n\n\n\tdef extract_zip(self):\n\t\textracted_files = zipfile.ZipFile(self.zip_file)\n\t\textracted_files.extractall(self.directory)\n\t\textracted_files.close()\n\n\tdef get_directory(self):\n\t\tminus_open_paren = self.epub_file.split(\".epub\")[0].replace(\"(\", \"\")\n\t\tminus_close_paren = minus_open_paren.replace(\")\", \"\")\n\t\tself.directory = minus_close_paren.replace(\" \", \"\")\n\t\t\n\n\tdef zip_to_epub(self):\n\t\tos.rename(self.zip_file, self.epub_file)\n\n\n\tdef del_directory(self):\n\t\tshutil.rmtree(self.directory)\n\n\n\n"
  },
  {
    "path": "pdfpy.py",
    "content": "import pdfkit\nimport os\nfrom PyPDF2 import PdfFileMerger\nfrom PyPDF2.utils import PdfReadError\n\n\nclass PdfEngine(object):\n\n\t\"\"\"\n\t\tThis class carries operations on pdf files.\n\n\t\tIt has the following methods:\n\n\t\tconvert() --- Which converts each of the markup file\n\t\tpassed in to pdf. Markup file should be html\n\n\t\tcombine() --- Which merges all of the pdf files created by\n\t\tthe convert method, creating a new file.\n\n\t\tdel_pdf() --- Which deletes all the pdf files created by\n\t\tthe convert method.\n\n\t\"\"\"\n\n\tdef __init__(self, markup_files, style_files, pdf_files, directory):\n\t\tself.markup_files = markup_files\n\t\tself.style_files = style_files\n\t\tself.pdf_files = pdf_files\n\t\tself.directory = directory\n\n\tdef convert(self):\n\t\tfor each in self.markup_files:\n\n\t\t\t# Prevent conversion process from showing terminal updates\n\t\t\toptions = {\"enable-local-file-access\": None, \"quiet\": \"\"}\n\t\t\tpdfkit.from_file(each, \"{}.pdf\".format(self.markup_files.index(each)),\n\t\t\t\t\t\t\t options=options)\n\n\t\tprint('--- Sections converted to pdf')\n\n\tdef combine(self):\n\n\t\tmerger = PdfFileMerger()\n\n\t\tfor pdf in self.pdf_files:\n\t\t\ttry:\n\t\t\t\tmerger.append(pdf, import_bookmarks=False)\n\t\t\texcept PdfReadError:\n\t\t\t\tpass\n\n\t\tmerger.write(\"{}.pdf\".format(self.directory))\n\n\t\tprint('--- Sections combined together in a single pdf file')\n\n\t\tmerger.close()\n\n\tdef del_pdf(self):\n\t\t\tfor each in self.pdf_files:\n\t\t\t\tos.remove(each)\n\t\t\tprint('--- Individual pdf files deleted from directory')\n"
  },
  {
    "path": "requirements.txt",
    "content": "beautifulsoup4==4.5.3\nlxml==4.6.3\npdfkit==0.6.1\nPyPDF2==1.26.0\n"
  }
]