Repository: lmbringas/packtpub-downloader Branch: master Commit: 457395e3ae6d Files: 9 Total size: 13.0 KB Directory structure: gitextract_28rn_m5h/ ├── .gitignore ├── README.md ├── config.py ├── data.env-sample ├── docker-compose.yml ├── entrypoint.sh ├── main.py ├── requirements.txt └── user.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ *.env ================================================ FILE: README.md ================================================ # PacktPub Downloader Script to download all your PacktPub books inspired by https://github.com/ozzieperez/packtpub-library-downloader Since PacktPub restructured their website [packtpub-library-downloader](https://github.com/ozzieperez/packtpub-library-downloader) became obsolete because the downloader used webscraping. So I figured out that now PacktPub uses a REST API. Then I found which endpoint to use for downloading books and made a simple script. Feel free to fork and PR to improve. Packtpub's API isn't documented :'( ## Usage: pip install -r requirements.txt python main.py -e -p [-d -b -s -v -q] ##### Example: Download books in PDF format python main.py -e hello@world.com -p p@ssw0rd -d ~/Desktop/packt -b pdf,epub,mobi,code ## Docker integration You must put your data in the `.env` file. ``` mv data.env-sample data.env ``` and replace the sample data with your login credentials. ``` docker-compose up ``` After the execution, you can see the content in the `book` directory. ## Commandline Options - *-e*, *--email* = Your login email - *-p*, *--password* = Your login password - *-d*, *--directory* = Directory to download into. Default is "media/" in the current directory - *-b*, *--books* = Assets to download. Options are: *pdf,mobi,epub,code* - *-s*, *--separate* = Create a separate directory for each book - *-v*, *--verbose* = Show more detailed information - *-q*, *--quiet* = Don't show information or progress bars **Book File Types** - *pdf*: PDF format - *mobi*: MOBI format - *epub*: EPUB format - *code*: Accompanying source code, saved as .zip files I'm working on Python 3.6.0 ================================================ FILE: config.py ================================================ # -*- coding: utf-8 -*- ''' This file contain all url endpoint ''' # instead of variables should i change variables to a one big json of urls ? # this is base url where i do the requests BASE_URL = "https://services.packtpub.com/" # URL to request jwt token, params by post are user and pass, return jwt token AUTH_ENDPOINT = "auth-v1/users/tokens" # URL to get all your books, two params that i change are offset and limit, method GET PRODUCTS_ENDPOINT = "entitlements-v1/users/me/products?sort=createdAt:DESC&offset={offset}&limit={limit}" # URL to get types , param is book id, method GET URL_BOOK_TYPES_ENDPOINT = "products-v1/products/{book_id}/types" # URL to get url file to download, params are book id and format of the file (can be pdf, epub, etc..), method GET URL_BOOK_ENDPOINT = "products-v1/products/{book_id}/files/{format}" ================================================ FILE: data.env-sample ================================================ EMAIL=email@example.com PASSWORD=example$password ================================================ FILE: docker-compose.yml ================================================ version: '3.3' services: packtpub-downloader: image: python:3.6.0 container_name: "packtpub-downloader" env_file: - data.env volumes: - "./:/app" command: "/bin/bash /app/entrypoint.sh" ================================================ FILE: entrypoint.sh ================================================ pip install -r /app/requirements.txt python /app/main.py -e $EMAIL -p $PASSWORD -d /app/book -b pdf,mobi,epub,code ================================================ FILE: main.py ================================================ # -*- coding: utf-8 -*- #!/usr/bin/python from __future__ import print_function import os import sys import glob import math import getopt import requests from tqdm import tqdm, trange from config import BASE_URL, PRODUCTS_ENDPOINT, URL_BOOK_TYPES_ENDPOINT, URL_BOOK_ENDPOINT from user import User #TODO: I should do a function that his only purpose is to request and return data def book_request(user, offset=0, limit=10, verbose=False): data = [] url = BASE_URL + PRODUCTS_ENDPOINT.format(offset=offset, limit=limit) if verbose: print(url) r = requests.get(url, headers=user.get_header()) data += r.json().get('data', []) return url, r, data def get_books(user, offset=0, limit=10, is_verbose=False, is_quiet=False): ''' Request all your books, return json with info of all your books Params ... header : str offset : int limit : int how many book wanna get by request ''' # TODO: given x time jwt expired and should refresh the header, user.refresh_header() url, r, data = book_request(user, offset, limit) print(f'You have {str(r.json()["count"])} books') print("Getting list of books...") if not is_quiet: pages_list = trange(r.json()['count'] // limit, unit='Pages') else: pages_list = range(r.json()['count'] // limit) for i in pages_list: offset += limit data += book_request(user, offset, limit, is_verbose)[2] return data def get_url_book(user, book_id, format='pdf'): ''' Return url of the book to download ''' url = BASE_URL + URL_BOOK_ENDPOINT.format(book_id=book_id, format=format) r = requests.get(url, headers=user.get_header()) if r.status_code == 200: # success return r.json().get('data', '') elif r.status_code == 401: # jwt expired user.refresh_header() # refresh token get_url_book(user, book_id, format) # call recursive print('ERROR (please copy and paste in the issue)') print(r.json()) print(r.status_code) return '' def get_book_file_types(user, book_id): ''' Return a list with file types of a book ''' url = BASE_URL + URL_BOOK_TYPES_ENDPOINT.format(book_id=book_id) r = requests.get(url, headers=user.get_header()) if (r.status_code == 200): # success return r.json()['data'][0].get('fileTypes', []) elif (r.status_code == 401): # jwt expired user.refresh_header() # refresh token get_book_file_types(user, book_id, format) # call recursive print('ERROR (please copy and paste in the issue)') print(r.json()) print(r.status_code) return [] # TODO: i'd like that this functions be async and download faster def download_book(filename, url): ''' Download your book ''' print('Starting to download ' + filename) with open(filename, 'wb') as f: r = requests.get(url, stream=True) total = r.headers.get('content-length') if total is None: f.write(response.content) else: total = int(total) # TODO: read more about tqdm for chunk in tqdm(r.iter_content(chunk_size=1024), total=math.ceil(total//1024), unit='KB', unit_scale=True): if chunk: # filter out keep-alive new chunks f.write(chunk) f.flush() print('Finished ' + filename) def make_zip(filename): if filename[-4:] == 'code': os.replace(filename, filename[:-4] + 'zip') def move_current_files(root, book): sub_dir = f'{root}/{book}' does_dir_exist(sub_dir) for f in glob.iglob(sub_dir + '.*'): try: os.rename(f, f'{sub_dir}/{book}' + f[f.index('.'):]) except OSError: os.rename(f, f'{sub_dir}/{book}' + '_1' + f[f.index('.'):]) except ValueError as e: print(e) print('Skipping') def does_dir_exist(directory): if not os.path.exists(directory): try: os.makedirs(directory) except Exception as e: print(e) sys.exit(2) def main(argv): # thanks to https://github.com/ozzieperez/packtpub-library-downloader/blob/master/downloader.py email = None password = None root_directory = 'media' book_file_types = ['pdf', 'mobi', 'epub', 'code'] separate = None verbose = None quiet = None errorMessage = 'Usage: main.py -e -p [-d -b -s -v -q]' # get the command line arguments/options try: opts, args = getopt.getopt( argv, 'e:p:d:b:svq', ['email=', 'pass=', 'directory=', 'books=', 'separate', 'verbose', 'quiet']) except getopt.GetoptError: print(errorMessage) sys.exit(2) # hold the values of the command line options for opt, arg in opts: if opt in ('-e', '--email'): email = arg elif opt in ('-p', '--pass'): password = arg elif opt in ('-d', '--directory'): root_directory = os.path.expanduser( arg) if '~' in arg else os.path.abspath(arg) elif opt in ('-b', '--books'): book_file_types = arg.split(',') elif opt in ('-s', '--separate'): separate = True elif opt in ('-v', '--verbose'): verbose = True elif opt in ('-q', '--quiet'): quiet = True if verbose and quiet: print("Verbose and quiet cannot be used together.") sys.exit(2) # do we have the minimum required info? if not email or not password: print(errorMessage) sys.exit(2) # check if not exists dir and create does_dir_exist(root_directory) # create user with his properly header user = User(email, password) # get all your books books = get_books(user, is_verbose=verbose, is_quiet=quiet) print('Downloading books...') if not quiet: books_iter = tqdm(books, unit='Book') else: books_iter = books for book in books_iter: # get the different file type of current book file_types = get_book_file_types(user, book['productId']) for file_type in file_types: if file_type in book_file_types: # check if the file type entered is available by the current book book_name = book['productName'].replace(' ', '_').replace('.', '_').replace(':', '_').replace('/','') if separate: filename = f'{root_directory}/{book_name}/{book_name}.{file_type}' move_current_files(root_directory, book_name) else: filename = f'{root_directory}/{book_name}.{file_type}' # get url of the book to download url = get_url_book(user, book['productId'], file_type) if not os.path.exists(filename) and not os.path.exists(filename.replace('.code', '.zip')): download_book(filename, url) make_zip(filename) else: if verbose: tqdm.write(f'{filename} already exists, skipping.') if __name__ == '__main__': main(sys.argv[1:]) ================================================ FILE: requirements.txt ================================================ aiofiles==0.4.0 aiohttp==3.5.4 async-timeout==3.0.1 attrs==18.2.0 certifi==2018.11.29 chardet==3.0.4 idna==2.8 idna-ssl==1.1.0 multidict==4.5.2 requests==2.21.0 tqdm==4.30.0 typing-extensions==3.7.2 urllib3==1.24.1 yarl==1.3.0 ================================================ FILE: user.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function import sys import requests from config import BASE_URL, AUTH_ENDPOINT class User: """ User object that contain his header """ username = "" password = "" # need to fill Authoritazion with current token provide by api header = { "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36", "Authorization":"" } def __init__(self, username, password): self.username = username self.password = password self.header["Authorization"] = self.get_token() def get_token(self): """ Request auth endpoint and return user token """ url = BASE_URL+AUTH_ENDPOINT # use json paramenter because for any reason they send user and pass in plain text :'( r = requests.post(url, json={'username':self.username, 'password':self.password}) if r.status_code == 200: print("You are in!") return 'Bearer ' + r.json()['data']['access'] # except should happend when user and pass are incorrect print("Error login, check user and password") print("Error {}".format(e)) sys.exit(2) def get_header(self): return self.header def refresh_header(self): """ Refresh jwt because it expired and returned """ self.header["Authorization"] = self.get_token() return self.header