Repository: bitbybyte/fantiadl Branch: master Commit: 8a3a4e848a0b Files: 12 Total size: 47.5 KB Directory structure: gitextract_yo9s0qtf/ ├── .gitignore ├── LICENSE ├── README.md ├── fantiadl/ │ ├── __init__.py │ ├── __main__.py │ ├── __version__.py │ ├── db.py │ ├── fantiadl.py │ └── models.py ├── fantiadl.py ├── requirements.txt └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # File downloads *.jpg *.jpe *.png *.gif *.mp4 *.webm *.zip *.incomplete # Metadata *.json .incomplete # Logs *.log # Crawljobs *.crawljob # Jetbrains .idea/ # Database *.db ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 bitbybyte Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # FantiaDL Download media and other data from Fantia fanclubs and posts. A session cookie must be provided with the -c/--cookie argument directly or by passing the path to a legacy Netscape cookies file. Please see the [About Session Cookies](#about-session-cookies) section. 
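A minimal invocation looks like this (the fanclub ID, post ID, and paths below are placeholders; the full option list follows):

```
fantiadl -c ~/cookies.txt -o ~/fantia-downloads https://fantia.jp/fanclubs/1234
fantiadl -c ~/cookies.txt https://fantia.jp/posts/123456
```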
```
usage: fantiadl [options] url

positional arguments:
  url                   fanclub or post URL

options:
  -h, --help            show this help message and exit
  -c SESSION_COOKIE, --cookie SESSION_COOKIE
                        _session_id cookie or cookies.txt
  -q, --quiet           suppress output
  -v, --version         show program's version number and exit
  --db DB_PATH          database to track post download state (creates tables
                        when first specified)
  --db-bypass-post-check
                        bypass checking a post for new content if it's marked
                        as completed on the database

download options:
  -i, --ignore-errors   continue on download errors
  -l #, --limit #       limit the number of posts to process per fanclub
                        (excludes -n)
  -o OUTPUT_PATH, --output-directory OUTPUT_PATH
                        directory to download to
  -s, --use-server-filenames
                        download using server defined filenames
  -r, --mark-incomplete-posts
                        add .incomplete file to post directories that are
                        incomplete
  -m, --dump-metadata   store metadata to file (including fanclub icon,
                        header, and background)
  -x, --parse-for-external-links
                        parse posts for external links
  -t, --download-thumbnail
                        download post thumbnails
  -f, --download-fanclubs
                        download posts from all followed fanclubs
  -p, --download-paid-fanclubs
                        download posts from all fanclubs backed on a paid plan
  -n #, --download-new-posts #
                        download a specified number of new posts from your
                        fanclub timeline
  -d %Y-%m, --download-month %Y-%m
                        download posts only from a specific month, e.g.
                        2007-08 (excludes -n)
  --exclude EXCLUDE_FILE
                        file containing a list of filenames to exclude from
                        downloading
```

To track post downloads, specify a database path using `--db`, e.g. `--db ~/fantiadl.db`. Post content that has already been downloaded will be skipped. When all post contents under a parent post have been downloaded, the post is marked complete in the database. If a later request to download a post indicates the post was modified (based on its timestamp), it is checked for new contents; this behavior can be disabled with `--db-bypass-post-check`.

When parsing for external links using `-x`, a .crawljob file is created in your root directory (either the directory provided with `-o` or the directory the script is being run from) that can be parsed by [JDownloader](http://jdownloader.org/). As posts are parsed, links are appended and assigned their appropriate post directories for download. You can import this file manually into JDownloader (File -> Load Linkcontainer) or set up the Folder Watch plugin to watch your root directory for .crawljob files.

## About Session Cookies

Due to recent changes imposed by Fantia, logging in with an email and password from the command line is no longer supported. To log in, you will need to provide the `_session_id` cookie for your Fantia login session using -c/--cookie. After logging in normally in your browser, this value can be extracted and used with FantiaDL. The value expires and may need to be updated with some regularity.

### Mozilla Firefox

1. On https://fantia.jp, press Ctrl + Shift + I to open Developer Tools.
2. Select the Storage tab at the top. In the sidebar, select https://fantia.jp under the Cookies heading.
3. Locate the `_session_id` cookie name. Click on the value to copy it.

### Google Chrome

1. On https://fantia.jp, press Ctrl + Shift + I to open DevTools.
2. Select the Application tab at the top. In the sidebar, expand Cookies under the Storage heading and select https://fantia.jp.
3. Locate the `_session_id` cookie name. Click on the value to copy it.
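Once copied, the value can be passed straight to FantiaDL. A minimal example (the cookie value and post ID here are placeholders, not real values):

```
fantiadl -c 1a2b3c4d5e6f... https://fantia.jp/posts/123456
```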
### Third-Party Extensions (cookies.txt) You also have the option of passing the path to a legacy Netscape format cookies file with -c/--cookie, e.g. `-c ~/cookies.txt`. Using an extension like [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg), create a text file matching the accepted format: ``` # Netscape HTTP Cookie File # https://curl.haxx.se/rfc/cookie_spec.html # This is a generated file! Do not edit. fantia.jp FALSE / FALSE 1595755239 _session_id a1b2c3d4... ``` Only the `_session_id` cookie is required. ## Download `pip install fantiadl` https://pypi.org/project/fantiadl/ Binaries are also provided for [new releases](https://github.com/bitbybyte/fantiadl/releases/latest). ## Build Requirements - Python >=3.7 - requests - beautifulsoup4 ## Roadmap - More robust logging ================================================ FILE: fantiadl/__init__.py ================================================ from . import fantiadl ================================================ FILE: fantiadl/__main__.py ================================================ from .fantiadl import cli cli() ================================================ FILE: fantiadl/__version__.py ================================================ __version__ = "2.0.4" ================================================ FILE: fantiadl/db.py ================================================ import time import sqlite3 class FantiaDlDatabase: def __init__(self, db_path): if db_path is None: self.conn = None return self.conn = sqlite3.connect(db_path) self.conn.row_factory = sqlite3.Row self.cursor = self.conn.cursor() self.cursor.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY, timestamp INTEGER)") self.cursor.execute("CREATE TABLE IF NOT EXISTS posts (id INTEGER PRIMARY KEY, title TEXT, fanclub INTEGER, posted_at INTEGER, converted_at INTEGER, download_complete INTEGER, timestamp INTEGER)") self.cursor.execute("CREATE TABLE IF NOT EXISTS post_contents (id INTEGER PRIMARY KEY, parent_post INTEGER, title TEXT, category TEXT, price INTEGER, currency TEXT, timestamp INTEGER, FOREIGN KEY(parent_post) REFERENCES posts(id))") self.conn.commit() def __del__(self): if self.conn is not None: self.conn.close() # Helper methods def execute(self, query, args): if self.conn is None: return self.cursor.execute(query, args) self.conn.commit() def fetchone(self, query, args): if self.conn is None: return None self.cursor.execute(query, args) return self.cursor.fetchone() # INSERT, REPLACE def insert_post(self, id, title, fanclub, posted_at, converted_at): self.execute("REPLACE INTO posts VALUES (?, ?, ?, ?, ?, 0, ?)", (id, title, fanclub, posted_at, converted_at, int(time.time()))) def insert_post_content(self, id, parent_post, title, category, price, price_unit): self.execute("INSERT INTO post_contents VALUES (?, ?, ?, ?, ?, ?, ?)", (id, parent_post, title, category, price, price_unit, int(time.time()))) def insert_url(self, url): self.execute("INSERT INTO urls VALUES (?, ?)", (url, int(time.time()))) # SELECT def find_post(self, id): return self.fetchone("SELECT * FROM posts WHERE id = ?", (id,)) def is_post_content_downloaded(self, id): return self.fetchone("SELECT timestamp FROM post_contents WHERE id = ?", (id,)) is not None def is_url_downloaded(self, url): return self.fetchone("SELECT timestamp FROM urls WHERE url = ?", (url,)) is not None # UPDATE def update_post_download_complete(self, id, download_complete=1): self.execute("UPDATE posts SET download_complete = ?, timestamp = ? 
WHERE id = ?", (download_complete, int(time.time()), id)) def update_post_converted_at(self, id, converted_at): self.execute("UPDATE posts SET converted_at = ?, timestamp = ? WHERE id = ?", (converted_at, int(time.time()), id)) ================================================ FILE: fantiadl/fantiadl.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- """Download media and other data from Fantia""" import argparse import getpass import netrc import sys import traceback from .models import FantiaDownloader, FantiaClub, FANTIA_URL_RE from .__version__ import __version__ __author__ = "bitbybyte" __copyright__ = "Copyright 2024 bitbybyte" __license__ = "MIT" BASE_HOST = "fantia.jp" cmdl_usage = "%(prog)s [options] url" cmdl_version = __version__ cmdl_parser = argparse.ArgumentParser(usage=cmdl_usage, conflict_handler="resolve") cmdl_parser.add_argument("-c", "--cookie", dest="session_arg", metavar="SESSION_COOKIE", help="_session_id cookie or cookies.txt") cmdl_parser.add_argument("-e", "--email", dest="email", metavar="EMAIL", help=argparse.SUPPRESS) cmdl_parser.add_argument("-p", "--password", dest="password", metavar="PASSWORD", help=argparse.SUPPRESS) cmdl_parser.add_argument("-n", "--netrc", action="store_true", dest="netrc", help=argparse.SUPPRESS) cmdl_parser.add_argument("-q", "--quiet", action="store_true", dest="quiet", help="suppress output") cmdl_parser.add_argument("-v", "--version", action="version", version=cmdl_version) cmdl_parser.add_argument("--db", dest="db_path", help="database to track post download state (creates tables when first specified)") cmdl_parser.add_argument("--db-bypass-post-check", action="store_true", dest="db_bypass_post_check", help="bypass checking a post for new content if it's marked as completed on the database") cmdl_parser.add_argument("url", action="store", nargs="*", help="fanclub or post URL") dl_group = cmdl_parser.add_argument_group("download options") dl_group.add_argument("-i", "--ignore-errors", action="store_true", dest="continue_on_error", help="continue on download errors") dl_group.add_argument("-l", "--limit", dest="limit", metavar="#", type=int, default=0, help="limit the number of posts to process per fanclub (excludes -n)") dl_group.add_argument("-o", "--output-directory", dest="output_path", help="directory to download to") dl_group.add_argument("-s", "--use-server-filenames", action="store_true", dest="use_server_filenames", help="download using server defined filenames") dl_group.add_argument("-r", "--mark-incomplete-posts", action="store_true", dest="mark_incomplete_posts", help="add .incomplete file to post directories that are incomplete") dl_group.add_argument("-m", "--dump-metadata", action="store_true", dest="dump_metadata", help="store metadata to file (including fanclub icon, header, and background)") dl_group.add_argument("-x", "--parse-for-external-links", action="store_true", dest="parse_for_external_links", help="parse posts for external links") dl_group.add_argument("-t", "--download-thumbnail", action="store_true", dest="download_thumb", help="download post thumbnails") dl_group.add_argument("-f", "--download-fanclubs", action="store_true", dest="download_fanclubs", help="download posts from all followed fanclubs") dl_group.add_argument("-p", "--download-paid-fanclubs", action="store_true", dest="download_paid_fanclubs", help="download posts from all fanclubs backed on a paid plan") dl_group.add_argument("-n", "--download-new-posts", dest="download_new_posts", metavar="#", 
type=int, help="download a specified number of new posts from your fanclub timeline") dl_group.add_argument("-d", "--download-month", dest="month_limit", metavar="%Y-%m", help="download posts only from a specific month, e.g. 2007-08 (excludes -n)") dl_group.add_argument("--exclude", dest="exclude_file", metavar="EXCLUDE_FILE", help="file containing a list of filenames to exclude from downloading") cmdl_opts = cmdl_parser.parse_args() def main(): session_arg = cmdl_opts.session_arg email = cmdl_opts.email password = cmdl_opts.password if (email or password or cmdl_opts.netrc) and not session_arg: sys.exit("Logging in from the command line is no longer supported. Please provide a session cookie using -c/--cookie. See the README for more information.") if not (cmdl_opts.download_fanclubs or cmdl_opts.download_paid_fanclubs or cmdl_opts.download_new_posts) and not cmdl_opts.url: sys.exit("Error: No valid input provided") if not session_arg: session_arg = input("Fantia session cookie (_session_id or cookies.txt path): ") # if cmdl_opts.netrc: # login = netrc.netrc().authenticators(BASE_HOST) # if login: # email = login[0] # password = login[2] # else: # sys.exit("Error: No Fantia login found in .netrc") # else: # if not email: # email = input("Email: ") # if not password: # password = getpass.getpass("Password: ") try: downloader = FantiaDownloader(session_arg=session_arg, dump_metadata=cmdl_opts.dump_metadata, parse_for_external_links=cmdl_opts.parse_for_external_links, download_thumb=cmdl_opts.download_thumb, directory=cmdl_opts.output_path, quiet=cmdl_opts.quiet, continue_on_error=cmdl_opts.continue_on_error, use_server_filenames=cmdl_opts.use_server_filenames, mark_incomplete_posts=cmdl_opts.mark_incomplete_posts, month_limit=cmdl_opts.month_limit, exclude_file=cmdl_opts.exclude_file, db_path=cmdl_opts.db_path, db_bypass_post_check=cmdl_opts.db_bypass_post_check) if cmdl_opts.download_fanclubs: try: downloader.download_followed_fanclubs(limit=cmdl_opts.limit) except KeyboardInterrupt: raise except: if cmdl_opts.continue_on_error: downloader.output("Encountered an error downloading followed fanclubs. Skipping...\n") traceback.print_exc() pass else: raise elif cmdl_opts.download_paid_fanclubs: try: downloader.download_paid_fanclubs(limit=cmdl_opts.limit) except: if cmdl_opts.continue_on_error: downloader.output("Encountered an error downloading paid fanclubs. Skipping...\n") traceback.print_exc() pass else: raise elif cmdl_opts.download_new_posts: try: downloader.download_new_posts(post_limit=cmdl_opts.download_new_posts) except: if cmdl_opts.continue_on_error: downloader.output("Encountered an error downloading new posts from timeline. Skipping...\n") traceback.print_exc() pass else: raise if cmdl_opts.url: for url in cmdl_opts.url: url_match = FANTIA_URL_RE.match(url) if url_match: try: url_groups = url_match.groups() if url_groups[0] == "fanclubs": fanclub = FantiaClub(url_groups[1]) downloader.download_fanclub(fanclub, cmdl_opts.limit) elif url_groups[0] == "posts": downloader.download_post(url_groups[1]) except KeyboardInterrupt: raise except: if cmdl_opts.continue_on_error: downloader.output("Encountered an error downloading URL. Skipping...\n") traceback.print_exc() continue else: raise else: sys.stderr.write("Error: {} is not a valid URL. Please provide a fully qualified Fantia URL (https://fantia.jp/posts/[id], https://fantia.jp/fanclubs/[id])\n".format(url)) except KeyboardInterrupt: sys.exit("Interrupted by user. 
Exiting...") def cli(): global cmdl_opts cmdl_opts = cmdl_parser.parse_args() main() if __name__ == "__main__": cli() ================================================ FILE: fantiadl/models.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter, Retry import requests from datetime import datetime as dt from email.utils import parsedate_to_datetime from urllib.parse import unquote from urllib.parse import urljoin from urllib.parse import urlparse import http.cookiejar import json import math import mimetypes import os import re import sys import time import traceback from .__version__ import __version__ from .db import FantiaDlDatabase FANTIA_URL_RE = re.compile(r"(?:https?://(?:(?:www\.)?(?:fantia\.jp/(fanclubs|posts)/)))([0-9]+)") EXTERNAL_LINKS_RE = re.compile(r"(?:[\s]+)?((?:(?:https?://)?(?:(?:www\.)?(?:mega\.nz|mediafire\.com|(?:drive|docs)\.google\.com|youtube.com|dropbox.com)\/))[^\s]+)") DOMAIN = "fantia.jp" BASE_URL = "https://fantia.jp/" LOGIN_SIGNIN_URL = "https://fantia.jp/sessions/signin" LOGIN_SESSION_URL = "https://fantia.jp/sessions" ME_API = "https://fantia.jp/api/v1/me" FANCLUB_API = "https://fantia.jp/api/v1/fanclubs/{}" FANCLUBS_FOLLOWING_API = "https://fantia.jp/api/v1/me/fanclubs" FANCLUBS_PAID_HTML = "https://fantia.jp/mypage/users/plans?type=not_free&page={}" FANCLUB_POSTS_HTML = "https://fantia.jp/fanclubs/{}/posts?page={}" POST_API = "https://fantia.jp/api/v1/posts/{}" POST_URL = "https://fantia.jp/posts/{}" POSTS_URL = "https://fantia.jp/posts" POST_RELATIVE_URL = "/posts/" TIMELINES_API = "https://fantia.jp/api/v1/me/timelines/posts?page={}&per=24" USER_AGENT = "fantiadl/{}".format(__version__) CRAWLJOB_FILENAME = "external_links.crawljob" MIMETYPES = { "image/jpeg": ".jpg", "image/png": ".png", "image/gif": ".gif", "video/mp4": ".mp4", "video/webm": ".webm" } UNICODE_CONTROL_MAP = dict.fromkeys(range(32)) class FantiaClub: def __init__(self, fanclub_id): self.id = fanclub_id class FantiaDownloader: def __init__(self, session_arg, chunk_size=1024 * 1024 * 5, dump_metadata=False, parse_for_external_links=False, download_thumb=False, directory=None, quiet=True, continue_on_error=False, use_server_filenames=False, mark_incomplete_posts=False, month_limit=None, exclude_file=None, db_path=None, db_bypass_post_check=False): # self.email = email # self.password = password self.session_arg = session_arg self.chunk_size = chunk_size self.dump_metadata = dump_metadata self.parse_for_external_links = parse_for_external_links self.download_thumb = download_thumb self.directory = directory or "" self.quiet = quiet self.continue_on_error = continue_on_error self.use_server_filenames = use_server_filenames self.mark_incomplete_posts = mark_incomplete_posts self.month_limit = dt.strptime(month_limit, "%Y-%m") if month_limit else None self.exclude_file = exclude_file self.exclusions = [] self.db = FantiaDlDatabase(db_path) self.db_bypass_post_check = db_bypass_post_check self.initialize_session() self.login() self.create_exclusions() def output(self, output): """Write output to the console.""" if not self.quiet: try: sys.stdout.write(output.encode(sys.stdout.encoding, errors="backslashreplace").decode(sys.stdout.encoding)) sys.stdout.flush() except (UnicodeEncodeError, UnicodeDecodeError): sys.stdout.buffer.write(output.encode("utf-8")) sys.stdout.flush() def initialize_session(self): """Initialize session with necessary headers and config.""" self.session = 
requests.session() self.session.headers.update({"User-Agent": USER_AGENT}) retries = Retry( total=5, connect=5, read=5, status_forcelist=[429, 500, 502, 503, 504, 507, 508], backoff_factor=2, # retry delay = {backoff factor} * (2 ** ({retry number} - 1)) raise_on_status=True ) self.session.mount("http://", HTTPAdapter(max_retries=retries)) self.session.mount("https://", HTTPAdapter(max_retries=retries)) def login(self): """Login to Fantia using the provided email and password.""" try: with open(self.session_arg, "r") as cookies_file: cookies = http.cookiejar.MozillaCookieJar(self.session_arg) cookies.load() self.session.cookies = cookies except FileNotFoundError: login_cookie = requests.cookies.create_cookie(domain=DOMAIN, name="_session_id", value=self.session_arg) self.session.cookies.set_cookie(login_cookie) check_user = self.session.get(ME_API) if not (check_user.ok or check_user.status_code == 304): sys.exit("Error: Invalid session. Please verify your session cookie") # Login flow, requires reCAPTCHA token # login_json = { # "utf8": "✓", # "button": "", # "user[email]": self.email, # "user[password]": self.password, # } # login_session = self.session.get(LOGIN_SIGNIN_URL) # login_page = BeautifulSoup(login_session.text, "html.parser") # authenticity_token = login_page.select_one("input[name=\"authenticity_token\"]")["value"] # print(login_page.select_one("input[name=\"recaptcha_response\"]")) # login_json["authenticity_token"] = authenticity_token # login_json["recaptcha_response"] = ... # create_session = self.session.post(LOGIN_SESSION_URL, data=login_json) # if not create_session.headers.get("Location"): # sys.exit("Error: Bad login form data") # elif create_session.headers["Location"] == LOGIN_SIGNIN_URL: # sys.exit("Error: Failed to login. 
Please verify your username and password") # check_user = self.session.get(ME_API) # if not (check_user.ok or check_user.status_code == 304): # sys.exit("Error: Invalid session") def create_exclusions(self): """Read files to exclude from downloading.""" if self.exclude_file: with open(self.exclude_file, "r") as file: self.exclusions = [line.rstrip("\n") for line in file] def process_content_type(self, url): """Process the Content-Type from a request header and use it to build a filename.""" url_header = self.session.head(url, allow_redirects=True) mimetype = url_header.headers["Content-Type"] extension = guess_extension(mimetype, url) return extension def collect_post_titles(self, post_metadata): """Collect all post titles to check for duplicate names and rename as necessary by appending a counter.""" post_titles = [] for post in post_metadata["post_contents"]: try: potential_title = post["title"] or post["parent_post"]["title"] if not potential_title: potential_title = str(post["id"]) except KeyError: potential_title = str(post["id"]) title = potential_title counter = 2 while title in post_titles: title = potential_title + "_{}".format(counter) counter += 1 post_titles.append(title) return post_titles def download_fanclub_metadata(self, fanclub): """Download fanclub header, icon, and custom background.""" response = self.session.get(FANCLUB_API.format(fanclub.id)) response.raise_for_status() fanclub_json = json.loads(response.text) fanclub_creator = fanclub_json["fanclub"]["creator_name"] fanclub_directory = os.path.join(self.directory, sanitize_for_path(fanclub_creator)) os.makedirs(fanclub_directory, exist_ok=True) self.save_metadata(fanclub_json, fanclub_directory) header_url = fanclub_json["fanclub"]["cover"]["original"] if header_url: header_filename = os.path.join(fanclub_directory, "header" + self.process_content_type(header_url)) self.output("Downloading fanclub header...\n") self.perform_download(header_url, header_filename, use_server_filename=self.use_server_filenames) fanclub_icon_url = fanclub_json["fanclub"]["icon"]["original"] if fanclub_icon_url: fanclub_icon_filename = os.path.join(fanclub_directory, "icon" + self.process_content_type(fanclub_icon_url)) self.output("Downloading fanclub icon...\n") self.perform_download(fanclub_icon_url, fanclub_icon_filename, use_server_filename=self.use_server_filenames) background_url = fanclub_json["fanclub"]["background"] if background_url: background_filename = os.path.join(fanclub_directory, "background" + self.process_content_type(background_url)) self.output("Downloading fanclub background...\n") self.perform_download(background_url, background_filename, use_server_filename=self.use_server_filenames) def download_fanclub(self, fanclub, limit=0): """Download a fanclub.""" self.output("Downloading fanclub {}...\n".format(fanclub.id)) post_ids = self.fetch_fanclub_posts(fanclub) if self.dump_metadata: self.download_fanclub_metadata(fanclub) for post_id in post_ids if limit == 0 else post_ids[:limit]: try: self.download_post(post_id) except KeyboardInterrupt: raise except: if self.continue_on_error: self.output("Encountered an error downloading post. 
Skipping...\n") traceback.print_exc() continue else: raise def download_followed_fanclubs(self, limit=0): """Download all followed fanclubs.""" response = self.session.get(FANCLUBS_FOLLOWING_API) response.raise_for_status() fanclub_ids = json.loads(response.text)["fanclub_ids"] for fanclub_id in fanclub_ids: try: fanclub = FantiaClub(fanclub_id) self.download_fanclub(fanclub, limit) except KeyboardInterrupt: raise except: if self.continue_on_error: self.output("Encountered an error downloading fanclub. Skipping...\n") traceback.print_exc() continue else: raise def download_paid_fanclubs(self, limit=0): """Download all fanclubs backed on a paid plan.""" all_paid_fanclubs = [] page_number = 1 self.output("Collecting paid fanclubs...\n") while True: response = self.session.get(FANCLUBS_PAID_HTML.format(page_number)) response.raise_for_status() response_page = BeautifulSoup(response.text, "html.parser") fanclub_links = response_page.select("div.mb-5-children > div:nth-of-type(1) a[href^=\"/fanclubs\"]") for fanclub_link in fanclub_links: fanclub_id = fanclub_link["href"].lstrip("/fanclubs/") all_paid_fanclubs.append(fanclub_id) if not fanclub_links: self.output("Collected {} fanclubs.\n".format(len(all_paid_fanclubs))) break else: page_number += 1 for fanclub_id in all_paid_fanclubs: try: fanclub = FantiaClub(fanclub_id) self.download_fanclub(fanclub, limit) except: if self.continue_on_error: self.output("Encountered an error downloading fanclub. Skipping...\n") traceback.print_exc() continue else: raise def download_new_posts(self, post_limit=24): all_new_post_ids = [] total_pages = math.ceil(post_limit / 24) page_number = 1 has_next = True self.output("Downloading {} new posts...\n".format(post_limit)) while has_next and not len(all_new_post_ids) >= post_limit: response = self.session.get(TIMELINES_API.format(page_number)) response.raise_for_status() json_response = json.loads(response.text) posts = json_response["posts"] has_next = json_response["has_next"] for post in posts: if len(all_new_post_ids) >= post_limit: break post_id = post["id"] all_new_post_ids.append(post_id) page_number += 1 for post_id in all_new_post_ids: try: self.download_post(post_id) except KeyboardInterrupt: raise except: if self.continue_on_error: self.output("Encountered an error downloading post. 
Skipping...\n") traceback.print_exc() continue else: raise def fetch_fanclub_posts(self, fanclub): """Iterate over a fanclub's HTML pages to fetch all post IDs.""" all_posts = [] post_found = False page_number = 1 self.output("Collecting fanclub posts...\n") while True: response = self.session.get(FANCLUB_POSTS_HTML.format(fanclub.id, page_number)) response.raise_for_status() response_page = BeautifulSoup(response.text, "html.parser") posts = response_page.select("div.post") new_post_ids = [] for post in posts: link = post.select_one("a.link-block")["href"] post_id = link.lstrip(POST_RELATIVE_URL) date_string = post.select_one(".post-date .mr-5").text if post.select_one(".post-date .mr-5") else post.select_one(".post-date").text parsed_date = dt.strptime(date_string, "%Y-%m-%d %H:%M") if not self.month_limit or (parsed_date.year == self.month_limit.year and parsed_date.month == self.month_limit.month): post_found = True new_post_ids.append(post_id) all_posts += new_post_ids if not posts or (not new_post_ids and post_found): # No new posts found and we've already collected a post self.output("Collected {} posts.\n".format(len(all_posts))) return all_posts else: page_number += 1 def perform_download(self, url, filepath, use_server_filename=False, append_server_extension=False): """Perform a download for the specified URL while showing progress.""" url_path = unquote(url.split("?", 1)[0]) server_filename = os.path.basename(url_path) filename = os.path.basename(filepath) if use_server_filename: filepath = os.path.join(os.path.dirname(filepath), server_filename) # Check if filename is in exclusion list if server_filename in self.exclusions: self.output("Server filename in exclusion list (skipping): {}\n".format(server_filename)) return elif filename in self.exclusions: self.output("Filename in exclusion list (skipping): {}\n".format(filename)) return if self.db.conn and self.db.is_url_downloaded(url_path): self.output("URL already downloaded. Skipping...\n") return request = self.session.get(url, stream=True) if request.status_code == 404: self.output("Download URL returned 404. 
Skipping...\n") return request.raise_for_status() # Handle redirects so we can properly catch an excluded filename # Attachments typically route from fantia.jp/posts/#/download/# # Images typically are served directly from cc.fantia.jp # Metadata images typically are served from c.fantia.jp if request.url != url: url_path = unquote(request.url.split("?", 1)[0]) server_filename = os.path.basename(url_path) if server_filename in self.exclusions: self.output("Server filename in exclusion list (skipping): {}\n".format(server_filename)) return if use_server_filename: filepath = os.path.join(os.path.dirname(filepath), server_filename) if not use_server_filename and append_server_extension: filepath += guess_extension(request.headers["Content-Type"], url) file_size = int(request.headers["Content-Length"]) if os.path.isfile(filepath) and os.stat(filepath).st_size == file_size: self.output("File found (skipping): {}\n".format(filepath)) self.db.insert_url(url_path) return self.output("File: {}\n".format(filepath)) incomplete_filename = filepath + ".part" downloaded = 0 with open(incomplete_filename, "wb") as file: for chunk in request.iter_content(self.chunk_size): downloaded += len(chunk) file.write(chunk) done = int(25 * downloaded / file_size) percent = int(100 * downloaded / file_size) self.output("\r|{0}{1}| {2}% ".format("\u2588" * done, " " * (25 - done), percent)) self.output("\n") if downloaded != file_size: raise Exception("Downloaded file size mismatch (expected {}, got {})".format(file_size, downloaded)) if os.path.exists(filepath): os.remove(filepath) os.rename(incomplete_filename, filepath) self.db.insert_url(url_path) modification_time_string = request.headers["Last-Modified"] modification_time = int(dt.strptime(modification_time_string, "%a, %d %b %Y %H:%M:%S %Z").timestamp()) if modification_time: access_time = int(time.time()) os.utime(filepath, times=(access_time, modification_time)) def download_photo(self, photo_url, photo_counter, gallery_directory): """Download a photo to the post's directory.""" extension = self.process_content_type(photo_url) filename = os.path.join(gallery_directory, str(photo_counter) + extension) if gallery_directory else str() self.perform_download(photo_url, filename, use_server_filename=self.use_server_filenames) def download_file(self, download_url, filename, post_directory): """Download a file to the post's directory.""" self.perform_download(download_url, filename, use_server_filename=True) # Force serve filenames to prevent duplicate collision def download_post_content(self, post_json, post_directory, post_title): """Parse the post's content to determine whether to save the content as a photo gallery or file.""" self.output(f"> Content {post_json['id']}\n") if self.db.conn and self.db.is_post_content_downloaded(post_json["id"]): self.output("Post content already downloaded. Skipping...\n") return True if post_json["visible_status"] != "visible": self.output("Post content not available on current plan. 
Skipping...\n") return False if post_json.get("category"): if post_json["category"] == "photo_gallery": photo_gallery = post_json["post_content_photos"] photo_counter = 0 gallery_directory = os.path.join(post_directory, sanitize_for_path(post_title)) os.makedirs(gallery_directory, exist_ok=True) for photo in photo_gallery: photo_url = photo["url"]["original"] self.download_photo(photo_url, photo_counter, gallery_directory) photo_counter += 1 elif post_json["category"] == "file": filename = os.path.join(post_directory, post_json["filename"]) download_url = urljoin(POSTS_URL, post_json["download_uri"]) self.download_file(download_url, filename, post_directory) elif post_json["category"] == "embed": if self.parse_for_external_links: # TODO: Check what URLs are allowed as embeds link_as_list = [post_json["embed_url"]] self.output("Adding embedded link {0} to {1}.\n".format(post_json["embed_url"], CRAWLJOB_FILENAME)) build_crawljob(link_as_list, self.directory, post_directory) elif post_json["category"] == "blog": blog_comment = post_json["comment"] blog_json = json.loads(blog_comment) photo_counter = 0 gallery_directory = os.path.join(post_directory, sanitize_for_path(post_title)) os.makedirs(gallery_directory, exist_ok=True) for op in blog_json["ops"]: if type(op["insert"]) is dict and op["insert"].get("fantiaImage"): photo_url = urljoin(BASE_URL, op["insert"]["fantiaImage"]["original_url"]) self.download_photo(photo_url, photo_counter, gallery_directory) photo_counter += 1 else: self.output("Post content category \"{}\" is not supported. Skipping...\n".format(post_json.get("category"))) return False self.db.insert_post_content(post_json["id"], post_json["parent_post"]["url"].rsplit("/", 1)[1], post_json["title"], post_json["category"], post_json["foreign_plan_price"], post_json["currency_code"]) if self.parse_for_external_links: post_description = post_json["comment"] or "" self.parse_external_links(post_description, os.path.abspath(post_directory)) return True def download_thumbnail(self, thumb_url, post_directory): """Download a thumbnail to the post's directory.""" extension = self.process_content_type(thumb_url) filename = os.path.join(post_directory, "thumb" + extension) self.perform_download(thumb_url, filename, use_server_filename=self.use_server_filenames) def download_post(self, post_id): """Download a post to its own directory.""" db_post = self.db.find_post(post_id) if self.db_bypass_post_check and self.db.conn and db_post and db_post["download_complete"]: self.output("Post {} already downloaded. 
Skipping...\n".format(post_id)) return self.output("Downloading post {}...\n".format(post_id)) post_html_response = self.session.get(POST_URL.format(post_id)) post_html_response.raise_for_status() post_html = BeautifulSoup(post_html_response.text, "html.parser") csrf_token = post_html.select_one("meta[name=\"csrf-token\"]")["content"] response = self.session.get(POST_API.format(post_id), headers={ "X-CSRF-Token": csrf_token, "X-Requested-With": "XMLHttpRequest" }) response.raise_for_status() post_json = json.loads(response.text)["post"] post_id = post_json["id"] post_creator = post_json["fanclub"]["creator_name"] post_title = post_json["title"] post_contents = post_json["post_contents"] post_posted_at = int(parsedate_to_datetime(post_json["posted_at"]).timestamp()) post_converted_at = int(dt.fromisoformat(post_json["converted_at"]).timestamp()) if post_json["converted_at"] else post_posted_at if self.db.conn and db_post and db_post["download_complete"]: # Check if the post date changed, which may indicate new contents were added if db_post["converted_at"] != post_converted_at: self.output("Post date does not match date in database. Checking for new contents...\n") self.db.update_post_download_complete(post_id, download_complete=0) self.db.update_post_converted_at(post_id, post_converted_at) else: self.output("Post appears to have been downloaded completely. Skipping...\n".format(post_id)) return if self.db.conn and not db_post: self.db.insert_post(post_id, post_title, post_json["fanclub"]["id"], post_posted_at, post_converted_at) post_directory_title = sanitize_for_path(str(post_id)) post_directory = os.path.join(self.directory, sanitize_for_path(post_creator), post_directory_title) os.makedirs(post_directory, exist_ok=True) post_titles = self.collect_post_titles(post_json) if self.dump_metadata: self.save_metadata(post_json, post_directory) if self.mark_incomplete_posts: self.mark_incomplete_post(post_json, post_directory) if self.download_thumb and post_json["thumb"]: self.download_thumbnail(post_json["thumb"]["original"], post_directory) if self.parse_for_external_links: # Main post post_description = post_json["comment"] or "" self.parse_external_links(post_description, os.path.abspath(post_directory)) download_complete_counter = 0 for post_index, post in enumerate(post_contents): post_title = post_titles[post_index] if self.download_post_content(post, post_directory, post_title): download_complete_counter += 1 if self.db.conn and download_complete_counter == len(post_contents): self.output("All post content appears to have been downloaded. Marking as complete in database...\n") self.db.update_post_download_complete(post_id) if not os.listdir(post_directory): self.output("No content downloaded for post {}. Deleting directory.\n".format(post_id)) os.rmdir(post_directory) def parse_external_links(self, post_description, post_directory): """Parse the post description for external links, e.g. Mega and Google Drive links.""" link_matches = EXTERNAL_LINKS_RE.findall(post_description) if link_matches: self.output("Found {} external link(s) in post. 
Saving...\n".format(len(link_matches))) build_crawljob(link_matches, self.directory, post_directory) def save_metadata(self, metadata, directory): """Save the metadata for a post to the post's directory.""" filename = os.path.join(directory, "metadata.json") with open(filename, "w", encoding='utf-8') as file: json.dump(metadata, file, sort_keys=True, ensure_ascii=False, indent=4) def mark_incomplete_post(self, post_metadata, post_directory): """Mark incomplete posts with a .incomplete file.""" is_incomplete = False incomplete_filename = os.path.join(post_directory, ".incomplete") for post in post_metadata["post_contents"]: if post["visible_status"] != "visible": is_incomplete = True break if is_incomplete: if not os.path.exists(incomplete_filename): open(incomplete_filename, 'a').close() else: if os.path.exists(incomplete_filename): os.remove(incomplete_filename) def guess_extension(mimetype, download_url): """ Guess the file extension from the mimetype or force a specific extension for certain mimetypes. If the mimetype returns no found extension, guess based on the download URL. """ extension = MIMETYPES.get(mimetype) or mimetypes.guess_extension(mimetype, strict=True) if not extension: try: path = urlparse(download_url).path extension = os.path.splitext(path)[1] except IndexError: extension = ".unknown" return extension def sanitize_for_path(value, replace=' '): """Remove potentially illegal characters from a path.""" sanitized = re.sub(r'[<>\"\?\\\/\*:|]', replace, value) sanitized = sanitized.translate(UNICODE_CONTROL_MAP) return re.sub(r'[\s.]+$', '', sanitized) def build_crawljob(links, root_directory, post_directory): """Append to a root .crawljob file with external links gathered from a post.""" filename = os.path.join(root_directory, CRAWLJOB_FILENAME) with open(filename, "a", encoding="utf-8") as file: for link in links: crawl_dict = { "packageName": "Fantia", "text": link, "downloadFolder": post_directory, "enabled": "true", "autoStart": "true", "forcedStart": "true", "autoConfirm": "true", "addOfflineLink": "true", "extractAfterDownload": "false" } for key, value in crawl_dict.items(): file.write(key + "=" + value + "\n") file.write("\n") ================================================ FILE: fantiadl.py ================================================ from fantiadl.fantiadl import cli cli() ================================================ FILE: requirements.txt ================================================ beautifulsoup4 requests ================================================ FILE: setup.py ================================================ import setuptools from distutils.util import convert_path with open("README.md", "r") as description_file: long_description = description_file.read() with open("requirements.txt", "r") as requirements_file: requirements = requirements_file.read().split("\n") main_ns = {} version_path = convert_path("fantiadl/__version__.py") with open(version_path, encoding="utf8") as version_file: exec(version_file.read(), main_ns) setuptools.setup( name="fantiadl", version=main_ns["__version__"], description="Download posts and media from Fantia", long_description=long_description, long_description_content_type="text/markdown", author="bitto", url="https://github.com/bitbybyte/fantiadl", packages=["fantiadl"], classifiers=[ "Programming Language :: Python", "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Topic :: Internet :: WWW/HTTP", ], license="MIT", install_requires=requirements, entry_points={ "console_scripts": 
[ "fantiadl=fantiadl.fantiadl:cli" ] }, python_requires=">=3.7", )