Repository: ArmindoFlores/ao3_api Branch: master Commit: 02e349985d92 Files: 36 Total size: 166.2 KB Directory structure: gitextract_gr15a5tc/ ├── AO3/ │ ├── __init__.py │ ├── chapters.py │ ├── comments.py │ ├── common.py │ ├── extra.py │ ├── requester.py │ ├── search.py │ ├── series.py │ ├── session.py │ ├── threadable.py │ ├── users.py │ ├── utils.py │ └── works.py ├── LICENSE ├── README.md ├── dist/ │ ├── ao3_api-2.0.0-py3-none-any.whl │ ├── ao3_api-2.0.1-py3-none-any.whl │ ├── ao3_api-2.0.2-py3-none-any.whl │ ├── ao3_api-2.0.3-py3-none-any.whl │ ├── ao3_api-2.0.4-py3-none-any.whl │ ├── ao3_api-2.0.5-py3-none-any.whl │ ├── ao3_api-2.0.6-py3-none-any.whl │ ├── ao3_api-2.0.7-py3-none-any.whl │ ├── ao3_api-2.0.8-py3-none-any.whl │ ├── ao3_api-2.1.0-py3-none-any.whl │ ├── ao3_api-2.1.1-py3-none-any.whl │ ├── ao3_api-2.1.2-py3-none-any.whl │ ├── ao3_api-2.2.0-py3-none-any.whl │ ├── ao3_api-2.2.1-py3-none-any.whl │ ├── ao3_api-2.3.0-py3-none-any.whl │ └── ao3_api-2.3.1-py3-none-any.whl ├── docs/ │ ├── index.md │ ├── install.md │ └── use.md ├── mkdocs.yml └── pyproject.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: AO3/__init__.py ================================================ from . import extra, utils from .chapters import Chapter from .comments import Comment from .search import Search from .series import Series from .session import GuestSession, Session from .users import User from .works import Work VERSION = "2.3.0" ================================================ FILE: AO3/chapters.py ================================================ from functools import cached_property import bs4 from bs4 import BeautifulSoup from . import threadable, utils from .comments import Comment from .requester import requester from .users import User class Chapter: """ AO3 chapter object """ def __init__(self, chapterid, work, session=None, load=True): self._session = session self._work = work self.id = chapterid self._soup = None if load: self.reload() def __repr__(self): if self.id is None: return f"Chapter [ONESHOT] from [{self.work}]" try: return f"" except: return f"" def __eq__(self, other): return isinstance(other, __class__) and other.id == self.id def __getstate__(self): d = {} for attr in self.__dict__: if isinstance(self.__dict__[attr], BeautifulSoup): d[attr] = (self.__dict__[attr].encode(), True) else: d[attr] = (self.__dict__[attr], False) return d def __setstate__(self, d): for attr in d: value, issoup = d[attr] if issoup: self.__dict__[attr] = BeautifulSoup(value, "lxml") else: self.__dict__[attr] = value def set_session(self, session): """Sets the session used to make requests for this chapter Args: session (AO3.Session/AO3.GuestSession): session object """ self._session = session @threadable.threadable def reload(self): """ Loads information about this chapter. This function is threadable. 
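        A minimal usage sketch (the work ID below is hypothetical):

            >>> import AO3
            >>> work = AO3.Work(12345678)               # hypothetical ID
            >>> chapter = work.chapters[0]
            >>> chapter.reload()                        # blocking reload
            >>> thread = chapter.reload(threaded=True)  # threaded variant
            >>> thread.join()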
""" from .works import Work for attr in self.__class__.__dict__: if isinstance(getattr(self.__class__, attr), cached_property): if attr in self.__dict__: delattr(self, attr) if self.work is None: soup = self.request(f"https://archiveofourown.org/chapters/{self.id}?view_adult=true") workid = soup.find("li", {"class": "chapter entire"}) if workid is None: raise utils.InvalidIdError("Cannot find work") self._work = Work(utils.workid_from_url(workid.a["href"])) else: self.work.reload() for chapter in self.work.chapters: if chapter == self: self._soup = chapter._soup @threadable.threadable def comment(self, comment_text, email="", name="", pseud=None): """Leaves a comment on this chapter. This function is threadable. Args: comment_text (str): Comment text Raises: utils.UnloadedError: Couldn't load chapters utils.AuthError: Invalid session Returns: requests.models.Response: Response object """ if self.id is None: return self._work.comment(comment_text, email, name, pseud) if not self.loaded: raise utils.UnloadedError("Chapter isn't loaded. Have you tried calling Chapter.reload()?") if self._session is None: raise utils.AuthError("Invalid session") if self.id is not None: return utils.comment(self, comment_text, self._session, False, email=email, name=name, pseud=pseud) def get_comments(self, maximum=None): """Returns a list of all threads of comments in the chapter. This operation can take a very long time. Because of that, it is recomended that you set a maximum number of comments. Duration: ~ (0.13 * n_comments) seconds or 2.9 seconds per comment page Args: maximum (int, optional): Maximum number of comments to be returned. None -> No maximum Raises: ValueError: Invalid chapter number IndexError: Invalid chapter number utils.UnloadedError: Chapter isn't loaded Returns: list: List of comments """ if self.id is None: return self._work.get_comments(maximum=maximum) if not self.loaded: raise utils.UnloadedError("Chapter isn't loaded. 
Have you tried calling Chapter.reload()?") url = f"https://archiveofourown.org/chapters/{self.id}?page=%d&show_comments=true&view_adult=true" soup = self.request(url%1) pages = 0 div = soup.find("div", {"id": "comments_placeholder"}) ol = div.find("ol", {"class": "pagination actions"}) if ol is None: pages = 1 else: for li in ol.findAll("li"): if li.getText().isdigit(): pages = int(li.getText()) comments = [] for page in range(pages): if page != 0: soup = self.request(url%(page+1)) ol = soup.find("ol", {"class": "thread"}) for li in ol.findAll("li", {"role": "article"}, recursive=False): if maximum is not None and len(comments) >= maximum: return comments id_ = int(li.attrs["id"][8:]) header = li.find("h4", {"class": ("heading", "byline")}) if header is None: author = None else: author = User(str(header.a.text), self._session, False) if li.blockquote is not None: text = li.blockquote.getText() else: text = "" comment = Comment(id_, self, session=self._session, load=False) setattr(comment, "authenticity_token", self.authenticity_token) setattr(comment, "author", author) setattr(comment, "text", text) comment._thread = None comments.append(comment) return comments def get_images(self): """Gets all images from this work Raises: utils.UnloadedError: Raises this error if the chapter isn't loaded Returns: tuple: Pairs of image urls and the paragraph number """ div = self._soup.find("div", {"class": "userstuff"}) images = [] line = 0 for p in div.findAll("p"): line += 1 for img in p.findAll("img"): if "src" in img.attrs: images.append((img.attrs["src"], line)) return tuple(images) @property def loaded(self): """Returns True if this chapter has been loaded""" return self._soup is not None @property def authenticity_token(self): """Token used to take actions that involve this work""" return self.work.authenticity_token @property def work(self): """Work this chapter is a part of""" return self._work @cached_property def text(self): """This chapter's text""" text = "" if self.id is not None: div = self._soup.find("div", {"role": "article"}) else: div = self._soup for p in div.findAll(("p", "center")): text += p.getText().replace("\n", "") + "\n" if isinstance(p.next_sibling, bs4.element.NavigableString): text += str(p.next_sibling) return text @cached_property def title(self): """This chapter's title""" if self.id is None: return self.work.title preface_group = self._soup.find("div", {"class": ("chapter", "preface", "group")}) if preface_group is None: return str(self.number) title = preface_group.find("h3", {"class": "title"}) if title is None: return str(self.number) return tuple(title.strings)[-1].strip()[2:] @cached_property def number(self): """This chapter's number""" if self.id is None: return 1 return int(self._soup["id"].split("-")[-1]) @cached_property def words(self): """Number of words from this chapter""" return utils.word_count(self.text) @cached_property def summary(self): """Text from this chapter's summary""" notes = self._soup.find("div", {"id": "summary"}) if notes is None: return "" text = "" for p in notes.findAll("p"): text += p.getText() + "\n" return text @cached_property def start_notes(self): """Text from this chapter's start notes""" notes = self._soup.find("div", {"id": "notes"}) if notes is None: return "" text = "" for p in notes.findAll("p"): text += p.getText().strip() + "\n" return text @cached_property def end_notes(self): """Text from this chapter's end notes""" notes = self._soup.find("div", {"id": f"chapter_{self.number}_endnotes"}) if notes is None: return "" 
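        # Join the text of every paragraph in the end notes, one paragraph per line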
text = "" for p in notes.findAll("p"): text += p.getText() + "\n" return text @cached_property def url(self): """Returns the URL to this chapter Returns: str: chapter URL """ return f"https://archiveofourown.org/works/{self._work.id}/chapters/{self.id}" def request(self, url): """Request a web page and return a BeautifulSoup object. Args: url (str): Url to request Returns: bs4.BeautifulSoup: BeautifulSoup object representing the requested page's html """ req = self.get(url) soup = BeautifulSoup(req.content, "lxml") return soup def get(self, *args, **kwargs): """Request a web page and return a Response object""" if self._session is None: req = requester.request("get", *args, **kwargs) else: req = requester.request("get", *args, **kwargs, session=self._session.session) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") return req ================================================ FILE: AO3/comments.py ================================================ from functools import cached_property from bs4 import BeautifulSoup from . import threadable, utils from .requester import requester from .users import User class Comment: """ AO3 comment object """ def __init__(self, comment_id, parent=None, parent_comment=None, session=None, load=True): """Creates a new AO3 comment object Args: comment_id (int/str): Comment ID parent (Work/Chapter, optional): Parent object (where the comment is posted). Defaults to None. parent_comment (Comment, optional): Parent comment. Defaults to None. session (Session/GuestSession, optional): Session object load (boolean, optional): If true, the comment is loaded on initialization. Defaults to True. """ self.id = comment_id self.parent = parent self.parent_comment = parent_comment self.authenticity_token = None self._thread = None self._session = session self.__soup = None if load: self.reload() def __repr__(self): return f"" @property def _soup(self): if self.__soup is None: if self.parent_comment is None: return None return self.parent_comment._soup return self.__soup @property def first_parent_comment(self): if self.parent_comment is None: return self else: return self.parent_comment.first_parent_comment @property def fullwork(self): from .works import Work if self.parent is None: return None return isinstance(self.parent, Work) @cached_property def author(self): """Comment author""" li = self._soup.find("li", {"id": f"comment_{self.id}"}) header = li.find("h4", {"class": ("heading", "byline")}) if header is None: author = None else: author = User(str(header.a.text), self._session, False) return author @cached_property def text(self): """Comment text""" li = self._soup.find("li", {"id": f"comment_{self.id}"}) if li.blockquote is not None: text = li.blockquote.getText() else: text = "" return text def get_thread(self): """Returns all the replies to this comment, and all subsequent replies recursively. Also loads any parent comments this comment might have. 
Raises: utils.InvalidIdError: The specified comment_id was invalid Returns: list: Thread """ if self._thread is not None: return self._thread else: if self._soup is None: self.reload() nav = self._soup.find("ul", {"id": f"navigation_for_comment_{self.id}"}) for li in nav.findAll("li"): if li.getText() == "\nParent Thread\n": id_ = int(li.a["href"].split("/")[-1]) parent = Comment(id_, session=self._session) for comment in parent.get_thread_iterator(): if comment.id == self.id: index = comment.parent_comment._thread.index(comment) comment.parent_comment._thread.pop(index) comment.parent_comment._thread.insert(index, self) self._thread = comment._thread self.parent_comment = comment.parent_comment del comment return self._thread thread = self._soup.find("ol", {"class": "thread"}) if thread is None: self._thread = [] return self._thread self._get_thread(None, thread) if self._thread is None: self._thread = [] return self._thread def _get_thread(self, parent, soup): comments = soup.findAll("li", recursive=False) l = [self] if parent is None else [] for comment in comments: if "role" in comment.attrs: id_ = int(comment.attrs["id"][8:]) c = Comment(id_, self.parent, session=self._session, load=False) c.authenticity_token = self.authenticity_token c._thread = [] if parent is not None: c.parent_comment = parent if comment.blockquote is not None: text = comment.blockquote.getText() else: text = "" if comment.a is not None: author = User(comment.a.getText(), load=False) else: author = None setattr(c, "text", text) setattr(c, "author", author) l.append(c) else: c.parent_comment = self if comment.blockquote is not None: text = comment.blockquote.getText() else: text = "" if comment.a is not None: author = User(comment.a.getText(), load=False) else: author = None setattr(l[0], "text", text) setattr(l[0], "author", author) else: self._get_thread(l[-1], comment.ol) if parent is not None: parent._thread = l def get_thread_iterator(self): """Returns a generator that allows you to iterate through the entire thread Returns: generator: The generator object """ return threadIterator(self) @threadable.threadable def reply(self, comment_text, email="", name=""): """Replies to a comment. This function is threadable. Args: comment_text (str): Comment text email (str, optional): Email. Defaults to "". name (str, optional): Name. Defaults to "". Raises: utils.InvalidIdError: Invalid ID utils.UnexpectedResponseError: Unknown error utils.PseudoError: Couldn't find a valid pseudonym to post under utils.DuplicateCommentError: The comment you're trying to post was already posted ValueError: Invalid name/email ValueError: self.parent cannot be None Returns: requests.models.Response: Response object """ if self.parent is None: raise ValueError("self.parent cannot be 'None'") return utils.comment(self.parent, comment_text, self._session, self.fullwork, self.id, email, name) @threadable.threadable def reload(self): """Loads all comment properties This function is threadable. 
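        Example (the comment ID is hypothetical):

            >>> comment = AO3.Comment(31415926, load=False)
            >>> comment.reload()    # fetches the comment page and its CSRF token
            >>> comment.text        # now populated from the downloaded soup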
""" from .works import Work for attr in self.__class__.__dict__: if isinstance(getattr(self.__class__, attr), cached_property): if attr in self.__dict__: delattr(self, attr) req = self.get(f"https://archiveofourown.org/comments/{self.id}") self.__soup = BeautifulSoup(req.content, features="lxml") token = self.__soup.find("meta", {"name": "csrf-token"}) self.authenticity_token = token["content"] self._thread = None li = self._soup.find("li", {"id": f"comment_{self.id}"}) reply_link = li.find("li", {"id": f"add_comment_reply_link_{self.id}"}) if self.parent is None: if reply_link is not None: fields = [field.split("=") for field in reply_link.a["href"].split("?")[-1].split("&")] for key, value in fields: if key == "chapter_id": self.parent = int(value) break self.parent_comment = None @threadable.threadable def delete(self): """Deletes this comment. This function is threadable. Raises: PermissionError: You don't have permission to delete the comment utils.AuthError: Invalid auth token utils.UnexpectedResponseError: Unknown error """ utils.delete_comment(self, self._session) def get(self, *args, **kwargs): """Request a web page and return a Response object""" if self._session is None: req = requester.request("get", *args, **kwargs) else: req = requester.request("get", *args, **kwargs, session=self._session.session) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") return req def threadIterator(comment): if comment.get_thread() is None or len(comment.get_thread()) == 0: yield comment else: for c in comment.get_thread(): yield c for sub in threadIterator(c): if c != sub: yield sub ================================================ FILE: AO3/common.py ================================================ import datetime from . 
import utils def __setifnotnone(obj, attr, value): if value is not None: setattr(obj, attr, value) def get_work_from_banner(work): #* These imports need to be here to prevent circular imports #* (series.py would requite common.py and vice-versa) from .series import Series from .users import User from .works import Work authors = [] try: for a in work.h4.find_all("a"): if 'rel' in a.attrs.keys(): if "author" in a['rel']: authors.append(User(a.string, load=False)) elif a.attrs["href"].startswith("/works"): workname = a.string workid = utils.workid_from_url(a['href']) except AttributeError: pass new = Work(workid, load=False) fandoms = [] try: for a in work.find("h5", {"class": "fandoms"}).find_all("a"): fandoms.append(a.string) except AttributeError: pass warnings = [] relationships = [] characters = [] freeforms = [] try: for a in work.find(attrs={"class": "tags"}).find_all("li"): if "warnings" in a['class']: warnings.append(a.text) elif "relationships" in a['class']: relationships.append(a.text) elif "characters" in a['class']: characters.append(a.text) elif "freeforms" in a['class']: freeforms.append(a.text) except AttributeError: pass reqtags = work.find(attrs={"class": "required-tags"}) if reqtags is not None: rating = reqtags.find(attrs={"class": "rating"}) if rating is not None: rating = rating.text categories = reqtags.find(attrs={"class": "category"}) if categories is not None: categories = categories.text.split(", ") else: rating = categories = None summary = work.find(attrs={"class": "userstuff summary"}) if summary is not None: summary = summary.text series = [] series_list = work.find(attrs={"class": "series"}) if series_list is not None: for a in series_list.find_all("a"): seriesid = int(a.attrs['href'].split("/")[-1]) seriesname = a.text s = Series(seriesid, load=False) setattr(s, "name", seriesname) series.append(s) stats = work.find(attrs={"class": "stats"}) if stats is not None: language = stats.find("dd", {"class": "language"}) if language is not None: language = language.text words = stats.find("dd", {"class": "words"}) if words is not None: words = words.text.replace(",", "") if words.isdigit(): words = int(words) else: words = None bookmarks = stats.find("dd", {"class": "bookmarks"}) if bookmarks is not None: bookmarks = bookmarks.text.replace(",", "") if bookmarks.isdigit(): bookmarks = int(bookmarks) else: bookmarks = None chapters = stats.find("dd", {"class": "chapters"}) if chapters is not None: chapters = chapters.text.split('/')[0].replace(",", "") if chapters.isdigit(): chapters = int(chapters) else: chapters = None expected_chapters = stats.find("dd", {"class": "chapters"}) if expected_chapters is not None: expected_chapters = expected_chapters.text.split('/')[-1].replace(",", "") if expected_chapters.isdigit(): expected_chapters = int(expected_chapters) else: expected_chapters = None hits = stats.find("dd", {"class": "hits"}) if hits is not None: hits = hits.text.replace(",", "") if hits.isdigit(): hits = int(hits) else: hits = None kudos = stats.find("dd", {"class": "kudos"}) if kudos is not None: kudos = kudos.text.replace(",", "") if kudos.isdigit(): kudos = int(kudos) else: kudos = None comments = stats.find("dd", {"class": "comments"}) if comments is not None: comments = comments.text.replace(",", "") if comments.isdigit(): comments = int(comments) else: comments = None restricted = work.find("img", {"title": "Restricted"}) is not None if chapters is None: complete = None else: complete = chapters == expected_chapters else: language = words = bookmarks = 
chapters = expected_chapters = hits = restricted = complete = None date = work.find("p", {"class": "datetime"}) if date is None: date_updated = None else: date_updated = datetime.datetime.strptime(date.getText(), "%d %b %Y") __setifnotnone(new, "authors", authors) __setifnotnone(new, "bookmarks", bookmarks) __setifnotnone(new, "categories", categories) __setifnotnone(new, "nchapters", chapters) __setifnotnone(new, "characters", characters) __setifnotnone(new, "complete", complete) __setifnotnone(new, "date_updated", date_updated) __setifnotnone(new, "expected_chapters", expected_chapters) __setifnotnone(new, "fandoms", fandoms) __setifnotnone(new, "hits", hits) __setifnotnone(new, "comments", comments) __setifnotnone(new, "kudos", kudos) __setifnotnone(new, "language", language) __setifnotnone(new, "rating", rating) __setifnotnone(new, "relationships", relationships) __setifnotnone(new, "restricted", restricted) __setifnotnone(new, "series", series) __setifnotnone(new, "summary", summary) __setifnotnone(new, "tags", freeforms) __setifnotnone(new, "title", workname) __setifnotnone(new, "warnings", warnings) __setifnotnone(new, "words", words) return new def url_join(base, *args): result = base for arg in args: if len(result) > 0 and not result[-1] == "/": result += "/" if len(arg) > 0 and arg[0] != "/": result += arg else: result += arg[1:] return result ================================================ FILE: AO3/extra.py ================================================ import functools import os import pathlib import pickle from bs4 import BeautifulSoup from . import threadable, utils from .requester import requester def _download_languages(): path = os.path.dirname(__file__) languages = [] try: rsrc_path = os.path.join(path, "resources") if not os.path.isdir(rsrc_path): os.mkdir(rsrc_path) language_path = os.path.join(rsrc_path, "languages") if not os.path.isdir(language_path): os.mkdir(language_path) url = "https://archiveofourown.org/languages" print(f"Downloading from {url}") req = requester.request("get", url) soup = BeautifulSoup(req.content, "lxml") for dt in soup.find("dl", {"class": "language index group"}).findAll("dt"): if dt.a is not None: alias = dt.a.attrs["href"].split("/")[-1] else: alias = None languages.append((dt.getText(), alias)) with open(f"{os.path.join(language_path, 'languages')}.pkl", "wb") as file: pickle.dump(languages, file) except AttributeError: raise utils.UnexpectedResponseError("Couldn't download the desired resource. Do you have the latest version of ao3-api?") print(f"Download complete ({len(languages)} languages)") def _download_fandom(fandom_key, name): path = os.path.dirname(__file__) fandoms = [] try: rsrc_path = os.path.join(path, "resources") if not os.path.isdir(rsrc_path): os.mkdir(rsrc_path) fandom_path = os.path.join(rsrc_path, "fandoms") if not os.path.isdir(fandom_path): os.mkdir(fandom_path) url = f"https://archiveofourown.org/media/{fandom_key}/fandoms" print(f"Downloading from {url}") req = requester.request("get", url) soup = BeautifulSoup(req.content, "lxml") for fandom in soup.find("ol", {"class": "alphabet fandom index group"}).findAll("a", {"class": "tag"}): fandoms.append(fandom.getText()) with open(f"{os.path.join(fandom_path, name)}.pkl", "wb") as file: pickle.dump(fandoms, file) except AttributeError: raise utils.UnexpectedResponseError("Couldn't download the desired resource. 
Do you have the latest version of ao3-api?") print(f"Download complete ({len(fandoms)} fandoms)") _FANDOM_RESOURCES = { "anime_manga_fandoms": functools.partial( _download_fandom, "Anime%20*a*%20Manga", "anime_manga_fandoms"), "books_literature_fandoms": functools.partial( _download_fandom, "Books%20*a*%20Literature", "books_literature_fandoms"), "cartoons_comics_graphicnovels_fandoms": functools.partial( _download_fandom, "Cartoons%20*a*%20Comics%20*a*%20Graphic%20Novels", "cartoons_comics_graphicnovels_fandoms"), "celebrities_real_people_fandoms": functools.partial( _download_fandom, "Celebrities%20*a*%20Real%20People", "celebrities_real_people_fandoms"), "movies_fandoms": functools.partial( _download_fandom, "Movies", "movies_fandoms"), "music_bands_fandoms": functools.partial( _download_fandom, "Music%20*a*%20Bands", "music_bands_fandoms"), "other_media_fandoms": functools.partial( _download_fandom, "Other%20Media", "other_media_fandoms"), "theater_fandoms": functools.partial( _download_fandom, "Theater", "theater_fandoms"), "tvshows_fandoms": functools.partial( _download_fandom, "TV%20Shows", "tvshows_fandoms"), "videogames_fandoms": functools.partial( _download_fandom, "Video%20Games", "videogames_fandoms"), "uncategorized_fandoms": functools.partial( _download_fandom, "Uncategorized%20Fandoms", "uncategorized_fandoms") } _LANGUAGE_RESOURCES = { "languages": _download_languages } _RESOURCE_DICTS = [("fandoms", _FANDOM_RESOURCES), ("languages", _LANGUAGE_RESOURCES)] @threadable.threadable def download(resource): """Downloads the specified resource. This function is threadable. Args: resource (str): Resource name Raises: KeyError: Invalid resource """ for _, resource_dict in _RESOURCE_DICTS: if resource in resource_dict: resource_dict[resource]() return raise KeyError(f"'{resource}' is not a valid resource") def get_resources(): """Returns a list of every resource available for download""" d = {} for name, resource_dict in _RESOURCE_DICTS: d[name] = list(resource_dict.keys()) return d def has_resource(resource): """Returns True if resource was already download, False otherwise""" path = os.path.join(os.path.dirname(__file__), "resources") return len(list(pathlib.Path(path).rglob(resource+".pkl"))) > 0 @threadable.threadable def download_all(redownload=False): """Downloads every available resource. This function is threadable.""" types = get_resources() for rsrc_type in types: for rsrc in types[rsrc_type]: if redownload or not has_resource(rsrc): download(rsrc) @threadable.threadable def download_all_threaded(redownload=False): """Downloads every available resource in parallel (about ~3.7x faster). This function is threadable.""" threads = [] types = get_resources() for rsrc_type in types: for rsrc in types[rsrc_type]: if redownload or not has_resource(rsrc): threads.append(download(rsrc, threaded=True)) for thread in threads: thread.join() ================================================ FILE: AO3/requester.py ================================================ import threading import time import requests class Requester: """Requester object""" def __init__(self, rqtw=-1, timew=60): """Limits the request rate to prevent HTTP 429 (rate limiting) responses. 12 request per minute seems to be the limit. Args: rqm (int, optional): Maximum requests per time window (-1 -> no limit). Defaults to -1. timew (int, optional): Time window (seconds). Defaults to 60. 
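        For example, to keep the shared module-level instance under the
        apparent 12-requests-per-minute limit (a sketch; the figure is an
        observation, not a documented quota):

            >>> from AO3.requester import requester
            >>> requester.setRQTW(12)   # at most 12 requests...
            >>> requester.setTimeW(60)  # ...per 60-second window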
""" self._requests = [] self._rqtw = rqtw self._timew = timew self._lock = threading.Lock() self.total = 0 def setRQTW(self, value): self._rqtw = value def setTimeW(self, value): self._timew = value def request(self, *args, **kwargs): """Requests a web page once enough time has passed since the last request Args: session(requests.Session, optional): Session object to request with Returns: requests.Response: Response object """ # We've made a bunch of requests, time to rate limit? if self._rqtw != -1: with self._lock: if len(self._requests) >= self._rqtw: t = time.time() # Reduce list to only requests made within the current time window while len(self._requests): if t-self._requests[0] >= self._timew: self._requests.pop(0) # Older than window, forget about it else: break # Inside window, the rest of them must be too # Have we used up all available requests within our window? if len(self._requests) >= self._rqtw: # Yes # Wait until the oldest request exits the window, giving us a slot for the new one time.sleep(self._requests[0] + self._timew - t) # Now outside window, drop it self._requests.pop(0) if self._rqtw != -1: self._requests.append(time.time()) self.total += 1 if "session" in kwargs: sess = kwargs["session"] del kwargs["session"] req = sess.request(*args, **kwargs) else: req = requests.request(*args, **kwargs) return req requester = Requester() ================================================ FILE: AO3/search.py ================================================ from math import ceil from bs4 import BeautifulSoup from . import threadable, utils from .common import get_work_from_banner from .requester import requester from .series import Series from .users import User from .works import Work DEFAULT = "_score" BEST_MATCH = "_score" AUTHOR = "authors_to_sort_on" TITLE = "title_to_sort_on" DATE_POSTED = "created_at" DATE_UPDATED = "revised_at" WORD_COUNT = "word_count" RATING = "rating_ids" HITS = "hits" BOOKMARKS = "bookmarks_count" COMMENTS = "comments_count" KUDOS = "kudos_count" DESCENDING = "desc" ASCENDING = "asc" class Search: def __init__( self, any_field="", title="", author="", single_chapter=False, word_count=None, language="", fandoms="", rating=None, hits=None, kudos=None, crossovers=None, bookmarks=None, excluded_tags="", comments=None, completion_status=None, page=1, sort_column="", sort_direction="", revised_at="", characters="", relationships="", tags="", session=None): self.any_field = any_field self.title = title self.author = author self.single_chapter = single_chapter self.word_count = word_count self.language = language self.fandoms = fandoms self.characters = characters self.relationships = relationships self.tags = tags self.rating = rating self.hits = hits self.kudos = kudos self.crossovers = crossovers self.bookmarks = bookmarks self.excluded_tags = excluded_tags self.comments = comments self.completion_status = completion_status self.page = page self.sort_column = sort_column self.sort_direction = sort_direction self.revised_at = revised_at self.session = session self.results = None self.pages = 0 self.total_results = 0 @threadable.threadable def update(self): """Sends a request to the AO3 website with the defined search parameters, and updates all info. This function is threadable. 
""" soup = search( self.any_field, self.title, self.author, self.single_chapter, self.word_count, self.language, self.fandoms, self.rating, self.hits, self.kudos, self.crossovers, self.bookmarks, self.excluded_tags, self.comments, self.completion_status, self.page, self.sort_column, self.sort_direction, self.revised_at, self.session, self.characters, self.relationships, self.tags) results = soup.find("ol", {"class": ("work", "index", "group")}) if results is None and soup.find("p", text="No results found. You may want to edit your search to make it less specific.") is not None: self.results = [] self.total_results = 0 self.pages = 0 return works = [] for work in results.find_all("li", {"role": "article"}): if work.h4 is None: continue new = get_work_from_banner(work) new._session = self.session works.append(new) self.results = works maindiv = soup.find("div", {"class": "works-search region", "id": "main"}) self.total_results = int(maindiv.find("h3", {"class": "heading"}).getText().replace(',','').replace('.','').strip().split(" ")[0]) self.pages = ceil(self.total_results / 20) def search( any_field="", title="", author="", single_chapter=False, word_count=None, language="", fandoms="", rating=None, hits=None, kudos=None, crossovers=None, bookmarks=None, excluded_tags="", comments=None, completion_status=None, page=1, sort_column="", sort_direction="", revised_at="", session=None, characters="", relationships="", tags=""): """Returns the results page for the search as a Soup object Args: any_field (str, optional): Generic search. Defaults to "". title (str, optional): Title of the work. Defaults to "". author (str, optional): Authors of the work. Defaults to "". single_chapter (bool, optional): Only include one-shots. Defaults to False. word_count (AO3.utils.Constraint, optional): Word count. Defaults to None. language (str, optional): Work language. Defaults to "". fandoms (str, optional): Fandoms included in the work. Defaults to "". characters (str, optional): Characters included in the work. Defaults to "". relationships (str, optional): Relationships included in the work. Defaults to "". tags (str, optional): Additional tags applied to the work. Defaults to "". rating (int, optional): Rating for the work. 9 for Not Rated, 10 for General Audiences, 11 for Teen And Up Audiences, 12 for Mature, 13 for Explicit. Defaults to None. hits (AO3.utils.Constraint, optional): Number of hits. Defaults to None. kudos (AO3.utils.Constraint, optional): Number of kudos. Defaults to None. crossovers (bool, optional): If specified, if false, exclude crossovers, if true, include only crossovers bookmarks (AO3.utils.Constraint, optional): Number of bookmarks. Defaults to None. excluded_tags (str, optional): Tags to exclude. Defaults to "". comments (AO3.utils.Constraint, optional): Number of comments. Defaults to None. page (int, optional): Page number. Defaults to 1. sort_column (str, optional): Which column to sort on. Defaults to "". sort_direction (str, optional): Which direction to sort. Defaults to "". revised_at (str, optional): Show works older / more recent than this date. Defaults to "". session (AO3.Session, optional): Session object. Defaults to None. 
Returns: bs4.BeautifulSoup: Search result's soup """ query = utils.Query() query.add_field(f"work_search[query]={any_field if any_field != '' else ' '}") if page != 1: query.add_field(f"page={page}") if title != "": query.add_field(f"work_search[title]={title}") if author != "": query.add_field(f"work_search[creators]={author}") if single_chapter: query.add_field(f"work_search[single_chapter]=1") if word_count is not None: query.add_field(f"work_search[word_count]={word_count}") if language != "": query.add_field(f"work_search[language_id]={language}") if fandoms != "": query.add_field(f"work_search[fandom_names]={fandoms}") if characters != "": query.add_field(f"work_search[character_names]={characters}") if relationships != "": query.add_field(f"work_search[relationship_names]={relationships}") if tags != "": query.add_field(f"work_search[freeform_names]={tags}") if rating is not None: query.add_field(f"work_search[rating_ids]={rating}") if hits is not None: query.add_field(f"work_search[hits]={hits}") if kudos is not None: query.add_field(f"work_search[kudos_count]={kudos}") if crossovers is not None: query.add_field(f"work_search[crossover]={'T' if crossovers else 'F'}") if bookmarks is not None: query.add_field(f"work_search[bookmarks_count]={bookmarks}") if excluded_tags != "": query.add_field(f"work_search[excluded_tag_names]={excluded_tags}") if comments is not None: query.add_field(f"work_search[comments_count]={comments}") if completion_status is not None: query.add_field(f"work_search[complete]={'T' if completion_status else 'F'}") if sort_column != "": query.add_field(f"work_search[sort_column]={sort_column}") if sort_direction != "": query.add_field(f"work_search[sort_direction]={sort_direction}") if revised_at != "": query.add_field(f"work_search[revised_at]={revised_at}") url = f"https://archiveofourown.org/works/search?{query.string}" if session is None: req = requester.request("get", url) else: req = session.get(url) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") soup = BeautifulSoup(req.content, features="lxml") return soup ================================================ FILE: AO3/series.py ================================================ from datetime import date from functools import cached_property from bs4 import BeautifulSoup from . import threadable, utils from .common import get_work_from_banner from .requester import requester from .users import User from .works import Work class Series: def __init__(self, seriesid, session=None, load=True): """Creates a new series object Args: seriesid (int/str): ID of the series session (AO3.Session, optional): Session object. Defaults to None. load (bool, optional): If true, the work is loaded on initialization. Defaults to True. 
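        Example (the series ID is hypothetical):

            >>> import AO3
            >>> series = AO3.Series(1234567)    # hypothetical ID
            >>> print(series.name, "-", series.nworks, "works")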
Raises: utils.InvalidIdError: Invalid series ID """ self.id = seriesid self._session = session self._soup = None if load: self.reload() def __eq__(self, other): return isinstance(other, __class__) and other.id == self.id def __repr__(self): try: return f"" except: return f"" def __getstate__(self): d = {} for attr in self.__dict__: if isinstance(self.__dict__[attr], BeautifulSoup): d[attr] = (self.__dict__[attr].encode(), True) else: d[attr] = (self.__dict__[attr], False) return d def __setstate__(self, d): for attr in d: value, issoup = d[attr] if issoup: self.__dict__[attr] = BeautifulSoup(value, "lxml") else: self.__dict__[attr] = value def set_session(self, session): """Sets the session used to make requests for this series Args: session (AO3.Session/AO3.GuestSession): session object """ self._session = session @threadable.threadable def reload(self): """ Loads information about this series. This function is threadable. """ for attr in self.__class__.__dict__: if isinstance(getattr(self.__class__, attr), cached_property): if attr in self.__dict__: delattr(self, attr) self._soup = self.request(f"https://archiveofourown.org/series/{self.id}") if "Error 404" in self._soup.text: raise utils.InvalidIdError("Cannot find series") @threadable.threadable def subscribe(self): """Subscribes to this series. This function is threadable. Raises: utils.AuthError: Invalid session """ if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only subscribe to a series using an authenticated session") utils.subscribe(self, "Series", self._session) @threadable.threadable def unsubscribe(self): """Unubscribes from this series. This function is threadable. Raises: utils.AuthError: Invalid session """ if not self.is_subscribed: raise Exception("You are not subscribed to this series") if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only unsubscribe from a series using an authenticated session") utils.subscribe(self, "Series", self._session, True, self._sub_id) @threadable.threadable def bookmark(self, notes="", tags=None, collections=None, private=False, recommend=False, pseud=None): """Bookmarks this series This function is threadable Args: notes (str, optional): Bookmark notes. Defaults to "". tags (list, optional): What tags to add. Defaults to None. collections (list, optional): What collections to add this bookmark to. Defaults to None. private (bool, optional): Whether this bookmark should be private. Defaults to False. recommend (bool, optional): Whether to recommend this bookmark. Defaults to False. pseud (str, optional): What pseud to add the bookmark under. Defaults to default pseud. Raises: utils.UnloadedError: Series isn't loaded utils.AuthError: Invalid session """ if not self.loaded: raise utils.UnloadedError("Series isn't loaded. Have you tried calling Series.reload()?") if self._session is None: raise utils.AuthError("Invalid session") utils.bookmark(self, self._session, notes, tags, collections, private, recommend, pseud) @threadable.threadable def delete_bookmark(self): """Removes a bookmark from this series This function is threadable Raises: utils.UnloadedError: Series isn't loaded utils.AuthError: Invalid session """ if not self.loaded: raise utils.UnloadedError("Series isn't loaded. 
Have you tried calling Series.reload()?") if self._session is None: raise utils.AuthError("Invalid session") if self._bookmarkid is None: raise utils.BookmarkError("You don't have a bookmark here") utils.delete_bookmark(self._bookmarkid, self._session, self.authenticity_token) @cached_property def _bookmarkid(self): form_div = self._soup.find("div", {"id": "bookmark-form"}) if form_div is None: return None if form_div.form is None: return None if "action" in form_div.form and form_div.form["action"].startswith("/bookmark"): text = form_div.form["action"].split("/")[-1] if text.isdigit(): return int(text) return None return None @cached_property def url(self): """Returns the URL to this series Returns: str: series URL """ return f"https://archiveofourown.org/series/{self.id}" @property def loaded(self): """Returns True if this series has been loaded""" return self._soup is not None @cached_property def authenticity_token(self): """Token used to take actions that involve this work""" if not self.loaded: return None token = self._soup.find("meta", {"name": "csrf-token"}) return token["content"] @cached_property def is_subscribed(self): """True if you're subscribed to this series""" if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only get a series ID using an authenticated session") form = self._soup.find("form", {"data-create-value": "Subscribe"}) input_ = form.find("input", {"name": "commit", "value": "Unsubscribe"}) return input_ is not None @cached_property def _sub_id(self): """Returns the subscription ID. Used for unsubscribing""" if not self.is_subscribed: raise Exception("You are not subscribed to this series") form = self._soup.find("form", {"data-create-value": "Subscribe"}) id_ = form.attrs["action"].split("/")[-1] return int(id_) @cached_property def name(self): div = self._soup.find("div", {"class": "series-show region"}) return div.h2.getText().replace("\t", "").replace("\n", "") @cached_property def creators(self): dl = self._soup.find("dl", {"class": "series meta group"}) return [User(author.getText(), load=False) for author in dl.findAll("a", {"rel": "author"})] @cached_property def series_begun(self): dl = self._soup.find("dl", {"class": "series meta group"}) info = dl.findAll(("dd", "dt")) last_dt = None for field in info: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Series Begun:": date_str = field.getText().strip() break return date(*list(map(int, date_str.split("-")))) @cached_property def series_updated(self): dl = self._soup.find("dl", {"class": "series meta group"}) info = dl.findAll(("dd", "dt")) last_dt = None for field in info: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Series Updated:": date_str = field.getText().strip() break return date(*list(map(int, date_str.split("-")))) @cached_property def words(self): dl = self._soup.find("dl", {"class": "series meta group"}) stats = dl.find("dl", {"class": "stats"}).findAll(("dd", "dt")) last_dt = None for field in stats: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Words:": words = field.getText().strip() break return int(words.replace(",", "")) @cached_property def nworks(self): dl = self._soup.find("dl", {"class": "series meta group"}) stats = dl.find("dl", {"class": "stats"}).findAll(("dd", "dt")) last_dt = None for field in stats: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Works:": works = field.getText().strip() break return int(works.replace(",", "")) 
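    # The stats properties below follow the same pattern as words/nworks above:
    # walk the series meta <dl>, remember the last <dt> label seen, and read
    # the <dd> that follows the label of interest.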
@cached_property def complete(self): dl = self._soup.find("dl", {"class": "series meta group"}) stats = dl.find("dl", {"class": "stats"}).findAll(("dd", "dt")) last_dt = None for field in stats: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Complete:": complete = field.getText().strip() break return True if complete == "Yes" else False @cached_property def description(self): dl = self._soup.find("dl", {"class": "series meta group"}) info = dl.findAll(("dd", "dt")) last_dt = None desc = "" for field in info: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Description:": desc = field.getText().strip() break return desc @cached_property def notes(self): dl = self._soup.find("dl", {"class": "series meta group"}) info = dl.findAll(("dd", "dt")) last_dt = None notes = "" for field in info: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Notes:": notes = field.getText().strip() break return notes @cached_property def nbookmarks(self): dl = self._soup.find("dl", {"class": "series meta group"}) stats = dl.find("dl", {"class": "stats"}).findAll(("dd", "dt")) last_dt = None book = "0" for field in stats: if field.name == "dt": last_dt = field.getText().strip() elif last_dt == "Bookmarks:": book = field.getText().strip() break return int(book.replace(",", "")) @cached_property def work_list(self): ul = self._soup.find("ul", {"class": "series work index group"}) works = [] for work in ul.find_all("li", {"role": "article"}): if work.h4 is None: continue works.append(get_work_from_banner(work)) # authors = [] # if work.h4 is None: # continue # for a in work.h4.find_all("a"): # if "rel" in a.attrs.keys(): # if "author" in a["rel"]: # authors.append(User(a.string, load=False)) # elif a.attrs["href"].startswith("/works"): # workname = a.string # workid = utils.workid_from_url(a["href"]) # new = Work(workid, load=False) # setattr(new, "title", workname) # setattr(new, "authors", authors) # works.append(new) return works def get(self, *args, **kwargs): """Request a web page and return a Response object""" if self._session is None: req = requester.request("get", *args, **kwargs) else: req = requester.request("get", *args, **kwargs, session=self._session.session) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") return req def request(self, url): """Request a web page and return a BeautifulSoup object. Args: url (str): Url to request Returns: bs4.BeautifulSoup: BeautifulSoup object representing the requested page's html """ req = self.get(url) soup = BeautifulSoup(req.content, "lxml") return soup ================================================ FILE: AO3/session.py ================================================ import datetime import re import time from functools import cached_property import requests from bs4 import BeautifulSoup from . import threadable, utils from .requester import requester from .series import Series from .users import User from .works import Work class GuestSession: """ AO3 guest session object """ def __init__(self): self.is_authed = False self.authenticity_token = None self.username = "" self.session = requests.Session() @property def user(self): return User(self.username, self, False) @threadable.threadable def comment(self, commentable, comment_text, oneshot=False, commentid=None): """Leaves a comment on a specific work. This function is threadable. 
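        A sketch using an authenticated session (Session subclasses
        GuestSession; the work ID and credentials are placeholders):

            >>> session = AO3.Session("my_username", "my_password")
            >>> work = AO3.Work(12345678, session=session)  # hypothetical ID
            >>> session.comment(work, "Loved this!", oneshot=True)  # single-chapter work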
Args: commentable (Work/Chapter): Commentable object comment_text (str): Comment text (must have between 1 and 10000 characters) oneshot (bool): Should be True if the work has only one chapter. In this case, chapterid becomes workid commentid (str/int): If specified, the comment is posted as a reply to this one. Defaults to None. Raises: utils.InvalidIdError: Invalid ID utils.UnexpectedResponseError: Unknown error utils.PseudoError: Couldn't find a valid pseudonym to post under utils.DuplicateCommentError: The comment you're trying to post was already posted ValueError: Invalid name/email Returns: requests.models.Response: Response object """ response = utils.comment(commentable, comment_text, self, oneshot, commentid) return response @threadable.threadable def kudos(self, work): """Leave a 'kudos' in a specific work. This function is threadable. Args: work (Work): ID of the work Raises: utils.UnexpectedResponseError: Unexpected response received utils.InvalidIdError: Invalid ID (work doesn't exist) Returns: bool: True if successful, False if you already left kudos there """ return utils.kudos(work, self) @threadable.threadable def refresh_auth_token(self): """Refreshes the authenticity token. This function is threadable. Raises: utils.UnexpectedResponseError: Couldn't refresh the token """ # For some reason, the auth token in the root path only works if you're # unauthenticated. To get around that, we check if this is an authed # session and, if so, get the token from the profile page. if self.is_authed: req = self.session.get(f"https://archiveofourown.org/users/{self.username}") else: req = self.session.get("https://archiveofourown.org") if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") soup = BeautifulSoup(req.content, "lxml") token = soup.find("input", {"name": "authenticity_token"}) if token is None: raise utils.UnexpectedResponseError("Couldn't refresh token") self.authenticity_token = token.attrs["value"] def get(self, *args, **kwargs): """Request a web page and return a Response object""" if self.session is None: req = requester.request("get", *args, **kwargs) else: req = requester.request("get", *args, **kwargs, session=self.session) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") return req def request(self, url): """Request a web page and return a BeautifulSoup object. Args: url (str): Url to request Returns: bs4.BeautifulSoup: BeautifulSoup object representing the requested page's html """ req = self.get(url) soup = BeautifulSoup(req.content, "lxml") return soup def post(self, *args, **kwargs): """Make a post request with the current session Returns: requests.Request """ req = self.session.post(*args, **kwargs) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. 
Try again in a while or reduce the number of requests") return req def __del__(self): self.session.close() class Session(GuestSession): """ AO3 session object """ def __init__(self, username, password): """Creates a new AO3 session object Args: username (str): AO3 username password (str): AO3 password Raises: utils.LoginError: Login was unsucessful (wrong username or password) """ super().__init__() self.is_authed = True self.username = username self.url = "https://archiveofourown.org/users/%s"%self.username self.session = requests.Session() soup = self.request("https://archiveofourown.org/users/login") self.authenticity_token = soup.find("input", {"name": 'authenticity_token'})["value"] payload = {'user[login]': username, 'user[password]': password, 'authenticity_token': self.authenticity_token} post = self.post("https://archiveofourown.org/users/login", params=payload, allow_redirects=False) if not post.status_code == 302: raise utils.LoginError("Invalid username or password") self._subscriptions_url = "https://archiveofourown.org/users/{0}/subscriptions?page={1:d}" self._bookmarks_url = "https://archiveofourown.org/users/{0}/bookmarks?page={1:d}" self._history_url = "https://archiveofourown.org/users/{0}/readings?page={1:d}" self._bookmarks = None self._subscriptions = None self._history = None def __getstate__(self): d = {} for attr in self.__dict__: if isinstance(self.__dict__[attr], BeautifulSoup): d[attr] = (self.__dict__[attr].encode(), True) else: d[attr] = (self.__dict__[attr], False) return d def __setstate__(self, d): for attr in d: value, issoup = d[attr] if issoup: self.__dict__[attr] = BeautifulSoup(value, "lxml") else: self.__dict__[attr] = value def clear_cache(self): for attr in self.__class__.__dict__: if isinstance(getattr(self.__class__, attr), cached_property): if attr in self.__dict__: delattr(self, attr) self._bookmarks = None self._subscriptions = None @cached_property def _subscription_pages(self): url = self._subscriptions_url.format(self.username, 1) soup = self.request(url) pages = soup.find("ol", {"title": "pagination"}) if pages is None: return 1 n = 1 for li in pages.findAll("li"): text = li.getText() if text.isdigit(): n = int(text) return n def get_work_subscriptions(self, use_threading=False): """ Get subscribed works. Loads them if they haven't been previously Returns: list: List of work subscriptions """ subs = self.get_subscriptions(use_threading) return list(filter(lambda obj: isinstance(obj, Work), subs)) def get_series_subscriptions(self, use_threading=False): """ Get subscribed series. Loads them if they haven't been previously Returns: list: List of series subscriptions """ subs = self.get_subscriptions(use_threading) return list(filter(lambda obj: isinstance(obj, Series), subs)) def get_user_subscriptions(self, use_threading=False): """ Get subscribed users. Loads them if they haven't been previously Returns: list: List of users subscriptions """ subs = self.get_subscriptions(use_threading) return list(filter(lambda obj: isinstance(obj, User), subs)) def get_subscriptions(self, use_threading=False): """ Get user's subscriptions. Loads them if they haven't been previously Returns: list: List of subscriptions """ if self._subscriptions is None: if use_threading: self.load_subscriptions_threaded() else: self._subscriptions = [] for page in range(self._subscription_pages): self._load_subscriptions(page=page+1) return self._subscriptions @threadable.threadable def load_subscriptions_threaded(self): """ Get subscribed works using threads. 
This function is threadable. """ threads = [] self._subscriptions = [] for page in range(self._subscription_pages): threads.append(self._load_subscriptions(page=page+1, threaded=True)) for thread in threads: thread.join() @threadable.threadable def _load_subscriptions(self, page=1): url = self._subscriptions_url.format(self.username, page) soup = self.request(url) subscriptions = soup.find("dl", {"class": "subscription index group"}) for sub in subscriptions.find_all("dt"): type_ = "work" user = None series = None workid = None workname = None authors = [] for a in sub.find_all("a"): if "rel" in a.attrs.keys(): if "author" in a["rel"]: authors.append(User(str(a.string), load=False)) elif a["href"].startswith("/works"): workname = str(a.string) workid = utils.workid_from_url(a["href"]) elif a["href"].startswith("/users"): type_ = "user" user = User(str(a.string), load=False) else: type_ = "series" workname = str(a.string) series = int(a["href"].split("/")[-1]) if type_ == "work": new = Work(workid, load=False) setattr(new, "title", workname) setattr(new, "authors", authors) self._subscriptions.append(new) elif type_ == "user": self._subscriptions.append(user) elif type_ == "series": new = Series(series, load=False) setattr(new, "name", workname) setattr(new, "authors", authors) self._subscriptions.append(new) @cached_property def _history_pages(self): url = self._history_url.format(self.username, 1) soup = self.request(url) pages = soup.find("ol", {"title": "pagination"}) if pages is None: return 1 n = 1 for li in pages.findAll("li"): text = li.getText() if text.isdigit(): n = int(text) return n def get_history(self, hist_sleep=3, start_page=0, max_pages=None, timeout_sleep=60): """ Get history works. Loads them if they haven't been previously. Arguments: hist_sleep (int to sleep between requests) start_page (int for page to start on, zero-indexed) max_pages (int for page to end on, zero-indexed) timeout_sleep (int, if set will attempt to recovery from http errors, likely timeouts, if set to None will just attempt to load) takes two arguments the first hist_sleep is an int and is a sleep to run between pages of history to load to avoid hitting the rate limiter, the second is an int of the maximum number of pages of history to load, by default this is None so loads them all. Returns: list: List of tuples (Work, number-of-visits, datetime-last-visited) """ if self._history is None: self._history = [] for page in range(start_page, self._history_pages): # If we are attempting to recover from errors then # catch and loop, otherwise just call and go if timeout_sleep is None: self._load_history(page=page+1) else: loaded=False while loaded == False: try: self._load_history(page=page+1) # print(f"Read history page {page+1}") loaded = True except utils.HTTPError: # print(f"History being rate limited, sleeping for {timeout_sleep} seconds") time.sleep(timeout_sleep) # Check for maximum history page load if max_pages is not None and page >= max_pages: return self._history # Again attempt to avoid rate limiter, sleep for a few # seconds between page requests. 
if hist_sleep is not None and hist_sleep > 0: time.sleep(hist_sleep) return self._history def _load_history(self, page=1): url = self._history_url.format(self.username, page) soup = self.request(url) history = soup.find("ol", {"class": "reading work index group"}) for item in history.find_all("li", {"role": "article"}): # authors = [] workname = None workid = None for a in item.h4.find_all("a"): if a.attrs["href"].startswith("/works"): workname = str(a.string) workid = utils.workid_from_url(a["href"]) visited_date = None visited_num = 1 for viewed in item.find_all("h4", {"class": "viewed heading" }): data_string = str(viewed) date_str = re.search('Last visited: (\d{2} .+ \d{4})', data_string) if date_str is not None: raw_date = date_str.group(1) date_time_obj = datetime.datetime.strptime(date_str.group(1), '%d %b %Y') visited_date = date_time_obj visited_str = re.search('Visited (\d+) times', data_string) if visited_str is not None: visited_num = int(visited_str.group(1)) if workname != None and workid != None: new = Work(workid, load=False) setattr(new, "title", workname) # setattr(new, "authors", authors) hist_item = [ new, visited_num, visited_date ] # print(hist_item) if new not in self._history: self._history.append(hist_item) @cached_property def _bookmark_pages(self): url = self._bookmarks_url.format(self.username, 1) soup = self.request(url) pages = soup.find("ol", {"title": "pagination"}) if pages is None: return 1 n = 1 for li in pages.findAll("li"): text = li.getText() if text.isdigit(): n = int(text) return n def get_bookmarks(self, use_threading=False): """ Get bookmarked works. Loads them if they haven't been previously Returns: list: List of tuples (workid, workname, authors) """ if self._bookmarks is None: if use_threading: self.load_bookmarks_threaded() else: self._bookmarks = [] for page in range(self._bookmark_pages): self._load_bookmarks(page=page+1) return self._bookmarks @threadable.threadable def load_bookmarks_threaded(self): """ Get bookmarked works using threads. This function is threadable. """ threads = [] self._bookmarks = [] for page in range(self._bookmark_pages): threads.append(self._load_bookmarks(page=page+1, threaded=True)) for thread in threads: thread.join() @threadable.threadable def _load_bookmarks(self, page=1): url = self._bookmarks_url.format(self.username, page) soup = self.request(url) bookmarks = soup.find("ol", {"class": "bookmark index group"}) for bookm in bookmarks.find_all("li", {"class": ["bookmark", "index", "group"]}): authors = [] recommended = False workid = -1 if bookm.h4 is not None: for a in bookm.h4.find_all("a"): if "rel" in a.attrs.keys(): if "author" in a["rel"]: authors.append(User(str(a.string), load=False)) elif a.attrs["href"].startswith("/works"): workname = str(a.string) workid = utils.workid_from_url(a["href"]) # Get whether the bookmark is recommended for span in bookm.p.find_all("span"): if "title" in span.attrs.keys(): if span["title"] == "Rec": recommended = True if workid != -1: new = Work(workid, load=False) setattr(new, "title", workname) setattr(new, "authors", authors) setattr(new, "recommended", recommended) if new not in self._bookmarks: self._bookmarks.append(new) @cached_property def bookmarks(self): """Get the number of your bookmarks. Must be logged in to use. 
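        Example (credentials and the returned count are placeholders):

            >>> session = AO3.Session("my_username", "my_password")
            >>> session.bookmarks
            42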
Returns: int: Number of bookmarks """ url = self._bookmarks_url.format(self.username, 1) soup = self.request(url) div = soup.find("div", {"class": "bookmarks-index dashboard filtered region"}) h2 = div.h2.text.split() return int(h2[4].replace(',', '')) def get_statistics(self, year=None): year = "All+Years" if year is None else str(year) url = f"https://archiveofourown.org/users/{self.username}/stats?year={year}" soup = self.request(url) stats = {} dt = soup.find("dl", {"class": "statistics meta group"}) if dt is not None: for field in dt.findAll("dt"): name = field.getText()[:-1].lower().replace(" ", "_") if field.next_sibling is not None and field.next_sibling.next_sibling is not None: value = field.next_sibling.next_sibling.getText().replace(",", "") if value.isdigit(): stats[name] = int(value) return stats @staticmethod def str_format(string): """Formats a given string Args: string (str): String to format Returns: str: Formatted string """ return string.replace(",", "") def get_marked_for_later(self, sleep=1, timeout_sleep=60): """ Gets every marked for later work Arguments: sleep (int): The time to wait between page requests timeout_sleep (int): The time to wait after the rate limit is hit Returns: works (list): All marked for later works """ pageRaw = self.request(f"https://archiveofourown.org/users/{self.username}/readings?page=1&show=to-read").find("ol", {"class": "pagination actions"}).find_all("li") maxPage = int(pageRaw[-2].text) works = [] for page in range(maxPage): grabbed = False while not grabbed: try: workPage = self.request(f"https://archiveofourown.org/users/{self.username}/readings?page={page+1}&show=to-read") worksRaw = workPage.find_all("li", {"role": "article"}) for work in worksRaw: try: workId = int(work.h4.a.get("href").split("/")[2]) works.append(Work(workId, session=self, load=False)) except AttributeError: pass grabbed = True except utils.HTTPError: time.sleep(timeout_sleep) time.sleep(sleep) return works ================================================ FILE: AO3/threadable.py ================================================ import threading def threadable(func): """Allows the function to be run as a thread using the 'threaded' argument""" def new(*args, threaded=False, **kwargs): if threaded: thread = threading.Thread(target=func, args=args, kwargs=kwargs) thread.start() return thread else: return func(*args, **kwargs) new.__doc__ = func.__doc__ new.__name__ = func.__name__ new._threadable = True return new class ThreadPool: def __init__(self, maximum=None): self.maximum = maximum self._tasks = [] self._threads = [] def add_task(self, task): self._tasks.append(task) @threadable def start(self): while len(self._threads) != 0 or len(self._tasks) != 0: self._threads[:] = filter(lambda thread: thread.is_alive(), self._threads) for _ in range(min(self.maximum-len(self._threads), len(self._tasks))): self._threads.append(self._tasks.pop(0)(threaded=True)) ================================================ FILE: AO3/users.py ================================================ import datetime from functools import cached_property import requests from bs4 import BeautifulSoup from .
import threadable, utils from .common import get_work_from_banner from .requester import requester class User: """ AO3 user object """ def __init__(self, username, session=None, load=True): """Creates a new AO3 user object Args: username (str): AO3 username session (AO3.Session, optional): Used to access additional info load (bool, optional): If true, the user is loaded on initialization. Defaults to True. """ self.username = username self._session = session self._soup_works = None self._soup_profile = None self._soup_bookmarks = None self._works = None self._bookmarks = None if load: self.reload() def __repr__(self): return f"<User [{self.username}]>" def __eq__(self, other): return isinstance(other, __class__) and other.username == self.username def __getstate__(self): d = {} for attr in self.__dict__: if isinstance(self.__dict__[attr], BeautifulSoup): d[attr] = (self.__dict__[attr].encode(), True) else: d[attr] = (self.__dict__[attr], False) return d def __setstate__(self, d): for attr in d: value, issoup = d[attr] if issoup: self.__dict__[attr] = BeautifulSoup(value, "lxml") else: self.__dict__[attr] = value def set_session(self, session): """Sets the session used to make requests for this user Args: session (AO3.Session/AO3.GuestSession): session object """ self._session = session @threadable.threadable def reload(self): """ Loads information about this user. This function is threadable. """ for attr in self.__class__.__dict__: if isinstance(getattr(self.__class__, attr), cached_property): if attr in self.__dict__: delattr(self, attr) @threadable.threadable def req_works(username): self._soup_works = self.request(f"https://archiveofourown.org/users/{username}/works") token = self._soup_works.find("meta", {"name": "csrf-token"}) setattr(self, "authenticity_token", token["content"]) @threadable.threadable def req_profile(username): self._soup_profile = self.request(f"https://archiveofourown.org/users/{username}/profile") token = self._soup_profile.find("meta", {"name": "csrf-token"}) setattr(self, "authenticity_token", token["content"]) @threadable.threadable def req_bookmarks(username): self._soup_bookmarks = self.request(f"https://archiveofourown.org/users/{username}/bookmarks") token = self._soup_bookmarks.find("meta", {"name": "csrf-token"}) setattr(self, "authenticity_token", token["content"]) rs = [req_works(self.username, threaded=True), req_profile(self.username, threaded=True), req_bookmarks(self.username, threaded=True)] for r in rs: r.join() self._works = None self._bookmarks = None def get_avatar(self): """Returns a tuple containing the name of the file and its data Returns: tuple: (name: str, img: bytes) """ icon = self._soup_profile.find("p", {"class": "icon"}) src = icon.img.attrs["src"] name = src.split("/")[-1].split("?")[0] img = self.get(src).content return name, img @threadable.threadable def subscribe(self): """Subscribes to this user. This function is threadable. Raises: utils.AuthError: Invalid session """ if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only subscribe to a user using an authenticated session") utils.subscribe(self, "User", self._session) @threadable.threadable def unsubscribe(self): """Unsubscribes from this user. This function is threadable.
Raises: utils.AuthError: Invalid session """ if not self.is_subscribed: raise Exception("You are not subscribed to this user") if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only unsubscribe from a user using an authenticated session") utils.subscribe(self, "User", self._session, True, self._sub_id) @property def id(self): id_ = self._soup_profile.find("input", {"id": "subscription_subscribable_id"}) return int(id_["value"]) if id_ is not None else None @cached_property def is_subscribed(self): """True if you're subscribed to this user""" if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only check subscriptions using an authenticated session") header = self._soup_profile.find("div", {"class": "primary header module"}) input_ = header.find("input", {"name": "commit", "value": "Unsubscribe"}) return input_ is not None @property def loaded(self): """Returns True if this user has been loaded""" return self._soup_profile is not None # @cached_property # def authenticity_token(self): # """Token used to take actions that involve this user""" # if not self.loaded: # return None # token = self._soup_profile.find("meta", {"name": "csrf-token"}) # return token["content"] @cached_property def user_id(self): if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only get a user ID using an authenticated session") header = self._soup_profile.find("div", {"class": "primary header module"}) input_ = header.find("input", {"name": "subscription[subscribable_id]"}) if input_ is None: raise utils.UnexpectedResponseError("Couldn't fetch user ID") return int(input_.attrs["value"]) @cached_property def _sub_id(self): """Returns the subscription ID. Used for unsubscribing""" if not self.is_subscribed: raise Exception("You are not subscribed to this user") header = self._soup_profile.find("div", {"class": "primary header module"}) id_ = header.form.attrs["action"].split("/")[-1] return int(id_) @cached_property def works(self): """Returns the number of works authored by this user Returns: int: Number of works """ div = self._soup_works.find("div", {"class": "works-index dashboard filtered region"}) h2 = div.h2.text.split() return int(h2[4].replace(',', '')) @cached_property def _works_pages(self): pages = self._soup_works.find("ol", {"title": "pagination"}) if pages is None: return 1 n = 1 for li in pages.findAll("li"): text = li.getText() if text.isdigit(): n = int(text) return n def get_works(self, use_threading=False): """ Get works authored by this user. Returns: list: List of works """ if self._works is None: if use_threading: self.load_works_threaded() else: self._works = [] for page in range(self._works_pages): self._load_works(page=page+1) return self._works @threadable.threadable def load_works_threaded(self): """ Get the user's works using threads. This function is threadable.
""" threads = [] self._works = [] for page in range(self._works_pages): threads.append(self._load_works(page=page+1, threaded=True)) for thread in threads: thread.join() @threadable.threadable def _load_works(self, page=1): from .works import Work self._soup_works = self.request(f"https://archiveofourown.org/users/{self.username}/works?page={page}") ol = self._soup_works.find("ol", {"class": "work index group"}) for work in ol.find_all("li", {"role": "article"}): if work.h4 is None: continue self._works.append(get_work_from_banner(work)) @cached_property def bookmarks(self): """Returns the number of works user has bookmarked Returns: int: Number of bookmarks """ div = self._soup_bookmarks.find("div", {"class": "bookmarks-index dashboard filtered region"}) h2 = div.h2.text.split() return int(h2[4].replace(',', '')) @cached_property def _bookmarks_pages(self): pages = self._soup_bookmarks.find("ol", {"title": "pagination"}) if pages is None: return 1 n = 1 for li in pages.findAll("li"): text = li.getText() if text.isdigit(): n = int(text) return n def get_bookmarks(self, use_threading=False): """ Get this user's bookmarked works. Loads them if they haven't been previously Returns: list: List of works """ if self._bookmarks is None: if use_threading: self.load_bookmarks_threaded() else: self._bookmarks = [] for page in range(self._bookmarks_pages): self._load_bookmarks(page=page+1) return self._bookmarks @threadable.threadable def load_bookmarks_threaded(self): """ Get the user's bookmarks using threads. This function is threadable. """ threads = [] self._bookmarks = [] for page in range(self._bookmarks_pages): threads.append(self._load_bookmarks(page=page+1, threaded=True)) for thread in threads: thread.join() @threadable.threadable def _load_bookmarks(self, page=1): from .works import Work self._soup_bookmarks = self.request(f"https://archiveofourown.org/users/{self.username}/bookmarks?page={page}") ol = self._soup_bookmarks.find("ol", {"class": "bookmark index group"}) for work in ol.find_all("li", {"role": "article"}): authors = [] if work.h4 is None: continue self._bookmarks.append(get_work_from_banner(work)) @cached_property def bio(self): """Returns the user's bio Returns: str: User's bio """ div = self._soup_profile.find("div", {"class": "bio module"}) if div is None: return "" blockquote = div.find("blockquote", {"class": "userstuff"}) return blockquote.getText() if blockquote is not None else "" @cached_property def url(self): """Returns the URL to the user's profile Returns: str: user profile URL """ return "https://archiveofourown.org/users/%s"%self.username def get(self, *args, **kwargs): """Request a web page and return a Response object""" if self._session is None: req = requester.request("get", *args, **kwargs) else: req = requester.request("get", *args, **kwargs, session=self._session.session) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") return req def request(self, url): """Request a web page and return a BeautifulSoup object. 
Args: url (str): Url to request Returns: bs4.BeautifulSoup: BeautifulSoup object representing the requested page's html """ req = self.get(url) soup = BeautifulSoup(req.content, "lxml") return soup @staticmethod def str_format(string): """Formats a given string Args: string (str): String to format Returns: str: Formatted string """ return string.replace(",", "") @property def work_pages(self): """ Returns how many pages of works a user has Returns: int: Amount of pages """ return self._works_pages ================================================ FILE: AO3/utils.py ================================================ import os import pickle import re from bs4 import BeautifulSoup from .requester import requester from .common import url_join _FANDOMS = None _LANGUAGES = None AO3_AUTH_ERROR_URL = "https://archiveofourown.org/auth_error" class LoginError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class UnloadedError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class UnexpectedResponseError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class InvalidIdError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class DownloadError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class AuthError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class DuplicateCommentError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class PseudError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class HTTPError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class BookmarkError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class CollectError(Exception): def __init__(self, message, errors=[]): super().__init__(message) self.errors = errors class Query: def __init__(self): self.fields = [] def add_field(self, text): self.fields.append(text) @property def string(self): return '&'.join(self.fields) class Constraint: """Represents a bounding box of a value """ def __init__(self, lowerbound=0, upperbound=None): """Creates a new Constraint object Args: lowerbound (int, optional): Constraint lowerbound. Defaults to 0. upperbound (int, optional): Constraint upperbound. Defaults to None. 
""" self._lb = lowerbound self._ub = upperbound @property def string(self): """Returns the string representation of this constraint Returns: str: string representation """ if self._lb == 0: return f"<{self._ub}" elif self._ub is None: return f">{self._lb}" elif self._ub == self._lb: return str(self._lb) else: return f"{self._lb}-{self._ub}" def __str__(self): return self.string def word_count(text): return len(tuple(filter(lambda w: w != "", re.split(" |\n|\t", text)))) def set_rqtw(value): """Sets the requests per time window parameter for the AO3 requester""" requester.setRQTW(value) def set_timew(value): """Sets the time window parameter for the AO3 requester""" requester.setTimeW(value) def limit_requests(limit=True): """Toggles request limiting""" if limit: requester.setRQTW(12) else: requester.setRQTW(-1) def load_fandoms(): """Loads fandoms into memory Raises: FileNotFoundError: No resource was found """ global _FANDOMS fandom_path = os.path.join(os.path.dirname(__file__), "resources", "fandoms") if not os.path.isdir(fandom_path): raise FileNotFoundError("No fandom resources have been downloaded. Try AO3.extra.download()") files = os.listdir(fandom_path) _FANDOMS = [] for file in files: with open(os.path.join(fandom_path, file), "rb") as f: _FANDOMS += pickle.load(f) def load_languages(): """Loads languages into memory Raises: FileNotFoundError: No resource was found """ global _LANGUAGES language_path = os.path.join(os.path.dirname(__file__), "resources", "languages") if not os.path.isdir(language_path): raise FileNotFoundError("No language resources have been downloaded. Try AO3.extra.download()") files = os.listdir(language_path) _LANGUAGES = [] for file in files: with open(os.path.join(language_path, file), "rb") as f: _LANGUAGES += pickle.load(f) def get_languages(): """Returns all available languages""" return _LANGUAGES[:] def search_fandom(fandom_string): """Searches for a fandom that matches the given string Args: fandom_string (str): query string Raises: UnloadedError: load_fandoms() wasn't called UnloadedError: No resources were downloaded Returns: list: All results matching 'fandom_string' """ if _FANDOMS is None: raise UnloadedError("Did you forget to call AO3.utils.load_fandoms()?") if _FANDOMS == []: raise UnloadedError("Did you forget to download the required resources with AO3.extra.download()?") results = [] for fandom in _FANDOMS: if fandom_string.lower() in fandom.lower(): results.append(fandom) return results def workid_from_url(url): """Get the workid from an archiveofourown.org website url Args: url (str): Work URL Returns: int: Work ID """ split_url = url.split("/") try: index = split_url.index("works") except ValueError: return if len(split_url) >= index+1: workid = split_url[index+1].split("?")[0] if workid.isdigit(): return int(workid) return def comment(commentable, comment_text, session, fullwork=False, commentid=None, email="", name="", pseud=None): """Leaves a comment on a specific work Args: commentable (Work/Chapter): Chapter/Work object comment_text (str): Comment text (must have between 1 and 10000 characters) fullwork (bool): Should be True if the work has only one chapter or if the comment is to be posted on the full work. session (AO3.Session/AO3.GuestSession): Session object to request with. commentid (str/int): If specified, the comment is posted as a reply to this comment. Defaults to None. email (str): Email to post with. Only used if sess is None. Defaults to "". name (str): Name that will appear on the comment. Only used if sess is None. 
Defaults to "". pseud (str, optional): What pseud to add the comment under. Defaults to default pseud. Raises: utils.InvalidIdError: Invalid ID utils.UnexpectedResponseError: Unknown error utils.PseudError: Couldn't find a valid pseudonym to post under utils.DuplicateCommentError: The comment you're trying to post was already posted ValueError: Invalid name/email Returns: requests.models.Response: Response object """ if commentable.authenticity_token is not None: at = commentable.authenticity_token else: at = session.authenticity_token headers = { "x-requested-with": "XMLHttpRequest", "x-newrelic-id": "VQcCWV9RGwIJVFFRAw==", "x-csrf-token": at } data = {} if fullwork: data["work_id"] = str(commentable.id) else: data["chapter_id"] = str(commentable.id) if commentid is not None: data["comment_id"] = commentid if session.is_authed: if fullwork: referer = f"https://archiveofourown.org/works/{commentable.id}" else: referer = f"https://archiveofourown.org/chapters/{commentable.id}" pseud_id = get_pseud_id(commentable, session, pseud) if pseud_id is None: raise PseudError("Couldn't find your pseud's id") data.update({ "authenticity_token": at, "comment[pseud_id]": pseud_id, "comment[comment_content]": comment_text, }) else: if email == "" or name == "": raise ValueError("You need to specify both an email and a name!") data.update({ "authenticity_token": at, "comment[email]": email, "comment[name]": name, "comment[comment_content]": comment_text, }) response = session.post(f"https://archiveofourown.org/comments.js", headers=headers, data=data) if response.status_code == 429: raise HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") if response.status_code == 404: if len(response.content) > 0: return response else: raise InvalidIdError(f"Invalid {'work ID' if fullwork else 'chapter ID'}") if response.status_code == 422: json = response.json() if "errors" in json: if "auth_error" in json["errors"]: raise AuthError("Invalid authentication token. Try calling session.refresh_auth_token()") raise UnexpectedResponseError(f"Unexpected json received:\n{str(json)}") elif response.status_code == 200: raise DuplicateCommentError("You have already left this comment here") raise UnexpectedResponseError(f"Unexpected HTTP status code received ({response.status_code})") def delete_comment(comment, session): """Deletes the specified comment Args: comment (AO3.Comment): Comment object session (AO3.Session): Session object Raises: PermissionError: You don't have permission to delete the comment utils.AuthError: Invalid auth token utils.UnexpectedResponseError: Unknown error """ if session is None or not session.is_authed: raise PermissionError("You don't have permission to do this") if comment.authenticity_token is not None: at = comment.authenticity_token else: at = session.authenticity_token data = { "authenticity_token": at, "_method": "delete" } req = session.post(f"https://archiveofourown.org/comments/{comment.id}", data=data) if req.status_code == 429: raise HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") else: soup = BeautifulSoup(req.content, "lxml") if "auth error" in soup.title.getText().lower(): raise AuthError("Invalid authentication token. 
Try calling session.refresh_auth_token()") else: error = soup.find("div", {"id": "main"}).getText() if "you don't have permission" in error.lower(): raise PermissionError("You don't have permission to do this") def kudos(work, session): """Leave a 'kudos' in a specific work Args: work (Work): Work object Raises: utils.UnexpectedResponseError: Unexpected response received utils.InvalidIdError: Invalid ID (work doesn't exist) utils.AuthError: Invalid authenticity token Returns: bool: True if successful, False if you already left kudos there """ if work.authenticity_token is not None: at = work.authenticity_token else: at = session.authenticity_token data = { "authenticity_token": at, "kudo[commentable_id]": work.id, "kudo[commentable_type]": "Work" } headers = { "x-csrf-token": work.authenticity_token, "x-requested-with": "XMLHttpRequest", "referer": f"https://archiveofourown.org/work/{work.id}" } response = session.post("https://archiveofourown.org/kudos.js", headers=headers, data=data) if response.status_code == 429: raise HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") if response.status_code == 201: return True # Success elif response.status_code == 422: json = response.json() if "errors" in json: if "auth_error" in json["errors"]: raise AuthError("Invalid authentication token. Try calling session.refresh_auth_token()") elif "user_id" in json["errors"] or "ip_address" in json["errors"]: return False # User has already left kudos elif "no_commentable" in json["errors"]: raise InvalidIdError("Invalid ID") raise UnexpectedResponseError(f"Unexpected json received:\n"+str(json)) else: raise UnexpectedResponseError(f"Unexpected HTTP status code received ({response.status_code})") def subscribe(subscribable, worktype, session, unsubscribe=False, subid=None): """Subscribes to a work. Be careful, you can subscribe to a work multiple times Args: subscribable (Work/Series/User): AO3 object worktype (str): Type of the work (Series/Work/User) session (AO3.Session): Session object unsubscribe (bool, optional): Unsubscribe instead of subscribing. Defaults to False. subid (str/int, optional): Subscription ID, used when unsubscribing. Defaults to None. Raises: AuthError: Invalid auth token AuthError: Invalid session InvalidIdError: Invalid ID / worktype InvalidIdError: Invalid subid """ if session is None: session = subscribable.session if session is None or not session.is_authed: raise AuthError("Invalid session") if subscribable.authenticity_token is not None: at = subscribable.authenticity_token else: at = session.authenticity_token data = { "authenticity_token": at, "subscription[subscribable_id]": subscribable.id, "subscription[subscribable_type]": worktype.capitalize() } url = f"https://archiveofourown.org/users/{session.username}/subscriptions" if unsubscribe: if subid is None: raise InvalidIdError("When unsubscribing, subid cannot be None") url += f"/{subid}" data["_method"] = "delete" req = session.session.post(url, data=data, allow_redirects=False) if unsubscribe: return req if req.status_code == 302: if req.headers["Location"] == AO3_AUTH_ERROR_URL: raise AuthError("Invalid authentication token. Try calling session.refresh_auth_token()") else: raise InvalidIdError(f"Invalid ID / worktype") def bookmark(bookmarkable, session=None, notes="", tags=None, collections=None, private=False, recommend=False, pseud=None): """Adds a bookmark to a work/series. 
Be careful, you can bookmark a work multiple times Args: bookmarkable (Work/Series): AO3 object session (AO3.Session): Session object notes (str, optional): Bookmark notes. Defaults to "". tags (list, optional): What tags to add. Defaults to None. collections (list, optional): What collections to add this bookmark to. Defaults to None. private (bool, optional): Whether this bookmark should be private. Defaults to False. recommend (bool, optional): Whether to recommend this bookmark. Defaults to False. pseud (str, optional): What pseud to add the bookmark under. Defaults to default pseud. """ if session is None: session = bookmarkable.session if session == None or not session.is_authed: raise AuthError("Invalid session") if bookmarkable.authenticity_token is not None: at = bookmarkable.authenticity_token else: at = session.authenticity_token if tags is None: tags = [] if collections is None: collections = [] pseud_id = get_pseud_id(bookmarkable, session, pseud) if pseud_id is None: raise PseudError("Couldn't find your pseud's id") data = { "authenticity_token": at, "bookmark[pseud_id]": pseud_id, "bookmark[tag_string]": ",".join(tags), "bookmark[collection_names]": ",".join(collections), "bookmark[private]": int(private), "bookmark[rec]" : int(recommend), "commit": "Create" } if notes != "": data["bookmark[bookmarker_notes]"] = notes url = url_join(bookmarkable.url, "bookmarks") req = session.session.post(url, data=data, allow_redirects=False) handle_bookmark_errors(req) def delete_bookmark(bookmarkid, session, auth_token=None): """Remove a bookmark from the work/series Args: bookmarkid (Work/Series): AO3 object session (AO3.Session): Session object auth_token (str, optional): Authenticity token. Defaults to None. """ if session == None or not session.is_authed: raise AuthError("Invalid session") data = { "authenticity_token": session.authenticity_token if auth_token is None else auth_token, "_method": "delete" } url = f"https://archiveofourown.org/bookmarks/{bookmarkid}" req = session.session.post(url, data=data, allow_redirects=False) handle_bookmark_errors(req) def handle_bookmark_errors(request): if request.status_code == 302: if request.headers["Location"] == AO3_AUTH_ERROR_URL: raise AuthError("Invalid authentication token. 
Try calling session.refresh_auth_token()") else: if request.status_code == 200: soup = BeautifulSoup(request.content, "lxml") error_div = soup.find("div", {"id": "error", "class": "error"}) if error_div is None: raise UnexpectedResponseError("An unknown error occurred") errors = [item.getText() for item in error_div.findAll("li")] if len(errors) == 0: raise BookmarkError("An unknown error occurred") raise BookmarkError("Error(s) creating bookmark:" + " ".join(errors)) raise UnexpectedResponseError(f"Unexpected HTTP status code received ({request.status_code})") def get_pseud_id(ao3object, session=None, specified_pseud=None): if session is None: session = ao3object.session if session is None or not session.is_authed: raise AuthError("Invalid session") soup = session.request(ao3object.url) pseud = soup.find("input", {"name": re.compile(".+\\[pseud_id\\]")}) if pseud is None: pseud = soup.find("select", {"name": re.compile(".+\\[pseud_id\\]")}) if pseud is None: return None pseud_id = None if specified_pseud: for option in pseud.findAll("option"): if option.string == specified_pseud: pseud_id = option.attrs["value"] break else: for option in pseud.findAll("option"): if "selected" in option.attrs and option.attrs["selected"] == "selected": pseud_id = option.attrs["value"] break else: pseud_id = pseud.attrs["value"] return pseud_id def collect(collectable, session, collections): """Invites a work to a collection. Be careful, you can collect a work multiple times Args: work (Work): Work object session (AO3.Session): Session object collections (list, optional): What collections to add this work to. Defaults to None. """ if session is None: session = collectable.session if session == None or not session.is_authed: raise AuthError("Invalid session") if collectable.authenticity_token is not None: at = collectable.authenticity_token else: at = session.authenticity_token if collections is None: collections = [] data = { "authenticity_token": at, "collection_names": ",".join(collections), "commit": "Add" } url = url_join(collectable.url, "collection_items") req = session.session.post(url, data=data, allow_redirects=True) if req.status_code == 302: if req.headers["Location"] == AO3_AUTH_ERROR_URL: raise AuthError("Invalid authentication token. Try calling session.refresh_auth_token()") elif req.status_code == 200: soup = BeautifulSoup(req.content, "lxml") notice_div = soup.find("div", {"class": "notice"}) error_div = soup.find("div", {"class": "error"}) if error_div is None and notice_div is None: raise UnexpectedResponseError("An unknown error occurred") if error_div is not None: errors = [item.getText() for item in error_div.findAll("ul")] if len(errors) == 0: raise CollectError("An unknown error occurred") raise CollectError("We couldn't add your submission to the following collection(s): " + " ".join(errors)) else: raise UnexpectedResponseError(f"Unexpected HTTP status code received ({req.status_code})") ================================================ FILE: AO3/works.py ================================================ import warnings from datetime import datetime from functools import cached_property from bs4 import BeautifulSoup from . 
import threadable, utils from .chapters import Chapter from .comments import Comment from .requester import requester from .users import User class Work: """ AO3 work object """ def __init__(self, workid, session=None, load=True, load_chapters=True): """Creates a new AO3 work object Args: workid (int): AO3 work ID session (AO3.Session, optional): Used to access restricted works load (bool, optional): If true, the work is loaded on initialization. Defaults to True. load_chapters (bool, optional): If false, chapter text won't be parsed, and Work.load_chapters() will have to be called. Defaults to True. Raises: utils.InvalidIdError: Raised if the work wasn't found """ self._session = session self.chapters = [] self.id = workid self._soup = None if load: self.reload(load_chapters) def __repr__(self): try: return f"<Work [{self.title}]>" except: return f"<Work [{self.id}]>" def __eq__(self, other): return isinstance(other, __class__) and other.id == self.id def __getstate__(self): d = {} for attr in self.__dict__: if isinstance(self.__dict__[attr], BeautifulSoup): d[attr] = (self.__dict__[attr].encode(), True) else: d[attr] = (self.__dict__[attr], False) return d def __setstate__(self, d): for attr in d: value, issoup = d[attr] if issoup: self.__dict__[attr] = BeautifulSoup(value, "lxml") else: self.__dict__[attr] = value @threadable.threadable def reload(self, load_chapters=True): """ Loads information about this work. This function is threadable. Args: load_chapters (bool, optional): If false, chapter text won't be parsed, and Work.load_chapters() will have to be called. Defaults to True. """ for attr in self.__class__.__dict__: if isinstance(getattr(self.__class__, attr), cached_property): if attr in self.__dict__: delattr(self, attr) self._soup = self.request(f"https://archiveofourown.org/works/{self.id}?view_adult=true&view_full_work=true") if "Error 404" in self._soup.find("h2", {"class": "heading"}).text: raise utils.InvalidIdError("Cannot find work") if load_chapters: self.load_chapters() def set_session(self, session): """Sets the session used to make requests for this work Args: session (AO3.Session/AO3.GuestSession): session object """ self._session = session def load_chapters(self): """Loads chapter objects for each one of this work's chapters """ self.chapters = [] chapters_div = self._soup.find(attrs={"id": "chapters"}) if chapters_div is None: return if self.nchapters > 1: for n in range(1, self.nchapters+1): chapter = chapters_div.find("div", {"id": f"chapter-{n}"}) if chapter is None: continue chapter.extract() preface_group = chapter.find("div", {"class": ("chapter", "preface", "group")}) if preface_group is None: continue title = preface_group.find("h3", {"class": "title"}) if title is None: continue id_ = int(title.a["href"].split("/")[-1]) c = Chapter(id_, self, self._session, False) c._soup = chapter self.chapters.append(c) else: c = Chapter(None, self, self._session, False) c._soup = chapters_div self.chapters.append(c) def get_images(self): """Gets all images from this work Raises: utils.UnloadedError: Raises this error if the work isn't loaded Returns: dict: key = chapter_n; value = chapter.get_images() """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded. Have you tried calling Work.reload()?") chapters = {} for chapter in self.chapters: images = chapter.get_images() if len(images) != 0: chapters[chapter.number] = images return chapters def download(self, filetype="PDF"): """Downloads this work Args: filetype (str, optional): Desired filetype. Defaults to "PDF".
Known filetypes are: AZW3, EPUB, HTML, MOBI, PDF. Raises: utils.DownloadError: Raised if there was an error with the download utils.UnexpectedResponseError: Raised if the filetype is not available for download Returns: bytes: File content """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded. Have you tried calling Work.reload()?") download_btn = self._soup.find("li", {"class": "download"}) for download_type in download_btn.findAll("li"): if download_type.a.getText() == filetype.upper(): url = f"https://archiveofourown.org/{download_type.a.attrs['href']}" req = self.get(url) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") if not req.ok: raise utils.DownloadError("An error occurred while downloading the work") return req.content raise utils.UnexpectedResponseError(f"Filetype '{filetype}' is not available for download") @threadable.threadable def download_to_file(self, filename, filetype="PDF"): """Downloads this work and saves it in the specified file. This function is threadable. Args: filename (str): Name of the resulting file filetype (str, optional): Desired filetype. Defaults to "PDF". Known filetypes are: AZW3, EPUB, HTML, MOBI, PDF. Raises: utils.DownloadError: Raised if there was an error with the download utils.UnexpectedResponseError: Raised if the filetype is not available for download """ with open(filename, "wb") as file: file.write(self.download(filetype)) @property def metadata(self): metadata = {} normal_fields = ( "bookmarks", "categories", "nchapters", "characters", "complete", "comments", "expected_chapters", "fandoms", "hits", "kudos", "language", "rating", "relationships", "restricted", "status", "summary", "tags", "title", "warnings", "id", "words", "collections" ) string_fields = ( "date_edited", "date_published", "date_updated", ) for field in string_fields: try: metadata[field] = str(getattr(self, field)) except AttributeError: pass for field in normal_fields: try: metadata[field] = getattr(self, field) except AttributeError: pass try: metadata["authors"] = list(map(lambda author: author.username, self.authors)) except AttributeError: pass try: metadata["series"] = list(map(lambda series: series.name, self.series)) except AttributeError: pass try: metadata["chapter_titles"] = list(map(lambda chapter: chapter.title, self.chapters)) except AttributeError: pass return metadata def get_comments(self, maximum=None): """Returns a list of all threads of comments in the work. This operation can take a very long time. Because of that, it is recommended that you set a maximum number of comments. Duration: ~ (0.13 * n_comments) seconds or 2.9 seconds per comment page Args: maximum (int, optional): Maximum number of comments to be returned. None -> No maximum Raises: utils.UnloadedError: Work isn't loaded Returns: list: List of comments """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded.
Have you tried calling Work.reload()?") url = f"https://archiveofourown.org/works/{self.id}?page=%d&show_comments=true&view_adult=true&view_full_work=true" soup = self.request(url%1) pages = 0 div = soup.find("div", {"id": "comments_placeholder"}) ol = div.find("ol", {"class": "pagination actions"}) if ol is None: pages = 1 else: for li in ol.findAll("li"): if li.getText().isdigit(): pages = int(li.getText()) comments = [] for page in range(pages): if page != 0: soup = self.request(url%(page+1)) ol = soup.find("ol", {"class": "thread"}) for li in ol.findAll("li", {"role": "article"}, recursive=False): if maximum is not None and len(comments) >= maximum: return comments id_ = int(li.attrs["id"][8:]) header = li.find("h4", {"class": ("heading", "byline")}) if header is None or header.a is None: author = None else: author = User(str(header.a.text), self._session, False) if li.blockquote is not None: text = li.blockquote.getText() else: text = "" comment = Comment(id_, self, session=self._session, load=False) setattr(comment, "authenticity_token", self.authenticity_token) setattr(comment, "author", author) setattr(comment, "text", text) comment._thread = None comments.append(comment) return comments @threadable.threadable def subscribe(self): """Subscribes to this work. This function is threadable. Raises: utils.AuthError: Invalid session """ if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only subscribe to a work using an authenticated session") utils.subscribe(self, "Work", self._session) @threadable.threadable def unsubscribe(self): """Unubscribes from this user. This function is threadable. Raises: utils.AuthError: Invalid session """ if not self.is_subscribed: raise Exception("You are not subscribed to this work") if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only unsubscribe from a work using an authenticated session") utils.subscribe(self, "Work", self._session, True, self._sub_id) @cached_property def text(self): """This work's text""" text = "" for chapter in self.chapters: text += chapter.text text += "\n" return text @cached_property def authenticity_token(self): """Token used to take actions that involve this work""" if not self.loaded: return None token = self._soup.find("meta", {"name": "csrf-token"}) return token["content"] @cached_property def is_subscribed(self): """True if you're subscribed to this work""" if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only get a user ID using an authenticated session") ul = self._soup.find("ul", {"class": "work navigation actions"}) input_ = ul.find("li", {"class": "subscribe"}).find("input", {"name": "commit", "value": "Unsubscribe"}) return input_ is not None @cached_property def _sub_id(self): """Returns the subscription ID. Used for unsubscribing""" if self._session is None or not self._session.is_authed: raise utils.AuthError("You can only get a user ID using an authenticated session") ul = self._soup.find("ul", {"class": "work navigation actions"}) id_ = ul.find("li", {"class": "subscribe"}).form.attrs["action"].split("/")[-1] return int(id_) @threadable.threadable def leave_kudos(self): """Leave a "kudos" in this work. This function is threadable. 
Raises: utils.UnexpectedResponseError: Unexpected response received utils.InvalidIdError: Invalid ID (work doesn't exist) utils.AuthError: Invalid session or authenticity token Returns: bool: True if successful, False if you already left kudos there """ if self._session is None: raise utils.AuthError("Invalid session") return utils.kudos(self, self._session) @threadable.threadable def comment(self, comment_text, email="", name="", pseud=None): """Leaves a comment on this work. This function is threadable. Args: comment_text (str): Comment text email (str, optional): Email to add comment. Needed if not logged in. name (str, optional): Name to add comment under. Needed if not logged in. pseud (str, optional): Pseud to add the comment under. Defaults to default pseud. Raises: utils.UnloadedError: Couldn't load chapters utils.AuthError: Invalid session Returns: requests.models.Response: Response object """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded. Have you tried calling Work.reload()?") if self._session is None: raise utils.AuthError("Invalid session") return utils.comment(self, comment_text, self._session, True, email=email, name=name, pseud=pseud) @threadable.threadable def bookmark(self, notes="", tags=None, collections=None, private=False, recommend=False, pseud=None): """Bookmarks this work This function is threadable Args: notes (str, optional): Bookmark notes. Defaults to "". tags (list, optional): What tags to add. Defaults to None. collections (list, optional): What collections to add this bookmark to. Defaults to None. private (bool, optional): Whether this bookmark should be private. Defaults to False. recommend (bool, optional): Whether to recommend this bookmark. Defaults to False. pseud (str, optional): What pseud to add the bookmark under. Defaults to default pseud. Raises: utils.UnloadedError: Work isn't loaded utils.AuthError: Invalid session """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded. Have you tried calling Work.reload()?") if self._session is None: raise utils.AuthError("Invalid session") utils.bookmark(self, self._session, notes, tags, collections, private, recommend, pseud) @threadable.threadable def delete_bookmark(self): """Removes a bookmark from this work This function is threadable Raises: utils.UnloadedError: Work isn't loaded utils.AuthError: Invalid session """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded. Have you tried calling Work.reload()?") if self._session is None: raise utils.AuthError("Invalid session") if self._bookmarkid is None: raise utils.BookmarkError("You don't have a bookmark here") utils.delete_bookmark(self._bookmarkid, self._session, self.authenticity_token) @threadable.threadable def collect(self, collections): """Invites/collects this work to a collection or collections This function is threadable Args: collections (list): What collections to add this work to. Defaults to None. Raises: utils.UnloadedError: Work isn't loaded utils.AuthError: Invalid session """ if not self.loaded: raise utils.UnloadedError("Work isn't loaded. 
Have you tried calling Work.reload()?") if self._session is None: raise utils.AuthError("Invalid session") utils.collect(self, self._session, collections) @cached_property def _bookmarkid(self): form_div = self._soup.find("div", {"id": "bookmark-form"}) if form_div is None: return None if form_div.form is None: return None if "action" in form_div.form.attrs and form_div.form["action"].startswith("/bookmarks"): text = form_div.form["action"].split("/")[-1] if text.isdigit(): return int(text) return None return None @property def loaded(self): """Returns True if this work has been loaded""" return self._soup is not None @property def oneshot(self): """Returns True if this work has only one chapter""" return self.nchapters == 1 @cached_property def series(self): """Returns the series this work belongs to""" from .series import Series dd = self._soup.find("dd", {"class": "series"}) if dd is None: return [] s = [] for span in dd.find_all("span", {"class": "position"}): seriesid = int(span.a.attrs["href"].split("/")[-1]) seriesname = span.a.getText() series = Series(seriesid, self._session, False) setattr(series, "name", seriesname) s.append(series) return s @cached_property def authors(self): """Returns the list of the work's author Returns: list: list of authors """ from .users import User authors = self._soup.find_all("h3", {"class": "byline heading"}) if len(authors) == 0: return [] formatted_authors = authors[0].text.replace("\n", "").split(", ") author_list = [] if authors is not None: for author in formatted_authors: user = User(author, load=False) author_list.append(user) return author_list @cached_property def nchapters(self): """Returns the number of chapters of this work Returns: int: number of chapters """ chapters = self._soup.find("dd", {"class": "chapters"}) if chapters is not None: return int(self.str_format(chapters.string.split("/")[0])) return 0 @cached_property def expected_chapters(self): """Returns the number of expected chapters for this work, or None if the author hasn't provided an expected number Returns: int: number of chapters """ chapters = self._soup.find("dd", {"class": "chapters"}) if chapters is not None: n = self.str_format(chapters.string.split("/")[-1]) if n.isdigit(): return int(n) return None @property def status(self): """Returns the status of this work Returns: str: work status """ return "Completed" if self.nchapters == self.expected_chapters else "Work in Progress" @cached_property def hits(self): """Returns the number of hits this work has Returns: int: number of hits """ hits = self._soup.find("dd", {"class": "hits"}) if hits is not None: return int(self.str_format(hits.string)) return 0 @cached_property def kudos(self): """Returns the number of kudos this work has Returns: int: number of kudos """ kudos = self._soup.find("dd", {"class": "kudos"}) if kudos is not None: return int(self.str_format(kudos.string)) return 0 @cached_property def comments(self): """Returns the number of comments this work has Returns: int: number of comments """ comments = self._soup.find("dd", {"class": "comments"}) if comments is not None: return int(self.str_format(comments.string)) return 0 @cached_property def restricted(self): """Whether this is a restricted work or not Returns: int: True if work is restricted """ return self._soup.find("img", {"title": "Restricted"}) is not None @cached_property def words(self): """Returns the this work's word count Returns: int: number of words """ words = self._soup.find("dd", {"class": "words"}) if words is not None: return 
int(self.str_format(words.string)) return 0 @cached_property def language(self): """Returns this work's language Returns: str: Language """ language = self._soup.find("dd", {"class": "language"}) if language is not None: return language.string.strip() else: return "Unknown" @cached_property def bookmarks(self): """Returns the number of bookmarks this work has Returns: int: number of bookmarks """ bookmarks = self._soup.find("dd", {"class": "bookmarks"}) if bookmarks is not None: return int(self.str_format(bookmarks.string)) return 0 @cached_property def title(self): """Returns the title of this work Returns: str: work title """ title = self._soup.find("div", {"class": "preface group"}) if title is not None: return str(title.h2.text.strip()) return "" @cached_property def date_published(self): """Returns the date this work was published Returns: datetime.datetime: publish date """ dp = self._soup.find("dd", {"class": "published"}).string return datetime(*list(map(int, dp.split("-")))) @cached_property def date_edited(self): """Returns the date this work was last edited Returns: datetime.datetime: edit date """ download = self._soup.find("li", {"class": "download"}) if download is not None and download.ul is not None: timestamp = int(download.ul.a["href"].split("=")[-1]) return datetime.fromtimestamp(timestamp) return self.date_published @cached_property def date_updated(self): """Returns the date this work was last updated Returns: datetime.datetime: update date """ update = self._soup.find("dd", {"class": "status"}) if update is not None: split = update.string.split("-") return datetime(*list(map(int, split))) return self.date_published @cached_property def tags(self): """Returns all the work's tags Returns: list: List of tags """ html = self._soup.find("dd", {"class": "freeform tags"}) tags = [] if html is not None: for tag in html.find_all("li"): tags.append(tag.a.string) return tags @cached_property def characters(self): """Returns all the work's characters Returns: list: List of characters """ html = self._soup.find("dd", {"class": "character tags"}) characters = [] if html is not None: for character in html.find_all("li"): characters.append(character.a.string) return characters @cached_property def relationships(self): """Returns all the work's relationships Returns: list: List of relationships """ html = self._soup.find("dd", {"class": "relationship tags"}) relationships = [] if html is not None: for relationship in html.find_all("li"): relationships.append(relationship.a.string) return relationships @cached_property def fandoms(self): """Returns all the work's fandoms Returns: list: List of fandoms """ html = self._soup.find("dd", {"class": "fandom tags"}) fandoms = [] if html is not None: for fandom in html.find_all("li"): fandoms.append(fandom.a.string) return fandoms @cached_property def categories(self): """Returns all the work's categories Returns: list: List of categories """ html = self._soup.find("dd", {"class": "category tags"}) categories = [] if html is not None: for category in html.find_all("li"): categories.append(category.a.string) return categories @cached_property def warnings(self): """Returns all the work's warnings Returns: list: List of warnings """ html = self._soup.find("dd", {"class": "warning tags"}) warnings = [] if html is not None: for warning in html.find_all("li"): warnings.append(warning.a.string) return warnings @cached_property def rating(self): """Returns this work's rating Returns: str: Rating """ html = self._soup.find("dd", {"class":
"rating tags"}) if html is not None: rating = html.a.string return rating return None @cached_property def summary(self): """Returns this work's summary Returns: str: Summary """ div = self._soup.find("div", {"class": "preface group"}) if div is None: return "" html = div.find("blockquote", {"class": "userstuff"}) if html is None: return "" return str(BeautifulSoup.getText(html)) @cached_property def start_notes(self): """Text from this work's start notes""" notes = self._soup.find("div", {"class": "notes module"}) if notes is None: return "" text = "" for p in notes.findAll("p"): text += p.getText().strip() + "\n" return text @cached_property def end_notes(self): """Text from this work's end notes""" notes = self._soup.find("div", {"id": "work_endnotes"}) if notes is None: return "" text = "" for p in notes.findAll("p"): text += p.getText() + "\n" return text @cached_property def url(self): """Returns the URL to this work Returns: str: work URL """ return f"https://archiveofourown.org/works/{self.id}" @cached_property def complete(self): """ Return True if the work is complete Retuns: bool: True if a work is complete """ chapterStatus = self._soup.find("dd", {"class": "chapters"}).string.split("/") return chapterStatus[0] == chapterStatus[1] @cached_property def collections(self): """Returns all the collections the work belongs to Returns: list: List of collections """ html = self._soup.find("dd", {"class": "collections"}) collections = [] if html is not None: for collection in html.find_all("a"): collections.append(collection.get_text()) return collections def get(self, *args, **kwargs): """Request a web page and return a Response object""" if self._session is None: req = requester.request("get", *args, **kwargs) else: req = requester.request("get", *args, **kwargs, session=self._session.session) if req.status_code == 429: raise utils.HTTPError("We are being rate-limited. Try again in a while or reduce the number of requests") return req def request(self, url): """Request a web page and return a BeautifulSoup object. Args: url (str): Url to request Returns: bs4.BeautifulSoup: BeautifulSoup object representing the requested page's html """ req = self.get(url) if len(req.content) > 650000: warnings.warn("This work is very big and might take a very long time to load") soup = BeautifulSoup(req.content, "lxml") return soup @staticmethod def str_format(string): """Formats a given string Args: string (str): String to format Returns: str: Formatted string """ return string.replace(",", "") ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Francisco Patrcio Rodrigues Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: README.md
================================================

[![Documentation Status](https://readthedocs.org/projects/ao3-api/badge/?version=latest)](https://ao3-api.readthedocs.io/en/latest/?badge=latest)

# AO3 API

This is an unofficial API that lets you access some of AO3's (archiveofourown.org) data through Python.

## Installation

Use the package manager [pip](https://pip.pypa.io/en/stable/) to install AO3 API.

```bash
pip install ao3_api
```

# Github

https://github.com/wendytg/ao3_api

# Usage

This package is divided into 9 core modules: works, chapters, users, series, search, session, comments, extra, and utils.

## Works

One of the most basic things you might want to do with this package is load a work and check its statistics and information. To do that, you'll need the `AO3.Work` class.

We start by finding the _workid_ of the work we want to load. We do that either by using `AO3.utils.workid_from_url(url)` or by just looking at the url ourselves. Let's take a look:

```py3
import AO3

url = "https://archiveofourown.org/works/14392692/chapters/33236241"
workid = AO3.utils.workid_from_url(url)
print(f"Work ID: {workid}")
work = AO3.Work(workid)
print(f"Chapters: {work.nchapters}")
```

After running this snippet, we get the output:

```
Work ID: 14392692
Chapters: 46
```

It's important to note that some works may not be accessible to guest users; in that case you will get 0 chapters as output, and the error `AO3.utils.AuthError: This work is only available to registered users of the Archive` when you try to load the work.

Nonetheless, we can still do a lot more with this Work object. Let's try to get the first 20 words of the second chapter.

```py3
import AO3

work = AO3.Work(14392692)
print(work.chapters[1].title)  # Second chapter name
text = work.chapters[1].text  # Second chapter text
print(' '.join(text.split(" ")[:20]))
```

```
What Branches Grow Meaning
December 27, 2018

Christmas sucked this year, and Shouto’s got the black eye to prove it. Things had started out well enough,
```

The objects in work.chapters are of type `AO3.Chapter`. They have a lot of the same properties as a `Work` object would.

Another thing you can do with the work object is download the entire work as a PDF or e-book. At the moment you can download works as AZW3, EPUB, HTML, MOBI, and PDF files.

```py3
import AO3

work = AO3.Work(14392692)
with open(f"{work.title}.pdf", "wb") as file:
    file.write(work.download("PDF"))
```

__Advanced functionality__

Usually, when you call the constructor for the `Work` class, all info about it is loaded in the `__init__()` function. However, this process takes quite some time (~1-1.5 seconds) and if you want to load a list of works from a series, for example, you might be waiting for upwards of 30 seconds. To avoid this problem, the `Work.reload()` function, called on initialization, is a "threadable" function, which means that if you call it with the argument `threaded=True`, it will return a `Thread` object and work in parallel, meaning you can load multiple works at the same time.
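As an aside, `AO3/threadable.py` also defines a small, undocumented `ThreadPool` class that can cap how many of these threads run at once. The following is a minimal sketch based on its source rather than an official usage pattern; note that `maximum` must be given, since the pool doesn't handle the default of `None`:

```py3
import AO3
from AO3.threadable import ThreadPool

series = AO3.Series(1295090)
pool = ThreadPool(maximum=5)    # never more than 5 loads in flight
for work in series.work_list:
    pool.add_task(work.reload)  # any threadable function can be queued
pool.start()                    # busy-waits until every queued task finishes
```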
Now let's look at a fuller implementation that loads a whole series:

```py3
import AO3
import time

series = AO3.Series(1295090)
works = []
threads = []

start = time.time()
for work in series.work_list:
    works.append(work)
    threads.append(work.reload(threaded=True))
for thread in threads:
    thread.join()
print(f"Loaded {len(works)} works in {round(time.time()-start, 1)} seconds.")
```

`Loaded 29 works in 2.2 seconds.`

The works in `Series.work_list` behave as if they were created with `load=False` in the `Work` constructor, so nothing is fetched until we call `reload()`. In the end, we iterate over every thread and wait for them all to finish using `.join()`. Let's compare this method with the standard way of loading AO3 works:

```py3
import AO3
import time

series = AO3.Series(1295090)
works = []

start = time.time()
for work in series.work_list:
    work.reload()
    works.append(work)
print(f"Loaded {len(works)} works in {round(time.time()-start, 1)} seconds.")
```

`Loaded 29 works in 21.6 seconds.`

As we can see, there is a significant performance increase. There are other functions in this package with this functionality; to see if a function is "threadable", either use `hasattr(function, "_threadable")` or check its `__doc__` string.

To save even more time, if you're only interested in metadata, you can load a work with the `load_chapters` option set to `False`. Also, be aware that some functions (like `Series.work_list` or `Search.results`) might return semi-loaded `Work` objects. This means that no requests have been made to load the work (so you don't have access to chapter text, notes, etc.), but almost all of its metadata will already have been cached, and you might not need to call `Work.reload()` at all.

The last important thing about the `Work` class is that most of its properties (the number of bookmarks, kudos, the authors' names, etc.) are cached: once you access them, the value is stored and won't update, even if the real value changes on AO3. To refresh these values, you need to call `Work.reload()`. See the example below:

```py3
import AO3

sess = AO3.GuestSession()
work = AO3.Work(16721367, sess)
print(work.kudos)
work.leave_kudos()
work.reload()
print(work.kudos)
```

```
392
393
```

## Users

Another useful thing you might want to do is get information on who wrote which works / comments. For that, we use the `AO3.User` class.

```py3
import AO3

user = AO3.User("bothersomepotato")
print(user.url)
print(user.bio)
print(user.works)  # Number of works published
```

```
https://archiveofourown.org/users/bothersomepotato
University student, opening documents to write essays but writing this stuff instead. No regrets though.
My Tumblr, come chat with -or yell at- me if you feel like it! :)
2
```

## Search

To search for works, you can either use the `AO3.search()` function and parse the returned BeautifulSoup object yourself, or use the `AO3.Search` class to do that for you automatically.

```py3
import AO3

search = AO3.Search(any_field="Clarke Lexa", word_count=AO3.utils.Constraint(5000, 15000))
search.update()
print(search.total_results)
for result in search.results:
    print(result)
```

```
3074
```

You can then use the workid to load one of the works you searched for. To get more than the first 20 works, change the page number using

```py3
search.page = 2
```

## Session

A lot of the actions you might want to take require an AO3 account. If you already have one, you can access those actions using an `AO3.Session` object.
You start by logging in with your username and password, and then you can use that object to access restricted content.

```py3
import AO3

session = AO3.Session("username", "password")
print(f"Bookmarks: {session.bookmarks}")
session.refresh_auth_token()
print(session.kudos(AO3.Work(18001499, load=False)))
```

```
Bookmarks: 67
True
```

We successfully left kudos on a work and checked our bookmarks. The `session.refresh_auth_token()` call is needed for some activities, such as leaving kudos and comments. If the token is expired or you forget to call this function, the error `AO3.utils.AuthError: Invalid authentication token. Try calling session.refresh_auth_token()` will be raised.

You can also comment on / leave kudos on a work by calling `Work.leave_kudos()`/`Work.comment()`, provided you have instantiated that object with a session (`AO3.Work(xxxxxx, session=sess)` or `Work.set_session()`). This is probably the best way to do so, because you will run into fewer authentication issues (the work's own authenticity token will be used instead).

If you would prefer to leave a comment or kudos anonymously, you can use an `AO3.GuestSession` the same way you'd use a normal session, except you won't be able to check your bookmarks, subscriptions, etc., because you're not actually logged in.

## Comments

To retrieve and process comment threads, you might want to look at the `Work.get_comments()` method. It returns all the comments in a specific chapter and their respective threads. You can then process them however you want. Let's take a look:

```py3
from time import time

import AO3

work = AO3.Work(24560008)
work.load_chapters()

start = time()
comments = work.get_comments(5)
print(f"Loaded {len(comments)} comment threads in {round(time()-start, 1)} seconds\n")
for comment in comments:
    print(f"Comment ID: {comment.id}\nReplies: {len(comment.get_thread())}")
```

```
Loaded 5 comment threads in 1.8 seconds

Comment ID: 312237184
Replies: 1
Comment ID: 312245032
Replies: 1
Comment ID: 312257098
Replies: 1
Comment ID: 312257860
Replies: 1
Comment ID: 312285673
Replies: 2
```

Loading comments takes a very long time, so you should use it as sparingly as possible. It also sends a lot of requests to the AO3 servers, which might result in the error `utils.HTTPError: We are being rate-limited. Try again in a while or reduce the number of requests`. If that happens, try to space out your requests or reduce their number. There is also the option to enable request limiting using `AO3.utils.limit_requests()`, which makes it so you can't send more than a set number of requests in a certain time window.

You can also reply to comments using the `Comment.reply()` function, or delete one (if it's yours) using `Comment.delete()`.

## Extra

AO3.extra contains the code to download some extra resources that are not core to the functionality of this package and don't change very often. One example is the list of fandoms recognized by AO3. To download a resource, simply use `AO3.extra.download(resource_name)`. To download every resource, you can use `AO3.extra.download_all()`. To see the list of available resources, use `AO3.extra.get_resources()`, as in the sketch below.
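For instance, a minimal sketch using only the calls documented above:

```py3
import AO3

print(AO3.extra.get_resources())  # see which resources are available
AO3.extra.download_all()          # download all of them in one go
```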
# Contact info

For information or bug reports, please create an issue or start a discussion.

# License

[MIT](https://choosealicense.com/licenses/mit/)


================================================
FILE: docs/index.md
================================================
# AO3 API

This is an unofficial Python library that lets you access some of AO3's (archiveofourown.org) data using web scraping and some other tools.

__Documentation__

https://ao3-api.readthedocs.io

__Source code repository and issue tracker__

https://github.com/wendytg/ao3_api

__License__

[MIT](https://choosealicense.com/licenses/mit/)


================================================
FILE: docs/install.md
================================================
# Installation

You can install this package using pip

```pip install ao3_api```

or by cloning the repository and building it from source.

__Requirements__

- BeautifulSoup4
- Requests
- LXML


================================================
FILE: docs/use.md
================================================
# Usage

This package is divided into 9 core modules: works, chapters, users, series, search, session, comments, extra, and utils.

## Works

One of the most basic things you might want to do with this package is load a work and check its statistics and information. To do that, you'll need the `AO3.Work` class.

We start by finding the _workid_ of the work we want to load, either by using `AO3.utils.workid_from_url(url)` or by just looking at the URL ourselves. Let's take a look:

```python
import AO3

url = "https://archiveofourown.org/works/14392692/chapters/33236241"
workid = AO3.utils.workid_from_url(url)
print(f"Work ID: {workid}")
work = AO3.Work(workid)
print(f"Chapters: {work.nchapters}")
```

After running this snippet, we get the output:

```
Work ID: 14392692
Chapters: 46
```

It's important to note that some works may not be accessible to guest users; in that case you will get 0 chapters as output, and the error `AO3.utils.AuthError: This work is only available to registered users of the Archive` will be raised if you try to load it.

Nonetheless, we can still do a lot more with this Work object. Let's try to get the first 20 words of the second chapter.

```python
import AO3

work = AO3.Work(14392692)
print(work.chapters[1].title)  # Second chapter name
text = work.chapters[1].text  # Second chapter text
print(' '.join(text.split(" ")[:20]))
```

```
What Branches Grow Meaning
December 27, 2018

Christmas sucked this year, and Shouto’s got the black eye to prove it. Things had started out well enough,
```

The objects in `work.chapters` are of type `AO3.Chapter`. They have a lot of the same properties as a `Work` object would.

Another thing you can do with the work object is download the entire work as a PDF or e-book. At the moment you can download works as AZW3, EPUB, HTML, MOBI, and PDF files.

```python
import AO3

work = AO3.Work(14392692)
with open(f"{work.title}.pdf", "wb") as file:
    file.write(work.download("PDF"))
```

__Advanced functionality__

Usually, when you call the constructor for the `Work` class, all info about it is loaded in the `__init__()` function. However, this process takes quite some time (~1-1.5 seconds), and if you want to load a list of works from a series, for example, you might be waiting for upwards of 30 seconds. To avoid this problem, the `Work.reload()` function, called on initialization, is a "threadable" function: if you call it with the argument `threaded=True`, it will return a `Thread` object and work in parallel, meaning you can load multiple works at the same time.
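Before the full example, here's a quick sketch of how to confirm that a function supports `threaded=True` (threadable functions are marked with a `_threadable` attribute, as noted further below):

```python
import AO3

# Threadable functions carry a "_threadable" marker attribute
print(hasattr(AO3.Work.reload, "_threadable"))  # True
```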
Let's take a look at an implementation:

```python
import AO3
import time

series = AO3.Series(1295090)
works = []
threads = []

start = time.time()
for work in series.work_list:
    works.append(work)
    threads.append(work.reload(threaded=True))
for thread in threads:
    thread.join()
print(f"Loaded {len(works)} works in {round(time.time()-start, 1)} seconds.")
```

`Loaded 29 works in 2.2 seconds.`

The works in `Series.work_list` behave as if they were created with `load=False` in the `Work` constructor, so nothing is fetched until we call `reload()`. In the end, we iterate over every thread and wait for them all to finish using `.join()`. Let's compare this method with the standard way of loading AO3 works:

```python
import AO3
import time

series = AO3.Series(1295090)
works = []

start = time.time()
for work in series.work_list:
    work.reload()
    works.append(work)
print(f"Loaded {len(works)} works in {round(time.time()-start, 1)} seconds.")
```

`Loaded 29 works in 21.6 seconds.`

As we can see, there is a significant performance increase. There are other functions in this package with this functionality; to see if a function is "threadable", either use the `hasattr(function, "_threadable")` check shown above or look at its `__doc__` string.

To save even more time, if you're only interested in metadata, you can load a work with the `load_chapters` option set to `False`. Also, be aware that some functions (like `Series.work_list` or `Search.results`) might return semi-loaded `Work` objects. This means that no requests have been made to load the work (so you don't have access to chapter text, notes, etc.), but almost all of its metadata will already have been cached, and you might not need to call `Work.reload()` at all.

The last important thing about the `Work` class is that most of its properties (the number of bookmarks, kudos, the authors' names, etc.) are cached: once you access them, the value is stored and won't update, even if the real value changes on AO3. To refresh these values, you need to call `Work.reload()`. See the example below:

```python
import AO3

sess = AO3.GuestSession()
work = AO3.Work(16721367, sess)
print(work.kudos)
work.leave_kudos()
work.reload()
print(work.kudos)
```

```
392
393
```

## Users

Another useful thing you might want to do is get information on who wrote which works / comments. For that, we use the `AO3.User` class.

```python
import AO3

user = AO3.User("bothersomepotato")
print(user.url)
print(user.bio)
print(user.works)  # Number of works published
```

```
https://archiveofourown.org/users/bothersomepotato
University student, opening documents to write essays but writing this stuff instead. No regrets though.
My Tumblr, come chat with -or yell at- me if you feel like it! :)
2
```

## Search

To search for works, you can either use the `AO3.search()` function and parse the returned BeautifulSoup object yourself, or use the `AO3.Search` class to do that for you automatically.

```python
import AO3

search = AO3.Search(any_field="Clarke Lexa", word_count=AO3.utils.Constraint(5000, 15000))
search.update()
print(search.total_results)
for result in search.results:
    print(result)
```

```
3074
```

You can then use the workid to load one of the works you searched for. To get more than the first 20 works, change the page number using

```python
search.page = 2
```

## Session

A lot of the actions you might want to take require an AO3 account. If you have one, you can access those actions using an `AO3.Session` object.
You start by logging in with your username and password, and then you can use that object to access restricted content.

```python
import AO3

session = AO3.Session("username", "password")
print(f"Bookmarks: {session.bookmarks}")
session.refresh_auth_token()
print(session.kudos(AO3.Work(18001499, load=False)))
```

```
Bookmarks: 67
True
```

We successfully left kudos on a work and checked our bookmarks. The `session.refresh_auth_token()` call is needed for some activities, such as leaving kudos and comments. If the token is expired or you forget to call this function, the error `AO3.utils.AuthError: Invalid authentication token. Try calling session.refresh_auth_token()` will be raised.

You can also comment on / leave kudos on a work by calling `Work.leave_kudos()`/`Work.comment()`, provided you have instantiated that object with a session (`AO3.Work(xxxxxx, session=sess)` or `Work.set_session()`). This is probably the best way to do so, because you will run into fewer authentication issues (the work's own authenticity token will be used instead).

If you would prefer to leave a comment or kudos anonymously, you can use an `AO3.GuestSession` the same way you'd use a normal session, except you won't be able to check your bookmarks, subscriptions, etc., because you're not actually logged in.

## Comments

To retrieve and process comment threads, you might want to look at the `Work.get_comments()` method. It returns all the comments in a specific chapter and their respective threads. You can then process them however you want. Let's take a look:

```python
from time import time

import AO3

work = AO3.Work(24560008)
work.load_chapters()

start = time()
comments = work.get_comments(5)
print(f"Loaded {len(comments)} comment threads in {round(time()-start, 1)} seconds\n")
for comment in comments:
    print(f"Comment ID: {comment.id}\nReplies: {len(comment.get_thread())}")
```

```
Loaded 5 comment threads in 1.8 seconds

Comment ID: 312237184
Replies: 1
Comment ID: 312245032
Replies: 1
Comment ID: 312257098
Replies: 1
Comment ID: 312257860
Replies: 1
Comment ID: 312285673
Replies: 2
```

Loading comments takes a very long time, so you should use it as sparingly as possible. It also sends a lot of requests to the AO3 servers, which might result in the error `utils.HTTPError: We are being rate-limited. Try again in a while or reduce the number of requests`. If that happens, try to space out your requests or reduce their number. There is also the option to enable request limiting using `AO3.utils.limit_requests()`, which makes it so you can't send more than a set number of requests in a certain time window.

You can also reply to comments using the `Comment.reply()` function, or delete one (if it's yours) using `Comment.delete()`.

## Extra

AO3.extra contains the code to download some extra resources that are not core to the functionality of this package and don't change very often. One example is the list of fandoms recognized by AO3. To download a resource, simply use `AO3.extra.download(resource_name)`. To download every resource, you can use `AO3.extra.download_all()`. To see the list of available resources, use `AO3.extra.get_resources()`, as in the sketch below.
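A short sketch (the resource name passed to `download()` is only a placeholder here; run `get_resources()` to find the real names):

```python
import AO3

print(AO3.extra.get_resources())  # list the resources this package knows about
# "fandoms" below is a hypothetical resource name; pick one from the output above
AO3.extra.download("fandoms")
```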
================================================ FILE: mkdocs.yml ================================================ site_name: AO3 API docs_dir: docs theme: readthedocs nav: - Home: index.md - Installation: install.md - Usage: use.md ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] name = "ao3-api" version = "2.3.1" authors = [ { name="Wendy" }, ] description = "An unofficial AO3 (archiveofourown.org) API" readme = "README.md" requires-python = ">=3.8" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] keywords = ["ao3", "fanfiction", "Archive of Our Own"] dependencies = [ "BeautifulSoup4", "lxml", "requests" ] [project.urls] Homepage = "https://github.com/wendytg/ao3_api" Issues = "https://github.com/wendytg/ao3_api/issues" Documentation = "https://ao3-api.readthedocs.io/"