[
  {
    "path": ".github/FUNDING.yml",
    "content": "# These are supported funding model platforms\n\ngithub: [alirezamika] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]\npatreon: # Replace with a single Patreon username\nopen_collective: # Replace with a single Open Collective username\nko_fi: # Replace with a single Ko-fi username\ntidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel\ncommunity_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry\nliberapay: # Replace with a single Liberapay username\nissuehunt: # Replace with a single IssueHunt username\notechie: # Replace with a single Otechie username\ncustom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']\n"
  },
  {
    "path": ".github/workflows/python-publish.yml",
    "content": "# This workflow will upload a Python Package using Twine when a release is created\n# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries\n\nname: Upload Python Package\n\non:\n  release:\n    types: [created]\n\njobs:\n  deploy:\n\n    runs-on: ubuntu-latest\n\n    steps:\n    - uses: actions/checkout@v2\n    - name: Set up Python\n      uses: actions/setup-python@v2\n      with:\n        python-version: '3.x'\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install setuptools wheel twine pytest\n        pip install .\n    - name: Run tests\n      run: |\n        pytest -q\n    - name: Build and publish\n      env:\n        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}\n        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}\n      run: |\n        python setup.py sdist bdist_wheel\n        twine upload dist/*\n"
  },
  {
    "path": ".github/workflows/stale-issues.yml",
    "content": "name: Close inactive issues\non:\n  schedule:\n    - cron: \"30 1 * * *\"\n\njobs:\n  close-issues:\n    runs-on: ubuntu-latest\n    permissions:\n      issues: write\n      pull-requests: write\n    steps:\n      - uses: actions/stale@v5\n        with:\n          days-before-issue-stale: 30\n          days-before-issue-close: 14\n          stale-issue-label: \"stale\"\n          stale-issue-message: \"This issue is stale because it has been open for 30 days with no activity.\"\n          close-issue-message: \"This issue was closed because it has been inactive for 14 days since being marked as stale.\"\n          days-before-pr-stale: 30\n          days-before-pr-close: 14\n          repo-token: ${{ secrets.GITHUB_TOKEN }}\n"
  },
  {
    "path": ".github/workflows/tests.yml",
    "content": "name: Run Tests\n\non:\n  push:\n  release:\n    types: [created]\n\njobs:\n  test:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@v2\n    - name: Set up Python\n      uses: actions/setup-python@v2\n      with:\n        python-version: '3.x'\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install pytest\n        pip install .\n    - name: Run tests\n      run: pytest -q\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n.idea/\n.vscode/\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# dotenv\n.env\n\n# virtualenv\n.venv\nvenv/\nENV/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2020 Alireza Mika\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# AutoScraper: A Smart, Automatic, Fast and Lightweight Web Scraper for Python\n\n![img](https://user-images.githubusercontent.com/17881612/91968083-5ee92080-ed29-11ea-82ec-d99ec85367a5.png)\n\nThis project is made for automatic web scraping to make scraping easy. \nIt gets a url or the html content of a web page and a list of sample data which we want to scrape from that page. **This data can be text, url or any html tag value of that page.** It learns the scraping rules and returns the similar elements. Then you can use this learned object with new urls to get similar content or the exact same element of those new pages.\n\n\n## Installation\n\nIt's compatible with python 3.\n\n- Install latest version from git repository using pip:\n```bash\n$ pip install git+https://github.com/alirezamika/autoscraper.git\n```\n\n- Install from PyPI:\n```bash\n$ pip install autoscraper\n```\n\n- Install from source:\n```bash\n$ python setup.py install\n```\n\n## How to use\n\n### Getting similar results\n\nSay we want to fetch all related post titles in a stackoverflow page:\n\n```python\nfrom autoscraper import AutoScraper\n\nurl = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'\n\n# We can add one or multiple candidates here.\n# You can also put urls here to retrieve urls.\nwanted_list = [\"What are metaclasses in Python?\"]\n\nscraper = AutoScraper()\nresult = scraper.build(url, wanted_list)\nprint(result)\n```\n\nHere's the output:\n```python\n[\n    'How do I merge two dictionaries in a single expression in Python (taking union of dictionaries)?', \n    'How to call an external command?', \n    'What are metaclasses in Python?', \n    'Does Python have a ternary conditional operator?', \n    'How do you remove duplicates from a list whilst preserving order?', \n    'Convert bytes to a string', \n    'How to get line count of a large file cheaply in Python?', \n    \"Does Python have a string 'contains' substring method?\", \n    'Why 
is “1000000000000000 in range(1000000000000001)” so fast in Python 3?'\n]\n```\nNow you can use the `scraper` object to get related topics of any stackoverflow page:\n```python\nscraper.get_result_similar('https://stackoverflow.com/questions/606191/convert-bytes-to-a-string')\n```\n\n### Getting exact result\n\nSay we want to scrape live stock prices from yahoo finance:\n\n```python\nfrom autoscraper import AutoScraper\n\nurl = 'https://finance.yahoo.com/quote/AAPL/'\n\nwanted_list = [\"124.81\"]\n\nscraper = AutoScraper()\n\n# Here we can also pass html content via the html parameter instead of the url (html=html_content)\nresult = scraper.build(url, wanted_list)\nprint(result)\n```\nNote that you should update the `wanted_list` if you want to copy this code, as the content of the page dynamically changes.\n\nYou can also pass any custom `requests` module parameter. for example you may want to use proxies or custom headers:\n\n```python\nproxies = {\n    \"http\": 'http://127.0.0.1:8001',\n    \"https\": 'https://127.0.0.1:8001',\n}\n\nresult = scraper.build(url, wanted_list, request_args=dict(proxies=proxies))\n```\n\nNow we can get the price of any symbol:\n\n```python\nscraper.get_result_exact('https://finance.yahoo.com/quote/MSFT/')\n```\n\n**You may want to get other info as well.** For example if you want to get market cap too, you can just append it to the wanted list. 
By using the `get_result_exact` method, the data is retrieved in the exact same order as in the wanted list.\n\n**Another example:** Say we want to scrape the about text, number of stars and the link to issues of GitHub repo pages:\n\n```python\nfrom autoscraper import AutoScraper\n\nurl = 'https://github.com/alirezamika/autoscraper'\n\nwanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '6.2k', 'https://github.com/alirezamika/autoscraper/issues']\n\nscraper = AutoScraper()\nscraper.build(url, wanted_list)\n```\n\nSimple, right?\n\n\n### Saving the model\n\nWe can now save the built model to use it later. To save:\n\n```python\n# Give it a file path\nscraper.save('yahoo-finance')\n```\n\nAnd to load:\n\n```python\nscraper.load('yahoo-finance')\n```\n\n## Tutorials\n\n- See [this gist](https://gist.github.com/alirezamika/72083221891eecd991bbc0a2a2467673) for more advanced usage.\n- [AutoScraper and Flask: Create an API From Any Website in Less Than 5 Minutes](https://medium.com/better-programming/autoscraper-and-flask-create-an-api-from-any-website-in-less-than-5-minutes-3f0f176fc4a3)\n\n## Issues\nFeel free to open an issue if you have any problems using the module.\n\n\n## Support the project\n\n<a href=\"https://www.buymeacoffee.com/alirezam\" target=\"_blank\"><img src=\"https://cdn.buymeacoffee.com/buttons/v2/default-black.png\" alt=\"Buy Me A Coffee\" height=\"45\" width=\"163\" ></a>\n\n\n#### Happy Coding  ♥️\n"
  },
  {
    "path": "autoscraper/__init__.py",
    "content": "from autoscraper.auto_scraper import AutoScraper\n"
  },
  {
    "path": "autoscraper/auto_scraper.py",
    "content": "import hashlib\nimport json\nfrom collections import defaultdict\nfrom html import unescape\nfrom urllib.parse import urljoin, urlparse\n\nimport requests\nfrom bs4 import BeautifulSoup\n\nfrom autoscraper.utils import (\n    FuzzyText,\n    ResultItem,\n    get_non_rec_text,\n    normalize,\n    text_match,\n    unique_hashable,\n    unique_stack_list,\n)\n\n\nclass AutoScraper(object):\n    \"\"\"\n    AutoScraper : A Smart, Automatic, Fast and Lightweight Web Scraper for Python.\n    AutoScraper automatically learns a set of rules required to extract the needed content\n        from a web page. So the programmer doesn't need to explicitly construct the rules.\n\n    Attributes\n    ----------\n    stack_list: list\n        List of rules learned by AutoScraper\n\n    Methods\n    -------\n    build() - Learns a set of rules represented as stack_list based on the wanted_list,\n        which can be reused for scraping similar elements from other web pages in the future.\n    get_result_similar() - Gets similar results based on the previously learned rules.\n    get_result_exact() - Gets exact results based on the previously learned rules.\n    get_results() - Gets exact and similar results based on the previously learned rules.\n    save() - Serializes the stack_list as JSON and saves it to disk.\n    load() - De-serializes the JSON representation of the stack_list and loads it back.\n    remove_rules() - Removes one or more learned rule[s] from the stack_list.\n    keep_rules() - Keeps only the specified learned rules in the stack_list and removes the others.\n    \"\"\"\n\n    request_headers = {\n        \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \\\n            (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36\"\n    }\n\n    def __init__(self, stack_list=None):\n        self.stack_list = stack_list or []\n\n    def save(self, file_path):\n        \"\"\"\n        Serializes the stack_list as JSON 
and saves it to the disk.\n\n        Parameters\n        ----------\n        file_path: str\n            Path of the JSON output\n\n        Returns\n        -------\n        None\n        \"\"\"\n\n        data = dict(stack_list=self.stack_list)\n        with open(file_path, \"w\") as f:\n            json.dump(data, f)\n\n    def load(self, file_path):\n        \"\"\"\n        De-serializes the JSON representation of the stack_list and loads it back.\n\n        Parameters\n        ----------\n        file_path: str\n            Path of the JSON file to load stack_list from.\n\n        Returns\n        -------\n        None\n        \"\"\"\n\n        with open(file_path, \"r\") as f:\n            data = json.load(f)\n\n        # for backward compatibility\n        if isinstance(data, list):\n            self.stack_list = data\n            return\n\n        self.stack_list = data[\"stack_list\"]\n\n    @classmethod\n    def _fetch_html(cls, url, request_args=None):\n        request_args = request_args or {}\n        headers = dict(cls.request_headers)\n        if url:\n            headers[\"Host\"] = urlparse(url).netloc\n\n        user_headers = request_args.pop(\"headers\", {})\n        headers.update(user_headers)\n        res = requests.get(url, headers=headers, **request_args)\n        if res.encoding == \"ISO-8859-1\" and not \"ISO-8859-1\" in res.headers.get(\n            \"Content-Type\", \"\"\n        ):\n            res.encoding = res.apparent_encoding\n        html = res.text\n        return html\n\n    @classmethod\n    def _get_soup(cls, url=None, html=None, request_args=None):\n        if html:\n            html = normalize(unescape(html))\n            return BeautifulSoup(html, \"lxml\")\n\n        html = cls._fetch_html(url, request_args)\n        html = normalize(unescape(html))\n\n        return BeautifulSoup(html, \"lxml\")\n\n    @staticmethod\n    def _get_valid_attrs(item):\n        key_attrs = {\"class\", \"style\"}\n        attrs = {\n         
   k: v if v != [] else \"\" for k, v in item.attrs.items() if k in key_attrs\n        }\n\n        for attr in key_attrs:\n            if attr not in attrs:\n                attrs[attr] = \"\"\n        return attrs\n\n    @staticmethod\n    def _child_has_text(child, text, url, text_fuzz_ratio):\n        child_text = child.getText().strip()\n\n        if text_match(text, child_text, text_fuzz_ratio):\n            parent_text = child.parent.getText().strip()\n            if child_text == parent_text and child.parent.parent:\n                return False\n\n            child.wanted_attr = None\n            return True\n\n        if text_match(text, get_non_rec_text(child), text_fuzz_ratio):\n            child.is_non_rec_text = True\n            child.wanted_attr = None\n            return True\n\n        for key, value in child.attrs.items():\n            if not isinstance(value, str):\n                continue\n\n            value = value.strip()\n            if text_match(text, value, text_fuzz_ratio):\n                child.wanted_attr = key\n                return True\n\n            if key in {\"href\", \"src\"}:\n                full_url = urljoin(url, value)\n                if text_match(text, full_url, text_fuzz_ratio):\n                    child.wanted_attr = key\n                    child.is_full_url = True\n                    return True\n\n        return False\n\n    def _get_children(self, soup, text, url, text_fuzz_ratio):\n        children = reversed(soup.findChildren())\n        children = [\n            x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)\n        ]\n        return children\n\n    def build(\n        self,\n        url=None,\n        wanted_list=None,\n        wanted_dict=None,\n        html=None,\n        request_args=None,\n        update=False,\n        text_fuzz_ratio=1.0,\n    ):\n        \"\"\"\n        Automatically constructs a set of rules to scrape the specified target[s] from a web page.\n          
  The rules are represented as stack_list.\n\n        Parameters:\n        ----------\n        url: str, optional\n            URL of the target web page. You should either pass url or html or both.\n\n        wanted_list: list of strings or compiled regular expressions, optional\n            A list of needed contents to be scraped.\n                AutoScraper learns a set of rules to scrape these targets. If specified,\n                wanted_dict will be ignored.\n\n        wanted_dict: dict, optional\n            A dict of needed contents to be scraped. Keys are aliases and values are list of target texts\n                or compiled regular expressions.\n                AutoScraper learns a set of rules to scrape these targets and sets its aliases.\n\n        html: str, optional\n            An HTML string can also be passed instead of URL.\n                You should either pass url or html or both.\n\n        request_args: dict, optional\n            A dictionary used to specify a set of additional request parameters used by requests\n                module. 
You can specify proxy URLs, custom headers etc.\n\n        update: bool, optional, defaults to False\n            If True, new learned rules will be added to the previous ones.\n            If False, all previously learned rules will be removed.\n\n        text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0\n            The fuzziness ratio threshold for matching the wanted contents.\n\n        Returns:\n        --------\n        List of similar results\n        \"\"\"\n\n        if not wanted_list and not (wanted_dict and any(wanted_dict.values())):\n            raise ValueError(\"No targets were supplied\")\n\n        soup = self._get_soup(url=url, html=html, request_args=request_args)\n\n        result_list = []\n\n        if update is False:\n            self.stack_list = []\n\n        if wanted_list:\n            wanted_dict = {\"\": wanted_list}\n\n        wanted_list = []\n\n        for alias, wanted_items in wanted_dict.items():\n            wanted_items = [normalize(w) for w in wanted_items]\n            wanted_list += wanted_items\n\n            for wanted in wanted_items:\n                children = self._get_children(soup, wanted, url, text_fuzz_ratio)\n\n                for child in children:\n                    result, stack = self._get_result_for_child(child, soup, url)\n                    stack[\"alias\"] = alias\n                    result_list += result\n                    self.stack_list.append(stack)\n\n        result_list = [item.text for item in result_list]\n        result_list = unique_hashable(result_list)\n\n        self.stack_list = unique_stack_list(self.stack_list)\n        return result_list\n\n    @classmethod\n    def _build_stack(cls, child, url):\n        content = [(child.name, cls._get_valid_attrs(child))]\n\n        parent = child\n        while True:\n            grand_parent = parent.findParent()\n            if not grand_parent:\n                break\n\n            children = grand_parent.findAll(\n           
     parent.name, cls._get_valid_attrs(parent), recursive=False\n            )\n            for i, c in enumerate(children):\n                if c == parent:\n                    content.insert(\n                        0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i)\n                    )\n                    break\n\n            if not grand_parent.parent:\n                break\n\n            parent = grand_parent\n\n        wanted_attr = getattr(child, \"wanted_attr\", None)\n        is_full_url = getattr(child, \"is_full_url\", False)\n        is_non_rec_text = getattr(child, \"is_non_rec_text\", False)\n        stack = dict(\n            content=content,\n            wanted_attr=wanted_attr,\n            is_full_url=is_full_url,\n            is_non_rec_text=is_non_rec_text,\n        )\n        stack[\"url\"] = url if is_full_url else \"\"\n        stack[\"hash\"] = hashlib.sha256(str(stack).encode(\"utf-8\")).hexdigest()\n        stack[\"stack_id\"] = \"rule_\" + stack[\"hash\"][:8]\n        return stack\n\n    def _get_result_for_child(self, child, soup, url):\n        stack = self._build_stack(child, url)\n        result = self._get_result_with_stack(stack, soup, url, 1.0)\n        return result, stack\n\n    @staticmethod\n    def _fetch_result_from_child(child, wanted_attr, is_full_url, url, is_non_rec_text):\n        if wanted_attr is None:\n            if is_non_rec_text:\n                return get_non_rec_text(child)\n            return child.getText().strip()\n\n        if wanted_attr not in child.attrs:\n            return None\n\n        if is_full_url:\n            return urljoin(url, child.attrs[wanted_attr])\n\n        return child.attrs[wanted_attr]\n\n    @staticmethod\n    def _get_fuzzy_attrs(attrs, attr_fuzz_ratio):\n        attrs = dict(attrs)\n        for key, val in attrs.items():\n            if isinstance(val, str) and val:\n                val = FuzzyText(val, attr_fuzz_ratio)\n            elif isinstance(val, (list, 
tuple)):\n                val = [FuzzyText(x, attr_fuzz_ratio) if x else x for x in val]\n            attrs[key] = val\n        return attrs\n\n    def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs):\n        parents = [soup]\n        stack_content = stack[\"content\"]\n        contain_sibling_leaves = kwargs.get(\"contain_sibling_leaves\", False)\n        for index, item in enumerate(stack_content):\n            children = []\n            if item[0] == \"[document]\":\n                continue\n            for parent in parents:\n\n                attrs = item[1]\n                if attr_fuzz_ratio < 1.0:\n                    attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio)\n\n                found = parent.findAll(item[0], attrs, recursive=False)\n                if not found:\n                    continue\n\n                if not contain_sibling_leaves and index == len(stack_content) - 1:\n                    idx = min(len(found) - 1, stack_content[index - 1][2])\n                    found = [found[idx]]\n\n                children += found\n\n            parents = children\n\n        wanted_attr = stack[\"wanted_attr\"]\n        is_full_url = stack[\"is_full_url\"]\n        is_non_rec_text = stack.get(\"is_non_rec_text\", False)\n        result = [\n            ResultItem(\n                self._fetch_result_from_child(\n                    i, wanted_attr, is_full_url, url, is_non_rec_text\n                ),\n                getattr(i, \"child_index\", 0),\n            )\n            for i in parents\n        ]\n        if not kwargs.get(\"keep_blank\", False):\n            result = [x for x in result if x.text]\n        return result\n\n    def _get_result_with_stack_index_based(\n        self, stack, soup, url, attr_fuzz_ratio, **kwargs\n    ):\n        p = soup.findChildren(recursive=False)[0]\n        stack_content = stack[\"content\"]\n        for index, item in enumerate(stack_content[:-1]):\n            if item[0] == 
\"[document]\":\n                continue\n            content = stack_content[index + 1]\n            attrs = content[1]\n            if attr_fuzz_ratio < 1.0:\n                attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio)\n            p = p.findAll(content[0], attrs, recursive=False)\n            if not p:\n                return []\n            idx = min(len(p) - 1, item[2])\n            p = p[idx]\n\n        result = [\n            ResultItem(\n                self._fetch_result_from_child(\n                    p,\n                    stack[\"wanted_attr\"],\n                    stack[\"is_full_url\"],\n                    url,\n                    stack[\"is_non_rec_text\"],\n                ),\n                getattr(p, \"child_index\", 0),\n            )\n        ]\n        if not kwargs.get(\"keep_blank\", False):\n            result = [x for x in result if x.text]\n        return result\n\n    def _get_result_by_func(\n        self,\n        func,\n        url,\n        html,\n        soup,\n        request_args,\n        grouped,\n        group_by_alias,\n        unique,\n        attr_fuzz_ratio,\n        **kwargs\n    ):\n        if not soup:\n            soup = self._get_soup(url=url, html=html, request_args=request_args)\n\n        keep_order = kwargs.get(\"keep_order\", False)\n\n        if group_by_alias or (keep_order and not grouped):\n            for index, child in enumerate(soup.findChildren()):\n                setattr(child, \"child_index\", index)\n\n        result_list = []\n        grouped_result = defaultdict(list)\n        for stack in self.stack_list:\n            if not url:\n                url = stack.get(\"url\", \"\")\n\n            result = func(stack, soup, url, attr_fuzz_ratio, **kwargs)\n\n            if not grouped and not group_by_alias:\n                result_list += result\n                continue\n\n            group_id = stack.get(\"alias\", \"\") if group_by_alias else stack[\"stack_id\"]\n            
grouped_result[group_id] += result\n\n        return self._clean_result(\n            result_list, grouped_result, grouped, group_by_alias, unique, keep_order\n        )\n\n    @staticmethod\n    def _clean_result(\n        result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order\n    ):\n        if not grouped and not grouped_by_alias:\n            if unique is None:\n                unique = True\n            if keep_order:\n                result_list = sorted(result_list, key=lambda x: x.index)\n            result = [x.text for x in result_list]\n            if unique:\n                result = unique_hashable(result)\n            return result\n\n        for k, val in grouped_result.items():\n            if grouped_by_alias:\n                val = sorted(val, key=lambda x: x.index)\n            val = [x.text for x in val]\n            if unique:\n                val = unique_hashable(val)\n            grouped_result[k] = val\n\n        return dict(grouped_result)\n\n    def get_result_similar(\n        self,\n        url=None,\n        html=None,\n        soup=None,\n        request_args=None,\n        grouped=False,\n        group_by_alias=False,\n        unique=None,\n        attr_fuzz_ratio=1.0,\n        keep_blank=False,\n        keep_order=False,\n        contain_sibling_leaves=False,\n    ):\n        \"\"\"\n        Gets similar results based on the previously learned rules.\n\n        Parameters:\n        ----------\n        url: str, optional\n            URL of the target web page. You should either pass url or html or both.\n\n        html: str, optional\n            An HTML string can also be passed instead of URL.\n                You should either pass url or html or both.\n\n        request_args: dict, optional\n            A dictionary used to specify a set of additional request parameters used by requests\n                module. 
You can specify proxy URLs, custom headers etc.\n\n        grouped: bool, optional, defaults to False\n            If set to True, the result will be a dictionary with the rule_ids as keys\n                and a list of scraped data per rule as values.\n\n        group_by_alias: bool, optional, defaults to False\n            If set to True, the result will be a dictionary with the rule alias as keys\n                and a list of scraped data per alias as values.\n\n        unique: bool, optional, defaults to True for non grouped results and\n                False for grouped results.\n            If set to True, will remove duplicates from returned result list.\n\n        attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0\n            The fuzziness ratio threshold for matching html tag attributes.\n\n        keep_blank: bool, optional, defaults to False\n            If set to True, missing values will be returned as empty strings.\n\n        keep_order: bool, optional, defaults to False\n            If set to True, the results will be ordered as they are present on the web page.\n\n        contain_sibling_leaves: bool, optional, defaults to False\n            If set to True, the results will also contain the sibling leaves of the wanted elements.\n\n        Returns:\n        --------\n        List of similar results scraped from the web page.\n        Dictionary if grouped=True or group_by_alias=True.\n        \"\"\"\n\n        func = self._get_result_with_stack\n        return self._get_result_by_func(\n            func,\n            url,\n            html,\n            soup,\n            request_args,\n            grouped,\n            group_by_alias,\n            unique,\n            attr_fuzz_ratio,\n            keep_blank=keep_blank,\n            keep_order=keep_order,\n            contain_sibling_leaves=contain_sibling_leaves,\n        )\n\n    def get_result_exact(\n        self,\n        url=None,\n        html=None,\n        soup=None,\n    
    request_args=None,\n        grouped=False,\n        group_by_alias=False,\n        unique=None,\n        attr_fuzz_ratio=1.0,\n        keep_blank=False,\n    ):\n        \"\"\"\n        Gets exact results based on the previously learned rules.\n\n        Parameters:\n        ----------\n        url: str, optional\n            URL of the target web page. You should either pass url or html or both.\n\n        html: str, optional\n            An HTML string can also be passed instead of URL.\n                You should either pass url or html or both.\n\n        request_args: dict, optional\n            A dictionary used to specify a set of additional request parameters used by requests\n                module. You can specify proxy URLs, custom headers etc.\n\n        grouped: bool, optional, defaults to False\n            If set to True, the result will be a dictionary with the rule_ids as keys\n                and a list of scraped data per rule as values.\n\n        group_by_alias: bool, optional, defaults to False\n            If set to True, the result will be a dictionary with the rule alias as keys\n                and a list of scraped data per alias as values.\n\n        unique: bool, optional, defaults to True for non grouped results and\n                False for grouped results.\n            If set to True, will remove duplicates from returned result list.\n\n        attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0\n            The fuzziness ratio threshold for matching html tag attributes.\n\n        keep_blank: bool, optional, defaults to False\n            If set to True, missing values will be returned as empty strings.\n\n        Returns:\n        --------\n        List of exact results scraped from the web page.\n        Dictionary if grouped=True or group_by_alias=True.\n        \"\"\"\n\n        func = self._get_result_with_stack_index_based\n        return self._get_result_by_func(\n            func,\n            url,\n       
     html,\n            soup,\n            request_args,\n            grouped,\n            group_by_alias,\n            unique,\n            attr_fuzz_ratio,\n            keep_blank=keep_blank,\n        )\n\n    def get_result(\n        self,\n        url=None,\n        html=None,\n        request_args=None,\n        grouped=False,\n        group_by_alias=False,\n        unique=None,\n        attr_fuzz_ratio=1.0,\n    ):\n        \"\"\"\n        Gets similar and exact results based on the previously learned rules.\n\n        Parameters:\n        ----------\n        url: str, optional\n            URL of the target web page. You should either pass url or html or both.\n\n        html: str, optional\n            An HTML string can also be passed instead of URL.\n                You should either pass url or html or both.\n\n        request_args: dict, optional\n            A dictionary used to specify a set of additional request parameters used by requests\n                module. You can specify proxy URLs, custom headers etc.\n\n        grouped: bool, optional, defaults to False\n            If set to True, the result will be dictionaries with the rule_ids as keys\n                and a list of scraped data per rule as values.\n\n        group_by_alias: bool, optional, defaults to False\n            If set to True, the result will be a dictionary with the rule alias as keys\n                and a list of scraped data per alias as values.\n\n        unique: bool, optional, defaults to True for non grouped results and\n                False for grouped results.\n            If set to True, will remove duplicates from returned result list.\n\n        attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0\n            The fuzziness ratio threshold for matching html tag attributes.\n\n        Returns:\n        --------\n        Pair of (similar, exact) results.\n        See get_result_similar and get_result_exact methods.\n        \"\"\"\n\n        soup = 
self._get_soup(url=url, html=html, request_args=request_args)\n        args = dict(\n            url=url,\n            soup=soup,\n            grouped=grouped,\n            group_by_alias=group_by_alias,\n            unique=unique,\n            attr_fuzz_ratio=attr_fuzz_ratio,\n        )\n        similar = self.get_result_similar(**args)\n        exact = self.get_result_exact(**args)\n        return similar, exact\n\n    def remove_rules(self, rules):\n        \"\"\"\n        Removes a list of learned rules from stack_list.\n\n        Parameters:\n        ----------\n        rules : list\n            A list of rules to be removed\n\n        Returns:\n        --------\n        None\n        \"\"\"\n\n        self.stack_list = [x for x in self.stack_list if x[\"stack_id\"] not in rules]\n\n    def keep_rules(self, rules):\n        \"\"\"\n        Removes all other rules except the specified ones.\n\n        Parameters:\n        ----------\n        rules : list\n            A list of rules to keep in stack_list and removing the rest.\n\n        Returns:\n        --------\n        None\n        \"\"\"\n\n        self.stack_list = [x for x in self.stack_list if x[\"stack_id\"] in rules]\n\n    def set_rule_aliases(self, rule_aliases):\n        \"\"\"\n        Sets the specified alias for each rule\n\n        Parameters:\n        ----------\n        rule_aliases : dict\n            A dictionary with keys of rule_id and values of alias\n\n        Returns:\n        --------\n        None\n        \"\"\"\n\n        id_to_stack = {stack[\"stack_id\"]: stack for stack in self.stack_list}\n        for rule_id, alias in rule_aliases.items():\n            id_to_stack[rule_id][\"alias\"] = alias\n\n    def generate_python_code(self):\n        # deprecated\n        print(\"This function is deprecated. Please use save() and load() instead.\")\n"
  },
  {
    "path": "autoscraper/utils.py",
    "content": "from collections import OrderedDict\n\nimport unicodedata\n\nfrom difflib import SequenceMatcher\n\n\ndef unique_stack_list(stack_list):\n    seen = set()\n    unique_list = []\n    for stack in stack_list:\n        stack_hash = stack['hash']\n        if stack_hash in seen:\n            continue\n        unique_list.append(stack)\n        seen.add(stack_hash)\n    return unique_list\n\n\ndef unique_hashable(hashable_items):\n    \"\"\"Removes duplicates from the list. Must preserve the orders.\"\"\"\n    return list(OrderedDict.fromkeys(hashable_items))\n\n\ndef get_non_rec_text(element):\n    return ''.join(element.find_all(text=True, recursive=False)).strip()\n\n\ndef normalize(item):\n    if not isinstance(item, str):\n        return item\n    return unicodedata.normalize(\"NFKD\", item.strip())\n\n\ndef text_match(t1, t2, ratio_limit):\n    if hasattr(t1, 'fullmatch'):\n        return bool(t1.fullmatch(t2))\n    if ratio_limit >= 1:\n        return t1 == t2\n    return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit\n\n\nclass ResultItem():\n    def __init__(self, text, index):\n        self.text = text\n        self.index = index\n\n    def __str__(self):\n        return self.text\n\n\nclass FuzzyText(object):\n    def __init__(self, text, ratio_limit):\n        self.text = text\n        self.ratio_limit = ratio_limit\n        self.match = None\n\n    def search(self, text):\n        return SequenceMatcher(None, self.text, text).ratio() >= self.ratio_limit\n"
  },
  {
    "path": "setup.py",
    "content": "from codecs import open\nfrom os import path\n\nfrom setuptools import find_packages, setup\n\nhere = path.abspath(path.dirname(__file__))\n\nwith open(path.join(here, \"README.md\"), encoding=\"utf-8\") as f:\n    long_description = f.read()\n\nsetup(\n    name=\"autoscraper\",\n    version=\"1.1.14\",\n    description=\"A Smart, Automatic, Fast and Lightweight Web Scraper for Python\",\n    long_description_content_type=\"text/markdown\",\n    long_description=long_description,\n    url=\"https://github.com/alirezamika/autoscraper\",\n    author=\"Alireza Mika\",\n    author_email=\"alirezamika@gmail.com\",\n    license=\"MIT\",\n    classifiers=[\n        \"Development Status :: 4 - Beta\",\n        \"License :: OSI Approved :: MIT License\",\n        \"Programming Language :: Python :: 3\",\n    ],\n    keywords=\"scraping - scraper\",\n    packages=find_packages(exclude=[\"contrib\", \"docs\", \"tests\"]),\n    python_requires=\">=3.6\",\n    install_requires=[\"requests\", \"bs4\", \"lxml\"],\n)\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/conftest.py",
    "content": "import sys\nfrom types import ModuleType\nfrom html.parser import HTMLParser\n\nclass _Node:\n    def __init__(self, name, attrs, parent=None):\n        self.name = name\n        self.attrs = dict(attrs)\n        self.parent = parent\n        self.children = []\n        self.text = \"\"\n\n    def append_child(self, child):\n        self.children.append(child)\n        child.parent = self\n\n    def getText(self):\n        return self.text + \"\".join(c.getText() for c in self.children)\n\n    def findChildren(self, recursive=True):\n        result = []\n        for child in self.children:\n            result.append(child)\n            if recursive:\n                result.extend(child.findChildren(recursive))\n        return result\n\n    def findParent(self):\n        return self.parent\n\n    def _attr_match(self, child, attrs):\n        from autoscraper.utils import FuzzyText\n\n        for key, val in (attrs or {}).items():\n            actual = child.attrs.get(key, \"\")\n            if isinstance(actual, list):\n                actual = \" \".join(actual)\n\n            if isinstance(val, FuzzyText):\n                if not val.search(actual):\n                    return False\n            elif actual != val:\n                return False\n        return True\n\n    def findAll(self, name=None, attrs=None, recursive=True):\n        result = []\n        for child in self.children:\n            if (name is None or child.name == name) and self._attr_match(child, attrs):\n                result.append(child)\n            if recursive:\n                result.extend(child.findAll(name, attrs, recursive))\n        return result\n\n    def find_all(self, name=None, attrs=None, text=None, recursive=True):\n        if text:\n            res = []\n            if self.text.strip():\n                res.append(self.text)\n            for child in self.children:\n                if recursive:\n                    res.extend(child.find_all(text=True, 
recursive=True))\n                elif child.text.strip():\n                    res.append(child.text)\n            return res\n        return self.findAll(name, attrs, recursive)\n\nclass _Parser(HTMLParser):\n    def __init__(self):\n        super().__init__()\n        self.root = _Node(\"[document]\", {})\n        self.current = self.root\n\n    def handle_starttag(self, tag, attrs):\n        node = _Node(tag, attrs)\n        self.current.append_child(node)\n        self.current = node\n\n    def handle_endtag(self, tag):\n        if self.current.parent:\n            self.current = self.current.parent\n\n    def handle_data(self, data):\n        self.current.text += data\n\nclass BeautifulSoup(_Node):\n    def __init__(self, html, parser):\n        p = _Parser()\n        p.feed(html)\n        super().__init__(p.root.name, p.root.attrs)\n        self.children = p.root.children\n        for c in self.children:\n            c.parent = self\n\nbs4_mod = ModuleType(\"bs4\")\nbs4_mod.BeautifulSoup = BeautifulSoup\nsys.modules.setdefault(\"bs4\", bs4_mod)\n\nclass _Response:\n    def __init__(self, text=\"\"):\n        self.encoding = \"utf-8\"\n        self.headers = {\"Content-Type\": \"text/html\"}\n        self.text = text\n\nrequests_mod = ModuleType(\"requests\")\nrequests_mod.get = lambda url, headers=None, **kw: _Response()\nsys.modules.setdefault(\"requests\", requests_mod)\n"
  },
  {
    "path": "tests/integration/__init__.py",
    "content": ""
  },
  {
    "path": "tests/integration/test_complex_features.py",
    "content": "import pytest\nimport re\nfrom autoscraper import AutoScraper\n\nHTML_COMPLEX = \"\"\"\n<div id=\"main\">\n  <ul class=\"fruits\">\n    <li class=\"item\"><span class=\"name\">Banana</span><a href=\"/banana\" class=\"link\">More</a></li>\n    <li class=\"item\"><span class=\"name\">Apple</span><a href=\"/apple\" class=\"link\">More</a></li>\n    <li class=\"item\"><span class=\"name\">Orange</span><a href=\"/orange\" class=\"link\">More</a></li>\n    <li class=\"item\"><span class=\"name\">Banana</span></li>\n  </ul>\n  <p class=\"info\">Fresh fruits</p>\n  <a class=\"external\" href=\"/shop\">Shop Now</a>\n</div>\n\"\"\"\n\n\ndef test_extract_relative_link():\n    scraper = AutoScraper()\n    url = \"https://example.com/index.html\"\n    result = scraper.build(url=url, html=HTML_COMPLEX, wanted_list=[\"https://example.com/apple\"])\n    assert \"https://example.com/apple\" in result\n    similar = scraper.get_result_similar(\n        url=url, html=HTML_COMPLEX, contain_sibling_leaves=True, unique=True\n    )\n    assert set(similar) == {\n        \"https://example.com/banana\",\n        \"https://example.com/apple\",\n        \"https://example.com/orange\",\n    }\n    exact = scraper.get_result_exact(url=url, html=HTML_COMPLEX)\n    assert exact == [\"https://example.com/apple\"]\n\n\ndef test_build_with_regex():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_COMPLEX, wanted_list=[re.compile(\"Ban.*\")])\n    result = scraper.get_result_exact(html=HTML_COMPLEX)\n    assert \"Banana\" in result[0]\n\n\ndef test_update_appends_rules():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_COMPLEX, wanted_list=[\"Banana\"])\n    count = len(scraper.stack_list)\n    scraper.build(html=HTML_COMPLEX, wanted_list=[\"Apple\"], update=True)\n    assert len(scraper.stack_list) == count + 1\n\n\ndef test_remove_rules():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_COMPLEX, wanted_list=[\"Banana\"])\n    
scraper.build(html=HTML_COMPLEX, wanted_list=[\"Apple\"], update=True)\n    rule_ids = [s[\"stack_id\"] for s in scraper.stack_list]\n    to_remove = rule_ids[0]\n    scraper.remove_rules([to_remove])\n    remaining = [s[\"stack_id\"] for s in scraper.stack_list]\n    assert to_remove not in remaining\n    assert len(remaining) == len(rule_ids) - 1\n\n\ndef test_keep_blank_returns_empty():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_COMPLEX, wanted_list=[\"/shop\"])\n    html_blank = HTML_COMPLEX.replace('href=\"/shop\"', 'href=\"\"')\n    result = scraper.get_result_exact(html=html_blank, keep_blank=True)\n    assert result == [\"\"]\n\n\ndef test_attr_fuzz_ratio():\n    html_base = '<div><a class=\"btn-primary\" href=\"/item\">Buy</a></div>'\n    html_variant = '<div><a class=\"btn-prime\" href=\"/item\">Buy</a></div>'\n    scraper = AutoScraper()\n    scraper.build(html=html_base, wanted_list=[\"Buy\"])\n    res = scraper.get_result_exact(html=html_variant, attr_fuzz_ratio=0.8)\n    assert res == [\"Buy\"]\n"
  },
  {
    "path": "tests/integration/test_real_world.py",
    "content": "import re\nfrom autoscraper import AutoScraper\n\nHTML_PAGE_1 = \"\"\"\n<div id='product'>\n  <h1 class='title'>Sony PlayStation 4 PS4 Pro 1TB 4K Console - Black</h1>\n  <span class='price'>US $349.99</span>\n  <div class='rating'><span class='value'>4.8</span></div>\n  <div class='note'>See details</div>\n</div>\n\"\"\"\n\nHTML_PAGE_2 = \"\"\"\n<div id='product'>\n  <h1 class='title'>Acer Predator Helios 300 15.6'' 144Hz FHD Laptop i7-9750H 16GB 512GB GTX 1660 Ti</h1>\n  <span class='price'>US $1,229.49</span>\n  <div class='rating'><span class='value'>5.0</span></div>\n  <div class='note'>See details</div>\n</div>\n\"\"\"\n\nHTML_WALMART_1 = \"<div class='price'>$8.95</div>\"\nHTML_WALMART_2 = \"<div class='price'>$7.00</div>\"\nHTML_ETSY_1 = \"<span class='amount'>$12.50+</span>\"\nHTML_ETSY_2 = \"<span class='amount'>$60.00</span>\"\n\n\ndef test_grouping_and_rule_removal():\n    scraper = AutoScraper()\n    wanted = [\n        \"Sony PlayStation 4 PS4 Pro 1TB 4K Console - Black\",\n        \"US $349.99\",\n        \"4.8\",\n        \"See details\",\n    ]\n    scraper.build(html=HTML_PAGE_1, wanted_list=wanted)\n    grouped = scraper.get_result_exact(html=HTML_PAGE_2, grouped=True)\n    unwanted = [r for r, v in grouped.items() if v == [\"See details\"]]\n    scraper.remove_rules(unwanted)\n    result = scraper.get_result_exact(html=HTML_PAGE_2)\n    assert result == [\n        \"Acer Predator Helios 300 15.6'' 144Hz FHD Laptop i7-9750H 16GB 512GB GTX 1660 Ti\",\n        \"US $1,229.49\",\n        \"5.0\",\n    ]\n\n\ndef test_incremental_learning_multiple_sites():\n    scraper = AutoScraper()\n    data = [\n        (HTML_PAGE_1, [\"US $349.99\"]),\n        (HTML_WALMART_1, [\"$8.95\"]),\n        (HTML_ETSY_1, [\"$12.50+\"]),\n    ]\n    for html, wanted in data:\n        scraper.build(html=html, wanted_list=wanted, update=True)\n    assert \"US $1,229.49\" in scraper.get_result_exact(html=HTML_PAGE_2)\n    assert \"$7.00\" in 
scraper.get_result_exact(html=HTML_WALMART_2)\n    assert \"$60.00\" in scraper.get_result_exact(html=HTML_ETSY_2)\n\n\ndef test_attr_fuzz_ratio_realistic():\n    base = \"<div><a class='btn-primary-action' href='/buy'>Buy</a></div>\"\n    variant = \"<div><a class='btn-prim-action' href='/buy'>Buy</a></div>\"\n    scraper = AutoScraper()\n    scraper.build(html=base, wanted_list=[\"Buy\"])\n    assert scraper.get_result_exact(html=variant, attr_fuzz_ratio=0.8) == [\"Buy\"]\n\n\ndef test_regex_name_extraction():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_PAGE_1, wanted_list=[re.compile(r\".*PlayStation.*Console.*\")])\n    result = scraper.get_result_exact(html=HTML_PAGE_1)\n    assert any(\"PlayStation\" in r for r in result)\n\n\ndef test_keep_blank_for_missing_rating():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_PAGE_1, wanted_list=[\"4.8\"])\n    html_no_rating = HTML_PAGE_2.replace(\"5.0\", \"\")\n    res = scraper.get_result_exact(html=html_no_rating, keep_blank=True)\n    assert res == [\"\"]\n\n"
  },
  {
    "path": "tests/unit/__init__.py",
    "content": ""
  },
  {
    "path": "tests/unit/test_additional_features.py",
    "content": "from autoscraper import AutoScraper\n\nHTML = \"<ul><li>Banana</li><li>Apple</li><li>Orange</li></ul>\"\nHTML_DUP = \"<ul><li>Banana</li><li>Banana</li></ul>\"\n\n\ndef test_text_fuzz_ratio_partial():\n    scraper = AutoScraper()\n    scraper.build(html=\"<ul><li>Banana</li></ul>\", wanted_list=[\"Banan\"], text_fuzz_ratio=0.8)\n    assert scraper.get_result_exact(html=\"<ul><li>Banana</li></ul>\") == [\"Banana\"]\n\n\ndef test_set_rule_aliases():\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    rule_id = scraper.stack_list[0][\"stack_id\"]\n    scraper.set_rule_aliases({rule_id: \"fruit\"})\n    result = scraper.get_result_similar(html=HTML, group_by_alias=True, contain_sibling_leaves=True)\n    assert result == {\"fruit\": [\"Banana\", \"Apple\", \"Orange\"]}\n\n\ndef test_grouped_results_by_rule():\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    rule_id = scraper.stack_list[0][\"stack_id\"]\n    result = scraper.get_result_similar(html=HTML, grouped=True, contain_sibling_leaves=True)\n    assert result == {rule_id: [\"Banana\", \"Apple\", \"Orange\"]}\n\n\ndef test_similar_unique_false():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_DUP, wanted_list=[\"Banana\"])\n    result = scraper.get_result_similar(html=HTML_DUP, unique=False)\n    assert result == [\"Banana\", \"Banana\"]\n\n\ndef test_similar_keep_order():\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    result = scraper.get_result_similar(html=HTML, contain_sibling_leaves=True, keep_order=True)\n    assert result == [\"Banana\", \"Apple\", \"Orange\"]\n"
  },
  {
    "path": "tests/unit/test_build.py",
    "content": "import pytest\nfrom autoscraper import AutoScraper\n\nHTML = \"<ul><li>Banana</li><li>Apple</li><li>Orange</li></ul>\"\n\n\ndef test_build_requires_targets():\n    scraper = AutoScraper()\n    with pytest.raises(ValueError):\n        scraper.build(html=HTML)\n\n\ndef test_build_and_get_result_similar():\n    scraper = AutoScraper()\n    result = scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    assert result == [\"Banana\"]\n    similar = scraper.get_result_similar(html=HTML, contain_sibling_leaves=True)\n    assert similar == [\"Banana\", \"Apple\", \"Orange\"]\n"
  },
  {
    "path": "tests/unit/test_features.py",
    "content": "import pytest\n\nfrom autoscraper import AutoScraper\n\nHTML = \"<ul><li>Banana</li><li>Apple</li><li>Orange</li></ul>\"\nHTML_COMPLEX_ORDER = \"\"\"\n<div class='products'>\n  <h2>Banana</h2>\n  <p class='price'>$1</p>\n  <h2>Apple</h2>\n  <p class='price'>$2</p>\n</div>\n\"\"\"\n\n\ndef test_get_result_exact_order():\n    scraper = AutoScraper()\n    scraper.build(html=HTML_COMPLEX_ORDER, wanted_list=[\"Banana\", \"$2\"])\n    assert scraper.get_result_exact(html=HTML_COMPLEX_ORDER) == [\"Banana\", \"$2\"]\n\n\ndef test_group_by_alias():\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_dict={\"fruit\": [\"Banana\"]})\n    similar = scraper.get_result_similar(\n        html=HTML, group_by_alias=True, contain_sibling_leaves=True, unique=True\n    )\n    assert similar == {\"fruit\": [\"Banana\", \"Apple\", \"Orange\"]}\n\n\ndef test_save_and_load(tmp_path):\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    file_path = tmp_path / \"model.json\"\n    scraper.save(file_path)\n    new_scraper = AutoScraper()\n    new_scraper.load(file_path)\n    assert new_scraper.get_result_exact(html=HTML) == scraper.get_result_exact(html=HTML)\n\n\ndef test_keep_rules():\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    first_rule = scraper.stack_list[0][\"stack_id\"]\n    scraper.build(html=HTML, wanted_list=[\"Apple\"], update=True)\n    second_rule = scraper.stack_list[1][\"stack_id\"]\n    scraper.keep_rules([second_rule])\n    assert len(scraper.stack_list) == 1\n    assert scraper.stack_list[0][\"stack_id\"] == second_rule\n\n\ndef test_get_result_combined():\n    scraper = AutoScraper()\n    scraper.build(html=HTML, wanted_list=[\"Banana\"])\n    similar, exact = scraper.get_result(html=HTML)\n    assert exact == [\"Banana\"]\n    assert similar == [\"Banana\"]\n"
  }
]