Repository: laramies/theHarvester Branch: master Commit: 53e13662409e Files: 127 Total size: 2.1 MB Directory structure: gitextract_7wyx50xx/ ├── .dockerignore ├── .git-blame-ignore-revs ├── .gitattributes ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ └── issue-template.md │ ├── dependabot.yml │ └── workflows/ │ ├── codeql-analysis.yml │ ├── docker-build-push.yml │ ├── dockerci.yml │ └── theHarvester.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── README/ │ ├── CONTRIBUTING.md │ ├── COPYING │ └── LICENSES ├── README.md ├── bin/ │ ├── restfulHarvest │ └── theHarvester ├── docker-compose.yml ├── pyproject.toml ├── tests/ │ ├── __init__.py │ ├── discovery/ │ │ ├── __init__.py │ │ ├── test_baidusearch.py │ │ ├── test_censys.py │ │ ├── test_certspotter.py │ │ ├── test_criminalip.py │ │ ├── test_githubcode.py │ │ ├── test_githubcode_additions.py │ │ ├── test_otx.py │ │ ├── test_rocketreach.py │ │ ├── test_shodan_engine.py │ │ └── test_thc.py │ ├── lib/ │ │ ├── test_core.py │ │ └── test_output.py │ ├── test_hackertarget_apikey.py │ ├── test_mojeek.py │ ├── test_myparser.py │ └── test_security.py └── theHarvester/ ├── __init__.py ├── __main__.py ├── data/ │ ├── proxies.yaml │ └── wordlists/ │ ├── api_endpoints.txt │ ├── dns-big.txt │ ├── dns-names.txt │ ├── dorks.txt │ ├── general/ │ │ └── common.txt │ └── names_small.txt ├── discovery/ │ ├── __init__.py │ ├── additional_apis.py │ ├── api_endpoints.py │ ├── baidusearch.py │ ├── bevigil.py │ ├── bitbucket.py │ ├── bravesearch.py │ ├── bufferoverun.py │ ├── builtwith.py │ ├── censysearch.py │ ├── certspottersearch.py │ ├── chaos.py │ ├── commoncrawl.py │ ├── constants.py │ ├── criminalip.py │ ├── crtsh.py │ ├── dnssearch.py │ ├── duckduckgosearch.py │ ├── fofa.py │ ├── fullhuntsearch.py │ ├── githubcode.py │ ├── gitlabsearch.py │ ├── hackertarget.py │ ├── haveibeenpwned.py │ ├── hudsonrocksearch.py │ ├── huntersearch.py │ ├── intelxsearch.py │ ├── leakix.py │ ├── leaklookup.py │ ├── mojeek.py │ ├── 
netlas.py │ ├── onyphe.py │ ├── otxsearch.py │ ├── pentesttools.py │ ├── projectdiscovery.py │ ├── rapiddns.py │ ├── robtex.py │ ├── rocketreach.py │ ├── search_dehashed.py │ ├── search_dnsdumpster.py │ ├── searchhunterhow.py │ ├── securityscorecard.py │ ├── securitytrailssearch.py │ ├── shodansearch.py │ ├── subdomaincenter.py │ ├── subdomainfinderc99.py │ ├── takeover.py │ ├── thc.py │ ├── threatcrowd.py │ ├── tombasearch.py │ ├── urlscan.py │ ├── venacussearch.py │ ├── virustotal.py │ ├── waybackarchive.py │ ├── whoisxml.py │ ├── windvane.py │ ├── yahoosearch.py │ └── zoomeyesearch.py ├── lib/ │ ├── __init__.py │ ├── api/ │ │ ├── __init__.py │ │ ├── additional_endpoints.py │ │ ├── api.py │ │ ├── api_example.py │ │ ├── auth.py │ │ └── static/ │ │ └── .gitkeep │ ├── core.py │ ├── hostchecker.py │ ├── output.py │ ├── resolvers.txt │ └── stash.py ├── parsers/ │ ├── __init__.py │ ├── intelxparser.py │ ├── myparser.py │ ├── securitytrailsparser.py │ └── venacusparser.py ├── restfulHarvest.py ├── screenshot/ │ ├── __init__.py │ └── screenshot.py └── theHarvester.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ .github/* .gitattributes .git-blame-ignore-revs .idea/ .pytest_cache .mypy_cache tests/* README/ bin/ theHarvester-logo.png theHarvester-logo.webp CHANGELOG.md ================================================ FILE: .git-blame-ignore-revs ================================================ # #1492 run `black .` and `isort .` c13843ec0d513ac7f9c35b7bd0501fa46e356415 ================================================ FILE: .gitattributes ================================================ # Set the default behavior, which is to have git automatically determine # whether a file is a text or binary, unless otherwise specified. * text=auto # Basic .gitattributes for a python repo. 
# Source files # ============ *.pxd text diff=python *.py text diff=python *.py3 text diff=python *.pyw text diff=python *.pyx text diff=python # Binary files # ============ *.db binary *.p binary *.pkl binary *.pyc binary *.pyd binary *.pyo binary # Note: .db, .p, and .pkl files are associated with the python modules # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` # (among others). ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: [L1ghtn1ng, NotoriousRebel] open_collective: # Replace with a single Open Collective username ko_fi: # tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] ================================================ FILE: .github/ISSUE_TEMPLATE/issue-template.md ================================================ --- name: Issue Template about: A template for new issues. title: "[Bug|Feature Request|Other] Short Description of Issue" labels: '' --- ## Note we do not support installing theHarvester on android **Feature Request or Bug or Another** Feature Request | Bug | Other **Describe the feature request or bug or other** A clear and concise description of what the bug, feature request, or other request is. **To Reproduce** Steps to reproduce the behaviour: 1. Run tool like this: '...' 2. See error **Expected behaviour** A clear and concise description of what you expected to happen. **Screenshots** If possible please add screenshots to help explain your problem. **System Information (System that tool is running on):** - OS: [e.g. 
Windows10] - Version [e.g. 2.7] **Additional context** Add any other context about the problem here. ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: github-actions directory: "/" schedule: interval: daily timezone: Europe/London - package-ecosystem: uv directory: "/" schedule: interval: daily timezone: Europe/London open-pull-requests-limit: 10 target-branch: master allow: - dependency-type: direct - dependency-type: indirect ================================================ FILE: .github/workflows/codeql-analysis.yml ================================================ # For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL" on: push: branches: [ master, dev ] pull_request: # The branches below must be a subset of the branches above branches: [ master, dev ] schedule: - cron: '19 11 * * 4' jobs: analyze: name: Analyze runs-on: ubuntu-latest strategy: fail-fast: false matrix: language: [ 'python' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] # Learn more: # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed steps: - name: Checkout repository uses: actions/checkout@v6 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. 
# By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # queries: ./path/to/local/query, your-org/your-repo/queries@main # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v4 # ℹ️ Command-line programs to run using the OS shell. # 📚 https://git.io/JvXDl # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines # and modify them (or add more) to build your code if your project # uses a compiled language #- run: | # make bootstrap # make release - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 ================================================ FILE: .github/workflows/docker-build-push.yml ================================================ name: Build and Push Docker Image on: push: branches: - master permissions: contents: read packages: write jobs: build-and-push: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v6 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 - name: Log in to GitHub Container Registry uses: docker/login-action@v4 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata for Docker id: meta uses: docker/metadata-action@v6 with: images: ghcr.io/${{ github.repository_owner }}/theharvester tags: | latest type=ref,event=branch type=sha - name: Build and push Docker image uses: docker/build-push-action@v7 with: context: . 
file: Dockerfile push: true platforms: linux/amd64,linux/arm64 tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} ================================================ FILE: .github/workflows/dockerci.yml ================================================ name: TheHarvester Docker Image CI on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Build the Docker image run: docker build --tag theharvester . - name: Smoke test run: docker run --rm theharvester --help | grep restfulHarvest ================================================ FILE: .github/workflows/theHarvester.yml ================================================ name: TheHarvester Python CI on: push: branches: - '*' pull_request: branches: - '*' jobs: Python: runs-on: ${{ matrix.os }} strategy: max-parallel: 10 matrix: os: [ ubuntu-latest ] python-version: [ '3.12', '3.13', '3.14' ] steps: - uses: actions/checkout@v6 - name: Install uv uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.python-version }} enable-cache: true cache-dependency-glob: "uv.lock" - name: Install dependencies run: | sudo mkdir -p /usr/local/etc/theHarvester sudo cp theHarvester/data/*.yaml /usr/local/etc/theHarvester/ sudo chown -R runner:runner /usr/local/etc/theHarvester/ uv sync --all-groups --frozen echo "$GITHUB_WORKSPACE/.venv/bin" >> $GITHUB_PATH - name: Lint with ruff uses: astral-sh/ruff-action@v3 with: args: check --fix - name: Format with ruff uses: astral-sh/ruff-action@v3 with: args: format - name: Commit changes for ruff formating and linting if: github.event_name == 'push' run: | git config user.name github-actions git config user.email github-actions@github.com git add . 
git commit -m "Apply ruff fixes and formatting" || true # Use || true to prevent failure if no changes git push origin HEAD:${{ github.ref_name }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Test with pytest run: | pytest tests/** - name: Run theHarvester module Baidu run: | theHarvester -d yale.edu -b baidu - name: Run theHarvester module CertSpotter run: | theHarvester -d yale.edu -b certspotter - name: Run theHarvester module Crtsh run: | theHarvester -d hcl.com -b crtsh - name: Run theHarvester module DuckDuckGo run: | theHarvester -d yale.edu -b duckduckgo - name: Run theHarvester module HackerTarget run: | theHarvester -d yale.edu -b hackertarget - name: Run theHarvester module Otx run: | theHarvester -d yale.edu -b otx - name: Run theHarvester module RapidDns run: | theHarvester -d yale.edu -b rapiddns - name: Run theHarvester module Urlscan run: | theHarvester -d yale.edu -b urlscan - name: Run theHarvester module Yahoo run: | theHarvester -d yale.edu -b yahoo - name: Run theHarvester module DNS brute force run: | theHarvester -d yale.edu -c ================================================ FILE: .gitignore ================================================ *.idea *.pyc *.sqlite *.html *.htm *.vscode *.xml *.json debug_results.txt venv .mypy_cache .pytest_cache build/ dist/ theHarvester.egg-info api-keys.yaml .DS_Store .venv .venv/** .pyre .junie ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ## [4.10.1] - 2026-02-22 ### Changed - Updated Censys integration to align with current API documentation ([67419190](https://github.com/laramies/theHarvester/commit/67419190)). 
- Updated RocketReach integration to align with latest API documentation and tests ([ffc7420d](https://github.com/laramies/theHarvester/commit/ffc7420d)). - Refactored async file handling in CLI paths: replace blocking path calls with awaited operations and improve path sanitization ([e98bf5bb](https://github.com/laramies/theHarvester/commit/e98bf5bb), [607016a1](https://github.com/laramies/theHarvester/commit/607016a1)). - Migrated packaging/build configuration to `flit-core` and updated entrypoint/version wiring ([d2cae0be](https://github.com/laramies/theHarvester/commit/d2cae0be)). - Refactored and standardized output utilities, with new regression tests for output formatting and dedup helpers ([fa2dedd3](https://github.com/laramies/theHarvester/commit/fa2dedd3)). - Updated dependencies: bump `fastapi`, `playwright`, `ruff`, `ty`, and `uvicorn` ([1dfa6e98](https://github.com/laramies/theHarvester/commit/1dfa6e98), [46865337](https://github.com/laramies/theHarvester/commit/46865337), [c1ac137d](https://github.com/laramies/theHarvester/commit/c1ac137d), [7eaec4da](https://github.com/laramies/theHarvester/commit/7eaec4da)). - Updated packaging dependency `wheel` to `0.46.3` ([46865337](https://github.com/laramies/theHarvester/commit/46865337)). ### Fixed - Fixed CriminalIP integration for current API behavior, including safer scan/report handling and hostname normalization (issue #2229) ([06c2fbd9](https://github.com/laramies/theHarvester/commit/06c2fbd9)). - Fixed Shodan engine processing to return hostnames consistently and avoid worker processing errors (issue #2227) ([419291a3](https://github.com/laramies/theHarvester/commit/419291a3)). - Fixed Bitbucket search flow so discovery runs successfully ([a1968f71](https://github.com/laramies/theHarvester/commit/a1968f71)). - Improved module API key error messages for clearer diagnostics ([e1b775e3](https://github.com/laramies/theHarvester/commit/e1b775e3)). 
- Improved BuiltWith URL handling logic in CLI processing ([15872350](https://github.com/laramies/theHarvester/commit/15872350)). ## [4.10.0] - 2026-01-18 ### Added - LeakIX API key support and improved request header configuration ([31861c19](https://github.com/laramies/theHarvester/commit/31861c19)). - Bitbucket API key entry in `theHarvester/data/api-keys.yaml` ([6be673fa](https://github.com/laramies/theHarvester/commit/6be673fa)). - Fix issue #469 Add socks proxy support ([e38bb8fb](https://github.com/laramies/theHarvester/commit/e38bb8fb)). ### Changed - CI: switch GitHub workflow to `ruff-action` for linting and formatting ([8ddcd1a8](https://github.com/laramies/theHarvester/commit/8ddcd1a8)). - Dockerfile: add `apt-get update/upgrade` and clean up apt cache layers ([3a5d504b](https://github.com/laramies/theHarvester/commit/3a5d504b)). - Dependencies updated: bump `aiodns`, `ruff`, `ty`, `filelock`, and `librt` ([40759146](https://github.com/laramies/theHarvester/commit/40759146)). - Codebase formatting and lint fixes applied (Ruff) ([7c6dec53](https://github.com/laramies/theHarvester/commit/7c6dec53)). - Tests: expand proxy parameter default structure to include both `http` and `socks5` fields ([bc2fce07](https://github.com/laramies/theHarvester/commit/bc2fce07)). - `api-keys.yaml` synchronized with `Core` API key references; add consistency test coverage ([ffe1f3a8](https://github.com/laramies/theHarvester/commit/ffe1f3a8)). ### Removed - `Core.bing_key()` removed ([814c7811](https://github.com/laramies/theHarvester/commit/814c7811)). ### Fixed - Fix mypy type-checking errors ([0991356b](https://github.com/laramies/theHarvester/commit/0991356b)). ### Security - Improve input sanitization and add security-focused tests ([3d7489c9](https://github.com/laramies/theHarvester/commit/3d7489c9)). 
[Unreleased]: https://github.com/laramies/theHarvester/compare/06520b40...master [4.10.1]: https://github.com/laramies/theHarvester/compare/4.10.0...06520b40 [4.10.0]: https://github.com/laramies/theHarvester/compare/4.9.2...4.10.0 ================================================ FILE: Dockerfile ================================================ FROM python:3.14-slim-trixie LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" RUN useradd -m -u 1000 -s /bin/bash theharvester RUN apt-get update && apt-get upgrade -yqq && apt-get clean && \ rm -rf /var/lib/apt/lists/* # Set workdir and copy project files WORKDIR /app COPY . /app # Create and sync environment using uv # Compile bytecode for faster startup and install to system site-packages RUN --mount=from=ghcr.io/astral-sh/uv,source=/uv,target=/bin/uv \ UV_PROJECT_ENVIRONMENT=/usr/local uv sync --locked --no-dev --no-cache --compile-bytecode # Use non-root user USER theharvester # Expose port if the service listens on 80 EXPOSE 80 # Run the application as theharvester user ENTRYPOINT ["restfulHarvest", "-H", "0.0.0.0", "-p", "80"] ================================================ FILE: README/CONTRIBUTING.md ================================================ # Contributing to theHarvester Project Welcome to theHarvester project, so you would like to contribute. The following below must be met to get accepted. # CI Make sure all CI passes and you do not introduce any alerts from ruff # Unit Tests For new modules a unit test for that module is required and we use pytest. # Coding Standards * No single letter variables and variable names must represent the action that it is performing * Have static typing on functions etc * Make sure no errors are reported from mypy * No issues reported with ruff # Submitting Bugs If you find a bug in a module that you want to submit an issue for and know how to write python code. 
Please create a unit test for that bug (if possible) and submit a fix for it, as it would be a big help to the project.
And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. 
The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS ================================================ FILE: README/LICENSES ================================================ Released under the GPL v 2.0. If you did not receive a copy of the GPL, try http://www.gnu.org/. Copyright 2011 Christian Martorella theHarvester is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License. theHarvester is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ================================================ FILE: README.md ================================================ ![theHarvester](https://github.com/laramies/theHarvester/blob/master/theHarvester-logo.webp) ![TheHarvester CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Python%20CI/badge.svg) ![TheHarvester Docker Image CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Docker%20Image%20CI/badge.svg) [![Rawsec's CyberSecurity Inventory](https://inventory.raw.pm/img/badges/Rawsec-inventoried-FF5050_flat_without_logo.svg)](https://inventory.raw.pm/) [![Packaging status](https://repology.org/badge/vertical-allrepos/theharvester.svg)](https://repology.org/project/theharvester/versions) About ----- theHarvester is a simple to use, yet powerful tool designed to be used during the reconnaissance stage of a red team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using multiple public resources that include: Install and dependencies ------------------------ * Python 3.12 or higher. 
* https://github.com/laramies/theHarvester/wiki/Installation Install uv: ```bash curl -LsSf https://astral.sh/uv/install.sh | sh ``` Clone the repository: ```bash git clone https://github.com/laramies/theHarvester cd theHarvester ``` Install dependencies and create a virtual environment: ```bash uv sync ``` Run theHarvester: ```bash uv run theHarvester ``` ## Development To install development dependencies: ```bash uv sync --all-groups ``` To run tests: ```bash uv run pytest ``` To run linting and formatting: ```bash uv run ruff check ``` ```bash uv run ruff format ``` Passive modules --------------- * baidu: Baidu search engine (https://www.baidu.com) * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets (https://bevigil.com/osint-api) * brave: Brave search engine - now uses official Brave Search API (https://api-dashboard.search.brave.com) * bufferoverun: Fast domain name lookups for TLS certificates in IPv4 space (https://tls.bufferover.run) * builtwith: Find out what websites are built with (https://builtwith.com) * censys: Uses certificate searches to enumerate subdomains and gather emails (https://censys.io) * certspotter: Cert Spotter monitors Certificate Transparency logs (https://sslmate.com/certspotter) * criminalip: Specialized Cyber Threat Intelligence (CTI) search engine (https://www.criminalip.io) * crtsh: Comodo Certificate search (https://crt.sh) * dehashed: Take your data security to the next level (https://dehashed.com) * dnsdumpster: Domain research tool that can discover hosts related to a domain (https://dnsdumpster.com) * duckduckgo: DuckDuckGo search engine (https://duckduckgo.com) * fofa: FOFA search engine (https://en.fofa.info) * fullhunt: Next-generation attack surface security platform (https://fullhunt.io) * github-code: GitHub code search engine (https://www.github.com) * hackertarget: Online vulnerability scanners and network intelligence to help organizations (https://hackertarget.com) * haveibeenpwned: Check if
your email address is in a data breach (https://haveibeenpwned.com) * hunter: Hunter search engine (https://hunter.io) * hunterhow: Internet search engines for security researchers (https://hunter.how) * intelx: Intelx search engine (https://intelx.io) * leakix: LeakIX search engine (https://leakix.net) * leaklookup: Data breach search engine (https://leak-lookup.com) * mojeek: Mojeek search engine (https://www.mojeek.com) * netlas: A Shodan or Censys competitor (https://app.netlas.io) * onyphe: Cyber defense search engine (https://www.onyphe.io) * otx: AlienVault open threat exchange (https://otx.alienvault.com) * pentesttools: Cloud-based toolkit for offensive security testing, focused on web applications and network penetration testing (https://pentest-tools.com) * projectdiscovery: Actively collects and maintains internet-wide assets data, to enhance research and analyse changes around DNS for better insights (https://chaos.projectdiscovery.io) * rapiddns: DNS query tool which makes querying subdomains or sites on the same IP easy (https://rapiddns.io) * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links (https://rocketreach.co) * securityscorecard: Helps TPRM and SOC teams detect, prioritize, and remediate vendor risk across their entire supplier ecosystem at scale (https://securityscorecard.com) * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data (https://securitytrails.com) * -s, --shodan: Shodan search engine will search for ports and banners from discovered hosts (https://shodan.io) * subdomaincenter: A subdomain finder tool used to find subdomains of a given domain (https://www.subdomain.center) * subdomainfinderc99: A subdomain finder tool used to find the subdomains of a given domain (https://subdomainfinder.c99.nl) * thc: Free subdomain enumeration service with no API key required (https://ip.thc.org) * threatminer: Data mining for threat
intelligence (https://www.threatminer.org) * tomba: Tomba search engine (https://tomba.io) * urlscan: A sandbox for the web that is a URL and website scanner (https://urlscan.io) * venacus: Venacus search engine (https://venacus.com) * virustotal: Domain search (https://www.virustotal.com) * whoisxml: Subdomain search (https://subdomains.whoisxmlapi.com/api/pricing) * yahoo: Yahoo search engine (https://www.yahoo.com) * windvane: Windvane search engine (https://windvane.lichoin.com) * zoomeye: China's version of Shodan (https://www.zoomeye.org) Active modules -------------- * DNS brute force: dictionary brute force enumeration * Screenshots: Take screenshots of subdomains that were found Modules that require an API key ------------------------------- Documentation to set up API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys * bevigil - 50 free queries/month. 1k queries/month $50 * brave - free plan available. Pro plans for higher limits * bufferoverun - 100 free queries/month. 10k/month $25 * builtwith - 50 free queries ever. $2950/yr * censys - 500 credits $100 * criminalip - 100 free queries/month. 700k/month $59 * dehashed - 500 credits $15, 5k credits $150 * dnsdumpster - 50 free queries/day, $49 * fofa - query credits 10,000/month. 100k results/month $25 * fullhunt - 50 free queries. 200 queries $29/month, 500 queries $59 * github-code * haveibeenpwned - 10 email searches/min $4.50, 50 email searches/min $22 * hunter - 50 free credits/month. 12k credits/yr $34 * hunterhow - 10k free API results per 30 days. 50k API results per 30 days $10 * intelx - free account is very limited. Business account $2900 * leakix - free 25 results pages, 3000 API requests/month. Bounty Hunter $29 * leaklookup - 20 credits $10, 50 credits $20, 140 credits $50, 300 credits $100 * mojeek - 5000 free credits $6.50, $1.30 CPM (Personal), $2.60 CPM (Startup), $3.90 CPM (Business) * netlas - 50 free requests/day.
1k requests $49, 10k requests $249 * onyphe - 10M results/month $587 * pentesttools - 5 assets netsec $95/month, 5 assets webnetsec $140/month * projectdiscovery - requires a work email. Free monthly discovery and vulnerability scans on sign-up email domain, enterprise $ * rocketreach - 100 email lookups/month $48, 250 email lookups/month $108 * securityscorecard - requires a work email * securityTrails - 50 free queries/month. 20k queries/month $500 * shodan - Freelancer $69 month, Small Business $359 month * tomba - 25 free searches/month. 1k searches/month $39, 5k searches/month $89 * venacus - 1 free search/day. 10 searches/day $12, 30 searches/day $36 * virustotal - 500 free lookups/day, 15.5k lookups/month. Business accounts require a work email * whoisxml - 2k queries $50, 5k queries $105 * windvane - 100 free queries * zoomeye - 5 free results/day. 30 results/day $190/yr ## Package versions [![Packaging status](https://repology.org/badge/vertical-allrepos/theharvester.svg)](https://repology.org/project/theharvester/versions) Comments, bugs, and requests ---------------------------- * [![Twitter Follow](https://img.shields.io/twitter/follow/laramies.svg?style=social&label=Follow)](https://twitter.com/laramies) Christian Martorella @laramies cmartorella@edge-security.com * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 Main contributors ----------------- * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 * [![Twitter
#!/usr/bin/env python3
"""Launcher script for theHarvester.

Refuses to start on interpreters older than the minimum supported version,
then hands control to ``theHarvester.theHarvester.main``.
"""

import sys

# Minimum interpreter version, kept in step with ``requires-python = ">=3.12"``
# in pyproject.toml and the "Python 3.12 or higher" note in the README.
MIN_PYTHON = (3, 12)

# Compare as a tuple: checking ``major`` and ``minor`` independently would
# wrongly reject any future X.0-X.9 release of a higher major version.
if sys.version_info < MIN_PYTHON:
    print('[!] Make sure you have Python 3.12+ installed, quitting.\n\n')
    sys.exit(1)

# Imported only after the version gate so that an unsupported interpreter gets
# the message above instead of an import-time error from the package.
from theHarvester.theHarvester import main  # noqa: E402

if __name__ == '__main__':
    main()
ports: - "5000:80" networks: default: name: app_theHarvester_network ================================================ FILE: pyproject.toml ================================================ [project] name = "theHarvester" description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test" readme = "README.md" license = "GPL-2.0-only" authors = [ { name = "Christian Martorella", email = "cmartorella@edge-security.com" }, { name = "Jay Townsend", email = "jay@cybermon.uk" }, { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" }, ] requires-python = ">=3.12" urls.Homepage = "https://github.com/laramies/theHarvester" classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", ] dynamic = ["version"] dependencies = [ "aiodns==4.0.0", "aiofiles==25.1.0", "aiohttp==3.13.3", "aiohttp-socks==0.11.0", "aiomultiprocess==0.9.1", "aiosqlite==0.22.1", "beautifulsoup4==4.14.3", "censys==2.2.19", "certifi==2026.2.25", "dnspython==2.8.0", "fastapi==0.135.1", "lxml==6.0.2", "netaddr==1.3.0", "playwright==1.58.0", "PyYAML==6.0.3", "python-dateutil==2.9.0.post0", "httpx==0.28.1", "retrying==1.4.2", "shodan==1.31.0", "slowapi==0.1.9", "ujson==5.12.0", "uvicorn==0.41.0", "uvloop==0.22.1; platform_system != 'Windows'", "winloop==0.4.0; platform_system == 'Windows'", ] [dependency-groups] dev = [ "mypy==1.19.1", "mypy-extensions==1.1.0", "pytest==9.0.2", "pytest-asyncio==1.3.0", "types-certifi==2021.10.8.3", "types-chardet==5.0.4.6", "types-python-dateutil==2.9.0.20260305", "types-PyYAML==6.0.12.20250915", "ruff==0.15.5", "types-ujson==5.10.0.20250822", "wheel==0.46.3", "ty==0.0.21", ] [project.scripts] theHarvester = "theHarvester.theHarvester:main" restfulHarvest = "theHarvester.restfulHarvest:main" [tool.pytest.ini_options] minversion = 
"8.3.3" asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" addopts = "--no-header" testpaths = ["tests"] [build-system] requires = ["flit_core >=3.11,<4"] build-backend = "flit_core.buildapi" [tool.mypy] python_version = "3.13" warn_unused_configs = true ignore_missing_imports = true show_traceback = true show_error_codes = true namespace_packages = true check_untyped_defs = true [tool.uv] python-preference = "managed" [tool.uv.pip] python-version = "3.13" [tool.ty.src] respect-ignore-files = false exclude = [ ".venv/**", "tests/**", ".github/*" ] [tool.ruff] # Exclude a variety of commonly ignored directories. exclude = [ "tests", ".eggs", ".git", ".git-rewrite", ".mypy_cache", ".pyenv", ".pytest_cache", ".pytype", ".ruff_cache", ".github", ".venv", ".vscode", ".idea", "__pypackages__", "build", "dist", "site-packages", "venv", ] line-length = 130 target-version = "py313" show-fixes = true [tool.ruff.lint] select = ["E", "F", "N", "I", "UP", "TCH", "FA", "RUF", "PT", "TC", "ASYNC" ] ignore = [ "E501", "ASYNC230", "N999", "PLR0915" ] # Allow fix for all enabled rules (when `--fix` is provided). fixable = ["ALL"] unfixable = [] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" [tool.ruff.format] # Unlike Black, use single quotes for strings. quote-style = "single" indent-style = "space" # Like Black, respect magic trailing commas. skip-magic-trailing-comma = false # Like Black, automatically detect the appropriate line ending.
class TestBaiduSearch:
    """Network-free checks of ``SearchBaidu`` URL pagination and result parsing."""

    @pytest.mark.asyncio
    async def test_process_and_parsing(self, monkeypatch):
        """process() requests one URL per page; the getters expose the parsed emails and hosts."""
        import theHarvester.lib.core as core_module

        recorded = {}
        canned_pages = [
            "Contact foo@example.com on a.example.com \n",
            " bar@sub.example.com is here and www.example.com appears \n",
            " Visit sub.a.example.com. baz@example.com \n",
        ]

        async def fake_fetch_all(urls, headers=None, proxy=False):
            # Remember how the fetcher was invoked and hand back a fresh copy of the pages.
            recorded.update(urls=urls, headers=headers, proxy=proxy)
            return list(canned_pages)

        # Swap the fetcher for the stub so nothing goes over the network.
        monkeypatch.setattr(core_module.AsyncFetcher, "fetch_all", fake_fetch_all)
        # A fixed user agent keeps the run deterministic (not strictly required).
        monkeypatch.setattr(core_module.Core, "get_user_agent", staticmethod(lambda: "UA"), raising=True)

        engine = baidusearch.SearchBaidu(word="example.com", limit=21)
        await engine.process(proxy=True)

        wanted_urls = [
            f"https://www.baidu.com/s?wd=%40example.com&pn={offset}&oq=example.com"
            for offset in (0, 10, 20)
        ]
        assert recorded["urls"] == wanted_urls
        assert recorded["proxy"] is True

        found_emails = await engine.get_emails()
        found_hosts = await engine.get_hostnames()

        # Every address and host embedded in the canned pages must be reported.
        for address in ("foo@example.com", "bar@sub.example.com", "baz@example.com"):
            assert address in found_emails
        assert {"a.example.com", "www.example.com", "sub.a.example.com"} <= set(found_hosts)

    @pytest.mark.asyncio
    async def test_pagination_limit_exclusive(self, monkeypatch):
        """With limit=20 only the offsets 0 and 10 are requested; 20 itself is excluded."""
        import theHarvester.lib.core as core_module

        recorded = {}

        async def fake_fetch_all(urls, headers=None, proxy=False):
            recorded["urls"] = urls
            return ["" for _ in urls]

        monkeypatch.setattr(core_module.AsyncFetcher, "fetch_all", fake_fetch_all)
        monkeypatch.setattr(core_module.Core, "get_user_agent", staticmethod(lambda: "UA"), raising=True)

        engine = baidusearch.SearchBaidu(word="example.com", limit=20)
        await engine.process()

        # For limit=20, range(0, 20, 10) yields 0 and 10 only (20 is excluded)
        first_page = "https://www.baidu.com/s?wd=%40example.com&pn=0&oq=example.com"
        second_page = "https://www.baidu.com/s?wd=%40example.com&pn=10&oq=example.com"
        assert recorded["urls"] == [first_page, second_page]
@pytest.mark.asyncio
async def test_search_respects_limit_across_page_data(monkeypatch) -> None:
    """A single page holding five certificates is truncated to the first ``limit`` (3) hostnames."""
    monkeypatch.setattr(censysearch.Core, 'censys_key', lambda: ('id', 'secret'))

    class _FakeCensysCerts:
        """Stand-in client that ignores its arguments and serves one oversized page."""

        def __init__(self, api_id, api_secret, user_agent):
            # Credentials and user agent are accepted but deliberately unused.
            return None

        def search(self, **kwargs):
            # One page containing 1.example.com .. 5.example.com, in order.
            oversized_page = [{'names': [f'{number}.example.com']} for number in range(1, 6)]
            return _FakeQuery([oversized_page])

    monkeypatch.setattr(censysearch, 'CensysCerts', _FakeCensysCerts)

    engine = censysearch.SearchCensys('example.com', limit=3)
    await engine.process()

    hostnames = await engine.get_hostnames()
    assert hostnames == {'1.example.com', '2.example.com', '3.example.com'}
@pytest.mark.skipif(github_ci == 'true', reason="Skipping this test for now")
class TestCertspotterSearch:
    """Live tests against the Cert Spotter API; skipped when running on GitHub Actions."""

    @pytest.mark.asyncio
    async def test_api(self) -> None:
        """The issuances endpoint answers HTTP 200 for the test domain."""
        base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names"
        headers = {"User-Agent": Core.get_user_agent()}
        # Use the async client: a synchronous httpx.get() here would block the
        # event loop that pytest-asyncio runs this coroutine on.
        async with httpx.AsyncClient() as client:
            response = await client.get(base_url, headers=headers)
        assert response.status_code == 200

    @pytest.mark.asyncio
    async def test_search(self) -> None:
        """After process(), get_hostnames() returns a set."""
        search = certspottersearch.SearchCertspoter(TestCertspotter.domain())
        await search.process()
        assert isinstance(await search.get_hostnames(), set)
@pytest.mark.asyncio
async def test_do_search_uses_v2_report_endpoint(monkeypatch) -> None:
    """process() must fetch the report from /v2/domain/report/<scan_id> and never from /v1."""
    monkeypatch.setattr(criminalip.Core, 'criminalip_key', lambda: 'test-key')
    monkeypatch.setattr(criminalip.Core, 'get_user_agent', lambda: 'test-agent')

    requested = []

    async def fake_post_fetch(url, **kwargs):
        # The scan is still expected to be started through the v1 scan endpoint.
        assert url == 'https://api.criminalip.io/v1/domain/scan'
        return {'status': 200, 'data': {'scan_id': 12345}}

    async def fake_fetch_all(urls, **kwargs):
        target = urls[0]
        requested.append(target)

        if '/v1/domain/status/' in target:
            # Report the scan as finished straight away.
            return [{'status': 200, 'data': {'scan_percentage': 100}}]

        if '/v2/domain/report/' in target:
            # A well-formed report in which every section is empty.
            blank_report = {
                'certificates': [],
                'connected_domain_subdomain': [],
                'connected_ip': [],
                'connected_ip_info': [],
                'cookies': [],
                'dns_record': {},
                'html_page_link_domains': [],
                'links': [],
                'mapped_ip': [],
                'network_logs': {'data': []},
                'page_redirections': [],
                'subdomains': [],
            }
            return [{'status': 200, 'data': blank_report}]

        # Any other URL is answered with a server error.
        return [{'status': 500}]

    monkeypatch.setattr(criminalip.AsyncFetcher, 'post_fetch', fake_post_fetch)
    monkeypatch.setattr(criminalip.AsyncFetcher, 'fetch_all', fake_fetch_all)

    engine = criminalip.SearchCriminalIP('example.com')
    await engine.process()

    hit_v2_report = any('/v2/domain/report/12345' in url for url in requested)
    hit_v1_report = any('/v1/domain/report/' in url for url in requested)
    assert hit_v2_report
    assert not hit_v1_report
class TestSearchGithubCode:
    """Unit tests for ``githubcode.SearchGithubCode`` that never touch the network.

    The GitHub API key accessor on ``Core`` is replaced per test through
    pytest's ``monkeypatch`` so the replacement is undone at teardown instead
    of staying on ``Core`` for the remainder of the test session.
    """

    class OkResponse:
        """HTTP 200 response whose JSON body carries two text-match fragments."""

        def __init__(self):
            self.response = Response(status_code=200)
            # httpx.Response.json is a method; override it on the instance to return canned data.
            object.__setattr__(
                self.response,
                "json",
                MagicMock(
                    return_value={
                        "items": [
                            {"text_matches": [{"fragment": "test1"}]},
                            {"text_matches": [{"fragment": "test2"}]},
                        ]
                    }
                ),
            )

    class FailureResponse:
        """HTTP 401 response with an empty JSON body."""

        def __init__(self):
            self.response = Response(status_code=401)
            object.__setattr__(self.response, "json", MagicMock(return_value={}))

    class RetryResponse:
        """HTTP 403 response with an empty JSON body."""

        def __init__(self):
            self.response = Response(status_code=403)
            object.__setattr__(self.response, "json", MagicMock(return_value={}))

    class MalformedResponse:
        """HTTP 200 response whose items lack usable ``text_matches`` fragments."""

        def __init__(self):
            self.response = Response(status_code=200)
            object.__setattr__(
                self.response,
                "json",
                MagicMock(
                    return_value={
                        "items": [
                            {"fail": True},
                            {"text_matches": []},
                            {"text_matches": [{"weird": "result"}]},
                        ]
                    }
                ),
            )

    @pytest.fixture(autouse=True)
    def _bind_monkeypatch(self, monkeypatch):
        """Keep the per-test ``monkeypatch`` on the instance for ``_use_github_key``."""
        self._monkeypatch = monkeypatch

    def _use_github_key(self, value):
        """Make ``Core.github_key()`` return *value* until the current test ends."""
        self._monkeypatch.setattr(Core, "github_key", MagicMock(return_value=value))

    @pytest.mark.asyncio
    async def test_missing_key(self):
        """Constructing the search without an API key raises ``MissingKey``."""
        # Set-up stays outside the ``raises`` block so only the constructor may satisfy it.
        self._use_github_key(None)
        with pytest.raises(MissingKey):
            githubcode.SearchGithubCode(word="test", limit=500)

    @pytest.mark.asyncio
    async def test_fragments_from_response(self):
        """Fragments are extracted from every text match of a well-formed response."""
        self._use_github_key("test_key")
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = await test_class_instance.fragments_from_response(
            self.OkResponse().response.json()
        )
        assert test_result == ["test1", "test2"]

    @pytest.mark.asyncio
    async def test_invalid_fragments_from_response(self):
        """Items without usable fragments yield an empty list."""
        self._use_github_key("test_key")
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = await test_class_instance.fragments_from_response(
            self.MalformedResponse().response.json()
        )
        assert test_result == []

    @pytest.mark.asyncio
    async def test_next_page(self):
        """A result with next_page=2 makes ``next_page_or_end`` return 2."""
        self._use_github_key("test_key")
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = githubcode.SuccessResult([], next_page=2, last_page=4)
        assert await test_class_instance.next_page_or_end(test_result) == 2

    @pytest.mark.asyncio
    async def test_last_page(self):
        """A result with next_page=0 makes ``next_page_or_end`` return 0."""
        self._use_github_key("test_key")
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = githubcode.SuccessResult([], 0, 0)
        assert await test_class_instance.next_page_or_end(test_result) == 0

    @pytest.mark.asyncio
    async def test_infinite_loop_fix_page_zero(self):
        """Test that the loop condition properly exits when page becomes 0"""
        self._use_github_key("test_key")
        # Construction with a configured key must succeed; the instance itself is not needed.
        githubcode.SearchGithubCode(word="test", limit=500)

        # Test the fixed condition: page != 0
        page = 0
        counter = 0
        limit = 10

        # The condition should be False when page is 0, preventing infinite loop
        condition_result = counter <= limit and page != 0
        assert condition_result is False, "Loop should exit when page is 0"

    @pytest.mark.asyncio
    async def test_infinite_loop_fix_page_nonzero(self):
        """Test that the loop condition continues when page is non-zero"""
        self._use_github_key("test_key")
        # Construction with a configured key must succeed; the instance itself is not needed.
        githubcode.SearchGithubCode(word="test", limit=500)

        # Test with non-zero page values
        for page in [1, 2, 3, 10]:
            counter = 0
            limit = 10

            # The condition should be True when page is non-zero
            condition_result = counter <= limit and page != 0
            assert condition_result is True, f"Loop should continue when page is {page}"

    @pytest.mark.asyncio
    async def test_infinite_loop_fix_old_vs_new_condition(self):
        """Test that demonstrates the difference between old and new conditions"""
        self._use_github_key("test_key")
        # Construction with a configured key must succeed; the instance itself is not needed.
        githubcode.SearchGithubCode(word="test", limit=500)

        page = 0
        counter = 0
        limit = 10

        # Old problematic condition (would cause infinite loop)
        old_condition = counter <= limit and page is not None

        # New fixed condition (properly exits)
        new_condition = counter <= limit and page != 0

        # Old condition would be True (causing infinite loop)
        assert old_condition is True, "Old condition would cause infinite loop when page=0"

        # New condition is False (properly exits)
        assert new_condition is False, "New condition properly exits when page=0"
should exceed max_retries before stopping" @pytest.mark.asyncio async def test_process_stops_on_error_result(self, monkeypatch): Core.github_key = MagicMock(return_value="test_key") # type: ignore[method-assign] inst = githubcode.SearchGithubCode(word="test", limit=10) monkeypatch.setattr(githubcode, "get_delay", lambda: 0, raising=False) monkeypatch.setattr(asyncio, "sleep", AsyncMock(return_value=None)) # Force ErrorResult monkeypatch.setattr( inst, "handle_response", AsyncMock(return_value=githubcode.ErrorResult(500, "err")), ) monkeypatch.setattr( inst, "do_search", AsyncMock(return_value=("", {}, 500, {})), ) await inst.process() assert inst.page == 0, "Process should stop on error result to avoid infinite loop" @pytest.mark.asyncio async def test_process_breaks_on_same_page_pagination(self, monkeypatch): Core.github_key = MagicMock(return_value="test_key") # type: ignore[method-assign] inst = githubcode.SearchGithubCode(word="test", limit=10) monkeypatch.setattr(githubcode, "get_delay", lambda: 0, raising=False) monkeypatch.setattr(asyncio, "sleep", AsyncMock(return_value=None)) # Force SuccessResult that does not advance the page monkeypatch.setattr( inst, "handle_response", AsyncMock(return_value=githubcode.SuccessResult([], next_page=1, last_page=0)), ) monkeypatch.setattr( inst, "do_search", AsyncMock(return_value=("", {"items": []}, 200, {})), ) await inst.process() assert inst.page == 0, "Process should stop when pagination does not advance" ================================================ FILE: tests/discovery/test_otx.py ================================================ #!/usr/bin/env python3 # coding=utf-8 import os from typing import Optional import httpx import pytest from theHarvester.discovery import otxsearch from theHarvester.lib.core import * github_ci: Optional[str] = os.getenv( "GITHUB_ACTIONS" ) # Github set this to be the following: true instead of True class TestOtx(object): @staticmethod def domain() -> str: return "apple.com" 
@pytest.mark.asyncio
async def test_do_search_uses_people_data_endpoint_and_start_pagination(monkeypatch) -> None:
    """Two pages (100 + 50 profiles) are requested and every link/email is collected."""
    endpoint = 'https://api.rocketreach.co/api/v2/person/search'
    domain_query = {'current_employer_domain': ['example.com']}
    recorded = []

    def build_profiles(indices):
        # One synthetic profile per index, shaped like the stubbed API payload.
        return [
            {
                'linkedin_url': f'https://linkedin.com/in/user{index}',
                'emails': [{'email': f'user{index}@example.com'}],
            }
            for index in indices
        ]

    async def fake_sleep(_seconds):
        return None

    async def fake_post_fetch(url, headers=None, data=None, json=False, **kwargs):
        recorded.append((url, headers, data, json, kwargs))
        # First call serves page 1 (users 0-99); any later call serves page 2.
        if len(recorded) == 1:
            page, indices = 1, range(100)
        else:
            page, indices = 2, range(100, 150)
        return {
            'profiles': build_profiles(indices),
            'pagination': {'page': page, 'total': 150},
        }

    monkeypatch.setattr(rocketreach.Core, 'rocketreach_key', lambda: 'test-key')
    monkeypatch.setattr(rocketreach.Core, 'get_user_agent', lambda: 'test-agent')
    monkeypatch.setattr(rocketreach, 'get_delay', lambda: 0)
    monkeypatch.setattr(rocketreach.asyncio, 'sleep', fake_sleep)
    monkeypatch.setattr(rocketreach.AsyncFetcher, 'post_fetch', fake_post_fetch)

    search = rocketreach.SearchRocketReach('example.com', 150)
    await search.process()

    assert len(recorded) == 2
    first_call, second_call = recorded
    first_url, first_headers, first_data, first_json = first_call[:4]
    second_url, second_data = second_call[0], second_call[2]

    assert first_url == endpoint
    assert second_url == endpoint
    assert first_headers['Api-Key'] == 'test-key'
    assert first_headers['User-Agent'] == 'test-agent'
    assert first_json is True
    assert first_data == {'query': domain_query, 'start': 0, 'page_size': 100}
    assert second_data == {'query': domain_query, 'start': 100, 'page_size': 50}

    links = await search.get_links()
    emails = await search.get_emails()
    assert len(links) == 150
    assert len(emails) == 150
    for user in ('user0', 'user149'):
        assert f'https://linkedin.com/in/{user}' in links
    for user in ('user0', 'user149'):
        assert f'{user}@example.com' in emails
class TestShodanEngine:
    """End-to-end check of the ``-b shodan`` CLI path with network and storage stubbed."""

    @pytest.mark.asyncio
    async def test_shodan_engine_processes_without_work_item_error_and_yields_hostnames(self, monkeypatch, capsys):
        # Import inside the test so monkeypatching affects the already-imported module namespace.
        import theHarvester.__main__ as main_module

        class DummyStashManager:
            """Stand-in stash that avoids filesystem/sqlite side effects."""

            async def do_init(self) -> None:
                pass

            async def store_all(self, domain, all, res_type, source) -> None:  # noqa: A002
                pass

        class DummySearchShodan:
            """Stand-in Shodan client that needs neither network nor API key."""

            async def search_ip(self, ip):
                hostnames = ["a.example.com", "b.example.com"]
                return OrderedDict({ip: {"hostnames": hostnames}})

        cli_args = ["theHarvester", "-d", "example.com", "-b", "shodan"]

        # Make DNS resolution deterministic and offline.
        monkeypatch.setattr(socket, "gethostbyname", lambda _domain: "1.2.3.4", raising=True)
        monkeypatch.setattr(main_module.stash, "StashManager", DummyStashManager, raising=True)
        monkeypatch.setattr(main_module.shodansearch, "SearchShodan", DummySearchShodan, raising=True)
        # Run the CLI path that uses the engine queue/worker (`-b shodan`).
        monkeypatch.setattr(sys, "argv", cli_args, raising=True)

        with pytest.raises(SystemExit) as excinfo:
            await main_module.start()
        assert excinfo.value.code == 0

        printed = capsys.readouterr().out
        assert 'A error occurred while processing a "work item"' not in printed
        for expected_host in ("a.example.com", "b.example.com"):
            assert expected_host in printed
class TestThcApi:
    """Live checks that the THC HTTP endpoints answer as expected."""

    @staticmethod
    def _get_or_skip(url):
        """GET *url* with the project user agent; skip the test on a network error."""
        headers = {'User-Agent': Core.get_user_agent()}
        try:
            return httpx.get(url, headers=headers, timeout=30)
        except (httpx.TimeoutException, httpx.RequestError):
            pytest.skip('Skipping due to network error')

    @pytest.mark.asyncio
    async def test_api_subdomains_download_endpoint_responds(self) -> None:
        """The subdomain download endpoint answers with HTTP 200."""
        response = self._get_or_skip(
            'https://ip.thc.org/api/v1/subdomains/download?domain=google.com&limit=10&hide_header=true'
        )
        assert response.status_code == 200

    @pytest.mark.asyncio
    async def test_api_subdomains_returns_text_format(self) -> None:
        """The download endpoint answers with a text-like body (or at least HTTP 200)."""
        response = self._get_or_skip(
            'https://ip.thc.org/api/v1/subdomains/download?domain=google.com&limit=5&hide_header=true'
        )
        content_type = response.headers.get('content-type', '')
        assert 'text' in content_type or 'octet-stream' in content_type or response.status_code == 200

    @pytest.mark.asyncio
    async def test_api_cli_subdomain_endpoint(self) -> None:
        """The CLI-style endpoint /sb/{domain} answers with HTTP 200."""
        response = self._get_or_skip('https://ip.thc.org/sb/google.com?l=5&noheader')
        assert response.status_code == 200

    @pytest.mark.asyncio
    async def test_api_returns_rate_limit_headers(self) -> None:
        """The response carries both rate-limit headers."""
        response = self._get_or_skip(
            'https://ip.thc.org/api/v1/subdomains/download?domain=example.com&limit=1&hide_header=true'
        )
        for header_name in ('x-ratelimit-limit', 'x-ratelimit-remaining'):
            assert header_name in response.headers
class TestThcEdgeCases:
    """Unusual inputs must still leave ``get_hostnames()`` returning a set."""

    @staticmethod
    async def _hostnames_for(domain, *, tolerate_errors):
        """Run a search for *domain* and return its hostnames.

        Network errors skip the test. Any other exception from ``process()`` is
        ignored when *tolerate_errors* is true and re-raised otherwise.
        """
        search = thc.SearchThc(domain)
        try:
            await search.process()
        except (httpx.TimeoutException, httpx.RequestError):
            pytest.skip('Skipping due to network error')
        except Exception:
            if not tolerate_errors:
                raise
        return await search.get_hostnames()

    @pytest.mark.asyncio
    async def test_search_nonexistent_domain(self) -> None:
        """A domain that does not exist still yields a set."""
        hostnames = await self._hostnames_for(
            'this-domain-definitely-does-not-exist-12345.com', tolerate_errors=True
        )
        assert isinstance(hostnames, set)

    @pytest.mark.asyncio
    async def test_search_empty_domain(self) -> None:
        """An empty domain string still yields a set."""
        hostnames = await self._hostnames_for('', tolerate_errors=True)
        assert isinstance(hostnames, set)

    @pytest.mark.asyncio
    async def test_search_special_characters_domain(self) -> None:
        """A domain containing special characters still yields a set."""
        hostnames = await self._hostnames_for('example.com; DROP TABLE domains;--', tolerate_errors=True)
        assert isinstance(hostnames, set)

    @pytest.mark.asyncio
    async def test_search_unicode_domain(self) -> None:
        """A punycode (IDN) domain still yields a set."""
        hostnames = await self._hostnames_for('xn--mnchen-3ya.de', tolerate_errors=True)
        assert isinstance(hostnames, set)

    @pytest.mark.asyncio
    async def test_search_subdomain_as_input(self) -> None:
        """A subdomain passed as the target yields a set; unexpected errors propagate."""
        hostnames = await self._hostnames_for('www.google.com', tolerate_errors=False)
        assert isinstance(hostnames, set)
class TestThcInitialization:
    """Checks on the attributes and methods a fresh ``SearchThc`` exposes."""

    def test_init_sets_word(self) -> None:
        """The constructor argument is stored as ``word``."""
        target = 'test.com'
        assert thc.SearchThc(target).word == target

    def test_init_creates_empty_results(self) -> None:
        """``results`` exists and starts out empty."""
        instance = thc.SearchThc('test.com')
        assert hasattr(instance, 'results')
        assert len(instance.results) == 0

    def test_init_proxy_default_false(self) -> None:
        """``proxy`` is ``False`` on a new instance."""
        instance = thc.SearchThc('test.com')
        assert instance.proxy is False

    def test_init_has_rate_limit_settings(self) -> None:
        """Retry settings exist with the expected values."""
        instance = thc.SearchThc('test.com')
        for attribute in ('max_retries', 'base_delay'):
            assert hasattr(instance, attribute)
        assert instance.max_retries == 3
        assert instance.base_delay == 2

    def test_class_has_required_methods(self) -> None:
        """``do_search``, ``get_hostnames`` and ``process`` exist and are callable."""
        instance = thc.SearchThc('test.com')
        method_names = ('do_search', 'get_hostnames', 'process')
        for method_name in method_names:
            assert hasattr(instance, method_name)
        for method_name in method_names:
            assert callable(getattr(instance, method_name))
class TestThcResponseFormat:
    """Checks on the shape of the hostnames a live search returns."""

    @staticmethod
    def domain() -> str:
        return 'github.com'

    async def _fetch_hostnames(self):
        """Run a search for ``domain()`` and return its hostnames; skip on network errors."""
        search = thc.SearchThc(self.domain())
        try:
            await search.process()
        except (httpx.TimeoutException, httpx.RequestError):
            pytest.skip('Skipping due to network error')
        return await search.get_hostnames()

    @pytest.mark.asyncio
    async def test_hostnames_are_strings(self) -> None:
        """Every hostname is a ``str``."""
        hostnames = await self._fetch_hostnames()
        assert all(isinstance(hostname, str) for hostname in hostnames)

    @pytest.mark.asyncio
    async def test_hostnames_are_valid_format(self) -> None:
        """No hostname contains a space, newline or tab."""
        hostnames = await self._fetch_hostnames()
        for hostname in hostnames:
            for forbidden in (' ', '\n', '\t'):
                assert forbidden not in hostname

    @pytest.mark.asyncio
    async def test_hostnames_are_lowercase(self) -> None:
        """Every hostname equals its lower-cased form."""
        hostnames = await self._fetch_hostnames()
        assert all(hostname == hostname.lower() for hostname in hostnames)
Integration Tests with theHarvester # ============================================================================= @pytest.mark.skipif(github_ci == 'true', reason='Skip integration tests in CI') class TestThcIntegration: """Integration tests with theHarvester framework.""" @pytest.mark.asyncio async def test_module_can_be_imported(self) -> None: """Verify that the module can be imported.""" from theHarvester.discovery import thc as thc_module assert thc_module is not None @pytest.mark.asyncio async def test_search_class_exists(self) -> None: """Verify that SearchThc class exists.""" from theHarvester.discovery import thc as thc_module assert hasattr(thc_module, 'SearchThc') @pytest.mark.asyncio async def test_compatible_with_store_function(self) -> None: """Verify compatibility with store function from __main__.py.""" search = thc.SearchThc('example.com') assert hasattr(search, 'process') assert hasattr(search, 'get_hostnames') if __name__ == '__main__': pytest.main() ================================================ FILE: tests/lib/test_core.py ================================================ from __future__ import annotations from pathlib import Path from typing import Any from unittest import mock import pytest import yaml import theHarvester.lib.core as core_module from theHarvester.lib.core import CONFIG_DIRS, DATA_DIR, AsyncFetcher, Core @pytest.fixture(autouse=True) def mock_environ(monkeypatch, tmp_path: Path): monkeypatch.setenv("HOME", str(tmp_path)) def mock_read_text(mocked: dict[Path, str | Exception]): read_text = Path.read_text def _read_text(self: Path, *args, **kwargs): if result := mocked.get(self): if isinstance(result, Exception): raise result return result return read_text(self, *args, **kwargs) return _read_text @pytest.mark.parametrize( ("name", "contents", "expected"), [ ("api-keys", "apikeys: {}", {}), ("proxies", "http: [localhost:8080]", {"http": ["http://localhost:8080"], "socks5": []}), ], ) @pytest.mark.parametrize("dir", CONFIG_DIRS) 
class DummyResponse:
    """Minimal async-context-manager response with canned ``text()`` / ``json()`` results."""

    def __init__(self, text_value: str = 'response-text', json_value: Any = None):
        # Only None selects the default payload; falsy values such as [] are kept.
        if json_value is None:
            json_value = {'ok': True}
        self.text_value = text_value
        self.json_value = json_value

    async def __aenter__(self):
        """Enter ``async with`` and hand back this same object."""
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Leave ``async with`` without suppressing any exception."""
        return False

    async def text(self):
        """Return the canned text body."""
        return self.text_value

    async def json(self):
        """Return the canned JSON payload."""
        return self.json_value
def test_api_keys_yaml_is_in_sync_with_core_accessors():
    """Every provider/field in ``Core._API_KEY_FIELDS`` appears in the bundled api-keys.yaml."""
    required = core_module.Core._API_KEY_FIELDS
    assert required, "No API-key references were detected in `Core`"

    yaml_text = (DATA_DIR / "api-keys.yaml").read_text(encoding="utf-8")
    apikeys = yaml.safe_load(yaml_text)["apikeys"]

    missing_providers = sorted(provider for provider in required if provider not in apikeys)
    assert not missing_providers, f"Missing providers in api-keys.yaml: {missing_providers}"

    # Keep only providers that actually lack at least one field.
    missing_fields: dict[str, list[str]] = {}
    for provider, fields in required.items():
        configured = apikeys[provider]
        gaps = [field for field in sorted(fields) if field not in configured]
        if gaps:
            missing_fields[provider] = gaps
    assert not missing_fields, f"Missing fields in api-keys.yaml: {missing_fields}"
@pytest.mark.asyncio
async def test_fetch_creates_session_with_default_headers(monkeypatch) -> None:
    """``AsyncFetcher.fetch`` without a proxy: one session, default headers, one GET."""
    reset_dummy_sessions()

    replacements = (
        (core_module.aiohttp, 'ClientSession', DummySession),
        (core_module.ssl, 'create_default_context', lambda cafile=None: 'ssl-context'),
        (core_module.certifi, 'where', lambda: '/tmp/cacert.pem'),
        (core_module.asyncio, 'sleep', fake_sleep),
        (Core, 'get_user_agent', staticmethod(lambda: 'test-agent')),
    )
    for target, attribute, replacement in replacements:
        monkeypatch.setattr(target, attribute, replacement)

    body = await AsyncFetcher.fetch(url='https://example.com', follow_redirects=False)

    assert body == 'response-text'
    assert len(DummySession.instances) == 1

    created = DummySession.instances[0]
    expected_request = (
        'GET',
        'https://example.com',
        {'ssl': 'ssl-context', 'allow_redirects': False},
    )
    assert created.headers == {'User-Agent': 'test-agent'}
    assert created.closed is True
    assert created.requests == [expected_request]
@pytest.mark.asyncio
async def test_post_fetch_decodes_string_payload_and_posts_params(monkeypatch) -> None:
    """``post_fetch`` with a JSON string payload records a POST with the decoded dict and params."""
    reset_dummy_sessions()

    def fake_ssl_context(cafile=None):
        return 'ssl-context'

    def fake_ca_bundle():
        return '/tmp/cacert.pem'

    monkeypatch.setattr(core_module.aiohttp, 'ClientSession', DummySession)
    monkeypatch.setattr(core_module.asyncio, 'sleep', fake_sleep)
    monkeypatch.setattr(core_module.ssl, 'create_default_context', fake_ssl_context)
    monkeypatch.setattr(core_module.certifi, 'where', fake_ca_bundle)
    monkeypatch.setattr(Core, 'get_user_agent', staticmethod(lambda: 'test-agent'))

    endpoint = 'https://example.com/api'
    payload = await AsyncFetcher.post_fetch(
        endpoint,
        data='{"query": "example"}',
        params={'page': 2},
        json=True,
    )
    assert payload == {'ok': True}

    created = DummySession.instances[0]
    assert created.headers == {'User-Agent': 'test-agent'}

    expected_kwargs = {'data': {'query': 'example'}, 'ssl': 'ssl-context', 'params': {'page': 2}}
    assert created.requests == [('POST', endpoint, expected_kwargs)]
class TestHackerTargetApiKey:
    """``SearchHackerTarget.do_search`` with and without a configured API key."""

    @staticmethod
    async def _search_with(monkeypatch, key, url_is_valid, responses):
        """Run ``do_search`` for example.com with a stubbed key and ``fetch_all``.

        The stub asserts ``url_is_valid`` for every requested URL and answers
        with ``responses``. The finished search object is returned.
        """
        # Make Core.hackertarget_key return the chosen key (or None).
        monkeypatch.setattr(Core, "hackertarget_key", lambda: key)

        async def fake_fetch_all(urls, headers=None, proxy=False):
            assert all(url_is_valid(u) for u in urls)
            return responses

        monkeypatch.setattr(ht_mod.AsyncFetcher, "fetch_all", fake_fetch_all)

        search = ht_mod.SearchHackerTarget("example.com")
        await search.do_search()
        return search

    @pytest.mark.asyncio
    async def test_do_search_with_apikey(self, monkeypatch):
        """With a key, every URL carries it and results use ':' in place of ','."""
        search = await self._search_with(
            monkeypatch,
            key="TESTKEY",
            url_is_valid=lambda u: "apikey=TESTKEY" in u,
            responses=["1.2.3.4,host.example.com\n", "No PTR records found\n"],
        )
        assert "1.2.3.4:host.example.com" in search.total_results

    @pytest.mark.asyncio
    async def test_do_search_without_apikey(self, monkeypatch):
        """Without a key, no URL carries an ``apikey=`` parameter."""
        search = await self._search_with(
            monkeypatch,
            key=None,
            url_is_valid=lambda u: "apikey=" not in u,
            responses=["1.2.3.4,host.example.com\n"],
        )
        assert "1.2.3.4:host.example.com" in search.total_results
class TestMyParser:
    """Tests for ``myparser.Parser``."""

    @pytest.mark.asyncio
    async def test_emails(self) -> None:
        """Only the two well-formed addresses for the target domain are expected.

        The input mixes an address without a local part, an address with a
        truncated domain and an unrelated host name with two valid addresses.
        """
        word = "domain.com"
        results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***"
        parse = myparser.Parser(results, word)
        emails = sorted(await parse.emails())
        # Compare with ``==``: ``assert emails, [...]`` treats the list as the
        # failure message and therefore passes for any non-empty result.
        assert emails == ["c@domain.com", "d@sub.domain.com"]
""" from theHarvester.lib.api.api import app # Find CORS middleware in the app cors_middleware = None for middleware in app.user_middleware: if 'CORSMiddleware' in str(middleware.cls): cors_middleware = middleware break assert cors_middleware is not None, 'CORS middleware should be configured' # Check that if allow_origins contains '*', allow_credentials must be False # Access kwargs from the middleware options = cors_middleware.kwargs allow_origins = options.get('allow_origins', []) allow_credentials = options.get('allow_credentials', False) if isinstance(allow_origins, (list, tuple, set)) and '*' in allow_origins: assert ( allow_credentials is False ), 'CRITICAL: CORS must not allow credentials with wildcard origins (CVE risk)' def test_cors_restricts_http_methods(self): """ Security Test: CORS should restrict HTTP methods to only what's needed. Reduces attack surface by limiting available methods. """ from theHarvester.lib.api.api import app cors_middleware = None for middleware in app.user_middleware: if 'CORSMiddleware' in str(middleware.cls): cors_middleware = middleware break assert cors_middleware is not None options = cors_middleware.kwargs allow_methods = options.get('allow_methods', []) # Should not allow all methods assert allow_methods != ['*'], 'CORS should restrict HTTP methods, not allow all (*)' # Should only allow necessary methods (GET, POST for this API) if isinstance(allow_methods, list): dangerous_methods = {'DELETE', 'PUT', 'PATCH', 'TRACE', 'CONNECT'} allowed_set = {m.upper() for m in allow_methods} assert not ( allowed_set & dangerous_methods ), f'Unnecessary HTTP methods detected: {allowed_set & dangerous_methods}' class TestXMLInjectionPrevention: """Test XML injection prevention.""" def test_sanitize_for_xml_escapes_special_characters(self): """ Security Test: Verify XML special characters are properly escaped. Prevents XML injection attacks. 
""" # Test all XML special characters test_cases = [ ('&', '&'), ('<', '<'), ('>', '>'), ('"', '"'), ("'", '''), ('', '<script>alert("XSS")</script>'), ('user@example.com & ', 'user@example.com & <test>'), ('Normal text', 'Normal text'), ] for input_text, expected_output in test_cases: result = sanitize_for_xml(input_text) assert result == expected_output, f'Failed to properly escape: {input_text}' def test_sanitize_for_xml_prevents_xml_entity_injection(self): """ Security Test: Prevent XML entity injection attempts. """ malicious_inputs = [ ']>', '', '', '<script>', ] for malicious_input in malicious_inputs: result = sanitize_for_xml(malicious_input) # Ensure dangerous characters are escaped assert '<' in result or '&' in result, f'Failed to sanitize: {malicious_input}' assert '<' not in result or result == malicious_input.replace('<', '<'), f'XML tags not escaped: {malicious_input}' def test_command_line_args_are_sanitized_in_xml_output(self): """ Security Test: Command line arguments must be sanitized before XML output. This test is a conceptual check - in real usage, ensure the XML writing code uses sanitize_for_xml() on all user-controlled data. """ # Simulate dangerous command line arguments dangerous_args = [ '--domain=test.com', "--source=''", '--output="; rm -rf /', '--domain=example.com¶m=', ] for arg in dangerous_args: sanitized = sanitize_for_xml(arg) # Verify no unescaped XML special characters remain assert '