Full Code of jina-ai/reader for AI

main 5f07900eabe0 cached

72 files

713.6 KB

186.7k tokens

396 symbols

1 requests

Download .txt

Showing preview only (742K chars total). Download the full file or copy to clipboard to get everything.

Repository: jina-ai/reader
Branch: main
Commit: 5f07900eabe0
Files: 72
Total size: 713.6 KB

Directory structure:
gitextract_azcmq_sa/

├── .github/
│   └── workflows/
│       ├── .keep
│       └── cd.yml
├── .gitignore
├── .gitmodules
├── .vscode/
│   ├── exensions.json
│   ├── launch.json
│   ├── settings.json
│   └── tasks.json
├── Dockerfile
├── LICENSE
├── README.md
├── integrity-check.cjs
├── package.json
├── public/
│   └── robots.txt
├── src/
│   ├── api/
│   │   ├── crawler.ts
│   │   ├── searcher.ts
│   │   └── serp.ts
│   ├── cloud-functions/
│   │   ├── adaptive-crawler.ts
│   │   └── data-crunching.ts
│   ├── db/
│   │   ├── adaptive-crawl-task.ts
│   │   ├── crawled.ts
│   │   ├── domain-blockade.ts
│   │   ├── domain-profile.ts
│   │   ├── img-alt.ts
│   │   ├── pdf.ts
│   │   └── searched.ts
│   ├── dto/
│   │   ├── adaptive-crawler-options.ts
│   │   ├── crawler-options.ts
│   │   ├── jina-embeddings-auth.ts
│   │   └── turndown-tweakable-options.ts
│   ├── fetch.d.ts
│   ├── lib/
│   │   └── transform-server-event-stream.ts
│   ├── services/
│   │   ├── alt-text.ts
│   │   ├── async-context.ts
│   │   ├── blackhole-detector.ts
│   │   ├── brave-search.ts
│   │   ├── canvas.ts
│   │   ├── cf-browser-rendering.ts
│   │   ├── curl.ts
│   │   ├── errors.ts
│   │   ├── finalizer.ts
│   │   ├── geoip.ts
│   │   ├── jsdom.ts
│   │   ├── lm.ts
│   │   ├── logger.ts
│   │   ├── minimal-stealth.js
│   │   ├── misc.ts
│   │   ├── pdf-extract.ts
│   │   ├── pseudo-transfer.ts
│   │   ├── puppeteer.ts
│   │   ├── registry.ts
│   │   ├── robots-text.ts
│   │   ├── serp/
│   │   │   ├── compat.ts
│   │   │   ├── google.ts
│   │   │   ├── internal.ts
│   │   │   ├── puppeteer.ts
│   │   │   └── serper.ts
│   │   ├── serper-search.ts
│   │   ├── snapshot-formatter.ts
│   │   ├── temp-file.ts
│   │   └── threaded.ts
│   ├── stand-alone/
│   │   ├── crawl.ts
│   │   ├── search.ts
│   │   └── serp.ts
│   ├── types.d.ts
│   └── utils/
│       ├── encoding.ts
│       ├── get-function-url.ts
│       ├── ip.ts
│       ├── markdown.ts
│       ├── misc.ts
│       └── tailwind-classes.ts
└── tsconfig.json

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/.keep
================================================


================================================
FILE: .github/workflows/cd.yml
================================================
# Continuous delivery: build the app, bake it into a container image, push the
# image to Artifact Registry and roll it out to every Cloud Run service.
run-name: Build push and deploy (CD)
on:
  push:
    branches:
      - main
      - ci-debug
      - dev
    tags:
      - '*'

jobs:
  build-and-push-to-gcr:
    runs-on: ubuntu-latest
    concurrency:
      # Branch pushes are grouped per ref so a newer push cancels the older run.
      # NOTE(review): for tag pushes this expression evaluates to `false`, so all
      # tag builds share a single group and cancel each other — confirm intended.
      group: ${{ github.ref_type == 'branch' && github.ref }}
      cancel-in-progress: true
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          lfs: true
          submodules: true
          token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }}
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
        with:
          install_components: beta
      - name: "Docker auth"
        run: |-
          gcloud auth configure-docker us-docker.pkg.dev --quiet
      # Strip the leading "refs/<kind>/" so RELEASE_VERSION is the bare branch or tag name.
      - name: Set controller release version
        run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 22.12.0
          cache: npm

      - name: npm install
        run: npm ci
      # -f makes curl exit non-zero on HTTP errors instead of saving the error
      # page as the asset; -L follows redirects; -sS keeps logs quiet but still
      # prints failures.
      - name: get maxmind mmdb
        run: mkdir -p licensed && curl -fsSL -o licensed/GeoLite2-City.mmdb https://raw.githubusercontent.com/P3TERX/GeoLite.mmdb/download/GeoLite2-City.mmdb
      - name: get source han sans font
        run: mkdir -p licensed && curl -fsSL -o licensed/SourceHanSansSC-Regular.otf https://raw.githubusercontent.com/adobe-fonts/source-han-sans/refs/heads/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf
      - name: build application
        run: npm run build
      # Only tag pushes stamp the tag name into package.json as the version.
      - name: Set package version
        run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }}
        if: github.ref_type == 'tag'
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            us-docker.pkg.dev/reader-6b7dc/jina-reader/reader
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build and push
        id: container
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
      # Every deploy below uses the same image, referenced by the `imageid` output of
      # the build step; only the service name, entry script (--args) and region differ.
      - name: Deploy CRAWL with Tag
        run: |
          gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
      - name: Deploy SEARCH with Tag
        run: |
          gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
      - name: Deploy SERP with Tag
        run: |
          gcloud beta run deploy serp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
      - name: Deploy CRAWL-EU with Tag
        run: |
          gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
      - name: Deploy SEARCH-EU with Tag
        run: |
          gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
      - name: Deploy SERP-HK with Tag
        run: |
          gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2

================================================
FILE: .gitignore
================================================
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
firebase-debug.log*
firebase-debug.*.log*

# Firebase cache
.firebase/

# Firebase config

# Uncomment this if you'd like others to create their own Firebase project.
# For a team working on the same Firebase project(s), it is recommended to leave
# it commented so all members can deploy to the same project(s) in .firebaserc.
# .firebaserc

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# nyc test coverage
.nyc_output

# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.secret.local

toy*.ts

.DS_Store
build/
.firebase-emu/
*.log
.DS_Store

*.local
.secret.*
licensed/

================================================
FILE: .gitmodules
================================================
[submodule "thinapps-shared"]
	path = thinapps-shared
	url = git@github.com:jina-ai/thinapps-shared.git


================================================
FILE: .vscode/exensions.json
================================================
{
    "recommendations": [
        "editorconfig.editorconfig",
        "octref.vetur",
        "redhat.vscode-yaml",
        "dbaeumer.vscode-eslint",
        "esbenp.prettier-vscode",
        "streetsidesoftware.code-spell-checker"
    ]
}

================================================
FILE: .vscode/launch.json
================================================
{
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Attach",
      "port": 9229,
      "request": "attach",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node"
    },
    {
      "name": "Attach by Process ID",
      "processId": "${command:PickProcess}",
      "request": "attach",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node"
    },
    {
      "name": "Debug Stand Alone Crawl",
      "request": "launch",
      "runtimeArgs": [
        "--env-file=.secret.local",
      ],
      "env": {
        "GCLOUD_PROJECT": "reader-6b7dc",
        "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
      },
      "cwd": "${workspaceFolder}",
      "program": "build/stand-alone/crawl.js",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node",
      "outputCapture": "std",
      "preLaunchTask": "Backend:build:watch",
      "killBehavior": "forceful"
    },
    {
      "name": "Debug Stand Alone Crawl + Browser",
      "request": "launch",
      "runtimeArgs": [
        "--env-file=.secret.local",
      ],
      "env": {
        "GCLOUD_PROJECT": "reader-6b7dc",
        "DEBUG_BROWSER": "true",
        "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
      },
      "cwd": "${workspaceFolder}",
      "program": "build/stand-alone/crawl.js",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node",
      "outputCapture": "std",
      "preLaunchTask": "Backend:build:watch",
      "killBehavior": "forceful"
    },
    {
      "name": "Debug Stand Alone Crawl - EU",
      "request": "launch",
      "runtimeArgs": [
        "--env-file=.secret.local",
      ],
      "env": {
        "GCLOUD_PROJECT": "reader-6b7dc",
        "FIRESTORE_DATABASE": "reader-eu",
        "GCP_STORAGE_BUCKET": "reader-eu",
        "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
      },
      "cwd": "${workspaceFolder}",
      "program": "build/stand-alone/crawl.js",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node",
      "outputCapture": "std",
      "preLaunchTask": "Backend:build:watch",
      "killBehavior": "forceful"
    },
    {
      "name": "Debug Stand Alone Search",
      "request": "launch",
      "runtimeArgs": [
        "--env-file=.secret.local",
      ],
      "env": {
        "GCLOUD_PROJECT": "reader-6b7dc",
        "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
      },
      "cwd": "${workspaceFolder}",
      "program": "build/stand-alone/search.js",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node",
      "outputCapture": "std",
      "preLaunchTask": "Backend:build:watch",
      "killBehavior": "forceful"
    },
    {
      "name": "Debug Stand Alone SERP",
      "request": "launch",
      "runtimeArgs": [
        "--env-file=.secret.local",
      ],
      "env": {
        "GCLOUD_PROJECT": "reader-6b7dc",
        "PREFERRED_PROXY_COUNTRY": "hk",
        "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
        "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
      },
      "cwd": "${workspaceFolder}",
      "program": "build/stand-alone/serp.js",
      "skipFiles": [
        "<node_internals>/**"
      ],
      "type": "node",
      "outputCapture": "std",
      "preLaunchTask": "Backend:build:watch",
      "killBehavior": "forceful"
    },
  ]
}

================================================
FILE: .vscode/settings.json
================================================
{
    "editor.wordWrap": "on",
    "editor.wordWrapColumn": 120,
    "files.trimTrailingWhitespace": true,
    "files.trimFinalNewlines": true,
    "[javascript]": {
        "editor.defaultFormatter": "vscode.typescript-language-features"
    },
    "[jsonc]": {
        "editor.defaultFormatter": "vscode.json-language-features"
    },
    "[typescript]": {
        "editor.defaultFormatter": "vscode.typescript-language-features"
    },
    "[json]": {
        "editor.defaultFormatter": "vscode.json-language-features"
    },
    "[yaml]": {
        "editor.defaultFormatter": "redhat.vscode-yaml"
    },
    "[markdown]": {
        "files.trimTrailingWhitespace": false
    },
    "typescript.tsdk": "node_modules/typescript/lib",
    "typescript.preferences.quoteStyle": "single",
    "typescript.format.semicolons": "insert",
    "typescript.preferences.importModuleSpecifier": "project-relative",
    "typescript.locale": "en",
    "cSpell.enabled": true,
    "cSpell.words": [
    ],
}

================================================
FILE: .vscode/tasks.json
================================================
{
    "version": "2.0.0",
    "tasks": [
        {
            "type": "npm",
            "script": "build",
            "group": "build",
            "options": {
                "cwd": "${workspaceFolder}"
            },
            "problemMatcher": [],
            "label": "Backend:rebuild",
            "detail": "Backend:rebuild"
        },
        {
            "type": "typescript",
            "options": {
                "cwd": "${workspaceFolder}"
            },
            "tsconfig": "tsconfig.json",
            "option": "watch",
            "isBackground": true,
            "problemMatcher": [
                "$tsc-watch"
            ],
            "group": "build",
            "label": "Backend:build:watch"
        }
    ]
}

================================================
FILE: Dockerfile
================================================
# syntax=docker/dockerfile:1
# Donor stage: used only as the source of the prebuilt libcurl-impersonate shared library.
FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye AS curl-impersonate

FROM node:22

# Install Google Chrome plus extra fonts.
# The repository signing key is stored as a dedicated keyring and referenced via
# `signed-by` instead of the deprecated `apt-key add`, so the key is trusted for
# this one repository only.
RUN apt-get update \
    && apt-get install -y wget gnupg \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/googlechrome-linux-keyring.gpg \
    && sh -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/googlechrome-linux-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
    && apt-get update \
    && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 zstd \
    --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

# Referencing the stage by name keeps this working if stages are added or reordered.
COPY --from=curl-impersonate /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so

# Everything from here on runs as the unprivileged `jina` user.
RUN groupadd -r jina
RUN useradd -g jina  -G audio,video -m jina
USER jina

WORKDIR /app

# Install dependencies before copying build output so this layer is cached
# until package.json / package-lock.json change.
COPY package.json package-lock.json ./
RUN npm ci

# `build` and `licensed` are produced outside the image (see the CD workflow), not here.
COPY build ./build
COPY public ./public
COPY licensed ./licensed

RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium

# Runs the `dry-run` npm script at build time with the compile cache directory set.
RUN NODE_COMPILE_CACHE=node_modules npm run dry-run

ENV OVERRIDE_CHROME_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no
ENV NODE_COMPILE_CACHE=node_modules
ENV PORT=8080

EXPOSE 3000 3001 8080 8081
# The entry script is overridable per service (the CD workflow passes --args).
ENTRYPOINT ["node"]
CMD [ "build/stand-alone/crawl.js" ]


================================================
FILE: LICENSE
================================================
Copyright 2020-2024 Jina AI Limited.  All rights reserved.


                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   Copyright 2020-2021 Jina AI Limited

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Reader

Your LLMs deserve better input.

Reader does two things:
- **Read**: It converts any URL to an **LLM-friendly** input with `https://r.jina.ai/https://your.url`. Get improved output for your agent and RAG systems at no cost.
- **Search**: It searches the web for a given query with `https://s.jina.ai/your+query`. This allows your LLMs to access the latest world knowledge from the web.

Check out [the live demo](https://jina.ai/reader#demo)

Or just visit these URLs (**Read**) https://r.jina.ai/https://github.com/jina-ai/reader, (**Search**) https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F and see for yourself.

> Feel free to use Reader API in production. It is free, stable and scalable. We are maintaining it actively as one of the core products of Jina AI. [Check out rate limit](https://jina.ai/reader#pricing)

<img width="973" alt="image" src="https://github.com/jina-ai/reader/assets/2041322/2067c7a2-c12e-4465-b107-9a16ca178d41">
<img width="973" alt="image" src="https://github.com/jina-ai/reader/assets/2041322/675ac203-f246-41c2-b094-76318240159f">


## Updates

- **2024-07-15**: To restrict the results of `s.jina.ai` to certain domain/website, you can set e.g. `site=jina.ai` in the query parameters, which enables in-site search. For more options, [try our updated live-demo](https://jina.ai/reader/#apiform).
- **2024-05-30**: Reader can now read arbitrary PDFs from any URL! Check out [this PDF result from NASA.gov](https://r.jina.ai/https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf) vs [the original](https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf).
- **2024-05-15**: We introduced a new endpoint `s.jina.ai` that searches the web and returns the top-5 results, each in an LLM-friendly format. [Read more about this new feature here](https://jina.ai/news/jina-reader-for-search-grounding-to-improve-factuality-of-llms).
- **2024-05-08**: Image caption is off by default for better latency. To turn it on, set `x-with-generated-alt: true` in the request header.
- **2024-04-24**: You now have more fine-grained control over Reader API [using headers](#using-request-headers), e.g. forwarding cookies, using HTTP proxy.
- **2024-04-15**: Reader now supports image reading! It captions all images at the specified URL and adds `Image [idx]: [caption]` as an alt tag (if they initially lack one). This enables downstream LLMs to interact with the images in reasoning, summarizing etc. [See example here](https://x.com/JinaAI_/status/1780094402071023926).

## Usage

### Using `r.jina.ai` for single URL fetching
Simply prepend `https://r.jina.ai/` to any URL. For example, to convert the URL `https://en.wikipedia.org/wiki/Artificial_intelligence` to an LLM-friendly input, use the following URL:

[https://r.jina.ai/https://en.wikipedia.org/wiki/Artificial_intelligence](https://r.jina.ai/https://en.wikipedia.org/wiki/Artificial_intelligence)

### [Using `r.jina.ai` for a full website fetching (Google Colab)](https://colab.research.google.com/drive/1uoBy6_7BhxqpFQ45vuhgDDDGwstaCt4P#scrollTo=5LQjzJiT9ewT)

### Using `s.jina.ai` for web search
Simply prepend `https://s.jina.ai/` to your search query. Note that if you are using this in the code, make sure to encode your search query first, e.g. if your query is `Who will win 2024 US presidential election?` then your url should look like:

[https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F](https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F)

Behind the scenes, Reader searches the web, fetches the top 5 results, visits each URL, and applies `r.jina.ai` to it. This is different from many `web search function-calling` in agent/RAG frameworks, which often return only the title, URL, and description provided by the search engine API. If you want to read one result more deeply, you have to fetch the content yourself from that URL. With Reader, `http://s.jina.ai` automatically fetches the content from the top 5 search result URLs for you (reusing the tech stack behind `http://r.jina.ai`). This means you don't have to handle browser rendering, blocking, or any issues related to JavaScript and CSS yourself.

### Using `s.jina.ai` for in-site search
Simply specify `site` in the query parameters such as:

```bash
curl 'https://s.jina.ai/When%20was%20Jina%20AI%20founded%3F?site=jina.ai&site=github.com'
```

### [Interactive Code Snippet Builder](https://jina.ai/reader#apiform)

We highly recommend using the code builder to explore different parameter combinations of the Reader API.

<a href="https://jina.ai/reader#apiform"><img width="973" alt="image" src="https://github.com/jina-ai/reader/assets/2041322/a490fd3a-1c4c-4a3f-a95a-c481c2a8cc8f"></a>


### Using request headers

As you have already seen above, one can control the behavior of the Reader API using request headers. Here is a complete list of supported headers.

- You can enable the image caption feature via the `x-with-generated-alt: true` header.
- You can ask the Reader API to forward cookies settings via the `x-set-cookie` header.
  - Note that requests with cookies will not be cached.
- You can bypass `readability` filtering via the `x-respond-with` header, specifically:
  - `x-respond-with: markdown` returns markdown *without* going through `readability`
  - `x-respond-with: html` returns `documentElement.outerHTML`
  - `x-respond-with: text` returns `document.body.innerText`
  - `x-respond-with: screenshot` returns the URL of the webpage's screenshot
- You can specify a proxy server via the `x-proxy-url` header.
- You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds).
- You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent of `x-cache-tolerance: 0`).
- If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page.
  - By setting `x-target-selector` header to a CSS selector, the Reader API returns the content within the matched element, instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target.
  - By setting `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you already specified `x-target-selector`, this header can be omitted if you plan to wait for the same element.

### Using `r.jina.ai` for single page application (SPA) fetching
Many websites nowadays rely on JavaScript frameworks and client-side rendering; these are usually known as Single Page Applications (SPAs). Thanks to [Puppeteer](https://github.com/puppeteer/puppeteer) and headless Chrome browser, Reader natively supports fetching these websites. However, due to the specific ways some SPAs are developed, there may be some extra precautions to take.

#### SPAs with hash-based routing
By definition of the web standards, content that comes after `#` in a URL is not sent to the server. To mitigate this issue, use the `POST` method with the `url` parameter in the body.

```bash
curl -X POST 'https://r.jina.ai/' -d 'url=https://example.com/#/route' 
```

#### SPAs with preloading contents
Some SPAs, or even some websites that are not strictly SPAs, may show preload contents before later loading the main content dynamically. In this case, Reader may be capturing the preload content instead of the main content. To mitigate this issue, here are some possible solutions:

##### Specifying `x-timeout` 
When timeout is explicitly specified, Reader will not attempt to return early and will wait for network idle until the timeout is reached. This is useful when the target website will eventually come to a network idle. 

```bash
curl 'https://r.jina.ai/https://example.com/' -H 'x-timeout: 30'
```

##### Specifying `x-wait-for-selector` 
When wait-for-selector is explicitly specified, Reader will wait for the appearance of the specified CSS selector until timeout is reached. This is useful when you know exactly what element to wait for. 

```bash
curl 'https://r.jina.ai/https://example.com/' -H 'x-wait-for-selector: #content'
```

### Streaming mode

Streaming mode is useful when you find that the standard mode provides an incomplete result. This is because, in streaming mode, the Reader will wait a bit longer until the page is *stably* rendered. Use the accept-header to toggle the streaming mode:

```bash
curl -H "Accept: text/event-stream" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page
```

The data comes in a stream; each subsequent chunk contains more complete information. **The last chunk should provide the most complete and final result.** If you come from LLMs, please note that it is a different behavior than the LLMs' text-generation streaming.

For example, compare these two curl commands below. You can see streaming one gives you complete information at last, whereas standard mode does not. This is because the content loading on this particular site is triggered by some js *after* the page is fully loaded, and standard mode returns the page "too soon".
```bash
curl -H 'x-no-cache: true' https://r.jina.ai/https://access.redhat.com/security/cve/CVE-2023-45853
curl -H "Accept: text/event-stream" -H 'x-no-cache: true' https://r.jina.ai/https://access.redhat.com/security/cve/CVE-2023-45853
```

> Note: `-H 'x-no-cache: true'` is used only for demonstration purposes to bypass the cache.

Streaming mode is also useful if your downstream LLM/agent system requires immediate content delivery or needs to process data in chunks to interleave I/O and LLM processing times. This allows for quicker access and more efficient data handling:

```text
Reader API:  streamContent1 ----> streamContent2 ----> streamContent3 ---> ... 
                          |                    |                     |
                          v                    |                     |
Your LLM:                 LLM(streamContent1)  |                     |
                                               v                     |
                                               LLM(streamContent2)   |
                                                                     v
                                                                     LLM(streamContent3)
```

Note that in terms of completeness: `... > streamContent3 > streamContent2 > streamContent1`, each subsequent chunk contains more complete information.

### JSON mode

This is still very early and the result is not really a "useful" JSON. It contains three fields `url`, `title` and `content` only. Nonetheless, you can use accept-header to control the output format:
```bash
curl -H "Accept: application/json" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page
```

JSON mode is probably more useful in `s.jina.ai` than `r.jina.ai`. For `s.jina.ai` with JSON mode, it returns 5 results in a list, each in the structure of `{'title', 'content', 'url'}`.

### Generated alt

All images in that page that lack `alt` tag can be auto-captioned by a VLM (vision language model) and formatted as `![Image [idx]: [VLM_caption]](img_URL)`. This should give your downstream text-only LLM *just enough* hints to include those images into reasoning, selecting, and summarization. Use the `x-with-generated-alt` header to enable this feature:

```bash
curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page
```

## How it works
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/jina-ai/reader)

## What is `thinapps-shared` submodule?

You might notice a reference to `thinapps-shared` submodule, an internal package we use to share code across our products. While it’s not open-sourced and isn't integral to the Reader's functions, it mainly helps with decorators, logging, secrets management, etc. Feel free to ignore it for now.

That said, this is *the single codebase* behind `https://r.jina.ai`, so every time we commit here, we will deploy the new version to `https://r.jina.ai`.

## Having trouble on some websites?
Please raise an issue with the URL you are having trouble with. We will look into it and try to fix it.

## License
Reader is backed by [Jina AI](https://jina.ai) and licensed under [Apache-2.0](./LICENSE).


================================================
FILE: integrity-check.cjs
================================================
#!/usr/bin/env node

const fs = require('fs');
const path = require('path');

// Build-time guard: the licensed GeoLite2 database must be present next to
// this script before the build is allowed to proceed.
const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
const databasePresent = fs.existsSync(file);

if (databasePresent === false) {
    // Report the resolved path and abort with a non-zero exit code.
    console.error(`Integrity check failed: ${file} does not exist.`);
    process.exit(1);
}


================================================
FILE: package.json
================================================
{
  "name": "reader",
  "scripts": {
    "lint": "eslint --ext .js,.ts .",
    "build": "node ./integrity-check.cjs && tsc -p .",
    "build:watch": "tsc --watch",
    "build:clean": "rm -rf ./build",
    "serve": "npm run build && npm run start",
    "debug": "npm run build && npm run dev",
    "start": "node ./build/stand-alone/crawl.js",
    "dry-run": "NODE_ENV=dry-run node ./build/stand-alone/search.js"
  },
  "engines": {
    "node": ">=18"
  },
  "main": "build/index.js",
  "dependencies": {
    "@esm2cjs/normalize-url": "^8.0.0",
    "@google-cloud/translate": "^8.2.0",
    "@koa/bodyparser": "^5.1.1",
    "@mozilla/readability": "^0.6.0",
    "@napi-rs/canvas": "^0.1.68",
    "@types/turndown": "^5.0.4",
    "@xmldom/xmldom": "^0.9.3",
    "archiver": "^6.0.1",
    "axios": "^1.3.3",
    "bcrypt": "^5.1.0",
    "busboy": "^1.6.0",
    "civkit": "^0.9.0-2570394",
    "cors": "^2.8.5",
    "dayjs": "^1.11.9",
    "express": "^4.19.2",
    "firebase-admin": "^12.1.0",
    "firebase-functions": "^6.1.1",
    "htmlparser2": "^9.0.0",
    "jose": "^5.1.0",
    "koa": "^2.16.0",
    "koa-compress": "^5.1.1",
    "langdetect": "^0.2.1",
    "linkedom": "^0.18.4",
    "lru-cache": "^11.0.2",
    "maxmind": "^4.3.18",
    "minio": "^7.1.3",
    "node-libcurl": "^4.1.0",
    "openai": "^4.20.0",
    "pdfjs-dist": "^4.10.38",
    "puppeteer": "^23.3.0",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-block-resources": "^2.4.3",
    "robots-parser": "^3.0.1",
    "set-cookie-parser": "^2.6.0",
    "simple-zstd": "^1.4.2",
    "stripe": "^11.11.0",
    "svg2png-wasm": "^1.4.1",
    "tiktoken": "^1.0.16",
    "tld-extract": "^2.1.0",
    "turndown": "^7.1.3",
    "turndown-plugin-gfm": "^1.0.2",
    "undici": "^7.8.0"
  },
  "devDependencies": {
    "@types/archiver": "^5.3.4",
    "@types/bcrypt": "^5.0.0",
    "@types/busboy": "^1.5.4",
    "@types/cors": "^2.8.17",
    "@types/koa": "^2.15.0",
    "@types/koa-compress": "^4.0.6",
    "@types/node": "^20.14.13",
    "@types/set-cookie-parser": "^2.4.7",
    "@types/xmldom": "^0.1.34",
    "@typescript-eslint/eslint-plugin": "^5.12.0",
    "@typescript-eslint/parser": "^5.12.0",
    "eslint": "^8.9.0",
    "eslint-config-google": "^0.14.0",
    "eslint-plugin-import": "^2.25.4",
    "firebase-functions-test": "^3.0.0",
    "pino-pretty": "^13.0.0",
    "replicate": "^0.16.1",
    "typescript": "^5.5.4"
  },
  "private": true,
  "exports": {
    ".": "./build/index.js"
  }
}


================================================
FILE: public/robots.txt
================================================
User-Agent: *
Disallow: /


================================================
FILE: src/api/crawler.ts
================================================
import { singleton } from 'tsyringe';
import { pathToFileURL } from 'url';
import { randomUUID } from 'crypto';
import _ from 'lodash';

import {
    assignTransferProtocolMeta, RPCHost, RPCReflection,
    AssertionFailureError, ParamValidationError,
    RawString,
    ApplicationError,
    DataStreamBrokenError,
    assignMeta,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import { Defer } from 'civkit/defer';
import { retryWith } from 'civkit/decorators';
import { FancyFile } from 'civkit/fancy-file';

import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';

import { Crawled } from '../db/crawled';
import { DomainBlockade } from '../db/domain-blockade';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';

import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
import { CurlControl } from '../services/curl';
import { LmControl } from '../services/lm';
import { tryDecodeURIComponent } from '../utils/misc';
import { CFBrowserRendering } from '../services/cf-browser-rendering';

import { GlobalLogger } from '../services/logger';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import { AsyncLocalContext } from '../services/async-context';
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
import {
    BudgetExceededError, InsufficientBalanceError,
    SecurityCompromiseError, ServiceBadApproachError, ServiceBadAttemptError,
    ServiceNodeResourceDrainError
} from '../services/errors';

import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { ProxyProviderService } from '../shared/services/proxy-provider';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { RobotsTxtService } from '../services/robots-text';
import { TempFileManager } from '../services/temp-file';
import { MiscService } from '../services/misc';
import { HTTPServiceError } from 'civkit/http';
import { GeoIPService } from '../services/geoip';

/**
 * Scrapping options extended with crawler-level switches.
 * Every field is optional; all of them are layered on top of the base
 * `ScrappingOptions` from the puppeteer service.
 */
export interface ExtraScrappingOptions extends ScrappingOptions {
    // Boolean switch or the literal 'quoted'; presumably controls how iframe content is included — TODO confirm.
    withIframe?: boolean | 'quoted';
    // Presumably enables traversal of shadow DOM content — TODO confirm against the formatter.
    withShadowDom?: boolean;
    // One or more selectors; reported back in the "No content available" error when nothing matches.
    targetSelector?: string | string[];
    // One or more selectors; presumably elements to strip before formatting — TODO confirm.
    removeSelector?: string | string[];
    // Presumably keeps `data:` image URLs in the output instead of dropping them — TODO confirm.
    keepImgDataUrl?: boolean;
    // Name of the crawling engine to use (see ENGINE_TYPE); free-form string here.
    engine?: string;
    // Presumably a proxy allocation hint — TODO confirm against the proxy provider.
    allocProxy?: string;
    // When truthy, the 'crawled' listener in CrawlerHost skips caching the snapshot.
    private?: boolean;
    // Presumably a country code hint used for proxy/geo selection — TODO confirm.
    countryHint?: string;
}

/**
 * Prototype for the service index object. Its `toString` renders every own
 * enumerable entry as a `[Label] value` line (the label is the key converted
 * to lower-case words with the first letter capitalised); an entry with an
 * empty key renders as a blank separator line. The result ends with a newline.
 */
const indexProto = {
    toString(): string {
        const lines: string[] = [];
        for (const [k, v] of _.toPairs(this)) {
            lines.push(k ? `[${_.upperFirst(_.lowerCase(k))}] ${v}` : '');
        }

        return lines.join('\n') + '\n';
    }
};

@singleton()
export class CrawlerHost extends RPCHost {
    // Child logger tagged with this class's name.
    logger = this.globalLogger.child({ service: this.constructor.name });

    // All durations below are in milliseconds.
    // 7 days. Presumably how long cache records are retained — TODO confirm at the usage site.
    cacheRetentionMs = 1000 * 3600 * 24 * 7;
    // 1 hour. Presumably the default cache tolerance — TODO confirm at the usage site.
    cacheValidMs = 1000 * 3600;
    // 4 hours. Passed to formatSnapshot() by crawl().
    urlValidMs = 1000 * 3600 * 4;
    // 1 hour. Lifetime of a DomainBlockade created by the 'abuse' listener.
    abuseBlockMs = 1000 * 3600;
    // 30 days. Presumably retention of domain profiles — TODO confirm at the usage site.
    domainProfileRetentionMs = 1000 * 3600 * 24 * 30;

    // Cache records waiting to be written; drained periodically by the timer set up in the constructor.
    batchedCaches: Crawled[] = [];

    /**
     * Wires up the crawler host:
     *  - caches eligible snapshots whenever puppeteer reports a `crawled` page,
     *  - records a temporary domain blockade whenever puppeteer reports `abuse`,
     *  - periodically flushes `batchedCaches` to the Crawled collection in one batch.
     */
    constructor(
        protected globalLogger: GlobalLogger,
        protected puppeteerControl: PuppeteerControl,
        protected curlControl: CurlControl,
        protected cfBrowserRendering: CFBrowserRendering,
        protected proxyProvider: ProxyProviderService,
        protected lmControl: LmControl,
        protected jsdomControl: JSDomControl,
        protected snapshotFormatter: SnapshotFormatter,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
        protected rateLimitControl: RateLimitControl,
        protected threadLocal: AsyncLocalContext,
        protected robotsTxtService: RobotsTxtService,
        protected tempFileManager: TempFileManager,
        protected geoIpService: GeoIPService,
        protected miscService: MiscService,
    ) {
        super(...arguments);

        puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ExtraScrappingOptions & { url: URL; }) => {
            // Nothing worth caching: neither a title nor any PDF.
            const isEmpty = !snapshot.title?.trim() && !snapshot.pdfs?.length;
            // Potential privacy issue: cookies were sent or the request was marked private.
            const isPersonalized = Boolean(options.cookies?.length || options.private);
            // Potentially mangled content: scripts were injected or a custom viewport was used.
            const isCustomized = Boolean(
                options.injectFrameScripts?.length || options.injectPageScripts?.length || options.viewport
            );
            // Intermediate snapshot, or one that never reached mutation idle (presumably too short a timeout).
            const isUnsettled = Boolean(snapshot.isIntermediate) || !snapshot.lastMutationIdle;

            if (isEmpty || isPersonalized || isCustomized || isUnsettled) {
                return;
            }

            if (options.locale) {
                Reflect.set(snapshot, 'locale', options.locale);
            }

            const { tokens } = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
            if (tokens < 200) {
                // Thin pages are cached only when they are a genuine 200 without challenge markers.
                const hasChallengeMarker = ['captcha', 'cf-turnstile'].some((marker) => snapshot.html.includes(marker));
                if (snapshot.status !== 200 || hasChallengeMarker) {
                    return;
                }
            }

            await this.setToCache(options.url, snapshot);
        });

        puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
            this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });

            const blockade = DomainBlockade.from({
                domain: abuseEvent.url.hostname.toLowerCase(),
                triggerReason: `${abuseEvent.reason}`,
                triggerUrl: abuseEvent.url.toString(),
                createdAt: new Date(),
                expireAt: new Date(Date.now() + this.abuseBlockMs),
            });

            // Best effort: a failed save is logged and otherwise ignored.
            await DomainBlockade.save(blockade).catch((err) => {
                this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) });
            });
        });

        // Roughly every 10 seconds, plus up to one second of random jitter.
        const flushIntervalMs = 1000 * 10 + Math.round(1000 * Math.random());
        const flushTimer = setInterval(() => {
            // Swap the queue out first so entries added during the commit go to the next flush.
            const thisBatch = this.batchedCaches;
            this.batchedCaches = [];
            if (thisBatch.length === 0) {
                return;
            }

            const batch = Crawled.DB.batch();
            thisBatch.forEach((x) => {
                batch.set(Crawled.COLLECTION.doc(x._id), x.degradeForFireStore(), { merge: true });
            });

            batch.commit()
                .then(() => {
                    this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
                })
                .catch((err) => {
                    this.logger.warn(`Failed to save cache in batch`, { err });
                });
        }, flushIntervalMs);
        // Do not let the flush timer keep the process alive.
        flushTimer.unref();
    }

    /**
     * Waits for dependencies, then makes curl impersonate Chrome with the
     * browser's effective user agent (when one is known) and signals readiness.
     */
    override async init() {
        await this.dependencyReady();

        const browserUA = this.puppeteerControl.effectiveUA;
        if (browserUA) {
            this.curlControl.impersonateChrome(browserUA);
        }

        this.emit('ready');
    }

    /**
     * Builds the service index: usage hints plus, for an authenticated caller,
     * a blank separator entry followed by identity and remaining balance.
     *
     * @param auth Optional auth DTO; its UID is resolved before the user is read.
     * @returns An object whose prototype is `indexProto`, so it stringifies to plain text.
     */
    async getIndex(auth?: JinaEmbeddingsAuthDTO) {
        const indexObject: Record<string, string | number | undefined> = Object.assign(
            Object.create(indexProto),
            {
                usage1: 'https://r.jina.ai/YOUR_URL',
                usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
                homepage: 'https://jina.ai/reader',
            }
        );

        await auth?.solveUID();
        if (!auth || !auth.user) {
            return indexObject;
        }

        // The empty key renders as a blank line between usage hints and account info.
        indexObject[''] = undefined;
        indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
        indexObject.balanceLeft = auth.user.wallet.total_balance;

        return indexObject;
    }

    /**
     * HTTP `GET /` handler: returns the service index as a JSON object when the
     * client accepts JSON but not plain text, and as plain text otherwise.
     */
    @Method({
        name: 'getIndex',
        description: 'Index of the service',
        proto: {
            http: {
                action: 'get',
                path: '/',
            }
        },
        tags: ['misc', 'crawl'],
        returnType: [String, Object],
    })
    async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) auth?: JinaEmbeddingsAuthDTO) {
        const indexObject = await this.getIndex(auth);

        // Evaluated lazily so the JSON types are only probed when plain text is not accepted.
        const acceptsJson = () => Boolean(ctx.accepts('text/json') || ctx.accepts('application/json'));

        if (ctx.accepts('text/plain') || !acceptsJson()) {
            return assignTransferProtocolMeta(`${indexObject}`,
                { contentType: 'text/plain; charset=utf-8', envelope: null }
            );
        }

        return indexObject;
    }


    /**
     * Main crawl endpoint, exposed both as `POST /` and as `GET|POST /<url>`.
     *
     * Flow:
     *  1. Resolve the caller (UID), pick the option set (header-only for GET,
     *     params-allowed otherwise) and assert the tier policy.
     *  2. Resolve and validate the target URL.
     *  3. Apply rate limiting: UID based for authenticated callers (which also
     *     requires a positive wallet balance), IP based otherwise. A `finally`
     *     hook records the charge unless it exceeds the requested token budget.
     *  4. For anonymous callers, force `proxy = 'none'` and reject domains with
     *     an unexpired blockade.
     *  5. Respond according to the Accept header:
     *     - `text/event-stream` (and not `text/plain`): every formatted snapshot is
     *       written to an SSE stream;
     *     - JSON (and not `text/plain`): the first acceptable formatted snapshot,
     *       or the last one seen, is returned as an object;
     *     - otherwise: the text representation, or a 302 redirect to the
     *       screenshot/pageshot URL when that is what was requested.
     *
     * Throws InsufficientBalanceError, SecurityCompromiseError, ParamValidationError
     * or AssertionFailureError as raised below; other errors may come from the helpers called.
     */
    @Method({
        name: 'crawlByPostingToIndex',
        description: 'Crawl any url into markdown',
        proto: {
            http: {
                action: 'POST',
                path: '/',
            }
        },
        tags: ['crawl'],
        returnType: [String, OutputServerEventStream],
    })
    @Method({
        description: 'Crawl any url into markdown',
        proto: {
            http: {
                action: ['GET', 'POST'],
                path: '::url',
            }
        },
        tags: ['crawl'],
        returnType: [String, OutputServerEventStream, RawString],
    })
    async crawl(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: Context,
        auth: JinaEmbeddingsAuthDTO,
        crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
        crawlerOptionsParamsAllowed: CrawlerOptions,
    ) {
        const uid = await auth.solveUID();
        // Updated as snapshots are formatted; read later by the `finally` hooks below.
        let chargeAmount = 0;
        // GET requests may only configure the crawl through headers.
        const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
        const tierPolicy = await this.saasAssertTierPolicy(crawlerOptions, auth);

        // Use koa ctx.URL, a standard URL object to avoid node.js framework prop naming confusion
        const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(`${ctx.URL.pathname}${ctx.URL.search}`), crawlerOptions);
        if (!targetUrl) {
            return await this.getIndex(auth);
        }

        // Prevent circular crawling
        // NOTE(review): the host is registered only after getTargetUrl() has already
        // consulted circuitBreakerHosts for this request — confirm this ordering is intended.
        this.puppeteerControl.circuitBreakerHosts.add(
            ctx.hostname.toLowerCase()
        );

        if (uid) {
            const user = await auth.assertUser();
            // Written as a negated `> 0` so that a non-numeric balance is rejected as well.
            if (!(user.wallet.total_balance > 0)) {
                throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
            }

            // Explicit per-account limits win; otherwise fall back to a default based on speed_level.
            const rateLimitPolicy = auth.getRateLimits('CRAWL') || [
                parseInt(user.metadata?.speed_level) >= 2 ?
                    RateLimitDesc.from({
                        occurrence: 5000,
                        periodSeconds: 60
                    }) :
                    RateLimitDesc.from({
                        occurrence: 500,
                        periodSeconds: 60
                    })
            ];

            const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(
                rpcReflect, uid, ['CRAWL'],
                ...rateLimitPolicy
            );

            rpcReflect.finally(() => {
                // Nothing is charged when the result exceeds the caller's token budget.
                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                    return;
                }
                if (chargeAmount) {
                    // Usage reporting is best effort; a failure is only logged.
                    auth.reportUsage(chargeAmount, `reader-crawl`).catch((err) => {
                        this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                    });
                    apiRoll.chargeAmount = chargeAmount;
                }
            });
        } else if (ctx.ip) {
            const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, ['CRAWL'],
                [
                    // 20 requests per minute
                    new Date(Date.now() - 60 * 1000), 20
                ]
            );

            rpcReflect.finally(() => {
                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                    return;
                }
                apiRoll.chargeAmount = chargeAmount;
            });
        }

        if (!uid) {
            // Enforce no proxy is allocated for anonymous users due to abuse.
            crawlerOptions.proxy = 'none';
            // Look for an unexpired blockade on the target hostname.
            const blockade = (await DomainBlockade.fromFirestoreQuery(
                DomainBlockade.COLLECTION
                    .where('domain', '==', targetUrl.hostname.toLowerCase())
                    .where('expireAt', '>=', new Date())
                    .limit(1)
            ))[0];
            if (blockade) {
                throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
            }
        }
        const crawlOpts = await this.configure(crawlerOptions);
        this.logger.info(`Accepting request from ${uid || ctx.ip}`, { opts: crawlerOptions });
        if (crawlerOptions.robotsTxt) {
            await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
        }
        // The client may already have gone away while the checks above were running.
        if (rpcReflect.signal.aborted) {
            return;
        }
        // --- Mode 1: server-sent events. Every formatted snapshot is pushed to the stream. ---
        if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
            const sseStream = new OutputServerEventStream();
            rpcReflect.return(sseStream);

            try {
                for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
                    if (!scrapped) {
                        continue;
                    }
                    if (rpcReflect.signal.aborted) {
                        break;
                    }

                    const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
                    chargeAmount = this.assignChargeAmount(formatted, tierPolicy);
                    sseStream.write({
                        event: 'data',
                        data: formatted,
                    });
                    // Stop once a snapshot carrying PDFs has produced a chargeable result.
                    if (chargeAmount && scrapped.pdfs?.length) {
                        break;
                    }
                }
            } catch (err: any) {
                // Errors are delivered in-band as an 'error' event rather than thrown.
                this.logger.error(`Failed to crawl ${targetUrl}`, { err: marshalErrorLike(err) });
                sseStream.write({
                    event: 'error',
                    data: marshalErrorLike(err),
                });
            }

            sseStream.end();

            return sseStream;
        }

        // Most recent value yielded by iterSnapshots; used as a fallback when no snapshot qualified for early return.
        // NOTE(review): it is assigned before the `!scrapped` check, so a falsy yield would overwrite an earlier snapshot — confirm.
        let lastScrapped;
        // --- Mode 2: JSON. Return the first acceptable formatted snapshot as an object. ---
        if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
            try {
                for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
                    lastScrapped = scrapped;
                    if (rpcReflect.signal.aborted) {
                        break;
                    }
                    if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
                        continue;
                    }
                    if (!scrapped.title) {
                        continue;
                    }

                    const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
                    chargeAmount = this.assignChargeAmount(formatted, tierPolicy);

                    // A snapshot with PDFs but no chargeable content yet: keep waiting for a later one.
                    if (scrapped?.pdfs?.length && !chargeAmount) {
                        continue;
                    }

                    return formatted;
                }
            } catch (err) {
                // An error is tolerated when at least one snapshot was obtained; the fallback below uses it.
                if (!lastScrapped) {
                    throw err;
                }
            }

            if (!lastScrapped) {
                if (crawlOpts.targetSelector) {
                    throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
                }
                throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
            }

            const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
            chargeAmount = this.assignChargeAmount(formatted, tierPolicy);

            return formatted;
        }

        // --- Mode 3: plain text. Compound formats cannot be expressed here. ---
        if (crawlerOptions.isRequestingCompoundContentFormat()) {
            throw new ParamValidationError({
                path: 'respondWith',
                message: `You are requesting compound content format, please explicitly accept 'text/event-stream' or 'application/json' in header.`
            });
        }

        try {
            for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
                lastScrapped = scrapped;
                if (rpcReflect.signal.aborted) {
                    break;
                }
                if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
                    continue;
                }
                if (!scrapped.title) {
                    continue;
                }

                const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
                chargeAmount = this.assignChargeAmount(formatted, tierPolicy);

                // Screenshot/pageshot requests are answered with a redirect to the stored image URL.
                if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
                    return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
                        { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
                    );
                }
                if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
                    return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
                        { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
                    );
                }

                return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
            }
        } catch (err) {
            // Same tolerance as in JSON mode: fall back to the last snapshot when there is one.
            if (!lastScrapped) {
                throw err;
            }
        }

        if (!lastScrapped) {
            if (crawlOpts.targetSelector) {
                throw new AssertionFailureError(`No content available for URL ${targetUrl} with target selector ${Array.isArray(crawlOpts.targetSelector) ? crawlOpts.targetSelector.join(', ') : crawlOpts.targetSelector}`);
            }
            throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
        }
        // Fallback: format whatever was seen last, with the same redirect handling as above.
        const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
        chargeAmount = this.assignChargeAmount(formatted, tierPolicy);

        if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {

            return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
                { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
            );
        }
        if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {

            return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
                { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
            );
        }

        return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });

    }

    /**
     * Determines and validates the URL to crawl.
     *
     * Source precedence:
     *  1. an uploaded PDF (`crawlerOptions.pdf`) — a synthetic `blob://pdf/<id>` URL is
     *     generated (sha256 of the file for a FancyFile, a random UUID otherwise) and
     *     also stored in `crawlerOptions.url` when that is not already set;
     *  2. the request path with its leading character stripped;
     *  3. `crawlerOptions.url`.
     *
     * Side effect: stores the IPs returned by URL normalization in `crawlerOptions._hintIps`.
     *
     * @param originPath Decoded request path plus query string, including the leading slash.
     * @param crawlerOptions Options of the current request; mutated as described above.
     * @returns The normalized URL produced by `miscService.assertNormalizedUrl`.
     * @throws ParamValidationError when no URL could be determined.
     * @throws SecurityCompromiseError when the target hostname is a circuit-breaker host
     *         (i.e. the service would be crawling itself).
     */
    async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
        let url: string = '';

        const targetUrlFromGet = originPath.slice(1);
        if (crawlerOptions.pdf) {
            const pdfFile = crawlerOptions.pdf;
            const identifier = pdfFile instanceof FancyFile ? (await pdfFile.sha256Sum) : randomUUID();
            url = `blob://pdf/${identifier}`;
            crawlerOptions.url ??= url;
        } else if (targetUrlFromGet) {
            url = targetUrlFromGet.trim();
        } else if (crawlerOptions.url) {
            url = crawlerOptions.url.trim();
        }

        if (!url) {
            throw new ParamValidationError({
                message: 'No URL provided',
                path: 'url'
            });
        }

        const { url: safeURL, ips } = await this.miscService.assertNormalizedUrl(url);
        if (this.puppeteerControl.circuitBreakerHosts.has(safeURL.hostname.toLowerCase())) {
            // Report the offending hostname; the check is on the hostname, not the protocol.
            throw new SecurityCompromiseError({
                message: `Circular hostname: ${safeURL.hostname}`,
                path: 'url'
            });
        }
        crawlerOptions._hintIps = ips;

        return safeURL;
    }

    /**
     * Compute the cache key digest for a URL.
     *
     * The URL is copied, its fragment is dropped unless it starts with `#/`,
     * and the whole string is lower-cased before being hashed with `md5Hasher`.
     */
    getUrlDigest(urlToCrawl: URL) {
        const canonical = new URL(urlToCrawl);
        const keepFragment = canonical.hash.startsWith('#/');
        if (!keepFragment) {
            canonical.hash = '';
        }

        return md5Hasher.hash(canonical.toString().toLowerCase());
    }

    /**
     * Look up the most recent cache record for a URL, as a two-phase async generator.
     *
     * Phase 1 yields the raw cache record (or `undefined` when nothing was found or the
     * query failed). If there is no record the generator ends there.
     * Phase 2 downloads the stored snapshot JSON, signs download URLs for the screenshot /
     * pageshot when the record says they are available, and yields the record merged with
     * `isFresh` and the reconstructed `snapshot`. If any of those preparations fails the
     * generator ends without a second yield.
     *
     * @param urlToCrawl URL whose digest (see `getUrlDigest`) is used as the lookup key.
     * @param cacheTolerance maximum accepted age in milliseconds; older records are reported as stale.
     */
    async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
        const digest = this.getUrlDigest(urlToCrawl);

        // Newest record for this digest; a failing query is logged and treated as a miss.
        const cache = (
            await
                (Crawled.fromFirestoreQuery(
                    Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)
                ).catch((err) => {
                    this.logger.warn(`Failed to query cache, unknown issue`, { err });
                    // https://github.com/grpc/grpc-node/issues/2647
                    // https://github.com/googleapis/nodejs-firestore/issues/1023

                    return undefined;
                }))
        )?.[0];

        // Phase 1: hand the bare record to the consumer before doing any storage I/O.
        yield cache;

        if (!cache) {
            return;
        }

        const age = Date.now() - cache.createdAt.valueOf();
        const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
        this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
            url: urlToCrawl, digest, age, stale, cacheTolerance
        });

        // Filled in by the promises below; they run concurrently.
        let snapshot: PageSnapshot | undefined;
        let screenshotUrl: string | undefined;
        let pageshotUrl: string | undefined;
        const preparations = [
            this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
                snapshot = JSON.parse(r.toString('utf-8'));
            }),
            cache.screenshotAvailable ?
                this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
                    screenshotUrl = r;
                }) :
                Promise.resolve(undefined),
            cache.pageshotAvailable ?
                this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
                    pageshotUrl = r;
                }) :
                Promise.resolve(undefined)
        ];
        try {
            await Promise.all(preparations);
        } catch (_err) {
            // Swallow cache errors.
            return undefined;
        }

        // Phase 2: the record plus its snapshot. Binary screenshot/pageshot fields are blanked
        // and replaced by the signed URLs (which stay undefined when not available).
        yield {
            isFresh: !stale,
            ...cache,
            snapshot: {
                ...snapshot,
                screenshot: undefined,
                pageshot: undefined,
                screenshotUrl,
                pageshotUrl,
            } as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
        };
    }

    /**
     * Persist a snapshot to object storage and queue its cache record.
     *
     * The snapshot JSON (without the binary `screenshot` / `pageshot` fields) is stored under
     * `snapshots/<id>`; the screenshot and pageshot, when present, go to `screenshots/<id>` and
     * `pageshots/<id>`. Each successful upload flips the matching `*Available` flag on the record.
     *
     * All uploads are started together and awaited with `Promise.all`, so a failure in one of
     * them can no longer leave another upload promise rejecting unobserved.
     *
     * @returns the `Crawled` record, after it has been pushed onto `batchedCaches`.
     * @throws whatever the first failing upload rejects with; the record is not queued in that case.
     */
    async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
        const digest = this.getUrlDigest(urlToCrawl);

        this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });
        const nowDate = new Date();

        const cache = Crawled.from({
            _id: randomUUID(),
            url: urlToCrawl.toString(),
            createdAt: nowDate,
            expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
            htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs,
            urlPathDigest: digest,
        });

        const uploads: Promise<unknown>[] = [
            this.firebaseObjectStorage.saveFile(`snapshots/${cache._id}`,
                Buffer.from(
                    JSON.stringify({
                        ...snapshot,
                        // Binary payloads are stored as separate objects below.
                        screenshot: undefined,
                        pageshot: undefined,
                    }),
                    'utf-8'
                ),
                {
                    metadata: {
                        contentType: 'application/json',
                    }
                }
            ).then(() => {
                cache.snapshotAvailable = true;
            }),
        ];

        if (snapshot.screenshot) {
            uploads.push(
                this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, {
                    metadata: {
                        contentType: 'image/png',
                    }
                }).then(() => {
                    cache.screenshotAvailable = true;
                })
            );
        }
        if (snapshot.pageshot) {
            uploads.push(
                this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
                    metadata: {
                        contentType: 'image/png',
                    }
                }).then(() => {
                    cache.pageshotAvailable = true;
                })
            );
        }

        // Every upload promise is observed here, so none can reject unhandled.
        await Promise.all(uploads);
        this.batchedCaches.push(cache);
        // const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
        //     this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });

        //     return undefined;
        // });

        return cache;
    }

    /**
     * Stream snapshots for a URL.
     *
     * Unless the requested format includes ReaderLM this simply delegates to `cachedScrap`.
     * For ReaderLM, a final snapshot is fetched as plain HTML first and then piped through
     * the language model: the instruction / JSON-schema variant when either is supplied,
     * the markdown variant otherwise.
     *
     * @throws AssertionFailureError when the final snapshot carries no HTML.
     * @throws ServiceNodeResourceDrainError when the markdown model call fails with HTTP 429.
     */
    async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
        // if (crawlerOpts?.respondWith.includes(CONTENT_FORMAT.VLM)) {
        //     const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
        //         ...crawlOpts, engine: ENGINE_TYPE.BROWSER
        //     }, crawlerOpts);

        //     yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);

        //     return;
        // }

        // Common case first: no language model involved.
        if (!crawlerOpts?.respondWith.includes(CONTENT_FORMAT.READER_LM)) {
            yield* this.cachedScrap(urlToCrawl, crawlOpts, crawlerOpts);

            return;
        }

        // ReaderLM consumes HTML, so fetch the page as 'html' regardless of the requested format.
        const htmlCrawlerOpts = CrawlerOptions.from({
            ...crawlerOpts,
            respondWith: 'html',
        });
        const htmlSnapshot = await this.getFinalSnapshot(
            urlToCrawl,
            {
                ...crawlOpts,
                engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
            },
            htmlCrawlerOpts
        );

        if (!htmlSnapshot?.html) {
            throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
        }

        if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
            const schemaText = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
            yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, schemaText, htmlSnapshot);

            return;
        }

        try {
            yield* this.lmControl.readerLMMarkdownFromSnapshot(htmlSnapshot);
        } catch (err) {
            const rateLimited = err instanceof HTTPServiceError && err.status === 429;
            if (!rateLimited) {
                throw err;
            }
            throw new ServiceNodeResourceDrainError(`Reader LM is at capacity, please try again later.`);
        }
    }

    /**
     * Produce page snapshots for `urlToCrawl` as an async generator, picking a source in this order:
     *
     *  1. `crawlerOpts.html` — caller-supplied HTML wrapped into a snapshot.
     *  2. `crawlerOpts.pdf` — caller-supplied PDF wrapped into a stub HTML page that embeds it.
     *  3. Engine `curl` (or its deprecated alias `direct`) — a single side load, nothing else.
     *  4. Engine Cloudflare browser rendering.
     *  5. A fresh cache entry that satisfies the screenshot and locale requirements.
     *  6. A side load attempt (skipped for the browser engine and for URLs known to crash the
     *     browser), optionally retried through an allocated proxy. Its snapshots are only
     *     yielded when the request does not need a browser; otherwise the loaded data is
     *     handed to the browser through `crawlOpts.sideLoad`.
     *  7. The puppeteer browser. If it throws and a cache entry exists (even a stale one),
     *     the cached snapshot is yielded instead, except for `SecurityCompromiseError`.
     *
     * Note that this method mutates its option objects: `crawlerOpts.respondTiming`,
     * `crawlOpts.sideLoad` and `crawlOpts.proxyUrl` may be assigned along the way.
     */
    async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
        // 1. Caller already has the HTML.
        if (crawlerOpts?.html) {
            const snapshot = {
                href: urlToCrawl.toString(),
                html: crawlerOpts.html,
                title: '',
                text: '',
            } as PageSnapshot;
            yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

            return;
        }

        // 2. Caller supplied a PDF, either as a FancyFile or as a base64 string.
        if (crawlerOpts?.pdf) {
            const pdfFile = crawlerOpts.pdf instanceof FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64'));
            const pdfLocalPath = pathToFileURL((await pdfFile.filePath));
            const snapshot = {
                href: urlToCrawl.toString(),
                html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${crawlerOpts.url}"></body></html>`,
                title: '',
                text: '',
                pdfs: [pdfLocalPath.href],
            } as PageSnapshot;

            yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

            return;
        }

        // 3. Explicit curl engine: side load only, no cache and no browser.
        if (
            crawlOpts?.engine === ENGINE_TYPE.CURL ||
            // deprecated name
            crawlOpts?.engine === 'direct'
        ) {
            let sideLoaded;
            try {
                // Go through an allocated proxy only when one was requested and no explicit proxy URL was given.
                sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
                    await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
                    await this.curlControl.sideLoad(urlToCrawl, crawlOpts);

            } catch (err) {
                // With no other engine to fall back to, a bad attempt is surfaced as an assertion failure.
                if (err instanceof ServiceBadAttemptError) {
                    throw new AssertionFailureError(err.message);
                }
                throw err;
            }
            if (!sideLoaded?.file) {
                throw new AssertionFailureError(`Remote server did not return a body: ${urlToCrawl}`);
            }
            const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
            draftSnapshot.status = sideLoaded.status;
            draftSnapshot.statusText = sideLoaded.statusText;
            yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
            return;
        }
        // 4. Explicit Cloudflare browser rendering engine.
        if (crawlOpts?.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
            const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href);
            const snapshot = {
                href: urlToCrawl.toString(),
                html,
                title: '',
                text: '',
            } as PageSnapshot;
            yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
            return;
        }

        // 5. Cache lookup. queryCache is driven by hand: the first next() returns the bare record,
        // the second one (only requested when applicable) returns the record with its snapshot.
        const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
        const cacheIt = this.queryCache(urlToCrawl, cacheTolerance);

        let cache = (await cacheIt.next()).value;
        if (cache?.htmlSignificantlyModifiedByJs === false) {
            // The record says JS did not significantly change the HTML: default the respond
            // timing to HTML, unless the caller set a timeout or a timing of their own.
            if (crawlerOpts && crawlerOpts.timeout === undefined) {
                crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
            }
        }

        if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
            cache = (await cacheIt.next()).value;
        }
        // Finish the generator so it does no further work.
        cacheIt.return(undefined);

        // Serve from cache only when fresh, when screenshots are either not wanted or both
        // available, and when the cached locale equals the requested one.
        if (cache?.isFresh &&
            (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
            (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
        ) {
            if (cache.snapshot) {
                cache.snapshot.isFromCache = true;
            }
            yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);

            return;
        }

        // 6. Side load attempt before (or instead of) spinning up the browser.
        if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
            // Side-loaded snapshots may only be handed to the consumer when no browser is required
            // and the presumed respond timing is HTML or VISIBLE_CONTENT.
            const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() &&
                [RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming);
            try {
                // Work on a shallow copy so proxy choices made while side loading do not leak into crawlOpts.
                const altOpts = { ...crawlOpts };
                let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
                    await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
                    await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
                        this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });

                        // Application errors other than a bad attempt are final; anything else gets a retry through a proxy.
                        if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
                            return Promise.reject(err);
                        }

                        return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
                    });
                if (!sideLoaded.file) {
                    throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
                }
                const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
                    urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
                ).catch((err) => {
                    // Downgrade application errors to a bad attempt so the outer catch lets the browser try.
                    if (err instanceof ApplicationError) {
                        return Promise.reject(new ServiceBadAttemptError(err.message));
                    }
                    return Promise.reject(err);
                });
                draftSnapshot.status = sideLoaded.status;
                draftSnapshot.statusText = sideLoaded.statusText;
                // A successful non-HTML response is yielded as-is and ends the generator.
                if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
                    yield draftSnapshot;
                    return;
                }

                let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
                draftSnapshot.title ??= analyzed.title;
                draftSnapshot.isIntermediate = true;
                if (sideLoadSnapshotPermitted) {
                    yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
                }
                let fallbackProxyIsUsed = false;
                // Retry through an allocated proxy when proxies are not disabled, none is pinned,
                // and the direct result is nearly empty (< 42 tokens) or not a 200.
                if (
                    ((!crawlOpts?.allocProxy || crawlOpts.allocProxy !== 'none') && !crawlOpts?.proxyUrl) &&
                    (analyzed.tokens < 42 || sideLoaded.status !== 200)
                ) {
                    const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
                    if (!proxyLoaded.file) {
                        throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
                    }
                    const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(
                        urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName
                    ).catch((err) => {
                        if (err instanceof ApplicationError) {
                            return Promise.reject(new ServiceBadAttemptError(err.message));
                        }
                        return Promise.reject(err);
                    });
                    proxySnapshot.status = proxyLoaded.status;
                    proxySnapshot.statusText = proxyLoaded.statusText;
                    if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
                        // NOTE(review): this branch is empty; nothing happens here beyond evaluating the condition.
                    }
                    analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
                    // Adopt the proxied result when it succeeded or carries a reasonable amount of text.
                    if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
                        proxySnapshot.isIntermediate = true;
                        if (sideLoadSnapshotPermitted) {
                            yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
                        }
                        sideLoaded = proxyLoaded;
                        fallbackProxyIsUsed = true;
                    }
                }

                // Hand the side-loaded data over to the browser run below, unless the caller already provided some.
                if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
                    this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
                    crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
                    if (fallbackProxyIsUsed) {
                        this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
                    }
                }
            } catch (err: any) {
                this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
                // Bad attempts, broken data streams and non-application errors are tolerated:
                // the browser below gets its chance. Other application errors abort the crawl.
                if (err instanceof ApplicationError &&
                    !(err instanceof ServiceBadAttemptError) &&
                    !(err instanceof DataStreamBrokenError)
                ) {
                    throw err;
                }
            }
        } else if (crawlOpts?.allocProxy && crawlOpts.allocProxy !== 'none' && !crawlOpts.proxyUrl) {
            // No side load happened, so allocate the requested proxy for the browser directly.
            const proxyUrl = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(crawlOpts));
            crawlOpts.proxyUrl = proxyUrl.href;
        }

        // 7. Browser scraping.
        try {
            // Snapshots are passed through narrowSnapshot only when one of these options is set.
            if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
                for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
                    yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
                }

                return;
            }

            yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
        } catch (err: any) {
            // Any cache entry (it was not fresh enough to be served above) beats failing outright.
            if (cache && !(err instanceof SecurityCompromiseError)) {
                this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
                yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
                return;
            }
            throw err;
        }
    }

    /**
     * Estimate how many tokens to charge for a formatted page and record it on the page.
     *
     * The estimate adds up `content` (or `description` when there is no content), `text` and
     * `html`, plus a flat 765 for a screenshot. When a tier policy is given, the sum is passed
     * through `saasApplyTierPolicy`. The result is written to `formatted.usage.tokens` and to
     * the page's meta via `assignMeta`.
     *
     * @returns the final amount, or 0 when `formatted` is falsy.
     */
    assignChargeAmount(formatted: FormattedPage, saasTierPolicy?: Parameters<typeof this.saasApplyTierPolicy>[0]) {
        if (!formatted) {
            return 0;
        }

        // `description` only counts when there is no `content`.
        const headline = formatted.content || formatted.description;
        let tokens = headline ? estimateToken(headline) : 0;

        if (formatted.text) {
            tokens += estimateToken(formatted.text);
        }
        if (formatted.html) {
            tokens += estimateToken(formatted.html);
        }

        const hasScreenshot = Boolean(formatted.screenshotUrl || formatted.screenshot);
        if (hasScreenshot) {
            // OpenAI image token count for 1024x1024 image
            tokens += 765;
        }

        if (saasTierPolicy) {
            tokens = this.saasApplyTierPolicy(saasTierPolicy, tokens);
        }

        Object.assign(formatted, { usage: { tokens } });
        assignMeta(formatted, { usage: { tokens } });

        return tokens;
    }


    /**
     * Scrap several URLs concurrently and stream the combined progress.
     *
     * One `cachedScrap` iterator is started per URL. The generator repeatedly yields the SAME
     * `results` array (mutated in place; slot `i` holds the latest snapshot for `urls[i]`, or
     * `undefined`): once right away, then each time it is woken up by a new truthy snapshot,
     * and a final time after every iterator has concluded.
     *
     * A failure for one URL is logged and does not affect the others. When the consumer stops
     * iterating, or once everything is done, all per-URL iterators are asked to return.
     */
    async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
        const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));

        const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

        // Wake-up signal for the yield loop below; replaced with a fresh one after every use.
        let nextDeferred = Defer();
        let concluded = false;

        // Drains one iterator, keeping only its most recent snapshot.
        const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
            try {
                for await (const x of it) {
                    results[idx] = x;

                    if (x) {
                        nextDeferred.resolve();
                        nextDeferred = Defer();
                    }

                }
            } catch (err: any) {
                this.logger.warn(`Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) });
            }
        };

        // Not awaited on purpose: the handlers run in the background while this generator yields.
        Promise.allSettled(
            iterators.map((it, idx) => handler(it, idx))
        ).finally(() => {
            concluded = true;
            // Release the loop in case it is currently waiting.
            nextDeferred.resolve();
        });

        yield results;

        try {
            while (!concluded) {
                await nextDeferred.promise;

                yield results;
            }
            yield results;
        } finally {
            for (const x of iterators) {
                x.return();
            }
        }
    }

    /**
     * Translate request-level `CrawlerOptions` into scrapping options.
     *
     * Two things happen here:
     *  - a set of request options is stored in `threadLocal`;
     *  - an `ExtraScrappingOptions` object is assembled, validated and returned.
     *
     * Remote inject scripts (entries that parse as URLs) are fetched and replaced by their text.
     *
     * @throws ParamValidationError when a target selector fragment starts with `*` or `:`, or contains `*:`.
     */
    async configure(opts: CrawlerOptions) {
        const timeoutMs = opts.timeout ? opts.timeout * 1000 : undefined;
        const wantsProxiedResources = opts.proxy?.endsWith('+');

        const local = this.threadLocal;
        local.set('withGeneratedAlt', opts.withGeneratedAlt);
        local.set('withLinksSummary', opts.withLinksSummary);
        local.set('withImagesSummary', opts.withImagesSummary);
        local.set('keepImgDataUrl', opts.keepImgDataUrl);
        local.set('cacheTolerance', opts.cacheTolerance);
        local.set('withIframe', opts.withIframe);
        local.set('withShadowDom', opts.withShadowDom);
        local.set('userAgent', opts.userAgent);
        if (timeoutMs !== undefined) {
            local.set('timeout', timeoutMs);
        }
        local.set('retainImages', opts.retainImages);
        local.set('noGfm', opts.noGfm);
        local.set('DNT', Boolean(opts.doNotTrack));
        if (opts.markdown) {
            local.set('turndownOpts', opts.markdown);
        }

        const crawlOpts: ExtraScrappingOptions = {
            proxyUrl: opts.proxyUrl,
            cookies: opts.setCookies,
            favorScreenshot: ['screenshot', 'pageshot'].some((format) => opts.respondWith.includes(format)),
            removeSelector: opts.removeSelector,
            targetSelector: opts.targetSelector,
            waitForSelector: opts.waitForSelector,
            overrideUserAgent: opts.userAgent,
            timeoutMs,
            withIframe: opts.withIframe,
            withShadowDom: opts.withShadowDom,
            locale: opts.locale,
            referer: opts.referer,
            viewport: opts.viewport,
            engine: opts.engine,
            // A trailing '+' on the proxy spec is a flag, not part of the proxy name.
            allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
            proxyResources: Boolean(opts.proxyUrl || wantsProxiedResources),
            private: Boolean(opts.doNotTrack),
        };

        if (crawlOpts.targetSelector?.length) {
            // Normalize a lone selector string to a one-element list.
            if (typeof crawlOpts.targetSelector === 'string') {
                crawlOpts.targetSelector = [crawlOpts.targetSelector];
            }
            for (const selectorGroup of crawlOpts.targetSelector) {
                const offender = selectorGroup
                    .split(',')
                    .map((piece) => piece.trim())
                    .find((piece) => piece.startsWith('*') || piece.startsWith(':') || piece.includes('*:'));
                if (offender !== undefined) {
                    throw new ParamValidationError({
                        message: `Unacceptable selector: '${offender}'. We cannot accept match-all selector for performance reasons. Sorry.`,
                        path: 'targetSelector'
                    });
                }
            }
        }

        if (opts._hintIps?.length) {
            // Tally the countries of the hinted IPs and keep the most frequent one.
            const cities = await this.geoIpService.lookupCities(opts._hintIps);
            const tally: Record<string, number> = {};
            for (const city of cities) {
                const code = city.country?.code;
                if (code) {
                    tally[code] = (tally[code] || 0) + 1;
                }
            }
            const winner = _.maxBy(Object.entries(tally), (entry) => entry[1]);
            crawlOpts.countryHint = winner?.[0]?.toLowerCase();
        }

        if (opts.locale) {
            crawlOpts.extraHeaders ??= {};
            crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
        }

        if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) {
            crawlOpts.favorScreenshot = true;
        }

        if (opts.injectFrameScript?.length) {
            const frameScripts = await Promise.all(
                opts.injectFrameScript.map((entry) => (
                    URL.canParse(entry) ? fetch(entry).then((res) => res.text()) : entry
                ))
            );
            crawlOpts.injectFrameScripts = frameScripts.filter(Boolean);
        }

        if (opts.injectPageScript?.length) {
            const pageScripts = await Promise.all(
                opts.injectPageScript.map((entry) => (
                    URL.canParse(entry) ? fetch(entry).then((res) => res.text()) : entry
                ))
            );
            crawlOpts.injectPageScripts = pageScripts.filter(Boolean);
        }

        return crawlOpts;
    }

    /**
     * Turn a snapshot into the response payload for the requested format.
     *
     * For the ReaderLM and VLM formats the page is built right here from the snapshot's parsed
     * text content, which is also attached as a non-enumerable `textRepresentation`. Every
     * other format is delegated to `formatSnapshotWithPDFSideLoad`.
     *
     * The URL reported is the snapshot's own `href` when `crawlerOptions.base` is `'final'`,
     * and `nominalUrl` otherwise.
     */
    protected async formatSnapshot(
        crawlerOptions: CrawlerOptions,
        snapshot: PageSnapshot & {
            screenshotUrl?: string;
            pageshotUrl?: string;
        },
        nominalUrl?: URL,
        urlValidMs?: number,
        scrappingOptions?: ScrappingOptions
    ) {
        const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
        const respondWith = crawlerOptions.respondWith;

        const isLanguageModelFormat = respondWith === CONTENT_FORMAT.READER_LM || respondWith === CONTENT_FORMAT.VLM;
        if (!isLanguageModelFormat) {
            return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions);
        }

        const textContent = snapshot.parsed?.textContent;
        const output: FormattedPage = {
            title: snapshot.title,
            content: textContent,
            url: presumedURL?.href || snapshot.href,
        };

        // Non-enumerable so it does not show up when the page is serialized.
        Object.defineProperty(output, 'textRepresentation', {
            value: textContent,
            enumerable: false,
        });

        return output;
    }

    /**
     * Format a snapshot, first making sure a remote PDF (if any) is available as a local file.
     *
     * Works on a deep copy of the snapshot. When the first entry of `pdfs` starts with `http`,
     * it is replaced by a `file://` URL: taken from the side-load data in `scrappingOptions`
     * when that holds a 200 response with a body, otherwise downloaded through `curlControl`.
     *
     * @throws AssertionFailureError when the download fails with a bad attempt, returns a
     *         non-200 status, a content type without `application/pdf`, or no body.
     */
    async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapshot, nominalUrl?: URL, urlValidMs?: number, scrappingOptions?: ScrappingOptions) {
        const working = _.cloneDeep(snapshot);
        const render = () => this.snapshotFormatter.formatSnapshot(mode, working, nominalUrl, urlValidMs);

        if (!working.pdfs?.length) {
            return render();
        }

        const remotePdf = working.pdfs[0];
        if (!remotePdf.startsWith('http')) {
            return render();
        }

        // Prefer a copy that was already fetched during side loading.
        const preloaded = scrappingOptions?.sideLoad?.impersonate[remotePdf];
        if (preloaded?.status === 200 && preloaded.body) {
            working.pdfs[0] = pathToFileURL(await preloaded?.body.filePath).href;

            return render();
        }

        const downloaded = await this.curlControl.sideLoad(new URL(remotePdf), scrappingOptions).catch((err) => {
            const reason = err instanceof ServiceBadAttemptError ?
                new AssertionFailureError(`Failed to load PDF(${remotePdf}): ${err.message}`) :
                err;

            return Promise.reject(reason);
        });
        if (downloaded.status !== 200) {
            throw new AssertionFailureError(`Failed to load PDF(${remotePdf}): Server responded status ${downloaded.status}`);
        }
        if (!downloaded.contentType.includes('application/pdf')) {
            throw new AssertionFailureError(`Failed to load PDF(${remotePdf}): Server responded with wrong content type ${downloaded.contentType}`);
        }
        if (!downloaded.file) {
            throw new AssertionFailureError(`Failed to load PDF(${remotePdf}): Server did not return a body`);
        }
        working.pdfs[0] = pathToFileURL(await downloaded.file.filePath).href;

        return render();
    }

    /**
     * Run `cachedScrap` to completion and return the last snapshot it produced.
     *
     * An error raised mid-way is ignored as long as at least one truthy snapshot was received.
     *
     * @throws the scraping error when nothing usable was produced before it occurred.
     * @throws AssertionFailureError when scraping ended without error but without a usable snapshot.
     */
    async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
        let latest;
        let failure;
        try {
            for await (const snap of this.cachedScrap(url, opts, crawlerOptions)) {
                latest = snap;
            }
        } catch (err) {
            failure = err;
        }

        if (latest) {
            return latest;
        }
        if (failure) {
            throw failure;
        }

        throw new AssertionFailureError(`No content available`);
    }

    /**
     * Crawl a URL with a "good enough" cut-off and format the result.
     *
     * Snapshots are consumed from `iterSnapshots` (with `minIntervalMs: 500`). Once a snapshot
     * with parsed content has been seen, exactly one more snapshot is taken and iteration stops.
     * If iteration throws after at least one truthy snapshot arrived, that snapshot is formatted
     * and returned instead of failing.
     *
     * @throws the iteration error when no usable snapshot was received before it.
     * @throws AssertionFailureError when iteration ends without a usable snapshot.
     */
    async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
        const snapshots = this.iterSnapshots(url, { ...opts, minIntervalMs: 500 });

        let latest;
        let contentSeen = false;
        try {
            for await (const snap of snapshots) {
                latest = snap;

                // Content was already there on the previous round: this one is the final take.
                if (contentSeen) {
                    break;
                }

                contentSeen = Boolean(latest?.parsed?.content);
            }

        } catch (err) {
            if (!latest) {
                throw err;
            }

            return this.snapshotFormatter.formatSnapshot(mode, latest, url, this.urlValidMs);
        }

        if (!latest) {
            throw new AssertionFailureError(`No content available`);
        }

        return this.snapshotFormatter.formatSnapshot(mode, latest, url, this.urlValidMs);
    }

    /**
     * Derive the domain-profile key for a URL.
     *
     * The key is the lower-cased origin followed by the path with its last segment removed;
     * when that leaves nothing, the full pathname is used instead.
     *
     * @returns the MD5 digest of the key together with the path that went into it.
     */
    getDomainProfileUrlDigest(url: URL) {
        const { pathname } = url;

        const segments = pathname.split('/');
        segments.pop();
        const path = segments.join('/') || pathname;

        return {
            digest: md5Hasher.hash(url.origin.toLocaleLowerCase() + path),
            path,
        };
    }

    // One proxy allocation iterator per options object, so repeated calls with the same
    // options object keep drawing from the same iterator.
    proxyIterMap = new WeakMap<ExtraScrappingOptions, ReturnType<ProxyProviderService['iterAlloc']>>();
    /**
     * Side load a URL through a freshly allocated proxy.
     *
     * With `allocProxy === 'none'` this is a plain side load. Otherwise a proxy is taken from
     * the iterator associated with `opts` (falling back to a one-off allocation), used for the
     * request, and — when `opts.allocProxy` is set — remembered in `opts.proxyUrl` unless one
     * is already there.
     *
     * Retry predicate passed to `retryWith`: bad approach → false, bad attempt → true,
     * any other application error → false, everything else → undefined.
     *
     * @returns the side load result with the used `proxy` added.
     */
    @retryWith((err) => {
        if (err instanceof ServiceBadApproachError) {
            return false;
        }
        if (err instanceof ServiceBadAttemptError) {
            // Keep trying
            return true;
        }

        // Quit with any other application error; no opinion on the rest.
        return err instanceof ApplicationError ? false : undefined;
    }, 3)
    async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
        if (opts?.allocProxy === 'none') {
            return this.curlControl.sideLoad(url, opts);
        }

        let proxy;
        if (opts) {
            let allocations = this.proxyIterMap.get(opts);
            if (!allocations) {
                allocations = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts));
                this.proxyIterMap.set(opts, allocations);
            }
            const step = await allocations.next();
            proxy = step.value;
        }

        proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
        this.logger.debug(`Proxy allocated`, { proxy: proxy.href });

        const loaded = await this.curlControl.sideLoad(url, {
            ...opts,
            proxyUrl: proxy.href,
        });

        if (opts?.allocProxy) {
            opts.proxyUrl ??= proxy.href;
        }

        return { ...loaded, proxy };
    }

    /**
     * Pick the proxy country/selector to request from the proxy provider.
     *
     * Preference order: a supported `allocProxy`, then a supported `countryHint`, then the raw
     * `allocProxy` value, then `'auto'`. An unsupported `allocProxy` of `'none'` yields `'none'`;
     * missing options yield `'auto'`.
     */
    protected figureOutBestProxyCountry(opts?: ExtraScrappingOptions) {
        if (!opts) {
            return 'auto';
        }

        let choice;

        if (opts.allocProxy) {
            const requestedIsSupported = this.proxyProvider.supports(opts.allocProxy);
            if (requestedIsSupported) {
                choice = opts.allocProxy;
            } else if (opts.allocProxy === 'none') {
                return 'none';
            }
        }

        if (opts.countryHint && this.proxyProvider.supports(opts.countryHint)) {
            choice ??= opts.countryHint;
        }

        return choice ?? (opts.allocProxy || 'auto');
    }

    /**
     * Tell whether side loading must be avoided for this URL.
     * Currently true only for the host `chromewebstore.google.com`.
     */
    knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
        return url.hostname === 'chromewebstore.google.com';
    }

    /**
     * Check that the caller's tier allows the requested features and derive the charging policy.
     *
     * Each feature in use is asserted against `auth.assertTier` in a fixed order. Features may
     * raise the minimal charge (765 for generated alt text, 4000 for script injection,
     * Cloudflare rendering and language models) and set the charge multiplier (3 for language
     * models, 5 for proxy allocation; the proxy value wins when both apply).
     *
     * @returns `{ budget, chargeScalar, minimalCharge }`, with `budget` taken from `opts.tokenBudget` (0 when unset).
     */
    async saasAssertTierPolicy(opts: CrawlerOptions, auth: JinaEmbeddingsAuthDTO) {
        let scalar = 1;
        let floor = 0;

        if (opts.withGeneratedAlt) {
            await auth.assertTier(0, 'Alt text generation');
            floor = 765;
        }

        const injectsScripts = opts.injectPageScript || opts.injectFrameScript;
        if (injectsScripts) {
            await auth.assertTier(0, 'Script injection');
            floor = 4_000;
        }

        if (opts.withIframe) {
            await auth.assertTier(0, 'Iframe');
        }

        if (opts.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
            await auth.assertTier(0, 'Cloudflare browser rendering');
            floor = 4_000;
        }

        const usesLanguageModel = opts.respondWith.includes('lm') || opts.engine?.includes('lm');
        if (usesLanguageModel) {
            await auth.assertTier(0, 'Language model');
            floor = 4_000;
            scalar = 3;
        }

        if (opts.proxy && opts.proxy !== 'none') {
            // The generic 'auto' / 'any' selectors need tier 0; anything else needs tier 2.
            const requiredTier = ['auto', 'any'].includes(opts.proxy) ? 0 : 2;
            await auth.assertTier(requiredTier, 'Proxy allocation');
            scalar = 5;
        }

        return {
            budget: opts.tokenBudget || 0,
            chargeScalar: scalar,
            minimalCharge: floor,
        };
    }

    /**
     * Apply a tier policy to a raw charge amount.
     *
     * The amount is raised to the policy's minimal charge and then multiplied by its scalar.
     *
     * @returns the effective charge amount.
     * @throws BudgetExceededError when a non-zero budget is smaller than the effective amount.
     */
    saasApplyTierPolicy(policy: Awaited<ReturnType<typeof this.saasAssertTierPolicy>>, chargeAmount: number) {
        const flooredAmount = Math.max(chargeAmount, policy.minimalCharge);
        const effectiveChargeAmount = policy.chargeScalar * flooredAmount;

        // A budget of 0 means no budget was set.
        const overBudget = Boolean(policy.budget) && policy.budget < effectiveChargeAmount;
        if (overBudget) {
            throw new BudgetExceededError(`Token budget (${policy.budget}) exceeded, intended charge amount ${effectiveChargeAmount}`);
        }

        return effectiveChargeAmount;
    }
}


================================================
FILE: src/api/searcher.ts
================================================
import { singleton } from 'tsyringe';
import {
    assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureError, assignMeta, RawString,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import { objHashMd5B64Of } from 'civkit/hash';
import _ from 'lodash';

import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit';

import { CrawlerHost, ExtraScrappingOptions } from './crawler';
import { CrawlerOptions, RESPOND_TIMING } from '../dto/crawler-options';
import { SnapshotFormatter, FormattedPage as RealFormattedPage } from '../services/snapshot-formatter';
import { GoogleSearchExplicitOperatorsDto } from '../services/serper-search';

import { GlobalLogger } from '../services/logger';
import { AsyncLocalContext } from '../services/async-context';
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { InsufficientBalanceError } from '../services/errors';

import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
import { toAsyncGenerator } from '../utils/misc';
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { SERPResult } from '../db/searched';
import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { InternalJinaSerpService } from '../services/serp/internal';
import { WebSearchEntry } from '../services/serp/compat';

// Lower-cased keys of WORLD_COUNTRIES; used below to validate the `gl` search parameter.
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());

// Formatted page extended with the optional search-specific fields used in this file.
interface FormattedPage extends RealFormattedPage {
    favicon?: string;
    date?: string;
}

// Value type stored in SearcherHost.highFreqKeyCache.
type RateLimitCache = {
    // While this lies in the future, requests with the cached key are rejected as rate-limited.
    blockedUntil?: Date;
    // Cached account record, reused to pre-populate the auth DTO.
    user?: JinaEmbeddingsTokenAccount;
};

@singleton()
export class SearcherHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    // How long a cached SERP record is retained (7 days); used for `expireAt` in cachedSearch.
    cacheRetentionMs = 1000 * 3600 * 24 * 7;
    // Age (1 hour) after which a cached SERP record is considered stale in cachedSearch.
    cacheValidMs = 1000 * 3600;
    // Default page cache tolerance (24 hours) applied in search() when the caller sets none.
    pageCacheToleranceMs = 1000 * 3600 * 24;

    // Default delay (15 s) of the early-return timer in search() when no timeout is given.
    reasonableDelayMs = 15_000;

    // Fallback number of results when no explicit count is supplied.
    targetResultCount = 5;

    // Bearer token -> rate-limit state; entries expire after one hour and reads do not refresh them.
    highFreqKeyCache = new LRUCache<string, RateLimitCache>({
        max: 256,
        ttl: 60 * 60 * 1000,
        updateAgeOnGet: false,
        updateAgeOnHas: false,
    });

    // SERP records waiting to be persisted; drained by the interval started in the constructor.
    batchedCaches: SERPResult[] = [];

    /**
     * Wires the injected dependencies and starts the periodic flush of
     * `batchedCaches` into the SERPResult collection.
     *
     * The flush interval is 10 s plus up to 1 s of random jitter, and the timer
     * is unref'd so it does not keep the process alive.
     */
    constructor(
        protected globalLogger: GlobalLogger,
        protected rateLimitControl: RateLimitControl,
        protected threadLocal: AsyncLocalContext,
        protected crawler: CrawlerHost,
        protected snapshotFormatter: SnapshotFormatter,
        protected serperGoogle: SerperGoogleSearchService,
        protected serperBing: SerperBingSearchService,
        protected jinaSerp: InternalJinaSerpService,
    ) {
        super(...arguments);

        const flushIntervalMs = 1000 * 10 + Math.round(1000 * Math.random());

        const flushPendingCaches = () => {
            // Swap the queue out first so records pushed during the commit land in the next batch.
            const pending = this.batchedCaches;
            this.batchedCaches = [];
            if (pending.length === 0) {
                return;
            }

            const writeBatch = SERPResult.DB.batch();
            for (const record of pending) {
                writeBatch.set(SERPResult.COLLECTION.doc(), record.degradeForFireStore());
            }

            // Best effort: a failed commit is logged and the records are dropped.
            writeBatch.commit()
                .then(() => {
                    this.logger.debug(`Saved ${pending.length} caches by batch`);
                })
                .catch((err) => {
                    this.logger.warn(`Failed to cache search result in batch`, { err });
                });
        };

        setInterval(flushPendingCaches, flushIntervalMs).unref();
    }

    /**
     * Waits for the host's dependencies and then emits the `ready` event.
     */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Search endpoint, registered twice: as `searchIndex` on `/search` and on the `::q` path.
     *
     * Flow, as implemented below:
     *  1. Resolve the caller, using `highFreqKeyCache` to pre-populate the auth DTO.
     *  2. With neither a path nor `q`, return the index document instead of searching.
     *  3. Require a positive wallet balance and enforce the per-UID rate limit
     *     (awaited normally, checked asynchronously for cached high-frequency keys).
     *  4. Run the SERP query via `searchWithFallback`.
     *  5. Either return the bare results (`no-content` or `count === 0`) or crawl the
     *     result pages via `fetchSearchResults`.
     *  6. Respond as an SSE stream, JSON or plain text depending on what the client accepts.
     *
     * `chargeAmount` is updated as results are produced and is reported by the
     * `rpcReflect.finally` hook registered below.
     *
     * @throws InsufficientBalanceError when the wallet balance is not positive.
     * @throws RateLimitTriggeredError when a cached high-frequency key is currently blocked.
     * @throws AssertionFailureError when the search yields no results or no content.
     */
    @Method({
        name: 'searchIndex',
        ext: {
            http: {
                action: ['get', 'post'],
                path: '/search'
            }
        },
        tags: ['search'],
        returnType: [String, OutputServerEventStream],
    })
    @Method({
        ext: {
            http: {
                action: ['get', 'post'],
                path: '::q'
            }
        },
        tags: ['search'],
        returnType: [String, OutputServerEventStream, RawString],
    })
    async search(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: Context,
        auth: JinaEmbeddingsAuthDTO,
        crawlerOptions: CrawlerOptions,
        searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
        @Param('count', { validate: (v: number) => v >= 0 && v <= 20 })
        count: number,
        @Param('type', { type: new Set(['web', 'images', 'news']), default: 'web' })
        variant: 'web' | 'images' | 'news',
        @Param('provider', { type: new Set(['google', 'bing']), default: 'google' })
        searchEngine: 'google' | 'bing',
        @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
        num?: number,
        @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
        @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
        @Param('location') location?: string,
        @Param('page') page?: number,
        @Param('fallback', { type: Boolean, default: true }) fallback?: boolean,
        @Param('q') q?: string,
    ) {
        // We want to make our search API follow SERP schema, so we need to expose 'num' parameter.
        // Since we used 'count' as 'num' previously, we need to keep 'count' for old users.
        // Here we combine 'count' and 'num' to 'count' for the rest of the function.
        count = (num !== undefined ? num : count) ?? 10;

        // If this bearer token is cached as a high-frequency key, reuse the cached account record.
        const authToken = auth.bearerToken;
        let highFreqKey: RateLimitCache | undefined;
        if (authToken && this.highFreqKeyCache.has(authToken)) {
            highFreqKey = this.highFreqKeyCache.get(authToken)!;
            auth.user = highFreqKey.user;
            auth.uid = highFreqKey.user?.user_id;
        }

        const uid = await auth.solveUID();
        // Return content by default
        const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
        const withFavicon = Boolean(ctx.get('X-With-Favicons'));
        // Stored in the async-local context; read back by assignGeneralMixin.
        this.threadLocal.set('collect-favicon', withFavicon);
        crawlerOptions.respondTiming ??= RESPOND_TIMING.VISIBLE_CONTENT;

        let chargeAmount = 0;
        const noSlashPath = decodeURIComponent(ctx.path).slice(1);
        // No query at all: answer with the index document (JSON or plain text) instead of searching.
        if (!noSlashPath && !q) {
            const index = await this.crawler.getIndex(auth);
            if (!uid) {
                index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
            }
            if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {

                return index;
            }

            return assignTransferProtocolMeta(`${index}`,
                { contentType: 'text/plain', envelope: null }
            );
        }

        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }

        // A cached high-frequency key may carry a block recorded by an earlier asynchronous check.
        if (highFreqKey?.blockedUntil) {
            const now = new Date();
            const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
            if (blockedTimeRemaining > 0) {
                throw RateLimitTriggeredError.from({
                    message: `Per UID rate limit exceeded (async)`,
                    retryAfter: Math.ceil(blockedTimeRemaining / 1000),
                });
            }
        }

        // Use the limits attached to the auth DTO, else 1000/min for speed_level >= 2 and 100/min otherwise.
        const rateLimitPolicy = auth.getRateLimits(rpcReflect.name.toUpperCase()) || [
            parseInt(user.metadata?.speed_level) >= 2 ?
                RateLimitDesc.from({
                    occurrence: 1000,
                    periodSeconds: 60
                }) :
                RateLimitDesc.from({
                    occurrence: 100,
                    periodSeconds: 60
                })
        ];

        const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(
            rpcReflect, uid!, [rpcReflect.name.toUpperCase()],
            ...rateLimitPolicy
        );

        if (!highFreqKey) {
            // Normal path
            await apiRollPromise;

            // Keys allowed at least 400 requests per minute are cached as high-frequency keys.
            if (rateLimitPolicy.some(
                (x) => {
                    const rpm = x.occurrence / (x.periodSeconds / 60);
                    if (rpm >= 400) {
                        return true;
                    }

                    return false;
                })
            ) {
                this.highFreqKeyCache.set(auth.bearerToken!, {
                    user,
                });
            }

        } else {
            // High freq key path
            // The rate-limit check is not awaited here; its outcome only updates `blockedUntil`
            // on the cached entry, which is consulted at the start of subsequent requests.
            apiRollPromise.then(
                // Rate limit not triggered, make sure not blocking.
                () => {
                    delete highFreqKey.blockedUntil;
                },
                // Rate limit triggered
                (err) => {
                    if (!(err instanceof RateLimitTriggeredError)) {
                        return;
                    }
                    const now = Date.now();
                    let tgtDate;
                    if (err.retryAfterDate) {
                        tgtDate = err.retryAfterDate;
                    } else if (err.retryAfter) {
                        tgtDate = new Date(now + err.retryAfter * 1000);
                    }

                    if (tgtDate) {
                        const dt = tgtDate.valueOf() - now;
                        highFreqKey.blockedUntil = tgtDate;
                        // Lift the block when it expires, unless a newer block replaced it meanwhile.
                        setTimeout(() => {
                            if (highFreqKey.blockedUntil === tgtDate) {
                                delete highFreqKey.blockedUntil;
                            }
                        }, dt).unref();
                    }
                }
            ).finally(async () => {
                // Always asynchronously update user(wallet);
                const user = await auth.getBrief().catch(() => undefined);
                if (user) {
                    highFreqKey.user = user;
                }
            });
        }

        // Usage reporting hook: reads whatever `chargeAmount` holds when it runs.
        rpcReflect.finally(async () => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                });
                try {
                    const apiRoll = await apiRollPromise;
                    apiRoll.chargeAmount = chargeAmount;

                } catch (err) {
                    // The rate-limit promise rejected, so there is no roll to update; save a separate record.
                    await this.rateLimitControl.record({
                        uid,
                        tags: [rpcReflect.name.toUpperCase()],
                        status: API_CALL_STATUS.SUCCESS,
                        chargeAmount,
                    }).save().catch((err) => {
                        this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) });
                    });
                }
            }
        });

        delete crawlerOptions.html;

        const crawlOpts = await this.crawler.configure(crawlerOptions);
        const searchQuery = searchExplicitOperators.addTo(q || noSlashPath);

        // On the first page, request more results (20, or 30 when count > 10) than were asked for.
        let fetchNum = count;
        if ((page ?? 1) === 1) {
            fetchNum = count > 10 ? 30 : 20;
        }

        let fallbackQuery: string | undefined;
        // Charge multiplier: 3 for bing; any non-web variant sets it to 5, replacing the bing value.
        let chargeAmountScaler = 1;
        if (searchEngine === 'bing') {
            this.threadLocal.set('bing-preferred', true);
            chargeAmountScaler = 3;
        }

        if (variant !== 'web') {
            chargeAmountScaler = 5;
        }

        // Search with fallback logic if enabled
        const searchParams = {
            variant,
            provider: searchEngine,
            q: searchQuery,
            num: fetchNum,
            gl,
            hl,
            location,
            page,
        };

        const { results, query: successQuery, tryTimes } = await this.searchWithFallback(
            searchParams, fallback, crawlerOptions.noCache
        );
        // Every search attempt (original plus fallbacks) multiplies the charge.
        chargeAmountScaler *= tryTimes;

        fallbackQuery = successQuery !== searchQuery ? successQuery : undefined;

        if (!results.length) {
            throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
        }

        // Crawl timeouts shorter than 30 s are discarded.
        if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
            delete crawlOpts.timeoutMs;
        }


        let lastScrapped: any[] | undefined;
        // When pages are crawled, keep two spare candidates beyond `count`.
        const targetResultCount = crawlWithoutContent ? count : count + 2;
        const trimmedResults: any[] = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
        // Plain-text rendering of the list: each entry is rendered with its index and the
        // fallback query, if any, is prepended.
        trimmedResults.toString = function () {
            let r = this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
            if (fallbackQuery) {
                r = `Fallback query: ${fallbackQuery}\n\n${r}`;
            }
            return r;
        };
        // For content-bearing formats, make sure every entry has a `content` field, even if empty.
        if (!crawlerOptions.respondWith.includes('no-content') &&
            ['html', 'text', 'shot', 'markdown', 'content'].some((x) => crawlerOptions.respondWith.includes(x))
        ) {
            for (const x of trimmedResults) {
                x.content ??= '';
            }
        }
        // Started now, awaited later: assigns favicons (if requested) and the toString prototype.
        const assigningOfGeneralMixins = Promise.allSettled(
            trimmedResults.map((x) => this.assignGeneralMixin(x))
        );

        let it;

        if (crawlWithoutContent || count === 0) {
            // No crawling: emit the SERP entries as they are.
            it = toAsyncGenerator(trimmedResults);
            await assigningOfGeneralMixins;
        } else {
            it = this.fetchSearchResults(crawlerOptions.respondWith, trimmedResults, crawlOpts,
                CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
                count,
            );
        }

        // Response mode 1: server-sent events, one `data` event per intermediate result set.
        if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
            const sseStream = new OutputServerEventStream();
            rpcReflect.return(sseStream);
            try {
                for await (const scrapped of it) {
                    if (!scrapped) {
                        continue;
                    }
                    if (rpcReflect.signal.aborted) {
                        break;
                    }

                    chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler, fallbackQuery);
                    lastScrapped = scrapped;

                    if (fallbackQuery) {
                        sseStream.write({
                            event: 'meta',
                            data: { fallback: fallbackQuery },
                        });
                    }

                    sseStream.write({
                        event: 'data',
                        data: scrapped,
                    });
                }
            } catch (err: any) {
                this.logger.error(`Failed to collect search result for query ${searchQuery}`,
                    { err: marshalErrorLike(err) }
                );
                sseStream.write({
                    event: 'error',
                    data: marshalErrorLike(err),
                });
            }

            sseStream.end();

            return sseStream;
        }

        // Response mode 2: JSON. Returns as soon as the result set qualifies, or when the
        // early-return timer fires after at least one page has qualified.
        let earlyReturn = false;
        if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
            let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
            const setEarlyReturnTimer = () => {
                if (earlyReturnTimer) {
                    return;
                }
                earlyReturnTimer = setTimeout(async () => {
                    if (!lastScrapped) {
                        return;
                    }
                    await assigningOfGeneralMixins;
                    chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);

                    rpcReflect.return(lastScrapped);
                    earlyReturn = true;
                }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
            };

            for await (const scrapped of it) {
                lastScrapped = scrapped;
                if (rpcReflect.signal.aborted || earlyReturn) {
                    break;
                }
                // Arm the timer once any page is usable.
                if (_.some(scrapped, (x) => this.pageQualified(x))) {
                    setEarlyReturnTimer();
                }
                if (!this.searchResultsQualified(scrapped, count)) {
                    continue;
                }
                if (earlyReturnTimer) {
                    clearTimeout(earlyReturnTimer);
                }
                await assigningOfGeneralMixins;
                chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler, fallbackQuery);

                return scrapped;
            }

            if (earlyReturnTimer) {
                clearTimeout(earlyReturnTimer);
            }

            if (!lastScrapped) {
                throw new AssertionFailureError(`No content available for query ${searchQuery}`);
            }

            // If the timer already responded, the charge has been assigned there.
            if (!earlyReturn) {
                await assigningOfGeneralMixins;
                chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);
            }

            return lastScrapped;
        }

        // Response mode 3 (default): plain text, with the same early-return logic as the JSON mode.
        let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
        const setEarlyReturnTimer = () => {
            if (earlyReturnTimer) {
                return;
            }
            earlyReturnTimer = setTimeout(async () => {
                if (!lastScrapped) {
                    return;
                }
                await assigningOfGeneralMixins;
                chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);

                rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
                earlyReturn = true;
            }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
        };

        for await (const scrapped of it) {
            lastScrapped = scrapped;
            if (rpcReflect.signal.aborted || earlyReturn) {
                break;
            }
            if (_.some(scrapped, (x) => this.pageQualified(x))) {
                setEarlyReturnTimer();
            }

            if (!this.searchResultsQualified(scrapped, count)) {
                continue;
            }

            if (earlyReturnTimer) {
                clearTimeout(earlyReturnTimer);
            }
            await assigningOfGeneralMixins;
            chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler, fallbackQuery);

            return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
        }

        if (earlyReturnTimer) {
            clearTimeout(earlyReturnTimer);
        }

        if (!lastScrapped) {
            throw new AssertionFailureError(`No content available for query ${searchQuery}`);
        }

        if (!earlyReturn) {
            await assigningOfGeneralMixins;
            chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);
        }

        return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
    }

    /**
     * Search with fallback to progressively shorter queries if no results found.
     *
     * The original query is tried first. If it yields nothing and `useFallback`
     * is set, up to three shortened variants are tried, each dropping a further
     * `ceil(termCount / 4)` terms: from the end of the query normally, from the
     * start when the query contains right-to-left script. If those fail while
     * the last attempted variant was still longer than two terms, a final
     * two-term query is tried.
     *
     * @param params Search parameters
     * @param useFallback Whether to use the fallback mechanism
     * @param noCache Whether to bypass cache
     * @returns The results, the query that produced them (the original query when
     *          nothing was found), and `tryTimes`, the number of searches issued
     */
    async searchWithFallback(
        params: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; },
        useFallback: boolean = false,
        noCache: boolean = false
    ) {
        // Try original query first
        const originalQuery = params.q;
        const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(originalQuery);

        // Extract results based on variant
        let tryTimes = 1;
        // Pass a copy so the callee cannot alter `params`, which is reused below to build the
        // fallback queries (they must keep the same provider preference as the first attempt).
        const results = await this.cachedSearch(params.variant, { ...params }, noCache);
        if (results.length || !useFallback) {
            return { results, query: params.q, tryTimes };
        }

        const queryTerms = originalQuery.split(/\s+/);
        const lastResort = containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2);

        this.logger.info(`No results for "${originalQuery}", trying fallback queries`);

        let terms: string[] = [];
        // fallback n times
        const n = 4;

        while (tryTimes < n) {
            const delta = Math.ceil(queryTerms.length / n) * tryTimes;
            // Clamp the end index at 0: a negative `end` makes `slice` count from the tail, which
            // would produce a longer query than the previous attempt (e.g. 5 terms on the 3rd retry).
            terms = containsRTL
                ? queryTerms.slice(delta)
                : queryTerms.slice(0, Math.max(0, queryTerms.length - delta));
            const query = terms.join(' ');
            if (!query) {
                break;
            }
            tryTimes += 1;
            this.logger.info(`Retrying search with fallback query: "${query}"`);
            const fallbackParams = { ...params, q: query };
            const fallbackResults = await this.cachedSearch(params.variant, fallbackParams, noCache);
            if (fallbackResults.length > 0) {
                return { results: fallbackResults, query: fallbackParams.q, tryTimes };
            }
        }

        // Only worth trying when the last attempted variant was longer than the two-term query.
        if (terms.length > lastResort.length) {
            const query = lastResort.join(' ');
            this.logger.info(`Retrying search with fallback query: "${query}"`);
            const fallbackParams = { ...params, q: query };
            tryTimes += 1;
            const fallbackResults = await this.cachedSearch(params.variant, fallbackParams, noCache);

            if (fallbackResults.length > 0) {
                return { results: fallbackResults, query, tryTimes };
            }
        }

        return { results, query: originalQuery, tryTimes };
    }

    /**
     * Crawls the pages behind `searchResults` and yields the re-organized
     * result list each time the crawler reports progress.
     *
     * Formatted pages are merged into the corresponding `searchResults` entries
     * in place. Formatting is memoized per snapshot object, so a snapshot that
     * shows up again in a later batch is not formatted twice.
     *
     * @param mode Response format passed to the snapshot formatter.
     * @param searchResults Entries to enrich; nothing is yielded when omitted.
     * @param options Scrapping options forwarded to the crawler.
     * @param crawlerOptions Crawler options forwarded to the crawler.
     * @param count Desired number of results, forwarded to `reOrganizeSearchResults`.
     */
    async *fetchSearchResults(
        mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
        searchResults?: FormattedPage[],
        options?: ExtraScrappingOptions,
        crawlerOptions?: CrawlerOptions,
        count?: number,
    ) {
        if (!searchResults) {
            return;
        }
        const pages = searchResults;
        const targets = pages.map((entry) => new URL(entry.url!));
        const formattedBySnapshot = new WeakMap();

        for await (const scrapped of this.crawler.scrapMany(targets, options, crawlerOptions)) {
            const pending: any[] = [];
            for (const [idx, snapshot] of scrapped.entries()) {
                if (!snapshot) {
                    // Nothing crawled yet for this slot.
                    pending.push({});
                    continue;
                }
                if (formattedBySnapshot.has(snapshot)) {
                    pending.push(formattedBySnapshot.get(snapshot));
                    continue;
                }

                const formatting = this.crawler.formatSnapshotWithPDFSideLoad(mode, snapshot, targets[idx], undefined, options)
                    .then((formattedPage) => {
                        formattedBySnapshot.set(snapshot, formattedPage);

                        return formattedPage;
                    })
                    .catch((err) => {
                        // A page that fails to format degrades to an empty entry.
                        this.logger.error(`Failed to format snapshot for ${targets[idx].href}`, { err: marshalErrorLike(err) });

                        return {};
                    });
                pending.push(formatting);
            }

            const formattedPages = await Promise.all(pending) as FormattedPage[];
            formattedPages.forEach((formattedPage, idx) => {
                if (formattedPage) {
                    Object.assign(pages[idx], formattedPage);
                }
            });

            yield this.reOrganizeSearchResults(pages, count);
        }
    }

    /**
     * Picks at most `count` entries (or `this.targetResultCount` when `count` is
     * falsy), preferring qualified pages and topping up with unqualified ones.
     * The original ordering of `searchResults` is preserved.
     *
     * @param searchResults Candidate entries in SERP order.
     * @param count Desired number of entries.
     * @returns A new array that carries the same `toString` as `searchResults`.
     */
    reOrganizeSearchResults(searchResults: FormattedPage[], count?: number) {
        const limit = count || this.targetResultCount;
        const [ready, notReady] = _.partition(searchResults, (page) => this.pageQualified(page));

        // Free slots left for pages that have not qualified yet.
        const vacancy = Math.max(limit - ready.length, 0);
        const accepted = new Set([...ready, ...notReady.slice(0, vacancy)]);

        const picked = searchResults
            .filter((page) => accepted.has(page))
            .slice(0, limit);

        picked.toString = searchResults.toString;

        return picked;
    }

    /**
     * Computes the charge for a result set and attaches usage metadata to it.
     *
     * The charge is the larger of the summed per-page charges and a flat charge
     * of 10000 tokens per started block of ten results, times `scaler`. When the
     * flat charge applies, it is spread evenly over the entries' `usage` fields.
     *
     * @param formatted Result entries; their `usage` may be overwritten.
     * @param num Requested result count (not used by the computation).
     * @param scaler Multiplier applied to the flat charge.
     * @param fallbackQuery Fallback query to record in the metadata, if any.
     * @returns The final charge amount.
     */
    assignChargeAmount(formatted: FormattedPage[], num: number, scaler: number, fallbackQuery?: string) {
        const contentCharge = formatted.reduce(
            (sum, page) => sum + (this.crawler.assignChargeAmount(page) || 0),
            0
        );
        const numCharge = Math.ceil(formatted.length / 10) * 10000 * scaler;
        const final = Math.max(contentCharge, numCharge);

        if (final === numCharge) {
            const perItemTokens = Math.ceil(numCharge / formatted.length);
            formatted.forEach((page) => {
                page.usage = { tokens: perItemTokens };
            });
        }

        const usage = { tokens: final };
        const metadata: Record<string, any> = fallbackQuery ? { usage, fallback: fallbackQuery } : { usage };

        assignMeta(formatted, metadata);

        return final;
    }

    /**
     * Tells whether a page carries enough data to be presented.
     *
     * A page qualifies when it has both a title and content, or any one of
     * screenshot URL, pageshot URL, text or HTML. The first truthy operand is
     * returned as-is (not coerced to boolean); callers use it as a predicate.
     */
    pageQualified(formattedPage: FormattedPage) {
        const titledContent = formattedPage.title && formattedPage.content;

        return titledContent ||
            formattedPage.screenshotUrl ||
            formattedPage.pageshotUrl ||
            formattedPage.text ||
            formattedPage.html;
    }

    /**
     * Tells whether a result set is complete: every entry qualifies and there
     * are at least `targetResultCount` entries.
     *
     * @param results Result entries to check.
     * @param targetResultCount Required number of entries; defaults to `this.targetResultCount`.
     */
    searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) {
        const allQualified = _.every(results, (page) => this.pageQualified(page));
        if (!allQualified) {
            return false;
        }

        return results.length >= targetResultCount;
    }

    /**
     * Fetches a 32px favicon for `domain` from Google's favicon service and
     * returns it as a PNG data URL.
     *
     * @param domain Value placed in the `domain_url` query parameter.
     * @returns The data URL, or an empty string on a non-OK response or any error.
     */
    async getFavicon(domain: string) {
        const endpoint = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;

        try {
            const res = await fetch(endpoint);
            if (!res.ok) {
                return '';
            }

            const encoded = Buffer.from(await res.arrayBuffer()).toString('base64');

            return `data:image/png;base64,${encoded}`;
        } catch (error: any) {
            // Favicons are optional; log and carry on without one.
            this.logger.warn(`Failed to get favicon base64 string`, { err: marshalErrorLike(error) });
            return '';
        }
    }

    /**
     * Yields the search providers to try, in order.
     *
     * For the `web` variant the primary provider is the internal Jina SERP
     * service, otherwise Serper Google. A `bing` preference puts Serper Bing
     * first, followed by the primary provider and Serper Google. Any other
     * preference yields the primary provider followed by Serper Google twice.
     *
     * @param preference Preferred provider (`'bing'`, `'google'` or unset).
     * @param variant Search variant.
     */
    *iterProviders(preference?: string, variant?: string) {
        const primary = variant === 'web' ? this.jinaSerp : this.serperGoogle;

        const sequence = preference === 'bing'
            ? [this.serperBing, primary, this.serperGoogle]
            : [primary, this.serperGoogle, this.serperGoogle];

        yield* sequence;
    }

    /**
     * Runs a search through the provider chain, backed by the SERPResult cache.
     *
     * A cache record younger than `cacheValidMs` is returned directly. Otherwise
     * the providers from `iterProviders` are tried in order until one succeeds.
     * Non-empty responses are queued in `batchedCaches` for persistence. When the
     * live search fails and a stale cache record exists, the stale record is
     * returned instead of throwing.
     *
     * @param variant Kind of search to run.
     * @param query Search parameters. `provider` only selects the provider order
     *              and is not forwarded to the provider. The object is not modified.
     * @param noCache Skip the cache lookup (fresh results are still queued for caching).
     * @returns The search entries.
     * @throws The last provider error when no results were obtained and no stale cache exists.
     */
    async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, noCache?: boolean): Promise<WebSearchEntry[]> {
        // The digest covers the provider preference as well.
        const queryDigest = objHashMd5B64Of({ ...query, variant });
        // Split `provider` off into a local copy instead of deleting it from the caller's object:
        // callers reuse that object to build follow-up queries and must not lose the preference.
        const { provider, ...providerQuery } = query;
        let cache;
        if (!noCache) {
            cache = (await SERPResult.fromFirestoreQuery(
                SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
                    .orderBy('createdAt', 'desc')
                    .limit(1)
            ))[0];
            if (cache) {
                const age = Date.now() - cache.createdAt.valueOf();
                const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${providerQuery.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query: providerQuery, digest: queryDigest, age, stale
                });

                if (!stale) {
                    return cache.response as any;
                }
            }
        }

        try {
            let r: any[] | undefined;
            let lastError;
            outerLoop:
            for (const client of this.iterProviders(provider, variant)) {
                const t0 = Date.now();
                try {
                    switch (variant) {
                        case 'images': {
                            r = await Reflect.apply(client.imageSearch, client, [providerQuery]);
                            break;
                        }
                        case 'news': {
                            r = await Reflect.apply(client.newsSearch, client, [providerQuery]);
                            break;
                        }
                        case 'web':
                        default: {
                            r = await Reflect.apply(client.webSearch, client, [providerQuery]);
                            break;
                        }
                    }
                    const dt = Date.now() - t0;
                    this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
                    // First provider that answers wins.
                    break outerLoop;
                } catch (err) {
                    lastError = err;
                    const dt = Date.now() - t0;
                    this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, });
                }
            }

            if (r?.length) {
                const nowDate = new Date();
                const record = SERPResult.from({
                    query: providerQuery,
                    queryDigest,
                    response: r,
                    createdAt: nowDate,
                    expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
                });

                // Persisted later by the periodic batch flush.
                this.batchedCaches.push(record);
            } else if (lastError) {
                throw lastError;
            }

            return r as WebSearchEntry[];
        } catch (err: any) {
            if (cache) {
                this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });

                return cache.response as any;
            }

            throw err;
        }
    }

    /**
     * Converts a provider search entry into the shape returned to clients:
     * `title`, `url` (from `link`), `description` (from `snippet`) plus a fixed
     * set of optional pass-through properties.
     *
     * @param input Entry as returned by a search provider.
     */
    mapToFinalResults(input: WebSearchEntry) {
        const whitelistedProps = [
            'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
        ];

        return {
            title: input.title,
            url: input.link,
            description: Reflect.get(input, 'snippet'),
            ..._.pick(input, whitelistedProps),
        };
    }

    /**
     * Decorates a result entry: optionally attaches its favicon, then gives it
     * the shared `searchResultProto` prototype (which supplies `toString`).
     *
     * Favicons are collected only when the `collect-favicon` flag is set in the
     * async-local context and the entry has a URL.
     *
     * @param result Entry to decorate in place.
     */
    async assignGeneralMixin(result: FormattedPage) {
        const wantsFavicon = this.threadLocal.get('collect-favicon');

        if (wantsFavicon && result.url) {
            const { origin } = new URL(result.url);
            const favicon = await this.getFavicon(origin);
            Reflect.set(result, 'favicon', favicon);
        }

        Object.setPrototypeOf(result, searchResultProto);
    }
}

// Result properties rendered by searchResultProto.toString, in output order,
// each paired with the label used in the plain-text output.
const dataItems = [
    { key: 'title', label: 'Title' },
    { key: 'source', label: 'Source' },
    { key: 'url', label: 'URL Source' },
    { key: 'imageUrl', label: 'Image URL' },
    { key: 'description', label: 'Description' },
    { key: 'publishedTime', label: 'Published Time' },
    { key: 'imageWidth', label: 'Image Width' },
    { key: 'imageHeight', label: 'Image Height' },
    { key: 'date', label: 'Date' },
    { key: 'favicon', label: 'Favicon' },
];

// Prototype installed on search results so that string conversion yields a
// human-readable text block.
const searchResultProto = {
    /**
     * Render the result as text.
     *
     * @param i Optional zero-based position of the result. When given, every
     *          line/section is prefixed with the one-based ordinal `[i + 1]`;
     *          otherwise field lines use the `[Label]: value` form.
     */
    toString(this: FormattedPage, i?: number) {
        const sectionPrefix = i === undefined ? '' : `[${i + 1}] `;
        const lines: string[] = [];

        // Scalar fields, in the order declared by dataItems.
        for (const { key, label } of dataItems) {
            const value = Reflect.get(this, key);
            if (typeof value === 'undefined') {
                continue;
            }
            lines.push(
                i === undefined
                    ? `[${label}]: ${value}`
                    : `[${i + 1}] ${label}: ${value}`
            );
        }

        if (this.content) {
            lines.push(`\n${this.content}`);
        }

        if (this.images) {
            const imageLines: string[] = [];
            for (const [alt, src] of Object.entries(this.images)) {
                imageLines.push(`- ![${alt}](${src})`);
            }
            if (imageLines.length === 0) {
                imageLines.push('This page does not seem to contain any images.');
            }
            lines.push([`${sectionPrefix}Images:`, ...imageLines].join('\n'));
        }

        if (this.links) {
            // Links may come either as an array of pairs or as a keyed object.
            const pairs = Array.isArray(this.links) ? this.links : Object.entries(this.links);
            const linkLines: string[] = [];
            for (const [text, href] of pairs) {
                linkLines.push(`- [${text}](${href})`);
            }
            if (linkLines.length === 0) {
                linkLines.push('This page does not seem to contain any buttons/links.');
            }
            lines.push([`${sectionPrefix}Links/Buttons:`, ...linkLines].join('\n'));
        }

        return lines.join('\n');
    }
};


================================================
FILE: src/api/serp.ts
================================================
import { singleton } from 'tsyringe';
import {
    RPCHost, RPCReflection, assignMeta, RawString,
    ParamValidationError,
    assignTransferProtocolMeta,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import _ from 'lodash';

import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit';

import { GlobalLogger } from '../services/logger';
import { AsyncLocalContext } from '../services/async-context';
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { InsufficientBalanceError } from '../services/errors';
import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { GoogleSERP } from '../services/serp/google';
import { WebSearchEntry } from '../services/serp/compat';
import { CrawlerOptions } from '../dto/crawler-options';
import { ScrappingOptions } from '../services/serp/puppeteer';
import { objHashMd5B64Of } from 'civkit/hash';
import { SERPResult } from '../db/searched';
import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { InternalJinaSerpService } from '../services/serp/internal';

// Lower-cased keys of WORLD_COUNTRIES; used to validate the `gl` parameter.
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());

// Per-token entry kept in SerpHost.highFreqKeyCache.
type RateLimitCache = {
    // Set when an asynchronous rate-limit check failed; cleared when it passes or the block lapses.
    blockedUntil?: Date;
    // Last known account for the token; refreshed asynchronously after each request.
    user?: JinaEmbeddingsTokenAccount;
};

// Prototype for the index object: stringifies own properties as one
// `[Key label] value` line each. An empty key produces an empty line.
const indexProto = {
    toString: function (): string {
        const lines = _.toPairs(this).map(([k, v]) => {
            if (!k) {
                return '';
            }

            return `[${_.upperFirst(_.lowerCase(k))}] ${v}`;
        });

        return lines.join('\n') + '\n';
    }
};

@singleton()
export class SerpHost extends RPCHost {
    // Child logger tagged with this class's name.
    logger = this.globalLogger.child({ service: this.constructor.name });

    // How long a stored search result is kept (sets `expireAt`): 7 days.
    cacheRetentionMs = 1000 * 3600 * 24 * 7;
    // A cached result older than this (1 hour) is considered stale and is only
    // used as a fallback when the live search fails.
    cacheValidMs = 1000 * 3600;
    // 1 day. NOTE(review): not referenced by any method of this class.
    pageCacheToleranceMs = 1000 * 3600 * 24;

    // 15 seconds. NOTE(review): not referenced by any method of this class.
    reasonableDelayMs = 15_000;

    // NOTE(review): not referenced by any method of this class.
    targetResultCount = 5;

    // Keyed by bearer token. Holds the account (and any pending block) for keys
    // whose rate limit is high enough to take the non-blocking rate-limit path.
    // At most 256 entries, each expiring one hour after being set.
    highFreqKeyCache = new LRUCache<string, RateLimitCache>({
        max: 256,
        ttl: 60 * 60 * 1000,
        updateAgeOnGet: false,
        updateAgeOnHas: false,
    });

    // Fresh search results waiting to be written to the database by the
    // periodic flush started in the constructor.
    batchedCaches: SERPResult[] = [];

    /**
     * Build the index (landing) object returned when no query is supplied.
     *
     * The object inherits from `indexProto`, so it stringifies into a plain-text
     * listing. For an authenticated caller it includes the identity and wallet
     * balance; otherwise a note asking for authentication.
     *
     * @param ctx Request context; `ctx.origin` is used to build a usage example.
     * @param auth Optional auth DTO of the caller.
     */
    async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
        const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
        indexObject.usage1 = 'https://r.jina.ai/YOUR_URL';
        indexObject.usage2 = 'https://s.jina.ai/YOUR_SEARCH_QUERY';
        indexObject.usage3 = `${ctx.origin}/?q=YOUR_SEARCH_QUERY`;
        indexObject.homepage = 'https://jina.ai/reader';

        const user = auth?.user;
        if (!user) {
            indexObject.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';

            return indexObject;
        }

        // The empty key renders as a blank separator line in the text form.
        indexObject[''] = undefined;
        indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
        indexObject.balanceLeft = user.wallet.total_balance;

        return indexObject;
    }

    /**
     * Wires up dependencies and starts the periodic cache flush.
     *
     * Roughly every 10-11 seconds (randomised once per instance) the results
     * accumulated in `batchedCaches` are written to the database in a single
     * batch. The timer is unref'ed so it does not keep the process alive.
     */
    constructor(
        protected globalLogger: GlobalLogger,
        protected rateLimitControl: RateLimitControl,
        protected threadLocal: AsyncLocalContext,
        protected googleSerp: GoogleSERP,
        protected serperGoogle: SerperGoogleSearchService,
        protected serperBing: SerperBingSearchService,
        protected jinaSerp: InternalJinaSerpService,
    ) {
        super(...arguments);

        const flushIntervalMs = 1000 * 10 + Math.round(1000 * Math.random());

        const flushPendingCaches = () => {
            // Swap the buffer first so records pushed during the write are kept for the next round.
            const pending = this.batchedCaches;
            this.batchedCaches = [];
            if (pending.length === 0) {
                return;
            }

            const writeBatch = SERPResult.DB.batch();
            pending.forEach((record) => {
                writeBatch.set(SERPResult.COLLECTION.doc(), record.degradeForFireStore());
            });

            writeBatch.commit()
                .then(() => {
                    this.logger.debug(`Saved ${pending.length} caches by batch`);
                })
                .catch((err) => {
                    this.logger.warn(`Failed to cache search result in batch`, { err });
                });
        };

        setInterval(flushPendingCaches, flushIntervalMs).unref();
    }

    /**
     * Waits for `dependencyReady()` to resolve, then emits `ready`.
     */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Search endpoint.
     *
     * Flow:
     *  1. Resolve the caller. Tokens present in `highFreqKeyCache` reuse the cached account.
     *  2. Without `q`: on `/` return the index object (JSON or plain text), otherwise
     *     throw `ParamValidationError`.
     *  3. Require a positive wallet balance, then apply the rate limit. For cached
     *     high-frequency keys the rate-limit check runs asynchronously and only
     *     records `blockedUntil`; it does not reject the current request.
     *  4. Run the (cached) search. If `fallback` is set and the first page is empty,
     *     retry with progressively shorter queries, then with a two-term last resort.
     *  5. Map results, attach favicons if requested, and assign usage/charge metadata.
     *
     * The charge is `ceil(results / 10) * 10000 * scaler`, where the scaler depends
     * on engine/variant and is multiplied by the number of searches performed.
     *
     * @throws ParamValidationError when `q` is missing on a non-index path.
     * @throws InsufficientBalanceError when the wallet balance is not positive.
     */
    @Method({
        name: 'searchIndex',
        ext: {
            http: {
                action: ['get', 'post'],
                path: '/'
            }
        },
        tags: ['search'],
        returnType: [String, OutputServerEventStream, RawString],
    })
    @Method({
        ext: {
            http: {
                action: ['get', 'post'],
            }
        },
        tags: ['search'],
        returnType: [String, OutputServerEventStream, RawString],
    })
    async search(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: Context,
        crawlerOptions: CrawlerOptions,
        auth: JinaEmbeddingsAuthDTO,
        @Param('type', { type: new Set(['web', 'images', 'news']), default: 'web' })
        variant: 'web' | 'images' | 'news',
        @Param('q') q?: string,
        @Param('provider', { type: new Set(['google', 'bing']) })
        searchEngine?: 'google' | 'bing',
        @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
        num?: number,
        @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
        @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string,
        @Param('location') location?: string,
        @Param('page') page?: number,
        @Param('fallback') fallback?: boolean,
    ) {
        const authToken = auth.bearerToken;
        let highFreqKey: RateLimitCache | undefined;
        if (authToken && this.highFreqKeyCache.has(authToken)) {
            // Known high-frequency key: reuse the cached account instead of resolving it again.
            highFreqKey = this.highFreqKeyCache.get(authToken)!;
            auth.user = highFreqKey.user;
            auth.uid = highFreqKey.user?.user_id;
        }

        const uid = await auth.solveUID();
        if (!q) {
            if (ctx.path === '/') {
                const indexObject = await this.getIndex(ctx, auth);
                if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
                    return indexObject;
                }

                return assignTransferProtocolMeta(`${indexObject}`,
                    { contentType: 'text/plain; charset=utf-8', envelope: null }
                );
            }
            throw new ParamValidationError({
                path: 'q',
                message: `Required but not provided`
            });
        }
        // Return content by default
        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }

        if (highFreqKey?.blockedUntil) {
            const now = new Date();
            const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
            if (blockedTimeRemaining > 0) {
                // Enforcement is intentionally disabled here; the condition is only logged.
                this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`);
                // throw RateLimitTriggeredError.from({
                //     message: `Per UID rate limit exceeded (async)`,
                //     retryAfter: Math.ceil(blockedTimeRemaining / 1000),
                // });
            }
        }

        const PREMIUM_KEY_LIMIT = 400;
        const rateLimitPolicy = auth.getRateLimits('SEARCH') || [
            parseInt(user.metadata?.speed_level) >= 2 ?
                RateLimitDesc.from({
                    occurrence: PREMIUM_KEY_LIMIT,
                    periodSeconds: 60
                }) :
                RateLimitDesc.from({
                    occurrence: 40,
                    periodSeconds: 60
                })
        ];

        const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(
            rpcReflect, uid!, ['SEARCH'],
            ...rateLimitPolicy
        );

        if (!highFreqKey) {
            // Normal path
            await apiRollPromise;

            // Keys allowed at least PREMIUM_KEY_LIMIT requests per minute are cached
            // so that their subsequent requests take the non-blocking path below.
            if (rateLimitPolicy.some(
                (x) => {
                    const rpm = x.occurrence / (x.periodSeconds / 60);
                    if (rpm >= PREMIUM_KEY_LIMIT) {
                        return true;
                    }

                    return false;
                })
            ) {
                this.highFreqKeyCache.set(auth.bearerToken!, {
                    user,
                });
            }
        } else {
            // High freq key path
            apiRollPromise.then(
                // Rate limit not triggered, make sure not blocking.
                () => {
                    delete highFreqKey.blockedUntil;
                },
                // Rate limit triggered
                (err) => {
                    if (!(err instanceof RateLimitTriggeredError)) {
                        return;
                    }
                    const now = Date.now();
                    let tgtDate;
                    if (err.retryAfterDate) {
                        tgtDate = err.retryAfterDate;
                    } else if (err.retryAfter) {
                        tgtDate = new Date(now + err.retryAfter * 1000);
                    }

                    if (tgtDate) {
                        const dt = tgtDate.valueOf() - now;
                        highFreqKey.blockedUntil = tgtDate;
                        setTimeout(() => {
                            // Only clear the block if it has not been replaced by a newer one.
                            if (highFreqKey.blockedUntil === tgtDate) {
                                delete highFreqKey.blockedUntil;
                            }
                        }, dt).unref();
                    }
                }
            ).finally(async () => {
                // Always asynchronously update user(wallet);
                const user = await auth.getBrief().catch(() => undefined);
                if (user) {
                    highFreqKey.user = user;
                }
            });
        }

        let chargeAmount = 0;
        rpcReflect.finally(async () => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, `reader-search`).catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                });
                try {
                    const apiRoll = await apiRollPromise;
                    apiRoll.chargeAmount = chargeAmount;
                } catch (err) {
                    // No roll record is available (the rate-limit promise rejected); save a standalone record.
                    await this.rateLimitControl.record({
                        uid,
                        tags: [rpcReflect.name.toUpperCase()],
                        status: API_CALL_STATUS.SUCCESS,
                        chargeAmount,
                    }).save().catch((err) => {
                        this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) });
                    });
                }
            }
        });

        let chargeAmountScaler = 1;
        if (searchEngine === 'bing') {
            chargeAmountScaler = 3;
        }
        if (variant !== 'web') {
            chargeAmountScaler = 5;
        }

        let realQuery = q;
        const queryTerms = q.split(/\s+/g).filter((x) => !!x);

        let results = await this.cachedSearch(variant, {
            provider: searchEngine,
            q,
            num,
            gl,
            // hl,
            location,
            page,
        }, crawlerOptions);


        if (fallback && !results?.length && (!page || page === 1)) {
            // Counts every search performed (including the initial one); multiplies the charge.
            let tryTimes = 1;
            const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
            const lastResort = (containsRTL ? queryTerms.slice(queryTerms.length - 2) : queryTerms.slice(0, 2)).join(' ');
            const n = 4;
            const stepSize = Math.ceil(queryTerms.length / n);
            // `step` drives the loop independently of `tryTimes`, so skipping a
            // duplicate query (`continue`) always makes progress.
            for (let step = 1; step < n; step++) {
                const delta = stepSize * step;
                if (delta >= queryTerms.length) {
                    // Nothing would be left. Without this guard a negative end index
                    // passed to slice() would wrap around and yield a *longer* query.
                    break;
                }
                const terms = containsRTL ? queryTerms.slice(delta) : queryTerms.slice(0, queryTerms.length - delta);
                const query = terms.join(' ');
                if (!query) {
                    break;
                }
                if (realQuery === query) {
                    continue;
                }
                tryTimes += 1;
                realQuery = query;
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    // hl,
                    location,
                }, crawlerOptions);
                if (results?.length) {
                    break;
                }
            }

            if (!results?.length && realQuery.length > lastResort.length) {
                realQuery = lastResort;
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                tryTimes += 1;
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    // hl,
                    location,
                }, crawlerOptions);
            }

            chargeAmountScaler *= tryTimes;
        }

        if (!results?.length) {
            results = [];
        }

        const finalResults = results.map((x: any) => this.mapToFinalResults(x));

        await Promise.all(finalResults.map((x: any) => this.assignGeneralMixin(x)));

        chargeAmount = this.assignChargeAmount(finalResults, chargeAmountScaler);
        assignMeta(finalResults, {
            query: realQuery,
            fallback: realQuery === q ? undefined : realQuery,
        });

        return finalResults;
    }


    /**
     * Compute the token charge for a result list and attach it as `usage` metadata.
     *
     * Every started block of 10 results costs 10000 tokens, multiplied by `scaler`.
     *
     * @param items Result list; receives `{ usage: { tokens } }` via `assignMeta`.
     * @param scaler Charge multiplier.
     * @returns The number of tokens charged.
     */
    assignChargeAmount(items: unknown[], scaler: number) {
        const billableBlocks = Math.ceil(items.length / 10);
        const tokens = billableBlocks * 10000 * scaler;

        assignMeta(items, { usage: { tokens } });

        return tokens;
    }

    /**
     * Fetch a 32px favicon through Google's favicon service and return it as a
     * `data:image/png;base64,...` URI.
     *
     * Best effort: a non-OK response or any thrown error yields an empty string.
     *
     * @param domain Value passed as `domain_url` (callers pass a URL origin).
     */
    async getFavicon(domain: string) {
        const endpoint = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;

        try {
            const res = await fetch(endpoint);
            if (!res.ok) {
                return '';
            }

            const encoded = Buffer.from(await res.arrayBuffer()).toString('base64');

            return `data:image/png;base64,${encoded}`;
        } catch (error: any) {
            this.logger.warn(`Failed to get favicon base64 string`, { err: marshalErrorLike(error) });

            return '';
        }
    }

    /**
     * Translate request-level crawler options into scrapping options for the
     * search providers.
     *
     * A `proxy` value ending in `+` has the suffix stripped for `allocProxy` and
     * turns on `proxyResources` (which is also on whenever `proxyUrl` is set).
     * A `locale` is additionally sent as the `Accept-Language` header.
     */
    async configure(opts: CrawlerOptions) {
        const proxy = opts.proxy;

        const crawlOpts: ScrappingOptions = {
            proxyUrl: opts.proxyUrl,
            cookies: opts.setCookies,
            overrideUserAgent: opts.userAgent,
            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
            locale: opts.locale,
            referer: opts.referer,
            viewport: opts.viewport,
            proxyResources: Boolean(opts.proxyUrl || proxy?.endsWith('+')),
            allocProxy: proxy?.endsWith('+') ? proxy.slice(0, -1) : proxy,
        };

        if (!opts.locale) {
            return crawlOpts;
        }

        crawlOpts.extraHeaders ??= {};
        crawlOpts.extraHeaders['Accept-Language'] = opts.locale;

        return crawlOpts;
    }

    /**
     * Project a raw search entry onto the result shape returned to callers.
     *
     * `link` is exposed as `url` and `snippet` as `description`; apart from
     * `title`, only the whitelisted optional properties are carried over.
     *
     * @param input Raw entry as returned by a search provider.
     * @returns A new plain object; `input` is not modified.
     */
    mapToFinalResults(input: WebSearchEntry) {
        const passthroughKeys = [
            'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
        ];
        const extras = _.pick(input, passthroughKeys);

        return {
            title: input.title,
            url: input.link,
            description: Reflect.get(input, 'snippet'),
            ...extras,
        };
    }

    /**
     * Yield search providers in the order they should be attempted.
     *
     * A provider listed twice is simply tried twice before moving on.
     *
     * @param preference `'bing'` or `'google'` to select that ordering; anything else uses the default.
     * @param variant Currently unused.
     */
    *iterProviders(preference?: string, variant?: string) {
        switch (preference) {
            case 'bing': {
                yield* [this.serperBing, this.serperGoogle, this.googleSerp];
                break;
            }
            case 'google': {
                yield* [this.googleSerp, this.googleSerp, this.serperGoogle];
                break;
            }
            default: {
                // yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
                yield* [this.serperGoogle, this.serperGoogle, this.googleSerp];
                break;
            }
        }
    }

    /**
     * Run a search with a read-through cache.
     *
     * The cache key is a digest of the query (including `provider`) plus the
     * variant. A cache entry younger than `cacheValidMs` is returned directly.
     * Otherwise providers are tried in `iterProviders` order until one succeeds;
     * a non-empty response is queued for batched persistence. If the live search
     * throws and a stale cache entry exists, the stale response is returned.
     *
     * Note: `provider` is removed from the passed-in `query` object.
     *
     * @param variant Kind of search to run.
     * @param query Provider query parameters (plus optional `provider` preference).
     * @param opts Crawler options; `noCache` skips the cache lookup.
     * @returns The provider response (possibly `undefined` if no provider was tried).
     */
    async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, opts: CrawlerOptions) {
        const queryDigest = objHashMd5B64Of({ ...query, variant });
        const { provider } = query;
        Reflect.deleteProperty(query, 'provider');

        let cache;
        if (!opts.noCache) {
            const [latest] = await SERPResult.fromFirestoreQuery(
                SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
                    .orderBy('createdAt', 'desc')
                    .limit(1)
            );
            cache = latest;

            if (cache) {
                const createdAtMs = cache.createdAt.valueOf();
                const age = Date.now() - createdAtMs;
                const stale = createdAtMs < (Date.now() - this.cacheValidMs);
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query, digest: queryDigest, age, stale
                });

                if (!stale) {
                    return cache.response as any;
                }
            }
        }

        const scrappingOptions = await this.configure(opts);

        try {
            let r: any[] | undefined;
            let lastError;

            for (const client of this.iterProviders(provider, variant)) {
                const startedAt = Date.now();
                try {
                    // Pick the provider method matching the requested variant; web is the default.
                    let searchFn;
                    if (variant === 'images') {
                        searchFn = client.imageSearch;
                    } else if (variant === 'news') {
                        searchFn = client.newsSearch;
                    } else {
                        searchFn = client.webSearch;
                    }
                    r = await Reflect.apply(searchFn, client, [query, scrappingOptions]);

                    const dt = Date.now() - startedAt;
                    this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
                    // First provider that answers wins.
                    break;
                } catch (err) {
                    lastError = err;
                    const dt = Date.now() - startedAt;
                    this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, });
                }
            }

            if (r?.length) {
                const nowDate = new Date();
                this.batchedCaches.push(SERPResult.from({
                    query,
                    queryDigest,
                    response: r,
                    createdAt: nowDate,
                    expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
                }));
            } else if (lastError) {
                throw lastError;
            }

            return r;
        } catch (err: any) {
            if (!cache) {
                throw err;
            }

            this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });

            return cache.response as any;
        }
    }

    /**
     * Attach a `favicon` data URI to a result when the `collect-favicon` flag is
     * set in the thread-local context.
     *
     * Accepts both raw entries (address in `link`) and results produced by
     * `mapToFinalResults` (address in `url`). Previously only `link` was read,
     * so mapped results never received a favicon.
     *
     * Best effort: a result whose address cannot be parsed is left untouched
     * instead of failing the whole request.
     *
     * @param result The result object to mutate in place.
     */
    async assignGeneralMixin(result: Partial<WebSearchEntry>) {
        const collectFavicon = this.threadLocal.get('collect-favicon');
        if (!collectFavicon) {
            return;
        }

        const href: unknown = result.link ?? Reflect.get(result, 'url');
        if (typeof href !== 'string' || !href) {
            return;
        }

        let origin: string;
        try {
            origin = new URL(href).origin;
        } catch (err: any) {
            this.logger.warn(`Unable to resolve favicon origin for "${href}"`, { err: marshalErrorLike(err) });

            return;
        }

        Reflect.set(result, 'favicon', await this.getFavicon(origin));
    }
}


================================================
FILE: src/cloud-functions/adaptive-crawler.ts
================================================
import {
    AssertionFailureError,
    assignTransferProtocolMeta,
    HashManager,
    ParamValidationError,
    RPCHost, RPCReflection,
} from 'civkit';
import { singleton } from 'tsyringe';
import { CloudHTTPv2, CloudTaskV2, Ctx, FirebaseStorageBucketControl, Logger, Param, RPCReflect } from '../shared';
import _ from 'lodash';
import { Request, Response } from 'express';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import robotsParser from 'robots-parser';
import { DOMParser } from '@xmldom/xmldom';

import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
import { CrawlerOptions } from '../dto/crawler-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
import { getFunctions } from 'firebase-admin/functions';
import { getFunctionUrl } from '../utils/get-function-url';
import { Timestamp } from 'firebase-admin/firestore';

// Shared hasher configured with 'md5' and 'hex'; used for task digests and cache path segments.
const md5Hasher = new HashManager('md5', 'hex');
/**
 * Strip the fragment (`#...`) from a URL.
 *
 * @param url Absolute URL string.
 * @returns The serialized URL without its fragment, or the input unchanged
 *          when it cannot be parsed as a URL.
 */
const removeURLHash = (url: string) => {
    let parsed: URL;
    try {
        parsed = new URL(url);
    } catch (_err) {
        return url;
    }

    parsed.hash = '';

    return parsed.href;
}

@singleton()
export class AdaptiveCrawlerHost extends RPCHost {
    // Child logger tagged with this class's name.
    logger = this.globalLogger.child({ service: this.constructor.name });
    // Actual cache storage (gcp buckets) exists for 7 days, so here we need to select a time < 7 days.
    // Value: 3 days, in milliseconds. Tasks older than this are treated as expired.
    cacheExpiry = 3 * 1000 * 60 * 60 * 24;

    // Name of the task queue (and of the handler method) that crawls a single URL.
    static readonly __singleCrawlQueueName = 'singleCrawlQueue';

    /**
     * @param globalLogger Logger from which this host derives its child logger.
     * @param firebaseObjectStorage Bucket control used to download stored crawl results.
     */
    constructor(
        protected globalLogger: Logger,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    /**
     * Waits for `dependencyReady()` to resolve, then emits `ready`.
     */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Create (or reuse) an adaptive crawl task and enqueue its crawl jobs.
     *
     * The target URL is taken from the request path, falling back to
     * `crawlerOptions.url`. When neither is present the index is returned.
     * The task id is a base64url MD5 digest of `{ targetUrl, useSitemap, maxPages }`;
     * an existing task younger than `cacheExpiry` is reused as-is.
     *
     * With `useSitemap`, one queue job is enqueued per sitemap URL. If the sitemap
     * yields nothing (or `useSitemap` is off), a single job for the target URL is
     * enqueued with `meta.useSitemap = false`.
     *
     * @returns `{ taskId }` for the created or reused task, or the index when no URL is given.
     * @throws ParamValidationError when the supplied URL cannot be parsed.
     */
    @CloudHTTPv2({
        runtime: {
            memory: '1GiB',
            timeoutSeconds: 300,
            concurrency: 22,
        },
        tags: ['Crawler'],
        httpMethod: ['post', 'get'],
        returnType: [String],
    })
    async adaptiveCrawl(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: {
            req: Request,
            res: Response,
        },
        auth: JinaEmbeddingsAuthDTO,
        crawlerOptions: CrawlerOptions,
        adaptiveCrawlerOptions: AdaptiveCrawlerOptions,
    ) {
        this.logger.debug({
            adaptiveCrawlerOptions,
            crawlerOptions,
        });


        const uid = await auth.solveUID();
        const { useSitemap, maxPages } = adaptiveCrawlerOptions;

        let tmpUrl = ctx.req.url.slice(1)?.trim();
        if (!tmpUrl) {
            tmpUrl = crawlerOptions.url?.trim() ?? '';
        }

        // Check for a missing URL *before* parsing: `new URL('')` throws, and a
        // URL instance is always truthy, so testing the parsed value can never
        // reach the index branch.
        if (!tmpUrl) {
            const latestUser = uid ? await auth.assertUser() : undefined;
            if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
                return this.getIndex(latestUser);
            }

            return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
                { contentType: 'text/plain', envelope: null }
            );
        }

        let targetUrl: URL;
        try {
            targetUrl = new URL(tmpUrl);
        } catch (err) {
            throw new ParamValidationError({
                path: 'url',
                message: `Invalid URL: ${tmpUrl}`
            });
        }

        const meta = {
            targetUrl: targetUrl.toString(),
            useSitemap,
            maxPages,
        };

        const digest = md5Hasher.hash(JSON.stringify(meta));
        const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
        const existing = await AdaptiveCrawlTask.fromFirestore(shortDigest);

        if (existing?.createdAt) {
            if (existing.createdAt.getTime() > Date.now() - this.cacheExpiry) {
                this.logger.info(`Cache hit for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
                return { taskId: shortDigest };
            } else {
                this.logger.info(`Cache expired for ${shortDigest}, created at ${existing.createdAt.toDateString()}`);
            }
        }

        await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).set({
            _id: shortDigest,
            status: AdaptiveCrawlTaskStatus.PENDING,
            statusText: 'Pending',
            meta,
            createdAt: new Date(),
            urls: [],
            processed: {},
            failed: {},
        });

        let urls: string[] = [];
        if (useSitemap) {
            urls = await this.crawlUrlsFromSitemap(targetUrl, maxPages);
        }

        if (urls.length > 0) {
            await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
                status: AdaptiveCrawlTaskStatus.PROCESSING,
                statusText: `Processing 0/${urls.length}`,
                urls,
            });

            // The handler URI is the same for every job; resolve it once.
            const queueUri = await getFunctionUrl(AdaptiveCrawlerHost.__singleCrawlQueueName);
            const queue = getFunctions().taskQueue(AdaptiveCrawlerHost.__singleCrawlQueueName);

            await Promise.all(urls.map((url) => queue.enqueue({
                shortDigest, url, token: auth.bearerToken, meta
            }, {
                dispatchDeadlineSeconds: 1800,
                uri: queueUri,
            })));
        } else {
            // No sitemap URLs: let the queue handler crawl from the target URL.
            meta.useSitemap = false;

            await AdaptiveCrawlTask.COLLECTION.doc(shortDigest).update({
                urls: [targetUrl.toString()],
            });

            await getFunctions().taskQueue(AdaptiveCrawlerHost.__singleCrawlQueueName).enqueue({
                shortDigest, url: targetUrl.toString(), token: auth.bearerToken, meta
            }, {
                dispatchDeadlineSeconds: 1800,
                uri: await getFunctionUrl(AdaptiveCrawlerHost.__singleCrawlQueueName),
            })
        }

        return { taskId: shortDigest };
    }

    /**
     * Return the stored state of an adaptive crawl task.
     *
     * For every processed URL that is also listed in `urls`, the stored cache
     * path in `processed` is replaced by the parsed JSON content downloaded from
     * object storage.
     *
     * @param taskId Id returned by `adaptiveCrawl`.
     * @param urls Processed URLs whose content should be inlined.
     * @throws ParamValidationError when `taskId` is missing.
     * @throws AssertionFailureError when the task does not exist or is older than `cacheExpiry`.
     */
    @CloudHTTPv2({
        runtime: {
            memory: '1GiB',
            timeoutSeconds: 300,
            concurrency: 22,
        },
        tags: ['Crawler'],
        httpMethod: ['post', 'get'],
        returnType: AdaptiveCrawlTask,
    })
    async adaptiveCrawlStatus(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: {
            req: Request,
            res: Response,
        },
        auth: JinaEmbeddingsAuthDTO,
        @Param('taskId') taskId: string,
        @Param('urls') urls: string[] = [],
    ) {
        if (!taskId) {
            throw new ParamValidationError('taskId is required');
        }

        const state = await AdaptiveCrawlTask.fromFirestore(taskId);
        if (!state) {
            throw new AssertionFailureError('The task does not exist');
        }

        const expiresBefore = Date.now() - this.cacheExpiry;
        if (state.createdAt && state.createdAt.getTime() < expiresBefore) {
            throw new AssertionFailureError('The task has expired');
        }

        if (!urls.length) {
            return state;
        }

        const requested = Object.entries(state.processed ?? {})
            .filter(([url]) => urls.includes(url));

        await Promise.all(requested.map(async ([url, cachePath]) => {
            const raw = await this.firebaseObjectStorage.downloadFile(cachePath);
            state.processed[url] = JSON.parse(raw.toString('utf-8'));
        }));

        return state;
    }

    /**
     * Queue handler: crawl one URL of a task and record the outcome.
     *
     * Skips work when the task is already completed. The URL is validated and its
     * fragment removed; a malformed URL is recorded as failed without crawling.
     * Otherwise the crawl runs (single page when `meta.useSitemap`, recursive
     * otherwise). The outcome is then written inside a transaction: the URL goes
     * to `processed` (with its cache path) or `failed`, and the task is marked
     * completed once processed + failed covers all task URLs.
     *
     * @param shortDigest Task id.
     * @param url URL to crawl.
     * @param token Bearer token forwarded to the crawl request.
     * @param meta Task metadata as stored at task creation.
     */
    @CloudTaskV2({
        name: AdaptiveCrawlerHost.__singleCrawlQueueName,
        runtime: {
            cpu: 1,
            memory: '1GiB',
            timeoutSeconds: 3600,
            concurrency: 2,
            maxInstances: 200,
            retryConfig: {
                maxAttempts: 3,
                minBackoffSeconds: 60,
            },
            rateLimits: {
                maxConcurrentDispatches: 150,
                maxDispatchesPerSecond: 5,
            },
        }
    })
    async singleCrawlQueue(
        @Param('shortDigest') shortDigest: string,
        @Param('url') url: string,
        @Param('token') token: string,
        @Param('meta') meta: AdaptiveCrawlTask['meta'],
    ) {
        const error = {
            reason: ''
        };

        const state = await AdaptiveCrawlTask.fromFirestore(shortDigest);
        if (state?.status === AdaptiveCrawlTaskStatus.COMPLETED) {
            return;
        }

        try {
            // removeURLHash swallows parse failures and returns its input, so
            // validate explicitly; otherwise this catch could never run.
            new URL(url);
            url = removeURLHash(url);
        } catch(e) {
            error.reason = `Failed to parse url: ${url}`;
        }

        this.logger.debug(shortDigest, url, meta);
        const cachePath = `adaptive-crawl-task/${shortDigest}/${md5Hasher.hash(url)}`;

        if (!error.reason) {
            const result = meta.useSitemap
                ? await this.handleSingleCrawl(shortDigest, url, token, cachePath)
                : await this.handleSingleCrawlRecursively(shortDigest, url, token, meta, cachePath);

            if (!result) {
                return;
            }

            error.reason = result.error.reason;
        }

        await AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
            const ref = AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
            const snapshot = await transaction.get(ref);
            const data = snapshot.data() as AdaptiveCrawlTask & { createdAt: Timestamp };

            if (error.reason) {
                data.failed[url] = error;
            } else {
                data.processed[url] = cachePath;
            }

            const processedCount = Object.keys(data.processed).length;
            const failedCount = Object.keys(data.failed).length;
            const handledCount = processedCount + failedCount;
            const allHandled = handledCount >= data.urls.length;

            const status = allHandled
                ? AdaptiveCrawlTaskStatus.COMPLETED : AdaptiveCrawlTaskStatus.PROCESSING;
            const statusText = allHandled
                ? `Completed ${processedCount} Succeeded, ${failedCount} Failed`
                : `Processing ${handledCount}/${data.urls.length}`;

            const payload: Partial<AdaptiveCrawlTask> = {
                status,
                statusText,
                processed: data.processed,
                failed: data.failed,
            };

            if (status === AdaptiveCrawlTaskStatus.COMPLETED) {
                // Use one timestamp so finishedAt and duration agree.
                const finishedAt = new Date();
                payload.finishedAt = finishedAt;
                payload.duration = finishedAt.getTime() - data.createdAt.toDate().getTime();
            }

            transaction.update(ref, payload);
        });
    }

    /**
     * Crawls a single URL through the `https://r.jina.ai` endpoint and saves
     * the JSON response to object storage at `cachePath`.
     *
     * @param shortDigest Task id (not used by this method).
     * @param url Page to crawl.
     * @param token Bearer token sent in the `Authorization` header.
     * @param cachePath Object-storage path the JSON response is written to.
     * @returns `{ error }`, where `error.reason` is empty on success and
     *          describes the failure when the response is not OK.
     */
    async handleSingleCrawl(shortDigest: string, url: string, token: string, cachePath: string) {
        const error = { reason: '' };

        const requestHeaders = {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${token}`,
            'Accept': 'application/json',
        };
        const response = await fetch('https://r.jina.ai', {
            method: 'POST',
            headers: requestHeaders,
            body: JSON.stringify({ url })
        });

        if (!response.ok) {
            error.reason = `Failed to crawl ${url}, ${response.statusText}`;

            return { error };
        }

        const json = await response.json();
        const serialized = Buffer.from(JSON.stringify(json), 'utf-8');

        await this.firebaseObjectStorage.saveFile(cachePath, serialized, {
            metadata: {
                contentType: 'application/json',
            }
        });

        return { error };
    }

    /**
     * Crawls a single URL with a links summary, caches the JSON response and
     * enqueues follow-up crawl tasks for the links judged relevant.
     *
     * Each relevant link is registered on the task document inside its own
     * transaction. Links already present in `urls` are skipped, and expansion
     * stops once adding a link would exceed `meta.maxPages` or the task is
     * already completed.
     *
     * @param shortDigest Document id of the adaptive crawl task.
     * @param url Page to crawl.
     * @param token Bearer token for the reader and reranker requests.
     * @param meta Task options; `maxPages` bounds the number of known URLs.
     * @param cachePath Object-storage path the JSON response is written to.
     * @returns `{ error }`, where `error.reason` is empty on success.
     */
    async handleSingleCrawlRecursively(
        shortDigest: string, url: string, token: string, meta: AdaptiveCrawlTask['meta'], cachePath: string
    ) {
        const error = { reason: '' };

        const response = await fetch('https://r.jina.ai', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${token}`,
                'Accept': 'application/json',
                'X-With-Links-Summary': 'true',
            },
            body: JSON.stringify({ url })
        });

        if (!response.ok) {
            error.reason = `Failed to crawl ${url}, ${response.statusText}`;

            return { error };
        }

        const json = await response.json();
        const serialized = Buffer.from(JSON.stringify(json), 'utf-8');
        await this.firebaseObjectStorage.saveFile(cachePath, serialized, {
            metadata: {
                contentType: 'application/json',
            }
        });

        const title = json.data.title;
        const description = json.data.description;
        const links = json.data.links as Record<string, string>;

        const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
        this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);

        for (const candidateUrl of relevantUrls) {
            // Set by the transaction body; the cast keeps the declared type
            // wide so the checks after the await are not narrowed away.
            let decision = 'enqueue' as 'enqueue' | 'skip' | 'stop';

            await AdaptiveCrawlTask.DB.runTransaction(async (transaction) => {
                const docRef = AdaptiveCrawlTask.COLLECTION.doc(shortDigest);
                const snapshot = await transaction.get(docRef);
                const data = snapshot.data() as AdaptiveCrawlTask & { createdAt: Timestamp };

                if (data.urls.includes(candidateUrl)) {
                    this.logger.debug('Recursive CONTINUE', data);
                    decision = 'skip';
                    return;
                }

                const urls = [...data.urls, candidateUrl];
                const overBudget = urls.length > meta.maxPages;

                if (overBudget || data.status === AdaptiveCrawlTaskStatus.COMPLETED) {
                    this.logger.debug('Recursive BREAK', data);
                    decision = 'stop';
                    return;
                }

                transaction.update(docRef, { urls });
            });

            if (decision === 'skip') {
                continue;
            }
            if (decision === 'stop') {
                break;
            }

            await getFunctions().taskQueue(AdaptiveCrawlerHost.__singleCrawlQueueName).enqueue({
                shortDigest, url: candidateUrl, token, meta
            }, {
                dispatchDeadlineSeconds: 1800,
                uri: await getFunctionUrl(AdaptiveCrawlerHost.__singleCrawlQueueName),
            });
        }

        return { error };
    }

    /**
     * Selects the links of a page that are most relevant to the page itself,
     * using the Jina reranker API.
     *
     * Links are kept as candidates only when they start with `http` and do
     * not end with one of the excluded file suffixes. The query is the page
     * title, or title plus description when a description is available.
     * A result is kept when its score exceeds both 60% of the top score and
     * the absolute floor of 0.1.
     *
     * @param token Bearer token for the reranker API.
     * @param page Page `title`, `description` and `links`; only the values of
     *             `links` are used as candidate URLs.
     * @returns The relevant URLs with their hash fragments removed; an empty
     *          array when there are no candidate links.
     * @throws Error when the reranker responds with a non-OK status.
     */
    async getRelevantUrls(token: string, {
        title, description, links
    }: {
        title: string;
        description: string;
        links: Record<string, string>;
    }) {
        const invalidSuffix = [
            '.zip',
            '.docx',
            '.pptx',
            '.xlsx',
        ];

        const validLinks = Object.values(links ?? {})
            .filter(link => typeof link === 'string'
                && link.startsWith('http')
                && !invalidSuffix.some(suffix => link.endsWith(suffix)));

        // Nothing to rank: skip the API round-trip instead of sending an
        // empty document list.
        if (!validLinks.length) {
            return [];
        }

        let query = '';
        if (!description) {
            query += title;
        } else {
            query += `TITLE: ${title}; DESCRIPTION: ${description}`;
        }

        const data = {
            model: 'jina-reranker-v2-base-multilingual',
            query,
            top_n: 15,
            documents: validLinks,
        };

        const response = await fetch('https://api.jina.ai/v1/rerank', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${token}`
            },
            body: JSON.stringify(data)
        });

        // Fail with a descriptive error rather than a TypeError on a missing
        // `results` field further down.
        if (!response.ok) {
            throw new Error(`Failed to rerank links: ${response.status} ${response.statusText}`);
        }

        const json = (await response.json()) as {
            results?: {
                index: number;
                document: {
                    text: string;
                };
                relevance_score: number;
            }[];
        };

        const results = json.results ?? [];
        const highestRelevanceScore = results[0]?.relevance_score ?? 0;
        const threshold = Math.max(highestRelevanceScore * 0.6, 0.1);

        return results
            .filter(r => r.relevance_score > threshold)
            .map(r => removeURLHash(r.document.text));
    }

    /**
     * Placeholder for the index/usage description.
     *
     * The previous implementation is commented out below, so this method
     * currently does nothing and returns `undefined`.
     *
     * @param user Optional authenticated account (currently unused).
     */
    getIndex(user?: JinaEmbeddingsTokenAccount) {
        // TODO: the usage description needs to be updated
        // const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);

        // Object.assign(indexObject, {
        //     usage1: 'https://r.jina.ai/YOUR_URL',
        //     usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
        //     homepage: 'https://jina.ai/reader',
        //     sourceCode: 'https://github.com/jina-ai/reader',
        // });

        // if (user) {
        //     indexObject[''] = undefined;
        //     indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
        //     indexObject.balanceLeft = user.wallet.total_balance;
        // }

        // return indexObject;
    }

    /**
     * Collects up to `maxPages` page URLs of a site from its sitemaps.
     *
     * Sitemaps listed in robots.txt are used as starting points. When
     * robots.txt is unavailable or lists no sitemap, `<origin>/sitemap.xml`
     * is tried instead. Sitemap index files are followed recursively and each
     * sitemap is fetched at most once.
     *
     * Only URLs whose parsed origin equals `url.origin` and that do not end
     * with `.xml` are collected; hash fragments are removed.
     *
     * @param url Any URL of the target site; only its origin is used.
     * @param maxPages Upper bound on the number of URLs returned.
     * @returns The collected URLs, at most `maxPages` of them.
     */
    async crawlUrlsFromSitemap(url: URL, maxPages: number) {
        const sitemapsFromRobotsTxt = await this.getSitemapsFromRobotsTxt(url);

        // An existing robots.txt without Sitemap entries yields an empty list;
        // fall back to the conventional location in that case too.
        const initialSitemaps: string[] = sitemapsFromRobotsTxt && sitemapsFromRobotsTxt.length > 0
            ? [...sitemapsFromRobotsTxt]
            : [`${url.origin}/sitemap.xml`];

        const allUrls: Set<string> = new Set();
        const processedSitemaps: Set<string> = new Set();

        const fetchSitemapUrls = async (sitemapUrl: string) => {
            sitemapUrl = sitemapUrl.trim();

            if (processedSitemaps.has(sitemapUrl)) {
                return;
            }

            processedSitemaps.add(sitemapUrl);

            try {
                const response = await fetch(sitemapUrl);
                const sitemapContent = await response.text();
                const parser = new DOMParser();
                const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml');

                // handle normal sitemap
                const urlElements = xmlDoc.getElementsByTagName('url');
                for (let i = 0; i < urlElements.length; i++) {
                    const locElement = urlElements[i].getElementsByTagName('loc')[0];
                    const loc = locElement?.textContent?.trim() || '';
                    if (!loc) {
                        continue;
                    }

                    // Compare parsed origins: a prefix match would also accept
                    // hosts such as `https://example.com.evil.org`.
                    let locOrigin: string;
                    try {
                        locOrigin = new URL(loc).origin;
                    } catch {
                        // Skip a malformed entry instead of abandoning the sitemap.
                        continue;
                    }

                    if (locOrigin === url.origin && !loc.endsWith('.xml')) {
                        allUrls.add(removeURLHash(loc));
                    }
                    if (allUrls.size >= maxPages) {
                        return;
                    }
                }

                // handle sitemap index
                const sitemapElements = xmlDoc.getElementsByTagName('sitemap');
                for (let i = 0; i < sitemapElements.length; i++) {
                    const locElement = sitemapElements[i].getElementsByTagName('loc')[0];
                    const childSitemap = locElement?.textContent?.trim() || '';
                    if (!childSitemap) {
                        continue;
                    }

                    await fetchSitemapUrls(childSitemap);
                    if (allUrls.size >= maxPages) {
                        return;
                    }
                }
            } catch (error) {
                this.logger.error(`Error fetching sitemap ${sitemapUrl}:`, error);
            }
        };

        for (const sitemapUrl of initialSitemaps) {
            await fetchSitemapUrls(sitemapUrl);
            if (allUrls.size >= maxPages) {
                break;
            }
        }

        const urlsToProcess = Array.from(allUrls).slice(0, maxPages);

        return urlsToProcess;
    }

    /**
     * Reads the sitemap URLs declared in the site's robots.txt.
     *
     * @param url Any URL of the target site; only its origin is used.
     * @returns The sitemap URLs found by the robots parser, or `null` when
     *          robots.txt cannot be retrieved (any non-OK status) or is empty.
     */
    async getSitemapsFromRobotsTxt(url: URL) {
        const robotsUrl = `${url.origin}/robots.txt`;
        const response = await fetch(robotsUrl);

        // Any error status (not just 404) means there is no usable robots.txt;
        // an error page must not be parsed as if it were robots rules.
        if (!response.ok) {
            return null;
        }

        const robotsTxt = await response.text();
        if (!robotsTxt.length) {
            return null;
        }

        const robot = robotsParser(robotsUrl, robotsTxt);

        return robot.getSitemaps();
    }
}


================================================
FILE: src/cloud-functions/data-crunching.ts
================================================
import {
    Defer,
    PromiseThrottle,
    RPCHost,
} from 'civkit';
import { singleton } from 'tsyringe';
import {
    // CloudScheduleV2, CloudTaskV2,
    FirebaseStorageBucketControl, Logger, Param, TempFileManager
} from '../shared';
import _ from 'lodash';
import { CrawlerHost } from '../api/crawler';

import { Crawled } from '../db/crawled';
import dayjs from 'dayjs';
import { createReadStream } from 'fs';
import { appendFile } from 'fs/promises';
import { createGzip } from 'zlib';
import { getFunctions } from 'firebase-admin/functions';
import { SnapshotFormatter } from '../services/snapshot-formatter';
import { getFunctionUrl } from '../utils/get-function-url';

dayjs.extend(require('dayjs/plugin/utc'));

@singleton()
export class DataCrunchingHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    pageCacheCrunchingPrefix = 'crunched-pages';
    pageCacheCrunchingBatchSize = 5000;
    pageCacheCrunchingTMinus = 6 * 24 * 60 * 60 * 1000;
    rev = 7;

    /**
     * Declares the injected services as protected members.
     *
     * All constructor arguments except `crawler` are forwarded to the
     * `RPCHost` constructor.
     * NOTE(review): presumably `crawler` is excluded so it is not treated as
     * a dependency by the base class — confirm against `RPCHost`.
     */
    constructor(
        protected globalLogger: Logger,

        protected crawler: CrawlerHost,
        protected snapshotFormatter: SnapshotFormatter,
        protected tempFileManager: TempFileManager,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
    ) {
        super(..._.without(arguments, crawler));
    }

    /**
     * Waits for `dependencyReady()` and then emits the `ready` event.
     */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    // @CloudTaskV2({
    //     runtime: {
    //         cpu: 2,
    //         memory: '4GiB',
    //         timeoutSeconds: 3600,
    //         concurrency: 2,
    //         maxInstances: 200,
    //         retryConfig: {
    //             maxAttempts: 3,
    //             minBackoffSeconds: 60,
    //         },
    //         rateLimits: {
    //             maxConcurrentDispatches: 150,
    //             maxDispatchesPerSecond: 2,
    //         },
    //     },
    //     tags: ['DataCrunching'],
    // })
    /**
     * Crunches page-cache records into gzipped JSONL files in the bucket.
     *
     * For every chunk produced by `iterPageCacheRecords`, the records are
     * written to a local file, gzipped while streaming, and saved under the
     * chunk's file name.
     *
     * @param date Day to crunch, passed through to `iterPageCacheRecords`.
     * @param offset Record offset within that day (defaults to 0).
     * @returns Always `true`.
     */
    async crunchPageCacheWorker(
        @Param('date') date: string,
        @Param('offset', { default: 0 }) offset: number
    ) {
        this.logger.info(`Crunching page cache @${date}+${offset}...`);

        for await (const chunk of this.iterPageCacheRecords(date, offset)) {
            this.logger.info(`Crunching ${chunk.fileName}...`);

            const fileOnDrive = await this.crunchCacheRecords(chunk.records);
            // pipe() returns its destination, i.e. the gzip stream.
            const gzipped = createReadStream(fileOnDrive.path).pipe(createGzip(), { end: true });
            const target = this.firebaseObjectStorage.bucket.file(chunk.fileName);

            await target.save(gzipped, {
                contentType: 'application/jsonl+gzip',
            });
        }

        this.logger.info(`Crunching page cache @${date}+${offset} done.`);

        return true;
    }

    // @CloudScheduleV2('2 0 * * *', {
    //     name: 'crunchPageCacheEveryday',
    //     runtime: {
    //         cpu: 2,
    //         memory: '4GiB',
    //         timeoutSeconds: 1800,
    //         timeZone: 'UTC',
    //         retryCount: 3,
    //         minBackoffSeconds: 60,
    //     },
    //     tags: ['DataCrunching'],
    // })
    /**
     * Enqueues one `crunchPageCacheWorker` task for every pending chunk
     * reported by `iterPageCacheChunks`.
     *
     * @returns Always `true`.
     */
    async dispatchPageCacheCrunching() {
        const workerName = 'crunchPageCacheWorker';

        for await (const chunk of this.iterPageCacheChunks()) {
            this.logger.info(`Dispatching ${chunk.fileName}...`);
            // sse.write({ data: `Dispatching ${fileName}...` });

            const queue = getFunctions().taskQueue(workerName);
            const uri = await getFunctionUrl(workerName);

            await queue.enqueue({ date: chunk.date, offset: chunk.offset }, {
                dispatchDeadlineSeconds: 1800,
                uri,
            });
        }

        return true;
    }

    // @CloudHTTPv2({
    //     runtime: {
    //         cpu: 2,
    //         memory: '4GiB',
    //         timeoutSeconds: 3600,
    //         concurrency: 2,
    //         maxInstances: 200,
    //     },
    //     tags: ['DataCrunching'],
    // })
    // async dispatchPageCacheCrunching(
    //     @RPCReflect() rpcReflect: RPCReflection
    // ) {
    //     const sse = new OutputServerEventStream({ highWaterMark: 4096 });
    //     rpcReflect.return(sse);
    //     rpcReflect.catch((err) => {
    //         sse.end({ data: `Error: ${err.message}` });
    //     });
    //     for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
    //         this.logger.info(`Dispatching ${fileName}...`);
    //         sse.write({ data: `Dispatching ${fileName}...` });

    //         await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
    //             dispatchDeadlineSeconds: 1800,
    //             uri: await getFunctionUrl('crunchPageCacheWorker'),
    //         });
    //     }

    //     sse.end({ data: 'done' });

    //     return true;
    // }

    /**
     * Async generator yielding batches of `Crawled` records together with the
     * bucket file name each batch should be written to.
     *
     * Iteration starts at `date` when given, otherwise at the start of the
     * UTC day `pageCacheCrunchingTMinus` milliseconds ago, and only covers
     * days strictly before the start of today (UTC).
     *
     * @param date Optional day to start from (parsed by dayjs).
     * @param inputOffset Optional starting record offset; parsed as a
     *                    base-10 integer.
     * @yields `{ fileName, records }` for each chunk whose file does not
     *         exist yet and that has at least one record.
     */
    async* iterPageCacheRecords(date?: string, inputOffset?: number | string) {
        const startOfToday = dayjs().utc().startOf('day');
        const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
        let theDay = startingPoint;

        if (date) {
            theDay = dayjs(date).utc().startOf('day');
        }

        let counter = 0;
        if (inputOffset) {
            counter = parseInt(inputOffset as string, 10);
        }

        while (theDay.isBefore(startOfToday)) {
            const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
            const offset = counter;
            // Advance first so that every `continue` below moves on to the next chunk.
            counter += this.pageCacheCrunchingBatchSize;
            const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
            if (fileExists) {
                // This chunk was already written; try the next offset.
                continue;
            }

            const records = await Crawled.fromFirestoreQuery(Crawled.COLLECTION
                .where('createdAt', '>=', theDay.toDate())
                .where('createdAt', '<', theDay.add(1, 'day').toDate())
                .orderBy('createdAt', 'asc')
                .offset(offset)
                .limit(this.pageCacheCrunchingBatchSize)
            );

            this.logger.info(`Found ${records.length} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });

            if (!records.length) {
                // With an explicit date, only that day is processed.
                if (date) {
                    break;
                }
                // Otherwise the day is exhausted: move to the next day.
                theDay = theDay.add(1, 'day');
                counter = 0;
                continue;
            }

            yield { fileName, records };

            // A non-zero offset ends the iteration after this single chunk.
            if (offset) {
                break;
            }
        }
    }

    /**
     * Async generator listing the page-cache chunks that still need to be
     * crunched.
     *
     * Days from `pageCacheCrunchingTMinus` milliseconds ago (start of UTC
     * day) up to, but excluding, today are scanned in steps of
     * `pageCacheCrunchingBatchSize` records. Chunks whose file already exists
     * in the bucket are skipped, and a chunk with fewer records than the
     * batch size ends its day.
     *
     * @yields `{ fileName, date, offset }` for every chunk with at least one
     *         record. `date` is the ISO timestamp of the day the chunk
     *         belongs to, matching `fileName` and `offset`.
     */
    async* iterPageCacheChunks() {
        const startOfToday = dayjs().utc().startOf('day');
        const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
        let theDay = startingPoint;

        let counter = 0;

        while (theDay.isBefore(startOfToday)) {
            const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`;
            const offset = counter;
            counter += this.pageCacheCrunchingBatchSize;
            const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0];
            if (fileExists) {
                continue;
            }

            const nRecords = (await Crawled.COLLECTION
                .where('createdAt', '>=', theDay.toDate())
                .where('createdAt', '<', theDay.add(1, 'day').toDate())
                .orderBy('createdAt', 'asc')
                .offset(offset)
                .limit(this.pageCacheCrunchingBatchSize)
                .count().get()).data().count;

            this.logger.info(`Found ${nRecords} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter });

            // Capture the chunk's own day before the cursor may advance, so the
            // dispatched date agrees with fileName and offset.
            const chunkDate = theDay.toISOString();

            if (nRecords < this.pageCacheCrunchingBatchSize) {
                // Partial (or empty) chunk: this day is exhausted.
                theDay = theDay.add(1, 'day');
                counter = 0;
            }
            if (nRecords) {
                yield { fileName, date: chunkDate, offset };
            }
        }
    }

    /**
     * Writes the given crawl records to a temporary JSONL file.
     *
     * For each record, the snapshot stored at `snapshots/<record._id>` is
     * downloaded, formatted (falling back to the `markdown` format when the
     * `default` format yields no content) and appended as one JSON line.
     * Downloads are limited by a `PromiseThrottle(30)`. A record whose
     * snapshot cannot be downloaded or processed is logged and skipped.
     *
     * @param records Records whose snapshots should be crunched.
     * @returns An object holding the `path` of the temporary file; the path
     *          is bound to that object through the temp file manager.
     */
    async crunchCacheRecords(records: Crawled[]) {
        const throttle = new PromiseThrottle(30);
        const localFilePath = this.tempFileManager.alloc();
        // Resolved up front and never replaced, so awaiting it below does not
        // delay any write.
        let nextDrainDeferred = Defer();
        nextDrainDeferred.resolve();

        for (const record of records) {
            await throttle.acquire();
            this.firebaseObjectStorage.downloadFile(`snapshots/${record._id}`)
                .then(async (snapshotTxt) => {
                    try {
                        const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));

                        let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot);
                        if (!formatted.content) {
                            formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
                        }

                        await nextDrainDeferred.promise;
                        await appendFile(localFilePath, JSON.stringify({
                            url: snapshot.href,
                            title: snapshot.title || '',
                            html: snapshot.html || '',
                            text: snapshot.text || '',
                            content: formatted.content || '',
                        }) + '\n', { encoding: 'utf-8' });

                    } catch (err) {
                        this.logger.warn(`Failed to parse snapshot for ${record._id}`, { err });
                    }
                })
                .catch((err) => {
                    // This chain is not awaited, so a failed download would
                    // otherwise surface as an unhandled promise rejection.
                    this.logger.warn(`Failed to download snapshot for ${record._id}`, { err });
                })
                .finally(() => {
                    throttle.release();
                });
        }

        await throttle.nextDrain();


        const ro = {
            path: localFilePath
        };

        this.tempFileManager.bindPathTo(ro, localFilePath);

        return ro;
    }
}


================================================
FILE: src/db/adaptive-crawl-task.ts
================================================
import { Also, Prop, parseJSONText } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';

/**
 * Status values of an adaptive crawl task, stored as lowercase strings.
 */
export enum AdaptiveCrawlTaskStatus {
    PENDING = 'pending',
    PROCESSING = 'processing',
    COMPLETED = 'completed',
    FAILED = 'failed',
}

/**
 * Firestore record of an adaptive crawl task (collection
 * `adaptiveCrawlTasks`).
 *
 * The fields listed in `patchedFields` are stored as JSON text: they are
 * parsed when a record is built with `from()` and stringified again by
 * `degradeForFireStore()`.
 */
@Also({ dictOf: Object })
export class AdaptiveCrawlTask extends FirestoreRecord {
    static override collectionName = 'adaptiveCrawlTasks';

    override _id!: string;

    @Prop({ required: true })
    status!: AdaptiveCrawlTaskStatus;

    @Prop({ required: true })
    statusText!: string;

    @Prop()
    meta!: {
        useSitemap: boolean;
        maxPages: number;
        targetUrl: string;
    };

    @Prop()
    urls!: string[];

    @Prop()
    processed!: {
        [url: string]: string;
    };

    @Prop()
    failed!: {
        [url: string]: any;
    };

    @Prop()
    createdAt!: Date;

    @Prop()
    finishedAt?: Date;

    @Prop()
    duration?: number;

    static patchedFields = [
        'meta',
    ];

    /**
     * Builds a record from raw input, first parsing any patched field that
     * arrives as a string. The input object is modified in place.
     */
    static override from(input: any) {
        const encodedFields = this.patchedFields.filter((name) => typeof input[name] === 'string');
        encodedFields.forEach((name) => {
            input[name] = parseJSONText(input[name]);
        });

        return super.from(input) as AdaptiveCrawlTask;
    }

    /**
     * Returns a shallow copy of this record in which every patched field of
     * type `object` has been replaced by its JSON string.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };
        const { patchedFields } = this.constructor as typeof AdaptiveCrawlTask;

        patchedFields.forEach((name) => {
            if (typeof copy[name] === 'object') {
                copy[name] = JSON.stringify(copy[name]);
            }
        });

        return copy;
    }

    [k: string]: any;
}


================================================
FILE: src/db/crawled.ts
================================================
import { Also, parseJSONText, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';
import type { PageSnapshot } from '../services/puppeteer';

/**
 * Firestore record of a crawled page (collection `crawled`).
 *
 * The fields listed in `patchedFields` are stored as JSON text: they are
 * parsed when a record is built with `from()` and stringified again by
 * `degradeForFireStore()`.
 */
@Also({ dictOf: Object })
export class Crawled extends FirestoreRecord {
    static override collectionName = 'crawled';

    override _id!: string;

    @Prop({ required: true })
    url!: string;

    @Prop({ required: true })
    urlPathDigest!: string;

    @Prop()
    htmlSignificantlyModifiedByJs?: boolean;

    @Prop()
    snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };

    @Prop()
    screenshotAvailable?: boolean;

    @Prop()
    pageshotAvailable?: boolean;

    @Prop()
    snapshotAvailable?: boolean;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt!: Date;

    static patchedFields = [
        'snapshot'
    ];

    /**
     * Builds a record from raw input, first parsing any patched field that
     * arrives as a string. The input object is modified in place.
     */
    static override from(input: any) {
        const encodedFields = this.patchedFields.filter((name) => typeof input[name] === 'string');
        encodedFields.forEach((name) => {
            input[name] = parseJSONText(input[name]);
        });

        return super.from(input) as Crawled;
    }

    /**
     * Returns a shallow copy of this record in which every patched field of
     * type `object` has been replaced by its JSON string.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };
        const { patchedFields } = this.constructor as typeof Crawled;

        patchedFields.forEach((name) => {
            if (typeof copy[name] === 'object') {
                copy[name] = JSON.stringify(copy[name]);
            }
        });

        return copy;
    }

    [k: string]: any;
}


================================================
FILE: src/db/domain-blockade.ts
================================================
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';

/**
 * Firestore record stored in the `domainBlockades` collection.
 * `domain` and `triggerReason` are required; `triggerUrl` and `expireAt`
 * are optional.
 */
@Also({
    dictOf: Object
})
export class DomainBlockade extends FirestoreRecord {
    static override collectionName = 'domainBlockades';

    override _id!: string;

    @Prop({
        required: true
    })
    domain!: string;

    @Prop({ required: true })
    triggerReason!: string;

    @Prop()
    triggerUrl?: string;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    // Index signature: allows additional, undeclared properties on the record.
    [k: string]: any;
}


================================================
FILE: src/db/domain-profile.ts
================================================
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import { ENGINE_TYPE } from '../dto/crawler-options';

/**
 * Firestore record stored in the `domainProfiles` collection.
 * `path` and `engine` are required; `engine` is validated against
 * `ENGINE_TYPE`.
 */
@Also({
    dictOf: Object
})
export class DomainProfile extends FirestoreRecord {
    static override collectionName = 'domainProfiles';

    override _id!: string;

    @Prop({
        required: true
    })
    path!: string;

    @Prop()
    triggerUrl?: string;

    @Prop({ required: true, type: ENGINE_TYPE })
    engine!: string;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    // Index signature: allows additional, undeclared properties on the record.
    [k: string]: any;
}


================================================
FILE: src/db/img-alt.ts
================================================
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';

/**
 * Firestore record stored in the `imgAlts` collection.
 * `src` and `urlDigest` are required; image dimensions and the generated
 * and original alt texts are optional.
 */
@Also({
    dictOf: Object
})
export class ImgAlt extends FirestoreRecord {
    static override collectionName = 'imgAlts';

    override _id!: string;

    @Prop({
        required: true
    })
    src!: string;

    @Prop({
        required: true
    })
    urlDigest!: string;

    @Prop()
    width?: number;

    @Prop()
    height?: number;

    @Prop()
    generatedAlt?: string;

    @Prop()
    originalAlt?: string;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    // Index signature: allows additional, undeclared properties on the record.
    [k: string]: any;
}


================================================
FILE: src/db/pdf.ts
================================================
import { Also, Prop, parseJSONText } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';

/**
 * Firestore record of extracted PDF content (collection `pdfs`).
 *
 * The fields listed in `patchedFields` are stored as JSON text: they are
 * parsed when a record is built with `from()` and stringified again by
 * `degradeForFireStore()`.
 */
@Also({ dictOf: Object })
export class PDFContent extends FirestoreRecord {
    static override collectionName = 'pdfs';

    override _id!: string;

    @Prop({ required: true })
    src!: string;

    @Prop({ required: true })
    urlDigest!: string;

    @Prop()
    meta?: { [k: string]: any; };

    @Prop()
    text?: string;

    @Prop()
    content?: string;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    static patchedFields = [
        'meta'
    ];

    /**
     * Builds a record from raw input, first parsing any patched field that
     * arrives as a string. The input object is modified in place.
     */
    static override from(input: any) {
        const encodedFields = this.patchedFields.filter((name) => typeof input[name] === 'string');
        encodedFields.forEach((name) => {
            input[name] = parseJSONText(input[name]);
        });

        return super.from(input) as PDFContent;
    }

    /**
     * Returns a shallow copy of this record in which every patched field of
     * type `object` has been replaced by its JSON string.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };
        const { patchedFields } = this.constructor as typeof PDFContent;

        patchedFields.forEach((name) => {
            if (typeof copy[name] === 'object') {
                copy[name] = JSON.stringify(copy[name]);
            }
        });

        return copy;
    }

    [k: string]: any;
}


================================================
FILE: src/db/searched.ts
================================================
import { Also, parseJSONText, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';

/**
 * Firestore record of a cached search result (collection `searchResults`).
 *
 * The fields listed in `patchedFields` are stored as JSON text: they are
 * parsed when a record is built with `from()` and stringified again by
 * `degradeForFireStore()`.
 */
@Also({ dictOf: Object })
export class SearchResult extends FirestoreRecord {
    static override collectionName = 'searchResults';

    override _id!: string;

    @Prop({ required: true })
    query!: any;

    @Prop({ required: true })
    queryDigest!: string;

    @Prop()
    response?: any;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    [k: string]: any;

    static patchedFields = [
        'query',
        'response',
    ];

    /**
     * Builds a record from raw input, first parsing any patched field that
     * arrives as a string. The input object is modified in place.
     */
    static override from(input: any) {
        const encodedFields = this.patchedFields.filter((name) => typeof input[name] === 'string');
        encodedFields.forEach((name) => {
            input[name] = parseJSONText(input[name]);
        });

        return super.from(input) as SearchResult;
    }

    /**
     * Returns a shallow copy of this record in which every patched field of
     * type `object` has been replaced by its JSON string.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };
        const { patchedFields } = this.constructor as typeof SearchResult;

        patchedFields.forEach((name) => {
            if (typeof copy[name] === 'object') {
                copy[name] = JSON.stringify(copy[name]);
            }
        });

        return copy;
    }
}

/** Same shape as `SearchResult`, stored in the `serperSearchResults` collection. */
export class SerperSearchResult extends SearchResult {
    static override collectionName = 'serperSearchResults';
}

/** Same shape as `SearchResult`, stored in the `SERPResults` collection. */
export class SERPResult extends SearchResult {
    static override collectionName = 'SERPResults';
}

================================================
FILE: src/dto/adaptive-crawler-options.ts
================================================
import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit';
import type { Request, Response } from 'express';


/**
 * Options of the adaptive crawler, taken from the request input and
 * optionally overridden by the `X-Max-Pages` and `X-Use-Sitemap` headers.
 *
 * NOTE(review): the OpenAPI metadata also advertises `X-Max-Depth`, but
 * `from()` does not read that header.
 */
@Also({
    openapi: {
        operation: {
            parameters: {
                'X-Use-Sitemap': {
                    description: 'Use sitemap to crawl the website.',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Max-Depth': {
                    description: 'Max deep level to crawl.',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Max-Pages': {
                    description: 'Max number of pages to crawl.',
                    in: 'header',
                    schema: { type: 'string' }
                },
            }
        }
    }
})
export class AdaptiveCrawlerOptions extends AutoCastable {
    @Prop({
        default: true,
        desc: 'Use sitemap to crawl the website.',
    })
    useSitemap!: boolean;

    @Prop({
        default: 10,
        desc: 'Max number of pages to crawl.',
        validate: (v: number) => v >= 1 && v <= 100,
    })
    maxPages!: number;

    /**
     * Builds the options from `input`, then applies header overrides found
     * in the RPC call environment attached to `input`.
     *
     * - `x-max-pages`: positive integers are accepted and capped at 100.
     * - `x-use-sitemap`: `''`, `0`, `false`, `no` and `off` (case-insensitive,
     *   surrounding whitespace ignored) disable sitemap mode; any other value
     *   enables it.
     */
    static override from(input: any) {
        const instance = super.from(input) as AdaptiveCrawlerOptions;
        const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
            req: Request,
            res: Response,
        } | undefined;

        const maxPages = parseInt(ctx?.req.get('x-max-pages') || '', 10);
        if (!isNaN(maxPages) && maxPages > 0) {
            instance.maxPages = Math.min(maxPages, 100);
        }

        const useSitemap = ctx?.req.get('x-use-sitemap');
        if (useSitemap !== undefined) {
            // Header values are strings, and Boolean('false') is true, so the
            // text has to be interpreted explicitly.
            const falsyValues = ['', '0', 'false', 'no', 'off'];
            instance.useSitemap = !falsyValues.includes(useSitemap.trim().toLowerCase());
        }

        return instance;
    }
}


================================================
FILE: src/dto/crawler-options.ts
================================================
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
import { FancyFile } from 'civkit/fancy-file';
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
import { Context } from '../services/registry';
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
import type { PageSnapshot } from '../services/puppeteer';

/**
 * Output formats a caller can request for crawled content.
 *
 * The string values match the format names listed in the `X-Respond-With`
 * header description in this file.
 */
export enum CONTENT_FORMAT {
    CONTENT = 'content', // documented as the default in the X-Respond-With description
    MARKDOWN = 'markdown',
    HTML = 'html',
    TEXT = 'text',
    PAGESHOT = 'pageshot',
    SCREENSHOT = 'screenshot',
    VLM = 'vlm',
    READER_LM = 'readerlm-v2',
}

/**
 * Identifiers for the engine used to fetch a page.
 *
 * NOTE(review): presumably chosen via a request option and mapped onto the
 * browser / curl / Cloudflare browser-rendering services; the consuming code
 * is not visible here, so confirm before relying on this.
 */
export enum ENGINE_TYPE {
    AUTO = 'auto',
    BROWSER = 'browser',
    CURL = 'curl',
    CF_BROWSER_RENDERING = 'cf-browser-rendering',
}

/**
 * Named stages at which a response may be produced for a page.
 *
 * NOTE(review): presumably controls how early a snapshot is considered good
 * enough to return (see `presumedRespondTiming` on CrawlerOptions); the exact
 * meaning of each stage is defined by the consuming code — confirm there.
 */
export enum RESPOND_TIMING {
    HTML = 'html',
    VISIBLE_CONTENT = 'visible-content',
    MUTATION_IDLE = 'mutation-idle',
    RESOURCE_IDLE = 'resource-idle',
    MEDIA_IDLE = 'media-idle',
    NETWORK_IDLE = 'network-idle',
}

// Set of every CONTENT_FORMAT string value, for membership checks.
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));

// Accepted image retention modes; each one is explained in the
// `X-Retain-Images` header description in this file.
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
// Set form of IMAGE_RETENTION_MODES, for membership checks.
const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
// Accepted base-URL modes.
// NOTE(review): the meaning of 'initial' vs 'final' is not visible in this
// chunk — presumably the URL before vs after redirects; confirm at the use site.
export const BASE_URL_MODES = ['initial', 'final'] as const;
// Set form of BASE_URL_MODES, for membership checks.
const BASE_URL_MODE_VALUES = new Set<string>(BASE_URL_MODES);

/**
 * Viewport settings accepted as part of the crawler options.
 *
 * `width` and `height` are declared with a default of 1024; the remaining
 * fields are optional and carry no default.
 *
 * NOTE(review): the field names mirror Puppeteer's viewport shape, so this is
 * presumably forwarded to the browser page — confirm at the call site.
 */
class Viewport extends AutoCastable {
    /** Viewport width (default 1024). */
    @Prop({ default: 1024 })
    width!: number;

    /** Viewport height (default 1024). */
    @Prop({ default: 1024 })
    height!: number;

    /** Optional device scale factor. */
    @Prop()
    deviceScaleFactor?: number;

    /** Optional mobile flag. */
    @Prop()
    isMobile?: boolean;

    /** Optional landscape-orientation flag. */
    @Prop()
    isLandscape?: boolean;

    /** Optional touch-support flag. */
    @Prop()
    hasTouch?: boolean;
}

@Also({
    openapi: {
        operation: {
            parameters: {
                'Accept': {
                    description: `Specifies your preference for the response format.\n\n` +
                        `Supported formats: \n` +
                        `- text/event-stream\n` +
                        `- application/json or text/json\n` +
                        `- text/plain`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Cache-Tolerance': {
                    description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-No-Cache': {
                    description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Respond-With': {
                    description: `Specifies the (non-default) form of the crawled data you prefer.\n\n` +
                        `Supported formats: \n` +
                        `- markdown\n` +
                        `- html\n` +
                        `- text\n` +
                        `- pageshot\n` +
                        `- screenshot\n` +
                        `- content\n` +
                        `- any combination of the above\n` +
                        `- readerlm-v2\n` +
                        `- vlm\n\n` +
                        `Default: content\n`
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Wait-For-Selector': {
                    description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
                        'Example: `X-Wait-For-Selector: .content-block`\n'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Target-Selector': {
                    description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
                        'Implies `X-Wait-For-Selector: (same selector)`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Remove-Selector': {
                    description: `Specifies a CSS selector to remove elements from the full html.\n\n` +
                        'Example `X-Remove-Selector: nav`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Keep-Img-Data-Url': {
                    description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` +
                        'Example `X-Keep-Img-Data-Url: true`'
                    ,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Proxy-Url': {
                    description: `Specifies your custom proxy if you prefer to use one.\n\n` +
                        `Supported protocols: \n` +
                        `- http\n` +
                        `- https\n` +
                        `- socks4\n` +
                        `- socks5\n\n` +
                        `For authentication, https://user:pass@host:port`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Proxy': {
                    description: `Use a proxy server provided by us.\n\nOptionally specify two-letter country code.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Robots-Txt': {
                    description: `Load and conform to the respective robot.txt on the target origin.\n\nOptionally specify a bot UA to check against.\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'DNT': {
                    description: `When set to 1, prevent the result of this request to be cached in the system.\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Set-Cookie': {
                    description: `Sets cookie(s) to the headless browser for your request. \n\n` +
                        `Syntax is the same with standard Set-Cookie`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Generated-Alt': {
                    description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
                        `Note: Does not work when \`X-Respond-With\` is specified`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Images-Summary': {
                    description: `Enable dedicated summary section for images on the page.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-links-Summary': {
                    description: `Enable dedicated summary section for hyper links on the page.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Retain-Images': {
                    description: `Image retention modes.\n\n` +
                        `Supported modes: \n` +
                        `- all: all images\n` +
                        `- none: no images\n` +
                        `- alt: only alt text\n` +
                        `- all_p: all images and with generated alt text\n` +
                        `- alt_p: only alt text and with generated alt\n\n`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Iframe': {
                    description: `Enable filling iframe contents into main. (violates standards)`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-With-Shadow-Dom': {
                    description: `Enable filling shadow dom contents into main. (violates standards)`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-User-Agent': {
                    description: `Override User-Agent.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Timeout': {
                    description: `Specify timeout in seconds. Max 180.`,
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Locale': {
                    description: 'Specify browser locale for the page.',
                    in: 'header',
                    schema: { type: 'string' }
                },
                'X-Referer': {
                    description: 'Specify referer for the page.',
                    in: '

Download .txt

gitextract_azcmq_sa/

├── .github/
│   └── workflows/
│       ├── .keep
│       └── cd.yml
├── .gitignore
├── .gitmodules
├── .vscode/
│   ├── exensions.json
│   ├── launch.json
│   ├── settings.json
│   └── tasks.json
├── Dockerfile
├── LICENSE
├── README.md
├── integrity-check.cjs
├── package.json
├── public/
│   └── robots.txt
├── src/
│   ├── api/
│   │   ├── crawler.ts
│   │   ├── searcher.ts
│   │   └── serp.ts
│   ├── cloud-functions/
│   │   ├── adaptive-crawler.ts
│   │   └── data-crunching.ts
│   ├── db/
│   │   ├── adaptive-crawl-task.ts
│   │   ├── crawled.ts
│   │   ├── domain-blockade.ts
│   │   ├── domain-profile.ts
│   │   ├── img-alt.ts
│   │   ├── pdf.ts
│   │   └── searched.ts
│   ├── dto/
│   │   ├── adaptive-crawler-options.ts
│   │   ├── crawler-options.ts
│   │   ├── jina-embeddings-auth.ts
│   │   └── turndown-tweakable-options.ts
│   ├── fetch.d.ts
│   ├── lib/
│   │   └── transform-server-event-stream.ts
│   ├── services/
│   │   ├── alt-text.ts
│   │   ├── async-context.ts
│   │   ├── blackhole-detector.ts
│   │   ├── brave-search.ts
│   │   ├── canvas.ts
│   │   ├── cf-browser-rendering.ts
│   │   ├── curl.ts
│   │   ├── errors.ts
│   │   ├── finalizer.ts
│   │   ├── geoip.ts
│   │   ├── jsdom.ts
│   │   ├── lm.ts
│   │   ├── logger.ts
│   │   ├── minimal-stealth.js
│   │   ├── misc.ts
│   │   ├── pdf-extract.ts
│   │   ├── pseudo-transfer.ts
│   │   ├── puppeteer.ts
│   │   ├── registry.ts
│   │   ├── robots-text.ts
│   │   ├── serp/
│   │   │   ├── compat.ts
│   │   │   ├── google.ts
│   │   │   ├── internal.ts
│   │   │   ├── puppeteer.ts
│   │   │   └── serper.ts
│   │   ├── serper-search.ts
│   │   ├── snapshot-formatter.ts
│   │   ├── temp-file.ts
│   │   └── threaded.ts
│   ├── stand-alone/
│   │   ├── crawl.ts
│   │   ├── search.ts
│   │   └── serp.ts
│   ├── types.d.ts
│   └── utils/
│       ├── encoding.ts
│       ├── get-function-url.ts
│       ├── ip.ts
│       ├── markdown.ts
│       ├── misc.ts
│       └── tailwind-classes.ts
└── tsconfig.json

Download .txt

SYMBOL INDEX (396 symbols across 55 files)

FILE: src/api/crawler.ts
  type ExtraScrappingOptions (line 53) | interface ExtraScrappingOptions extends ScrappingOptions {
  class CrawlerHost (line 76) | class CrawlerHost extends RPCHost {
    method constructor (line 87) | constructor(
    method init (line 180) | override async init() {
    method getIndex (line 190) | async getIndex(auth?: JinaEmbeddingsAuthDTO) {
    method getIndexCtrl (line 220) | async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) au...
    method crawl (line 256) | async crawl(
    method getTargetUrl (line 497) | async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
    method getUrlDigest (line 531) | getUrlDigest(urlToCrawl: URL) {
    method queryCache (line 542) | async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
    method setToCache (line 609) | async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
    method iterSnapshots (line 670) | async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOption...
    method cachedScrap (line 716) | async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions,...
    method assignChargeAmount (line 923) | assignChargeAmount(formatted: FormattedPage, saasTierPolicy?: Paramete...
    method scrapMany (line 958) | async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawler...
    method configure (line 1005) | async configure(opts: CrawlerOptions) {
    method formatSnapshot (line 1109) | protected async formatSnapshot(
    method formatSnapshotWithPDFSideLoad (line 1140) | async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapsh...
    method getFinalSnapshot (line 1175) | async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawler...
    method simpleCrawl (line 1199) | async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
    method getDomainProfileUrlDigest (line 1233) | getDomainProfileUrlDigest(url: URL) {
    method sideLoadWithAllocatedProxy (line 1263) | async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOption...
    method figureOutBestProxyCountry (line 1291) | protected figureOutBestProxyCountry(opts?: ExtraScrappingOptions) {
    method knownUrlThatSideLoadingWouldCrashTheBrowser (line 1317) | knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
    method saasAssertTierPolicy (line 1325) | async saasAssertTierPolicy(opts: CrawlerOptions, auth: JinaEmbeddingsA...
    method saasApplyTierPolicy (line 1366) | saasApplyTierPolicy(policy: Awaited<ReturnType<typeof this.saasAssertT...

FILE: src/api/searcher.ts
  constant WORLD_COUNTRY_CODES (line 33) | const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.to...
  type FormattedPage (line 35) | interface FormattedPage extends RealFormattedPage {
  type RateLimitCache (line 40) | type RateLimitCache = {
  class SearcherHost (line 46) | class SearcherHost extends RPCHost {
    method constructor (line 66) | constructor(
    method init (line 99) | override async init() {
    method search (line 126) | async search(
    method searchWithFallback (line 538) | async searchWithFallback(
    method fetchSearchResults (line 594) | async *fetchSearchResults(
    method reOrganizeSearchResults (line 636) | reOrganizeSearchResults(searchResults: FormattedPage[], count?: number) {
    method assignChargeAmount (line 655) | assignChargeAmount(formatted: FormattedPage[], num: number, scaler: nu...
    method pageQualified (line 687) | pageQualified(formattedPage: FormattedPage) {
    method searchResultsQualified (line 696) | searchResultsQualified(results: FormattedPage[], targetResultCount = t...
    method getFavicon (line 700) | async getFavicon(domain: string) {
    method iterProviders (line 718) | *iterProviders(preference?: string, variant?: string) {
    method cachedSearch (line 740) | async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<s...
    method mapToFinalResults (line 823) | mapToFinalResults(input: WebSearchEntry) {
    method assignGeneralMixin (line 837) | async assignGeneralMixin(result: FormattedPage) {
  method toString (line 863) | toString(this: FormattedPage, i?: number) {

FILE: src/api/serp.ts
  constant WORLD_COUNTRY_CODES (line 31) | const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.to...
  type RateLimitCache (line 33) | type RateLimitCache = {
  class SerpHost (line 49) | class SerpHost extends RPCHost {
    method getIndex (line 69) | async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
    method constructor (line 89) | constructor(
    method init (line 121) | override async init() {
    method search (line 147) | async search(
    method assignChargeAmount (line 394) | assignChargeAmount(items: unknown[], scaler: number) {
    method getFavicon (line 401) | async getFavicon(domain: string) {
    method configure (line 419) | async configure(opts: CrawlerOptions) {
    method mapToFinalResults (line 440) | mapToFinalResults(input: WebSearchEntry) {
    method iterProviders (line 454) | *iterProviders(preference?: string, variant?: string) {
    method cachedSearch (line 477) | async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<s...
    method assignGeneralMixin (line 561) | async assignGeneralMixin(result: Partial<WebSearchEntry>) {

FILE: src/cloud-functions/adaptive-crawler.ts
  class AdaptiveCrawlerHost (line 36) | class AdaptiveCrawlerHost extends RPCHost {
    method constructor (line 43) | constructor(
    method init (line 50) | override async init() {
    method adaptiveCrawl (line 66) | async adaptiveCrawl(
    method adaptiveCrawlStatus (line 183) | async adaptiveCrawlStatus(
    method singleCrawlQueue (line 240) | async singleCrawlQueue(
    method handleSingleCrawl (line 309) | async handleSingleCrawl(shortDigest: string, url: string, token: strin...
    method handleSingleCrawlRecursively (line 347) | async handleSingleCrawlRecursively(
    method getRelevantUrls (line 436) | async getRelevantUrls(token: string, {
    method getIndex (line 491) | getIndex(user?: JinaEmbeddingsTokenAccount) {
    method crawlUrlsFromSitemap (line 511) | async crawlUrlsFromSitemap(url: URL, maxPages: number) {
    method getSitemapsFromRobotsTxt (line 583) | async getSitemapsFromRobotsTxt(url: URL) {

FILE: src/cloud-functions/data-crunching.ts
  class DataCrunchingHost (line 26) | class DataCrunchingHost extends RPCHost {
    method constructor (line 34) | constructor(
    method init (line 45) | override async init() {
    method crunchPageCacheWorker (line 69) | async crunchPageCacheWorker(
    method dispatchPageCacheCrunching (line 102) | async dispatchPageCacheCrunching() {
    method iterPageCacheRecords (line 149) | async* iterPageCacheRecords(date?: string, inputOffset?: number | stri...
    method iterPageCacheChunks (line 199) | async* iterPageCacheChunks() {
    method crunchCacheRecords (line 234) | async crunchCacheRecords(records: Crawled[]) {

FILE: src/db/adaptive-crawl-task.ts
  type AdaptiveCrawlTaskStatus (line 5) | enum AdaptiveCrawlTaskStatus {
  class AdaptiveCrawlTask (line 15) | class AdaptiveCrawlTask extends FirestoreRecord {
    method from (line 63) | static override from(input: any) {
    method degradeForFireStore (line 73) | override degradeForFireStore() {

FILE: src/db/crawled.ts
  class Crawled (line 9) | class Crawled extends FirestoreRecord {
    method from (line 49) | static override from(input: any) {
    method degradeForFireStore (line 59) | override degradeForFireStore() {

FILE: src/db/domain-blockade.ts
  class DomainBlockade (line 7) | class DomainBlockade extends FirestoreRecord {

FILE: src/db/domain-profile.ts
  class DomainProfile (line 8) | class DomainProfile extends FirestoreRecord {

FILE: src/db/img-alt.ts
  class ImgAlt (line 8) | class ImgAlt extends FirestoreRecord {

FILE: src/db/pdf.ts
  class PDFContent (line 8) | class PDFContent extends FirestoreRecord {
    method from (line 42) | static override from(input: any) {
    method degradeForFireStore (line 52) | override degradeForFireStore() {

FILE: src/db/searched.ts
  class SearchResult (line 8) | class SearchResult extends FirestoreRecord {
    method from (line 39) | static override from(input: any) {
    method degradeForFireStore (line 49) | override degradeForFireStore() {
  class SerperSearchResult (line 62) | class SerperSearchResult extends SearchResult {
  class SERPResult (line 66) | class SERPResult extends SearchResult {

FILE: src/dto/adaptive-crawler-options.ts
  class AdaptiveCrawlerOptions (line 28) | class AdaptiveCrawlerOptions extends AutoCastable {
    method from (line 42) | static override from(input: any) {

FILE: src/dto/crawler-options.ts
  type CONTENT_FORMAT (line 8) | enum CONTENT_FORMAT {
  type ENGINE_TYPE (line 19) | enum ENGINE_TYPE {
  type RESPOND_TIMING (line 26) | enum RESPOND_TIMING {
  constant CONTENT_FORMAT_VALUES (line 35) | const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORM...
  constant IMAGE_RETENTION_MODES (line 37) | const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] a...
  constant IMAGE_RETENTION_MODE_VALUES (line 38) | const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
  constant BASE_URL_MODES (line 39) | const BASE_URL_MODES = ['initial', 'final'] as const;
  constant BASE_URL_MODE_VALUES (line 40) | const BASE_URL_MODE_VALUES = new Set<string>(BASE_URL_MODES);
  class Viewport (line 42) | class Viewport extends AutoCastable {
  class CrawlerOptions (line 284) | class CrawlerOptions extends AutoCastable {
    method from (line 434) | static override from(input: any) {
    method presumedRespondTiming (line 603) | get presumedRespondTiming() {
    method isSnapshotAcceptableForEarlyResponse (line 617) | isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
    method isCacheQueryApplicable (line 665) | isCacheQueryApplicable() {
    method isRequestingCompoundContentFormat (line 685) | isRequestingCompoundContentFormat() {
    method browserIsNotRequired (line 689) | browserIsNotRequired() {
  class CrawlerOptionsHeaderOnly (line 719) | class CrawlerOptionsHeaderOnly extends CrawlerOptions {
    method from (line 720) | static override from(input: any) {

FILE: src/dto/jina-embeddings-auth.ts
  constant THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT (line 25) | const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboard...
  class JinaEmbeddingsAuthDTO (line 47) | class JinaEmbeddingsAuthDTO extends AutoCastable {
    method from (line 57) | static override from(input: any) {
    method getBrief (line 79) | async getBrief(ignoreCache?: boolean | string) {
    method reportUsage (line 171) | async reportUsage(tokenCount: number, mdl: string, endpoint: string = ...
    method solveUID (line 203) | async solveUID() {
    method assertUID (line 220) | async assertUID() {
    method assertUser (line 230) | async assertUser() {
    method assertTier (line 240) | async assertTier(n: number, feature?: string) {
    method getRateLimits (line 264) | getRateLimits(...tags: string[]) {

FILE: src/dto/turndown-tweakable-options.ts
  class TurnDownTweakableOptions (line 6) | class TurnDownTweakableOptions extends AutoCastable {
    method fromCtx (line 49) | static fromCtx(ctx: Context, prefix= 'x-md-') {

FILE: src/lib/transform-server-event-stream.ts
  class InputServerEventStream (line 4) | class InputServerEventStream extends Transform {
    method constructor (line 7) | constructor(options?: TransformOptions) {
    method decodeRoutine (line 14) | decodeRoutine() {
    method _transform (line 73) | override _transform(chunk: any, encoding: BufferEncoding, callback: Tr...
    method _final (line 84) | override _final(callback: (error?: Error | null | undefined) => void):...
  class OutputServerEventStream (line 93) | class OutputServerEventStream extends Transform {
    method constructor (line 96) | constructor(options?: TransformOptions) {
    method encodeRoutine (line 102) | encodeRoutine(chunk: {
    method _transform (line 149) | override _transform(chunk: any, encoding: BufferEncoding, callback: Tr...
  type OutputServerEventStream (line 160) | interface OutputServerEventStream extends Transform {
    method constructor (line 96) | constructor(options?: TransformOptions) {
    method encodeRoutine (line 102) | encodeRoutine(chunk: {
    method _transform (line 149) | override _transform(chunk: any, encoding: BufferEncoding, callback: Tr...

FILE: src/services/alt-text.ts
  class AltTextService (line 13) | class AltTextService extends AsyncService {
    method constructor (line 18) | constructor(
    method init (line 27) | override async init() {
    method caption (line 32) | async caption(url: string) {
    method getAltText (line 60) | async getAltText(imgBrief: ImgBrief) {

FILE: src/services/async-context.ts
  class AsyncLocalContext (line 5) | class AsyncLocalContext extends GlobalAsyncContext { }

FILE: src/services/blackhole-detector.ts
  class BlackHoleDetector (line 8) | class BlackHoleDetector extends AsyncService {
    method constructor (line 20) | constructor(protected globalLogger: GlobalLogger) {
    method init (line 30) | override async init() {
    method routine (line 36) | async routine() {
    method incomingRequest (line 63) | incomingRequest() {
    method doneWithRequest (line 68) | doneWithRequest() {
    method itWorked (line 73) | itWorked() {

FILE: src/services/brave-search.ts
  class BraveSearchService (line 13) | class BraveSearchService extends AsyncService {
    method constructor (line 19) | constructor(
    method init (line 29) | override async init() {
    method webSearch (line 36) | async webSearch(query: WebSearchQueryParams) {
  class BraveSearchExplicitOperatorsDto (line 94) | class BraveSearchExplicitOperatorsDto extends AutoCastable {
    method addTo (line 143) | addTo(searchTerm: string) {
    method from (line 163) | static override from(input: any) {

FILE: src/services/canvas.ts
  class CanvasService (line 31) | class CanvasService extends AsyncService {
    method constructor (line 37) | constructor(
    method init (line 44) | override async init() {
    method renderSvgToPng (line 68) | async renderSvgToPng(svgContent: string,) {
    method _loadImage (line 72) | protected async _loadImage(input: string | Buffer) {
    method loadImage (line 120) | async loadImage(uri: string | Buffer) {
    method fitImageToSquareBox (line 137) | fitImageToSquareBox(image: canvas.Image | canvas.Canvas, size: number ...
    method corpImage (line 165) | corpImage(image: canvas.Image | canvas.Canvas, x: number, y: number, w...
    method canvasToDataUrl (line 176) | canvasToDataUrl(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image...
    method canvasToBuffer (line 182) | async canvasToBuffer(canvas: canvas.Canvas, mimeType?: 'image/png' | '...

FILE: src/services/cf-browser-rendering.ts
  class CFBrowserRendering (line 10) | class CFBrowserRendering extends AsyncService {
    method constructor (line 15) | constructor(
    method init (line 23) | override async init() {
    method fetchContent (line 31) | async fetchContent(url: string) {

FILE: src/services/curl.ts
  type CURLScrappingOptions (line 19) | interface CURLScrappingOptions extends ScrappingOptions {
  class CurlControl (line 25) | class CurlControl extends AsyncService {
    method constructor (line 36) | constructor(
    method init (line 45) | override async init() {
    method impersonateChrome (line 57) | impersonateChrome(ua: string) {
    method curlImpersonateHeader (line 63) | curlImpersonateHeader(curl: Curl, headers?: object) {
    method urlToFile1Shot (line 111) | urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    method urlToFile (line 320) | async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    method sideLoad (line 374) | async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
    method digestCurlCode (line 424) | digestCurlCode(code: CurlCode, msg: string) {

FILE: src/services/errors.ts
  class ServiceDisabledError (line 9) | class ServiceDisabledError extends ApplicationError { }
  class ServiceCrashedError (line 12) | class ServiceCrashedError extends ApplicationError { }
  class ServiceNodeResourceDrainError (line 15) | class ServiceNodeResourceDrainError extends ApplicationError { }
  class ServiceBadAttemptError (line 18) | class ServiceBadAttemptError extends ApplicationError { }
  class ServiceBadApproachError (line 21) | class ServiceBadApproachError extends ServiceBadAttemptError { }
  class EmailUnverifiedError (line 24) | class EmailUnverifiedError extends ApplicationError { }
  class InsufficientCreditsError (line 27) | class InsufficientCreditsError extends ApplicationError { }
  class TierFeatureConstraintError (line 30) | class TierFeatureConstraintError extends ApplicationError { }
  class InsufficientBalanceError (line 33) | class InsufficientBalanceError extends ApplicationError { }
  class LockConflictError (line 36) | class LockConflictError extends ApplicationError { }
  class BudgetExceededError (line 39) | class BudgetExceededError extends ApplicationError { }
  class HarmfulContentError (line 42) | class HarmfulContentError extends ApplicationError { }
  class SecurityCompromiseError (line 45) | class SecurityCompromiseError extends ApplicationError { }
  class BatchSizeTooLargeError (line 48) | class BatchSizeTooLargeError extends ApplicationError { }

FILE: src/services/finalizer.ts
  class FinalizerService (line 15) | class FinalizerService extends AbstractFinalizerService {
    method quitProcess (line 20) | override quitProcess(code?: string | number | null | undefined): never {
    method constructor (line 24) | constructor(protected globalLogger: GlobalLogger) {
    method onUnhandledRejection (line 28) | override onUnhandledRejection(err: unknown, _triggeringPromise: Promis...

FILE: src/services/geoip.ts
  type GEOIP_SUPPORTED_LANGUAGES (line 9) | enum GEOIP_SUPPORTED_LANGUAGES {
  class GeoIPInfo (line 20) | class GeoIPInfo extends AutoCastable {
  class GeoIPCountryInfo (line 28) | class GeoIPCountryInfo extends GeoIPInfo {
  class GeoIPCityResponse (line 33) | class GeoIPCityResponse extends AutoCastable {
  class GeoIPService (line 58) | class GeoIPService extends AsyncService {
    method constructor (line 64) | constructor(
    method init (line 71) | override async init() {
    method _lazyload (line 78) | async _lazyload() {
    method lookupCity (line 90) | async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_S...
    method lookupCities (line 122) | async lookupCities(ips: string[], lang: GEOIP_SUPPORTED_LANGUAGES = GE...

FILE: src/services/jsdom.ts
  class JSDomControl (line 16) | class JSDomControl extends AsyncService {
    method constructor (line 22) | constructor(
    method init (line 28) | override async init() {
    method narrowSnapshot (line 34) | async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: Ext...
    method actualNarrowSnapshot (line 56) | async actualNarrowSnapshot(snapshot: PageSnapshot, options?: ExtraScra...
    method inferSnapshot (line 219) | async inferSnapshot(snapshot: PageSnapshot) {
    method cleanRedundantEmptyLines (line 275) | cleanRedundantEmptyLines(text: string) {
    method cleanHTMLforLMs (line 283) | async cleanHTMLforLMs(sourceHTML: string, ...discardSelectors: string[...
    method snippetToElement (line 344) | snippetToElement(snippet?: string, url?: string) {
    method runTurndown (line 358) | runTurndown(turndownService: TurndownService, html: TurndownService.No...
    method analyzeHTMLTextLite (line 372) | async analyzeHTMLTextLite(sourceHTML: string) {

FILE: src/services/lm.ts
  class LmControl (line 14) | class LmControl extends AsyncService {
    method constructor (line 18) | constructor(
    method init (line 26) | override async init() {
    method geminiFromBrowserSnapshot (line 32) | async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
    method readerLMMarkdownFromSnapshot (line 72) | async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
    method readerLMFromSnapshot (line 113) | async* readerLMFromSnapshot(schema?: string, instruction: string = 'In...

FILE: src/services/logger.ts
  class GlobalLogger (line 17) | class GlobalLogger extends AbstractPinoLogger {
    method init (line 25) | override init(): void {
    method log (line 43) | override log(...args: any[]) {

FILE: src/services/minimal-stealth.js
  function minimalStealth (line 3) | function minimalStealth() {

FILE: src/services/misc.ts
  class MiscService (line 14) | class MiscService extends AsyncService {
    method constructor (line 18) | constructor(
    method init (line 24) | override async init() {
    method assertNormalizedUrl (line 31) | async assertNormalizedUrl(input: string) {

FILE: src/services/pdf-extract.ts
  function stdDev (line 23) | function stdDev(numbers: number[]) {
  function isRotatedByAtLeast35Degrees (line 30) | function isRotatedByAtLeast35Degrees(transform?: [number, number, number...
  class PDFExtractor (line 49) | class PDFExtractor extends AsyncService {
    method constructor (line 56) | constructor(
    method init (line 64) | override async init() {
    method isDataUrl (line 71) | isDataUrl(url: string) {
    method parseDataUrl (line 75) | parseDataUrl(url: string) {
    method extract (line 93) | async extract(url: string | URL) {
    method cachedExtract (line 276) | async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 ...
    method parsePdfDate (line 353) | parsePdfDate(pdfDate: string | undefined) {

FILE: src/services/pseudo-transfer.ts
  class PseudoTransfer (line 7) | class PseudoTransfer extends AbstractPseudoTransfer {
    method init (line 9) | override async init() {

FILE: src/services/puppeteer.ts
  constant READABILITY_JS (line 25) | const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readabi...
  type ImgBrief (line 28) | interface ImgBrief {
  type ReadabilityParsed (line 38) | interface ReadabilityParsed {
  type PageSnapshot (line 51) | interface PageSnapshot {
  type ExtendedSnapshot (line 77) | interface ExtendedSnapshot extends PageSnapshot {
  type ScrappingOptions (line 82) | interface ScrappingOptions {
  constant SIMULATE_SCROLL (line 112) | const SIMULATE_SCROLL = `
  constant MUTATION_IDLE_WATCH (line 209) | const MUTATION_IDLE_WATCH = `
  constant SCRIPT_TO_INJECT_INTO_FRAME (line 234) | const SCRIPT_TO_INJECT_INTO_FRAME = `
  class PageReqCtrlKit (line 451) | class PageReqCtrlKit {
    method constructor (line 458) | constructor(
    method onNewRequest (line 466) | onNewRequest(req: HTTPRequest) {
    method onFinishRequest (line 477) | onFinishRequest(req: HTTPRequest) {
  class PuppeteerControl (line 499) | class PuppeteerControl extends AsyncService {
    method constructor (line 524) | constructor(
    method init (line 547) | override async init() {
    method getRpsControlKit (line 594) | protected getRpsControlKit(page: Page) {
    method newPage (line 604) | async newPage(bewareDeadLock: any = false) {
    method getNextPage (line 790) | async getNextPage() {
    method ditchPage (line 819) | async ditchPage(page: Page) {
    method scrap (line 846) | async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGen...
    method takeScreenShot (line 1299) | protected async takeScreenShot(page: Page, opts?: Parameters<typeof pa...
    method snapshotChildFrames (line 1311) | async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {

FILE: src/services/registry.ts
  class RPCRegistry (line 16) | class RPCRegistry extends KoaRPCRegistry {
    method constructor (line 38) | constructor(
    method init (line 51) | override async init() {

FILE: src/services/robots-text.ts
  class RobotsTxtService (line 16) | class RobotsTxtService extends AsyncService {
    method constructor (line 20) | constructor(
    method init (line 27) | override async init() {
    method getCachedRobotTxt (line 32) | async getCachedRobotTxt(origin: string) {
    method assertAccessAllowed (line 57) | async assertAccessAllowed(url: URL, inputMyUa = '*') {

FILE: src/services/serp/compat.ts
  type WebSearchEntry (line 1) | interface WebSearchEntry {

FILE: src/services/serp/google.ts
  class GoogleSERP (line 18) | class GoogleSERP extends AsyncService {
    method constructor (line 22) | constructor(
    method init (line 33) | override async init() {
    method sideLoadWithAllocatedProxy (line 53) | async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) {
    method digestQuery (line 78) | digestQuery(query: { [k: string]: any; }) {
    method webSearch (line 104) | async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
    method newsSearch (line 117) | async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
    method imageSearch (line 132) | async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOption...
  function getWebSearchResults (line 164) | async function getWebSearchResults() {
  function getNewsSearchResults (line 258) | async function getNewsSearchResults() {

FILE: src/services/serp/internal.ts
  class InternalJinaSerpService (line 13) | class InternalJinaSerpService extends AsyncService {
    method constructor (line 19) | constructor(
    method init (line 28) | override async init() {
    method doSearch (line 36) | async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearch...
    method webSearch (line 67) | async webSearch(query: SerperSearchQueryParams) {
    method imageSearch (line 70) | async imageSearch(query: SerperSearchQueryParams) {
    method newsSearch (line 73) | async newsSearch(query: SerperSearchQueryParams) {

FILE: src/services/serp/puppeteer.ts
  type ScrappingOptions (line 23) | interface ScrappingOptions {
  constant SIMULATE_SCROLL (line 49) | const SIMULATE_SCROLL = `
  constant MUTATION_IDLE_WATCH (line 146) | const MUTATION_IDLE_WATCH = `
  constant SCRIPT_TO_INJECT_INTO_FRAME (line 171) | const SCRIPT_TO_INJECT_INTO_FRAME = `
  class SERPSpecializedPuppeteerControl (line 223) | class SERPSpecializedPuppeteerControl extends AsyncService {
    method constructor (line 242) | constructor(
    method init (line 265) | override async init() {
    method newPage (line 311) | async newPage<T>(bewareDeadLock: any = false) {
    method getNextPage (line 350) | async getNextPage() {
    method ditchPage (line 379) | async ditchPage(page: Page) {
    method controlledScrap (line 405) | async controlledScrap<T>(parsedUrl: URL, func: (this: void) => Promise...

FILE: src/services/serp/serper.ts
  class SerperGoogleSearchService (line 13) | class SerperGoogleSearchService extends AsyncService {
    method constructor (line 19) | constructor(
    method init (line 28) | override async init() {
    method doSearch (line 39) | async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearch...
    method webSearch (line 70) | async webSearch(query: SerperSearchQueryParams) {
    method imageSearch (line 73) | async imageSearch(query: SerperSearchQueryParams) {
    method newsSearch (line 76) | async newsSearch(query: SerperSearchQueryParams) {
  class SerperBingSearchService (line 83) | class SerperBingSearchService extends SerperGoogleSearchService {
    method init (line 86) | override async init() {
  class GoogleSearchExplicitOperatorsDto (line 94) | class GoogleSearchExplicitOperatorsDto extends AutoCastable {
    method addTo (line 125) | addTo(searchTerm: string) {
    method from (line 145) | static override from(input: any) {

FILE: src/services/serper-search.ts
  class SerperSearchService (line 12) | class SerperSearchService extends AsyncService {
    method constructor (line 19) | constructor(
    method init (line 28) | override async init() {
    method iterClient (line 36) | *iterClient() {
    method doSearch (line 49) | async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearch...
    method webSearch (line 129) | async webSearch(query: SerperSearchQueryParams) {
    method imageSearch (line 132) | async imageSearch(query: SerperSearchQueryParams) {
    method newsSearch (line 135) | async newsSearch(query: SerperSearchQueryParams) {
  class GoogleSearchExplicitOperatorsDto (line 141) | class GoogleSearchExplicitOperatorsDto extends AutoCastable {
    method addTo (line 172) | addTo(searchTerm: string) {
    method from (line 192) | static override from(input: any) {

FILE: src/services/snapshot-formatter.ts
  type FormattedPage (line 22) | interface FormattedPage {
  function highlightedCodeBlock (line 53) | function highlightedCodeBlock(turndownService: TurndownService) {
  class SnapshotFormatter (line 76) | class SnapshotFormatter extends AsyncService {
    method constructor (line 83) | constructor(
    method init (line 94) | override async init() {
    method formatSnapshot (line 101) | async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'sc...
    method dataUrlToBlobUrl (line 518) | dataUrlToBlobUrl(dataUrl: string, baseUrl: string = 'http://localhost/...
    method getGeneralSnapshotMixins (line 525) | async getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    method getTurndown (line 579) | getTurndown(options?: {
    method isPoorlyTransformed (line 767) | isPoorlyTransformed(content?: string, node?: Element) {
    method createSnapshotFromFile (line 802) | async createSnapshotFromFile(url: URL, file: FancyFile, overrideConten...

FILE: src/services/temp-file.ts
  class TempFileManager (line 7) | class TempFileManager extends AbstractTempFileManger {
    method init (line 11) | override async init() {
    method standDown (line 18) | override async standDown() {

FILE: src/services/threaded.ts
  class ThreadedServiceRegistry (line 14) | class ThreadedServiceRegistry extends AbstractThreadedServiceRegistry {
    method constructor (line 19) | constructor(
    method setMaxWorkersByCpu (line 27) | setMaxWorkersByCpu() {
    method init (line 38) | override async init() {

FILE: src/stand-alone/crawl.ts
  class CrawlStandAloneServer (line 24) | class CrawlStandAloneServer extends KoaServer {
    method constructor (line 30) | constructor(
    method h2c (line 40) | h2c() {
    method init (line 52) | override async init() {
    method walkForAssets (line 57) | async walkForAssets() {
    method listen (line 68) | override listen(port: number) {
    method makeAssetsServingController (line 80) | makeAssetsServingController() {
    method registerRoutes (line 101) | registerRoutes(): void {
    method insertAsyncHookMiddleware (line 126) | override insertAsyncHookMiddleware() {
    method standDown (line 142) | override async standDown() {

FILE: src/stand-alone/search.ts
  class SearchStandAloneServer (line 24) | class SearchStandAloneServer extends KoaServer {
    method constructor (line 30) | constructor(
    method h2c (line 40) | h2c() {
    method init (line 52) | override async init() {
    method walkForAssets (line 65) | async walkForAssets() {
    method listen (line 76) | override listen(port: number) {
    method makeAssetsServingController (line 88) | makeAssetsServingController() {
    method registerRoutes (line 109) | registerRoutes(): void {
    method insertAsyncHookMiddleware (line 135) | override insertAsyncHookMiddleware() {
    method standDown (line 151) | override async standDown() {

FILE: src/stand-alone/serp.ts
  class SERPStandAloneServer (line 25) | class SERPStandAloneServer extends KoaServer {
    method constructor (line 31) | constructor(
    method h2c (line 41) | h2c() {
    method init (line 53) | override async init() {
    method walkForAssets (line 66) | async walkForAssets() {
    method listen (line 77) | override listen(port: number) {
    method makeAssetsServingController (line 89) | makeAssetsServingController() {
    method registerRoutes (line 110) | registerRoutes(): void {
    method insertAsyncHookMiddleware (line 137) | override insertAsyncHookMiddleware() {
    method standDown (line 153) | override async standDown() {

FILE: src/types.d.ts
  type DetectionResult (line 2) | interface DetectionResult {
  class JSDOM (line 13) | class JSDOM {
  class VirtualConsole (line 17) | class VirtualConsole extends EventEmitter {

FILE: src/utils/encoding.ts
  function decodeFileStream (line 5) | async function decodeFileStream(
  function readFile (line 21) | async function readFile(

FILE: src/utils/get-function-url.ts
  function getFunctionUrl (line 10) | async function getFunctionUrl(name: string, location = "us-central1") {

FILE: src/utils/ip.ts
  function parseIp (line 3) | function parseIp(ip: string): Buffer {
  function parseCIDR (line 56) | function parseCIDR(cidr: string): [Buffer, Buffer] {
  class CIDR (line 89) | class CIDR {
    method constructor (line 93) | constructor(cidr: string) {
    method toString (line 98) | toString() {
    method family (line 102) | get family() {
    method test (line 106) | test(ip: string | Buffer): boolean {
  function isIPInNonPublicRange (line 157) | function isIPInNonPublicRange(ip: string) {

FILE: src/utils/markdown.ts
  function tidyMarkdown (line 2) | function tidyMarkdown(markdown: string): string {

FILE: src/utils/misc.ts
  function cleanAttribute (line 3) | function cleanAttribute(attribute: string | null) {
  function tryDecodeURIComponent (line 8) | function tryDecodeURIComponent(input: string) {

Download .json

Condensed preview — 72 files, each showing its path, character count, and a content snippet. Download or copy the .json file for the full structured content (779K chars).

[
  {
    "path": ".github/workflows/.keep",
    "chars": 0,
    "preview": ""
  },
  {
    "path": ".github/workflows/cd.yml",
    "chars": 4463,
    "preview": "run-name: Build push and deploy (CD)\non:\n  push:\n    branches:\n      - main\n      - ci-debug\n      - dev\n    tags:\n     "
  },
  {
    "path": ".gitignore",
    "chars": 1266,
    "preview": "# Logs\nlogs\n*.log\nnpm-debug.log*\nyarn-debug.log*\nyarn-error.log*\nfirebase-debug.log*\nfirebase-debug.*.log*\n\n# Firebase c"
  },
  {
    "path": ".gitmodules",
    "chars": 104,
    "preview": "[submodule \"thinapps-shared\"]\n\tpath = thinapps-shared\n\turl = git@github.com:jina-ai/thinapps-shared.git\n"
  },
  {
    "path": ".vscode/exensions.json",
    "chars": 241,
    "preview": "{\n    \"recommendations\": [\n        \"editorconfig.editorconfig\",\n        \"octref.vetur\",\n        \"redhat.vscode-yaml\",\n  "
  },
  {
    "path": ".vscode/launch.json",
    "chars": 3450,
    "preview": "{\n  \"version\": \"0.2.0\",\n  \"configurations\": [\n    {\n      \"name\": \"Attach\",\n      \"port\": 9229,\n      \"request\": \"attach"
  },
  {
    "path": ".vscode/settings.json",
    "chars": 993,
    "preview": "{\n    \"editor.wordWrap\": \"on\",\n    \"editor.wordWrapColumn\": 120,\n    \"files.trimTrailingWhitespace\": true,\n    \"files.tr"
  },
  {
    "path": ".vscode/tasks.json",
    "chars": 748,
    "preview": "{\n    \"version\": \"2.0.0\",\n    \"tasks\": [\n        {\n            \"type\": \"npm\",\n            \"script\": \"build\",\n           "
  },
  {
    "path": "Dockerfile",
    "chars": 1342,
    "preview": "# syntax=docker/dockerfile:1\nFROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye\n\nFROM node:22\n\nRUN apt-get update \\"
  },
  {
    "path": "LICENSE",
    "chars": 10826,
    "preview": "Copyright 2020-2024 Jina AI Limited.  All rights reserved.\n\n\n                                 Apache License\n           "
  },
  {
    "path": "README.md",
    "chars": 12419,
    "preview": "# Reader\n\nYour LLMs deserve better input.\n\nReader does two things:\n- **Read**: It converts any URL to an **LLM-friendly*"
  },
  {
    "path": "integrity-check.cjs",
    "chars": 269,
    "preview": "#!/usr/bin/env node\n\nconst fs = require('fs');\nconst path = require('path');\n\nconst file = path.resolve(__dirname, 'lice"
  },
  {
    "path": "package.json",
    "chars": 2476,
    "preview": "{\n  \"name\": \"reader\",\n  \"scripts\": {\n    \"lint\": \"eslint --ext .js,.ts .\",\n    \"build\": \"node ./integrity-check.cjs && t"
  },
  {
    "path": "public/robots.txt",
    "chars": 26,
    "preview": "User-Agent: *\nDisallow: /\n"
  },
  {
    "path": "src/api/crawler.ts",
    "chars": 54141,
    "preview": "import { singleton } from 'tsyringe';\nimport { pathToFileURL } from 'url';\nimport { randomUUID } from 'crypto';\nimport _"
  },
  {
    "path": "src/api/searcher.ts",
    "chars": 33970,
    "preview": "import { singleton } from 'tsyringe';\nimport {\n    assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureE"
  },
  {
    "path": "src/api/serp.ts",
    "chars": 20911,
    "preview": "import { singleton } from 'tsyringe';\nimport {\n    RPCHost, RPCReflection, assignMeta, RawString,\n    ParamValidationErr"
  },
  {
    "path": "src/cloud-functions/adaptive-crawler.ts",
    "chars": 20292,
    "preview": "import {\n    AssertionFailureError,\n    assignTransferProtocolMeta,\n    HashManager,\n    ParamValidationError,\n    RPCHo"
  },
  {
    "path": "src/cloud-functions/data-crunching.ts",
    "chars": 9919,
    "preview": "import {\n    Defer,\n    PromiseThrottle,\n    RPCHost,\n} from 'civkit';\nimport { singleton } from 'tsyringe';\nimport {\n  "
  },
  {
    "path": "src/db/adaptive-crawl-task.ts",
    "chars": 1701,
    "preview": "import { Also, Prop, parseJSONText } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\nimport _ "
  },
  {
    "path": "src/db/crawled.ts",
    "chars": 1519,
    "preview": "import { Also, parseJSONText, Prop } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\nimport _ "
  },
  {
    "path": "src/db/domain-blockade.ts",
    "chars": 520,
    "preview": "import { Also, Prop } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\n\n@Also({\n    dictOf: Obj"
  },
  {
    "path": "src/db/domain-profile.ts",
    "chars": 582,
    "preview": "import { Also, Prop } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\nimport { ENGINE_TYPE } f"
  },
  {
    "path": "src/db/img-alt.ts",
    "chars": 641,
    "preview": "import { Also, Prop } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\nimport _ from 'lodash';\n"
  },
  {
    "path": "src/db/pdf.ts",
    "chars": 1290,
    "preview": "import { Also, Prop, parseJSONText } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\nimport _ "
  },
  {
    "path": "src/db/searched.ts",
    "chars": 1466,
    "preview": "import { Also, parseJSONText, Prop } from 'civkit';\nimport { FirestoreRecord } from '../shared/lib/firestore';\nimport _ "
  },
  {
    "path": "src/dto/adaptive-crawler-options.ts",
    "chars": 1811,
    "preview": "import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit';\nimport type { Request, Response } from 'express"
  },
  {
    "path": "src/dto/crawler-options.ts",
    "chars": 26542,
    "preview": "import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';\nimport { FancyFil"
  },
  {
    "path": "src/dto/jina-embeddings-auth.ts",
    "chars": 8899,
    "preview": "import _ from 'lodash';\nimport {\n    Also, AuthenticationFailedError, AuthenticationRequiredError,\n    RPC_CALL_ENVIRONM"
  },
  {
    "path": "src/dto/turndown-tweakable-options.ts",
    "chars": 1717,
    "preview": "import { AutoCastable, Prop } from 'civkit/civ-rpc';\nimport {Context} from '../services/registry';\nimport _ from 'lodash"
  },
  {
    "path": "src/fetch.d.ts",
    "chars": 286,
    "preview": "declare global {\n    export const {\n        fetch,\n        FormData,\n        Headers,\n        Request,\n        Response,"
  },
  {
    "path": "src/lib/transform-server-event-stream.ts",
    "chars": 4933,
    "preview": "import { TPM, parseJSONText } from 'civkit';\nimport { Transform, TransformCallback, TransformOptions } from 'stream';\n\ne"
  },
  {
    "path": "src/services/alt-text.ts",
    "chars": 5499,
    "preview": "import { AssertionFailureError, AsyncService, HashManager } from 'civkit';\nimport { singleton } from 'tsyringe';\nimport "
  },
  {
    "path": "src/services/async-context.ts",
    "chars": 319,
    "preview": "import { GlobalAsyncContext } from 'civkit/async-context';\nimport { container, singleton } from 'tsyringe';\n\n@singleton("
  },
  {
    "path": "src/services/blackhole-detector.ts",
    "chars": 2499,
    "preview": "import { singleton } from 'tsyringe';\nimport { AsyncService } from 'civkit/async-service';\nimport { GlobalLogger } from "
  },
  {
    "path": "src/services/brave-search.ts",
    "chars": 7406,
    "preview": "import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike "
  },
  {
    "path": "src/services/canvas.ts",
    "chars": 7561,
    "preview": "import { singleton, container } from 'tsyringe';\nimport { AsyncService, mimeOf, ParamValidationError, SubmittedDataMalfo"
  },
  {
    "path": "src/services/cf-browser-rendering.ts",
    "chars": 1725,
    "preview": "import { container, singleton } from 'tsyringe';\nimport { AsyncService } from 'civkit/async-service';\nimport { SecretExp"
  },
  {
    "path": "src/services/curl.ts",
    "chars": 18847,
    "preview": "import { AsyncService } from 'civkit/async-service';\nimport { singleton } from 'tsyringe';\n\nimport { Curl, CurlCode, Cur"
  },
  {
    "path": "src/services/errors.ts",
    "chars": 1365,
    "preview": "import { ApplicationError, StatusCode } from 'civkit/civ-rpc';\nimport _ from 'lodash';\nimport dayjs from 'dayjs';\nimport"
  },
  {
    "path": "src/services/finalizer.ts",
    "chars": 1151,
    "preview": "import { AbstractFinalizerService } from 'civkit/finalizer';\nimport { container, singleton } from 'tsyringe';\nimport { i"
  },
  {
    "path": "src/services/geoip.ts",
    "chars": 3289,
    "preview": "import { container, singleton } from 'tsyringe';\nimport fsp from 'fs/promises';\nimport { CityResponse, Reader } from 'ma"
  },
  {
    "path": "src/services/jsdom.ts",
    "chars": 15555,
    "preview": "import { container, singleton } from 'tsyringe';\nimport { GlobalLogger } from './logger';\nimport { ExtendedSnapshot, Img"
  },
  {
    "path": "src/services/lm.ts",
    "chars": 5163,
    "preview": "import { AsyncService } from 'civkit/async-service';\nimport { singleton } from 'tsyringe';\n\nimport { PageSnapshot } from"
  },
  {
    "path": "src/services/logger.ts",
    "chars": 1764,
    "preview": "import { AbstractPinoLogger } from 'civkit/pino-logger';\nimport { singleton, container } from 'tsyringe';\nimport { threa"
  },
  {
    "path": "src/services/minimal-stealth.js",
    "chars": 25790,
    "preview": "\n\nexport function minimalStealth() {\n    /**\n     * A set of shared utility functions specifically for the purpose of mo"
  },
  {
    "path": "src/services/misc.ts",
    "chars": 3623,
    "preview": "import { singleton } from 'tsyringe';\nimport { AsyncService } from 'civkit/async-service';\nimport { ParamValidationError"
  },
  {
    "path": "src/services/pdf-extract.ts",
    "chars": 13643,
    "preview": "import { singleton } from 'tsyringe';\nimport _ from 'lodash';\nimport { TextItem } from 'pdfjs-dist/types/src/display/api"
  },
  {
    "path": "src/services/pseudo-transfer.ts",
    "chars": 1938,
    "preview": "import { marshalErrorLike } from 'civkit';\nimport { AbstractPseudoTransfer, SYM_PSEUDO_TRANSFERABLE } from 'civkit/pseud"
  },
  {
    "path": "src/services/puppeteer.ts",
    "chars": 48794,
    "preview": "import _ from 'lodash';\nimport { isIP } from 'net';\nimport { readFile } from 'fs/promises';\nimport fs from 'fs';\nimport "
  },
  {
    "path": "src/services/registry.ts",
    "chars": 2064,
    "preview": "import { propertyInjectorFactory } from 'civkit/property-injector';\nimport { KoaRPCRegistry } from 'civkit/civ-rpc/koa';"
  },
  {
    "path": "src/services/robots-text.ts",
    "chars": 4400,
    "preview": "import { singleton } from 'tsyringe';\nimport { URL } from 'url';\nimport { AssertionFailureError, DownstreamServiceFailur"
  },
  {
    "path": "src/services/serp/compat.ts",
    "chars": 280,
    "preview": "export interface WebSearchEntry {\n    link: string;\n    title: string;\n    source?: string;\n    date?: string;\n    snipp"
  },
  {
    "path": "src/services/serp/google.ts",
    "chars": 11267,
    "preview": "import { singleton } from 'tsyringe';\nimport { AsyncService } from 'civkit/async-service';\nimport { GlobalLogger } from "
  },
  {
    "path": "src/services/serp/internal.ts",
    "chars": 2357,
    "preview": "\nimport { singleton } from 'tsyringe';\nimport { GlobalLogger } from '../logger';\nimport { SecretExposer } from '../../sh"
  },
  {
    "path": "src/services/serp/puppeteer.ts",
    "chars": 25396,
    "preview": "import _ from 'lodash';\nimport { readFile } from 'fs/promises';\nimport { container, singleton } from 'tsyringe';\n\nimport"
  },
  {
    "path": "src/services/serp/serper.ts",
    "chars": 5779,
    "preview": "\nimport { singleton } from 'tsyringe';\nimport { GlobalLogger } from '../logger';\nimport { SecretExposer } from '../../sh"
  },
  {
    "path": "src/services/serper-search.ts",
    "chars": 8335,
    "preview": "import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike "
  },
  {
    "path": "src/services/snapshot-formatter.ts",
    "chars": 37302,
    "preview": "import { randomUUID } from 'crypto';\nimport { container, singleton } from 'tsyringe';\nimport { AssertionFailureError, As"
  },
  {
    "path": "src/services/temp-file.ts",
    "chars": 550,
    "preview": "import { AbstractTempFileManger } from 'civkit/temp';\nimport { rm } from 'fs/promises';\nimport { singleton } from 'tsyri"
  },
  {
    "path": "src/services/threaded.ts",
    "chars": 2118,
    "preview": "import 'reflect-metadata';\n\nimport { singleton, container } from 'tsyringe';\nimport { AbstractThreadedServiceRegistry } "
  },
  {
    "path": "src/stand-alone/crawl.ts",
    "chars": 5557,
    "preview": "import 'reflect-metadata';\nimport { container, singleton } from 'tsyringe';\n\nimport { KoaServer } from 'civkit/civ-rpc/k"
  },
  {
    "path": "src/stand-alone/search.ts",
    "chars": 5781,
    "preview": "import 'reflect-metadata';\nimport { container, singleton } from 'tsyringe';\n\nimport { KoaServer } from 'civkit/civ-rpc/k"
  },
  {
    "path": "src/stand-alone/serp.ts",
    "chars": 5877,
    "preview": "import 'reflect-metadata';\nimport { container, singleton } from 'tsyringe';\n\nimport { KoaServer } from 'civkit/civ-rpc/k"
  },
  {
    "path": "src/types.d.ts",
    "chars": 767,
    "preview": "declare module 'langdetect' {\n    interface DetectionResult {\n        lang: string;\n        prob: number;\n    }\n\n    exp"
  },
  {
    "path": "src/utils/encoding.ts",
    "chars": 958,
    "preview": "import { createReadStream } from 'fs';\nimport { Readable } from 'stream';\nimport { TextDecoderStream } from 'stream/web'"
  },
  {
    "path": "src/utils/get-function-url.ts",
    "chars": 875,
    "preview": "import { GoogleAuth } from 'google-auth-library';\n\n/**\n * Get the URL of a given v2 cloud function.\n *\n * @param {string"
  },
  {
    "path": "src/utils/ip.ts",
    "chars": 4013,
    "preview": "import { isIPv4, isIPv6 } from 'net';\n\nexport function parseIp(ip: string): Buffer {\n    if (isIPv4(ip)) {\n        const"
  },
  {
    "path": "src/utils/markdown.ts",
    "chars": 1775,
    "preview": "\nexport function tidyMarkdown(markdown: string): string {\n\n    // Step 1: Handle complex broken links with text and opti"
  },
  {
    "path": "src/utils/misc.ts",
    "chars": 621,
    "preview": "import { ParamValidationError } from 'civkit';\n\nexport function cleanAttribute(attribute: string | null) {\n    return at"
  },
  {
    "path": "src/utils/tailwind-classes.ts",
    "chars": 208441,
    "preview": "export const tailwindClasses = new Set([\n    \"aspect-auto\",\n    \"aspect-square\",\n    \"aspect-video\",\n    \"container\",\n  "
  },
  {
    "path": "tsconfig.json",
    "chars": 560,
    "preview": "{\n    \"compilerOptions\": {\n        \"module\": \"node16\",\n\n        \"noImplicitReturns\": true,\n        \"noUnusedLocals\": tru"
  }
]

About this extraction

This page contains the full source code of the jina-ai/reader GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 72 files (713.6 KB), approximately 186.7k tokens, and a symbol index with 396 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — a free GitHub repo-to-text converter for AI. Built by Nikandr Surkov.

Extract another repo