Repository: website-scraper/website-scraper-puppeteer Branch: master Commit: 5d55fe463900 Files: 18 Total size: 20.1 KB Directory structure: gitextract_5f61e0q7/ ├── .eslintrc.yml ├── .github/ │ ├── dependabot.yml │ └── workflows/ │ ├── codeql.yml │ ├── node.js.yml │ ├── publish.yml │ ├── sponsors.yml │ └── stale.yml ├── .gitignore ├── LICENSE ├── README.md ├── lib/ │ ├── browserUtils/ │ │ ├── .eslintrc.yml │ │ └── scrollToBottom.js │ ├── index.js │ └── logger.js ├── package.json └── test/ ├── mock/ │ ├── index.html │ └── navigation.html └── puppeteer-plugin.test.js ================================================ FILE CONTENTS ================================================ ================================================ FILE: .eslintrc.yml ================================================ extends: "eslint:recommended" parserOptions: ecmaVersion: 8 sourceType: "module" env: node: true es6: true rules: consistent-return: "error" curly: "error" default-case: "error" dot-notation: "error" eqeqeq: "error" no-extend-native: "error" no-implicit-coercion: "error" no-loop-func: "error" no-multi-spaces: "error" no-throw-literal: "error" global-require: "error" no-path-concat: "error" brace-style: ["error", "1tbs", {allowSingleLine: true}] camelcase: "error" consistent-this: ["error", "self"] indent: ["error", "tab", {SwitchCase: 1}] linebreak-style: ["error", "unix"] eol-last: "error" quotes: ["error", "single"] semi: "error" space-infix-ops: "error" space-unary-ops: "error" func-names: "warn" space-before-function-paren: "warn" no-spaced-func: "warn" keyword-spacing: "error" space-before-blocks: "error" no-console: "error" ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: "npm" directory: "/" assignees: - "s0ph1e" open-pull-requests-limit: 10 schedule: interval: "weekly" - package-ecosystem: "github-actions" directory: "/" assignees: - "aivus" schedule: 
interval: "weekly" ================================================ FILE: .github/workflows/codeql.yml ================================================ name: "CodeQL" on: push: branches: ["master"] pull_request: branches: ["master"] schedule: - cron: '0 1 * * 2' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ 'javascript' ] steps: - name: Checkout repository uses: actions/checkout@v6 - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} - name: Autobuild uses: github/codeql-action/autobuild@v4 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}" ================================================ FILE: .github/workflows/node.js.yml ================================================ name: Node.js CI on: push: branches: [ master ] pull_request: branches: [ master ] schedule: - cron: '17 2 * * *' workflow_dispatch: ~ jobs: test: timeout-minutes: 10 runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: node-version: - 20 - 22 - 24 - current os: - ubuntu-latest - windows-latest include: - node-version: 24 os: macos-latest steps: - uses: actions/checkout@v6 - name: Use Node.js ${{ matrix.node-version }} uses: actions/setup-node@v6 with: node-version: ${{ matrix.node-version }} - run: npm i - name: Disable AppArmor if: ${{ matrix.os == 'ubuntu-latest' }} run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns - run: npm test - run: npm run eslint if: ${{ matrix.node-version == '24' && matrix.os == 'ubuntu-latest' }} - name: Publish Qlty code coverage if: ${{ matrix.node-version == '24' && matrix.os == 'ubuntu-latest' }} uses: qltysh/qlty-action/coverage@v2 with: token: ${{ secrets.QLTY_COVERAGE_TOKEN }} files: coverage/lcov.info ================================================ FILE: .github/workflows/publish.yml 
================================================ name: Create a tag and publish to npm on: workflow_dispatch: inputs: bump: description: 'Version bump type' type: choice required: true default: 'minor' options: - patch - minor - major - premajor - preminor - prepatch - prerelease preid: description: 'Prerelease identifier (see "npm version") for pre* bumps' type: string required: false npmPreTag: description: 'NPM tag used for all pre* bumps' type: string default: 'next' required: false dryRun: description: 'Run in "dry run" mode' type: boolean default: false required: true permissions: id-token: write contents: write jobs: publish: runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v6 with: # Required to allow push to master without the checks/PR token: ${{ secrets.GH_PUSH_TOKEN }} - name: Set up Node.js uses: actions/setup-node@v6 with: node-version: '24' registry-url: 'https://registry.npmjs.org' - name: Update npm run: npm install -g npm@latest - name: Install dependencies run: npm install - name: List of installed dependencies run: npm ls -a - name: Verifying provenance attestations run: npm audit signatures - name: Disable AppArmor run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns - name: Run tests run: npm test - name: Bump version and create tag id: bump-version env: PREID_FLAG: ${{ startsWith(inputs.bump, 'pre') && inputs.preid && format('--preid {0}', inputs.preid) || '' }} run: | git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" TAG=$(npm version ${{ github.event.inputs.bump }} $PREID_FLAG -m "Release %s") echo "Created tag: $TAG" echo "tag=$TAG" >> "$GITHUB_OUTPUT" - name: Publish to npm env: DRY_RUN_FLAG: ${{ inputs.dryRun && '--dry-run' || '' }} TAG_FLAG: ${{ startsWith(inputs.bump, 'pre') && format('--tag {0}', inputs.npmPreTag) || ''}} run: npm publish --provenance --access=public $DRY_RUN_FLAG $TAG_FLAG - name: Push 
changes to master if: ${{ !inputs.dryRun }} run: | git push origin master --follow-tags - name: Create GitHub Release if: ${{ !inputs.dryRun }} uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3.0.0 with: tag_name: ${{ steps.bump-version.outputs.tag }} generate_release_notes: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/sponsors.yml ================================================ name: Generate Sponsors list on: workflow_dispatch: schedule: - cron: '31 6 * * *' permissions: contents: write jobs: deploy: runs-on: ubuntu-latest steps: - name: Checkout 🛎️ uses: actions/checkout@v6 with: # Required to allow push to master without the checks/PR token: ${{ secrets.GH_PUSH_TOKEN }} - name: Generate Sponsors 💖 uses: JamesIves/github-sponsors-readme-action@2fd9142e765f755780202122261dc85e78459405 # v1.6.0 with: token: ${{ secrets.SOFIIA_SPONSORS_READ_TOKEN }} file: 'README.md' active-only: false include-private: true - name: Commit changes run: | git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" git add README.md # Check if there are any staged changes if git diff --cached --quiet; then echo "No changes to commit." exit 0 fi git commit -m "Update list of sponsors" git push origin master ================================================ FILE: .github/workflows/stale.yml ================================================ # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. # # You can adjust the behavior by modifying this file. 
# For more information, see: # https://github.com/actions/stale name: Mark stale issues and pull requests on: workflow_dispatch: ~ schedule: - cron: '39 3 * * *' jobs: stale: runs-on: ubuntu-latest permissions: issues: write steps: - uses: actions/stale@v10 with: repo-token: ${{ secrets.GITHUB_TOKEN }} days-before-stale: 60 days-before-close: 7 # Do not stale PRs days-before-pr-stale: -1 days-before-pr-close: -1 exempt-issue-labels: 'bug,maybe-later,help wanted' stale-issue-message: 'This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.' stale-issue-label: 'wontfix' # debug-only: true ================================================ FILE: .gitignore ================================================ node_modules package-lock.json .idea coverage ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018-2023 Sofiia Antypenko Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ [![Version](https://img.shields.io/npm/v/website-scraper-puppeteer.svg?style=flat)](https://www.npmjs.org/package/website-scraper-puppeteer) [![Downloads](https://img.shields.io/npm/dm/website-scraper-puppeteer.svg?style=flat)](https://www.npmjs.org/package/website-scraper-puppeteer) [![Node.js CI](https://github.com/website-scraper/website-scraper-puppeteer/actions/workflows/node.js.yml/badge.svg)](https://github.com/website-scraper/website-scraper-puppeteer) [![Code Coverage](https://qlty.sh/gh/website-scraper/projects/website-scraper-puppeteer/coverage.svg)](https://qlty.sh/gh/website-scraper/projects/website-scraper-puppeteer) # website-scraper-puppeteer Plugin for [website-scraper](https://github.com/website-scraper/node-website-scraper) which returns html for dynamic websites using [puppeteer](https://github.com/puppeteer/puppeteer). ## Sponsors Maintenance of this project is made possible by all the [contributors](https://github.com/website-scraper/website-scraper-puppeteer/graphs/contributors) and [sponsors](https://github.com/sponsors/s0ph1e). If you'd like to sponsor this project and have your avatar or company logo appear below [click here](https://github.com/sponsors/s0ph1e). 
💖 User avatar: Illia AntypenkoUser avatar: Pascal BirchlerUser avatar: Carlos RufoUser avatar: Francesca MaranoUser avatar: GitHubUser avatar: Andrew VorobiovUser avatar: User avatar: ## Requirements * nodejs version >= 20 * website-scraper version >= 5 ## Installation ```sh npm install website-scraper website-scraper-puppeteer ``` ## Usage ```javascript import scrape from 'website-scraper'; import PuppeteerPlugin from 'website-scraper-puppeteer'; await scrape({ urls: ['https://www.instagram.com/gopro/'], directory: '/path/to/save', plugins: [ new PuppeteerPlugin({ launchOptions: { headless: "new" }, /* optional */ gotoOptions: { waitUntil: "networkidle0" }, /* optional */ scrollToBottom: { timeout: 10000, viewportN: 10 }, /* optional */ }) ] }); ``` The Puppeteer plugin constructor accepts the following params: * `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/main/docs/api/puppeteer.puppeteerlaunchoptions.md) * `gotoOptions` - *(optional)* - puppeteer page.goto options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/main/docs/api/puppeteer.frame.goto.md#parameters) * `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazy loading). Because some pages can be practically endless, the scroll-down process can be interrupted before reaching the bottom when one or both of the limitations below are reached: * `timeout` - in milliseconds * `viewportN` - viewport height multiplier ## How it works It starts Chromium in headless mode, which just opens the page and waits until the page is loaded. This is far from ideal because you may need to wait until some resource is loaded, click a button, or log in. Currently this module doesn't support such functionality. 
================================================ FILE: lib/browserUtils/.eslintrc.yml ================================================ extends: '../../.eslintrc.yml' env: browser: true ================================================ FILE: lib/browserUtils/scrollToBottom.js ================================================ export default async (timeout, viewportN) => { await new Promise((resolve) => { let totalHeight = 0, distance = 200, duration = 0, maxHeight = window.innerHeight * viewportN; const timer = setInterval(() => { duration += 200; window.scrollBy(0, distance); totalHeight += distance; if (totalHeight >= document.body.scrollHeight || duration >= timeout || totalHeight >= maxHeight) { clearInterval(timer); resolve(); } }, 200); }); }; ================================================ FILE: lib/index.js ================================================ import puppeteer from '@website-scraper/puppeteer-version-wrapper'; import logger from './logger.js'; import scrollToBottomBrowser from './browserUtils/scrollToBottom.js'; class PuppeteerPlugin { constructor ({ launchOptions = {}, gotoOptions = {}, scrollToBottom = null, } = {}) { this.launchOptions = launchOptions; this.gotoOptions = gotoOptions; this.scrollToBottom = scrollToBottom; this.browser = null; this.headers = {}; logger.info('init plugin', { launchOptions, scrollToBottom }); } apply (registerAction) { registerAction('beforeStart', async () => { this.browser = await puppeteer.launch(this.launchOptions); }); registerAction('beforeRequest', async ({requestOptions}) => { if (hasValues(requestOptions.headers)) { this.headers = Object.assign({}, requestOptions.headers); } return {requestOptions}; }); registerAction('afterResponse', async ({response}) => { const contentType = response.headers['content-type']; const isHtml = contentType && contentType.split(';')[0] === 'text/html'; if (isHtml) { const url = response.url; const page = await this.browser.newPage(); if (hasValues(this.headers)) { 
logger.info('set headers to puppeteer page', this.headers); await page.setExtraHTTPHeaders(this.headers); } await page.goto(url, this.gotoOptions); if (this.scrollToBottom) { await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN); } const content = await page.content(); await page.close(); // convert utf-8 -> binary string because website-scraper needs binary return Buffer.from(content).toString('binary'); } else { return response.body; } }); registerAction('afterFinish', () => this.browser && this.browser.close()); } } function hasValues (obj) { return obj && Object.keys(obj).length > 0; } async function scrollToBottom (page, timeout, viewportN) { logger.info(`scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`); await page.evaluate(scrollToBottomBrowser, timeout, viewportN); } export default PuppeteerPlugin; ================================================ FILE: lib/logger.js ================================================ import debug from 'debug'; const appName = 'website-scraper-puppeteer'; const logLevels = ['error', 'warn', 'info', 'debug', 'log']; const logger = {}; logLevels.forEach(logLevel => { logger[logLevel] = debug(`${appName}:${logLevel}`); }); export default logger; ================================================ FILE: package.json ================================================ { "name": "website-scraper-puppeteer", "version": "2.0.0", "description": "Plugin for website-scraper which returns html for dynamic websites using puppeteer", "readmeFilename": "README.md", "type": "module", "exports": { ".": "./lib/index.js" }, "keywords": [ "website-scraper", "puppeteer", "chromium", "chrome", "headless", "html" ], "dependencies": { "debug": "^4.1.1", "@website-scraper/puppeteer-version-wrapper": "^1.0.0" }, "peerDependencies": { "website-scraper": "^6.0.0" }, "devDependencies": { "c8": "^11.0.0", "chai": "^6.0.1", "eslint": "^8.5.0", "finalhandler": "^2.1.0", "fs-extra": "^11.1.0", "mocha": 
"^11.0.1", "serve-static": "^2.2.0", "website-scraper": "^6.0.0" }, "scripts": { "test": "c8 --all --reporter=text --reporter=lcov mocha --recursive --timeout 15000", "eslint": "eslint lib/**" }, "repository": { "type": "git", "url": "git+https://github.com/website-scraper/website-scraper-puppeteer.git" }, "author": "Sofiia Antypenko ", "license": "MIT", "bugs": { "url": "https://github.com/website-scraper/website-scraper-puppeteer/issues" }, "homepage": "https://github.com/website-scraper/website-scraper-puppeteer#readme", "files": [ "lib" ], "engines": { "node": ">=20" } } ================================================ FILE: test/mock/index.html ================================================ Test
================================================ FILE: test/mock/navigation.html ================================================ Test
================================================
FILE: test/puppeteer-plugin.test.js
================================================
import { expect } from 'chai';
import http from 'http';
import finalhandler from 'finalhandler';
import serveStatic from 'serve-static';
import fs from 'fs-extra';
import scrape from 'website-scraper';
import PuppeteerPlugin from '../lib/index.js';

// Directory the scraped site is written to; removed again after the inner suite.
const directory = './test/tmp';
// Port of the local static webserver started for the tests.
const SERVE_WEBSITE_PORT = 4567;

// Integration test: serves ./test/mock over http, scrapes it through website-scraper
// with PuppeteerPlugin enabled and checks the saved file.
describe('Puppeteer plugin test', () => {
	let result, content, server;

	before('start webserver', () => server = startWebserver(SERVE_WEBSITE_PORT));
	after('stop webserver', () => server.close())

	describe('Dynamic content', () => {
		// Scrape the local site once; the assertions below share the result.
		before('scrape website', async () => {
			result = await scrape({
				urls: [`http://localhost:${SERVE_WEBSITE_PORT}`],
				directory: directory,
				plugins: [
					new PuppeteerPlugin({
						scrollToBottom: { timeout: 50, viewportN: 10 }
					})
				]
			});
		});
		// Read the file saved for the first scraped resource.
		before('get content from file', () => {
			content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
		});
		after('delete dir', () => fs.removeSync(directory));

		it('should have 1 item in result array', () => {
			expect(result.length).eql(1);
		});

		// NOTE(review): the expected strings in the two tests below span several lines in
		// this copy and look like markup was stripped from them - confirm against test/mock.
		it('should render dymanic website', async () => {
			expect(content).to.contain('
Hello world from JS!
');
		});

		it('should render special characters correctly', async () => {
			expect(content).to.contain('
7년 동안 한국에서 살았어요. Слава Україні!
');
		});
	});
});

/**
 * Starts an http server which serves the static files from ./test/mock,
 * using index.html as the directory index.
 *
 * @param {number} [port=3000] - port to listen on
 * @returns {object} the value returned by server.listen(port)
 */
function startWebserver(port = 3000) {
	const serve = serveStatic('./test/mock', {'index': ['index.html']});
	const server = http.createServer(function onRequest (req, res) {
		// finalhandler produces the final response when serve-static does not handle the request
		serve(req, res, finalhandler(req, res))
	});

	return server.listen(port)
}