Repository: website-scraper/website-scraper-puppeteer
Branch: master
Commit: 5d55fe463900
Files: 18
Total size: 20.1 KB

Directory structure:
gitextract_5f61e0q7/

├── .eslintrc.yml
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── codeql.yml
│       ├── node.js.yml
│       ├── publish.yml
│       ├── sponsors.yml
│       └── stale.yml
├── .gitignore
├── LICENSE
├── README.md
├── lib/
│   ├── browserUtils/
│   │   ├── .eslintrc.yml
│   │   └── scrollToBottom.js
│   ├── index.js
│   └── logger.js
├── package.json
└── test/
    ├── mock/
    │   ├── index.html
    │   └── navigation.html
    └── puppeteer-plugin.test.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .eslintrc.yml
================================================
extends: "eslint:recommended"
parserOptions:
  ecmaVersion: 8
  sourceType: "module"
env:
  node: true
  es6: true
rules:
  consistent-return: "error"
  curly: "error"
  default-case: "error"
  dot-notation: "error"
  eqeqeq: "error"
  no-extend-native: "error"
  no-implicit-coercion: "error"
  no-loop-func: "error"
  no-multi-spaces: "error"
  no-throw-literal: "error"
  global-require: "error"
  no-path-concat: "error"
  brace-style: ["error", "1tbs", {allowSingleLine: true}]
  camelcase: "error"
  consistent-this: ["error", "self"]
  indent: ["error", "tab", {SwitchCase: 1}]
  linebreak-style: ["error", "unix"]
  eol-last: "error"
  quotes: ["error", "single"]
  semi: "error"
  space-infix-ops: "error"
  space-unary-ops: "error"
  func-names: "warn"
  space-before-function-paren: "warn"
  no-spaced-func: "warn"
  keyword-spacing: "error"
  space-before-blocks: "error"
  no-console: "error"


================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
  - package-ecosystem: "npm"
    directory: "/"
    assignees:
      - "s0ph1e"
    open-pull-requests-limit: 10
    schedule:
      interval: "weekly"
  - package-ecosystem: "github-actions"
    directory: "/"
    assignees:
      - "aivus"
    schedule:
      interval: "weekly"


================================================
FILE: .github/workflows/codeql.yml
================================================
name: "CodeQL"

on:
  push:
    branches: ["master"]
  pull_request:
    branches: ["master"]
  schedule:
    - cron: '0 1 * * 2'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      fail-fast: false
      matrix:
        language: [ 'javascript' ]

    steps:
    - name: Checkout repository
      uses: actions/checkout@v6

    - name: Initialize CodeQL
      uses: github/codeql-action/init@v4
      with:
        languages: ${{ matrix.language }}
        
    - name: Autobuild
      uses: github/codeql-action/autobuild@v4

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v4
      with:
        category: "/language:${{matrix.language}}"


================================================
FILE: .github/workflows/node.js.yml
================================================
name: Node.js CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]
  schedule:
    - cron: '17 2 * * *'
  workflow_dispatch: ~

jobs:
  test:
    timeout-minutes: 10
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        node-version:
          - 20
          - 22
          - 24
          - current
        os:
          - ubuntu-latest
          - windows-latest
        include:
          - node-version: 24
            os: macos-latest

    steps:
      - uses: actions/checkout@v6
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v6
        with:
          node-version: ${{ matrix.node-version }}
      - run: npm i
      - name: Disable AppArmor
        if: ${{ matrix.os == 'ubuntu-latest' }}
        run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns
      - run: npm test
      - run: npm run eslint
        if: ${{ matrix.node-version == '24' && matrix.os == 'ubuntu-latest' }}
      - name: Publish Qlty code coverage
        if: ${{ matrix.node-version == '24' && matrix.os == 'ubuntu-latest' }}
        uses: qltysh/qlty-action/coverage@v2
        with:
          token: ${{ secrets.QLTY_COVERAGE_TOKEN }}
          files: coverage/lcov.info


================================================
FILE: .github/workflows/publish.yml
================================================
name: Create a tag and publish to npm

on:
  workflow_dispatch:
    inputs:
      bump:
        description: 'Version bump type'
        type: choice
        required: true
        default: 'minor'
        options:
          - patch
          - minor
          - major
          - premajor
          - preminor
          - prepatch
          - prerelease
      preid:
        description: 'Prerelease identifier (see "npm version") for pre* bumps'
        type: string
        required: false
      npmPreTag:
        description: 'NPM tag used for all pre* bumps'
        type: string
        default: 'next'
        required: false
      dryRun:
        description: 'Run in "dry run" mode'
        type: boolean
        default: false
        required: true

permissions:
  id-token: write
  contents: write

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Required to allow push to master without the checks/PR
          token: ${{ secrets.GH_PUSH_TOKEN }}

      - name: Set up Node.js
        uses: actions/setup-node@v6
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'

      - name: Update npm
        run: npm install -g npm@latest

      - name: Install dependencies
        run: npm install

      - name: List of installed dependencies
        run: npm ls -a

      - name: Verifying provenance attestations
        run: npm audit signatures

      - name: Disable AppArmor
        run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns

      - name: Run tests
        run: npm test

      - name: Bump version and create tag
        id: bump-version
        env:
          PREID_FLAG: ${{ startsWith(inputs.bump, 'pre') && inputs.preid && format('--preid {0}', inputs.preid) || '' }}
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          TAG=$(npm version ${{ github.event.inputs.bump }} $PREID_FLAG -m "Release %s")
          echo "Created tag: $TAG"
          echo "tag=$TAG" >> "$GITHUB_OUTPUT"

      - name: Publish to npm
        env:
          DRY_RUN_FLAG: ${{ inputs.dryRun && '--dry-run' || '' }}
          TAG_FLAG: ${{ startsWith(inputs.bump, 'pre') && format('--tag {0}', inputs.npmPreTag) || ''}}
        run: npm publish --provenance --access=public $DRY_RUN_FLAG $TAG_FLAG

      - name: Push changes to master
        if: ${{ !inputs.dryRun }}
        run: |
          git push origin master --follow-tags

      - name: Create GitHub Release
        if: ${{ !inputs.dryRun }}
        uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3.0.0
        with:
          tag_name: ${{ steps.bump-version.outputs.tag }}
          generate_release_notes: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .github/workflows/sponsors.yml
================================================
name: Generate Sponsors list
on:
  workflow_dispatch:
  schedule:
    - cron: '31 6 * * *'
permissions:
  contents: write
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout 🛎️
        uses: actions/checkout@v6
        with:
          # Required to allow push to master without the checks/PR
          token: ${{ secrets.GH_PUSH_TOKEN }}

      - name: Generate Sponsors 💖
        uses: JamesIves/github-sponsors-readme-action@2fd9142e765f755780202122261dc85e78459405 # v1.6.0
        with:
          token: ${{ secrets.SOFIIA_SPONSORS_READ_TOKEN }}
          file: 'README.md'
          active-only: false
          include-private: true

      - name: Commit changes
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add README.md
          
          # Check if there are any staged changes
          if git diff --cached --quiet; then
            echo "No changes to commit."
            exit 0
          fi
          
          git commit -m "Update list of sponsors"
          git push origin master


================================================
FILE: .github/workflows/stale.yml
================================================
# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
#
# You can adjust the behavior by modifying this file.
# For more information, see:
# https://github.com/actions/stale
name: Mark stale issues and pull requests

on:
  workflow_dispatch: ~
  schedule:
    - cron: '39 3 * * *'

jobs:
  stale:
    runs-on: ubuntu-latest
    permissions:
      issues: write

    steps:
      - uses: actions/stale@v10
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          days-before-stale: 60
          days-before-close: 7
          
          # Do not stale PRs
          days-before-pr-stale: -1
          days-before-pr-close: -1
          
          exempt-issue-labels: 'bug,maybe-later,help wanted'
          stale-issue-message: 'This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.'
          stale-issue-label: 'wontfix'
          # debug-only: true


================================================
FILE: .gitignore
================================================
node_modules
package-lock.json
.idea
coverage


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2018-2023 Sofiia Antypenko

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
[![Version](https://img.shields.io/npm/v/website-scraper-puppeteer.svg?style=flat)](https://www.npmjs.org/package/website-scraper-puppeteer)
[![Downloads](https://img.shields.io/npm/dm/website-scraper-puppeteer.svg?style=flat)](https://www.npmjs.org/package/website-scraper-puppeteer)
[![Node.js CI](https://github.com/website-scraper/website-scraper-puppeteer/actions/workflows/node.js.yml/badge.svg)](https://github.com/website-scraper/website-scraper-puppeteer)
[![Code Coverage](https://qlty.sh/gh/website-scraper/projects/website-scraper-puppeteer/coverage.svg)](https://qlty.sh/gh/website-scraper/projects/website-scraper-puppeteer)

# website-scraper-puppeteer
Plugin for [website-scraper](https://github.com/website-scraper/node-website-scraper) which returns html for dynamic websites using [puppeteer](https://github.com/puppeteer/puppeteer).

## Sponsors
Maintenance of this project is made possible by all the [contributors](https://github.com/website-scraper/website-scraper-puppeteer/graphs/contributors) and [sponsors](https://github.com/sponsors/s0ph1e).
If you'd like to sponsor this project and have your avatar or company logo appear below [click here](https://github.com/sponsors/s0ph1e). 💖

<!-- sponsors --><a href="https://github.com/aivus"><img src="https:&#x2F;&#x2F;github.com&#x2F;aivus.png" width="60px" alt="User avatar: Illia Antypenko" /></a><a href="https://github.com/swissspidy"><img src="https:&#x2F;&#x2F;github.com&#x2F;swissspidy.png" width="60px" alt="User avatar: Pascal Birchler" /></a><a href="https://github.com/itscarlosrufo"><img src="https:&#x2F;&#x2F;github.com&#x2F;itscarlosrufo.png" width="60px" alt="User avatar: Carlos Rufo" /></a><a href="https://github.com/francescamarano"><img src="https:&#x2F;&#x2F;github.com&#x2F;francescamarano.png" width="60px" alt="User avatar: Francesca Marano" /></a><a href="https://github.com/github"><img src="https:&#x2F;&#x2F;github.com&#x2F;github.png" width="60px" alt="User avatar: GitHub" /></a><a href="https://github.com/Belrestro"><img src="https:&#x2F;&#x2F;github.com&#x2F;Belrestro.png" width="60px" alt="User avatar: Andrew Vorobiov" /></a><a href="https://github.com/Effiezhu"><img src="https:&#x2F;&#x2F;github.com&#x2F;Effiezhu.png" width="60px" alt="User avatar: " /></a><a href="https://github.com/slicemedia"><img src="https:&#x2F;&#x2F;github.com&#x2F;slicemedia.png" width="60px" alt="User avatar: " /></a><!-- sponsors -->

## Requirements
* nodejs version >= 20
* website-scraper version >= 5

## Installation
```sh
npm install website-scraper website-scraper-puppeteer
```

## Usage
```javascript
import scrape from 'website-scraper';
import PuppeteerPlugin from 'website-scraper-puppeteer';

await scrape({
    urls: ['https://www.instagram.com/gopro/'],
    directory: '/path/to/save',
    plugins: [ 
      new PuppeteerPlugin({
        launchOptions: { headless: "new" }, /* optional */
        gotoOptions: { waitUntil: "networkidle0" }, /* optional */
        scrollToBottom: { timeout: 10000, viewportN: 10 }, /* optional */
      })
    ]
});
```
Puppeteer plugin constructor accepts next params:
* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/main/docs/api/puppeteer.puppeteerlaunchoptions.md)
* `gotoOptions` - *(optional)* - puppeteer page.goto options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/main/docs/api/puppeteer.frame.goto.md#parameters)
* `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazyloading). Because some pages can be really endless, the scrolldown process can be interrupted before reaching the bottom when one or both of the bellow limitations are reached:
    * `timeout` - in milliseconds
    * `viewportN` - viewport height multiplier

## How it works
It starts Chromium in headless mode which just opens page and waits until page is loaded.
It is far from ideal because probably you need to wait until some resource is loaded or click some button or log in. Currently this module doesn't support such functionality.


================================================
FILE: lib/browserUtils/.eslintrc.yml
================================================
extends: '../../.eslintrc.yml'
env:
  browser: true


================================================
FILE: lib/browserUtils/scrollToBottom.js
================================================
export default async (timeout, viewportN) => {
	await new Promise((resolve) => {
		let totalHeight = 0, distance = 200, duration = 0, maxHeight = window.innerHeight * viewportN;
		const timer = setInterval(() => {
			duration += 200;
			window.scrollBy(0, distance);
			totalHeight += distance;
			if (totalHeight >= document.body.scrollHeight || duration >= timeout || totalHeight >= maxHeight) {
				clearInterval(timer);
				resolve();
			}
		}, 200);
	});
};


================================================
FILE: lib/index.js
================================================
import puppeteer from '@website-scraper/puppeteer-version-wrapper';
import logger from './logger.js';
import scrollToBottomBrowser from './browserUtils/scrollToBottom.js';

class PuppeteerPlugin {
	constructor ({
		launchOptions = {},
		gotoOptions = {},
		scrollToBottom = null,
	} = {}) {
		this.launchOptions = launchOptions;
		this.gotoOptions = gotoOptions;
		this.scrollToBottom = scrollToBottom;
		this.browser = null;
		this.headers = {};

		logger.info('init plugin', { launchOptions, scrollToBottom });
	}

	apply (registerAction) {
		registerAction('beforeStart', async () => {
			this.browser = await puppeteer.launch(this.launchOptions);
		});

		registerAction('beforeRequest', async ({requestOptions}) => {
			if (hasValues(requestOptions.headers)) {
				this.headers = Object.assign({}, requestOptions.headers);
			}
			return {requestOptions};
		});

		registerAction('afterResponse', async ({response}) => {
			const contentType = response.headers['content-type'];
			const isHtml = contentType && contentType.split(';')[0] === 'text/html';
			if (isHtml) {
				const url = response.url;
				const page = await this.browser.newPage();

				if (hasValues(this.headers)) {
					logger.info('set headers to puppeteer page', this.headers);
					await page.setExtraHTTPHeaders(this.headers);
				}

				await page.goto(url, this.gotoOptions);

				if (this.scrollToBottom) {
					await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
				}

				const content = await page.content();
				await page.close();

				// convert utf-8 -> binary string because website-scraper needs binary
				return Buffer.from(content).toString('binary');
			} else {
				return response.body;
			}
		});

		registerAction('afterFinish', () => this.browser && this.browser.close());
	}
}

function hasValues (obj) {
	return obj && Object.keys(obj).length > 0;
}


async function scrollToBottom (page, timeout, viewportN) {
	logger.info(`scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`);

	await page.evaluate(scrollToBottomBrowser, timeout, viewportN);
}

export default PuppeteerPlugin;


================================================
FILE: lib/logger.js
================================================
import debug from 'debug';

const appName = 'website-scraper-puppeteer';
const logLevels = ['error', 'warn', 'info', 'debug', 'log'];

const logger = {};
logLevels.forEach(logLevel => {
	logger[logLevel] = debug(`${appName}:${logLevel}`);
});

export default logger;


================================================
FILE: package.json
================================================
{
  "name": "website-scraper-puppeteer",
  "version": "2.0.0",
  "description": "Plugin for website-scraper which returns html for dynamic websites using puppeteer",
  "readmeFilename": "README.md",
  "type": "module",
  "exports": {
    ".": "./lib/index.js"
  },
  "keywords": [
    "website-scraper",
    "puppeteer",
    "chromium",
    "chrome",
    "headless",
    "html"
  ],
  "dependencies": {
    "debug": "^4.1.1",
    "@website-scraper/puppeteer-version-wrapper": "^1.0.0"
  },
  "peerDependencies": {
    "website-scraper": "^6.0.0"
  },
  "devDependencies": {
    "c8": "^11.0.0",
    "chai": "^6.0.1",
    "eslint": "^8.5.0",
    "finalhandler": "^2.1.0",
    "fs-extra": "^11.1.0",
    "mocha": "^11.0.1",
    "serve-static": "^2.2.0",
    "website-scraper": "^6.0.0"
  },
  "scripts": {
    "test": "c8 --all --reporter=text --reporter=lcov mocha --recursive --timeout 15000",
    "eslint": "eslint lib/**"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/website-scraper/website-scraper-puppeteer.git"
  },
  "author": "Sofiia Antypenko <sofiia@antypenko.dev>",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/website-scraper/website-scraper-puppeteer/issues"
  },
  "homepage": "https://github.com/website-scraper/website-scraper-puppeteer#readme",
  "files": [
    "lib"
  ],
  "engines": {
    "node": ">=20"
  }
}


================================================
FILE: test/mock/index.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Test</title>
</head>
<body>

<div id="root"></div>
<div id="special-characters-test"></div>

<script>
	window.onload = function() {
		document.getElementById('root').innerText = 'Hello world from JS!';
		/**
		 * TODO: Original innerText "저는 7년 동안 한국에서 살았어요. Слава Україні!" was changed due to issues
		 * with cheerio and website-scraper itself.
		 * See https://github.com/cheeriojs/cheerio/pull/2280
		 */
		document.getElementById('special-characters-test').innerText = '7년 동안 한국에서 살았어요. Слава Україні!';
	};
</script>

</body>
</html>


================================================
FILE: test/mock/navigation.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Test</title>
</head>
<body>

<div id="root"></div>

<script>
	window.onload = function() {
		window.location.replace('http://example.com');
		document.getElementById('root').innerText = 'Navigation blocked!';
	};
</script>

</body>
</html>

================================================
FILE: test/puppeteer-plugin.test.js
================================================
import { expect } from 'chai';
import http from 'http';
import finalhandler from 'finalhandler';
import serveStatic from 'serve-static';
import fs from 'fs-extra';
import scrape from 'website-scraper';
import PuppeteerPlugin from '../lib/index.js';

const directory = './test/tmp';
const SERVE_WEBSITE_PORT = 4567;

describe('Puppeteer plugin test', () => {
	let result, content, server;

	before('start webserver', () => server = startWebserver(SERVE_WEBSITE_PORT));
	after('stop webserver', () => server.close())

	describe('Dynamic content', () => {
		before('scrape website', async () => {
			result = await scrape({
				urls: [`http://localhost:${SERVE_WEBSITE_PORT}`],
				directory: directory,
				plugins: [ new PuppeteerPlugin({
					scrollToBottom: { timeout: 50, viewportN: 10 }
				}) ]
			});
		});
		before('get content from file', () => {
			content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
		});
		after('delete dir', () => fs.removeSync(directory));

		it('should have 1 item in result array', () => {
			expect(result.length).eql(1);
		});

		it('should render dymanic website', async () => {
			expect(content).to.contain('<div id="root">Hello world from JS!</div>');
		});

		it('should render special characters correctly', async () => {
			expect(content).to.contain('<div id="special-characters-test">7년 동안 한국에서 살았어요. Слава Україні!</div>');
		});
	});
});

function startWebserver(port = 3000) {
	const serve = serveStatic('./test/mock', {'index': ['index.html']});
	const server = http.createServer(function onRequest (req, res) {
		serve(req, res, finalhandler(req, res))
	});

	return server.listen(port)
}