Full Code of mozilla/readability for AI

main 08be6b4bdb20 cached

415 files

27.1 MB

7.1M tokens

142 symbols

1 requests

Copy disabled (too large) Download .txt

Showing preview only (28,451K chars total). Download the full file to get everything.

Repository: mozilla/readability
Branch: main
Commit: 08be6b4bdb20
Files: 415
Total size: 27.1 MB

Directory structure:
gitextract_3shkyv0s/

├── .gitattributes
├── .gitignore
├── .npmignore
├── .prettierrc.js
├── .release-it.json
├── .taskcluster.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── JSDOMParser.js
├── LICENSE.md
├── README.md
├── Readability-readerable.js
├── Readability.js
├── SECURITY.md
├── eslint.config.mjs
├── index.d.ts
├── index.js
├── package.json
└── test/
    ├── debug-testcase.js
    ├── generate-testcase.js
    ├── test-isProbablyReaderable.js
    ├── test-jsdomparser.js
    ├── test-pages/
    │   ├── 001/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 002/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 003-metadata-preferred/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 004-metadata-space-separated-properties/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 005-unescape-html-entities/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── aclu/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── aktualne/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── archive-of-our-own/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ars-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── article-author-tag/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── base-url/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── base-url-base-element/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── base-url-base-element-relative/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── basic-tags-cleaning/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── bbc-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── blogger/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── breitbart/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── bug-1255978/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── buzzfeed-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── citylab-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── clean-links/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── cnet/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── cnet-svg-classes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── cnn/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── comment-inside-script-parsing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── daringfireball-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── data-url-image/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── dev418/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── dropbox-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ebb-org/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ehow-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ehow-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── embedded-videos/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── engadget/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── firefox-nightly-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── folha/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── gitlab-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── gmw/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── google-sre-book-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── guardian-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── heise/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── herald-sun-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── hidden-nodes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── hukumusume/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── iab-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ietf-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── invalid-attributes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── js-link-replacement/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── keep-images/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── keep-tabular-data/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── la-nacion/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lazy-image-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lazy-image-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lazy-image-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lemonde-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── liberation-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lifehacker-post-comment-load/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lifehacker-working/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── links-in-tables/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lwn-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mathjax/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medicalnewstoday/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medium-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medium-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medium-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mercurial/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── metadata-content-missing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── missing-paragraphs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mozilla-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mozilla-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── msn/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── normalize-spaces/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-5/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ol/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── parsely-metadata/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── pixnet/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── qq/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── quanta-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-aria-hidden/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-extra-brs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-extra-paragraphs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-script-tags/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── reordering-paragraphs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── replace-brs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── replace-font-tags/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── royal-road/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── salon-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── schema-org-context-object/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── seattletimes-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── simplyfound-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── social-buttons/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── spiceworks/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── style-tags-removal/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── svg-parsing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── table-style-attributes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── telegraph/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── theverge/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── title-and-h1-discrepancy/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── title-en-dash/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── tmz-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── toc-missing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── topicseed-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── tumblr/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── v8-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── videos-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── videos-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── visibility-hidden/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wapo-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wapo-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── webmd-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── webmd-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikia/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wordpress/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   └── youth/
    │       ├── expected-metadata.json
    │       ├── expected.html
    │       └── source.html
    ├── test-readability.js
    └── utils.js

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
**/*.html linguist-detectable=false
* text=auto eol=lf


================================================
FILE: .gitignore
================================================
.DS_Store
npm-debug.log
node_modules
.metadata
*.pyc
*~
.*.sw?
.sw?
*.jar
*.xpi


================================================
FILE: .npmignore
================================================
/benchmarks/
/test/
.gitattributes
.release-it.json
.taskcluster.yml


================================================
FILE: .prettierrc.js
================================================
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* eslint-env node */

// Prettier formatting options applied to this repository's sources.
module.exports = {
  // Omit parentheses around a sole arrow-function parameter (x => x).
  arrowParens: "avoid",
  // Always emit LF line endings.
  endOfLine: "lf",
  // Preferred maximum line length.
  printWidth: 80,
  // Indent with two spaces per level.
  tabWidth: 2,
  // Trailing commas wherever ES5 permits them (object and array literals).
  trailingComma: "es5",
};


================================================
FILE: .release-it.json
================================================
{
  "plugins": {
    "@release-it/keep-a-changelog": {
      "addUnreleased": true,
      "filename": "CHANGELOG.md"
    }
  },
  "github": {
    "release": false
  },
  "git": {
    "requireBranch": "main",
    "pushRepo": "https://github.com/mozilla/readability.git"
  }
}


================================================
FILE: .taskcluster.yml
================================================
# Taskcluster CI definition: one task that clones the repository, checks out
# the revision under test, installs dependencies, then runs the linter and
# the test suite.
version: 1
policy:
  pullRequests: public
tasks:
  $let:
    # Revision to test: the pull request's head commit for PR events,
    # otherwise the commit the push ended on.
    head_rev:
      $if: 'tasks_for == "github-pull-request"'
      then: ${event.pull_request.head.sha}
      else: ${event.after}
    # Repository to clone: the PR's head repository for PR events, otherwise
    # the repository that received the push.
    repository:
      $if: 'tasks_for == "github-pull-request"'
      then: ${event.pull_request.head.repo.html_url}
      else: ${event.repository.html_url}
  in:
    $match:
      # Only create the task for opened/reopened/synchronized pull requests
      # and for pushes.
      '(tasks_for == "github-pull-request" && event["action"] in ["opened","reopened","synchronize"]) || (tasks_for == "github-push")':
        taskId:
          $eval: as_slugid("pr_task")
        provisionerId: proj-misc
        workerType: ci
        deadline: {$fromNow: '1 day'}
        payload:
          maxRunTime: 600
          image: node
          # The whole build is a single shell command line; `&&` chaining
          # stops at the first failing step.
          command:
            - /bin/bash
            - '--login'
            - '-c'
            - >-
              git clone ${repository} repo && cd repo && git config
              advice.detachedHead false && git checkout ${head_rev} && npm
              install . && npm run lint && npm test
        metadata:
          name: Run tests and linter
          description: ''
          owner: '${event.sender.login}@users.noreply.github.com'
          source: '${event.repository.url}'


================================================
FILE: CHANGELOG.md
================================================
# Changelog

Notable changes to readability will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project attempts to adhere to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

For the purposes of Semantic Versioning, the readability output object for a given
input document is not considered a stable API. That is, minor version increments
may change this output. Patch version increments will only do so in ways that are
strict improvements (e.g. from empty strings or exceptions to something more
reasonable).

## [Unreleased]

## [0.6.0] - 2025-03-03

- [Add Parsely tags as a fallback metadata source](https://github.com/mozilla/readability/pull/865)
- [Fix JSON-LD parsing being skipped when the context URL includes a trailing slash](https://github.com/mozilla/readability/pull/833)
- [Improve data table support](https://github.com/mozilla/readability/pull/858)
- [Fixed situations where short paragraphs of legitimate content would be excluded](https://github.com/mozilla/readability/pull/867)
- [Add an option to modify link density value](https://github.com/mozilla/readability/pull/874)
- [Byline metadata should lead to not deleting lookalike non-byline content](https://github.com/mozilla/readability/pull/869)
- [Avoid removing headers on gitlab](https://github.com/mozilla/readability/pull/885)
- [Improved HTML character unescaping](https://github.com/mozilla/readability/pull/896)
- Various performance improvements: [#894](https://github.com/mozilla/readability/pull/894),
  [#892](https://github.com/mozilla/readability/pull/892), [#893](https://github.com/mozilla/readability/pull/893),
  [#915](https://github.com/mozilla/readability/pull/915)
- [Fix broken JSONLD context handling](https://github.com/mozilla/readability/pull/902)
- [Include Jekyll footnotes in output](https://github.com/mozilla/readability/pull/907)
- [Handle schema.org context objects](https://github.com/mozilla/readability/pull/940)
- [Fix invalid attributes breaking parsing](https://github.com/mozilla/readability/pull/918)
- [Include article:author metadata](https://github.com/mozilla/readability/pull/942)
- [Handle itemprop=name for author metadata](https://github.com/mozilla/readability/pull/943)
- [Improve typescript definitions](https://github.com/mozilla/readability/pull/944)
- [Handle JSONLD Arrays](https://github.com/mozilla/readability/pull/947)

## [0.5.0] - 2023-12-15

- [Add published time metadata](https://github.com/mozilla/readability/pull/813)
- [Expanded comma detection to non-Latin commas](https://github.com/mozilla/readability/pull/796)
- [Fix detection of elements hidden with style="visibility: hidden"](https://github.com/mozilla/readability/pull/817)

## [0.4.4] - 2023-03-31

- Fixed [undefined `li_count` variable breaking use of readability in Cloudflare workers](https://github.com/mozilla/readability/issues/791)

## [0.4.3] - 2023-03-22

- Fixed [`aria-modal` cookie dialogs interfering with readability](https://github.com/mozilla/readability/pull/746)
- Fixed [lists of images not showing](https://github.com/mozilla/readability/pull/738)
- [Updated type information for TypeScript](https://github.com/mozilla/readability/pull/734)
- [Simplify `script` and `noscript` removal](https://github.com/mozilla/readability/pull/762)
- [Updated dependencies](https://github.com/mozilla/readability/pull/770)
- [Added allowedVideoRegex option to override the default](https://github.com/mozilla/readability/pull/788)

## [0.4.2] - 2022-02-09

- Fix [compatibility with DOM implementations where the `childNodes` property is not live](https://github.com/mozilla/readability/pull/694) ([x2](https://github.com/mozilla/readability/pull/677)).
- Lazily-loaded image references [will no longer use the `alt` attribute](https://github.com/mozilla/readability/pull/689) to find images.
- `parse()` [provides the root element's `lang` attribute](https://github.com/mozilla/readability/pull/721)
- `isProbablyReaderable` [includes article tags](https://github.com/mozilla/readability/pull/724)
- Improvements to JSON-LD support
  - [Continue parsing other JSON-LD elements until we find one we can support](https://github.com/mozilla/readability/pull/713)
  - [Prefer using headline for article title](https://github.com/mozilla/readability/pull/713)

## [0.4.1] - 2021-01-13

### Added

- Typescript type definition file (`.d.ts`).

## [0.4.0] - 2020-12-23

### Added

- `isProbablyReaderable` [can now take an optional options object](https://github.com/mozilla/readability/pull/634) to configure it,
allowing you to specify the minimum content length, minimum score, and how to
check if nodes are visible.

- Better support for [deeply-nested content](https://github.com/mozilla/readability/pull/611).

- Readability is now more likely to [keep tables of content](https://github.com/mozilla/readability/pull/646).

- Better support for [content in `<code>` tags](https://github.com/mozilla/readability/pull/647).

- Readability (finally) no longer [throws away all `<h1>` tags](https://github.com/mozilla/readability/pull/650).

### Changed

- JSON-LD [support for multiple authors](https://github.com/mozilla/readability/pull/618)
  was improved.

- Elements with roles `menu`, `menubar`, `complementary`, `navigation`, `alert`,
  `alertdialog`, `dialog` will [all be removed](https://github.com/mozilla/readability/pull/619).

## [0.3.0] - 2020-08-05

The first version that was published on NPM.

Previously, we did not consistently version anything,
nor did we publish to NPM.

At some point, we may wish to expand this changelog into the past.


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Community Participation Guidelines

This repository is governed by Mozilla's code of conduct and etiquette guidelines. 
For more details, please read the
[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 

## How to Report
For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page.

<!--
## Project Specific Etiquette

In some cases, there will be additional project etiquette i.e.: (https://bugzilla.mozilla.org/page.cgi?id=etiquette.html).
Please update for your project.
-->


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing

Thank you for wanting to help make `readability` better!

For outstanding issues, see the issue list in this repo, as well as this [bug list](https://bugzilla.mozilla.org/buglist.cgi?component=Reader%20Mode&product=Toolkit&bug_status=__open__&limit=0).

Any changes to the main code should be reviewed by an [appropriate Firefox/toolkit peer](https://wiki.mozilla.org/Modules/Firefox), such as [@gijsk](https://github.com/gijsk), since these changes will be merged to mozilla-central and shipped in Firefox.

To test local changes to Readability.js, you can use the [automated tests](#tests).

This repository is governed by Mozilla's code of conduct and etiquette guidelines. 
For more details, please read the
[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 

## Tests

[![Build Status](https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/readability/main/badge.svg)](https://community-tc.services.mozilla.com/api/github/v1/repository/mozilla/readability/main/latest)

Please run [eslint](http://eslint.org/) as a first check that your changes are valid JS and adhere to our style guidelines:

    $ npm run lint


To run the test suite:

    $ npm test

To run a specific test page by its name:

    $ npm test -- -g 001

To run the test suite in TDD mode:

    $ npm test -- -w

Combo time:

    $ npm test -- -w -g 001

### Add new tests

There's a [node script](https://github.com/mozilla/readability/blob/main/test/generate-testcase.js) to help you create new tests.
You can run it using:

    $ node test/generate-testcase.js slug https://example.com/article

Replacing `slug` with the identifier the test should use, and providing a URL
to an actual article on which the test should be based. If your test case involves dynamic content, you can save the page to disk and pass the file as a `file://` URL.

You may need to make the `tidy` binary executable before that script will succeed. If you see an `EACCES` error when running that script, try:

    # On MacOS
    $ chmod +x ./node_modules/htmltidy2/bin/darwin/tidy
    
    # On Linux, the linuxXX may change depending on your platform
    # The specific path may change depending on the node tools you use
    $ chmod +x ./node_modules/htmltidy2/bin/linux64/tidy
    

    

## Pull Requests

We're always happy to see pull requests to improve readability.

Please ensure you run the linter and [tests](#tests) before submitting a PR.

If you're changing the algorithm to fix a specific page/article, please
[add new tests](#add-new-tests) for the case you're fixing, so we avoid
breaking it in future.

## Steps to release

1. Ensure [CHANGELOG.md](CHANGELOG.md) is up-to-date. ``git log `npm view . version`...main `` may help with this.
2. Run `npm run release` to create a release, which should:
     1. `npm version [patch | minor | major]`, depending on the nature of the changes according to
[semver](https://semver.org/). This will bump the version in `package.json` and `package-lock.json`
and create a commit and Git tag for the release.
     2. `npm publish` to push the release to the npm registry.
     3. `git push origin HEAD --follow-tags` to push the new commit and tag to GitHub.

## Keeping a changelog

Ensure significant changes are added to `CHANGELOG.md`. Do not add
changes that only affect tests or documentation.



================================================
FILE: JSDOMParser.js
================================================
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This is a relatively lightweight DOMParser that is safe to use in a web
 * worker. This is far from a complete DOM implementation; however, it should
 * contain the minimal set of functionality necessary for Readability.js.
 *
 * Aside from not implementing the full DOM API, there are other quirks to be
 * aware of when using the JSDOMParser:
 *
 *   1) Properly formed HTML/XML must be used. This means you should be extra
 *      careful when using this parser on anything received directly from an
 *      XMLHttpRequest. Providing a serialized string from an XMLSerializer,
 *      however, should be safe (since the browser's XMLSerializer should
 *      generate valid HTML/XML). Therefore, if parsing a document from an XHR,
 *      the recommended approach is to do the XHR in the main thread, use
 *      XMLSerializer.serializeToString() on the responseXML, and pass the
 *      resulting string to the worker.
 *
 *   2) Live NodeLists are not supported. DOM methods and properties such as
 *      getElementsByTagName() and childNodes return standard arrays. If you
 *      want these lists to be updated when nodes are removed or added to the
 *      document, you must take care to manually update them yourself.
 */
(function (global) {
  // XML only defines these and the numeric ones:

  var entityTable = {
    lt: "<",
    gt: ">",
    amp: "&",
    quot: '"',
    apos: "'",
  };

  // Inverse of entityTable: each special character mapped to its escaped
  // entity form.
  var reverseEntityTable = {
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
  };

  /**
   * Escapes a string for use as element text content. Only `&`, `<` and `>`
   * are replaced; quote characters are left untouched.
   *
   * @param {string} s Raw text.
   * @return {string} Text with `&`, `<` and `>` replaced by entities.
   */
  function encodeTextContentHTML(s) {
    return s.replace(/[&<>]/g, function (x) {
      return reverseEntityTable[x];
    });
  }

  /**
   * Escapes a string so that all five characters in reverseEntityTable
   * (`&`, `<`, `>`, `'`, `"`) are replaced by their entity form.
   *
   * @param {string} s Raw text.
   * @return {string} Escaped text.
   */
  function encodeHTML(s) {
    return s.replace(/[&<>'"]/g, function (x) {
      return reverseEntityTable[x];
    });
  }

  /**
   * Decodes the five named entities in entityTable plus decimal (`&#NN;`)
   * and hexadecimal (`&#xHH;`) character references.
   *
   * Decoding happens in a single pass so that text produced by one
   * replacement is never scanned again: `&amp;#60;` yields the literal text
   * `&#60;`, not `<`. Named entities are matched case-sensitively; the `x`
   * marker and hex digits are matched in either case.
   *
   * @param {string} str Text that may contain character references.
   * @return {string} Decoded text. Unrecognised references are left as-is.
   */
  function decodeHTML(str) {
    return str.replace(
      /&(?:(quot|amp|apos|lt|gt)|#(?:[xX]([0-9a-fA-F]+)|([0-9]+)));/g,
      function (match, name, hex, numStr) {
        if (name) {
          return entityTable[name];
        }

        var num = parseInt(hex || numStr, hex ? 16 : 10);

        // these character references are replaced by a conforming HTML parser
        if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
          num = 0xfffd;
        }

        return String.fromCodePoint(num);
      }
    );
  }

  // Maps the JS name of a style property (e.g. `backgroundColor`) to the CSS
  // attribute it corresponds to (e.g. `background-color`). The table is built
  // from the CSS names: the JS name is the camelCased CSS name, except for
  // `float`, which is exposed as `cssFloat`.
  var styleMap = {};
  [
    "alignment-baseline",
    "background", "background-attachment", "background-clip",
    "background-color", "background-image", "background-origin",
    "background-position", "background-position-x", "background-position-y",
    "background-repeat", "background-repeat-x", "background-repeat-y",
    "background-size",
    "baseline-shift",
    "border",
    "border-bottom", "border-bottom-color", "border-bottom-left-radius",
    "border-bottom-right-radius", "border-bottom-style", "border-bottom-width",
    "border-collapse", "border-color",
    "border-image", "border-image-outset", "border-image-repeat",
    "border-image-slice", "border-image-source", "border-image-width",
    "border-left", "border-left-color", "border-left-style",
    "border-left-width",
    "border-radius",
    "border-right", "border-right-color", "border-right-style",
    "border-right-width",
    "border-spacing", "border-style",
    "border-top", "border-top-color", "border-top-left-radius",
    "border-top-right-radius", "border-top-style", "border-top-width",
    "border-width",
    "bottom", "box-shadow", "box-sizing", "caption-side", "clear",
    "clip", "clip-path", "clip-rule",
    "color", "color-interpolation", "color-interpolation-filters",
    "color-profile", "color-rendering",
    "content", "counter-increment", "counter-reset", "cursor", "direction",
    "display", "dominant-baseline", "empty-cells", "enable-background",
    "fill", "fill-opacity", "fill-rule", "filter",
    "float",
    "flood-color", "flood-opacity",
    "font", "font-family", "font-size", "font-stretch", "font-style",
    "font-variant", "font-weight",
    "glyph-orientation-horizontal", "glyph-orientation-vertical",
    "height", "image-rendering", "kerning", "left", "letter-spacing",
    "lighting-color", "line-height",
    "list-style", "list-style-image", "list-style-position",
    "list-style-type",
    "margin", "margin-bottom", "margin-left", "margin-right", "margin-top",
    "marker", "marker-end", "marker-mid", "marker-start",
    "mask", "max-height", "max-width", "min-height", "min-width",
    "opacity", "orphans",
    "outline", "outline-color", "outline-offset", "outline-style",
    "outline-width",
    "overflow", "overflow-x", "overflow-y",
    "padding", "padding-bottom", "padding-left", "padding-right",
    "padding-top",
    "page", "page-break-after", "page-break-before", "page-break-inside",
    "pointer-events", "position", "quotes", "resize", "right",
    "shape-rendering", "size", "speak", "src",
    "stop-color", "stop-opacity",
    "stroke", "stroke-dasharray", "stroke-dashoffset", "stroke-linecap",
    "stroke-linejoin", "stroke-miterlimit", "stroke-opacity", "stroke-width",
    "table-layout",
    "text-align", "text-anchor", "text-decoration", "text-indent",
    "text-line-through", "text-line-through-color", "text-line-through-mode",
    "text-line-through-style", "text-line-through-width",
    "text-overflow",
    "text-overline", "text-overline-color", "text-overline-mode",
    "text-overline-style", "text-overline-width",
    "text-rendering", "text-shadow", "text-transform",
    "text-underline", "text-underline-color", "text-underline-mode",
    "text-underline-style", "text-underline-width",
    "top", "unicode-bidi", "unicode-range", "vector-effect",
    "vertical-align", "visibility", "white-space", "widows", "width",
    "word-break", "word-spacing", "word-wrap", "writing-mode",
    "z-index", "zoom",
  ].forEach(function (cssName) {
    var jsKey =
      cssName === "float"
        ? "cssFloat"
        : cssName.replace(/-([a-z])/g, function (dashed, letter) {
            return letter.toUpperCase();
          });
    styleMap[jsKey] = cssName;
  });

  // Elements that can be self-closing. Membership is tested with the `in`
  // operator, so only the keys matter; every value is `true`.
  var voidElems = {};
  [
    "area",
    "base",
    "br",
    "col",
    "command",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "wbr",
  ].forEach(function (name) {
    voidElems[name] = true;
  });

  // Characters treated as whitespace inside a tag: space, tab, LF, CR.
  var whitespace = " \t\n\r".split("");

  // Numeric node type codes, numbered consecutively from 1 in the order
  // listed. See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
  var nodeTypes = {};
  [
    "ELEMENT_NODE",
    "ATTRIBUTE_NODE",
    "TEXT_NODE",
    "CDATA_SECTION_NODE",
    "ENTITY_REFERENCE_NODE",
    "ENTITY_NODE",
    "PROCESSING_INSTRUCTION_NODE",
    "COMMENT_NODE",
    "DOCUMENT_NODE",
    "DOCUMENT_TYPE_NODE",
    "DOCUMENT_FRAGMENT_NODE",
    "NOTATION_NODE",
  ].forEach(function (typeName, position) {
    nodeTypes[typeName] = position + 1;
  });

  /**
   * Collects every descendant element of `this` whose tag name matches
   * `tag` (case-insensitively), or every descendant element when `tag` is
   * "*". Results are in document order. Shared by Document and Element.
   *
   * @param {string} tag - Tag name to look for, or "*".
   * @returns {Array} A plain array flagged with `_isLiveNodeList = true`.
   */
  function getElementsByTagName(tag) {
    var wanted = tag.toUpperCase();
    var matchAll = wanted === "*";
    var found = [];

    // Depth-first walk with an explicit stack. Children are pushed in
    // reverse so that they are popped, and therefore visited, in order.
    var pending = [];
    var pushChildren = function (parent) {
      for (var i = parent.children.length - 1; i >= 0; i--) {
        pending.push(parent.children[i]);
      }
    };

    pushChildren(this);
    while (pending.length) {
      var candidate = pending.pop();
      if (matchAll || candidate.tagName === wanted) {
        found.push(candidate);
      }
      pushChildren(candidate);
    }

    found._isLiveNodeList = true;
    return found;
  }

  /**
   * Base type for every node produced by this parser. It holds the tree
   * bookkeeping (childNodes/children arrays and the sibling pointers) and the
   * mutation methods shared by all node kinds.
   */
  var Node = function () {};

  Node.prototype = {
    attributes: null,
    childNodes: null,
    localName: null,
    nodeName: null,
    parentNode: null,
    textContent: null,
    nextSibling: null,
    previousSibling: null,

    get firstChild() {
      return this.childNodes[0] || null;
    },

    get firstElementChild() {
      return this.children[0] || null;
    },

    get lastChild() {
      return this.childNodes[this.childNodes.length - 1] || null;
    },

    get lastElementChild() {
      return this.children[this.children.length - 1] || null;
    },

    /**
     * The workhorse for all node insertion operations. The public methods
     * (`appendChild()`, `insertBefore()`, `replaceChild()`) are thin wrappers
     * around this.
     *
     * Nodes that still have a parent are detached first. Because detaching a
     * node that is currently a child of `this` shifts the indexes of
     * `this.childNodes`, callers passing an explicit `index` must make sure
     * such nodes are detached before they compute it.
     *
     * @private
     * @param {Node[]} nodes - An array of nodes to insert. It is assumed that
     *   these nodes are distinct, and are not children of this object.
     * @param {Number} index - A valid index to insert `nodes` at, or -1 to
     *   indicate insertion as the last children.
     * @returns {void}
     */
    _insertNodesAtIndex(nodes, index) {
      if (!nodes.length) {
        return;
      }

      // Detach nodes from their previous parents.
      for (var i = 0; i < nodes.length; i++) {
        if (nodes[i].parentNode) {
          nodes[i].remove();
        }
      }

      // Normalize an out-of-range lookup to null so that sibling pointers
      // never end up holding `undefined`.
      var afterSibling =
        index === -1 ? null : this.childNodes[index] || null;

      // Store the previous sibling before we modify the DOM.
      var prevSibling = afterSibling
        ? afterSibling.previousSibling
        : this.lastChild;

      // Insert nodes into childNodes.
      var insertionPoint = index === -1 ? this.childNodes.length : index;
      Array.prototype.splice.apply(
        this.childNodes,
        [insertionPoint, 0].concat(nodes)
      );

      // Update parentNode and sibling pointers for the new nodes.
      for (var j = 0; j < nodes.length; j++) {
        var node = nodes[j];
        node.parentNode = this;
        node.previousSibling = prevSibling;
        if (prevSibling) {
          prevSibling.nextSibling = node;
        }
        prevSibling = node;
      }
      var lastInsertedNode = nodes[nodes.length - 1];
      lastInsertedNode.nextSibling = afterSibling;
      if (afterSibling) {
        afterSibling.previousSibling = lastInsertedNode;
      }

      // Filter for element nodes and update children array and pointers.
      var elementsToInsert = [];
      for (var k = 0; k < nodes.length; k++) {
        if (nodes[k].nodeType === Node.ELEMENT_NODE) {
          elementsToInsert.push(nodes[k]);
        }
      }

      if (elementsToInsert.length) {
        // Find the next element sibling to use as an insertion reference.
        // This is done after `childNodes` is modified, as the forward
        // traversal from `afterSibling` remains valid.
        var afterElem = afterSibling;
        while (afterElem && afterElem.nodeType !== Node.ELEMENT_NODE) {
          afterElem = afterElem.nextSibling;
        }

        // Store the previous element sibling before more DOM modifications.
        var prevElem = afterElem
          ? afterElem.previousElementSibling
          : this.lastElementChild;

        var afterElemIndex = afterElem ? this.children.indexOf(afterElem) : -1;
        var elemInsertionPoint =
          afterElemIndex === -1 ? this.children.length : afterElemIndex;
        Array.prototype.splice.apply(
          this.children,
          [elemInsertionPoint, 0].concat(elementsToInsert)
        );

        for (var l = 0; l < elementsToInsert.length; l++) {
          var elem = elementsToInsert[l];
          elem.previousElementSibling = prevElem;
          if (prevElem) {
            prevElem.nextElementSibling = elem;
          }
          prevElem = elem;
        }
        var lastInsertedElem = elementsToInsert[elementsToInsert.length - 1];
        lastInsertedElem.nextElementSibling = afterElem;
        if (afterElem) {
          afterElem.previousElementSibling = lastInsertedElem;
        }
      }
    },

    /**
     * Appends `child` (or, for a document fragment, each of its children) as
     * the last child of this node.
     *
     * @param {Node} child
     * @returns {Node} `child`
     */
    appendChild(child) {
      var nodes =
        child.nodeType === Node.DOCUMENT_FRAGMENT_NODE
          ? Array.from(child.childNodes)
          : [child];
      this._insertNodesAtIndex(nodes, -1);
      return child;
    },

    /**
     * Inserts `newNode` (or, for a document fragment, each of its children)
     * immediately before `referenceNode`, or at the end when `referenceNode`
     * is null/undefined.
     *
     * @param {Node} newNode
     * @param {Node|null} referenceNode - Must be a child of this node.
     * @returns {Node} `newNode`
     * @throws {Error} If `referenceNode` is given but is not a child of this
     *   node.
     */
    insertBefore(newNode, referenceNode) {
      if (newNode === referenceNode) {
        return newNode;
      }
      var nodes =
        newNode.nodeType === Node.DOCUMENT_FRAGMENT_NODE
          ? Array.from(newNode.childNodes)
          : [newNode];
      // Validate before touching the tree so a failed call changes nothing.
      if (referenceNode && this.childNodes.indexOf(referenceNode) === -1) {
        throw new Error("insertBefore: reference node not found");
      }
      // Detach the nodes *before* computing the insertion index. If one of
      // them is currently a child of this node, removing it shifts the
      // position of `referenceNode`, and an index computed earlier would be
      // off by one.
      for (var i = 0; i < nodes.length; i++) {
        if (nodes[i].parentNode) {
          nodes[i].remove();
        }
      }
      var index = referenceNode ? this.childNodes.indexOf(referenceNode) : -1;
      this._insertNodesAtIndex(nodes, index);
      return newNode;
    },

    /**
     * Detaches this node from its parent, fixing up the parent's
     * childNodes/children arrays and the neighbours' sibling pointers.
     *
     * @returns {Node} this node
     * @throws {Error} If the parent's childNodes does not contain this node.
     */
    remove() {
      let parent = this.parentNode;
      if (!parent) {
        // We were already detached so there's nothing to do.
        return this;
      }
      var childNodes = parent.childNodes;
      var childIndex = childNodes.indexOf(this);
      if (childIndex === -1) {
        throw new Error("removeChild: node not found");
      }
      this.parentNode = null;
      var prev = this.previousSibling;
      var next = this.nextSibling;
      if (prev) {
        prev.nextSibling = next;
      }
      if (next) {
        next.previousSibling = prev;
      }
      childNodes.splice(childIndex, 1);

      if (this.nodeType === Node.ELEMENT_NODE) {
        var prevElem = this.previousElementSibling;
        var nextElem = this.nextElementSibling;
        if (prevElem) {
          prevElem.nextElementSibling = nextElem;
        }
        if (nextElem) {
          nextElem.previousElementSibling = prevElem;
        }
        // splice(-1, 1) would silently drop the parent's last element, so
        // only splice when this node really is listed.
        var elemIndex = parent.children.indexOf(this);
        if (elemIndex !== -1) {
          parent.children.splice(elemIndex, 1);
        }
        this.previousElementSibling = this.nextElementSibling = null;
      }

      this.previousSibling = this.nextSibling = null;

      return this;
    },

    /**
     * Detaches `child` from its parent by calling `child.remove()`.
     *
     * @param {Node} child
     * @returns {Node} `child`
     */
    removeChild(child) {
      return child.remove();
    },

    /**
     * Replaces the child `oldNode` with `newNode`.
     *
     * @param {Node} newNode
     * @param {Node} oldNode - Must be a child of this node.
     * @returns {Node} `oldNode`
     * @throws {Error} If `oldNode` is not a child of this node.
     */
    replaceChild(newNode, oldNode) {
      if (newNode === oldNode) {
        return oldNode;
      }
      if (oldNode.parentNode !== this) {
        throw new Error(
          "replaceChild: node to be replaced is not a child of this node"
        );
      }
      // Insert the new node(s) before the node to be replaced.
      this.insertBefore(newNode, oldNode);
      // Now, remove the old node.
      oldNode.remove();
      return oldNode;
    },

    __JSDOMParser__: true,
  };

  // Expose every node type code both on the Node constructor and on each
  // instance via the prototype (e.g. Node.TEXT_NODE and node.TEXT_NODE).
  for (var typeName in nodeTypes) {
    var typeCode = nodeTypes[typeName];
    Node.prototype[typeName] = typeCode;
    Node[typeName] = typeCode;
  }

  /**
   * A single name/value attribute of an Element.
   *
   * @param {string} name - Attribute name.
   * @param {string} value - Decoded attribute value.
   */
  var Attribute = function (name, value) {
    this.name = name;
    this._value = value;
  };

  Attribute.prototype = {
    // Read-only view of the stored value; use setValue() to change it.
    get value() {
      var current = this._value;
      return current;
    },

    setValue(replacement) {
      this._value = replacement;
    },

    // The value escaped for use inside a quoted attribute.
    getEncodedValue() {
      var raw = this._value;
      return encodeHTML(raw);
    },

    // Cheat horribly. This is fine for our usecases.
    cloneNode() {
      return this;
    },
  };

  /**
   * Placeholder node for comments and other `<!...>` / `<?...>` markup. It
   * carries no content of its own.
   */
  var Comment = function () {
    this.childNodes = [];
  };

  Comment.prototype = Object.create(Node.prototype);
  Comment.prototype.nodeName = "#comment";
  Comment.prototype.nodeType = Node.COMMENT_NODE;

  /**
   * Parentless container for nodes. When a fragment is passed to
   * appendChild()/insertBefore(), its children are inserted in its place.
   */
  var DocumentFragment = function () {
    this.childNodes = [];
    this.children = [];
  };

  DocumentFragment.prototype = Object.create(Node.prototype);
  DocumentFragment.prototype.nodeName = "#document-fragment";
  DocumentFragment.prototype.nodeType = Node.DOCUMENT_FRAGMENT_NODE;

  /**
   * Text node. The content is stored either as decoded text (`_textContent`)
   * or as entity-encoded markup (`_innerHTML`); whichever form is missing is
   * derived lazily from the other and cached. Setting one form discards the
   * cached copy of the other.
   */
  var Text = function () {
    this.childNodes = [];
  };

  Text.prototype = {
    __proto__: Node.prototype,

    nodeName: "#text",
    nodeType: Node.TEXT_NODE,

    get innerHTML() {
      if (this._innerHTML === undefined) {
        var plain = this._textContent || "";
        this._innerHTML = encodeTextContentHTML(plain);
      }
      return this._innerHTML;
    },

    set innerHTML(newHTML) {
      this._innerHTML = newHTML;
      delete this._textContent;
    },

    get textContent() {
      if (this._textContent === undefined) {
        var markup = this._innerHTML || "";
        this._textContent = decodeHTML(markup);
      }
      return this._textContent;
    },

    set textContent(newText) {
      this._textContent = newText;
      delete this._innerHTML;
    },
  };

  /**
   * Root of a parsed tree.
   *
   * @param {string} url - Stored as `documentURI`; also the fallback for
   *   `baseURI`.
   */
  var Document = function (url) {
    this.documentURI = url;
    this.styleSheets = [];
    this.childNodes = [];
    this.children = [];
  };

  Document.prototype = {
    __proto__: Node.prototype,

    nodeName: "#document",
    nodeType: Node.DOCUMENT_NODE,
    title: "",

    getElementsByTagName,

    /**
     * Depth-first search for the first node whose `id` equals `id`.
     * Returns null when there is none.
     */
    getElementById(id) {
      var search = function (node) {
        var childCount = node.children.length;
        if (node.id === id) {
          return node;
        }
        for (var i = 0; i < childCount; i++) {
          var hit = search(node.children[i]);
          if (hit) {
            return hit;
          }
        }
        return null;
      };
      return search(this);
    },

    createElement(tag) {
      return new Element(tag);
    },

    createTextNode(text) {
      var textNode = new Text();
      textNode.textContent = text;
      return textNode;
    },

    createDocumentFragment() {
      return new DocumentFragment();
    },

    /**
     * The document URI, resolved against the `href` of the first <base>
     * element when there is one. Computed on first access and cached in
     * `_baseURI`.
     */
    get baseURI() {
      if (this.hasOwnProperty("_baseURI")) {
        return this._baseURI;
      }

      this._baseURI = this.documentURI;
      var firstBase = this.getElementsByTagName("base")[0];
      var href = firstBase && firstBase.getAttribute("href");
      if (href) {
        try {
          this._baseURI = new URL(href, this._baseURI).href;
        } catch (ex) {
          /* Just fall back to documentURI */
        }
      }
      return this._baseURI;
    },
  };

  /**
   * Element node.
   *
   * @param {string} tag - Tag name exactly as written in the markup. It is
   *   kept verbatim in `_matchingTag`; `localName`/`tagName` are derived from
   *   the part after the last ":".
   */
  var Element = function (tag) {
    // We use this to find the closing tag.
    this._matchingTag = tag;

    // We're explicitly a non-namespace aware parser, we just pretend it's all
    // HTML, so any namespace prefix is dropped.
    var colonAt = tag.lastIndexOf(":");
    var bareTag = colonAt != -1 ? tag.substring(colonAt + 1) : tag;

    this.attributes = [];
    this.childNodes = [];
    this.children = [];
    this.previousElementSibling = null;
    this.nextElementSibling = null;
    this.localName = bareTag.toLowerCase();
    this.tagName = bareTag.toUpperCase();
    this.style = new Style(this);
  };

  Element.prototype = {
    __proto__: Node.prototype,

    nodeType: Node.ELEMENT_NODE,

    getElementsByTagName,

    get nodeName() {
      return this.tagName;
    },

    // The following accessors mirror the attribute of the same name and
    // read as "" when the attribute is absent.

    get className() {
      var value = this.getAttribute("class");
      return value || "";
    },

    set className(str) {
      this.setAttribute("class", str);
    },

    get id() {
      var value = this.getAttribute("id");
      return value || "";
    },

    set id(str) {
      this.setAttribute("id", str);
    },

    get href() {
      var value = this.getAttribute("href");
      return value || "";
    },

    set href(str) {
      this.setAttribute("href", str);
    },

    get src() {
      var value = this.getAttribute("src");
      return value || "";
    },

    set src(str) {
      this.setAttribute("src", str);
    },

    get srcset() {
      var value = this.getAttribute("srcset");
      return value || "";
    },

    set srcset(str) {
      this.setAttribute("srcset", str);
    },

    /**
     * Serializes the children of this element. Element children are written
     * with their lower-case local name and escaped attribute values; other
     * children contribute their own `innerHTML`.
     */
    get innerHTML() {
      // Using Array.join() avoids the overhead from lazy string concatenation.
      var pieces = [];

      function serializeChildren(parent) {
        for (var i = 0; i < parent.childNodes.length; i++) {
          var child = parent.childNodes[i];

          if (!child.localName) {
            // This is a text node, so asking for innerHTML won't recurse.
            pieces.push(child.innerHTML);
            continue;
          }

          // opening tag with its attribute list
          var openTag = "<" + child.localName;
          for (var j = 0; j < child.attributes.length; j++) {
            var attr = child.attributes[j];
            // the attribute value will be HTML escaped.
            var val = attr.getEncodedValue();
            var quote = val.includes('"') ? "'" : '"';
            openTag += " " + attr.name + "=" + quote + val + quote;
          }
          pieces.push(openTag);

          var isEmptyVoid =
            child.localName in voidElems && !child.childNodes.length;
          if (isEmptyVoid) {
            // if this is a self-closing element, end it here
            pieces.push("/>");
          } else {
            // otherwise, add its children
            pieces.push(">");
            serializeChildren(child);
            pieces.push("</" + child.localName + ">");
          }
        }
      }

      serializeChildren(this);
      return pieces.join("");
    },

    /**
     * Parses `html` with a fresh JSDOMParser and adopts the resulting
     * childNodes/children arrays as this element's own.
     */
    set innerHTML(html) {
      var parsed = new JSDOMParser().parse(html);

      var previous = this.childNodes;
      for (var i = 0; i < previous.length; i++) {
        previous[i].parentNode = null;
      }

      this.childNodes = parsed.childNodes;
      this.children = parsed.children;

      var adopted = this.childNodes;
      for (var j = 0; j < adopted.length; j++) {
        adopted[j].parentNode = this;
      }
    },

    /**
     * Replaces all children with a single Text node holding `text`.
     */
    set textContent(text) {
      // clear parentNodes for existing children
      var previous = this.childNodes;
      for (var i = 0; i < previous.length; i++) {
        previous[i].parentNode = null;
      }

      var textNode = new Text();
      textNode.textContent = text;
      textNode.parentNode = this;
      this.childNodes = [textNode];
      this.children = [];
    },

    /**
     * Concatenated text of every descendant Text node, in document order.
     */
    get textContent() {
      // Using Array.join() avoids the overhead from lazy string concatenation.
      // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
      var chunks = [];

      function collect(parent) {
        var kids = parent.childNodes;
        for (var i = 0; i < kids.length; i++) {
          var kid = kids[i];
          if (kid.nodeType !== 3) {
            collect(kid);
          } else {
            chunks.push(kid.textContent);
          }
        }
      }

      collect(this);
      return chunks.join("");
    },

    // Attribute lookups scan from the end of the list, so when a name occurs
    // more than once it is the last occurrence that is read, written or
    // removed.

    getAttribute(name) {
      var attrs = this.attributes;
      for (var i = attrs.length - 1; i >= 0; i--) {
        if (attrs[i].name === name) {
          return attrs[i].value;
        }
      }
      return undefined;
    },

    setAttribute(name, value) {
      var attrs = this.attributes;
      for (var i = attrs.length - 1; i >= 0; i--) {
        if (attrs[i].name === name) {
          attrs[i].setValue(value);
          return;
        }
      }
      attrs.push(new Attribute(name, value));
    },

    setAttributeNode(node) {
      this.setAttribute(node.name, node.value);
    },

    removeAttribute(name) {
      var attrs = this.attributes;
      for (var i = attrs.length - 1; i >= 0; i--) {
        if (attrs[i].name === name) {
          attrs.splice(i, 1);
          return;
        }
      }
    },

    hasAttribute(name) {
      return this.attributes.some(function (candidate) {
        return candidate.name == name;
      });
    },
  };

  /**
   * View over an element's `style` attribute.
   *
   * @param {Element} node - The element whose `style` attribute is read and
   *   written.
   */
  var Style = function (node) {
    this.node = node;
  };

  // getStyle() and setStyle() use the style attribute string directly. This
  // won't be very efficient if there are a lot of style manipulations, but
  // it's the easiest way to make sure the style attribute string and the JS
  // style property stay in sync. Readability.js doesn't do many style
  // manipulations, so this should be okay.
  Style.prototype = {
    /**
     * Looks up a CSS property in the element's `style` attribute.
     *
     * @param {string} styleName - CSS property name, e.g. "background-color".
     * @returns {string|undefined} The trimmed value of the first matching
     *   declaration, or undefined when there is none.
     */
    getStyle(styleName) {
      var attr = this.node.getAttribute("style");
      if (!attr) {
        return undefined;
      }

      var styles = attr.split(";");
      for (var i = 0; i < styles.length; i++) {
        var declaration = styles[i];
        // Split on the FIRST colon only: values such as `url(http://...)`
        // contain colons of their own.
        var colon = declaration.indexOf(":");
        if (colon === -1) {
          // Not a `name: value` pair (e.g. the empty piece after a trailing
          // ";"), so there is no value to return.
          continue;
        }
        if (declaration.substring(0, colon).trim() === styleName) {
          return declaration.substring(colon + 1).trim();
        }
      }

      return undefined;
    },

    /**
     * Sets a CSS property in the element's `style` attribute. The first
     * existing declaration with that name is dropped and the new declaration
     * is appended at the end.
     *
     * NOTE(review): the remaining text is kept as-is, so if it does not end
     * with ";" the appended declaration is not separated from it.
     *
     * @param {string} styleName - CSS property name.
     * @param {string} styleValue - New value.
     */
    setStyle(styleName, styleValue) {
      var value = this.node.getAttribute("style") || "";
      var index = 0;
      do {
        // `next` is the offset just past the next ";", or 0 when none is left.
        var next = value.indexOf(";", index) + 1;
        var length = next - index - 1;
        var style =
          length > 0 ? value.substr(index, length) : value.substr(index);
        if (style.substr(0, style.indexOf(":")).trim() === styleName) {
          value =
            value.substr(0, index).trim() +
            (next ? " " + value.substr(next).trim() : "");
          break;
        }
        index = next;
      } while (index);

      value += " " + styleName + ": " + styleValue + ";";
      this.node.setAttribute("style", value.trim());
    },
  };

  // For each item in styleMap, define a getter and setter on the style
  // property, so that e.g. `elem.style.backgroundColor` reads and writes the
  // `background-color` declaration of the style attribute.
  //
  // Object.defineProperty() replaces the deprecated __defineGetter__ /
  // __defineSetter__; `enumerable` and `configurable` are set to true to
  // match what those legacy methods produced.
  for (var jsName in styleMap) {
    (function (propName, cssName) {
      Object.defineProperty(Style.prototype, propName, {
        get() {
          return this.getStyle(cssName);
        },
        set(value) {
          this.setStyle(cssName, value);
        },
        enumerable: true,
        configurable: true,
      });
    })(jsName, styleMap[jsName]);
  }

  /**
   * Creates a parser with its cursor at the start of the input and no
   * recorded errors.
   */
  var JSDOMParser = function () {
    // Index into the input of the next character to read.
    this.currentChar = 0;

    // Scratch buffer for makeElementNode(), which assembles strings one
    // character at a time. Appending with += would create many short-lived
    // intermediate strings; pushing single characters and join()ing once is
    // cheaper, and reusing one array (|this.strBuf|) for every string uses
    // less memory than allocating a new array each time.
    this.strBuf = [];

    // Reused holder for the two values makeElementNode() returns, for the
    // same reason: it saves allocating a fresh array on every call.
    this.retPair = [];

    // Newline-terminated messages accumulated by error().
    this.errorState = "";
  };

  JSDOMParser.prototype = {
    /**
     * Reports a parse error: writes it to `console.log` (or to `dump` when
     * there is no console) and appends it to `this.errorState`.
     */
    error(m) {
      var report = "JSDOMParser error: " + m + "\n";
      if (typeof console !== "undefined") {
        // eslint-disable-next-line no-console
        console.log(report);
      } else if (typeof dump !== "undefined") {
        /* global dump */
        dump(report);
      }
      this.errorState += m + "\n";
    },

    /**
     * Look at the next character without advancing the index.
     */
    peekNext() {
      return this.html[this.currentChar];
    },

    /**
     * Get the next character and advance the index.
     */
    nextChar() {
      return this.html[this.currentChar++];
    },

    /**
     * Called after a quote character is read. This finds the next quote
     * character and returns the text string in between.
     */
    readString(quote) {
      var str;
      var n = this.html.indexOf(quote, this.currentChar);
      if (n === -1) {
        this.currentChar = this.html.length;
        str = null;
      } else {
        str = this.html.substring(this.currentChar, n);
        this.currentChar = n + 1;
      }

      return str;
    },

    /**
     * Called when parsing a node. This finds the next name/value attribute
     * pair and adds the result to the attributes list.
     */
    readAttribute(node) {
      var name = "";

      var n = this.html.indexOf("=", this.currentChar);
      if (n === -1) {
        this.currentChar = this.html.length;
      } else {
        // Read until a '=' character is hit; this will be the attribute key
        name = this.html.substring(this.currentChar, n);
        this.currentChar = n + 1;
      }

      if (!name) {
        return;
      }

      // After a '=', we should see a '"' for the attribute value
      var c = this.nextChar();
      if (c !== '"' && c !== "'") {
        this.error("Error reading attribute " + name + ", expecting '\"'");
        return;
      }

      // Read the attribute value (and consume the matching quote)
      var value = this.readString(c);

      node.attributes.push(new Attribute(name, decodeHTML(value)));
    },

    /**
     * Parses and returns an Element node. This is called after a '<' has been
     * read.
     *
     * @returns an array; the first index of the array is the parsed node;
     *          the second index is a boolean indicating whether this is a void
     *          Element
     */
    makeElementNode(retPair) {
      var c = this.nextChar();

      // Read the Element tag name
      var strBuf = this.strBuf;
      strBuf.length = 0;
      while (!whitespace.includes(c) && c !== ">" && c !== "/") {
        if (c === undefined) {
          return false;
        }
        strBuf.push(c);
        c = this.nextChar();
      }
      var tag = strBuf.join("");

      if (!tag) {
        return false;
      }

      var node = new Element(tag);

      // Read Element attributes
      while (c !== "/" && c !== ">") {
        if (c === undefined) {
          return false;
        }
        while (whitespace.includes(this.html[this.currentChar++])) {
          // Advance cursor to first non-whitespace char.
        }
        this.currentChar--;
        c = this.nextChar();
        if (c !== "/" && c !== ">") {
          --this.currentChar;
          this.readAttribute(node);
        }
      }

      // If this is a self-closing tag, read '/>'
      var closed = false;
      if (c === "/") {
        closed = true;
        c = this.nextChar();
        if (c !== ">") {
          this.error("expected '>' to close " + tag);
          return false;
        }
      }

      retPair[0] = node;
      retPair[1] = closed;
      return true;
    },

    /**
     * If the current input matches this string, advance the input index;
     * otherwise, do nothing.
     *
     * @returns whether input matched string
     */
    match(str) {
      var strlen = str.length;
      if (
        this.html.substr(this.currentChar, strlen).toLowerCase() ===
        str.toLowerCase()
      ) {
        this.currentChar += strlen;
        return true;
      }
      return false;
    },

    /**
     * Searches the input until a string is found and discards all input up to
     * and including the matched string.
     */
    discardTo(str) {
      var index = this.html.indexOf(str, this.currentChar) + str.length;
      if (index === -1) {
        this.currentChar = this.html.length;
      }
      this.currentChar = index;
    },

    /**
     * Reads child nodes for the given node.
     */
    readChildren(node) {
      var child;
      while ((child = this.readNode())) {
        // Don't keep Comment nodes
        if (child.nodeType !== 8) {
          node.appendChild(child);
        }
      }
    },

    discardNextComment() {
      if (this.match("--")) {
        this.discardTo("-->");
      } else {
        var c = this.nextChar();
        while (c !== ">") {
          if (c === undefined) {
            return null;
          }
          if (c === '"' || c === "'") {
            this.readString(c);
          }
          c = this.nextChar();
        }
      }
      return new Comment();
    },

    /**
     * Reads the next child node from the input. If we're reading a closing
     * tag, or if we've reached the end of input, return null.
     *
     * Depending on what follows the cursor this yields a Text node (plain
     * text or a CDATA section), an empty Comment placeholder, or an Element
     * whose children have already been read. null is also returned when a
     * CDATA section is unclosed, when makeElementNode() reports failure, or
     * when the expected closing tag is not found.
     *
     * @returns the node, or null
     */
    readNode() {
      var c = this.nextChar();

      // nextChar() yielding undefined is treated as end of input.
      if (c === undefined) {
        return null;
      }

      // Read any text as Text node
      var textNode;
      if (c !== "<") {
        // Put the character back so that it becomes part of the text run.
        --this.currentChar;
        textNode = new Text();
        var n = this.html.indexOf("<", this.currentChar);
        // We're not expecting XSS type exploitation inside JSDOMParser,
        // we just have to implement innerHTML stuff...
        /* eslint-disable no-unsanitized/property */
        if (n === -1) {
          // No further "<": the rest of the input is one text run.
          textNode.innerHTML = this.html.substring(
            this.currentChar,
            this.html.length
          );
          this.currentChar = this.html.length;
        } else {
          // The text run ends right before the next "<".
          textNode.innerHTML = this.html.substring(this.currentChar, n);
          this.currentChar = n;
        }
        /* eslint-enable no-unsanitized/property */
        return textNode;
      }

      // CDATA section: its content is stored through textContent (the plain
      // text path above uses innerHTML instead).
      if (this.match("![CDATA[")) {
        var endChar = this.html.indexOf("]]>", this.currentChar);
        if (endChar === -1) {
          this.error("unclosed CDATA section");
          return null;
        }
        textNode = new Text();
        textNode.textContent = this.html.substring(this.currentChar, endChar);
        // Continue right after the "]]>" terminator.
        this.currentChar = endChar + "]]>".length;
        return textNode;
      }

      c = this.peekNext();

      // Read Comment node. Normally, Comment nodes know their inner
      // textContent, but we don't really care about Comment nodes (we throw
      // them away in readChildren()). So just returning an empty Comment node
      // here is sufficient.
      if (c === "!" || c === "?") {
        // We're still before the ! or ? that is starting this comment:
        this.currentChar++;
        return this.discardNextComment();
      }

      // If we're reading a closing tag, return null. This means we've reached
      // the end of this set of child nodes.
      if (c === "/") {
        // Step back onto the "<" so the caller can match the whole "</tag>".
        --this.currentChar;
        return null;
      }

      // Otherwise, we're looking at an Element node
      var result = this.makeElementNode(this.retPair);
      if (!result) {
        return null;
      }

      // retPair is read back as [element, closed flag].
      var node = this.retPair[0];
      var closed = this.retPair[1];
      var localName = node.localName;

      // If this isn't a void Element, read its child nodes
      if (!closed) {
        this.readChildren(node);
        var closingTag = "</" + node._matchingTag + ">";
        if (!this.match(closingTag)) {
          // Report the mismatch together with the text found in its place.
          this.error(
            "expected '" +
              closingTag +
              "' and got " +
              this.html.substr(this.currentChar, closingTag.length)
          );
          return null;
        }
      }

      // Only use the first title, because SVG might have other
      // title elements which we don't care about (medium.com
      // does this, at least).
      if (localName === "title" && !this.doc.title) {
        this.doc.title = node.textContent.trim();
      } else if (localName === "head") {
        this.doc.head = node;
      } else if (localName === "body") {
        this.doc.body = node;
      } else if (localName === "html") {
        this.doc.documentElement = node;
      }

      return node;
    },

    /**
     * Parses an HTML string and returns a JS implementation of the Document.
     */
    parse(html, url) {
      this.html = html;
      var doc = (this.doc = new Document(url));
      this.readChildren(doc);

      // If this is an HTML document, remove root-level children except for the
      // <html> node
      if (doc.documentElement) {
        for (var i = doc.childNodes.length; --i >= 0; ) {
          var child = doc.childNodes[i];
          if (child !== doc.documentElement) {
            child.remove();
          }
        }
      }

      return doc;
    },
  };

  // Publish this file's DOM node implementations as properties of `global`
  // (the enclosing function is invoked with `this` as its argument).
  global.Node = Node;
  global.Comment = Comment;
  global.Document = Document;
  global.DocumentFragment = DocumentFragment;
  global.Element = Element;
  global.Text = Text;

  // Publish the parser itself the same way; the CommonJS export below reads
  // it back from `this.JSDOMParser`.
  global.JSDOMParser = JSDOMParser;
})(this);

// CommonJS support: when a `module` object exists, export the JSDOMParser
// that was attached to `this` above.
if (typeof module === "object") {
  /* eslint-disable-next-line no-redeclare */
  /* global module */
  module.exports = this.JSDOMParser;
}


================================================
FILE: LICENSE.md
================================================
Copyright (c) 2010 Arc90 Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


================================================
FILE: README.md
================================================
# Readability.js

A standalone version of the readability library used for [Firefox Reader View](https://support.mozilla.org/kb/firefox-reader-view-clutter-free-web-pages).

## Installation

Readability is available on npm:

```bash
npm install @mozilla/readability
```

You can then `require()` it, or for web-based projects, load the `Readability.js` script from your webpage.

## Basic usage

To parse a document, you must create a new `Readability` object from a DOM document object, and then call the [`parse()`](#parse) method. Here's an example:

```javascript
var article = new Readability(document).parse();
```

If you use Readability in a web browser, you will likely be able to use a `document` reference from elsewhere (e.g. fetched via XMLHttpRequest, in a same-origin `<iframe>` you have access to, etc.). In Node.js, you can [use an external DOM library](#nodejs-usage).

## API Reference

### `new Readability(document, options)`

The `options` object accepts a number of properties, all optional:

* `debug` (boolean, default `false`): whether to enable logging.
* `maxElemsToParse` (number, default `0` i.e. no limit): the maximum number of elements to parse.
* `nbTopCandidates` (number, default `5`): the number of top candidates to consider when analysing how tight the competition is among candidates.
* `charThreshold` (number, default `500`): the number of characters an article must have in order to return a result.
* `classesToPreserve` (array): a set of classes to preserve on HTML elements when the `keepClasses` option is set to `false`.
* `keepClasses` (boolean, default `false`): whether to preserve all classes on HTML elements. When set to `false` only classes specified in the `classesToPreserve` array are kept.
* `disableJSONLD` (boolean, default `false`): when extracting page metadata, Readability gives precedence to Schema.org fields specified in the JSON-LD format. Set this option to `true` to skip JSON-LD parsing.
* `serializer` (function, default `el => el.innerHTML`): controls how the `content` property returned by the `parse()` method is produced from the root DOM element. It may be useful to specify the `serializer` as the identity function (`el => el`) to obtain a DOM element instead of a string for `content` if you plan to process it further.
* `allowedVideoRegex` (RegExp, default `undefined`): a regular expression that matches video URLs that should be allowed to be included in the article content. If `undefined`, the [default regex](https://github.com/mozilla/readability/blob/8e8ec27cd2013940bc6f3cc609de10e35a1d9d86/Readability.js#L133) is applied.
* `linkDensityModifier` (number, default `0`): a number that is added to the base link density threshold during the shadiness checks. This can be used to penalize nodes with a high link density or vice versa.

### `parse()`

Returns an object containing the following properties:

* `title`: article title;
* `content`: HTML string of processed article content;
* `textContent`: text content of the article, with all the HTML tags removed;
* `length`: length of an article, in characters;
* `excerpt`: article description, or short excerpt from the content;
* `byline`: author metadata;
* `dir`: content direction;
* `siteName`: name of the site;
* `lang`: content language;
* `publishedTime`: published time;

The `parse()` method works by modifying the DOM. This removes some elements in the web page, which may be undesirable. You can avoid this by passing the clone of the `document` object to the `Readability` constructor:

```js
var documentClone = document.cloneNode(true);
var article = new Readability(documentClone).parse();
```

### `isProbablyReaderable(document, options)`

A quick-and-dirty way of figuring out if it's plausible that the contents of a given document are suitable for processing with Readability. It is likely to produce both false positives and false negatives. The reason it exists is to avoid bogging down a time-sensitive process (like loading and showing the user a webpage) with the complex logic in the core of Readability. Improvements to its logic (while not deteriorating its performance) are very welcome.

The `options` object accepts a number of properties, all optional:

* `minContentLength` (number, default `140`): the minimum node content length used to decide if the document is readerable;
* `minScore` (number, default `20`): the minimum cumulative 'score' used to determine if the document is readerable;
* `visibilityChecker` (function, default `isNodeVisible`): the function used to determine if a node is visible;

The function returns a boolean corresponding to whether or not we suspect `Readability.parse()` will succeed at returning an article object. Here's an example:

```js
/*
    Only instantiate Readability  if we suspect
    the `parse()` method will produce a meaningful result.
*/
if (isProbablyReaderable(document)) {
    let article = new Readability(document).parse();
}
```

## Node.js usage

Since Node.js does not come with its own DOM implementation, we rely on external libraries like [jsdom](https://github.com/jsdom/jsdom). Here's an example using `jsdom` to obtain a DOM document object:

```js
var { Readability } = require('@mozilla/readability');
var { JSDOM } = require('jsdom');
var doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
  url: "https://www.example.com/the-page-i-got-the-source-from"
});
let reader = new Readability(doc.window.document);
let article = reader.parse();
```

Remember to pass the page's URI as the `url` option in the `JSDOM` constructor (as shown in the example above), so that Readability can convert relative URLs for images, hyperlinks, etc. to their absolute counterparts.

`jsdom` has the ability to run the scripts included in the HTML and fetch remote resources. For security reasons these are [disabled by default](https://github.com/jsdom/jsdom#executing-scripts), and we **strongly** recommend you keep them that way.

## Security

If you're going to use Readability with untrusted input (whether in HTML or DOM form), we **strongly** recommend you use a sanitizer library like [DOMPurify](https://github.com/cure53/DOMPurify) to avoid script injection when you use
the output of Readability. We would also recommend using [CSP](https://developer.mozilla.org/en-US/docs/Web/HTTP/CSP) to add further defense-in-depth
restrictions to what you allow the resulting content to do. The Firefox integration of
reader mode uses both of these techniques itself. Sanitizing unsafe content out of the input is explicitly not something we aim to do as part of Readability itself - there are other good sanitizer libraries out there, use them!

## Contributing

Please see our [Contributing](CONTRIBUTING.md) document.

## License

    Copyright (c) 2010 Arc90 Inc

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.


================================================
FILE: Readability-readerable.js
================================================
/*
 * Copyright (c) 2010 Arc90 Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This code is heavily based on Arc90's readability.js (1.7.1) script
 * available at: http://code.google.com/p/arc90labs-readability
 */

// Patterns that isProbablyReaderable() tests against a node's
// `className + " " + id` string: a node matching `unlikelyCandidates` is
// skipped unless the same string also matches `okMaybeItsACandidate`.
var REGEXPS = {
  // NOTE: These two regular expressions are duplicated in
  // Readability.js. Please keep both copies in sync.
  unlikelyCandidates:
    /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
  okMaybeItsACandidate: /and|article|body|column|content|main|mathjax|shadow/i,
};

/**
 * Default visibility check used by isProbablyReaderable().
 *
 * A node counts as hidden when its inline style sets `display: none`, when it
 * carries a `hidden` attribute, or when it has `aria-hidden="true"` without a
 * class name containing "fallback-image".
 *
 * @param {Element} node The node to inspect.
 * @return {*} A truthy value when the node is considered visible.
 */
function isNodeVisible(node) {
  // node.style may be missing (SVG and MathML nodes), hence the null-check.
  if (node.style && node.style.display == "none") {
    return false;
  }
  if (node.hasAttribute("hidden")) {
    return false;
  }
  if (!node.hasAttribute("aria-hidden")) {
    return true;
  }
  if (node.getAttribute("aria-hidden") != "true") {
    return true;
  }
  // aria-hidden="true": still shown when it is a "fallback-image", so that
  // wikimedia math images are displayed. className.includes is null-checked
  // for the same SVG/MathML reason as node.style.
  return (
    node.className &&
    node.className.includes &&
    node.className.includes("fallback-image")
  );
}

/**
 * Decides whether or not the document is reader-able without parsing the whole thing.
 * @param {Object} doc The document to inspect; it must provide `querySelectorAll`.
 * @param {Object|Function} [options] Configuration object, or (for backward
 *   compatibility) the function used to determine if a node is visible.
 * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable.
 * @param {number} [options.minScore=20] The minimum cumulative 'score' used to determine if the document is readerable.
 * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible.
 * @return {boolean} Whether or not we suspect Readability.parse() will succeed at returning an article object.
 */
function isProbablyReaderable(doc, options = {}) {
  // Legacy call style: a bare function stands for the visibility checker.
  var supplied =
    typeof options == "function" ? { visibilityChecker: options } : options;

  var settings = Object.assign(
    {
      minScore: 20,
      minContentLength: 140,
      visibilityChecker: isNodeVisible,
    },
    supplied
  );

  var candidates = doc.querySelectorAll("p, pre, article");

  // Some articles are laid out as a <div> whose text is separated by <br>
  // elements rather than wrapped in <p>:
  // <div>
  //   Sentences<br>
  //   <br>
  //   Sentences<br>
  // </div>
  // Add the parents of such <br> nodes to the candidates, without duplicates.
  var lineBreaks = doc.querySelectorAll("div > br");
  if (lineBreaks.length) {
    var unique = new Set(candidates);
    Array.prototype.forEach.call(lineBreaks, function (br) {
      unique.add(br.parentNode);
    });
    candidates = Array.from(unique);
  }

  // `accumulated` grows across callback invocations; some() stops at the
  // first candidate that pushes it above the configured minimum.
  var accumulated = 0;
  return Array.prototype.some.call(candidates, function (candidate) {
    if (!settings.visibilityChecker(candidate)) {
      return false;
    }

    var classAndId = candidate.className + " " + candidate.id;
    var looksUnlikely =
      REGEXPS.unlikelyCandidates.test(classAndId) &&
      !REGEXPS.okMaybeItsACandidate.test(classAndId);
    if (looksUnlikely || candidate.matches("li p")) {
      return false;
    }

    var length = candidate.textContent.trim().length;
    if (length < settings.minContentLength) {
      return false;
    }

    accumulated += Math.sqrt(length - settings.minContentLength);
    return accumulated > settings.minScore;
  });
}

// CommonJS support: when a `module` object exists, export
// isProbablyReaderable as the module's value.
if (typeof module === "object") {
  /* eslint-disable-next-line no-redeclare */
  /* global module */
  module.exports = isProbablyReaderable;
}


================================================
FILE: Readability.js
================================================
/*
 * Copyright (c) 2010 Arc90 Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This code is heavily based on Arc90's readability.js (1.7.1) script
 * available at: http://code.google.com/p/arc90labs-readability
 */

/**
 * Public constructor.
 *
 * Stores the document, resets the per-article state, copies the supported
 * options onto the instance (falling back to the prototype defaults) and
 * installs `this.log`, which is a no-op unless `options.debug` is truthy.
 *
 * @param {HTMLDocument} doc     The document to parse.
 * @param {Object}       options The options object.
 * @throws {Error} When no document object was supplied.
 */
function Readability(doc, options) {
  // In some older versions, people passed a URI as the first argument. Cope:
  var legacySignature = options && options.documentElement;
  if (legacySignature) {
    doc = options;
    options = arguments[2];
  } else if (!doc || !doc.documentElement) {
    throw new Error(
      "First argument to Readability constructor should be a document object."
    );
  }
  options = options || {};

  function defaultSerializer(el) {
    return el.innerHTML;
  }

  // Document and per-article state
  this._doc = doc;
  this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
  this._articleTitle = null;
  this._articleByline = null;
  this._articleDir = null;
  this._articleSiteName = null;
  this._attempts = [];
  this._metadata = {};

  // Configurable options
  this._debug = Boolean(options.debug);
  this._maxElemsToParse =
    options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  this._nbTopCandidates =
    options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
  this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(
    options.classesToPreserve || []
  );
  this._keepClasses = Boolean(options.keepClasses);
  this._serializer = options.serializer || defaultSerializer;
  this._disableJSONLD = Boolean(options.disableJSONLD);
  this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
  this._linkDensityModifier = options.linkDensityModifier || 0;

  // Start with all flags set
  this._flags =
    this.FLAG_STRIP_UNLIKELYS |
    this.FLAG_WEIGHT_CLASSES |
    this.FLAG_CLEAN_CONDITIONALLY;

  // Control whether log messages are sent to the console
  if (!this._debug) {
    this.log = function () {};
    return;
  }

  // Renders a node as a short readable string for log output.
  const describeNode = node => {
    if (node.nodeType == node.TEXT_NODE) {
      return `${node.nodeName} ("${node.textContent}")`;
    }
    const attrPairs = Array.from(node.attributes || [])
      .map(attr => `${attr.name}="${attr.value}"`)
      .join(" ");
    return `<${node.localName} ${attrPairs}>`;
  };

  this.log = function (...parts) {
    if (typeof console !== "undefined") {
      const rendered = parts.map(part =>
        part && part.nodeType == this.ELEMENT_NODE ? describeNode(part) : part
      );
      // eslint-disable-next-line no-console
      console.log("Reader: (Readability)", ...rendered);
      return;
    }
    if (typeof dump !== "undefined") {
      /* global dump */
      const line = parts
        .map(part => (part && part.nodeName ? describeNode(part) : part))
        .join(" ");
      dump("Reader: (Readability) " + line + "\n");
    }
  };
}

Readability.prototype = {
  // Bit flags. The constructor ORs all three together into `this._flags`.
  FLAG_STRIP_UNLIKELYS: 0x1,
  FLAG_WEIGHT_CLASSES: 0x2,
  FLAG_CLEAN_CONDITIONALLY: 0x4,

  // Numeric node types, see
  // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
  ELEMENT_NODE: 1,
  TEXT_NODE: 3,

  // Max number of nodes supported by this parser. Default: 0 (no limit)
  DEFAULT_MAX_ELEMS_TO_PARSE: 0,

  // The number of top candidates to consider when analysing how
  // tight the competition is among candidates.
  DEFAULT_N_TOP_CANDIDATES: 5,

  // Element tags to score by default, stored as an array of upper-case
  // tag names.
  DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre"
    .toUpperCase()
    .split(","),

  // The default number of chars an article must have in order to return a result
  DEFAULT_CHAR_THRESHOLD: 500,

  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
    // NOTE: These two regular expressions are duplicated in
    // Readability-readerable.js. Please keep both copies in sync.
    unlikelyCandidates:
      /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
    okMaybeItsACandidate:
      /and|article|body|column|content|main|mathjax|shadow/i,

    positive:
      /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
    negative:
      /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
    extraneous:
      /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
    byline: /byline|author|dateline|writtenby|p-author/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    normalize: /\s{2,}/g,
    videos:
      /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
    shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    tokenize: /\W+/g,
    whitespace: /^\s*$/,
    hasContent: /\S$/,
    hashUrl: /^#.+/,
    srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
    b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
    // Commas as used in Latin, Sindhi, Chinese and various other scripts.
    // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
    commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
    // See: https://schema.org/Article
    jsonLdArticleTypes:
      /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
    // used to see if a node's content matches words commonly used for ad blocks or loading indicators
    adWords:
      /^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$/iu,
    loadingWords:
      /^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$/iu,
  },

  // Role names singled out as "unlikely".
  // NOTE(review): presumably compared against an element's `role`
  // attribute; the consumer is not part of this excerpt.
  UNLIKELY_ROLES: [
    "menu",
    "menubar",
    "complementary",
    "navigation",
    "alert",
    "alertdialog",
    "dialog",
  ],

  // Upper-case tag names, kept in a Set.
  // NOTE(review): going by the name, used when deciding whether a <div> can
  // be turned into a <p>; confirm at the call site.
  DIV_TO_P_ELEMS: new Set([
    "BLOCKQUOTE",
    "DL",
    "DIV",
    "IMG",
    "OL",
    "P",
    "PRE",
    "TABLE",
    "UL",
  ]),

  // Upper-case tag names treated as exceptions by the "alter to div" logic
  // (that logic is outside this excerpt).
  ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"],

  // Names of presentational attributes.
  PRESENTATIONAL_ATTRIBUTES: [
    "align",
    "background",
    "bgcolor",
    "border",
    "cellpadding",
    "cellspacing",
    "frame",
    "hspace",
    "rules",
    "style",
    "valign",
    "vspace",
  ],

  // Upper-case tag names of the elements concerned by deprecated size
  // attributes.
  DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"],

  // The commented out elements qualify as phrasing content but tend to be
  // removed by readability when put into paragraphs, so we ignore them here.
  PHRASING_ELEMS: [
    // "CANVAS", "IFRAME", "SVG", "VIDEO",
    "ABBR",
    "AUDIO",
    "B",
    "BDO",
    "BR",
    "BUTTON",
    "CITE",
    "CODE",
    "DATA",
    "DATALIST",
    "DFN",
    "EM",
    "EMBED",
    "I",
    "IMG",
    "INPUT",
    "KBD",
    "LABEL",
    "MARK",
    "MATH",
    "METER",
    "NOSCRIPT",
    "OBJECT",
    "OUTPUT",
    "PROGRESS",
    "Q",
    "RUBY",
    "SAMP",
    "SCRIPT",
    "SELECT",
    "SMALL",
    "SPAN",
    "STRONG",
    "SUB",
    "SUP",
    "TEXTAREA",
    "TIME",
    "VAR",
    "WBR",
  ],

  // These are the classes that readability sets itself. The constructor
  // always includes them in `this._classesToPreserve`.
  CLASSES_TO_PRESERVE: ["page"],

  // Named HTML entities (written without the surrounding "&" and ";"),
  // each mapped to the character it stands for.
  HTML_ESCAPE_MAP: {
    lt: "<",
    gt: ">",
    amp: "&",
    quot: '"',
    apos: "'",
  },

  /**
   * Run any post-process modifications to article content as necessary.
   *
   * @param Element
   * @return void
   **/
  _postProcessContent(articleContent) {
    // Readability cannot open relative uris so we convert them to absolute uris.
    this._fixRelativeUris(articleContent);

    this._simplifyNestedElements(articleContent);

    if (!this._keepClasses) {
      // Remove classes.
      this._cleanClasses(articleContent);
    }
  },

  /**
   * Iterates over a NodeList, calls `filterFn` for each node and removes node
   * if function returned `true`.
   *
   * If function is not passed, removes all the nodes in node list.
   *
   * @param NodeList nodeList The nodes to operate on
   * @param Function filterFn the function to use as a filter
   * @return void
   */
  _removeNodes(nodeList, filterFn) {
    // Avoid ever operating on live node lists.
    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
      throw new Error("Do not pass live node lists to _removeNodes");
    }
    for (var i = nodeList.length - 1; i >= 0; i--) {
      var node = nodeList[i];
      var parentNode = node.parentNode;
      if (parentNode) {
        if (!filterFn || filterFn.call(this, node, i, nodeList)) {
          parentNode.removeChild(node);
        }
      }
    }
  },

  /**
   * Iterates over a NodeList, and calls _setNodeTag for each node.
   *
   * @param NodeList nodeList The nodes to operate on
   * @param String newTagName the new tag name to use
   * @return void
   */
  _replaceNodeTags(nodeList, newTagName) {
    // Avoid ever operating on live node lists.
    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
      throw new Error("Do not pass live node lists to _replaceNodeTags");
    }
    for (const node of nodeList) {
      this._setNodeTag(node, newTagName);
    }
  },

  /**
   * Iterate over a NodeList, which doesn't natively fully implement the Array
   * interface.
   *
   * For convenience, the current object context is applied to the provided
   * iterate function.
   *
   * @param  NodeList nodeList The NodeList.
   * @param  Function fn       The iterate function.
   * @return void
   */
  _forEachNode(nodeList, fn) {
    Array.prototype.forEach.call(nodeList, fn, this);
  },

  /**
   * Iterate over a NodeList, and return the first node that passes
   * the supplied test function
   *
   * For convenience, the current object context is applied to the provided
   * test function.
   *
   * @param  NodeList nodeList The NodeList.
   * @param  Function fn       The test function.
   * @return void
   */
  _findNode(nodeList, fn) {
    return Array.prototype.find.call(nodeList, fn, this);
  },

  /**
   * Iterate over a NodeList, return true if any of the provided iterate
   * function calls returns true, false otherwise.
   *
   * For convenience, the current object context is applied to the
   * provided iterate function.
   *
   * @param  NodeList nodeList The NodeList.
   * @param  Function fn       The iterate function.
   * @return Boolean
   */
  _someNode(nodeList, fn) {
    return Array.prototype.some.call(nodeList, fn, this);
  },

  /**
   * Iterate over a NodeList, return true if all of the provided iterate
   * function calls return true, false otherwise.
   *
   * For convenience, the current object context is applied to the
   * provided iterate function.
   *
   * @param  NodeList nodeList The NodeList.
   * @param  Function fn       The iterate function.
   * @return Boolean
   */
  _everyNode(nodeList, fn) {
    return Array.prototype.every.call(nodeList, fn, this);
  },

  _getAllNodesWithTag(node, tagNames) {
    if (node.querySelectorAll) {
      return node.querySelectorAll(tagNames.join(","));
    }
    return [].concat.apply(
      [],
      tagNames.map(function (tag) {
        var collection = node.getElementsByTagName(tag);
        return Array.isArray(collection) ? collection : Array.from(collection);
      })
    );
  },

  /**
   * Removes the class="" attribute from every element in the given
   * subtree, except those that match CLASSES_TO_PRESERVE and
   * the classesToPreserve array from the options object.
   *
   * @param Element
   * @return void
   */
  _cleanClasses(node) {
    var classesToPreserve = this._classesToPreserve;
    var className = (node.getAttribute("class") || "")
      .split(/\s+/)
      .filter(cls => classesToPreserve.includes(cls))
      .join(" ");

    if (className) {
      node.setAttribute("class", className);
    } else {
      node.removeAttribute("class");
    }

    for (node = node.firstElementChild; node; node = node.nextElementSibling) {
      this._cleanClasses(node);
    }
  },

  /**
   * Tests whether a string is a URL or not.
   *
   * @param {string} str The string to test
   * @return {boolean} true if str is a URL, false if not
   */
  _isUrl(str) {
    try {
      new URL(str);
      return true;
    } catch {
      return false;
    }
  },
  /**
   * Converts the URIs found in the given element to absolute URIs: the
   * `href` of each <a>, and the `src`, `poster` and `srcset` attributes of
   * each <img>, <picture>, <figure>, <video>, <audio> and <source>.
   * Hash-only (#ref) URIs are left alone when the base URI equals the
   * document URI. Links whose href starts with "javascript:" are replaced
   * by their content.
   *
   * @param Element articleContent The element whose descendants are fixed
   * @return void
   */
  _fixRelativeUris(articleContent) {
    var baseURI = this._doc.baseURI;
    var documentURI = this._doc.documentURI;
    // Resolves `uri` against the document's base URI; returns it unchanged
    // when it is a hash link on this document or when resolution fails.
    function toAbsoluteURI(uri) {
      // Leave hash links alone if the base URI matches the document URI:
      if (baseURI == documentURI && uri.charAt(0) == "#") {
        return uri;
      }

      // Otherwise, resolve against base URI:
      try {
        return new URL(uri, baseURI).href;
      } catch (ex) {
        // Something went wrong, just return the original:
      }
      return uri;
    }

    var links = this._getAllNodesWithTag(articleContent, ["a"]);
    this._forEachNode(links, function (link) {
      var href = link.getAttribute("href");
      // Links without an href (or with an empty one) are left untouched.
      if (href) {
        // Remove links with javascript: URIs, since
        // they won't work after scripts have been removed from the page.
        if (href.indexOf("javascript:") === 0) {
          // if the link only contains simple text content, it can be converted to a text node
          if (
            link.childNodes.length === 1 &&
            link.childNodes[0].nodeType === this.TEXT_NODE
          ) {
            var text = this._doc.createTextNode(link.textContent);
            link.parentNode.replaceChild(text, link);
          } else {
            // if the link has multiple children, they should all be preserved
            var container = this._doc.createElement("span");
            // Move the children one by one into the replacement <span>.
            while (link.firstChild) {
              container.appendChild(link.firstChild);
            }
            link.parentNode.replaceChild(container, link);
          }
        } else {
          link.setAttribute("href", toAbsoluteURI(href));
        }
      }
    });

    var medias = this._getAllNodesWithTag(articleContent, [
      "img",
      "picture",
      "figure",
      "video",
      "audio",
      "source",
    ]);

    this._forEachNode(medias, function (media) {
      var src = media.getAttribute("src");
      var poster = media.getAttribute("poster");
      var srcset = media.getAttribute("srcset");

      if (src) {
        media.setAttribute("src", toAbsoluteURI(src));
      }

      if (poster) {
        media.setAttribute("poster", toAbsoluteURI(poster));
      }

      if (srcset) {
        // Rewrite each candidate of the srcset: p1 is the URL, p2 the
        // optional width/density descriptor, p3 the trailing separator.
        var newSrcset = srcset.replace(
          this.REGEXPS.srcsetUrl,
          function (_, p1, p2, p3) {
            return toAbsoluteURI(p1) + (p2 || "") + p3;
          }
        );

        media.setAttribute("srcset", newSrcset);
      }
    });
  },

  _simplifyNestedElements(articleContent) {
    var node = articleContent;

    while (node) {
      if (
        node.parentNode &&
        ["DIV", "SECTION"].includes(node.tagName) &&
        !(node.id && node.id.startsWith("readability"))
      ) {
        if (this._isElementWithoutContent(node)) {
          node = this._removeAndGetNext(node);
          continue;
        } else if (
          this._hasSingleTagInsideElement(node, "DIV") ||
          this._hasSingleTagInsideElement(node, "SECTION")
        ) {
          var child = node.children[0];
          for (var i = 0; i < node.attributes.length; i++) {
            child.setAttributeNode(node.attributes[i].cloneNode());
          }
          node.parentNode.replaceChild(child, node);
          node = child;
          continue;
        }
      }

      node = this._getNextNode(node);
    }
  },

  /**
   * Get the article title as an H1.
   *
   * @return string
   **/
  _getArticleTitle() {
    var doc = this._doc;
    var curTitle = "";
    var origTitle = "";

    try {
      curTitle = origTitle = doc.title.trim();

      // If they had an element with id "title" in their HTML
      if (typeof curTitle !== "string") {
        curTitle = origTitle = this._getInnerText(
          doc.getElementsByTagName("title")[0]
        );
      }
    } catch (e) {
      /* ignore exceptions setting the title. */
    }

    var titleHadHierarchicalSeparators = false;
    function wordCount(str) {
      return str.split(/\s+/).length;
    }

    // If there's a separator in the title, first remove the final part
    const titleSeparators = /\|\-–—\\\/>»/.source;
    if (new RegExp(`\\s[${titleSeparators}]\\s`).test(curTitle)) {
      titleHadHierarchicalSeparators = /\s[\\\/>»]\s/.test(curTitle);
      let allSeparators = Array.from(
        origTitle.matchAll(new RegExp(`\\s[${titleSeparators}]\\s`, "gi"))
      );
      curTitle = origTitle.substring(0, allSeparators.pop().index);

      // If the resulting title is too short, remove the first part instead:
      if (wordCount(curTitle) < 3) {
        curTitle = origTitle.replace(
          new RegExp(`^[^${titleSeparators}]*[${titleSeparators}]`, "gi"),
          ""
        );
      }
    } else if (curTitle.includes(": ")) {
      // Check if we have an heading containing this exact string, so we
      // could assume it's the full title.
      var headings = this._getAllNodesWithTag(doc, ["h1", "h2"]);
      var trimmedTitle = curTitle.trim();
      var match = this._someNode(headings, function (heading) {
        return heading.textContent.trim() === trimmedTitle;
      });

      // If we don't, let's extract the title out of the original title string.
      if (!match) {
        curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);

        // If the title is now too short, try the first colon instead:
        if (wordCount(curTitle) < 3) {
          curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
          // But if we have too many words before the colon there's something weird
          // with the titles and the H tags so let's just use the original title instead
        } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
          curTitle = origTitle;
        }
      }
    } else if (curTitle.length > 150 || curTitle.length < 15) {
      var hOnes = doc.getElementsByTagName("h1");

      if (hOnes.length === 1) {
        curTitle = this._getInnerText(hOnes[0]);
      }
    }

    curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
    // If we now have 4 words or fewer as our title, and either no
    // 'hierarchical' separators (\, /, > or ») were found in the original
    // title or we decreased the number of words by more than 1 word, use
    // the original title.
    var curTitleWordCount = wordCount(curTitle);
    if (
      curTitleWordCount <= 4 &&
      (!titleHadHierarchicalSeparators ||
        curTitleWordCount !=
          wordCount(
            origTitle.replace(new RegExp(`\\s[${titleSeparators}]\\s`, "g"), "")
          ) -
            1)
    ) {
      curTitle = origTitle;
    }

    return curTitle;
  },

  /**
   * Prepare the HTML document for readability to scrape it.
   * This includes things like stripping javascript, CSS, and handling terrible markup.
   *
   * @return void
   **/
  _prepDocument() {
    var doc = this._doc;

    // Remove all style tags in head
    this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));

    if (doc.body) {
      this._replaceBrs(doc.body);
    }

    this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
  },

  /**
   * Finds the next node, starting from the given node, and ignoring
   * whitespace in between. If the given node is an element, the same node is
   * returned.
   */
  _nextNode(node) {
    var next = node;
    while (
      next &&
      next.nodeType != this.ELEMENT_NODE &&
      this.REGEXPS.whitespace.test(next.textContent)
    ) {
      next = next.nextSibling;
    }
    return next;
  },

  /**
   * Replaces 2 or more successive <br> elements with a single <p>.
   * Whitespace between <br> elements are ignored. For example:
   *   <div>foo<br>bar<br> <br><br>abc</div>
   * will become:
   *   <div>foo<br>bar<p>abc</p></div>
   */
  _replaceBrs(elem) {
    this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
      var next = br.nextSibling;

      // Whether 2 or more <br> elements have been found and replaced with a
      // <p> block.
      var replaced = false;

      // If we find a <br> chain, remove the <br>s until we hit another node
      // or non-whitespace. This leaves behind the first <br> in the chain
      // (which will be replaced with a <p> later).
      while ((next = this._nextNode(next)) && next.tagName == "BR") {
        replaced = true;
        var brSibling = next.nextSibling;
        next.remove();
        next = brSibling;
      }

      // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
      // all sibling nodes as children of the <p> until we hit another <br>
      // chain.
      if (replaced) {
        var p = this._doc.createElement("p");
        br.parentNode.replaceChild(p, br);

        next = p.nextSibling;
        while (next) {
          // If we've hit another <br><br>, we're done adding children to this <p>.
          if (next.tagName == "BR") {
            var nextElem = this._nextNode(next.nextSibling);
            if (nextElem && nextElem.tagName == "BR") {
              break;
            }
          }

          if (!this._isPhrasingContent(next)) {
            break;
          }

          // Otherwise, make this node a child of the new <p>.
          var sibling = next.nextSibling;
          p.appendChild(next);
          next = sibling;
        }

        while (p.lastChild && this._isWhitespace(p.lastChild)) {
          p.lastChild.remove();
        }

        if (p.parentNode.tagName === "P") {
          this._setNodeTag(p.parentNode, "DIV");
        }
      }
    });
  },

  _setNodeTag(node, tag) {
    this.log("_setNodeTag", node, tag);
    if (this._docJSDOMParser) {
      node.localName = tag.toLowerCase();
      node.tagName = tag.toUpperCase();
      return node;
    }

    var replacement = node.ownerDocument.createElement(tag);
    while (node.firstChild) {
      replacement.appendChild(node.firstChild);
    }
    node.parentNode.replaceChild(replacement, node);
    if (node.readability) {
      replacement.readability = node.readability;
    }

    for (var i = 0; i < node.attributes.length; i++) {
      replacement.setAttributeNode(node.attributes[i].cloneNode());
    }
    return replacement;
  },

  /**
   * Prepare the article node for display. Clean out any inline styles,
   * iframes, forms, strip extraneous <p> tags, etc.
   *
   * @param Element
   * @return void
   **/
  _prepArticle(articleContent) {
    this._cleanStyles(articleContent);

    // Check for data tables before we continue, to avoid removing items in
    // those tables, which will often be isolated even though they're
    // visually linked to other content-ful elements (text, images, etc.).
    this._markDataTables(articleContent);

    this._fixLazyImages(articleContent);

    // Clean out junk from the article content
    this._cleanConditionally(articleContent, "form");
    this._cleanConditionally(articleContent, "fieldset");
    this._clean(articleContent, "object");
    this._clean(articleContent, "embed");
    this._clean(articleContent, "footer");
    this._clean(articleContent, "link");
    this._clean(articleContent, "aside");

    // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
    // which means we don't remove the top candidates even they have "share".

    var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;

    this._forEachNode(articleContent.children, function (topCandidate) {
      this._cleanMatchedNodes(topCandidate, function (node, matchString) {
        return (
          this.REGEXPS.shareElements.test(matchString) &&
          node.textContent.length < shareElementThreshold
        );
      });
    });

    this._clean(articleContent, "iframe");
    this._clean(articleContent, "input");
    this._clean(articleContent, "textarea");
    this._clean(articleContent, "select");
    this._clean(articleContent, "button");
    this._cleanHeaders(articleContent);

    // Do these last as the previous stuff may have removed junk
    // that will affect these
    this._cleanConditionally(articleContent, "table");
    this._cleanConditionally(articleContent, "ul");
    this._cleanConditionally(articleContent, "div");

    // replace H1 with H2 as H1 should be only title that is displayed separately
    this._replaceNodeTags(
      this._getAllNodesWithTag(articleContent, ["h1"]),
      "h2"
    );

    // Remove extra paragraphs
    this._removeNodes(
      this._getAllNodesWithTag(articleContent, ["p"]),
      function (paragraph) {
        // At this point, nasty iframes have been removed; only embedded video
        // ones remain.
        var contentElementCount = this._getAllNodesWithTag(paragraph, [
          "img",
          "embed",
          "object",
          "iframe",
        ]).length;
        return (
          contentElementCount === 0 && !this._getInnerText(paragraph, false)
        );
      }
    );

    this._forEachNode(
      this._getAllNodesWithTag(articleContent, ["br"]),
      function (br) {
        var next = this._nextNode(br.nextSibling);
        if (next && next.tagName == "P") {
          br.remove();
        }
      }
    );

    // Remove single-cell tables
    this._forEachNode(
      this._getAllNodesWithTag(articleContent, ["table"]),
      function (table) {
        var tbody = this._hasSingleTagInsideElement(table, "TBODY")
          ? table.firstElementChild
          : table;
        if (this._hasSingleTagInsideElement(tbody, "TR")) {
          var row = tbody.firstElementChild;
          if (this._hasSingleTagInsideElement(row, "TD")) {
            var cell = row.firstElementChild;
            cell = this._setNodeTag(
              cell,
              this._everyNode(cell.childNodes, this._isPhrasingContent)
                ? "P"
                : "DIV"
            );
            table.parentNode.replaceChild(cell, table);
          }
        }
      }
    );
  },

  /**
   * Initialize a node with the readability object. Also checks the
   * className/id for special names to add to its score.
   *
   * @param Element
   * @return void
   **/
  _initializeNode(node) {
    node.readability = { contentScore: 0 };

    switch (node.tagName) {
      case "DIV":
        node.readability.contentScore += 5;
        break;

      case "PRE":
      case "TD":
      case "BLOCKQUOTE":
        node.readability.contentScore += 3;
        break;

      case "ADDRESS":
      case "OL":
      case "UL":
      case "DL":
      case "DD":
      case "DT":
      case "LI":
      case "FORM":
        node.readability.contentScore -= 3;
        break;

      case "H1":
      case "H2":
      case "H3":
      case "H4":
      case "H5":
      case "H6":
      case "TH":
        node.readability.contentScore -= 5;
        break;
    }

    node.readability.contentScore += this._getClassWeight(node);
  },

  _removeAndGetNext(node) {
    var nextNode = this._getNextNode(node, true);
    node.remove();
    return nextNode;
  },

  /**
   * Traverse the DOM from node to node, starting at the node passed in.
   * Pass true for the second parameter to indicate this node itself
   * (and its kids) are going away, and we want the next node over.
   *
   * Calling this in a loop will traverse the DOM depth-first.
   *
   * @param {Element} node
   * @param {boolean} ignoreSelfAndKids
   * @return {Element}
   */
  _getNextNode(node, ignoreSelfAndKids) {
    // First check for kids if those aren't being ignored
    if (!ignoreSelfAndKids && node.firstElementChild) {
      return node.firstElementChild;
    }
    // Then for siblings...
    if (node.nextElementSibling) {
      return node.nextElementSibling;
    }
    // And finally, move up the parent chain *and* find a sibling
    // (because this is depth-first traversal, we will have already
    // seen the parent nodes themselves).
    do {
      node = node.parentNode;
    } while (node && !node.nextElementSibling);
    return node && node.nextElementSibling;
  },

  // compares second text to first one
  // 1 = same text, 0 = completely different text
  // works the way that it splits both texts into words and then finds words that are unique in second text
  // the result is given by the lower length of unique parts
  _textSimilarity(textA, textB) {
    var tokensA = textA
      .toLowerCase()
      .split(this.REGEXPS.tokenize)
      .filter(Boolean);
    var tokensB = textB
      .toLowerCase()
      .split(this.REGEXPS.tokenize)
      .filter(Boolean);
    if (!tokensA.length || !tokensB.length) {
      return 0;
    }
    var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
    var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
    return 1 - distanceB;
  },

  /**
   * Checks whether an element node contains a valid byline
   *
   * @param node {Element}
   * @param matchString {string}
   * @return boolean
   */
  _isValidByline(node, matchString) {
    var rel = node.getAttribute("rel");
    var itemprop = node.getAttribute("itemprop");
    var bylineLength = node.textContent.trim().length;

    return (
      (rel === "author" ||
        (itemprop && itemprop.includes("author")) ||
        this.REGEXPS.byline.test(matchString)) &&
      !!bylineLength &&
      bylineLength < 100
    );
  },

  _getNodeAncestors(node, maxDepth) {
    maxDepth = maxDepth || 0;
    var i = 0,
      ancestors = [];
    while (node.parentNode) {
      ancestors.push(node.parentNode);
      if (maxDepth && ++i === maxDepth) {
        break;
      }
      node = node.parentNode;
    }
    return ancestors;
  },

  /***
   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
   *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
   *
   * @param page a document to run upon. Needs to be a full document, complete with body.
   * @return Element
   **/
  /* eslint-disable-next-line complexity */
  _grabArticle(page) {
    this.log("**** grabArticle ****");
    var doc = this._doc;
    var isPaging = page !== null;
    page = page ? page : this._doc.body;

    // We can't grab an article if we don't have a page!
    if (!page) {
      this.log("No body found in document. Abort.");
      return null;
    }

    var pageCacheHtml = page.innerHTML;

    while (true) {
      this.log("Starting grabArticle loop");
      var stripUnlikelyCandidates = this._flagIsActive(
        this.FLAG_STRIP_UNLIKELYS
      );

      // First, node prepping. Trash nodes that look cruddy (like ones with the
      // class name "comment", etc), and turn divs into P tags where they have been
      // used inappropriately (as in, where they contain no other block level elements.)
      var elementsToScore = [];
      var node = this._doc.documentElement;

      let shouldRemoveTitleHeader = true;

      while (node) {
        if (node.tagName === "HTML") {
          this._articleLang = node.getAttribute("lang");
        }

        var matchString = node.className + " " + node.id;

        if (!this._isProbablyVisible(node)) {
          this.log("Removing hidden node - " + matchString);
          node = this._removeAndGetNext(node);
          continue;
        }

        // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
        if (
          node.getAttribute("aria-modal") == "true" &&
          node.getAttribute("role") == "dialog"
        ) {
          node = this._removeAndGetNext(node);
          continue;
        }

        // If we don't have a byline yet check to see if this node is a byline; if it is store the byline and remove the node.
        if (
          !this._articleByline &&
          !this._metadata.byline &&
          this._isValidByline(node, matchString)
        ) {
          // Find child node matching [itemprop="name"] and use that if it exists for a more accurate author name byline
          var endOfSearchMarkerNode = this._getNextNode(node, true);
          var next = this._getNextNode(node);
          var itemPropNameNode = null;
          while (next && next != endOfSearchMarkerNode) {
            var itemprop = next.getAttribute("itemprop");
            if (itemprop && itemprop.includes("name")) {
              itemPropNameNode = next;
              break;
            } else {
              next = this._getNextNode(next);
            }
          }
          this._articleByline = (itemPropNameNode ?? node).textContent.trim();
          node = this._removeAndGetNext(node);
          continue;
        }

        if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
          this.log(
            "Removing header: ",
            node.textContent.trim(),
            this._articleTitle.trim()
          );
          shouldRemoveTitleHeader = false;
          node = this._removeAndGetNext(node);
          continue;
        }

        // Remove unlikely candidates
        if (stripUnlikelyCandidates) {
          if (
            this.REGEXPS.unlikelyCandidates.test(matchString) &&
            !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
            !this._hasAncestorTag(node, "table") &&
            !this._hasAncestorTag(node, "code") &&
            node.tagName !== "BODY" &&
            node.tagName !== "A"
          ) {
            this.log("Removing unlikely candidate - " + matchString);
            node = this._removeAndGetNext(node);
            continue;
          }

          if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
            this.log(
              "Removing content with role " +
                node.getAttribute("role") +
                " - " +
                matchString
            );
            node = this._removeAndGetNext(node);
            continue;
          }
        }

        // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
        if (
          (node.tagName === "DIV" ||
            node.tagName === "SECTION" ||
            node.tagName === "HEADER" ||
            node.tagName === "H1" ||
            node.tagName === "H2" ||
            node.tagName === "H3" ||
            node.tagName === "H4" ||
            node.tagName === "H5" ||
            node.tagName === "H6") &&
          this._isElementWithoutContent(node)
        ) {
          node = this._removeAndGetNext(node);
          continue;
        }

        if (this.DEFAULT_TAGS_TO_SCORE.includes(node.tagName)) {
          elementsToScore.push(node);
        }

        // Turn all divs that don't have children block level elements into p's
        if (node.tagName === "DIV") {
          // Put phrasing content into paragraphs.
          var childNode = node.firstChild;
          while (childNode) {
            var nextSibling = childNode.nextSibling;
            if (this._isPhrasingContent(childNode)) {
              var fragment = doc.createDocumentFragment();
              // Collect all consecutive phrasing content into a fragment.
              do {
                nextSibling = childNode.nextSibling;
                fragment.appendChild(childNode);
                childNode = nextSibling;
              } while (childNode && this._isPhrasingContent(childNode));

              // Trim leading and trailing whitespace from the fragment.
              while (
                fragment.firstChild &&
                this._isWhitespace(fragment.firstChild)
              ) {
                fragment.firstChild.remove();
              }
              while (
                fragment.lastChild &&
                this._isWhitespace(fragment.lastChild)
              ) {
                fragment.lastChild.remove();
              }

              // If the fragment contains anything, wrap it in a paragraph and
              // insert it before the next non-phrasing node.
              if (fragment.firstChild) {
                var p = doc.createElement("p");
                p.appendChild(fragment);
                node.insertBefore(p, nextSibling);
              }
            }
            childNode = nextSibling;
          }

          // Sites like http://mobile.slate.com encloses each paragraph with a DIV
          // element. DIVs with only a P element inside and no text content can be
          // safely converted into plain P elements to avoid confusing the scoring
          // algorithm with DIVs with are, in practice, paragraphs.
          if (
            this._hasSingleTagInsideElement(node, "P") &&
            this._getLinkDensity(node) < 0.25
          ) {
            var newNode = node.children[0];
            node.parentNode.replaceChild(newNode, node);
            node = newNode;
            elementsToScore.push(node);
          } else if (!this._hasChildBlockElement(node)) {
            node = this._setNodeTag(node, "P");
            elementsToScore.push(node);
          }
        }
        node = this._getNextNode(node);
      }

      /**
       * Loop through all paragraphs, and assign a score to them based on how content-y they look.
       * Then add their score to their parent node.
       *
       * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
       **/
      var candidates = [];
      this._forEachNode(elementsToScore, function (elementToScore) {
        if (
          !elementToScore.parentNode ||
          typeof elementToScore.parentNode.tagName === "undefined"
        ) {
          return;
        }

        // If this paragraph is less than 25 characters, don't even count it.
        var innerText = this._getInnerText(elementToScore);
        if (innerText.length < 25) {
          return;
        }

        // Exclude nodes with no ancestor.
        var ancestors = this._getNodeAncestors(elementToScore, 5);
        if (ancestors.length === 0) {
          return;
        }

        var contentScore = 0;

        // Add a point for the paragraph itself as a base.
        contentScore += 1;

        // Add points for any commas within this paragraph.
        contentScore += innerText.split(this.REGEXPS.commas).length;

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        contentScore += Math.min(Math.floor(innerText.length / 100), 3);

        // Initialize and score ancestors.
        this._forEachNode(ancestors, function (ancestor, level) {
          if (
            !ancestor.tagName ||
            !ancestor.parentNode ||
            typeof ancestor.parentNode.tagName === "undefined"
          ) {
            return;
          }

          if (typeof ancestor.readability === "undefined") {
            this._initializeNode(ancestor);
            candidates.push(ancestor);
          }

          // Node score divider:
          // - parent:             1 (no division)
          // - grandparent:        2
          // - great grandparent+: ancestor level * 3
          if (level === 0) {
            var scoreDivider = 1;
          } else if (level === 1) {
            scoreDivider = 2;
          } else {
            scoreDivider = level * 3;
          }
          ancestor.readability.contentScore += contentScore / scoreDivider;
        });
      });

      // After we've calculated scores, loop through all of the possible
      // candidate nodes we found and find the one with the highest score.
      var topCandidates = [];
      for (var c = 0, cl = candidates.length; c < cl; c += 1) {
        var candidate = candidates[c];

        // Scale the final candidates score based on link density. Good content
        // should have a relatively small link density (5% or less) and be mostly
        // unaffected by this operation.
        var candidateScore =
          candidate.readability.contentScore *
          (1 - this._getLinkDensity(candidate));
        candidate.readability.contentScore = candidateScore;

        this.log("Candidate:", candidate, "with score " + candidateScore);

        for (var t = 0; t < this._nbTopCandidates; t++) {
          var aTopCandidate = topCandidates[t];

          if (
            !aTopCandidate ||
            candidateScore > aTopCandidate.readability.contentScore
          ) {
            topCandidates.splice(t, 0, candidate);
            if (topCandidates.length > this._nbTopCandidates) {
              topCandidates.pop();
            }
            break;
          }
        }
      }

      var topCandidate = topCandidates[0] || null;
      var neededToCreateTopCandidate = false;
      var parentOfTopCandidate;

      // If we still have no top candidate, just use the body as a last resort.
      // We also have to copy the body node so it is something we can modify.
      if (topCandidate === null || topCandidate.tagName === "BODY") {
        // Move all of the page's children into topCandidate
        topCandidate = doc.createElement("DIV");
        neededToCreateTopCandidate = true;
        // Move everything (not just elements, also text nodes etc.) into the container
        // so we even include text directly in the body:
        while (page.firstChild) {
          this.log("Moving child out:", page.firstChild);
          topCandidate.appendChild(page.firstChild);
        }

        page.appendChild(topCandidate);

        this._initializeNode(topCandidate);
      } else if (topCandidate) {
        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite closed with current `topCandidate` node.
        var alternativeCandidateAncestors = [];
        for (var i = 1; i < topCandidates.length; i++) {
          if (
            topCandidates[i].readability.contentScore /
              topCandidate.readability.contentScore >=
            0.75
          ) {
            alternativeCandidateAncestors.push(
              this._getNodeAncestors(topCandidates[i])
            );
          }
        }
        var MINIMUM_TOPCANDIDATES = 3;
        if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
          parentOfTopCandidate = topCandidate.parentNode;
          while (parentOfTopCandidate.tagName !== "BODY") {
            var listsContainingThisAncestor = 0;
            for (
              var ancestorIndex = 0;
              ancestorIndex < alternativeCandidateAncestors.length &&
              listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
              ancestorIndex++
            ) {
              listsContainingThisAncestor += Number(
                alternativeCandidateAncestors[ancestorIndex].includes(
                  parentOfTopCandidate
                )
              );
            }
            if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
              topCandidate = parentOfTopCandidate;
              break;
            }
            parentOfTopCandidate = parentOfTopCandidate.parentNode;
          }
        }
        if (!topCandidate.readability) {
          this._initializeNode(topCandidate);
        }

        // Because of our bonus system, parents of candidates might have scores
        // themselves. They get half of the node. There won't be nodes with higher
        // scores than our topCandidate, but if we see the score going *up* in the first
        // few steps up the tree, that's a decent sign that there might be more content
        // lurking in other places that we want to unify in. The sibling stuff
        // below does some of that - but only if we've looked high enough up the DOM
        // tree.
        parentOfTopCandidate = topCandidate.parentNode;
        var lastScore = topCandidate.readability.contentScore;
        // The scores shouldn't get too low.
        var scoreThreshold = lastScore / 3;
        while (parentOfTopCandidate.tagName !== "BODY") {
          if (!parentOfTopCandidate.readability) {
            parentOfTopCandidate = parentOfTopCandidate.parentNode;
            continue;
          }
          var parentScore = parentOfTopCandidate.readability.contentScore;
          if (parentScore < scoreThreshold) {
            break;
          }
          if (parentScore > lastScore) {
            // Alright! We found a better parent to use.
            topCandidate = parentOfTopCandidate;
            break;
          }
          lastScore = parentOfTopCandidate.readability.contentScore;
          parentOfTopCandidate = parentOfTopCandidate.parentNode;
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        parentOfTopCandidate = topCandidate.parentNode;
        while (
          parentOfTopCandidate.tagName != "BODY" &&
          parentOfTopCandidate.children.length == 1
        ) {
          topCandidate = parentOfTopCandidate;
          parentOfTopCandidate = topCandidate.parentNode;
        }
        if (!topCandidate.readability) {
          this._initializeNode(topCandidate);
        }
      }

      // Now that we have the top candidate, look through its siblings for content
      // that might also be related. Things like preambles, content split by ads
      // that we removed, etc.
      var articleContent = doc.createElement("DIV");
      if (isPaging) {
        articleContent.id = "readability-content";
      }

      var siblingScoreThreshold = Math.max(
        10,
        topCandidate.readability.contentScore * 0.2
      );
      // Keep potential top candidate's parent node to try to get text direction of it later.
      parentOfTopCandidate = topCandidate.parentNode;
      var siblings = parentOfTopCandidate.children;

      for (var s = 0, sl = siblings.length; s < sl; s++) {
        var sibling = siblings[s];
        var append = false;

        this.log(
          "Looking at sibling node:",
          sibling,
          sibling.readability
            ? "with score " + sibling.readability.contentScore
            : ""
        );
        this.log(
          "Sibling has score",
          sibling.readability ? sibling.readability.contentScore : "Unknown"
        );

        if (sibling === topCandidate) {
          append = true;
        } else {
          var contentBonus = 0;

          // Give a bonus if sibling nodes and top candidates have the example same classname
          if (
            sibling.className === topCandidate.className &&
            topCandidate.className !== ""
          ) {
            contentBonus += topCandidate.readability.contentScore * 0.2;
          }

          if (
            sibling.readability &&
            sibling.readability.contentScore + contentBonus >=
              siblingScoreThreshold
          ) {
            append = true;
          } else if (sibling.nodeName === "P") {
            var linkDensity = this._getLinkDensity(sibling);
            var nodeContent = this._getInnerText(sibling);
            var nodeLength = nodeContent.length;

            if (nodeLength > 80 && linkDensity < 0.25) {
              append = true;
            } else if (
              nodeLength < 80 &&
              nodeLength > 0 &&
              linkDensity === 0 &&
              nodeContent.search(/\.( |$)/) !== -1
            ) {
              append = true;
            }
          }
        }

        if (append) {
          this.log("Appending node:", sibling);

          if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) {
            // We have a node that isn't a common block level element, like a form or td tag.
            // Turn it into a div so it doesn't get filtered out later by accident.
            this.log("Altering sibling:", sibling, "to div.");

            sibling = this._setNodeTag(sibling, "DIV");
          }

          articleContent.appendChild(sibling);
          // Fetch children again to make it compatible
          // with DOM parsers without live collection support.
          siblings = parentOfTopCandidate.children;
          // siblings is a reference to the children array, and
          // sibling is removed from the array when we call appendChild().
          // As a result, we must revisit this index since the nodes
          // have been shifted.
          s -= 1;
          sl -= 1;
        }
      }

      if (this._debug) {
        this.log("Article content pre-prep: " + articleContent.innerHTML);
      }
      // So we have all of the content that we need. Now we clean it up for presentation.
      this._prepArticle(articleContent);
      if (this._debug) {
        this.log("Article content post-prep: " + articleContent.innerHTML);
      }

      if (neededToCreateTopCandidate) {
        // We already created a fake div thing, and there wouldn't have been any siblings left
        // for the previous loop, so there's no point trying to create a new div, and then
        // move all the children over. Just assign IDs and class names here. No need to append
        // because that already happened anyway.
        topCandidate.id = "readability-page-1";
        topCandidate.className = "page";
      } else {
        var div = doc.createElement("DIV");
        div.id = "readability-page-1";
        div.className = "page";
        while (articleContent.firstChild) {
          div.appendChild(articleContent.firstChild);
        }
        articleContent.appendChild(div);
      }

      if (this._debug) {
        this.log("Article content after paging: " + articleContent.innerHTML);
      }

      var parseSuccessful = true;

      // Now that we've gone through the full algorithm, check to see if
      // we got any meaningful content. If we didn't, we may need to re-run
      // grabArticle with different flags set. This gives us a higher likelihood of
      // finding the content, and the sieve approach gives us a higher likelihood of
      // finding the -right- content.
      var textLength = this._getInnerText(articleContent, true).length;
      if (textLength < this._charThreshold) {
        parseSuccessful = false;
        // eslint-disable-next-line no-unsanitized/property
        page.innerHTML = pageCacheHtml;

        this._attempts.push({
          articleContent,
          textLength,
        });

        if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
          this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
        } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
          this._removeFlag(this.FLAG_WEIGHT_CLASSES);
        } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
          this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
        } else {
          // No luck after removing flags, just return the longest text we found during the different loops
          this._attempts.sort(function (a, b) {
            return b.textLength - a.textLength;
          });

          // But first check if we actually have something
          if (!this._attempts[0].textLength) {
            return null;
          }

          articleContent = this._attempts[0].articleContent;
          parseSuccessful = true;
        }
      }

      if (parseSuccessful) {
        // Find out text direction from ancestors of final top candidate.
        var ancestors = [parentOfTopCandidate, topCandidate].concat(
          this._getNodeAncestors(parentOfTopCandidate)
        );
        this._someNode(ancestors, function (ancestor) {
          if (!ancestor.tagName) {
            return false;
          }
          var articleDir = ancestor.getAttribute("dir");
          if (articleDir) {
            this._articleDir = articleDir;
            return true;
          }
          return false;
        });
        return articleContent;
      }
    }
  },

  /**
   * Converts some of the common HTML entities in string to their corresponding characters.
   *
   * @param str {string} - a string to unescape.
   * @return string without HTML entity.
   */
  _unescapeHtmlEntities(str) {
    if (!str) {
      return str;
    }

    var htmlEscapeMap = this.HTML_ESCAPE_MAP;
    return str
      .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) {
        return htmlEscapeMap[tag];
      })
      .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) {
        var num = parseInt(hex || numStr, hex ? 16 : 10);

        // these character references are replaced by a conforming HTML parser
        if (num == 0 || num > 0x10ffff || (num >= 0xd800 && num <= 0xdfff)) {
          num = 0xfffd;
        }

        return String.fromCodePoint(num);
      });
  },

  /**
   * Try to extract metadata from JSON-LD object.
   * For now, only Schema.org objects of type Article or its subtypes are supported.
   * @return Object with any metadata that could be extracted (possibly none)
   */
  _getJSONLD(doc) {
    // Walks every <script> in `doc` and extracts article metadata from the
    // first `application/ld+json` block that yields a usable object.
    // Returns an object that may carry `title`, `byline`, `excerpt`,
    // `siteName` and `datePublished`; returns {} when nothing usable is found.
    var scripts = this._getAllNodesWithTag(doc, ["script"]);

    // Stays undefined until one script produces a result; the `!metadata`
    // guard below then makes all later scripts no-ops.
    var metadata;

    this._forEachNode(scripts, function (jsonLdElement) {
      if (
        !metadata &&
        jsonLdElement.getAttribute("type") === "application/ld+json"
      ) {
        // Everything below runs inside try/catch: malformed JSON or an
        // unexpected object shape (e.g. a non-string "@type") ends up in the
        // catch block, which only logs the message.
        try {
          // Strip CDATA markers if present
          var content = jsonLdElement.textContent.replace(
            /^\s*<!\[CDATA\[|\]\]>\s*$/g,
            ""
          );
          var parsed = JSON.parse(content);

          // A top-level array: use the first entry whose "@type" matches the
          // article-type pattern, or give up on this script.
          if (Array.isArray(parsed)) {
            parsed = parsed.find(it => {
              return (
                it["@type"] &&
                it["@type"].match(this.REGEXPS.jsonLdArticleTypes)
              );
            });
            if (!parsed) {
              return;
            }
          }

          // Only schema.org vocabularies are accepted, declared either as a
          // plain "@context" string or via "@context"."@vocab".
          var schemaDotOrgRegex = /^https?\:\/\/schema\.org\/?$/;
          var matches =
            (typeof parsed["@context"] === "string" &&
              parsed["@context"].match(schemaDotOrgRegex)) ||
            (typeof parsed["@context"] === "object" &&
              typeof parsed["@context"]["@vocab"] == "string" &&
              parsed["@context"]["@vocab"].match(schemaDotOrgRegex));

          if (!matches) {
            return;
          }

          // No "@type" on the root but a "@graph" array: look for the first
          // article-typed node inside the graph instead.
          if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
            parsed = parsed["@graph"].find(it => {
              return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes);
            });
          }

          if (
            !parsed ||
            !parsed["@type"] ||
            !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
          ) {
            return;
          }

          // From here on this script is the chosen one. NOTE: if anything
          // below throws, the partially filled object is still what gets
          // returned, because the catch block does not reset `metadata`.
          metadata = {};

          if (
            typeof parsed.name === "string" &&
            typeof parsed.headline === "string" &&
            parsed.name !== parsed.headline
          ) {
            // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
            // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
            // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.

            var title = this._getArticleTitle();
            var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
            var headlineMatches =
              this._textSimilarity(parsed.headline, title) > 0.75;

            // NOTE(review): unlike the two branches below, the title chosen
            // here is not trimmed.
            if (headlineMatches && !nameMatches) {
              metadata.title = parsed.headline;
            } else {
              metadata.title = parsed.name;
            }
          } else if (typeof parsed.name === "string") {
            metadata.title = parsed.name.trim();
          } else if (typeof parsed.headline === "string") {
            metadata.title = parsed.headline.trim();
          }
          // `author` may be a single object with a `name`, or an array of
          // such objects; array entries without a string `name` are dropped
          // and the remaining names are joined with ", ".
          if (parsed.author) {
            if (typeof parsed.author.name === "string") {
              metadata.byline = parsed.author.name.trim();
            } else if (
              Array.isArray(parsed.author) &&
              parsed.author[0] &&
              typeof parsed.author[0].name === "string"
            ) {
              metadata.byline = parsed.author
                .filter(function (author) {
                  return author && typeof author.name === "string";
                })
                .map(function (author) {
                  return author.name.trim();
                })
                .join(", ");
            }
          }
          if (typeof parsed.description === "string") {
            metadata.excerpt = parsed.description.trim();
          }
          if (parsed.publisher && typeof parsed.publisher.name === "string") {
            metadata.siteName = parsed.publisher.name.trim();
          }
          if (typeof parsed.datePublished === "string") {
            metadata.datePublished = parsed.datePublished.trim();
          }
        } catch (err) {
          // Best effort: a broken JSON-LD block must not abort parsing.
          this.log(err.message);
        }
      }
    });
    return metadata ? metadata : {};
  },

  /**
   * Attempts to get excerpt and byline metadata for the article.
   *
   * @param {Object} jsonld — object containing any metadata that
   * could be extracted from JSON-LD object.
   *
   * @return Object with optional "excerpt" and "byline" properties
   */
  _getArticleMetadata(jsonld) {
    var metadata = {};
    var values = {};
    var metaElements = this._doc.getElementsByTagName("meta");

    // property is a space-separated list of values
    var propertyPattern =
      /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;

    // name is a single value
    var namePattern =
      /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;

    // Find description tags.
    this._forEachNode(metaElements, function (element) {
      var elementName = element.getAttribute("name");
      var elementProperty = element.getAttribute("property");
      var content = element.getAttribute("content");
      if (!content) {
        return;
      }
      var matches = null;
      var name = null;

      if (elementProperty) {
        matches = elementProperty.match(propertyPattern);
        if (matches) {
          // Convert to lowercase, and remove any whitespace
          // so we can match below.
          name = matches[0].toLowerCase().replace(/\s/g, "");
          // multiple authors
          values[name] = content.trim();
        }
      }
      if (!matches && elementName && namePattern.test(elementName)) {
        name = elementName;
        if (content) {
          // Convert to lowercase, remove any whitespace, and convert dots
          // to colons so we can match below.
          name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
          values[name] = content.trim();
        }
      }
    });

    // get title
    metadata.title =
      jsonld.title ||
      values["dc:title"] ||
      values["dcterm:title"] ||
      values["og:title"] ||
      values["weibo:article:title"] ||
      values["weibo:webpage:title"] ||
      values.title ||
      values["twitter:title"] ||
      values["parsely-title"];

    if (!metadata.title) {
      metadata.title = this._getArticleTitle();
    }

    const articleAuthor =
      typeof values["article:author"] === "string" &&
      !this._isUrl(values["article:author"])
        ? values["article:author"]
        : undefined;

    // get author
    metadata.byline =
      jsonld.byline ||
      values["dc:creator"] ||
      values["dcterm:creator"] ||
      values.author ||
      values["parsely-author"] ||
      articleAuthor;

    // get description
    metadata.excerpt =
      jsonld.excerpt ||
      values["dc:description"] ||
      values["dcterm:description"] ||
      values["og:description"] ||
      values["weibo:article:description"] ||
      values["weibo:webpage:description"] ||
      values.description ||
      values["twitter:description"];

    // get site name
    metadata.siteName = jsonld.siteName || values["og:site_name"];

    // get article published time
    metadata.publishedTime =
      jsonld.datePublished ||
      values["article:published_time"] ||
      values["parsely-pub-date"] ||
      null;

    // in many sites the meta value is escaped with HTML entities,
    // so here we need to unescape it
    metadata.title = this._unescapeHtmlEntities(metadata.title);
    metadata.byline = this._unescapeHtmlEntities(metadata.byline);
    metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
    metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
    metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);

    return metadata;
  },

  /**
   * Check if node is image, or if node contains exactly only one image
   * whether as a direct child or as its descendants.
   *
   * @param Element
   **/
  _isSingleImage(node) {
    while (node) {
      if (node.tagName === "IMG") {
        return true;
      }
      if (node.children.length !== 1 || node.textContent.trim() !== "") {
        return false;
      }
      node = node.children[0];
    }
    return false;
  },

  /**
   * Find all <noscript> that are located after <img> nodes, and which contain only one
   * <img> element. Replace the first image with the image from inside the <noscript> tag,
   * and remove the <noscript> tag. This improves the quality of the images we use on
   * some sites (e.g. Medium).
   *
   * @param Element
   **/
  _unwrapNoscriptImages(doc) {
    // Find img without source or attributes that might contains image, and remove it.
    // This is done to prevent a placeholder img is replaced by img from noscript in next step.
    var imgs = Array.from(doc.getElementsByTagName("img"));
    this._forEachNode(imgs, function (img) {
      for (var i = 0; i < img.attributes.length; i++) {
        var attr = img.attributes[i];
        switch (attr.name) {
          case "src":
          case "srcset":
          case "data-src":
          case "data-srcset":
            return;
        }

        if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
          return;
        }
      }

      img.remove();
    });

    // Next find noscript and try to extract its image
    var noscripts = Array.from(doc.getElementsByTagName("noscript"));
    this._forEachNode(noscripts, function (noscript) {
      // Parse content of noscript and make sure it only contains image
      if (!this._isSingleImage(noscript)) {
        return;
      }
      var tmp = doc.createElement("div");
      // We're running in the document context, and using unmodified
      // document contents, so doing this should be safe.
      // (Also we heavily discourage people from allowing script to
      // run at all in this document...)
      // eslint-disable-next-line no-unsanitized/property
      tmp.innerHTML = noscript.innerHTML;

      // If noscript has previous sibling and it only contains image,
      // replace it with noscript content. However we also keep old
      // attributes that might contains image.
      var prevElement = noscript.previousElementSibling;
      if (prevElement && this._isSingleImage(prevElement)) {
        var prevImg = prevElement;
        if (prevImg.tagName !== "IMG") {
          prevImg = prevElement.getElementsByTagName("img")[0];
        }

        var newImg = tmp.getElementsByTagName("img")[0];
        for (var i = 0; i < prevImg.attributes.length; i++) {
          var attr = prevImg.attributes[i];
          if (attr.value === "") {
            continue;
          }

          if (
            attr.name === "src" ||
            attr.name === "srcset" ||
            /\.(jpg|jpeg|png|webp)/i.test(attr.value)
          ) {
            if (newImg.getAttribute(attr.name) === attr.value) {
              continue;
            }

            var attrName = attr.name;
            if (newImg.hasAttribute(attrName)) {
              attrName = "data-old-" + attrName;
            }

            newImg.setAttribute(attrName, attr.value);
          }
        }

        noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
      }
    });
  },

  /**
   * Removes script tags from the document.
   *
   * @param Element
   **/
  _removeScripts(doc) {
    this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
  },

  /**
   * Check if this node has only whitespace and a single element with given tag
   * Returns false if the DIV node contains non-empty text nodes
   * or if it contains no element with given tag or more than 1 element.
   *
   * @param Element
   * @param string tag of child element
   **/
  _hasSingleTagInsideElement(element, tag) {
    // There should be exactly 1 element child with given tag
    if (element.children.length != 1 || element.children[0].tagName !== tag) {
      return false;
    }

    // And there should be no text nodes with real content
    return !this._someNode(element.childNodes, function (node) {
      return (
        node.nodeType === this.TEXT_NODE &&
        this.REGEXPS.hasContent.test(node.textContent)
      );
    });
  },

  _isElementWithoutContent(node) {
    return (
      node.nodeType === this.ELEMENT_NODE &&
      !node.textContent.trim().length &&
      (!node.children.length ||
        node.children.length ==
          node.getElementsByTagName("br").length +
            node.getElementsByTagName("hr").length)
    );
  },

  /**
   * Determine whether element has any children block level elements.
   *
   * @param Element
   */
  _hasChildBlockElement(element) {
    return this._someNode(element.childNodes, function (node) {
      return (
        this.DIV_TO_P_ELEMS.has(node.tagName) ||
        this._hasChildBlockElement(node)
      );
    });
  },

  /***
   * Determine if a node qualifies as phrasing content.
   * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
   **/
  _isPhrasingContent(node) {
    return (
      node.nodeType === this.TEXT_NODE ||
      this.PHRASING_ELEMS.includes(node.tagName) ||
      ((node.tagName === "A" ||
        node.tagName === "DEL" ||
        node.tagName === "INS") &&
        this._everyNode(node.childNodes, this._isPhrasingContent))
    );
  },

  _isWhitespace(node) {
    return (
      (node.nodeType === this.TEXT_NODE &&
        node.textContent.trim().length === 0) ||
      (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR")
    );
  },

  /**
   * Get the inner text of a node - cross browser compatibly.
   * This also strips out any excess whitespace to be found.
   *
   * @param Element
   * @param Boolean normalizeSpaces (default: true)
   * @return string
   **/
  _getInnerText(e, normalizeSpaces) {
    normalizeSpaces =
      typeof normalizeSpaces === "undefined" ? true : normalizeSpaces;
    var textContent = e.textContent.trim();

    if (normalizeSpaces) {
      return textContent.replace(this.REGEXPS.normalize, " ");
    }
    return textContent;
  },

  /**
   * Get the number of times a string s appears in the node e.
   *
   * @param Element
   * @param string - what to split on. Default is ","
   * @return number (integer)
   **/
  _getCharCount(e, s) {
    s = s || ",";
    return this._getInnerText(e).split(s).length - 1;
  },

  /**
   * Remove the style attribute on every e and under.
   * TODO: Test if getElementsByTagName(*) is faster.
   *
   * @param Element
   * @return void
   **/
  _cleanStyles(e) {
    if (!e || e.tagName.toLowerCase() === "svg") {
      return;
    }

    // Remove `style` and deprecated presentational attributes
    for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
      e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
    }

    if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)) {
      e.removeAttribute("width");
      e.removeAttribute("height");
    }

    var cur = e.firstElementChild;
    while (cur !== null) {
      this._cleanStyles(cur);
      cur = cur.nextElementSibling;
    }
  },

  /**
   * Get the density of links as a percentage of the content
   * This is the amount of text that is inside a link divided by the total text in the node.
   *
   * @param Element
   * @return number (float)
   **/
  _getLinkDensity(element) {
    var textLength = this._getInnerText(element).length;
    if (textLength === 0) {
      return 0;
    }

    var linkLength = 0;

    // XXX implement _reduceNodeList?
    this._forEachNode(element.getElementsByTagName("a"), function (linkNode) {
      var href = linkNode.getAttribute("href");
      var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
      linkLength += this._getInnerText(linkNode).length * coefficient;
    });

    return linkLength / textLength;
  },

  /**
   * Get an elements class/id weight. Uses regular expressions to tell if this
   * element looks good or bad.
   *
   * @param Element
   * @return number (Integer)
   **/
  _getClassWeight(e) {
    if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
      return 0;
    }

    var weight = 0;

    // Look for a special classname
    if (typeof e.className === "string" && e.className !== "") {
      if (this.REGEXPS.negative.test(e.className)) {
        weight -= 25;
      }

      if (this.REGEXPS.positive.test(e.className)) {
        weight += 25;
      }
    }

    // Look for a special ID
    if (typeof e.id === "string" && e.id !== "") {
      if (this.REGEXPS.negative.test(e.id)) {
        weight -= 25;
      }

      if (this.REGEXPS.positive.test(e.id)) {
        weight += 25;
      }
    }

    return weight;
  },

  /**
   * Clean a node of all elements of type "tag".
   * (Unless it's a youtube/vimeo video. People love movies.)
   *
   * @param Element
   * @param string tag to clean
   * @return void
   **/
  _clean(e, tag) {
    var isEmbed = ["object", "embed", "iframe"].includes(tag);

    this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) {
      // Allow youtube and vimeo videos through as people usually want to see those.
      if (isEmbed) {
        // First, check the elements attributes to see if any of them contain youtube or vimeo
        for (var i = 0; i < element.attributes.length; i++) {
          if (this._allowedVideoRegex.test(element.attributes[i].value)) {
            return false;
          }
        }

        // For embed with <object> tag, check inner HTML as well.
        if (
          element.tagName === "object" &&
          this._allowedVideoRegex.test(element.innerHTML)
        ) {
          return false;
        }
      }

      return true;
    });
  },

  /**
   * Check if a given node has one of its ancestor tag name matching the
   * provided one.
   * @param  HTMLElement node
   * @param  String      tagName
   * @param  Number      maxDepth
   * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
   * @return Boolean
   */
  _hasAncestorTag(node, tagName, maxDepth, filterFn) {
    maxDepth = maxDepth || 3;
    tagName = tagName.toUpperCase();
    var depth = 0;
    while (node.parentNode) {
      if (maxDepth > 0 && depth > maxDepth) {
        return false;
      }
      if (
        node.parentNode.tagName === tagName &&
        (!filterFn || filterFn(node.parentNode))
      ) {
        return true;
      }
      node = node.parentNode;
      depth++;
    }
    return false;
  },

  /**
   * Return an object indicating how many rows and columns this table has.
   */
  _getRowAndColumnCount(table) {
    var rows = 0;
    var columns = 0;
    var trs = table.getElementsByTagName("tr");
    for (var i = 0; i < trs.length; i++) {
      var rowspan = trs[i].getAttribute("rowspan") || 0;
      if (rowspan) {
        rowspan = parseInt(rowspan, 10);
      }
      rows += rowspan || 1;

      // Now look for column-related info
      var columnsInThisRow = 0;
      var cells = trs[i].getElementsByTagName("td");
      for (var j = 0; j < cells.length; j++) {
        var colspan = cells[j].getAttribute("colspan") || 0;
        if (colspan) {
          colspan = parseInt(colspan, 10);
        }
        columnsInThisRow += colspan || 1;
      }
      columns = Math.max(columns, columnsInThisRow);
    }
    return { rows, columns };
  },

  /**
   * Look for 'data' (as opposed to 'layout') tables, for which we use
   * similar checks as
   * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
   */
  _markDataTables(root) {
    var tables = root.getElementsByTagName("table");
    for (var i = 0; i < tables.length; i++) {
      var table = tables[i];
      var role = table.getAttribute("role");
      if (role == "presentation") {
        table._readabilityDataTable = false;
        continue;
      }
      var datatable = table.getAttribute("datatable");
      if (datatable == "0") {
        table._readabilityDataTable = false;
        continue;
      }
      var summary = table.getAttribute("summary");
      if (summary) {
        table._readabilityDataTable = true;
        continue;
      }

      var caption = table.getElementsByTagName("caption")[0];
      if (caption && caption.childNodes.length) {
        table._readabilityDataTable = true;
        continue;
      }

      // If the table has a descendant with any of these tags, consider a data table:
      var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
      var descendantExists = function (tag) {
        return !!table.getElementsByTagName(tag)[0];
      };
      if (dataTableDescendants.some(descendantExists)) {
        this.log("Data table because found data-y descendant");
        table._readabilityDataTable = true;
        continue;
      }

      // Nested tables indicate a layout table:
      if (table.getElementsByTagName("table")[0]) {
        table._readabilityDataTable = false;
        continue;
      }

      var sizeInfo = this._getRowAndColumnCount(table);

      if (sizeInfo.columns == 1 || sizeInfo.rows == 1) {
        // single colum/row tables are commonly used for page layout purposes.
        table._readabilityDataTable = false;
        continue;
      }

      if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
        table._readabilityDataTable = true;
        continue;
      }
      // Now just go by size entirely:
      table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
    }
  },

  /* convert images and figures that have properties like data-src into images that can be loaded without JS */
  _fixLazyImages(root) {
    this._forEachNode(
      this._getAllNodesWithTag(root, ["img", "picture", "figure"]),
      function (elem) {
        // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
        // So, here we check if the data uri is too short, just might as well remove it.
        if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
          // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
          var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
          if (parts[1] === "image/svg+xml") {
            return;
          }

          // Make sure this element has other attributes which contains image.
          // If it doesn't, then this src is important and shouldn't be removed.
          var srcCouldBeRemoved = false;
          for (var i = 0; i < elem.attributes.length; i++) {
            var attr = elem.attributes[i];
            if (attr.name === "src") {
              continue;
            }

            if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
              srcCouldBeRemoved = true;
              break;
            }
          }

          // Here we assume if image is less than 100 bytes (or 133 after encoded to base64)
          // it will be too small, therefore it might be placeholder image.
          if (srcCouldBeRemoved) {
            var b64starts = parts[0].length;
            var b64length = elem.src.length - b64starts;
            if (b64length < 133) {
              elem.removeAttribute("src");
            }
          }
        }

        // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
        if (
          (elem.src || (elem.srcset && elem.srcset != "null")) &&
          !elem.className.toLowerCase().includes("lazy")
        ) {
          return;
        }

        for (var j = 0; j < elem.attributes.length; j++) {
          attr = elem.attributes[j];
          if (
            attr.name === "src" ||
            attr.name === "srcset" ||
            attr.name === "alt"
          ) {
            continue;
          }
          var copyTo = null;
          if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
            copyTo = "srcset";
          } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
            copyTo = "src";
          }
          if (copyTo) {
            //if this is an img or picture, set the attribute directly
            if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
              elem.setAttribute(copyTo, attr.value);
            } else if (
              elem.tagName === "FIGURE" &&
              !this._getAllNodesWithTag(elem, ["img", "picture"]).length
            ) {
              //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
              //see the nytimes-3 testcase for an example
              var img = this._doc.createElement("img");
              img.setAttribute(copyTo, attr.value);
              elem.appendChild(img);
            }
          }
        }
      }
    );
  },

  _getTextDensity(e, tags) {
    var textLength = this._getInnerText(e, true).length;
    if (textLength === 0) {
      return 0;
    }
    var childrenLength = 0;
    var children = this._getAllNodesWithTag(e, tags);
    this._forEachNode(
      children,
      child => (childrenLength += this._getInnerText(child, true).length)
    );
    return childrenLength / textLength;
  },

  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   *
   * @return void
   **/
  _cleanConditionally(e, tag) {
    if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
      return;
    }

    // Gather counts for other typical elements embedded within.
    // Traverse backwards so we can remove nodes at the same time
    // without effecting the traversal.
    //
    // TODO: Consider taking into account original contentScore here.
    this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
      // First check if this node IS data table, in which case don't remove it.
      var isDataTable = function (t) {
        return t._readabilityDataTable;
      };

      var isList = tag === "ul" || tag === "ol";
      if (!isList) {
        var listLength = 0;
        var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
        this._forEachNode(
          listNodes,
          list => (listLength += this._getInnerText(list).length)
        );
        isList = listLength / this._getInnerText(node).length > 0.9;
      }

      if (tag === "table" && isDataTable(node)) {
        return false;
      }

      // Next check if we're inside a data table, in which case don't remove it as well.
      if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
        return false;
      }

      if (this._hasAncestorTag(node, "code")) {
        return false;
      }

      // keep element if it has a data tables
      if (
        [...node.getElementsByTagName("table")].some(
          tbl => tbl._readabilityDataTable
        )
      ) {
        return false;
      }

      var weight = this._getClassWeight(node);

      this.log("Cleaning Conditionally", node);

      var contentScore = 0;

      if (weight + contentScore < 0) {
        return true;
      }

      if (this._getCharCount(node, ",") < 10) {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.
        var p = node.getElementsByTagName("p").length;
        var img = node.getElementsByTagName("img").length;
        var li = node.getElementsByTagName("li").length - 100;
        var input = node.getElementsByTagName("input").length;
        var headingDensity = this._getTextDensity(node, [
          "h1",
          "h2",
          "h3",
          "h4",
          "h5",
          "h6",
        ]);

        var embedCount = 0;
        var embeds = this._getAllNodesWithTag(node, [
          "object",
          "embed",
          "iframe",
        ]);

        for (var i = 0; i < embeds.length; i++) {
          // If this embed has attribute that matches video regex, don't delete it.
          for (var j = 0; j < embeds[i].attributes.length; j++) {
            if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
              return false;
            }
          }

          // For embed with <object> tag, check inner HTML as well.
          if (
            embeds[i].tagName === "object" &&
            this._allowedVideoRegex.test(embeds[i].innerHTML)
          ) {
            return false;
          }

          embedCount++;
        }

        var innerText = this._getInnerText(node);

        // toss any node whose inner text contains nothing but suspicious words
        if (
          this.REGEXPS.adWords.test(innerText) ||
          this.REGEXPS.loadingWords.test(innerText)
        ) {
          return true;
        }

        var contentLength = innerText.length;
        var linkDensity = this._getLinkDensity(node);
        var textishTags = ["SPAN", "LI", "TD"].concat(
          Array.from(this.DIV_TO_P_ELEMS)
        );
        var textDensity = this._getTextDensity(node, textishTags);
        var isFigureChild = this._hasAncestorTag(node, "figure");

        // apply shadiness checks, then check for exceptions
        const shouldRemoveNode = () => {
          const errs = [];
          if (!isFigureChild && img > 1 && p / img < 0.5) {
            errs.push(`Bad p to img ratio (img=${img}, p=${p})`);
          }
          if (!isList && li > p) {
            errs.push(`Too many li's outside of a list. (li=${li} > p=${p})`);
          }
          if (input > Math.floor(p / 3)) {
            errs.push(`Too many inputs per p. (input=${input}, p=${p})`);
          }
          if (
            !isList &&
            !isFigureChild &&
            headingDensity < 0.9 &&
            contentLength < 25 &&
            (img === 0 || img > 2) &&
            linkDensity > 0
          ) {
            errs.push(
              `Suspiciously short. (headingDensity=${headingDensity}, img=${img}, linkDensity=${linkDensity})`
            );
          }
          if (
            !isList &&
            weight < 25 &&
            linkDensity > 0.2 + this._linkDensityModifier
          ) {
            errs.push(
              `Low weight and a little linky. (linkDensity=${linkDensity})`
            );
          }
          if (weight >= 25 && linkDensity > 0.5 + this._linkDensityModifier) {
            errs.push(
              `High weight and mostly links. (linkDensity=${linkDensity})`
            );
          }
          if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
            errs.push(
              `Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`
            );
          }
          if (img === 0 && textDensity === 0) {
            errs.push(
              `No useful content. (img=${img}, textDensity=${textDensity})`
            );
          }

          if (errs.length) {
            this.log("Checks failed", errs);
            return true;
          }

          return false;
        };

        var haveToRemove = shouldRemoveNode();

        // Allow simple lists of images to remain in pages
        if (isList && haveToRemove) {
          for (var x = 0; x < node.children.length; x++) {
            let child = node.children[x];
            // Don't filter in lists with li's that contain more than one child
            if (child.children.length > 1) {
              return haveToRemove;
            }
          }
          let li_count = node.getElementsByTagName("li").length;
          // Only allow the list to remain if every li contains an image
          if (img == li_count) {
            return false;
          }
        }
        return haveToRemove;
      }
      return false;
    });
  },

  /**
   * Clean out elements that match the specified conditions
   *
   * @param Element
   * @param Function determines whether a node should be removed
   * @return void
   **/
  _cleanMatchedNodes(e, filter) {
    var endOfSearchMarkerNode = this._getNextNode(e, true);
    var next = this._getNextNode(e);
    while (next && next != endOfSearchMarkerNode) {
      if (filter.call(this, next, next.className + " " + next.id)) {
        next = this._removeAndGetNext(next);
      } else {
        next = this._getNextNode(next);
      }
    }
  },

  /**
   * Clean out spurious headers from an Element.
   *
   * @param Element
   * @return void
   **/
  _cleanHeaders(e) {
    let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
    this._removeNodes(headingNodes, function (node) {
      let shouldRemove = this._getClassWeight(node) < 0;
      if (shouldRemove) {
        this.log("Removing header with low class weight:", node);
      }
      return shouldRemove;
    });
  },

  /**
   * Check if this node is an H1 or H2 element whose content is mostly
   * the same as the article title.
   *
   * @param Element  the node to check.
   * @return boolean indicating whether this is a title-like header.
   */
  _headerDuplicatesTitle(node) {
    if (node.tagName != "H1" && node.tagName != "H2") {
      return false;
    }
    var heading = this._getInnerText(node, false);
    this.log("Evaluating similarity of header:", heading, this._articleTitle);
    return this._textSimilarity(this._articleTitle, heading) > 0.75;
  },

  _flagIsActive(flag) {
    return (this._flags & flag) > 0;
  },

  _removeFlag(flag) {
    this._flags = this._flags & ~flag;
  },

  _isProbablyVisible(node) {
    // Have to null-check node.style and node.className.includes to deal with SVG and MathML nodes.
    return (
      (!node.style || node.style.display != "none") &&
      (!node.style || node.style.visibility != "hidden") &&
      !node.hasAttribute("hidden") &&
      //check for "fallback-image" so that wikimedia math images are displayed
      (!node.hasAttribute("aria-hidden") ||
        node.getAttribute("aria-hidden") != "true" ||
        (node.className &&
          node.className.includes &&
          node.className.includes("fallback-image")))
    );
  },

  /**
   * Runs readability.
   *
   * Workflow:
   *  1. Prep the document by removing script tags, css, etc.
   *  2. Build readability's DOM tree.
   *  3. Grab the article content from the current dom tree.
   *  4. Replace the current DOM tree with the new one.
   *  5. Read peacefully.
   *
   * @return void
   **/
  parse() {
    // Avoid parsing too large documents, as per configuration option
    if (this._maxElemsToParse > 0) {
      var numTags = this._doc.getElementsByTagName("*").length;
      if (numTags > this._maxElemsToParse) {
        throw new Error(
          "Aborting parsing document; " + numTags + " elements found"
        );
      }
    }

    // Unwrap image from noscript
    this._unwrapNoscriptImages(this._doc);

    // Extract JSON-LD metadata before removing scripts
    var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);

    // Remove script tags from the document.
    this._removeScripts(this._doc);

    this._prepDocument();

    var metadata = this._getArticleMetadata(jsonLd);
    this._metadata = metadata;
    this._articleTitle = metadata.title;

    var articleContent = this._grabArticle();
    if (!articleContent) {
      return null;
    }

    this.log("Grabbed: " + articleContent.innerHTML);

    this._postProcessContent(articleContent);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This is used for displaying a preview of
    // the article's content.
    if (!metadata.excerpt) {
      var paragraphs = articleContent.getElementsByTagName("p");
      if (paragraphs.length) {
        metadata.excerpt = paragraphs[0].textContent.trim();
      }
    }

    var textContent = articleContent.textContent;
    return {
      title: this._articleTitle,
      byline: metadata.byline || this._articleByline,
      dir: this._articleDir,
      lang: this._articleLang,
      content: this._serializer(articleContent),
      textContent,
      length: textContent.length,
      excerpt: metadata.excerpt,
      siteName: metadata.siteName || this._articleSiteName,
      publishedTime: metadata.publishedTime,
    };
  },
};

// Export Readability through CommonJS only when a `module` object exists;
// otherwise nothing is exported here.
if (typeof module === "object") {
  /* eslint-disable-next-line no-redeclare */
  /* global module */
  module.exports = Readability;
}


================================================
FILE: SECURITY.md
================================================
# Security Policy

This code is included in Mozilla’s client [bug bounty program](https://www.mozilla.org/en-US/security/client-bug-bounty/). 
If you find a security vulnerability, please submit it via the process outlined in the [FAQ pages](https://www.mozilla.org/en-US/security/client-bug-bounty/). 

Please submit all security-related bugs through Bugzilla using the [client security bug form](https://bugzilla.mozilla.org/form.client.bounty). Never submit security-related bugs through a GitHub issue or by email.

Note: as noted in the README.md file in this repository, `readability` itself does not intend to do security-related input sanitization, and you should use appropriate measures to sanitize input/output for your use case. "XSS" or similar issues in JSDOMParser.js or Readability.js on their own are unlikely to be treated as security issues - it is expected that some interactive/scripting input may remain after `readability` processes input. If you can bypass appropriate sanitization measures like [DOMPurify](https://github.com/cure53/DOMPurify) you should report that using their procedures, not Mozilla’s.


================================================
FILE: eslint.config.mjs
================================================
/* eslint-env node */
"use strict";

import globals from "globals";

import { createRequire } from "module";
const require = createRequire(import.meta.url);
const mozillaRecommended = require("eslint-plugin-mozilla");
const noUnsanitized = require("eslint-plugin-no-unsanitized");

// Applies to every JavaScript file: Node globals, the Mozilla recommended
// rules, plus a few rules enforced as errors (severity 2).
const allJsConfig = {
  languageOptions: {
    parserOptions: {
      ecmaVersion: "latest",
    },
    globals: {
      ...globals.node,
    },
  },
  files: ["**/*.js"],
  plugins: {
    mozilla: mozillaRecommended,
    "no-unsanitized": noUnsanitized,
  },
  rules: {
    // Can't use everything because this isn't flat-config ready.
    ...mozillaRecommended.configs.recommended.rules,
    "no-inner-declarations": 2,

    "no-shadow": 2,

    "no-unused-vars": [
      2,
      {
        vars: "all",
        args: "none",
      },
    ],
  },
};

// Applies to test files only: declares the test-runner globals used there and
// allows console output.
const testFilesConfig = {
  files: ["test/**/*.js"],
  languageOptions: {
    globals: {
      it: "readonly",
      describe: "readonly",
      before: "readonly",
    },
  },
  rules: {
    "no-console": 0,
  },
};

export default [allJsConfig, testFilesConfig];


================================================
FILE: index.d.ts
================================================
/**
 * Decides whether or not the document is reader-able without parsing the whole thing.
 * @return {boolean} Whether or not we suspect Readability.parse() will succeed at returning an article object.
 */
export function isProbablyReaderable(
  document: Document,
  options?: {
    /** The minimum node content length used to decide if the document is readerable. */
    minContentLength?: number;
    /** The minimum cumulated 'score' used to determine if the document is readerable. */
    minScore?: number;
    /**
     * The function used to determine if a node is visible. A checker that
     * rejects every node makes the document non-readerable.
     */
    visibilityChecker?: (node: Node) => boolean;
  }
): boolean;

/**
 * Options accepted by the `Readability` constructor. `T` is the type
 * produced by `serializer`, and therefore the type of `content` in the
 * object returned by `parse()`.
 */
export interface ReadabilityOptions<T = string> {
  /**
   * Whether to output debug messages. Defaults to `false`.
   */
  debug?: boolean;
  /**
   * The maximum number of elements to parse. If the document contains more
   * elements than this, `parse()` throws an `Error` instead of processing it.
   * Useful for performance on very large documents.
   * Defaults to 0 (no limit).
   */
  maxElemsToParse?: number;
  /**
   * The number of top candidate nodes to consider when determining the main article content.
   * A higher number might lead to better results but could increase processing time.
   * Defaults to 5.
   */
  nbTopCandidates?: number;
  /**
   * The minimum number of characters required for a text node to be considered
   * significant and included in the article content.
   * Defaults to 500.
   */
  charThreshold?: number;
  /**
   * An array of class names to preserve. Defaults to an empty array.
   *
   * NOTE(review): the repository's test-case generator passes
   * `classesToPreserve: ["caption"]` without setting `keepClasses`, which
   * suggests this list applies when `keepClasses` is `false` — confirm
   * against Readability.js.
   */
  classesToPreserve?: string[];
  /**
   * If `true`, Readability will retain the original class names of elements
   * in the parsed article content. Defaults to `false`.
   *
   * NOTE(review): how this interacts with `classesToPreserve` should be
   * confirmed against Readability.js (see the note on that option).
   */
  keepClasses?: boolean;
  /**
   * A function that serializes an HTML element into a string or another representation.
   * Defaults to `el => el.innerHTML`. This is used to get the content of the parsed article.
   * An identity function (`el => el`) may be useful for returning a DOM element as-is
   * for further processing.
   * @param el The Node to serialize.
   * @returns The serialized representation, returned as `content` by `parse()`.
   */
  serializer?: (el: Node) => T;
  /**
   * If `true`, Readability will not attempt to parse or extract
   * JSON-LD structured data from the document. Defaults to `false`.
   */
  disableJSONLD?: boolean;
  /**
   * A regular expression used to validate video URLs. Only videos
   * matching this regex will be included in the parsed content.
   * Defaults to a regex that allows common video embedding platforms.
   */
  allowedVideoRegex?: RegExp;
  /**
   * A number added to the link-density thresholds used by the conditional
   * cleaning checks (`0.2 + modifier` for low-weight nodes, `0.5 + modifier`
   * for high-weight nodes). Positive values tolerate link-heavier content;
   * negative values make the checks stricter.
   *
   * NOTE(review): a default of 1 was previously documented here, but a
   * modifier that is simply added to those thresholds presumably defaults
   * to 0 — confirm against the Readability constructor.
   */
  linkDensityModifier?: number;
}

/**
 * Extracts the main article from a document. `T` is the type returned by
 * the `serializer` option (a string by default).
 */
export class Readability<T = string> {
  /**
   * @param document The document to extract the article from.
   * @param options  Optional settings; see `ReadabilityOptions`.
   */
  constructor(
    document: Document,
    options?: ReadabilityOptions<T>
  );

  /**
   * Runs the extraction.
   *
   * @returns The article, or `null` when no article content could be grabbed.
   * @throws Error when `maxElemsToParse` is set and the document has more elements.
   */
  parse(): null | {
    /** article title */
    title: string | null | undefined;

    /** processed article content, as produced by the `serializer` option (an HTML string by default) */
    content: T | null | undefined;

    /** text content of the article, with all the HTML tags removed */
    textContent: string | null | undefined;

    /** length of an article, in characters */
    length: number | null | undefined;

    /** article description, or short excerpt from the content */
    excerpt: string | null | undefined;

    /** author metadata */
    byline: string | null | undefined;

    /** content direction */
    dir: string | null | undefined;

    /** name of the site */
    siteName: string | null | undefined;

    /** content language */
    lang: string | null | undefined;

    /** published time */
    publishedTime: string | null | undefined;
  };
}

// Article is the return type of Readability.prototype.parse(). Note that this
// union includes `null`, which parse() returns when no article is found.
export type Article = ReturnType<Readability['parse']>;



================================================
FILE: index.js
================================================
/* eslint-env node */
var Readability = require("./Readability");
var isProbablyReaderable = require("./Readability-readerable");

module.exports = {
  Readability,
  isProbablyReaderable,
};


================================================
FILE: package.json
================================================
{
  "name": "@mozilla/readability",
  "version": "0.6.0",
  "description": "A standalone version of the readability library used for Firefox Reader View.",
  "main": "index.js",
  "types": "index.d.ts",
  "scripts": {
    "lint": "eslint . && prettier Readability.js JSDOMParser.js test/*.js --check",
    "test": "mocha test/test-*.js",
    "generate-testcase": "node test/generate-testcase.js",
    "release": "release-it"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/mozilla/readability"
  },
  "author": "",
  "license": "Apache-2.0",
  "bugs": {
    "url": "https://github.com/mozilla/readability/issues"
  },
  "engines": {
    "node": ">=14.0.0"
  },
  "homepage": "https://github.com/mozilla/readability",
  "devDependencies": {
    "@release-it/keep-a-changelog": "7.0.0",
    "chai": "4.3.7",
    "eslint": "8.57.0",
    "eslint-plugin-mozilla": "^3.7.4",
    "eslint-plugin-no-unsanitized": "^4.0.2",
    "htmltidy2": "1.0.0",
    "js-beautify": "1.14.7",
    "jsdom": "20.0.2",
    "mocha": "10.8.2",
    "prettier": "^3.3.2",
    "release-it": "19.0.4",
    "sinon": "14.0.2",
    "xml-name-validator": "^5.0.0"
  }
}


================================================
FILE: test/debug-testcase.js
================================================
/* eslint-env node */

var Readability = require("../Readability");
var { JSDOM } = require("jsdom");
var fs = require("fs");
var path = require("path");

// Directory holding one sub-directory per test page.
const testcaseRoot = path.join(__dirname, "test-pages");

// The test page name is the first command-line argument.
if (process.argv.length < 3) {
  console.log("No testcase provided.");
  process.exit(1);
}

const sourcePath = `${testcaseRoot}/${process.argv[2]}/source.html`;
const src = fs.readFileSync(sourcePath, { encoding: "utf-8" }).trim();

const dom = new JSDOM(src, { url: "http://fakehost/test/page.html" });

// Run the parser with debug logging enabled; the result itself is discarded.
const reader = new Readability(dom.window.document, { debug: true });
reader.parse();


================================================
FILE: test/generate-testcase.js
================================================
/* eslint-env node, mocha */

// Set to true to log progress messages while generating a testcase.
var debug = false;

var path = require("path");
var fs = require("fs");
var JSDOM = require("jsdom").JSDOM;
var prettyPrint = require("./utils").prettyPrint;
var http = require("http");
let { parse: urlparse, fileURLToPath } = require("url");
var htmltidy = require("htmltidy2").tidy;

var { Readability, isProbablyReaderable } = require("../index");
var JSDOMParser = require("../JSDOMParser");

// User-Agent header sent when fetching a page over HTTP(S).
var FFX_UA =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:80.0) Gecko/20100101 Firefox/80.0";

// Directory holding one sub-directory per test page.
var testcaseRoot = path.join(__dirname, "test-pages");

var argURL = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue.

/**
 * Generate (or regenerate) the testcase stored under test-pages/<slug>.
 *
 * When the directory cannot be created and already holds a source.html, that
 * file is reused; in every other case the source is fetched from the URL
 * given on the command line.
 */
function generateTestcase(slug) {
  const destRoot = path.join(testcaseRoot, slug);

  const handleSource = (fetchErr, data) => {
    onResponseReceived(fetchErr, data, destRoot);
  };
  const fetchFromUrl = () => {
    fetchSource(argURL, handleSource);
  };

  fs.mkdir(destRoot, mkdirErr => {
    if (!mkdirErr) {
      // Freshly created directory: there cannot be a local source yet.
      fetchFromUrl();
      return;
    }

    const sourceFile = path.join(destRoot, "source.html");
    fs.exists(sourceFile, found => {
      if (found) {
        fetchLocalSource(sourceFile, data => {
          handleSource(null, data);
        });
      } else {
        fetchFromUrl();
      }
    });
  });
}

/**
 * Fetch the page source from an http(s):// or file:// URL, tidy it, and hand
 * the result to `callbackFn(err, html)`.
 *
 * Exits the process when no URL is given, when a file:// URL points at a
 * missing file, or when the URL scheme is not supported.
 */
function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
    return;
  }
  if (url.startsWith("http")) {
    var client = url.startsWith("https") ? require("https") : http;
    var options = urlparse(url);
    options.headers = { "User-Agent": FFX_UA };

    client
      .get(options, function (response) {
        if (debug) {
          console.log("STATUS:", response.statusCode);
          console.log("HEADERS:", JSON.stringify(response.headers));
        }
        response.setEncoding("utf-8");
        var rv = "";
        response.on("data", function (chunk) {
          rv += chunk;
        });
        response.on("end", function () {
          if (debug) {
            console.log("End received");
          }
          sanitizeSource(rv, callbackFn);
        });
      })
      // Without a listener a failed request (DNS, refused connection, ...)
      // surfaces as an uncaught exception; report it through the callback.
      .on("error", function (requestErr) {
        console.error("Couldn't fetch source from " + url);
        callbackFn(requestErr);
      });
  } else if (url.startsWith("file://")) {
    // Declared locally so the path does not leak into an implicit global.
    var sourceFile = fileURLToPath(url);
    fs.exists(sourceFile, function (exists) {
      if (exists) {
        fetchLocalSource(sourceFile, function (data) {
          sanitizeSource(data, callbackFn);
        });
      } else {
        console.error("File doesn't exist!");
        process.exit(1);
      }
    });
  } else {
    // Any other scheme would otherwise end silently without producing output.
    console.error("Unsupported URL, expected http(s):// or file://: " + url);
    process.exit(1);
  }
}

/**
 * Read an existing source file as UTF-8 and pass its contents to
 * `callbackFn(data)`. Exits the process if the file cannot be read.
 */
function fetchLocalSource(sourceFile, callbackFn) {
  const onRead = (readFileErr, data) => {
    if (!readFileErr) {
      callbackFn(data);
      return;
    }
    console.error("Source existed but couldn't be read?");
    process.exit(1);
  };

  fs.readFile(sourceFile, { encoding: "utf-8" }, onRead);
}

/**
 * Round-trip the HTML through JSDOM, then run it through htmltidy.
 * `callbackFn` is passed straight to htmltidy as its completion callback.
 */
function sanitizeSource(html, callbackFn) {
  const serialized = new JSDOM(html).serialize();
  const tidyOptions = {
    indent: true,
    "indent-spaces": 4,
    "numeric-entities": true,
    "output-xhtml": true,
    "custom-tags": "blocklevel",
    wrap: 0,
  };

  htmltidy(serialized, tidyOptions, callbackFn);
}

/**
 * Store the tidied source as source.html inside `destRoot`, then generate
 * expected.html and expected-metadata.json from it.
 */
function onResponseReceived(error, source, destRoot) {
  const trace = message => {
    if (debug) {
      console.log(message);
    }
  };

  if (error) {
    console.error("Couldn't tidy source html!");
    console.error(error);
    return;
  }

  trace("writing");
  fs.writeFile(path.join(destRoot, "source.html"), source, writeErr => {
    if (writeErr) {
      console.error("Couldn't write data to source.html!");
      console.error(writeErr);
      return;
    }

    trace("Running readability stuff");
    const expectedHtmlPath = path.join(destRoot, "expected.html");
    const expectedMetadataPath = path.join(destRoot, "expected-metadata.json");
    runReadability(source, expectedHtmlPath, expectedMetadataPath);
  });
}

/**
 * Run Readability and isProbablyReaderable over `source`, then write the
 * pretty-printed article to `destPath` and the remaining metadata (plus the
 * `readerable` flag) to `metadataDestPath`.
 *
 * Nothing is written when Readability produces no result.
 */
function runReadability(source, destPath, metadataDestPath) {
  var uri = "http://fakehost/test/page.html";
  var doc = new JSDOMParser().parse(source, uri);
  var myReader, result, readerable;
  try {
    // We pass `caption` as a class to check that passing in extra classes works,
    // given that it appears in some of the test documents.
    myReader = new Readability(doc, { classesToPreserve: ["caption"] });
    result = myReader.parse();
  } catch (ex) {
    // console.error(ex) already prints the stack. `ex.stack` is a string, so
    // iterating it with forEach would itself throw from inside this handler.
    console.error(ex);
  }
  // Use jsdom for isProbablyReaderable because it supports querySelectorAll
  try {
    var jsdomDoc = new JSDOM(source, {
      url: uri,
    }).window.document;
    myReader = new Readability(jsdomDoc);
    readerable = isProbablyReaderable(jsdomDoc);
  } catch (ex) {
    console.error(ex);
  }
  if (!result) {
    console.error(
      "No content generated by readability, not going to write expected.html!"
    );
    return;
  }

  fs.writeFile(destPath, prettyPrint(result.content), function (fileWriteErr) {
    if (fileWriteErr) {
      console.error("Couldn't write data to expected.html!");
      console.error(fileWriteErr);
    }

    // Delete the result data we don't care about checking.
    delete result.content;
    delete result.textContent;
    delete result.length;

    // Add isProbablyReaderable result
    result.readerable = readerable;

    fs.writeFile(
      metadataDestPath,
      JSON.stringify(result, null, 2) + "\n",
      function (metadataWriteErr) {
        if (metadataWriteErr) {
          console.error("Couldn't write data to expected-metadata.json!");
          console.error(metadataWriteErr);
        }
      }
    );
  });
}

// Command-line entry point: argv[2] is a testcase slug, or "all" to
// regenerate every directory under test-pages.
if (process.argv.length < 3) {
  console.error(
    "Need at least a destination slug and potentially a URL (if the slug doesn't have source)."
  );
  // Missing arguments is a usage error, so report failure to the caller with
  // a non-zero exit status, like the other error paths in this script.
  process.exit(1);
  throw new Error("Abort");
}

if (process.argv[2] === "all") {
  fs.readdir(testcaseRoot, function (err, files) {
    if (err) {
      console.error("error reading testcaseses");
      return;
    }

    files.forEach(function (file) {
      generateTestcase(file);
    });
  });
} else {
  generateTestcase(process.argv[2]);
}


================================================
FILE: test/test-isProbablyReaderable.js
================================================
/* eslint-env node, mocha */

var JSDOM = require("jsdom").JSDOM;
var chai = require("chai");
chai.config.includeStack = true;
var expect = chai.expect;

var testPages = require("./utils").getTestPages();
var isProbablyReaderable = require("../index").isProbablyReaderable;

// One nested suite per test page, comparing isProbablyReaderable with the
// `readerable` flag recorded in that page's expected metadata.
describe("isProbablyReaderable - test pages", function () {
  const uri = "http://fakehost/test/page.html";

  for (const testPage of testPages) {
    describe(testPage.dir, function () {
      const dom = new JSDOM(testPage.source, { url: uri });
      const doc = dom.window.document;
      const expected = testPage.expectedMetadata.readerable;
      const title = `The result should ${expected ? "" : "not "}be readerable`;

      it(title, function () {
        expect(isProbablyReaderable(doc)).eql(expected);
      });
    });
  }
});

describe("isProbablyReaderable", function () {
  // Build a document holding a single paragraph with the given text.
  function paragraphDoc(text) {
    return new JSDOM(`<html><p id="main">${text}</p></html>`).window.document;
  }

  const verySmallDoc = paragraphDoc("hello there"); // content length: 11
  const smallDoc = paragraphDoc("hello there ".repeat(11)); // content length: 132
  const largeDoc = paragraphDoc("hello there ".repeat(12)); // content length: 144
  const veryLargeDoc = paragraphDoc("hello there ".repeat(50)); // content length: 600

  // Documents in the order used by every verdict list below.
  const sizedDocs = [
    [verySmallDoc, "very small doc"],
    [smallDoc, "small doc"],
    [largeDoc, "large doc"],
    [veryLargeDoc, "very large doc"],
  ];

  // Check each sized document against its expected verdict. `options` is only
  // passed on when the scenario provides one.
  function expectVerdicts(options, verdicts) {
    sizedDocs.forEach(function ([doc, label], index) {
      const actual =
        options === undefined
          ? isProbablyReaderable(doc)
          : isProbablyReaderable(doc, options);
      expect(actual, label).to.equal(verdicts[index]);
    });
  }

  it("should only declare large documents as readerable when default options", function () {
    // scores: 0, 0, ~1.7, ~21.4
    expectVerdicts(undefined, [false, false, false, true]);
  });

  it("should declare small and large documents as readerable when lower minContentLength", function () {
    expectVerdicts({ minContentLength: 120, minScore: 0 }, [
      false,
      true,
      true,
      true,
    ]);
  });

  it("should only declare largest document as readerable when higher minContentLength", function () {
    expectVerdicts({ minContentLength: 200, minScore: 0 }, [
      false,
      false,
      false,
      true,
    ]);
  });

  it("should declare small and large documents as readerable when lower minScore", function () {
    // scores: ~3.3, ~11.4, ~11.9, ~24.4
    expectVerdicts({ minContentLength: 0, minScore: 4 }, [
      false,
      true,
      true,
      true,
    ]);
  });

  it("should declare large documents as readerable when higher minScore", function () {
    // scores: ~3.3, ~11.4, ~11.9, ~24.4
    expectVerdicts({ minContentLength: 0, minScore: 11.5 }, [
      false,
      false,
      true,
      true,
    ]);
  });

  // The visibility checker can be supplied either inside an options object or
  // directly as the second argument; both forms are exercised with a checker
  // that hides everything and one that shows everything.
  const checkerForms = [
    ["option", checker => ({ visibilityChecker: checker })],
    ["parameter", checker => checker],
  ];

  for (const [form, toSecondArgument] of checkerForms) {
    for (const visible of [false, true]) {
      const outcome = visible ? "visible" : "not visible";

      it(`should use node visibility checker provided as ${form} - ${outcome}`, function () {
        let called = false;
        const checker = () => {
          called = true;
          return visible;
        };

        expect(
          isProbablyReaderable(veryLargeDoc, toSecondArgument(checker))
        ).to.equal(visible);
        expect(called).to.equal(true);
      });
    }
  }
});


================================================
FILE: test/test-jsdomparser.js
================================================
/* eslint-env node, mocha */

var chai = require("chai");
chai.config.includeStack = true;
var expect = chai.expect;

var JSDOMParser = require("../JSDOMParser");

// Shared fixture: a small page with a paragraph containing a link, a div
// holding a script and a span, and a form with two inputs.
var BASETESTCASE =
  '<html><body><p>Some text and <a class="someclass" href="#">a link</a></p>' +
  '<div id="foo">With a <script>With &lt; fancy " characters in it because' +
  '</script> that is fun.<span>And another node to make it harder</span></div><form><input type="text"/><input type="number"/>Here\'s a form</form></body></html>';

// The fixture parsed once with JSDOMParser, using http://fakehost/ as its URL.
var baseDoc = new JSDOMParser().parse(BASETESTCASE, "http://fakehost/");

describe("Test JSDOM functionality", function () {
  /**
   * Asserts that two values (typically parsed DOM nodes) are deep-equal.
   *
   * On failure, only the assertion's message is propagated: the chai error is
   * replaced by a fresh Error carrying the same text, so the failure no longer
   * holds references to the compared nodes (presumably to keep reporters from
   * trying to render whole node trees - TODO confirm).
   *
   * @param {*} actual - Value produced by the code under test.
   * @param {*} expected - Value it must deep-equal.
   * @throws {Error} When the two values are not deep-equal.
   */
  function nodeExpect(actual, expected) {
    try {
      expect(actual).eql(expected);
    } catch (ex) {
      // Throw a real Error rather than a bare string so the failure keeps a
      // stack trace and the test runner reports the message unmodified.
      throw new Error(ex.message);
    }
  }
  it("should work for basic operations using the parent child hierarchy and innerHTML", function () {
    // The fixture parses to a single root node holding ten elements in total.
    expect(baseDoc.childNodes.length).eql(1);
    var everyElement = baseDoc.getElementsByTagName("*");
    expect(everyElement.length).eql(10);

    // Walk upwards from #foo: div -> body -> document element.
    var fooDiv = baseDoc.getElementById("foo");
    var fooParent = fooDiv.parentNode;
    expect(fooParent.localName).eql("body");
    nodeExpect(baseDoc.body, fooParent);
    nodeExpect(baseDoc.body.parentNode, baseDoc.documentElement);
    expect(baseDoc.body.childNodes.length).eql(3);

    // Serialising the paragraph must reproduce its original markup.
    var paragraph = baseDoc.getElementsByTagName("p")[0];
    expect(paragraph.innerHTML).eql(
      'Some text and <a class="someclass" href="#">a link</a>'
    );

    // The script keeps its entity in innerHTML but exposes it decoded through
    // textContent.
    var scriptEl = baseDoc.getElementsByTagName("script")[0];
    expect(scriptEl.innerHTML).eql('With &lt; fancy " characters in it because');
    expect(scriptEl.textContent).eql(
      'With < fancy " characters in it because'
    );
  });

  it("should have basic URI information", function () {
    // The URL handed to parse() must be exposed as the document URI. The
    // fixture contains no <base> element, so baseURI is expected to match it.
    // Note: expect(value, "text") only sets a failure message and asserts
    // nothing, so the comparison has to be made explicitly with .eql().
    expect(baseDoc.documentURI).eql("http://fakehost/");
    expect(baseDoc.baseURI).eql("http://fakehost/");
  });

  it("should deal with script tags", function () {
    // The fixture holds exactly one <script>; its text must come back with the
    // "&lt;" entity decoded.
    var scriptList = baseDoc.getElementsByTagName("script");
    expect(scriptList.length).eql(1);
    var onlyScript = scriptList[0];
    expect(onlyScript.textContent).eql(
      'With < fancy " characters in it because'
    );
  });

  it("should have working sibling/first+lastChild properties", function () {
    var fooDiv = baseDoc.getElementById("foo");
    var nodeBefore = fooDiv.previousSibling;
    var nodeAfter = fooDiv.nextSibling;

    // Sibling links must be symmetric around #foo.
    nodeExpect(nodeBefore.nextSibling, fooDiv);
    nodeExpect(nodeAfter.previousSibling, fooDiv);
    // Both neighbours are elements, so the element-only links must agree.
    nodeExpect(nodeAfter, fooDiv.nextElementSibling);
    nodeExpect(nodeBefore, fooDiv.previousElementSibling);

    // The neighbours are also the outermost children of <body>.
    var bodyEl = baseDoc.body;
    nodeExpect(bodyEl.lastChild, nodeAfter);
    nodeExpect(bodyEl.firstChild, nodeBefore);
  });

  // Removes #foo from the shared document, checks every sibling pointer, then
  // re-appends nodes so that the document ends up in its original order.
  it("should have working removeChild and appendChild functionality", function () {
    var foo = baseDoc.getElementById("foo");
    var beforeFoo = foo.previousSibling;
    var afterFoo = foo.nextSibling;

    // eslint-disable-next-line mozilla/avoid-removeChild
    var removedFoo = foo.parentNode.removeChild(foo);
    // removeChild returns the removed node, which must now be fully detached.
    nodeExpect(foo, removedFoo);
    nodeExpect(foo.parentNode, null);
    nodeExpect(foo.previousSibling, null);
    nodeExpect(foo.nextSibling, null);
    nodeExpect(foo.previousElementSibling, null);
    nodeExpect(foo.nextElementSibling, null);

    // The former neighbours must now point at each other.
    expect(beforeFoo.localName).eql("p");
    nodeExpect(beforeFoo.nextSibling, afterFoo);
    nodeExpect(afterFoo.previousSibling, beforeFoo);
    nodeExpect(beforeFoo.nextElementSibling, afterFoo);
    nodeExpect(afterFoo.previousElementSibling, beforeFoo);

    expect(baseDoc.body.childNodes.length).eql(2);

    // Re-attach #foo; it lands at the end, after its former next sibling.
    baseDoc.body.appendChild(foo);

    expect(baseDoc.body.childNodes.length).eql(3);
    nodeExpect(afterFoo.nextSibling, foo);
    nodeExpect(foo.previousSibling, afterFoo);
    nodeExpect(afterFoo.nextElementSibling, foo);
    nodeExpect(foo.previousElementSibling, afterFoo);

    // This should reorder back to sanity:
    // appending a node that is already a child moves it to the end, which
    // restores the original p / #foo / form order.
    baseDoc.body.appendChild(afterFoo);
    nodeExpect(foo.previousSibling, beforeFoo);
    nodeExpect(foo.nextSibling, afterFoo);
    nodeExpect(foo.previousElementSibling, beforeFoo);
    nodeExpect(foo.nextElementSibling, afterFoo);

    // Same symmetry checks as the sibling-property test, after the round trip.
    nodeExpect(foo.previousSibling.nextSibling, foo);
    nodeExpect(foo.nextSibling.previousSibling, foo);
    nodeExpect(foo.nextSibling, foo.nextElementSibling);
    nodeExpect(foo.previousSibling, foo.previousElementSibling);
  });

  it("should handle attributes", function () {
    // getAttribute must return the literal value and agree with the reflected
    // className property.
    var anchor = baseDoc.getElementsByTagName("a")[0];
    var hrefValue = anchor.getAttribute("href");
    expect(hrefValue).eql("#");
    expect(anchor.getAttribute("class")).eql(anchor.className);

    // The id property must mirror the id attribute.
    var fooDiv = baseDoc.getElementById("foo");
    expect(fooDiv.id).eql(fooDiv.getAttribute("id"));
  });

  // For every child node of the first <div>, swaps in a fresh <p>, verifies all
  // node/element sibling bookkeeping, then swaps the original node back and
  // verifies that everything was restored.
  it("should have a working replaceChild", function () {
    var parent = baseDoc.getElementsByTagName("div")[0];
    var p = baseDoc.createElement("p");
    p.setAttribute("id", "my-replaced-kid");
    // Baseline counts; each iteration restores the original child, so these
    // stay valid for the whole loop.
    var childCount = parent.childNodes.length;
    var childElCount = parent.children.length;
    for (var i = 0; i < parent.childNodes.length; i++) {
      var replacedNode = parent.childNodes[i];
      var replacedAnElement =
        replacedNode.nodeType === replacedNode.ELEMENT_NODE;
      // Snapshot the neighbours before the swap so they can be compared after.
      var oldNext = replacedNode.nextSibling;
      var oldNextEl = replacedNode.nextElementSibling;
      var oldPrev = replacedNode.previousSibling;
      var oldPrevEl = replacedNode.previousElementSibling;

      parent.replaceChild(p, replacedNode);

      // Check siblings and parents on both nodes were set:
      nodeExpect(p.nextSibling, oldNext);
      nodeExpect(p.previousSibling, oldPrev);
      nodeExpect(p.parentNode, parent);

      nodeExpect(replacedNode.parentNode, null);
      nodeExpect(replacedNode.nextSibling, null);
      nodeExpect(replacedNode.previousSibling, null);
      // if the old node was an element, element siblings should now be null
      if (replacedAnElement) {
        nodeExpect(replacedNode.nextElementSibling, null);
        nodeExpect(replacedNode.previousElementSibling, null);
      }

      // Check the siblings were updated
      if (oldNext) {
        nodeExpect(oldNext.previousSibling, p);
      }
      if (oldPrev) {
        nodeExpect(oldPrev.nextSibling, p);
      }

      // check the array was updated
      nodeExpect(parent.childNodes[i], p);

      // Now check element properties/lists:
      var kidElementIndex = parent.children.indexOf(p);
      // should be in the list:
      expect(kidElementIndex).not.eql(-1);

      // The element-sibling links of p must match its neighbours in the
      // children list, or be null at either end of that list.
      if (kidElementIndex > 0) {
        nodeExpect(
          parent.children[kidElementIndex - 1],
          p.previousElementSibling
        );
        nodeExpect(p.previousElementSibling.nextElementSibling, p);
      } else {
        nodeExpect(p.previousElementSibling, null);
      }
      if (kidElementIndex < parent.children.length - 1) {
        nodeExpect(parent.children[kidElementIndex + 1], p.nextElementSibling);
        nodeExpect(p.nextElementSibling.previousElementSibling, p);
      } else {
        nodeExpect(p.nextElementSibling, null);
      }

      // When an element was replaced, p inherits its element neighbours.
      if (replacedAnElement) {
        nodeExpect(oldNextEl, p.nextElementSibling);
        nodeExpect(oldPrevEl, p.previousElementSibling);
      }

      // Node count is unchanged; element count grows by one only when a
      // non-element node was replaced by the <p>.
      expect(parent.childNodes.length).eql(childCount);
      expect(parent.children.length).eql(
        replacedAnElement ? childElCount : childElCount + 1
      );

      // Swap the original node back in and check it regained its old links.
      parent.replaceChild(replacedNode, p);

      nodeExpect(oldNext, replacedNode.nextSibling);
      nodeExpect(oldNextEl, replacedNode.nextElementSibling);
      nodeExpect(oldPrev, replacedNode.previousSibling);
      nodeExpect(oldPrevEl, replacedNode.previousElementSibling);
      if (replacedNode.nextSibling) {
        nodeExpect(replacedNode.nextSibling.previousSibling, replacedNode);
      }
      if (replacedNode.previousSibling) {
        nodeExpect(replacedNode.previousSibling.nextSibling, replacedNode);
      }
      if (replacedAnElement) {
        if (replacedNode.previousElementSibling) {
          nodeExpect(
            replacedNode.previousElementSibling.nextElementSibling,
            replacedNode
          );
        }
        if (replacedNode.nextElementSibling) {
          nodeExpect(
            replacedNode.nextElementSibling.previousElementSibling,
            replacedNode
          );
        }
      }
    }
  });

  it("should have a working insertBefore", function () {
    // Work on a freshly parsed copy of the fixture rather than the shared one.
    var freshDoc = new JSDOMParser().parse(BASETESTCASE);
    var bodyEl = freshDoc.body;
    var fooDiv = freshDoc.getElementById("foo");
    var paragraph = freshDoc.getElementsByTagName("p")[0];
    var formEl = freshDoc.getElementsByTagName("form")[0];

    // Case 1: insert between <p> and #foo.
    var middleHr = freshDoc.createElement("hr");
    bodyEl.insertBefore(middleHr, fooDiv);
    nodeExpect(paragraph.nextSibling, middleHr);
    nodeExpect(middleHr.nextSibling, fooDiv);
    nodeExpect(fooDiv.previousSibling, middleHr);
    nodeExpect(middleHr.previousSibling, paragraph);
    nodeExpect(paragraph.nextElementSibling, middleHr);
    nodeExpect(middleHr.nextElementSibling, fooDiv);
    nodeExpect(fooDiv.previousElementSibling, middleHr);
    nodeExpect(middleHr.previousElementSibling, paragraph);
    expect(bodyEl.childNodes.length).eql(4);
    expect(bodyEl.children.length).eql(4);

    // Case 2: a null reference node appends at the end.
    var trailingHr = freshDoc.createElement("hr");
    bodyEl.insertBefore(trailingHr, null);
    nodeExpect(bodyEl.lastChild, trailingHr);
    nodeExpect(formEl.nextSibling, trailingHr);
    nodeExpect(trailingHr.previousSibling, formEl);
    expect(bodyEl.childNodes.length).eql(5);
    expect(bodyEl.children.length).eql(5);

    // Case 3: inserting before the first child makes the new node first.
    var leadingHr = freshDoc.createElement("hr");
    bodyEl.insertBefore(leadingHr, paragraph);
    nodeExpect(bodyEl.firstChild, leadingHr);
    nodeExpect(leadingHr.nextSibling, paragraph);
    nodeExpect(paragraph.previousSibling, leadingHr);
    expect(bodyEl.childNodes.length).eql(6);
    expect(bodyEl.children.length).eql(6);
  });

  it("should correctly handle mixed element/text siblings on insertBefore", function () {
    // Scenario A: <p>, text, <span> - insert an <hr> just before the <span>,
    // so its previous node is text but its previous element is the <p>.
    var docA = new JSDOMParser().parse(
      "<div><p>A</p>Some Text<span>B</span></div>"
    );
    var containerA = docA.getElementsByTagName("div")[0];
    var paraA = docA.getElementsByTagName("p")[0];
    var textA = containerA.childNodes[1];
    var spanA = docA.getElementsByTagName("span")[0];
    var hrA = docA.createElement("hr");
    containerA.insertBefore(hrA, spanA);
    nodeExpect(hrA.previousSibling, textA);
    nodeExpect(hrA.previousElementSibling, paraA);
    nodeExpect(hrA.nextSibling, spanA);
    nodeExpect(hrA.nextElementSibling, spanA);
    nodeExpect(paraA.nextElementSibling, hrA);
    nodeExpect(spanA.previousElementSibling, hrA);

    // Scenario B: <p>, <span>, text - insert an <hr> just before the trailing
    // text, so it has a next node but no next element.
    var docB = new JSDOMParser().parse(
      "<div><p>A</p><span>B</span>Some Text</div>"
    );
    var containerB = docB.getElementsByTagName("div")[0];
    var paraB = docB.getElementsByTagName("p")[0];
    var spanB = docB.getElementsByTagName("span")[0];
    var textB = containerB.childNodes[2];
    var hrB = docB.createElement("hr");
    containerB.insertBefore(hrB, textB);
    nodeExpect(hrB.previousSibling, spanB);
    nodeExpect(hrB.previousElementSibling, spanB);
    nodeExpect(hrB.nextSibling, textB);
    expect(hrB.nextElementSibling).to.be.null;
    nodeExpect(paraB.nextElementSibling, spanB);
    nodeExpect(spanB.nextElementSibling, hrB);
  });

  it("should throw an error when inserting before a non-child", function () {
    var parsed = new JSDOMParser().parse("<div><p>A</p></div>");
    var container = parsed.getElementsByTagName("div")[0];
    var newParagraph = parsed.createElement("p");
    // Created but never attached, so it is not a child of the container.
    var detachedSpan = parsed.createElement("span");

    function insertBeforeDetached() {
      container.insertBefore(newParagraph, detachedSpan);
    }

    expect(insertBeforeDetached).to.Throw(
      "insertBefore: reference node not found"
    );
  });

  it("should have a working createDocumentFragment", function () {
    var parsed = new JSDOMParser().parse(BASETESTCASE);
    var bodyEl = parsed.body;
    var frag = parsed.createDocumentFragment();
    expect(frag.nodeType).eql(frag.DOCUMENT_FRAGMENT_NODE);
    expect(frag.nodeName).eql("#document-fragment");

    var paragraph = parsed.getElementsByTagName("p")[0];
    var fooDiv = parsed.getElementById("foo");

    // Move both nodes out of <body> and into the fragment, in document order.
    [paragraph, fooDiv].forEach(function (node) {
      frag.appendChild(node);
    });

    expect(paragraph.parentNode).eql(frag);
    expect(fooDiv.parentNode).eql(frag);
    expect(frag.childNodes.length).eql(2);
    expect(frag.children.length).eql(2);
    expect(bodyEl.childNodes.length).eql(1); // only form is left

    // Appending the fragment transfers its children and leaves it empty.
    bodyEl.appendChild(frag);
    expect(bodyEl.childNodes.length).eql(3);
    expect(paragraph.parentNode).eql(bodyEl);
    expect(fooDiv.parentNode).eql(bodyEl);
    expect(frag.childNodes.length).eql(0);
  });

  it("should handle moving an existing child with insertBefore", function () {
    var parsed = new JSDOMParser().parse("<div><p>A</p><p>B</p><p>C</p></div>");
    var container = parsed.getElementsByTagName("div")[0];
    var first = container.children[0];
    var second = container.children[1];
    var third = container.children[2];

    // Re-insert the last paragraph in front of the middle one: A, B, C -> A, C, B.
    container.insertBefore(third, second);

    // The element list must reflect the new order without gaining entries.
    expect(container.children.length).eql(3);
    nodeExpect(container.children[0], first);
    nodeExpect(container.children[1], third);
    nodeExpect(container.children[2], second);

    // A stays first and now points at C.
    nodeExpect(first.previousSibling, null);
    nodeExpect(first.nextSibling, third);

    // C sits between A and B.
    nodeExpect(third.previousSibling, first);
    nodeExpect(third.nextSibling, second);

    // B is now last.
    nodeExpect(second.previousSibling, third);
    nodeExpect(second.nextSibling, null);
  });

  it("should handle inserting a node before itself as a no-op", function () {
    var parsed = new JSDOMParser().parse("<div><p>A</p><p>B</p></div>");
    var container = parsed.getElementsByTagName("div")[0];
    var first = container.children[0];
    var second = container.children[1];

    // Use the same node as both the new node and the reference node.
    container.insertBefore(second, second);

    // Nothing may have moved or been duplicated.
    expect(container.children.length).eql(2);
    nodeExpect(container.children[0], first);
    nodeExpect(container.children[1], second);
    nodeExpect(first.nextSibling, second);
    nodeExpect(second.previousSibling, first);
  });

  it("should handle replacing a node with itself as a no-op", function () {
    var parsed = new JSDOMParser().parse("<div><p>A</p><p>B</p></div>");
    var container = parsed.getElementsByTagName("div")[0];
    var first = container.children[0];
    var second = container.children[1];

    // Use the same node as both the replacement and the node being replaced.
    container.replaceChild(second, second);

    // Nothing may have moved or been duplicated.
    expect(container.children.length).eql(2);
    nodeExpect(container.children[0], first);
    nodeExpect(container.children[1], second);
    nodeExpect(first.nextSibling, second);
    nodeExpect(second.previousSibling, first);
  });

  it("should correctly handle sibling pointers on remove()", function () {
    var parsed = new JSDOMParser().parse("<div><p>A</p>Some text<p>B</p></div>");
    var container = parsed.getElementsByTagName("div")[0];
    var leftPara = container.children[0];
    var middleText = container.childNodes[1];
    var rightPara = container.children[1];

    // Before removal: the paragraphs are element siblings across the text
    // node, and the text node itself exposes no nextElementSibling.
    nodeExpect(leftPara.nextElementSibling, rightPara);
    nodeExpect(rightPara.previousElementSibling, leftPara);
    expect(middleText.nextElementSibling).eql(undefined);

    // Detach the text node sitting between the two paragraphs.
    middleText.remove();

    // The element links between the paragraphs must be unaffected.
    nodeExpect(leftPara.nextElementSibling, rightPara);
    nodeExpect(rightPara.previousElementSibling, leftPara);

    // The removed node must be fully detached.
    nodeExpect(middleText.parentNode, null);
    nodeExpect(middleText.nextSibling, null);
    nodeExpect(middleText.previousSibling, null);
    expect(middleText.nextElementSibling).eql(undefined);
  });
});

describe("Test HTML escaping", function () {
  // Source markup using the five named entities, and the text it decodes to.
  var sourceMarkup =
    "<p>Hello, everyone &amp; all their friends, &lt;this&gt; is a &quot; test with &apos; quotes.</p>";
  var decodedText =
    "Hello, everyone & all their friends, <this> is a \" test with ' quotes.";

  var parsed = new JSDOMParser().parse(sourceMarkup);
  var paragraph = parsed.getElementsByTagName("p")[0];
  var textChild = paragraph.firstChild;

  // Puts back the <p> wrapper that innerHTML does not include.
  function wrapInP(inner) {
    return "<p>" + inner + "</p>";
  }

  it("should handle encoding HTML correctly", function () {
    // Nothing has been modified yet, so serialisation should give back the
    // markup exactly as it was read.
    expect(wrapInP(paragraph.innerHTML)).eql(sourceMarkup);
    expect(wrapInP(textChild.innerHTML)).eql(sourceMarkup);
  });

  it("should have decoded correctly", function () {
    expect(paragraph.textContent).eql(decodedText);
    expect(textChild.textContent).eql(decodedText);
  });

  it("should handle updates via textContent correctly", function () {
    // The earlier checks may have been served from cached innerHTML, so write
    // through textContent (append a space, then trim it away again) to force
    // the markup to be regenerated from the text.
    textChild.textContent += " ";
    textChild.textContent = textChild.textContent.trim();
    // Regenerated markup leaves both quote characters unescaped.
    var regeneratedMarkup = sourceMarkup
      .replace("&quot;", '"')
      .replace("&apos;", "'");
    expect(wrapInP(textChild.innerHTML)).eql(regeneratedMarkup);
    expect(wrapInP(paragraph.innerHTML)).eql(regeneratedMarkup);
  });

  it("should handle decimal and hex escape sequences", function () {
    // &#32; (decimal) and &#x20; (hex) both denote a space character.
    var numericDoc = new JSDOMParser().parse("<p>&#32;&#x20;</p>");
    var numericPara = numericDoc.getElementsByTagName("p")[0];
    expect(numericPara.textContent).eql("  ");
  });
});

describe("Script parsing", function () {
  /**
   * Parses `markup` and checks that the document's first node is a <script>
   * with the given text, no element children, and the given number of child
   * nodes.
   */
  function expectScript(markup, expectedText, expectedChildNodes) {
    var parsed = new JSDOMParser().parse(markup);
    expect(parsed.firstChild.tagName).eql("SCRIPT");
    expect(parsed.firstChild.textContent).eql(expectedText);
    expect(parsed.firstChild.children.length).eql(0);
    expect(parsed.firstChild.childNodes.length).eql(expectedChildNodes);
  }

  it("should strip ?-based comments within script tags", function () {
    expectScript('<script><?Silly test <img src="test"></script>', "", 0);
  });

  it("should strip !-based comments within script tags", function () {
    expectScript(
      '<script><!--Silly test > <script src="foo.js"></script>--></script>',
      "",
      0
    );
  });

  it("should strip any other nodes within script tags", function () {
    expectScript(
      "<script>&lt;div>Hello, I'm not really in a &lt;/div></script>",
      "<div>Hello, I'm not really in a </div>",
      1
    );
  });

  it("should strip any other invalid script nodes within script tags", function () {
    expectScript(
      '<script>&lt;script src="foo.js">&lt;/script></script>',
      '<script src="foo.js"></script>',
      1
    );
  });

  it("should not be confused by partial closing tags", function () {
    expectScript(
      "<script>var x = '&lt;script>Hi&lt;' + '/script>';</script>",
      "var x = '<script>Hi<' + '/script>';",
      1
    );
  });
});

describe("Tag local name case handling", function () {
  it("should lowercase tag names", function () {
    // Mixed-case source tags: tagName is expected upper-cased and localName
    // lower-cased at every nesting level.
    var parsed = new JSDOMParser().parse("<DIV><svG><clippath/></svG></DIV>");

    var outer = parsed.firstChild;
    expect(outer.tagName).eql("DIV");
    expect(outer.localName).eql("div");

    var svgEl = outer.firstChild;
    expect(svgEl.tagName).eql("SVG");
    expect(svgEl.localName).eql("svg");

    var clipEl = svgEl.firstChild;
    expect(clipEl.tagName).eql("CLIPPATH");
    expect(clipEl.localName).eql("clippath");
  });
});

describe("Recovery from self-closing tags that have close tags", function () {
  it("should handle delayed closing of a tag", function () {
    // The <input> here has an explicit closing tag, so the <p> is expected to
    // end up nested inside it.
    var parsed = new JSDOMParser().parse(
      "<div><input><p>I'm in an input</p></input></div>"
    );

    var outer = parsed.firstChild;
    expect(outer.localName).eql("div");
    expect(outer.childNodes.length).eql(1);

    var inputEl = outer.firstChild;
    expect(inputEl.localName).eql("input");
    expect(inputEl.childNodes.length).eql(1);
    expect(inputEl.firstChild.localName).eql("p");
  });
});

describe("baseURI parsing", function () {
  it("should handle various types of relative and absolute base URIs", function () {
    // URL the document is parsed against; each <base href> is resolved
    // relative to it.
    var documentUrl = "http://fakehost/some/dir/";

    // [<base href> value, expected resulting baseURI]
    var cases = [
      ["relative/path", "http://fakehost/some/dir/relative/path"],
      ["/path", "http://fakehost/path"],
      ["http://absolute/", "http://absolute/"],
      ["//absolute/path", "http://absolute/path"],
    ];

    cases.forEach(function (entry) {
      var markup =
        "<html><head><base href='" + entry[0] + "'></base></head><body/></html>";
      var parsed = new JSDOMParser().parse(markup, documentUrl);
      expect(parsed.baseURI).eql(entry[1]);
    });
  });
});

describe("namespace workarounds", function () {
  it("should handle random namespace information in the serialized DOM", function () {
    // Every tag carries an "a0:" prefix; names must be reported without it.
    var parsed = new JSDOMParser().parse(
      "<a0:html><a0:body><a0:DIV><a0:svG><a0:clippath/></a0:svG></a0:DIV></a0:body></a0:html>"
    );

    var divEl = parsed.getElementsByTagName("div")[0];
    expect(divEl.tagName).eql("DIV");
    expect(divEl.localName).eql("div");

    var svgEl = divEl.firstChild;
    expect(svgEl.tagName).eql("SVG");
    expect(svgEl.localName).eql("svg");

    var clipEl = svgEl.firstChild;
    expect(clipEl.tagName).eql("CLIPPATH");
    expect(clipEl.localName).eql("clippath");

    // The prefixed <html> and <body> must still be recognised as the document
    // element and the body.
    expect(parsed.documentElement).eql(parsed.firstChild);
    expect(parsed.body).eql(parsed.documentElement.firstChild);
  });
});


================================================
FILE: test/test-pages/001/expected-metadata.json
================================================
{
  "title": "Get your Frontend JavaScript Code Covered | Code",
  "byline": "Nicolas Perriault",
  "dir": null,
  "lang": "en",
  "excerpt": "Nicolas Perriault's homepage.",
  "siteName": null,
  "publishedTime": null,
  "readerable": true
}


================================================
FILE: test/test-pages/001/expected.html
================================================
<div id="readability-page-1" class="page">
    <section>
        <p><strong>So finally you're <a href="http://fakehost/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">testing your frontend JavaScript code</a>? Great! The more you write tests, the more confident you are with your code… but how much precisely? That's where <a href="http://en.wikipedia.org/wiki/Code_coverage">code coverage</a> might help.</strong>
        </p>
        <p>The idea behind code coverage is to record which parts of your code (functions, statements, conditionals and so on) have been executed by your test suite, to compute metrics out of these data and usually to provide tools for navigating and inspecting them.</p>
        <p>Not a lot of frontend developers I know actually test their frontend code, and I can barely imagine how many of them have ever setup code coverage… Mostly because there are not many frontend-oriented tools in this area I guess.</p>
        <p>Actually I've only found one which provides an adapter for <a href="http://visionmedia.github.io/mocha/">Mocha</a> and actually works…</p>
        <blockquote>
            <p>Drinking game for web devs: <br />(1) Think of a noun <br />(2) Google "&lt;noun&gt;.js" <br />(3) If a library with that name exists - drink</p>— Shay Friedman (@ironshay) <a href="https://twitter.com/ironshay/statuses/370525864523743232">August 22, 2013</a>
        </blockquote>
        <p><strong><a href="http://blanketjs.org/">Blanket.js</a></strong> is an <em>easy to install, easy to configure, and easy to use JavaScript code coverage library that works both in-browser and with nodejs.</em>
        </p>
        <p>Its use is dead easy, adding Blanket support to your Mocha test suite is just matter of adding this simple line to your HTML test file:</p>
        <pre><code>&lt;script src="vendor/blanket.js"
        data-cover-adapter="vendor/mocha-blanket.js"&gt;&lt;/script&gt;
</code></pre>
        <p>Source files: <a href="https://raw.github.com/alex-seville/blanket/master/dist/qunit/blanket.min.js">blanket.js</a>, <a href="https://raw.github.com/alex-seville/blanket/master/src/adapters/mocha-blanket.js">mocha-blanket.js</a>
        </p>
        <p>As an example, let's reuse the silly <code>Cow</code> example we used <a href="http://fakehost/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">in a previous episode</a>:</p>
        <pre><code>// cow.js
(function(exports) {
  "use strict";

  function Cow(name) {
    this.name = name || "Anon cow";
  }
  exports.Cow = Cow;

  Cow.prototype = {
    greets: function(target) {
      if (!target)
        throw new Error("missing target");
      return this.name + " greets " + target;
    }
  };
}

Download .txt

gitextract_3shkyv0s/

├── .gitattributes
├── .gitignore
├── .npmignore
├── .prettierrc.js
├── .release-it.json
├── .taskcluster.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── JSDOMParser.js
├── LICENSE.md
├── README.md
├── Readability-readerable.js
├── Readability.js
├── SECURITY.md
├── eslint.config.mjs
├── index.d.ts
├── index.js
├── package.json
└── test/
    ├── debug-testcase.js
    ├── generate-testcase.js
    ├── test-isProbablyReaderable.js
    ├── test-jsdomparser.js
    ├── test-pages/
    │   ├── 001/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 002/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 003-metadata-preferred/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 004-metadata-space-separated-properties/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── 005-unescape-html-entities/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── aclu/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── aktualne/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── archive-of-our-own/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ars-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── article-author-tag/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── base-url/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── base-url-base-element/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── base-url-base-element-relative/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── basic-tags-cleaning/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── bbc-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── blogger/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── breitbart/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── bug-1255978/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── buzzfeed-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── citylab-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── clean-links/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── cnet/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── cnet-svg-classes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── cnn/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── comment-inside-script-parsing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── daringfireball-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── data-url-image/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── dev418/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── dropbox-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ebb-org/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ehow-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ehow-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── embedded-videos/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── engadget/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── firefox-nightly-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── folha/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── gitlab-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── gmw/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── google-sre-book-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── guardian-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── heise/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── herald-sun-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── hidden-nodes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── hukumusume/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── iab-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ietf-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── invalid-attributes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── js-link-replacement/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── keep-images/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── keep-tabular-data/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── la-nacion/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lazy-image-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lazy-image-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lazy-image-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lemonde-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── liberation-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lifehacker-post-comment-load/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lifehacker-working/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── links-in-tables/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── lwn-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mathjax/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medicalnewstoday/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medium-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medium-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── medium-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mercurial/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── metadata-content-missing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── missing-paragraphs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mozilla-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── mozilla-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── msn/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── normalize-spaces/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── nytimes-5/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── ol/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── parsely-metadata/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── pixnet/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── qq/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── quanta-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-aria-hidden/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-extra-brs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-extra-paragraphs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── remove-script-tags/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── reordering-paragraphs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── replace-brs/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── replace-font-tags/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── royal-road/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── rtl-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── salon-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── schema-org-context-object/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── seattletimes-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── simplyfound-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── social-buttons/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── spiceworks/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── style-tags-removal/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── svg-parsing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── table-style-attributes/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── telegraph/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── theverge/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── title-and-h1-discrepancy/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── title-en-dash/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── tmz-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── toc-missing/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── topicseed-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── tumblr/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── v8-blog/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── videos-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── videos-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── visibility-hidden/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wapo-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wapo-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── webmd-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── webmd-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikia/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wikipedia-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── wordpress/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-1/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-2/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-3/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   ├── yahoo-4/
    │   │   ├── expected-metadata.json
    │   │   ├── expected.html
    │   │   └── source.html
    │   └── youth/
    │       ├── expected-metadata.json
    │       ├── expected.html
    │       └── source.html
    ├── test-readability.js
    └── utils.js

Download .txt

SYMBOL INDEX (142 symbols across 9 files)

FILE: JSDOMParser.js
  function encodeTextContentHTML (line 46) | function encodeTextContentHTML(s) {
  function encodeHTML (line 52) | function encodeHTML(s) {
  function decodeHTML (line 58) | function decodeHTML(str) {
  function getElementsByTagName (line 303) | function getElementsByTagName(tag) {
  method firstChild (line 334) | get firstChild() {
  method firstElementChild (line 338) | get firstElementChild() {
  method lastChild (line 342) | get lastChild() {
  method lastElementChild (line 346) | get lastElementChild() {
  method _insertNodesAtIndex (line 362) | _insertNodesAtIndex(nodes, index) {
  method appendChild (line 450) | appendChild(child) {
  method insertBefore (line 459) | insertBefore(newNode, referenceNode) {
  method remove (line 475) | remove() {
  method removeChild (line 515) | removeChild(child) {
  method replaceChild (line 519) | replaceChild(newNode, oldNode) {
  method value (line 548) | get value() {
  method setValue (line 551) | setValue(newValue) {
  method getEncodedValue (line 554) | getEncodedValue() {
  method cloneNode (line 558) | cloneNode() {
  method textContent (line 594) | get textContent() {
  method innerHTML (line 600) | get innerHTML() {
  method innerHTML (line 607) | set innerHTML(newHTML) {
  method textContent (line 611) | set textContent(newText) {
  method getElementById (line 633) | getElementById(id) {
  method createElement (line 650) | createElement(tag) {
  method createTextNode (line 655) | createTextNode(text) {
  method createDocumentFragment (line 661) | createDocumentFragment() {
  method baseURI (line 665) | get baseURI() {
  method className (line 706) | get className() {
  method className (line 710) | set className(str) {
  method id (line 714) | get id() {
  method id (line 718) | set id(str) {
  method href (line 722) | get href() {
  method href (line 726) | set href(str) {
  method src (line 730) | get src() {
  method src (line 734) | set src(str) {
  method srcset (line 738) | get srcset() {
  method srcset (line 742) | set srcset(str) {
  method nodeName (line 746) | get nodeName() {
  method innerHTML (line 750) | get innerHTML() {
  method innerHTML (line 789) | set innerHTML(html) {
  method textContent (line 803) | set textContent(text) {
  method textContent (line 816) | get textContent() {
  method getAttribute (line 836) | getAttribute(name) {
  method setAttribute (line 846) | setAttribute(name, value) {
  method setAttributeNode (line 857) | setAttributeNode(node) {
  method removeAttribute (line 861) | removeAttribute(name) {
  method hasAttribute (line 871) | hasAttribute(name) {
  method getStyle (line 888) | getStyle(styleName) {
  method setStyle (line 906) | setStyle(styleName, styleValue) {
  method error (line 961) | error(m) {
  method peekNext (line 975) | peekNext() {
  method nextChar (line 982) | nextChar() {
  method readString (line 990) | readString(quote) {
  method readAttribute (line 1008) | readAttribute(node) {
  method makeElementNode (line 1045) | makeElementNode(retPair) {
  method match (line 1104) | match(str) {
  method discardTo (line 1120) | discardTo(str) {
  method readChildren (line 1131) | readChildren(node) {
  method discardNextComment (line 1141) | discardNextComment() {
  method readNode (line 1165) | readNode() {
  method parse (line 1270) | parse(html, url) {

FILE: Readability-readerable.js
  function isNodeVisible (line 30) | function isNodeVisible(node) {
  function isProbablyReaderable (line 52) | function isProbablyReaderable(doc, options = {}) {

FILE: Readability.js
  function Readability (line 27) | function Readability(doc, options) {
  method _postProcessContent (line 282) | _postProcessContent(articleContent) {
  method _removeNodes (line 304) | _removeNodes(nodeList, filterFn) {
  method _replaceNodeTags (line 327) | _replaceNodeTags(nodeList, newTagName) {
  method _forEachNode (line 348) | _forEachNode(nodeList, fn) {
  method _findNode (line 363) | _findNode(nodeList, fn) {
  method _someNode (line 378) | _someNode(nodeList, fn) {
  method _everyNode (line 393) | _everyNode(nodeList, fn) {
  method _getAllNodesWithTag (line 397) | _getAllNodesWithTag(node, tagNames) {
  method _cleanClasses (line 418) | _cleanClasses(node) {
  method _isUrl (line 442) | _isUrl(str) {
  method _fixRelativeUris (line 457) | _fixRelativeUris(articleContent) {
  method _simplifyNestedElements (line 538) | _simplifyNestedElements(articleContent) {
  method _getArticleTitle (line 573) | _getArticleTitle() {
  method _prepDocument (line 669) | _prepDocument() {
  method _nextNode (line 687) | _nextNode(node) {
  method _replaceBrs (line 706) | _replaceBrs(elem) {
  method _setNodeTag (line 762) | _setNodeTag(node, tag) {
  method _prepArticle (line 792) | _prepArticle(articleContent) {
  method _initializeNode (line 903) | _initializeNode(node) {
  method _removeAndGetNext (line 942) | _removeAndGetNext(node) {
  method _getNextNode (line 959) | _getNextNode(node, ignoreSelfAndKids) {
  method _textSimilarity (line 981) | _textSimilarity(textA, textB) {
  method _isValidByline (line 1005) | _isValidByline(node, matchString) {
  method _getNodeAncestors (line 1019) | _getNodeAncestors(node, maxDepth) {
  method _grabArticle (line 1041) | _grabArticle(page) {
  method _unescapeHtmlEntities (line 1631) | _unescapeHtmlEntities(str) {
  method _getJSONLD (line 1658) | _getJSONLD(doc) {
  method _getArticleMetadata (line 1783) | _getArticleMetadata(jsonld) {
  method _isSingleImage (line 1897) | _isSingleImage(node) {
  method _unwrapNoscriptImages (line 1918) | _unwrapNoscriptImages(doc) {
  method _removeScripts (line 2001) | _removeScripts(doc) {
  method _hasSingleTagInsideElement (line 2013) | _hasSingleTagInsideElement(element, tag) {
  method _isElementWithoutContent (line 2028) | _isElementWithoutContent(node) {
  method _hasChildBlockElement (line 2044) | _hasChildBlockElement(element) {
  method _isPhrasingContent (line 2057) | _isPhrasingContent(node) {
  method _isWhitespace (line 2068) | _isWhitespace(node) {
  method _getInnerText (line 2084) | _getInnerText(e, normalizeSpaces) {
  method _getCharCount (line 2102) | _getCharCount(e, s) {
  method _cleanStyles (line 2114) | _cleanStyles(e) {
  method _getLinkDensity (line 2143) | _getLinkDensity(element) {
  method _getClassWeight (line 2168) | _getClassWeight(e) {
  method _clean (line 2208) | _clean(e, tag) {
  method _hasAncestorTag (line 2243) | _hasAncestorTag(node, tagName, maxDepth, filterFn) {
  method _getRowAndColumnCount (line 2266) | _getRowAndColumnCount(table) {
  method _markDataTables (line 2297) | _markDataTables(root) {
  method _fixLazyImages (line 2358) | _fixLazyImages(root) {
  method _getTextDensity (line 2440) | _getTextDensity(e, tags) {
  method _cleanConditionally (line 2460) | _cleanConditionally(e, tag) {
  method _cleanMatchedNodes (line 2667) | _cleanMatchedNodes(e, filter) {
  method _cleanHeaders (line 2685) | _cleanHeaders(e) {
  method _headerDuplicatesTitle (line 2703) | _headerDuplicatesTitle(node) {
  method _flagIsActive (line 2712) | _flagIsActive(flag) {
  method _removeFlag (line 2716) | _removeFlag(flag) {
  method _isProbablyVisible (line 2720) | _isProbablyVisible(node) {
  method parse (line 2747) | parse() {

FILE: index.d.ts
  type ReadabilityOptions (line 17) | interface ReadabilityOptions<T = string> {
  class Readability (line 80) | class Readability<T = string> {
  type Article (line 120) | type Article = ReturnType<Readability['parse']>;

FILE: test/generate-testcase.js
  function generateTestcase (line 23) | function generateTestcase(slug) {
  function fetchSource (line 48) | function fetchSource(url, callbackFn) {
  function fetchLocalSource (line 94) | function fetchLocalSource(sourceFile, callbackFn) {
  function sanitizeSource (line 105) | function sanitizeSource(html, callbackFn) {
  function onResponseReceived (line 120) | function onResponseReceived(error, source, destRoot) {
  function runReadability (line 147) | function runReadability(source, destPath, metadataDestPath) {

FILE: test/test-isProbablyReaderable.js
  method visibilityChecker (line 92) | visibilityChecker() {
  method visibilityChecker (line 104) | visibilityChecker() {

FILE: test/test-jsdomparser.js
  function nodeExpect (line 17) | function nodeExpect(actual, expected) {
  function checkBase (line 545) | function checkBase(base, expectedResult) {

FILE: test/test-readability.js
  function reformatError (line 16) | function reformatError(err) {
  function inOrderTraverse (line 22) | function inOrderTraverse(fromNode) {
  function inOrderIgnoreEmptyTextNodes (line 32) | function inOrderIgnoreEmptyTextNodes(fromNode) {
  function traverseDOM (line 39) | function traverseDOM(callback, expectedDOM, actualDOM) {
  function htmlTransform (line 53) | function htmlTransform(str) {
  function runTestsWithItems (line 57) | function runTestsWithItems(
  function removeCommentNodesRecursively (line 233) | function removeCommentNodesRecursively(node) {
  method serializer (line 336) | serializer(el) {

FILE: test/utils.js
  function readFile (line 7) | function readFile(filePath) {
  function readJSON (line 11) | function readJSON(jsonPath) {

Copy disabled (too large) Download .json

Condensed preview — 415 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (29,505K chars).

[
  {
    "path": ".gitattributes",
    "chars": 55,
    "preview": "**/*.html linguist-detectable=false\n* text=auto eol=lf\n"
  },
  {
    "path": ".gitignore",
    "chars": 80,
    "preview": ".DS_Store\nnpm-debug.log\nnode_modules\n.metadata\n*.pyc\n*~\n.*.sw?\n.sw?\n*.jar\n*.xpi\n"
  },
  {
    "path": ".npmignore",
    "chars": 69,
    "preview": "/benchmarks/\n/test/\n.gitattributes\n.release-it.json\n.taskcluster.yml\n"
  },
  {
    "path": ".prettierrc.js",
    "chars": 351,
    "preview": "/* This Source Code Form is subject to the terms of the Mozilla Public\n * License, v. 2.0. If a copy of the MPL was not "
  },
  {
    "path": ".release-it.json",
    "chars": 275,
    "preview": "{\n  \"plugins\": {\n    \"@release-it/keep-a-changelog\": {\n      \"addUnreleased\": true,\n      \"filename\": \"CHANGELOG.md\"\n   "
  },
  {
    "path": ".taskcluster.yml",
    "chars": 1203,
    "preview": "version: 1\npolicy:\n  pullRequests: public\ntasks:\n  $let:\n    head_rev:\n      $if: 'tasks_for == \"github-pull-request\"'\n "
  },
  {
    "path": "CHANGELOG.md",
    "chars": 5667,
    "preview": "# Changelog\n\nNotable changes to readability will be documented in this file.\n\nThe format is based on [Keep a Changelog]("
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "chars": 691,
    "preview": "# Community Participation Guidelines\n\nThis repository is governed by Mozilla's code of conduct and etiquette guidelines."
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 3413,
    "preview": "# Contributing\n\nThank you for wanting to help make `readability` better!\n\nFor outstanding issues, see the issue list in "
  },
  {
    "path": "JSDOMParser.js",
    "chars": 37436,
    "preview": "/* This Source Code Form is subject to the terms of the Mozilla Public\n * License, v. 2.0. If a copy of the MPL was not "
  },
  {
    "path": "LICENSE.md",
    "chars": 553,
    "preview": "Copyright (c) 2010 Arc90 Inc\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file "
  },
  {
    "path": "README.md",
    "chars": 7376,
    "preview": "# Readability.js\n\nA standalone version of the readability library used for [Firefox Reader View](https://support.mozilla"
  },
  {
    "path": "Readability-readerable.js",
    "chars": 4279,
    "preview": "/*\n * Copyright (c) 2010 Arc90 Inc\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not "
  },
  {
    "path": "Readability.js",
    "chars": 90903,
    "preview": "/*\n * Copyright (c) 2010 Arc90 Inc\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not "
  },
  {
    "path": "SECURITY.md",
    "chars": 1130,
    "preview": "# Security Policy\n\nThis code is included in Mozilla’s client [bug bounty program](https://www.mozilla.org/en-US/security"
  },
  {
    "path": "eslint.config.mjs",
    "chars": 1109,
    "preview": "/* eslint-env node */\n\"use strict\";\n\nimport globals from \"globals\";\n\nimport { createRequire } from \"module\";\nconst requi"
  },
  {
    "path": "index.d.ts",
    "chars": 4114,
    "preview": "/**\n * Decides whether or not the document is reader-able without parsing the whole thing.\n * @return {boolean} Whether "
  },
  {
    "path": "index.js",
    "chars": 192,
    "preview": "/* eslint-env node */\nvar Readability = require(\"./Readability\");\nvar isProbablyReaderable = require(\"./Readability-read"
  },
  {
    "path": "package.json",
    "chars": 1159,
    "preview": "{\n  \"name\": \"@mozilla/readability\",\n  \"version\": \"0.6.0\",\n  \"description\": \"A standalone version of the readability libr"
  },
  {
    "path": "test/debug-testcase.js",
    "chars": 561,
    "preview": "/* eslint-env node */\n\nvar Readability = require(\"../Readability\");\nvar { JSDOM } = require(\"jsdom\");\nvar fs = require(\""
  },
  {
    "path": "test/generate-testcase.js",
    "chars": 6106,
    "preview": "/* eslint-env node, mocha */\n\nvar debug = false;\n\nvar path = require(\"path\");\nvar fs = require(\"fs\");\nvar JSDOM = requir"
  },
  {
    "path": "test/test-isProbablyReaderable.js",
    "chars": 5266,
    "preview": "/* eslint-env node, mocha */\n\nvar JSDOM = require(\"jsdom\").JSDOM;\nvar chai = require(\"chai\");\nchai.config.includeStack ="
  },
  {
    "path": "test/test-jsdomparser.js",
    "chars": 21919,
    "preview": "/* eslint-env node, mocha */\n\nvar chai = require(\"chai\");\nchai.config.includeStack = true;\nvar expect = chai.expect;\n\nva"
  },
  {
    "path": "test/test-pages/001/expected-metadata.json",
    "chars": 243,
    "preview": "{\n  \"title\": \"Get your Frontend JavaScript Code Covered | Code\",\n  \"byline\": \"Nicolas Perriault\",\n  \"dir\": null,\n  \"lang"
  },
  {
    "path": "test/test-pages/001/expected.html",
    "chars": 5902,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <section>\n        <p><strong>So finally you're <a href=\"http://fakehost/c"
  },
  {
    "path": "test/test-pages/001/source.html",
    "chars": 12504,
    "preview": "<!DOCTYPE html>\n<html class=\"no-js\" lang=\"en\">\n    \n    <head>\n        <meta charset=\"utf-8\"/>\n        <meta http-equiv="
  },
  {
    "path": "test/test-pages/002/expected-metadata.json",
    "chars": 375,
    "preview": "{\n  \"title\": \"This API is so Fetching!\",\n  \"byline\": \"Nikhil Marathe\",\n  \"dir\": null,\n  \"lang\": \"en-US\",\n  \"excerpt\": \"F"
  },
  {
    "path": "test/test-pages/002/expected.html",
    "chars": 30017,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"content-main\">\n        <article role=\"article\">\n            <p>F"
  },
  {
    "path": "test/test-pages/002/source.html",
    "chars": 142048,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-US\" id=\"hacks-mozilla-org\">\n    \n    <head>\n        <meta name=\"viewport\" content=\"width="
  },
  {
    "path": "test/test-pages/003-metadata-preferred/expected-metadata.json",
    "chars": 218,
    "preview": "{\n  \"title\": \"Dublin Core property title\",\n  \"byline\": \"Dublin Core property author\",\n  \"dir\": null,\n  \"excerpt\": \"Dubli"
  },
  {
    "path": "test/test-pages/003-metadata-preferred/expected.html",
    "chars": 1043,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Test document title</h2>\n        <p> Lorem ipsum do"
  },
  {
    "path": "test/test-pages/003-metadata-preferred/source.html",
    "chars": 2408,
    "preview": "<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset=\"utf-8\"/>\n    <title>Title Element</title>\n    <meta name=\"title\" cont"
  },
  {
    "path": "test/test-pages/004-metadata-space-separated-properties/expected-metadata.json",
    "chars": 181,
    "preview": "{\n  \"title\": \"Preferred title\",\n  \"byline\": \"Creator Name\",\n  \"dir\": null,\n  \"excerpt\": \"Preferred description\",\n  \"site"
  },
  {
    "path": "test/test-pages/004-metadata-space-separated-properties/expected.html",
    "chars": 1043,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Test document title</h2>\n        <p> Lorem ipsum do"
  },
  {
    "path": "test/test-pages/004-metadata-space-separated-properties/source.html",
    "chars": 1725,
    "preview": "<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset=\"utf-8\"/>\n    <title>Title Element</title>\n    <meta property=\"x:title"
  },
  {
    "path": "test/test-pages/005-unescape-html-entities/expected-metadata.json",
    "chars": 149,
    "preview": "{\n  \"title\": \"\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"&#xg; 😭 😭 � �\",\n  \"siteName\": null,\n  \"publishedTime\": n"
  },
  {
    "path": "test/test-pages/005-unescape-html-entities/expected.html",
    "chars": 54,
    "preview": "<div id=\"readability-page-1\" class=\"page\"> Test </div>"
  },
  {
    "path": "test/test-pages/005-unescape-html-entities/source.html",
    "chars": 221,
    "preview": "<!DOCTYPE html>\n<html>\n    <head>\n        <meta property=\"dc:description og:description\" content=\"&amp;#xg; &amp;#x1F62D"
  },
  {
    "path": "test/test-pages/aclu/expected-metadata.json",
    "chars": 390,
    "preview": "{\n  \"title\": \"Facebook Is Tracking Me Even Though I’m Not on Facebook\",\n  \"byline\": \"Daniel Kahn Gillmor\",\n  \"dir\": \"ltr"
  },
  {
    "path": "test/test-pages/aclu/expected.html",
    "chars": 13782,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p> I don't use Facebook. I'm not technophobic — I'm a geek"
  },
  {
    "path": "test/test-pages/aclu/source.html",
    "chars": 205096,
    "preview": "<!DOCTYPE html>\n<!--[if IEMobile 7]><html class=\"iem7\"  lang=\"en\" dir=\"ltr\"><![endif]--><!--[if lte IE 6]><html class=\"l"
  },
  {
    "path": "test/test-pages/aktualne/expected-metadata.json",
    "chars": 343,
    "preview": "{\n  \"title\": \"West Ham hrozí gigantům, okouzlil i Linekera. Součka je snadné přehlédnout\",\n  \"byline\": \"Aleš Vávra\",\n  \""
  },
  {
    "path": "test/test-pages/aktualne/expected.html",
    "chars": 5379,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p> Zázrak jedné sezony? West Ham dává pochybovačům stále pádnější odpově"
  },
  {
    "path": "test/test-pages/aktualne/source.html",
    "chars": 298889,
    "preview": "<!DOCTYPE html>\n<html lang=\"cs\" prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article"
  },
  {
    "path": "test/test-pages/archive-of-our-own/expected-metadata.json",
    "chars": 367,
    "preview": "{\n  \"title\": \"Conversations with a Cryptid - Chapter 1 - AMournfulHowlInTheNight - 僕のヒーローアカデミア | Boku no Hero Academia\","
  },
  {
    "path": "test/test-pages/archive-of-our-own/expected.html",
    "chars": 24223,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div role=\"article\" id=\"chapters\">\n        <h3 id=\"work\"> Chapter Text </"
  },
  {
    "path": "test/test-pages/archive-of-our-own/source.html",
    "chars": 265146,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test/test-pages/ars-1/expected-metadata.json",
    "chars": 305,
    "preview": "{\n  \"title\": \"Just-released Minecraft exploit makes it easy to crash game servers\",\n  \"byline\": \"Dan Goodin\",\n  \"dir\": n"
  },
  {
    "path": "test/test-pages/ars-1/expected.html",
    "chars": 5976,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <header>\n            <h4> Biz &amp; IT — </h4>\n            "
  },
  {
    "path": "test/test-pages/ars-1/source.html",
    "chars": 55962,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-us\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en-us\">\n    <head>\n        <title>\n   "
  },
  {
    "path": "test/test-pages/article-author-tag/expected-metadata.json",
    "chars": 413,
    "preview": "{\n  \"title\": \"The Deck of Cards That Made Tarot A Global Phenomenon\",\n  \"byline\": \"Laura June Topolsky\",\n  \"dir\": null,\n"
  },
  {
    "path": "test/test-pages/article-author-tag/expected.html",
    "chars": 18722,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <section id=\"article-body\">\n        <p>\n            <img src=\"https://img"
  },
  {
    "path": "test/test-pages/article-author-tag/source.html",
    "chars": 290579,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test/test-pages/base-url/expected-metadata.json",
    "chars": 629,
    "preview": "{\n  \"title\": \"Base URL test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consectetur adi"
  },
  {
    "path": "test/test-pages/base-url/expected.html",
    "chars": 1844,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p> Lorem ipsum dolor sit amet, "
  },
  {
    "path": "test/test-pages/base-url/source.html",
    "chars": 1764,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Base URL test</title>\n</head>\n<body>\n  <article>\n    <h"
  },
  {
    "path": "test/test-pages/base-url-base-element/expected-metadata.json",
    "chars": 639,
    "preview": "{\n  \"title\": \"Base URL with base test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, conse"
  },
  {
    "path": "test/test-pages/base-url-base-element/expected.html",
    "chars": 1835,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p> Lorem ipsum dolor sit amet, "
  },
  {
    "path": "test/test-pages/base-url-base-element/source.html",
    "chars": 1793,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <base href=\"/\"/>\n  <title>Base URL with base test</title>\n</he"
  },
  {
    "path": "test/test-pages/base-url-base-element-relative/expected-metadata.json",
    "chars": 648,
    "preview": "{\n  \"title\": \"Base URL with base relative test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit am"
  },
  {
    "path": "test/test-pages/base-url-base-element-relative/expected.html",
    "chars": 1895,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p> Lorem ipsum dolor sit amet, "
  },
  {
    "path": "test/test-pages/base-url-base-element-relative/source.html",
    "chars": 1806,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <base href=\"base/\"/>\n  <title>Base URL with base relative test"
  },
  {
    "path": "test/test-pages/basic-tags-cleaning/expected-metadata.json",
    "chars": 291,
    "preview": "{\n  \"title\": \"Basic tag cleaning test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, conse"
  },
  {
    "path": "test/test-pages/basic-tags-cleaning/expected.html",
    "chars": 987,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit"
  },
  {
    "path": "test/test-pages/basic-tags-cleaning/source.html",
    "chars": 1404,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Basic tag cleaning test</title>\n</head>\n<body>\n  <artic"
  },
  {
    "path": "test/test-pages/bbc-1/expected-metadata.json",
    "chars": 376,
    "preview": "{\n  \"title\": \"Obama admits US gun laws are his 'biggest frustration'\",\n  \"byline\": \"BBC News\",\n  \"dir\": null,\n  \"lang\": "
  },
  {
    "path": "test/test-pages/bbc-1/expected.html",
    "chars": 7944,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div property=\"articleBody\">\n        <p>President Barack Obama has admitt"
  },
  {
    "path": "test/test-pages/bbc-1/source.html",
    "chars": 264051,
    "preview": "<html class=\"orb-js bbcdotcom bbcdotcom-responsive ads-enabled ctm ff flex bbcdotcom-init bbcdotcom-analytics-init grunt"
  },
  {
    "path": "test/test-pages/blogger/expected-metadata.json",
    "chars": 344,
    "preview": "{\n  \"title\": \"Open Verilog flow for Silego GreenPak4 programmable logic devices\",\n  \"byline\": null,\n  \"dir\": \"ltr\",\n  \"e"
  },
  {
    "path": "test/test-pages/blogger/expected.html",
    "chars": 18100,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"post-body-932306423056216142\" itemprop=\"description articleBody\""
  },
  {
    "path": "test/test-pages/blogger/source.html",
    "chars": 154796,
    "preview": "<!DOCTYPE html>\n<html class='v2' dir='ltr' xmlns='http://www.w3.org/1999/xhtml' xmlns:b='http://www.google.com/2005/gml/"
  },
  {
    "path": "test/test-pages/breitbart/expected-metadata.json",
    "chars": 425,
    "preview": "{\n  \"title\": \"'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?' - Breitbart\",\n  \"byl"
  },
  {
    "path": "test/test-pages/breitbart/expected.html",
    "chars": 3773,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <figure>\n            <div>\n                <p><img itemprop"
  },
  {
    "path": "test/test-pages/breitbart/source.html",
    "chars": 854445,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" dir=\"ltr\" prefix=\"og: http://ogp.me/ns# fb: htt"
  },
  {
    "path": "test/test-pages/bug-1255978/expected-metadata.json",
    "chars": 559,
    "preview": "{\n  \"title\": \"Seven secrets that hotel owners don't want you to know\",\n  \"byline\": \"Hazel Sheffield\",\n  \"dir\": null,\n  \""
  },
  {
    "path": "test/test-pages/bug-1255978/expected.html",
    "chars": 8519,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div itemprop=\"articleBody\" id=\"gigya-share-btns-2_gig_containerParent\">\n"
  },
  {
    "path": "test/test-pages/bug-1255978/source.html",
    "chars": 335904,
    "preview": "<!DOCTYPE html>\n<!--[if IE 8]>\n<html class=\"lt-ie9\">\n<![endif]-->\n<!--[if gt IE 8]><!-->\n<html xmlns=\"http://www.w3.org/"
  },
  {
    "path": "test/test-pages/buzzfeed-1/expected-metadata.json",
    "chars": 301,
    "preview": "{\n  \"title\": \"Student Dies After Diet Pills She Bought Online \\\"Burned Her Up From Within\\\"\",\n  \"byline\": null,\n  \"dir\":"
  },
  {
    "path": "test/test-pages/buzzfeed-1/expected.html",
    "chars": 3578,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"buzz_sub_buzz\">\n        <div id=\"superlist_3758406_5547137\" rel:"
  },
  {
    "path": "test/test-pages/buzzfeed-1/source.html",
    "chars": 378143,
    "preview": "<html class=\" fonts-loaded srcset svg inlinesvg no-webp\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:fb=\"http://www.faceb"
  },
  {
    "path": "test/test-pages/citylab-1/expected-metadata.json",
    "chars": 333,
    "preview": "{\n  \"title\": \"The Modern Ambitions Behind Neon\",\n  \"byline\": \"Sarah Archer\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"excerpt\":"
  },
  {
    "path": "test/test-pages/citylab-1/expected.html",
    "chars": 14005,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article itemscope=\"itemscope\" itemtype=\"https://schema.org/NewsArticle\">"
  },
  {
    "path": "test/test-pages/citylab-1/source.html",
    "chars": 173759,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" class=\"no-js\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <me"
  },
  {
    "path": "test/test-pages/clean-links/expected-metadata.json",
    "chars": 495,
    "preview": "{\n  \"title\": \"Bartleby the Scrivener Web Study Text\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Ere introducing the"
  },
  {
    "path": "test/test-pages/clean-links/expected.html",
    "chars": 85894,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <h3>Study Webtext</h3>\n        <h2><span face=\"Lucida Handw"
  },
  {
    "path": "test/test-pages/clean-links/source.html",
    "chars": 182350,
    "preview": "<!DOCTYPE html>\n<html>\n    \n    <head>\n        <title>Bartleby the Scrivener Web Study Text</title>\n        <meta http-e"
  },
  {
    "path": "test/test-pages/cnet/expected-metadata.json",
    "chars": 324,
    "preview": "{\n  \"title\": \"Zuckerberg offers peek at Facebook's acquisition strategies\",\n  \"byline\": \"Steven Musil\",\n  \"dir\": null,\n "
  },
  {
    "path": "test/test-pages/cnet/expected.html",
    "chars": 5922,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div itemprop=\"articleBody\" data-component=\"lazyloadImages\">\n        <fig"
  },
  {
    "path": "test/test-pages/cnet/source.html",
    "chars": 267207,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:og=\"http://opengraphprotocol.org/schema/\" xmlns:fb=\"htt"
  },
  {
    "path": "test/test-pages/cnet-svg-classes/expected-metadata.json",
    "chars": 351,
    "preview": "{\n  \"title\": \"Twitter Lite se estrena en México, Venezuela y otros nueve países\",\n  \"byline\": \"César Salza\",\n  \"dir\": nu"
  },
  {
    "path": "test/test-pages/cnet-svg-classes/expected.html",
    "chars": 34997,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div itemprop=\"articleBody\" data-component=\"lazyloadImages\">\n        <fig"
  },
  {
    "path": "test/test-pages/cnet-svg-classes/source.html",
    "chars": 136766,
    "preview": "<!DOCTYPE html>\n<html lang=\"es\" xmlns:og=\"http://opengraphprotocol.org/schema/\" xmlns:fb=\"http://ogp.me/ns/fb#\" class=\"\""
  },
  {
    "path": "test/test-pages/cnn/expected-metadata.json",
    "chars": 318,
    "preview": "{\n  \"title\": \"The 'birth lottery' and economic mobility\",\n  \"byline\": \"Ahiza Garcia\",\n  \"dir\": null,\n  \"excerpt\": \"A rec"
  },
  {
    "path": "test/test-pages/cnn/expected.html",
    "chars": 3265,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"storytext\">\n        <h2>The U.S. has long been heralded as a lan"
  },
  {
    "path": "test/test-pages/cnn/source.html",
    "chars": 258652,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "test/test-pages/comment-inside-script-parsing/expected-metadata.json",
    "chars": 287,
    "preview": "{\n  \"title\": \"Test script parsing\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consectet"
  },
  {
    "path": "test/test-pages/comment-inside-script-parsing/expected.html",
    "chars": 987,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>Lorem ipsum dolor sit amet, consectetur adipisicing elit"
  },
  {
    "path": "test/test-pages/comment-inside-script-parsing/source.html",
    "chars": 1221,
    "preview": "<html>\n  <head><title>Test script parsing</title></head>\n<body>\n  <script>\n    <!--\n    Silly test\n    <script src=\"foo."
  },
  {
    "path": "test/test-pages/daringfireball-1/expected-metadata.json",
    "chars": 231,
    "preview": "{\n  \"title\": \"Daring Fireball: Colophon\",\n  \"byline\": null,\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"excerpt\": \"Daring Fireball"
  },
  {
    "path": "test/test-pages/daringfireball-1/expected.html",
    "chars": 3253,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"Box\">\n        <h2>About This Site</h2>\n        <p>Daring Firebal"
  },
  {
    "path": "test/test-pages/daringfireball-1/source.html",
    "chars": 10099,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "test/test-pages/data-url-image/expected-metadata.json",
    "chars": 913,
    "preview": "{\n  \"title\": \"Document\",\n  \"byline\": null,\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"excerpt\": \"Lorem ipsum dolor sit amet conse"
  },
  {
    "path": "test/test-pages/data-url-image/expected.html",
    "chars": 66160,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <img src=\"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAA"
  },
  {
    "path": "test/test-pages/data-url-image/source.html",
    "chars": 65841,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n\t<meta charset=\"UTF-8\"></meta>\n\t<meta name=\"viewport\" content=\"width=device-widt"
  },
  {
    "path": "test/test-pages/dev418/expected-metadata.json",
    "chars": 382,
    "preview": "{\n  \"title\": \"Readability Test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consectetur "
  },
  {
    "path": "test/test-pages/dev418/expected.html",
    "chars": 2937,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusm"
  },
  {
    "path": "test/test-pages/dev418/source.html",
    "chars": 3815,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <meta charset=\"utf-8\" />\n        <title>\n"
  },
  {
    "path": "test/test-pages/dropbox-blog/expected-metadata.json",
    "chars": 646,
    "preview": "{\n  \"title\": \"How we designed Dropbox’s ATF - an async task framework\",\n  \"byline\": \"Arun Sai Krishnan\",\n  \"dir\": null,\n"
  },
  {
    "path": "test/test-pages/dropbox-blog/expected.html",
    "chars": 31024,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <div>\n            <p> I joined Dropbox not long after gradu"
  },
  {
    "path": "test/test-pages/dropbox-blog/source.html",
    "chars": 80086,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xml:lang=\"en\" data-cms-lang=\"en-us\" xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n   "
  },
  {
    "path": "test/test-pages/ebb-org/expected-metadata.json",
    "chars": 416,
    "preview": "{\n  \"title\": \"On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )\",\n  \"byline\": \"Bradley M. Kuhn (http:/"
  },
  {
    "path": "test/test-pages/ebb-org/expected.html",
    "chars": 16074,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"contentWithSidebar\">\n        <p> Tuesday 15 October 2019 by Brad"
  },
  {
    "path": "test/test-pages/ebb-org/source.html",
    "chars": 39578,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-US\" xml:lang=\"en-US\" xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <meta conte"
  },
  {
    "path": "test/test-pages/ehow-1/expected-metadata.json",
    "chars": 538,
    "preview": "{\n  \"title\": \"How to Build a Terrarium | eHow\",\n  \"byline\": \"Lucy Akins\",\n  \"dir\": null,\n  \"lang\": \"en-US\",\n  \"excerpt\":"
  },
  {
    "path": "test/test-pages/ehow-1/expected.html",
    "chars": 7374,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <header>\n            <div>\n                <p><span></span>"
  },
  {
    "path": "test/test-pages/ehow-1/source.html",
    "chars": 67355,
    "preview": "<!DOCTYPE html>\n<!--[if IE]><![endif]-->\n<html class=\"Crafts en-US\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.com/2008/"
  },
  {
    "path": "test/test-pages/ehow-2/expected-metadata.json",
    "chars": 563,
    "preview": "{\n  \"title\": \"How to Throw a Graduation Party on a Budget | eHow\",\n  \"byline\": \"Gina Roberts-Grey\",\n  \"dir\": null,\n  \"la"
  },
  {
    "path": "test/test-pages/ehow-2/expected.html",
    "chars": 12358,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <div data-type=\"AuthorProfile\">\n            <div>\n         "
  },
  {
    "path": "test/test-pages/ehow-2/source.html",
    "chars": 107055,
    "preview": "<!DOCTYPE html>\n<!--[if IE]><![endif]-->\n<html xmlns=\"http://www.w3.org/1999/xhtml\" class=\"Corporate en-US\" xmlns:fb=\"ht"
  },
  {
    "path": "test/test-pages/embedded-videos/expected-metadata.json",
    "chars": 636,
    "preview": "{\n  \"title\": \"Embedded videos test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consecte"
  },
  {
    "path": "test/test-pages/embedded-videos/expected.html",
    "chars": 1840,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p>Lorem ipsum dolor sit amet, c"
  },
  {
    "path": "test/test-pages/embedded-videos/source.html",
    "chars": 2059,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Embedded videos test</title>\n</head>\n<body>\n  <article>"
  },
  {
    "path": "test/test-pages/engadget/expected-metadata.json",
    "chars": 342,
    "preview": "{\n  \"title\": \"Xbox One X review:  A console that keeps up with gaming PCs\",\n  \"byline\": \"Devindra Hardawar\",\n  \"dir\": nu"
  },
  {
    "path": "test/test-pages/engadget/expected.html",
    "chars": 18717,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>The <a href=\"https://www.engadget.com/2017/06/13/the-xbo"
  },
  {
    "path": "test/test-pages/engadget/source.html",
    "chars": 350100,
    "preview": "<html lang=\"en\">\n\n<head>\n    <meta charset=\"UTF-8\"></meta>\n    <meta http-equiv=\"cache-control\" content=\"no-cache\"></met"
  },
  {
    "path": "test/test-pages/firefox-nightly-blog/expected-metadata.json",
    "chars": 428,
    "preview": "{\n  \"title\": \"These Weeks in Firefox: Issue 85 – Firefox Nightly News\",\n  \"byline\": \"Mike Conley\",\n  \"dir\": \"ltr\",\n  \"la"
  },
  {
    "path": "test/test-pages/firefox-nightly-blog/expected.html",
    "chars": 28053,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"content\">\n        <div>\n            <article id=\"post-997\">\n    "
  },
  {
    "path": "test/test-pages/firefox-nightly-blog/source.html",
    "chars": 82689,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-US\" dir=\"ltr\" class=\"no-js\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en-US\">\n    <h"
  },
  {
    "path": "test/test-pages/folha/expected-metadata.json",
    "chars": 359,
    "preview": "{\n  \"title\": \"Tite diz que errou ao levar taça da Libertadores a Lula em 2012\",\n  \"byline\": \"Bruno (Henrique Zecchin) Ro"
  },
  {
    "path": "test/test-pages/folha/expected.html",
    "chars": 2961,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div data-share-text=\"\" data-news-content-text=\"\" data-disable-copy=\"\" da"
  },
  {
    "path": "test/test-pages/folha/source.html",
    "chars": 368009,
    "preview": "<!DOCTYPE html>\n<html lang=\"pt-BR\" data-version=\"prod@5391c020\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"pt-BR\">\n "
  },
  {
    "path": "test/test-pages/gitlab-blog/expected-metadata.json",
    "chars": 373,
    "preview": "{\n  \"title\": \"3 surprising findings from our 2024 Global DevSecOps Survey\",\n  \"byline\": \"Dave Steer\",\n  \"dir\": null,\n  \""
  },
  {
    "path": "test/test-pages/gitlab-blog/expected.html",
    "chars": 12857,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div data-v-b794d8fe=\"\" data-v-0c13407a=\"\" data-v-74bd29c6=\"\">\n        <p"
  },
  {
    "path": "test/test-pages/gitlab-blog/source.html",
    "chars": 71889,
    "preview": "<!DOCTYPE html>\n<html data-n-head-ssr=\"\" lang=\"en-us\" data-n-head=\"%7B%22lang%22:%7B%22ssr%22:%22en-us%22%7D%7D\" xmlns=\""
  },
  {
    "path": "test/test-pages/gmw/expected-metadata.json",
    "chars": 220,
    "preview": "{\n  \"title\": \"宇航员在太空中喝酒会怎么样？后果很严重 _探索者 _光明网\",\n  \"byline\": \"肖春芳\",\n  \"dir\": null,\n  \"excerpt\": \"不幸的是，对于希望能喝上一杯的太空探险者，那些将他们"
  },
  {
    "path": "test/test-pages/gmw/expected.html",
    "chars": 5013,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"contentMain\">\n        <p>　　翱翔于距地球数千公里的太空中，进入广袤漆黑的未知领域，是一项艰苦卓绝的工作"
  },
  {
    "path": "test/test-pages/gmw/source.html",
    "chars": 134867,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n\n<head>\n    <script type=\"text/javascript\" async=\"\" src=\"htt"
  },
  {
    "path": "test/test-pages/google-sre-book-1/expected-metadata.json",
    "chars": 539,
    "preview": "{\n  \"title\": \"Google - Site Reliability Engineering\",\n  \"byline\": \"Written by Rob Ewaschuk\\n                            "
  },
  {
    "path": "test/test-pages/google-sre-book-1/expected.html",
    "chars": 41915,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <section data-type=\"chapter\" id=\"maia-main\" role=\"main\">\n        <h2> Mon"
  },
  {
    "path": "test/test-pages/google-sre-book-1/source.html",
    "chars": 69880,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test/test-pages/guardian-1/expected-metadata.json",
    "chars": 395,
    "preview": "{\n  \"title\": \"'What is the sea telling us?': Māori tribes fearful over whale strandings | Eleanor Ainge Roy\",\n  \"byline\""
  },
  {
    "path": "test/test-pages/guardian-1/expected.html",
    "chars": 66015,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div itemprop=\"articleBody\" data-test-id=\"article-review-body\">\n        <"
  },
  {
    "path": "test/test-pages/guardian-1/source.html",
    "chars": 1160671,
    "preview": "<!DOCTYPE html>\n<html id=\"js-context\" class=\"js-off is-not-modern id--signed-out\" lang=\"en\" data-page-path=\"/environment"
  },
  {
    "path": "test/test-pages/heise/expected-metadata.json",
    "chars": 376,
    "preview": "{\n  \"title\": \"1Password für Mac generiert Einmal-Passwörter\",\n  \"byline\": \"Mac & i\",\n  \"dir\": null,\n  \"lang\": \"de\",\n  \"e"
  },
  {
    "path": "test/test-pages/heise/expected.html",
    "chars": 2718,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <figure>\n            <img src=\"http://3.f.ix.de/scale/geome"
  },
  {
    "path": "test/test-pages/heise/source.html",
    "chars": 61889,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"de\"><head>\n    <title>1Password für Mac generiert Einma"
  },
  {
    "path": "test/test-pages/herald-sun-1/expected-metadata.json",
    "chars": 442,
    "preview": "{\n  \"title\": \"Angry media won’t buckle over new surveillance laws\",\n  \"byline\": \"JOE HILDEBRAND\",\n  \"dir\": null,\n  \"lang"
  },
  {
    "path": "test/test-pages/herald-sun-1/expected.html",
    "chars": 5940,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <div>\n            <p><img data-src=\"http://api.news.com.au/"
  },
  {
    "path": "test/test-pages/herald-sun-1/source.html",
    "chars": 62020,
    "preview": "<!--?xml version=\"1.0\" encoding=\"UTF-8\" ?--><!DOCTYPE html PUBLIC \"-//WAPFORUM//DTD XHTML Mobile 1.2//EN\" \"http://www.op"
  },
  {
    "path": "test/test-pages/hidden-nodes/expected-metadata.json",
    "chars": 1133,
    "preview": "{\n  \"title\": \"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt\",\n  \"bylin"
  },
  {
    "path": "test/test-pages/hidden-nodes/expected.html",
    "chars": 1010,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p> Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam non"
  },
  {
    "path": "test/test-pages/hidden-nodes/source.html",
    "chars": 3369,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test/test-pages/hukumusume/expected-metadata.json",
    "chars": 199,
    "preview": "{\n  \"title\": \"欲張りなイヌ　＜福娘童話集　きょうのイソップ童話＞\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"福娘童話集 > きょうのイソップ童話 > １月のイソップ童話 "
  },
  {
    "path": "test/test-pages/hukumusume/expected.html",
    "chars": 12332,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <td>\n            <table>\n                <tbody>\n          "
  },
  {
    "path": "test/test-pages/hukumusume/source.html",
    "chars": 22824,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <title>\n            欲張りなイヌ　＜福娘童話集　きょうのイソッ"
  },
  {
    "path": "test/test-pages/iab-1/expected-metadata.json",
    "chars": 589,
    "preview": "{\n  \"title\": \"Getting LEAN with Digital Ad UX | IAB\",\n  \"byline\": \"By\\n\\t\\t\\tScott Cunningham\",\n  \"dir\": null,\n  \"lang\":"
  },
  {
    "path": "test/test-pages/iab-1/expected.html",
    "chars": 6814,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>We messed up. As technologists, tasked with delivering c"
  },
  {
    "path": "test/test-pages/iab-1/source.html",
    "chars": 102343,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en-US\" prefix=\"og: http://ogp.me/ns#\" class=\"js flexbox"
  },
  {
    "path": "test/test-pages/ietf-1/expected-metadata.json",
    "chars": 161,
    "preview": "{\n  \"title\": \"remoteStorage\",\n  \"byline\": \"Jong, Michiel de\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"siteName\": null,\n  \"publ"
  },
  {
    "path": "test/test-pages/ietf-1/expected.html",
    "chars": 56074,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <span>[<a href=\"http://fakehost/html/\" title=\"Document search and retriev"
  },
  {
    "path": "test/test-pages/ietf-1/source.html",
    "chars": 64711,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n  \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dt"
  },
  {
    "path": "test/test-pages/invalid-attributes/expected-metadata.json",
    "chars": 203,
    "preview": "{\n  \"title\": \"Lorem Ipsum\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consectetur adipi"
  },
  {
    "path": "test/test-pages/invalid-attributes/expected.html",
    "chars": 154,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n  <div \"=\"\">\n    <p>\n        Lorem ipsum dolor sit amet, consectetur adipisci"
  },
  {
    "path": "test/test-pages/invalid-attributes/source.html",
    "chars": 437,
    "preview": "<!DOCTYPE html>\n<html>\n    <head>\n        <title>Lorem Ipsum</title>\n    </head>\n    <body>\n        <main>\n            <"
  },
  {
    "path": "test/test-pages/js-link-replacement/expected-metadata.json",
    "chars": 164,
    "preview": "{\n  \"title\": \"Replace javascript: links\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"abc\",\n  \"siteName\": null,\n  \"pu"
  },
  {
    "path": "test/test-pages/js-link-replacement/expected.html",
    "chars": 114,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <span>\n        <p>abc</p>\n        <p>def</p> ghi\n    </span>\n</div>"
  },
  {
    "path": "test/test-pages/js-link-replacement/source.html",
    "chars": 209,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Replace javascript: links</title>\n</head>\n<body>  \n  <a"
  },
  {
    "path": "test/test-pages/keep-images/expected-metadata.json",
    "chars": 318,
    "preview": "{\n  \"title\": \"Inside the Deep Web Drug Lab\",\n  \"byline\": \"Joseph Cox\",\n  \"dir\": null,\n  \"excerpt\": \"Welcome to DoctorX’s"
  },
  {
    "path": "test/test-pages/keep-images/expected.html",
    "chars": 29984,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div name=\"ef8c\">\n        <div>\n            <figure name=\"b9ad\" id=\"b9ad\""
  },
  {
    "path": "test/test-pages/keep-images/source.html",
    "chars": 140549,
    "preview": "<!DOCTYPE html>\n<html>\n\n    <head prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# medium-com: http://ogp.me/ns/fb"
  },
  {
    "path": "test/test-pages/keep-tabular-data/expected-metadata.json",
    "chars": 256,
    "preview": "{\n  \"title\": \"Friday Facts #282 - 0.17 in sight | Factorio\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Posted by ko"
  },
  {
    "path": "test/test-pages/keep-tabular-data/expected.html",
    "chars": 43079,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>Posted by kovarex, TOGos, Ernestas, Albert on 2019-02-15"
  },
  {
    "path": "test/test-pages/keep-tabular-data/source.html",
    "chars": 64201,
    "preview": "<html>\n\n<head>\n    <title>Friday Facts #282 - 0.17 in sight | Factorio</title>\n    <meta property=\"og:title\" content=\"Fr"
  },
  {
    "path": "test/test-pages/la-nacion/expected-metadata.json",
    "chars": 317,
    "preview": "{\n  \"title\": \"Una solución no violenta para la cuestión mapuche\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Los pue"
  },
  {
    "path": "test/test-pages/la-nacion/expected.html",
    "chars": 8282,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article id=\"nota\" itemscope=\"\" itemtype=\"http://schema.org/NewsArticle\" "
  },
  {
    "path": "test/test-pages/la-nacion/source.html",
    "chars": 63203,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></meta>\n    <link "
  },
  {
    "path": "test/test-pages/lazy-image-1/expected-metadata.json",
    "chars": 380,
    "preview": "{\n  \"title\": \"Node.js and CPU profiling on production (in real-time without downtime)\",\n  \"byline\": \"Vincent Vallet\",\n  "
  },
  {
    "path": "test/test-pages/lazy-image-1/expected.html",
    "chars": 16365,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <div>\n            <p><a rel=\"noopener\" href=\"http://fakehos"
  },
  {
    "path": "test/test-pages/lazy-image-1/source.html",
    "chars": 266955,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta data-rh=\"tr"
  },
  {
    "path": "test/test-pages/lazy-image-2/expected-metadata.json",
    "chars": 668,
    "preview": "{\n  \"title\": \"The Spectacular Story Of Metroid, One Of Gaming's Richest Universes\",\n  \"byline\": \"Mama Robotnik\",\n  \"dir\""
  },
  {
    "path": "test/test-pages/lazy-image-2/expected.html",
    "chars": 152915,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <figure data-id=\"18zu12g5xzyxojpg\" data-recommend-id=\"image"
  },
  {
    "path": "test/test-pages/lazy-image-2/source.html",
    "chars": 729852,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-us\" data-reactroot=\"\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en-us\">\n    <head>\n "
  },
  {
    "path": "test/test-pages/lazy-image-3/expected-metadata.json",
    "chars": 170,
    "preview": "{\n  \"title\": \"Lazy Load with Alt includes jpg/png/webp extensions\",\n  \"byline\": null,\n  \"dir\": null,\n  \"siteName\": null,"
  },
  {
    "path": "test/test-pages/lazy-image-3/expected.html",
    "chars": 698,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Test Case 1</h2>\n        <img data-src=\"https://p3-"
  },
  {
    "path": "test/test-pages/lazy-image-3/source.html",
    "chars": 664,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Lazy Load with Alt includes jpg/png/webp extensions</ti"
  },
  {
    "path": "test/test-pages/lemonde-1/expected-metadata.json",
    "chars": 399,
    "preview": "{\n  \"title\": \"Le projet de loi sur le renseignement massivement approuvé à l'Assemblée\",\n  \"byline\": \"Martin Untersinger"
  },
  {
    "path": "test/test-pages/lemonde-1/expected.html",
    "chars": 11888,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"articleBody\" itemprop=\"articleBody\">\n        <p>\n            <if"
  },
  {
    "path": "test/test-pages/lemonde-1/source.html",
    "chars": 86694,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 9]><html class=\"ie\"><![endif]-->\n<!--[if IE 9]><html class=\"ie9\"><![endif]-->\n<!--[if gte "
  },
  {
    "path": "test/test-pages/liberation-1/expected-metadata.json",
    "chars": 342,
    "preview": "{\n  \"title\": \"Un troisième Français mort dans le séisme au Népal\",\n  \"byline\": \"Par Sébastien Farcis\",\n  \"dir\": null,\n  "
  },
  {
    "path": "test/test-pages/liberation-1/expected.html",
    "chars": 2900,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <section id=\"news-article\">\n        <article itemscope=\"\" itemtype=\"http:"
  },
  {
    "path": "test/test-pages/liberation-1/source.html",
    "chars": 140236,
    "preview": "<!DOCTYPE html>\n<!-- HTML5 Boilerplate -->\n<!--[if lt IE 7]><html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en\"> <![endif"
  },
  {
    "path": "test/test-pages/lifehacker-post-comment-load/expected-metadata.json",
    "chars": 530,
    "preview": "{\n  \"title\": \"How to Program Your Mind to Stop Buying Crap You Don’t Need\",\n  \"byline\": \"Patrick Allan\",\n  \"dir\": null,\n"
  },
  {
    "path": "test/test-pages/lifehacker-post-comment-load/expected.html",
    "chars": 26548,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p data-textannotation-id=\"58a492029dca5e6a6e481d21b6b2933a"
  },
  {
    "path": "test/test-pages/lifehacker-post-comment-load/source.html",
    "chars": 248267,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 8]>\t <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\"> <![endif]-->\n<!--[if IE 8]>\t  "
  },
  {
    "path": "test/test-pages/lifehacker-working/expected-metadata.json",
    "chars": 530,
    "preview": "{\n  \"title\": \"How to Program Your Mind to Stop Buying Crap You Don’t Need\",\n  \"byline\": \"Patrick Allan\",\n  \"dir\": null,\n"
  },
  {
    "path": "test/test-pages/lifehacker-working/expected.html",
    "chars": 26548,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p data-textannotation-id=\"58a492029dca5e6a6e481d21b6b2933a"
  },
  {
    "path": "test/test-pages/lifehacker-working/source.html",
    "chars": 127994,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 8]>\t <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\"> <![endif]-->\n<!--[if IE 8]>\t  "
  },
  {
    "path": "test/test-pages/links-in-tables/expected-metadata.json",
    "chars": 330,
    "preview": "{\n  \"title\": \"Saving Data: Reducing the size of App Updates by 65%\",\n  \"byline\": null,\n  \"dir\": \"ltr\",\n  \"excerpt\": \"Pos"
  },
  {
    "path": "test/test-pages/links-in-tables/expected.html",
    "chars": 12659,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"post-body-2701400044422363572\" itemprop=\"articlesBody\">\n        "
  },
  {
    "path": "test/test-pages/links-in-tables/source.html",
    "chars": 131533,
    "preview": "<!DOCTYPE html>\n<html class=\"v2\" dir=\"ltr\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:b=\"http://www.google.com/2005/gml/"
  }
]

// ... and 215 more files (download for full content)

About this extraction

This page contains the full source code of the mozilla/readability GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 415 files (27.1 MB), approximately 7.1M tokens, and a symbol index with 142 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo