Full Code of niklak/dom_smoothie for AI

main 426cb6dfaf2d cached

536 files

35.8 MB

9.4M tokens

277 symbols

1 requests

Copy disabled (too large) Download .txt

Showing preview only (37,829K chars total). Download the full file to get everything.

Repository: niklak/dom_smoothie
Branch: main
Commit: 426cb6dfaf2d
Files: 536
Total size: 35.8 MB

Directory structure:
gitextract_hsrmhacy/

├── .cargo/
│   └── config.toml
├── .gitattributes
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── audit.yml
│       ├── benchmark.yml
│       ├── coverage.yml
│       ├── release.yml
│       ├── rust.yml
│       └── wasm.yml
├── .gitignore
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE
├── README.md
├── crates/
│   ├── bench/
│   │   ├── Cargo.toml
│   │   └── benches/
│   │       └── parse.rs
│   ├── cli/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── main.rs
│   ├── js/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── LICENSE_MIT
│   │   ├── README.md
│   │   ├── src/
│   │   │   ├── lib.rs
│   │   │   └── utils.rs
│   │   └── tests/
│   │       └── web.rs
│   └── lua/
│       ├── Cargo.toml
│       └── src/
│           └── lib.rs
├── deny.toml
├── src/
│   ├── ac_automat.rs
│   ├── config.rs
│   ├── glob.rs
│   ├── grab.rs
│   ├── grab_flags.rs
│   ├── helpers.rs
│   ├── lib.rs
│   ├── matching.rs
│   ├── prep_article.rs
│   ├── readability.rs
│   ├── readable.rs
│   ├── score.rs
│   ├── serde_helpers.rs
│   └── url_helpers.rs
├── test-pages/
│   ├── aclu_ld_meta.json
│   ├── alice-two-paragraphs.html
│   ├── alt/
│   │   ├── arstechnica/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   ├── hacker-news/
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   ├── mozilla_readability/
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   └── rust-blog/
│   │       ├── expected.md
│   │       ├── expected_alt.txt
│   │       └── source.html
│   ├── ld.json
│   ├── not-matching/
│   │   ├── empty-links/
│   │   │   ├── google-sre-book-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── lazy-image-2/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── yahoo-3/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   ├── redundant-class-page/
│   │   │   ├── nytimes-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── nytimes-2/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   ├── redundant-div/
│   │   │   ├── citylab-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── la-nacion/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── lwn-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── wapo-2/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   ├── redundant-font-attrs/
│   │   │   ├── clean-links/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── gmw/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── hukumusume/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── keep-tabular-data/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── replace-font-tags/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── table-style-attributes/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   └── urls/
│   │       ├── 002/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── ietf-1/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── toc-missing/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── v8-blog/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── videos-1/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── wikia/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── wikipedia/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       └── wikipedia-2/
│   │           ├── expected-metadata.json
│   │           ├── expected.html
│   │           └── source.html
│   ├── ok/
│   │   ├── 001/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 002/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 003-metadata-preferred/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 004-metadata-space-separated-properties/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 005-unescape-html-entities/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── aclu/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── aktualne/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── archive-of-our-own/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ars-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── base-url/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── base-url-base-element-relative/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── breitbart/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── citylab-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── clean-links/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── cnn/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── dev418/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ehow-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── engadget/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── gmw/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── hukumusume/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ietf-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── js-link-replacement/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── keep-tabular-data/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── la-nacion/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lwn-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medicalnewstoday/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medium-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── qq/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── replace-brs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── replace-font-tags/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── social-buttons/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── table-style-attributes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── tmz-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── toc-missing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── v8-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── videos-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikia/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikipedia/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikipedia-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   └── wikipedia-3/
│   │       ├── expected-metadata.json
│   │       ├── expected.html
│   │       └── source.html
│   ├── readability/
│   │   ├── article-author-tag/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── arxiv/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   ├── base-url-base-element/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── basic-tags-cleaning/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── bbc-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── blogger/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── bug-1255978/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── buzzfeed-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── cnet/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── cnet-svg-classes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── comment-inside-script-parsing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── daringfireball-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── data-url-image/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── dropbox-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ebb-org/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ehow-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── embedded-videos/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── firefox-nightly-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── folha/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── gitlab-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── google-sre-book-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── guardian-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── heise/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── herald-sun-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── hidden-nodes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── iab-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── invalid-attributes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── keep-images/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lazy-image-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lazy-image-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lazy-image-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lemonde-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── liberation-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lifehacker-post-comment-load/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lifehacker-working/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── links-in-tables/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mathjax/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medium-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medium-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mercurial/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── metadata-content-missing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── missing-paragraphs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mozilla-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mozilla-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── msn/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── normalize-spaces/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-5/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ol/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── parsely-metadata/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── pixnet/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── quanta-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-aria-hidden/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-extra-brs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-extra-paragraphs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-script-tags/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── reordering-paragraphs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── royal-road/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── salon-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── schema-org-context-object/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── seattletimes-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── simplyfound-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── spiceworks/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── style-tags-removal/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── svg-parsing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── telegraph/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── theverge/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── title-and-h1-discrepancy/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── title-en-dash/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── topicseed-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── tumblr/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── videos-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── visibility-hidden/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wapo-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wapo-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── webmd-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── webmd-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikipedia-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wordpress/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   └── youth/
│   │       ├── expected-metadata.json
│   │       ├── expected.html
│   │       └── source.html
│   ├── rustwiki_2024.html
│   ├── rustwiki_2024_result.html
│   └── rustwiki_2024_result.txt
└── tests/
    ├── alt.rs
    ├── bad.rs
    ├── candidate_modes.rs
    ├── common.rs
    ├── favicon.rs
    ├── metadata.rs
    ├── parse_policy.rs
    ├── readability.rs
    ├── readability_ok.rs
    └── wasm.rs

================================================
FILE CONTENTS
================================================

================================================
FILE: .cargo/config.toml
================================================
[target.wasm32-unknown-unknown]
runner = 'wasm-bindgen-test-runner'


================================================
FILE: .gitattributes
================================================

test-pages/** linguist-generated=true


================================================
FILE: .github/dependabot.yml
================================================
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file

version: 2
updates:
  - package-ecosystem: "cargo" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"


================================================
FILE: .github/workflows/audit.yml
================================================
name: Rust Audit

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1

jobs:

  msrv-verify:
    runs-on: ubuntu-24.04
    steps:
    - uses: actions/checkout@v4
    - uses: dtolnay/rust-toolchain@master
      with:
        toolchain: 1.75.0
    - name: Install cargo msrv
      uses: taiki-e/install-action@v2
      with:
        tool: cargo-msrv
    - name: Verify MSRV
      run: cargo msrv verify

  audit:
    runs-on: ubuntu-24.04
    steps:
    - uses: actions/checkout@v4
    - name: Run clippy
      run: cargo clippy --verbose --all-targets --all-features -- -D warnings
    - name: Install cargo deny
      uses: taiki-e/install-action@cargo-deny
    - name: Check advisories
      run: cargo deny check advisories
    - name: Check bans
      run: cargo deny check bans

================================================
FILE: .github/workflows/benchmark.yml
================================================
name: Benchmark

on:
  push:
    branches: [main]

permissions:
  # deployments permission to deploy GitHub pages website
  deployments: write
  # contents permission to update benchmark contents in gh-pages branch
  contents: write

jobs:
  benchmark:
    name: Performance regression check
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4

      - name: Install Rust
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: 1.89.0

      # Cache dependencies to speed up build
      - uses: Swatinem/rust-cache@v2

      # Run benchmark
      - name: Run benchmark
        run: cargo bench -p dom-smoothie-bench --bench parse -- --output-format bencher | tee benchmark-results.txt

      # Store benchmark results
      - name: Store benchmark result
        uses: benchmark-action/github-action-benchmark@v1
        with:
          name: Rust Benchmark
          tool: "cargo"
          output-file-path: benchmark-results.txt
          # Save the results as GitHub Pages
          github-token: ${{ secrets.GITHUB_TOKEN }}
          auto-push: true
          # Show alert with commit comment on detecting possible performance regression
          alert-threshold: "150%"
          comment-on-alert: true
          # Optional: GitHub users to mention (cc) in the alert comment
          alert-comment-cc-users: "@niklak"
          # Optional: Enable failure when performance regresses
          # fail-on-alert: true

          # Configure GitHub Pages
          gh-pages-branch: gh-pages
          benchmark-data-dir-path: docs/dev/bench/

      # Optional: Upload the results as artifacts
      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: benchmark-results.txt


================================================
FILE: .github/workflows/coverage.yml
================================================
name: Coverage

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  coverage:
    runs-on: ubuntu-24.04
    env:
      CARGO_TERM_COLOR: always
    steps:
      - uses: actions/checkout@v4
      - name: Update Rust
        run: rustup update stable
      - name: Install cargo-llvm-cov
        uses: taiki-e/install-action@cargo-llvm-cov
      - name: Generate code coverage
        run: cargo llvm-cov --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: lcov.info
          fail_ci_if_error: true

================================================
FILE: .github/workflows/release.yml
================================================
name: Build and Release Binaries

on:
  release:
    types: [published]

  workflow_dispatch:

permissions:
  contents: write

jobs:
  build-and-upload:
    name: Build and upload

    strategy:
      matrix:
        include:
          - os: ubuntu-latest
            target: x86_64-unknown-linux-gnu
            src_file: dom_smoothie_cli
            dst_file: dom_smoothie_cli-x86_64-unknown-linux-gnu-${{ github.ref_name }}

          - os: macos-latest
            target: x86_64-apple-darwin
            src_file: dom_smoothie_cli
            dst_file: dom_smoothie_cli-x86_64-apple-darwin-${{ github.ref_name }}

          - os: windows-latest
            target: x86_64-pc-windows-gnu
            src_file: dom_smoothie_cli.exe
            dst_file: dom_smoothie_cli-x86_64-pc-windows-gnu-${{ github.ref_name }}

    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v4

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable
        with:
          targets: ${{ matrix.target }}

      - name: Build Binary
        run: cargo build --release --target ${{ matrix.target }}
        working-directory: crates/cli
      
      - name: Install zip (Windows)
        if: matrix.os == 'windows-latest'
        run: choco install zip -y

      - name: Prepare Archive
        run: |
          zip -j ${{ matrix.dst_file }}.zip target/${{ matrix.target }}/release/${{ matrix.src_file }}

      - name: Release
        uses: softprops/action-gh-release@v1
        with:
          files: |
            ${{ matrix.dst_file }}.zip


================================================
FILE: .github/workflows/rust.yml
================================================
name: Rust CI

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        rust: [ stable, nightly, 1.75.0 ]

    steps:
    - uses: actions/checkout@v5
    - uses: dtolnay/rust-toolchain@master
      with:
        toolchain: ${{ matrix.rust }}
    - name: Cache cargo bin
      uses: actions/cache@v5
      with:
        path: ~/.cargo/bin
        key: cargo-bin-${{ matrix.rust }}
    
    - name: Install cargo-nextest
      uses: taiki-e/install-action@v2
      with:
        tool: cargo-nextest    
    - name: Build
      run: cargo build --verbose
    - name: Run tests
      run: cargo nextest run --all-targets
    - name: Run tests with all features
      run: cargo nextest run --all-targets --all-features


================================================
FILE: .github/workflows/wasm.yml
================================================
name: wasm ci

on:
  push:
    branches: [ "main", "feature/*" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1
  CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER: wasm-bindgen-test-runner

jobs:
  test-wasm:

    runs-on: ubuntu-24.04
    steps:
    - uses: actions/checkout@v4
    - name: Install stable rust
      uses: dtolnay/rust-toolchain@master
      with:
        toolchain: stable
        targets: wasm32-unknown-unknown
    - name: Install wasm-pack & wasm-bindgen-cli
      uses: taiki-e/install-action@v2
      with:
        tool: wasm-pack,wasm-bindgen-cli@0.2.117
    - uses: Swatinem/rust-cache@v2
      with:
        workspaces: .       
    - name: Run tests (crates/js)
      working-directory: crates/js
      run: wasm-pack test --node
    - name: Run tests 
      run: cargo test --target wasm32-unknown-unknown

================================================
FILE: .gitignore
================================================
/target
/examples
*.js
**/draft

================================================
FILE: CHANGELOG.md
================================================
# Changelog

All notable changes to the `dom_smoothie` crate will be documented in this file.

## [Unreleased]
### Changed
- Applied selected Clippy suggestions (pedantic) to improve code quality. 


## [0.17.0] - 2026-03-28

### Changed
- Updated `dom_query` version from `0.26.0` to `0.27.0`.
- Reworked the filtering pipeline to a single-pass approach (similar to `Readability.js`).
  - The byline is now extracted (and removed from the document) during processing if it was not found earlier via `parse_json_ld` or `get_article_metadata`.
  - This removes the two-stage filtering approach while preserving reliable byline detection.
- Applied selected clippy suggestions (pedantic and nursery) to improve code quality.
- Revised `simplify_nested_elements` to make it more flexible for future changes.
- Removed redundant whitespace check in `wrap_phrasing_content`.

## [0.16.0] - 2026-03-09

### Changed
- Minor internal code changes.
- Refactored `div_into_p` function.
- Simplified control flow in `Readability::handle_candidates`.
- Updated `dom_query` version from `0.25.1` to `0.26.0`.

## [0.15.0] - 2026-01-18

### Changed
- Updated `dom_query` version from `0.24.0` to `0.25.1`.
- Replaced the logic of `has_single_tag_inside_element` with `single_child_element`.

### Fixed
- **Breaking**: Fixed a panic when `Readability::with_document` was used with a `dom_query::Document` created via `dom_query::Document::fragment`, where `<body>` is unreachable. 
In this case, `Readability::parse` now returns `ReadabilityError::GrabFailed`.

## [0.14.0] - 2025-12-01

### Added
- Implemented internal `BytePatternCheck`, improving the performance of `match_unlikely` and `determine_attr_weight`
 when the `aho-corasick` feature is disabled. No public API changes.

### Changed
- Updated `dom_query` version from `0.23.0` to `0.24.0`.
- Moved all secondary crates into `crates/`.
- Performed internal refactoring based on `clippy`'s recommendations to improve code clarity. No public API changes.

## [0.13.0] - 2025-10-02

### Added
- Added `Metadata.favicon` and `Article.favicon` support when calling `Readability::get_article_metadata` and `Readability::parse`.
- Added *experimental* crate `dom-smoothie-lua` providing **Lua** bindings for the `dom_smoothie` crate.

### Changed
- Updated `dom_query` version from `0.22.0` to `0.23.0`.
- Revised `grab::score_elements`: use a cache for normalized char count to improve performance. No public API changes.
- Minor internal code changes.

### Fixed
- `MATCHER_LAZY_IMG` is now used in `prep_article::fix_lazy_images` instead of `MINI_LAZY`, since the latter does not support complex selectors.

## [0.12.0] - 2025-09-04

### Changed
- Optimized internal implementation of `grab::score_elements`. No public API changes.

- Absolute URL transformation is now performed internally by `dom_smoothie`.

- The `url` dependency has been removed from `dom_smoothie` for the following reasons:
  - Although an excellent crate, its features are excessive for `dom_smoothie`. It requires only `is_absolute_url` and `to_absolute_url` functionality.
  - MSRV issues: `url` requires Rust 1.63, but its `idna` dependencies require 1.82. This would prevent `dom_smoothie` 
  from building on older Rust versions, and disabling these dependencies is cumbersome.
- **Breaking**: `ReadabilityError::BadDocumentURL` is now a unit variant (`BadDocumentURL`) instead of a tuple variant. Update downstream pattern matches accordingly.
- **Breaking**: `Readability::doc_url` type changed from `Option<url::Url>` to `Option<String>`. Update code accessing this public field.
- Set MSRV to 1.75.
- Downgraded `phf` to `0.11.3` to prevent duplicate dependencies (`cssparser`, `selectors`, `web_atoms`).
- Updated `dom_query` version from `0.21.0` to `0.22.0`.

### Fixed
- Fixed `Readability::fix_relative_uris` behavior when handling a `srcset` item without a condition (e.g., `image.jpg` instead of `image.jpg 2x`).
- Fixed `MATCHER_SOURCES`, which previously contained a typo (`sources` instead of `source`).
- Fixed metadata key typos:
  - `META_MOD_TIME_KEYS`: "dcterms.modifie" -> "dcterms.modified"
  - `META_TITLE_KEYS`: "dcterm:title" -> "dcterms:title"
  - `META_EXCERPT_KEYS`: "dcterm:description" -> "dcterms:description"
  - `META_PROPERTY_PREFIXES`: "dcterm" -> "dcterms"
  - `META_NAME_PREFIXES`: "dcterm" -> "dcterms"

## [0.11.2] - 2025-08-09

### Changed
- Minor internal code changes.
- Updated `dom_query` version from `0.19.2` to `0.20.1`.

## [0.11.1] - 2025-07-08

### Changed
- Updated `dom_query` version from `0.18.0` to `0.19.2`.


## [0.11.0] - 2025-04-30

### Changed
- Updated `dom_query` version to `0.18.0`.
- Updated codebase to match latest changes (a07e62c) in [mozilla/readability](https://github.com/mozilla/readability) library.
- Minor internal code changes.


## [0.10.0] - 2025-04-01

### Added
- Added the `Config::min_score_to_adjust` option, which allows controlling the minimum score required for adjustment during the scoring process. Only nodes with a score higher than this value will be adjusted by their link density. Thus, the higher the value, the faster the scoring process.
- Implemented the `aho-corasick` feature, enabling the use of the `aho-corasick` crate for defining unlikely candidates and for the node scoring process. This can speed up the overall parsing process by 5-10% in some cases, at the cost of slightly higher memory usage and an increase in binary size.

### Changed
- Improved the internal function `fix_lazy_images` to better detect occurrences of `lazy` as a substring within an element's `class` attribute.
- Optimized the internal function `should_clean_conditionally` to improve performance.
- Minor internal code changes.
- Changed the default allocator for `dom-smoothie-js` from `alloc_cat` to `lol_alloc` because `lol_alloc` is licensed under **MIT**, whereas `alloc_cat` is not.

## [0.9.0] - 2025-03-17

### Added
- Added `Readability::parse_with_policy` method, which performs one attempt to extract relevant content from an HTML document with `ParsePolicy`. This method consumes **significantly** less memory than `Readability::parse` but it is also less precise, as `Readability::parse` is able to perform more than one attempt.
- Added the `dom_smoothie_js::Readability::parse_with_policy` method, a wrapper around `dom_smoothie::Readability::parse_with_policy`.

### Changed
- Ignoring `svg` elements during pre-filtering and element collection for scoring, improving performance for documents with many `svg` elements.

### Fixed
- Fixed the `get_row_and_col_count` function, which determines the number of rows and columns. Skipped counting `rowspan` since it is meaningless.

## [0.8.0] - 2025-03-10

### Changed
- Link elements (`<a>`) without an `href` attribute and without child nodes are now removed from the article content during post-processing.
- Changed how phrasing content determines wrapping some `<div>` element children with a `<p>` element. Now the element must contain some nodes to be wrapped.
- Updated `dom_query`'s version to `0.16.0`.

## [0.7.0] - 2025-03-03

### Added
- `Readability::parse` can now output text as `Markdown` in `Article::text_content` when `Config::text_mode` is set to `TextMode::Markdown`.

### Changed
- Update `dom_query`'s version to `0.15.1`.
- Minor code changes.

## [0.6.1] - 2025-02-16

### Changed
- Update `dom_query`'s version to `0.14.0` which brings performance improvements and improves the accuracy of the `NodeRef::formatted_text` method.
- Code optimizations, which improve the performance of the `Readability::parse` method.


## [0.6.0] - 2025-02-13

### Fixed
- Avoid a potential underflow of `orig_wc - 1` in `Readability::get_article_title`, which causes a panic when the `<title>` element contains only the `/` character. (Fix by @rMazeiks).

### Added
- Introduced `dom-smoothie-js`, a sub-crate that wraps `dom_smoothie` for use in a JS environment.

### Changed
- Switch from using regular expressions to equivalent matching functions.

## [0.5.1] - 2025-02-08

### Changed
- Updated `dom_query` version to `0.13.3`, which improves the accuracy of the `NodeRef::formatted_text` method.

### Fixed
- `Config` now implements `#[serde(default)]`. This change makes it more convenient to work with serde by removing the need to explicitly set every value in `Config`.

## [0.5.0] - 2025-02-06

### Added
- Introducing the `Config::candidate_select_mode`: this mode determines whether the top candidate is adjusted in the [Readability.js](https://github.com/mozilla/readability)  order or using the crate's exclusive implementation.
- Introducing the `Config::text_mode`: this mode determines whether the text is formatted or not. The default is `TextMode::Raw`, which is completely compatible with previous versions of this crate.

### Changed
- Changed the `Readability::grab_article` method implementation to retain only the best attempt among failed attempts, instead of keeping all of them until the exit.
- Internal code optimizations aimed to reduce execution time.
- **Breaking**: Revised document filtering: most of the filtering is now separated from extracting elements for scoring. This affects `Metadata.byline`, which previously could incorrectly assign a commentator as the article's author or leave it missing altogether. In the `mozilla/readability` test pages, I've encountered cases where this happened because `Readability` failed to extract readable content on the first iteration. The removal of duplicate `Metadata.title` elements is handled more accurately, reducing redundancy and improving document clarity.
- If Metadata.byline was assigned while grabbing the article, it will be normalized (no new lines or trailing spaces).

### Fixed
- Corrected handling of manually created `p` elements for scores. Previously, these elements were sometimes omitted.
- Skipped ancestor assignment for elements beyond the `body` element. Previously, these elements may have been incorrectly assigned to the root element, which has no parent, causing a runtime panic.


## [0.4.0] - 2025-01-21

### Added
- Implemented a `serde` optional crate feature, enabling `serde::Serialize` and `serde::Deserialize` traits for `Article`, `Metadata`, and `Config` structures. 

### Changed
- Reduced the number of regex checks since they can be replaced with `contains` checks.
- Updated the dependencies.
- Internal code change: use `dom_query::Document::base_uri` to extract the base uri instead of `dom_query::Matcher`. 
- Updated the code (Byline extraction and JSON-LD parsing) to align with Mozilla's recent updates to the Readability library ([118f015](https://github.com/mozilla/readability/commit/118f01538e167218bd86ffd493bd3466aec4870a)).
- **Breaking:** Revised `Readability::is_probably_readable` method: it now uses `Config::readable_min_score` and `Config::readable_min_content_length` from the instance configuration instead of accepting arguments.


## [0.3.0] - 2025-01-08

### Added
- Implemented a CLI tool (`dom_smoothie_cli`) for demonstration purposes.
- Implemented `is_probably_readable` function. 
A quick-and-dirty way of figuring out if the contents of a given document are suitable for processing with `Readability`.
- Implemented `Readability::is_probably_readable`. This method calls the above function but uses its internal document (`dom_query::Document`).
- Implemented `Readability::with_document` method, which allows creating a new `Readability` instance with an external `dom_query::Document`.

### Changed
- Changed visibility of `get_text_density`, `normalize_spaces`, and `link_density` to `pub(crate)` since they are used internally only.
- Refactor `Readability::parse_json_ld`.
- `Readability::parse_json_ld` also tries to extract `dateModified` and `image` from `ld+json` script. 

### Fixed
- `Article.text_content` accidentally contained text content of the original document. Now it contains only the text content of the article after processing.

## [0.2.0] - 2024-12-30

### Added

- Implement support for `Config::max_elements_to_parse` in `Readability::parse`.
- Implement support for `Config::disable_json_ld` in `Readability::parse`.
- Implement support for `Config::n_top_candidates` in `Readability::parse`.
- Implement support for `Config::char_threshold` in `Readability::parse`.


### Fixed

- Improve parsing of article's metadata (title, byline, excerpt, site_name, published_time, lang, and dir).
- Improve parsing of `dir` attribute.
- Fix the internal behavior of `Readability::clean_classes` when `Config::classes_to_preserve` is empty.


## [0.1.1] - 2024-12-18

### Changed

- Changed visibility of the `Readability::prepare` to private.
- The `Article` struct is now visible.

### Added

- Added documentation for public API.


================================================
FILE: Cargo.toml
================================================

[workspace.package]
version = "0.17.0"
edition = "2021"
license = "MIT"
rust-version = "1.75"
repository = "https://github.com/niklak/dom_smoothie"
authors = ["niklak <morgenpurple@gmail.com>"]


[package]
name = "dom_smoothie"
version.workspace = true
license.workspace = true
repository.workspace = true
edition.workspace = true
authors.workspace = true
rust-version.workspace = true
description = "A Rust crate for extracting relevant content from web pages"
documentation = "https://docs.rs/dom_smoothie/latest"
keywords = ["html", "readability"]
categories = ["web-programming", "text-processing"]
readme = "README.md"
exclude = [".*", "test-pages", "deny.toml"]

[dependencies]
dom_query = {version = "0.27.0", features = ["mini_selector", "markdown"]}
tendril = {version = "0.5.0"}
once_cell = { version = "1" }
serde = {version = "1.0", features = ["derive"], optional = true}
gjson = {version = "0.8.1"}
html-escape = "0.2.13"
flagset = "0.4.7"
unicode-segmentation = "1.12.0"
thiserror = "2.0"
phf = { version = "0.13.1", features = ["macros"] }
foldhash = "0.2.0"
aho-corasick = { version = "1.1.4", optional = true}

[dev-dependencies]
serde_json = {version = "1.0"}
serde = {version = "1.0", features = ["derive"]}
wasm-bindgen-test = "=0.3.67"



[features]
serde = ["dep:serde"]
aho-corasick = ["dep:aho-corasick"]

[workspace]
members = [
    "crates/cli",
    "crates/js",
    "crates/bench",
    "crates/lua",
    ]

[profile.bench]
codegen-units = 1
lto = "fat"
incremental = false
opt-level = 3


[lints.clippy]
all = { level = "warn", priority = -1 }
cargo = { level = "warn", priority = -1 }
pedantic = { level = "warn", priority = -1 }
must_use_candidate = "allow"
return_self_not_must_use = "allow"
redundant_closure_for_method_calls = "allow"
similar_names = "allow"
needless_pass_by_value = "allow"
trivially_copy_pass_by_ref = "allow"
range_plus_one = "allow"
should_panic_without_expect = "allow"
doc_comment_double_space_linebreaks = "allow"
cast_precision_loss = "allow"

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 Mykola Humanov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# DOM_SMOOTHIE

[![Crates.io version](https://img.shields.io/crates/v/dom_smoothie.svg?style=flat)](https://crates.io/crates/dom_smoothie)
[![Download](https://img.shields.io/crates/d/dom_smoothie.svg?style=flat)](https://crates.io/crates/dom_smoothie)
[![docs.rs docs](https://img.shields.io/badge/docs-latest-blue.svg?style=flat)](https://docs.rs/dom_smoothie)
[![codecov](https://codecov.io/gh/niklak/dom_smoothie/graph/badge.svg?token=X0LB1HB90L)](https://codecov.io/gh/niklak/dom_smoothie)

[![Rust CI](https://github.com/niklak/dom_smoothie/actions/workflows/rust.yml/badge.svg)](https://github.com/niklak/dom_smoothie/actions/workflows/rust.yml)

> A Rust crate for extracting readable content from web pages.

**dom_smoothie** closely follows the implementation of [readability.js](https://github.com/mozilla/readability), bringing its functionality to Rust.


## Examples


<details>
    <summary><b>Readability::parse — a basic example</b></summary>


```rust
use std::error::Error;

use dom_smoothie::{Article, Config, Readability};

fn main() -> Result<(), Box<dyn Error>> {
    let html = include_str!("../test-pages/rustwiki_2024.html");
    let document_url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";

    // for more options check the documentation
    let cfg = Config {
        max_elements_to_parse: 9000,
        ..Default::default()
    };
    // `Readability::new` accepts an optional `Config`. If `cfg` is omitted,
    // then a default `Config` instance will be used.
    // It also accepts an optional `document_url` parameter,
    // which may be used to transform relative URLs into absolute URLs.
    let mut readability = Readability::new(html, Some(document_url), Some(cfg))?;

    let article: Article = readability.parse()?;

    println!("{:<15} {}","Title:", article.title);
    println!("{:<15} {:?}","Byline:", article.byline);
    println!("{:<15} {}","Length:", article.length);
    println!("{:<15} {:?}","Excerpt:", article.excerpt);
    println!("{:<15} {:?}","Site Name:", article.site_name);
    println!("{:<15} {:?}", "Dir:", article.dir);
    println!("{:<15} {:?}","Published Time:", article.published_time);
    println!("{:<15} {:?}","Modified Time:", article.modified_time);
    println!("{:<15} {:?}","Image:", article.image);
    // This uri can be taken only from ld+json
    println!("{:<15} {:?}","URL", article.url);

    // Skipping article.content since it is too large.
    // To check out the html content of the article please have a look at
    // `./test-pages/rustwiki_2024_result.html`
    // println!("HTML Content: {}", article.content);

    // Skipping article.text_content since it is too large.
    // To check out the text content of the article please have a look at
    // `./test-pages/rustwiki_2024_result.txt`
    //println!("Text Content: {}", article.text_content);

    // Right now, `text_content` provides almost the same result 
    // as readability.js, which is far from perfect. 
    // It may squash words together if element nodes don't have a whitespace before closing, 
    // and currently, I have no definitive opinion on this matter.

    Ok(())
}
```
</details>


<details>
    <summary><b>Parsing only metadata</b></summary>


```rust
use std::error::Error;

use dom_smoothie::{Metadata, Config, Readability};

fn main() -> Result<(), Box<dyn Error>> {
    let html = include_str!("../test-pages/rustwiki_2024.html");

    let cfg = Config {
        // set this to `true` to skip parsing `ld+json`
        disable_json_ld: false,
        ..Default::default()
    };

    // You can parse only metadata without parsing the article content
    let readability = Readability::new(html, None, Some(cfg))?;

    // <script type="application/ld+json"> may contain some useful information, 
    // but usually it is not enough.
    let ld_meta: Option<Metadata> = readability.parse_json_ld();

    if let Some(ref meta) = ld_meta {
        println!("LD META: {:#?}", meta);
    }

    println!("\n=============\n");
    // Under the hood, `Readability::parse` passes the metadata obtained from `Readability::parse_json_ld` 
    // as the basis to `Readability::get_article_metadata`. But this is not necessary.
    let meta = readability.get_article_metadata(ld_meta);
    println!("META: {:#?}", &meta);

    // Some fields of Metadata may be missing because they can be assigned
    // during the Readability::parse process.
    // This applies to `excerpt`, `byline`, and `dir`.
    Ok(())
}
```
</details>

<details>
    <summary><b>Parsing only the article's title</b></summary>


```rust
use std::error::Error;

use dom_query::Document;
use dom_smoothie::Readability;

fn main() -> Result<(), Box<dyn Error>> {
    let html = include_str!("../test-pages/rustwiki_2024.html");

    let doc: Document = dom_query::Document::from(html);

    // You can parse only the metadata without parsing the article content.
    let readability: Readability = Readability::with_document(doc, None, None)?;
    
    // Parse only the title without extracting the full content.
    let title: tendril::Tendril<tendril::fmt::UTF8> = readability.get_article_title();
    assert_eq!(title, "Rust (programming language) - Wikipedia".into());
    
    // However, this title may differ from `metadata.title`,
    // as `metadata.title` first attempts to extract the title from the metadata
    // and falls back to `Readability::get_article_title` if unavailable.
    println!("Title: {}", title);

    Ok(())
}
```
</details>


<details>
    <summary><b>Checking if content is readable</b></summary>


```rust
use std::error::Error;

use dom_smoothie::{Article, Readability, Config};

fn main() -> Result<(), Box<dyn Error>> {
    let html = include_str!("../test-pages/rustwiki_2024.html");
    // you can specify optional parameters for `Readability::is_probably_readable`.
    let cfg = Config{
        readable_min_score: 20.0,
        readable_min_content_length: 140,
        ..Default::default()
    };

    let mut readability = Readability::new(html, None,  Some(cfg))?;

    // There is a way to perform a quick check to determine 
    // if the document is readable before cleaning and parsing it.
    // After calling `Readability::parse`, it may show different results, 
    // but calling it after parsing would be nonsensical.

    if readability.is_probably_readable() {
        let article: Article = readability.parse()?;
        println!("{:<15} {}", "Title:", article.title);
        println!("{:<15} {:?}", "Byline:", article.byline);
        println!("{:<15} {:?}", "Site Name:", article.site_name);
        println!("{:<15} {:?}", "URL", article.url);
    }

    // This is the same as:
    /*
    let doc = dom_query::Document::from(html);

    if is_probably_readable(&doc, Some(20.0), Some(140)) {

    }
    */

    Ok(())
}
```
</details>


<details>
    <summary><b>Using an alternative approach to selecting the best candidate</b></summary>

Unfortunately, the approach used in mozilla/readability does not always produce the desired 
result when extracting meaningful content. Sometimes, this approach discards part of the 
content simply because there were fewer than three alternative candidates to the best one. 
While this method does a good job, it still relies on too many magic numbers.


After @emschwartz discovered this issue, I decided to add an alternative implementation 
for finding the common candidate. Currently, this implementation may produce a less 
"clean" result compared to mozilla/readability, but in return, it can capture more of
the meaningful content, whereas the original approach from mozilla/readability may fail in 
some cases.

That said, this approach is not necessarily superior to the original—there is still 
room for improvement.

```rust
use std::error::Error;

use dom_smoothie::{Article, Config, Readability, CandidateSelectMode};

fn main() -> Result<(), Box<dyn Error>> {

    let html = include_str!("../test-pages/alt/arstechnica/source.html");
    // for more options check the documentation
    let cfg = Config {
        // activating alternative approach for candidate selection
        candidate_select_mode: CandidateSelectMode::DomSmoothie,
        ..Default::default()
    };

    let mut readability = Readability::new(html, None, Some(cfg))?;

    let article: Article = readability.parse()?;
    println!("Text Content: {}", article.text_content);
    Ok(())
}
```
</details>


<details>
    <summary><b>Formatted text content and Markdown</b></summary>

By default, the text content is output as-is, without formatting, 
preserving whitespace from the original HTML document. 
Depending on the document's initial markup, this can be quite verbose and inconvenient.

To retrieve formatted text content, set `text_mode: TextMode::Formatted` in the config.
This formatting does not preserve table structures, meaning table data may be output as plain text without column alignment.
While this formatting is not as structured as Markdown, it provides a cleaner output compared to raw text.

`TextMode::Markdown` enables Markdown formatting.


```rust
use std::error::Error;

use dom_smoothie::{Article, Config, Readability, TextMode};

fn main() -> Result<(), Box<dyn Error>> {
    
    let html = include_str!("../test-pages/hacker_news.html");
    let cfg = Config {
        // Enable formatted text output
        text_mode: TextMode::Formatted,
        // Enable Markdown output (for more structured text)
        //text_mode: TextMode::Markdown,
        ..Default::default()
    };

    let mut readability = Readability::new(html, None, Some(cfg))?;

    let article: Article = readability.parse()?;
    println!("Text Content: {}", article.text_content);
    Ok(())
}
```
</details>


<details>
    <summary><b>Parsing with One Policy</b></summary>

The `Readability::parse_with_policy` method allows parsing content with a specific policy.
This method follows the same steps as `Readability::parse` but makes only a single attempt using the specified `ParsePolicy`.

As a result, it doesn't store the best attempt, leading to significantly lower memory consumption. Some policies may also be faster than others.
Typically, `ParsePolicy::Strict` is the slowest but provides the cleanest result. `ParsePolicy::Moderate` can also yield a good result, while the others may be less accurate.

In some cases, using certain policies (e.g., `ParsePolicy::Strict`) may result in a `ReadabilityError::GrabFailed` error, whereas `Readability::parse` might succeed.
This happens because `Readability::parse` attempts parsing with different policies (essentially a set of grab flags) until it either succeeds or exhausts all options.

```rust
use std::error::Error;
use dom_smoothie::{ParsePolicy, Readability};

fn main() -> Result<(), Box<dyn Error>> {
    let html = include_str!("../test-pages/readability/lazy-image-3/source.html");
    
    // Policy and expected success
    let cases: [(ParsePolicy, bool); 4] = [
        (ParsePolicy::Strict, false),
        (ParsePolicy::Moderate, false),
        (ParsePolicy::Clean, false),
        (ParsePolicy::Raw, true),
    ];

    for (policy, expected) in cases {
        let mut r = Readability::new(html, None, None)?;
        let article = r.parse_with_policy(policy);
        assert_eq!(article.is_ok(), expected);
    }
    
    Ok(())
}
```
</details>

## Crate Features

- `serde`: Enables the `serde::Serialize` and `serde::Deserialize` traits for the `Article`, `Metadata`, and `Config` structures.
- `aho-corasick`: Enables the use of the `aho-corasick` crate for defining unlikely candidates and for the node scoring process. 
This can speed up the parsing by 5-10% in some cases, at the cost of slightly higher memory usage and a larger binary size.

## Differences from Mozilla/Readability.js

### Absolute URL normalization
- `dom_smoothie` does not modify `href` attributes if they contain absolute URLs — they remain untouched.
- `Readability.js` normalizes URLs — it may add a trailing slash and normalize text case.

**Example:**
  
  ```
  https://fetch.spec.whatwg.org    // dom_smoothie
  https://fetch.spec.whatwg.org/   // Readability.js
  ```

### DOM simplification
- `dom_smoothie` more aggressively removes parent `<div>` elements when they contain only a single `<p>` or `<div>` element.

### Attribute cleanup
- `dom_smoothie` removes all `font` attributes and converts `<font>` elements to `<span>`.
- `Readability.js` converts `<font>` to `<span>` but preserves all attributes.

### Empty links handling
- `dom_smoothie` removes `<a>` elements without an `href` attribute or without child nodes.
- `Readability.js` keeps such elements.

### Class preservation
- In `dom_smoothie`, `class="page"` is preserved only for the article node (`id="readability-page-1"`),
unless explicitly allowed via `Config.classes_to_preserve`.
- `Readability.js` preserves it across the document.


### Filtering order differences

> **Note:** This behavior applies to versions ≤ 0.16.0.

In `Readability.js`, element filtering (removal of unwanted nodes) and scoring are performed in a single stage.

In `dom_smoothie`, part of the filtering is applied globally across all parsing attempts, 
while another part is applied per attempt. This approach helps better eliminate "nested" 
structures (e.g., `<div>` elements that contain only `<div>` or `<p>`).

However, there is a trade-off — the output may slightly differ from `Readability.js` in terms of which elements remain.
This mostly affects elements that overlap with *UNLIKELY CANDIDATES*.

In practice, this may result in duplicate headings (`<h1>`, `<h2>`) and byline elements.

This happens because unlikely candidates may include containers that wrap such elements.


For convenience, all known cases of these differences are collected here:
[test-pages/not-matching](https://github.com/niklak/dom_smoothie/tree/main/test-pages/not-matching).

## See Also

- [readability-rs](https://crates.io/crates/readability-rs): a fork of the currently unmaintained [readability](https://crates.io/crates/readability) crate.

## Changelog
[Changelog](./CHANGELOG.md)


## License

Licensed under MIT ([LICENSE](LICENSE) or http://opensource.org/licenses/MIT).

## Contribution

Any contribution intentionally submitted for inclusion in this project will be licensed under the MIT license, without any additional terms or conditions.


================================================
FILE: crates/bench/Cargo.toml
================================================
[package]
name = "dom-smoothie-bench"
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
repository.workspace = true
authors.workspace = true
publish = false

[dependencies]
dom_smoothie = { path = "../.." }


[dev-dependencies]
criterion = { version = "0.7.0" }

[[bench]]
name = "parse"
harness = false

================================================
FILE: crates/bench/benches/parse.rs
================================================
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use std::hint::black_box;

use dom_smoothie::{Article, Config, Readability, ReadabilityError};

/// Runs one full extraction pass over `contents`.
///
/// A fresh `Readability` is built on every call (no document URL, with a
/// clone of `cfg`), so each benchmark iteration starts from the raw HTML.
/// Errors from construction or parsing are propagated to the caller.
fn dom_smoothie_parse(contents: &str, cfg: &Config) -> Result<Article, ReadabilityError> {
    let owned_cfg = cfg.clone();
    let mut reader = Readability::new(contents, None, Some(owned_cfg))?;
    let article = reader.parse()?;
    Ok(article)
}

/// Registers the `dom_smoothie` benchmark group.
///
/// Three embedded HTML fixtures (small, medium, large) are each parsed with
/// `Config::min_score_to_adjust` set to `5.0` and to `10.0`, giving six
/// `parse/<label>` benchmarks in total.
fn bench_dom_smoothie_parse(c: &mut Criterion) {
    // Fixtures are embedded at compile time, so no file I/O is measured.
    let small = include_str!("../test-pages/ok/ehow-1/source.html");
    let medium = include_str!("../test-pages/ok/engadget/source.html");
    let large = include_str!("../test-pages/ok/wikipedia-2/source.html");

    // (benchmark label, HTML source, value for `min_score_to_adjust`)
    let scenarios: [(&str, &str, f32); 6] = [
        ("small", small, 5.0f32),
        ("medium", medium, 5.0f32),
        ("large", large, 5.0f32),
        ("small, min score to adjust 10", small, 10.0f32),
        ("medium, min score to adjust 10", medium, 10.0f32),
        ("large, min score to adjust 10", large, 10.0f32),
    ];

    let mut group = c.benchmark_group("dom_smoothie");

    for (label, html, min_score_to_adjust) in scenarios {
        let cfg = Config {
            min_score_to_adjust,
            ..Default::default()
        };
        let id = BenchmarkId::new("parse", label);

        group.bench_with_input(id, html, |bencher, input| {
            // `black_box` keeps the optimizer from eliding the inputs or the result.
            bencher.iter(|| {
                black_box(
                    dom_smoothie_parse(black_box(input), black_box(&cfg))
                        .expect("Parsing failed"),
                )
            });
        });
    }

    group.finish();
}

fn configure_criterion() -> Criterion {
    Criterion::default()
}

// Declares the `benches` group: it runs `bench_dom_smoothie_parse` using the
// configuration returned by `configure_criterion()`.
criterion_group! { name = benches; config = configure_criterion(); targets = bench_dom_smoothie_parse }
// Entry point for the benchmark binary (the manifest sets `harness = false`
// for this bench target, so Criterion's macro provides it for the `benches` group).
criterion_main!(benches);


================================================
FILE: crates/cli/Cargo.toml
================================================
[package]
name = "dom_smoothie_cli"
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
authors.workspace = true
description = "A reference implementation of a CLI tool for the `dom_smoothie`"
publish = false


[dependencies]
dom_smoothie = { path = "../.." }
clap = {version = "4.5.54", features = ["derive"]}
serde = {version = "1.0", features = ["derive"]}
serde_json = {version = "1.0"}


================================================
FILE: crates/cli/src/main.rs
================================================
//! This is a reference implementation of a CLI tool for the `dom_smoothie` crate.
//!
//! The tool processes an HTML document using [`dom_smoothie::Readability`] to extract
//! relevant content and metadata. It accepts an input HTML file (or stdin) and outputs the
//! parsed article content as both HTML and plain text, along with metadata in JSON format.
//!
//! ## Usage
//! ```bash
//! # File input, file output (default)
//! dom_smoothie_cli --input path/to/input.html --output path/to/output/dir
//!
//! # Stdin to stdout
//! cat page.html | dom_smoothie_cli
//!
//! # Stdin, select text output
//! curl -s https://example.com | dom_smoothie_cli -f text
//!
//! # File input, stdout
//! dom_smoothie_cli --input page.html --stdout -f metadata
//! ```
//!
//! If the `--input` argument is omitted (or set to `-`), input is read from stdin.
//! When reading from stdin with no `--output` specified, results are printed to stdout.
//! If the `--output` argument is omitted with file input, the results will be saved in the
//! same directory as the input file. An optional `--document-url` parameter can be provided
//! to enhance parsing accuracy by specifying the base document URL.

use std::error::Error;
use std::ffi::OsString;
use std::io::{self, IsTerminal, Read, Write};
use std::{fs, path::PathBuf};

use clap::{Parser, ValueEnum};
use dom_smoothie::{Article, CandidateSelectMode, Config, Readability, TextMode};

// Selects which single artifact is printed when the tool writes to stdout
// (see the `match cli.output_format` in `main`). File output mode ignores this
// value and always writes all three artifacts.
//
// The `///` comments on the variants are kept verbatim because clap's
// `ValueEnum` derive reads them.
#[derive(Clone, ValueEnum)]
enum OutputFormat {
    /// Extracted article HTML
    Html,
    /// Extracted plain text content
    Text,
    /// Article metadata as JSON
    Metadata,
}

// Command-line arguments of the tool.
//
// NOTE: the `///` comments on the fields are consumed by clap's derive macro and
// become the `--help` text of the corresponding option, so every field must use
// `///` (a plain `//` comment leaves the option undocumented in `--help`).
#[derive(Parser)]
#[clap(version, about, long_about = None)]
#[clap(help_template = "{name} {version}\n\n{about}\n\n{usage}\n\n{all-args}")]
struct Cli {
    /// Sets an input path to the html document. Omit or use `-` to read from stdin.
    #[clap(short, long, value_parser)]
    input: Option<PathBuf>,
    /// Sets an output path. If omitted the parent dir of `<INPUT>` will be used.
    /// When reading from stdin, omitting this enables stdout mode.
    #[clap(short, long, value_parser)]
    output: Option<PathBuf>,
    /// Print output to stdout instead of writing files
    #[clap(long, value_parser)]
    stdout: bool,
    /// Output format when writing to stdout (html, text, or metadata)
    #[clap(short = 'f', long, value_enum, default_value = "html")]
    output_format: OutputFormat,
    /// Sets an optional base document URL
    #[clap(short, long, value_parser, value_name = "URL")]
    document_url: Option<String>,
    /// Keeps elements' classes if set true
    #[clap(long, value_parser)]
    keep_classes: bool,
    /// Sets a list of classes that will be preserved and not removed during the post-process.
    /// Multiple classes should be separated by a comma (`,`)
    #[clap(long, value_parser, value_delimiter = ',')]
    preserved_classes: Vec<String>,
    /// Skips parsing metadata from ld+json script elements
    #[clap(long, value_parser)]
    disable_json_ld: bool,
    /// Sets a maximum number of elements to parse. If it equals 0, then there is no limit.
    #[clap(long, value_parser, default_value = "0")]
    max_elements: usize,
    /// Sets a character threshold for content extraction
    #[clap(long, value_parser, default_value = "500")]
    char_threshold: usize,
    /// Sets a number of top candidates for content extraction
    #[clap(long, value_parser, default_value = "5")]
    n_top_candidates: usize,
    /// Produce formatted text output
    #[clap(long, value_parser, default_value = "false")]
    formatted_text: bool,
    /// Use alternative (dom_smoothie) mode for finding common top candidate.
    #[clap(long, value_parser, default_value = "false")]
    alt_mode: bool,
}

/// This struct represents the metadata from the [`dom_smoothie::Article`]
///
/// It is filled from an `Article` via the `From<&Article>` impl and serialized
/// with `serde_json::to_string_pretty` in `main`, either to stdout or to the
/// `*_metadata.json` file.
#[derive(Default, serde::Deserialize, serde::Serialize)]
struct Metadata {
    title: String,
    byline: Option<String>,
    excerpt: Option<String>,
    site_name: Option<String>,
    published_time: Option<String>,
    modified_time: Option<String>,
    lang: Option<String>,
    url: Option<String>,
    dir: Option<String>,
}

impl From<&Article> for Metadata {
    fn from(value: &Article) -> Self {
        Self {
            title: value.title.clone(),
            byline: value.byline.clone(),
            excerpt: value.excerpt.clone(),
            site_name: value.site_name.clone(),
            published_time: value.published_time.clone(),
            modified_time: value.modified_time.clone(),
            lang: value.lang.clone(),
            url: value.url.clone(),
            dir: value.dir.clone(),
        }
    }
}

/// Loads the HTML to process from a file or, failing that, from stdin.
///
/// A path other than the literal `-` is read from disk and the file name with
/// its last extension removed is returned as the source name. When `input` is
/// `None` or `-`, everything available on stdin is read and no source name is
/// returned; if stdin is an interactive terminal a hint is printed to stderr
/// first.
///
/// # Errors
///
/// Propagates any I/O error raised while reading the file or stdin
/// (including invalid UTF-8).
fn read_input(input: &Option<PathBuf>) -> Result<(String, Option<OsString>), Box<dyn Error>> {
    // Fast path: a real file was named on the command line.
    if let Some(file_path) = input.as_ref().filter(|p| p.as_os_str() != "-") {
        let stem = file_path
            .with_extension("")
            .file_name()
            .map(|name| name.to_os_string());
        let html = fs::read_to_string(file_path)?;
        return Ok((html, stem));
    }

    // Otherwise fall back to stdin.
    let stdin = io::stdin();
    if stdin.is_terminal() {
        eprintln!(
            "Warning: reading from terminal stdin. \
             Did you mean to pipe input? Press Ctrl+D when done."
        );
    }

    let mut html = String::new();
    stdin.lock().read_to_string(&mut html)?;
    Ok((html, None))
}

/// CLI entry point.
///
/// Parses the arguments, reads the HTML (file or stdin), runs
/// [`Readability::parse`] and then either prints the artifact selected by
/// `--output-format` to stdout or writes `<name>_result.html`,
/// `<name>_result.txt` and `<name>_metadata.json` into the output directory.
fn main() -> Result<(), Box<dyn Error>> {
    let cli = Cli::parse();

    let (contents, source_name) = read_input(&cli.input)?;

    // Stdin is the source when `--input` is absent or is the literal `-`.
    let reads_stdin = match cli.input.as_deref() {
        Some(path) => path == std::path::Path::new("-"),
        None => true,
    };
    // Stdout mode is either requested with `--stdout` or implied by reading
    // stdin without an explicit `--output`.
    let to_stdout = cli.stdout || (reads_stdin && cli.output.is_none());

    let cfg = Config {
        keep_classes: cli.keep_classes,
        classes_to_preserve: cli.preserved_classes,
        max_elements_to_parse: cli.max_elements,
        disable_json_ld: cli.disable_json_ld,
        n_top_candidates: cli.n_top_candidates,
        char_threshold: cli.char_threshold,
        candidate_select_mode: if cli.alt_mode {
            CandidateSelectMode::DomSmoothie
        } else {
            CandidateSelectMode::Readability
        },
        text_mode: if cli.formatted_text {
            TextMode::Formatted
        } else {
            TextMode::Raw
        },
        ..Default::default()
    };

    let mut readability = Readability::new(contents, cli.document_url.as_deref(), Some(cfg))?;
    let article = readability.parse()?;

    if to_stdout {
        // Exactly one artifact is printed, without a trailing newline.
        let mut out = io::stdout().lock();
        match cli.output_format {
            OutputFormat::Html => write!(out, "{}", article.content)?,
            OutputFormat::Text => write!(out, "{}", article.text_content)?,
            OutputFormat::Metadata => {
                let json = serde_json::to_string_pretty(&Metadata::from(&article))?;
                write!(out, "{}", json)?
            }
        }
        return Ok(());
    }

    // File output mode: all three artifacts are written next to each other.
    let stem_os = source_name.unwrap_or_else(|| OsString::from("stdin"));
    let stem = stem_os.to_string_lossy();

    let out_dir = match cli.output {
        Some(dir) => dir,
        // No `--output`: use the input file's parent directory, or `.` when
        // there is no usable parent.
        None => cli
            .input
            .as_ref()
            .and_then(|p| p.with_extension("").parent().map(|par| par.to_path_buf()))
            .unwrap_or_else(|| PathBuf::from(".")),
    };

    let html_target = out_dir.join(format!("{stem}_result.html"));
    fs::write(html_target, article.content.as_bytes())?;

    let text_target = out_dir.join(format!("{stem}_result.txt"));
    fs::write(text_target, article.text_content.as_bytes())?;

    let json = serde_json::to_string_pretty(&Metadata::from(&article))?;
    let meta_target = out_dir.join(format!("{stem}_metadata.json"));
    fs::write(meta_target, json)?;

    Ok(())
}


================================================
FILE: crates/js/.gitignore
================================================
/target
**/*.rs.bk
Cargo.lock
bin/
pkg/
wasm-pack.log


================================================
FILE: crates/js/Cargo.toml
================================================
[package]
name = "dom-smoothie-js"
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
authors.workspace = true
description = "A wrapper around the `dom_smoothie` crate for extracting relevant content from web pages"
repository = "https://github.com/niklak/dom_smoothie"
publish = false

keywords = ["html", "readability"]


[lib]
crate-type = ["cdylib", "rlib"]


[dependencies]
wasm-bindgen = "0.2.84"
dom_smoothie = { path = "../..", features = ["serde"] }

serde-wasm-bindgen = "0.6.5"
lol_alloc = {version = "0.4.1", optional = true}
cfg-if = "1.0.4"
# The `console_error_panic_hook` crate provides better debugging of panics by
# logging them with `console.error`. This is great for development, but requires
# all the `std::fmt` and `std::panicking` infrastructure, so isn't great for
# code size when deploying.
console_error_panic_hook = { version = "0.1.7", optional = true }

[dev-dependencies]
wasm-bindgen-test = "0.3.34"


# `wasm-opt` is on by default for the release profile, but it can be
# disabled by setting it to `false`
[package.metadata.wasm-pack.profile.release]
wasm-opt = ['-Oz']


[features]
default = ["console_error_panic_hook", "lol_alloc"]
lol_alloc = ["dep:lol_alloc"]
console_error_panic_hook = ["dep:console_error_panic_hook"]

================================================
FILE: crates/js/LICENSE_MIT
================================================
MIT License

Copyright (c) 2025 Mykola Humanov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: crates/js/README.md
================================================
# DOM-SMOOTHIE-JS
> `dom-smoothie-js` is a Node.js package for extracting readable content from web pages.
> It is a wrapper around the Rust [dom_smoothie](https://github.com/niklak/dom_smoothie) crate.


## Configuration
| Parameter                  | Type                       | Default Value                      | Description |
|-----------------------------|---------------------------|------------------------------------|-------------|
| keep_classes               | `boolean`                 | `false`                            | Keep all classes in the document |
| classes_to_preserve        | `Array<string>`           | `[]`                               | List of classes to preserve |
| max_elements_to_parse      | `number`                  | `0`                                | Maximum number of elements to parse |
| disable_json_ld            | `boolean`                 | `false`                            | Disable JSON-LD extraction |
| n_top_candidates           | `number`                  | `5`                                | Number of top candidates to consider |
| char_threshold             | `number`                  | `500`                              | Character threshold for content extraction |
| readable_min_score         | `number` (float)          | `20.0`                             | Minimum score required for readability check |
| readable_min_content_length| `number`                  | `140`                              | Minimum content length for readability check |
| candidate_select_mode      | `'Readability' \| 'DomSmoothie'` | `'Readability'`                 | Candidate selection mode |
| text_mode                  | `'Raw' \| 'Formatted' \| 'Markdown'`    | `'Raw'`                            | Text output mode, either raw, formatted or Markdown |

### Example Object with Default Parameters

```javascript
const config = {
  keep_classes: false,
  classes_to_preserve: [],
  max_elements_to_parse: 0,
  disable_json_ld: false,
  n_top_candidates: 5,
  char_threshold: 500,
  readable_min_score: 20.0,
  readable_min_content_length: 140,
  candidate_select_mode: 'Readability',
  text_mode: 'Raw'
};
```

## Examples


<details>
    <summary><b>Readability.parse — a basic example</b></summary>


```javascript
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
  const document_url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
  const cfg = {
    classes_to_preserve: ["caption"],
  }

  // document_url and cfg
  const article = new Readability(content, document_url, cfg).parse();
  console.log("Title:", article.title);
  console.log("Byline:", article.byline);
  console.log("Length:", article.length);
  console.log("Excerpt:", article.excerpt);
  console.log("Site Name:", article.site_name);
  console.log("Dir:", article.dir);
  console.log("Published Time:", article.published_time);
  console.log("Modified Time:", article.modified_time);
  console.log("Image:", article.image);
  // This uri can be taken only from ld+json
  console.log("URL:", article.url);

  // Skipping article.content since it is too large.
  //console.log("HTML Content:", article.content);

  // Skipping article.text_content since it is too large.
  //console.log("Text Content:", article.text_content);
}

main();
```
</details>


<details>
    <summary><b>Parsing only the article's title</b></summary>


```javascript
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");


  // You can parse only the metadata without parsing the article content.
  const readability = new Readability(content, null, null);

  // Parse only the title without extracting the full content.
  const title = readability.get_article_title();
  console.log("Title:", title);

  // However, this title may differ from `metadata.title`,
  // as `metadata.title` first attempts to extract the title from the metadata
  // and falls back to `Readability::get_article_title` if unavailable.

}

main();
```
</details>


<details>
    <summary><b>Parsing only metadata</b></summary>


```javascript
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");

  const cfg = {
    disable_json_ld: false,
  };

  // You can parse only metadata without parsing the article content
  const readability = new Readability(content, null, cfg);

  // <script type="application/ld+json"> may contain some useful information,
  // but usually it is not enough.
  const ld_meta = readability.parse_json_ld();

  console.log("LD META:", ld_meta);

  // Under the hood, `Readability::parse` passes the metadata obtained from `Readability::parse_json_ld`
  // as the basis to `Readability::get_article_metadata`. But this is not necessary.
  const meta = readability.get_article_metadata(ld_meta);

  console.log("META:", meta);

  // Some fields of Metadata may be missing because they can be assigned
  // during the Readability::parse process.
  // This applies to `excerpt`, `byline`, and `dir`.
}

main();
```
</details>


<details>
    <summary><b>Checking if content is readable</b></summary>


```javascript
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");

  // you can specify optional parameters for `Readability.is_probably_readable`.
  const cfg = {
    readable_min_score: 20.0,
    readable_min_content_length: 140,
  };

  const readability = new Readability(content, null, cfg);

  // There is a way to perform a quick check to determine
  // if the document is readable before cleaning and parsing it.
  // After calling `Readability::parse`, it may show different results,
  // but calling it after parsing would be nonsensical.
  if (readability.is_probably_readable()) {
    let article = readability.parse();
    console.log("Title:", article.title);
    console.log("Byline:", article.byline);
    console.log("Site Name:", article.site_name);
    console.log("URL:", article.url);
    // and so on...
  }
}

main();
```
</details>


<details>
    <summary><b>Using an alternative approach to selecting the best candidate</b></summary>


```javascript
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");

  const cfg = {
    candidate_select_mode: "DomSmoothie",
  };

  const readability = new Readability(content, null, cfg);

  const article = readability.parse();
  console.log("Text Content:", article.text_content);
}

main();
```
</details>



<details>
    <summary><b>Formatted text content and Markdown</b></summary>

By default, the text content is output as-is, without formatting, 
preserving whitespace from the original HTML document. 
Depending on the document's initial markup, this can be quite verbose and inconvenient.

To retrieve formatted text content, set `text_mode: "Formatted"` in the config.
This formatting does not preserve table structures, meaning table data may be output as plain text without column alignment.
While this formatting is not as structured as Markdown, it provides a cleaner output compared to raw text.

Setting `text_mode: "Markdown"` enables Markdown formatting.

```javascript
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");

  const cfg = {
    text_mode: "Formatted",
    //text_mode: "Markdown",
  };

  const readability = new Readability(content, null, cfg);

  const article = readability.parse();
  console.log("Text Content:", article.text_content);
}

main();
```
</details>


<details>
    <summary><b>Parsing with One Policy</b></summary>

The `Readability.parse_with_policy` method allows parsing content with a specific policy.
This method follows the same steps as `Readability.parse` but makes only a single attempt using the specified `ParsePolicy`.

As a result, it doesn't store the best attempt, leading to significantly lower memory consumption. Some policies may also be faster than others.
Typically, `ParsePolicy.Strict` is the slowest but provides the cleanest result. `ParsePolicy.Moderate` can also yield a good result, while the others may be less accurate.

In some cases, using certain policies (e.g., `ParsePolicy.Strict`) may result in an error, whereas `Readability.parse` might succeed.
This happens because `Readability.parse` attempts parsing with different policies (essentially a set of grab flags) until it either succeeds or exhausts all options.

Available policies: `ParsePolicy.Strict`, `ParsePolicy.Moderate`, `ParsePolicy.Clean`, `ParsePolicy.Raw`.

```javascript
import { Readability, ParsePolicy } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
  const document_url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";

  // Available policies: ParsePolicy.Strict, ParsePolicy.Moderate, ParsePolicy.Clean, ParsePolicy.Raw
  const article = new Readability(content, document_url, null).parse_with_policy(ParsePolicy.Strict);
  console.log("Text Content:", article.text_content);
}

main();
```
</details>

## License

Licensed under MIT ([LICENSE_MIT](LICENSE_MIT) or http://opensource.org/licenses/MIT).

================================================
FILE: crates/js/src/lib.rs
================================================
mod utils;

use cfg_if::cfg_if;
use wasm_bindgen::prelude::*;

// Only when the `lol_alloc` feature is enabled AND the target is `wasm32`,
// register `lol_alloc`'s `FreeListAllocator` (wrapped in `LockedAllocator`) as
// the global allocator. In every other configuration this block expands to
// nothing.
cfg_if! {
    if #[cfg(all(feature = "lol_alloc", target_arch = "wasm32"))]{
        use lol_alloc::{FreeListAllocator, LockedAllocator};
        #[global_allocator]
        static ALLOCATOR: LockedAllocator<FreeListAllocator> = LockedAllocator::new(FreeListAllocator::new());
    }
}

#[wasm_bindgen]
#[derive(Debug, Default, Clone, Copy)]
/// `ParsePolicy` defines how scoring, content extraction, and cleaning should be performed.
///
/// Each variant converts to the `dom_smoothie::ParsePolicy` variant of the same name
/// (see the `From<ParsePolicy>` implementation). `Strict` is the default.
pub enum ParsePolicy {
    /// Strict policy
    /// - removes unlikely elements before determining the elements score;
    /// - uses `id` and `class` attributes of the element to determine its score;
    /// - applies additional content cleaning after identifying the main content.
    #[default]
    Strict,
    /// Moderate policy
    /// - uses `id` and `class` attributes of the element to determine its score;
    /// - applies additional content cleaning after identifying the main content.
    Moderate,
    /// Clean policy
    /// - applies additional content cleaning after identifying the main content.
    Clean,
    /// Raw policy
    /// - applies no cleaning heuristics.
    Raw,
}

impl From<ParsePolicy> for dom_smoothie::ParsePolicy {
    fn from(val: ParsePolicy) -> Self {
        match val {
            ParsePolicy::Strict => dom_smoothie::ParsePolicy::Strict,
            ParsePolicy::Moderate => dom_smoothie::ParsePolicy::Moderate,
            ParsePolicy::Clean => dom_smoothie::ParsePolicy::Clean,
            ParsePolicy::Raw => dom_smoothie::ParsePolicy::Raw,
        }
    }
}

#[wasm_bindgen]
/// A struct that provides readability functionality
///
/// It is a newtype around `dom_smoothie::Readability`; every exported method
/// forwards to the wrapped instance.
pub struct Readability(dom_smoothie::Readability);

#[wasm_bindgen]
impl Readability {
    #[wasm_bindgen(constructor)]
    /// Create a new `Readability` instance
    ///
    /// # Arguments
    ///
    /// - `content` -- HTML content
    /// - `document_url` -- an optional base URL of the page
    /// - `cfg` -- an optional configuration object; `null` (or an omitted argument)
    ///   selects the default configuration
    ///
    /// # Returns
    ///
    /// A new [`Readability`] instance
    ///
    /// # Errors
    ///
    /// This constructor does not panic on bad input; failures are reported as [`JsError`]:
    /// - if `cfg` cannot be deserialized into a configuration;
    /// - if `document_url` is not a valid URL.
    pub fn new(
        content: String,
        document_url: Option<String>,
        cfg: JsValue,
    ) -> Result<Readability, JsError> {
        // A missing configuration is passed on as `None`; anything else must
        // deserialize successfully.
        let cfg = if cfg.is_null() || cfg.is_undefined() {
            None
        } else {
            serde_wasm_bindgen::from_value(cfg).map_err(|e| JsError::new(&e.to_string()))?
        };

        let ra = dom_smoothie::Readability::new(content, document_url.as_deref(), cfg)
            .map_err(|e| JsError::new(&e.to_string()))?;
        Ok(Readability(ra))
    }

    /// Extracts the relevant content from the document and provides it as a JSON object.
    ///
    /// This is the primary method of the crate. It performs the following steps:
    ///
    /// - Verify the document
    /// - Extracts the metadata
    /// - Cleans the document
    /// - Extracts the main content of the document
    /// - Post-processes the content
    /// - Returns the content and the metadata as a JSON object
    ///
    /// # Returns
    ///
    /// A JSON object containing the content and the metadata.
    ///
    /// # Errors
    /// If `config.max_elements_to_parse` is > 0 and the document's number of element nodes exceeds this limit,
    /// a `JsError` error is returned.
    /// If the document fails to extract the content, a `JsError` error is returned.
    /// A `JsError` is also returned if the article cannot be converted to a JS value.
    #[wasm_bindgen]
    pub fn parse(&mut self) -> Result<JsValue, JsError> {
        let article = self
            .0
            .parse()
            .map_err(|e| JsError::new(&e.to_string()))?;
        serde_wasm_bindgen::to_value(&article).map_err(|e| JsError::new(&e.to_string()))
    }

    /// Extracts the relevant content from the document and provides it as a JSON object.
    ///
    /// This method performs the same steps as [`Readability::parse`], but performs only one attempt with the specified [`ParsePolicy`].
    /// The results of this method are likely to be worse than those of [`Readability::parse`], but it consumes significantly
    /// less memory because it does not need to keep the best attempt.
    /// If you need more precise results, use [`Readability::parse`],
    /// as it sequentially applies all policies, from strict to raw.
    ///
    /// # Errors
    ///
    /// Returns a `JsError` if parsing with the given policy fails or if the
    /// article cannot be converted to a JS value.
    #[wasm_bindgen]
    pub fn parse_with_policy(&mut self, policy: ParsePolicy) -> Result<JsValue, JsError> {
        let article = self
            .0
            .parse_with_policy(policy.into())
            .map_err(|e| JsError::new(&e.to_string()))?;
        serde_wasm_bindgen::to_value(&article).map_err(|e| JsError::new(&e.to_string()))
    }

    /// Returns the title of the article as a string.
    #[wasm_bindgen]
    pub fn get_article_title(&mut self) -> String {
        self.0.get_article_title().to_string()
    }

    /// Searches for a JSON-LD block in the page and extracts the metadata from it.
    ///
    /// # Returns
    ///
    /// An object containing the metadata extracted from the JSON-LD block.
    /// If no valid JSON-LD block is found, this method returns `null`.
    #[wasm_bindgen]
    pub fn parse_json_ld(&mut self) -> JsValue {
        let json_ld = self.0.parse_json_ld();
        // A serialization failure is deliberately reported as `null` rather than thrown.
        serde_wasm_bindgen::to_value(&json_ld).unwrap_or(JsValue::NULL)
    }

    /// Extracts the metadata from the article.
    ///
    /// This method takes an optional `json_ld` object as input, which is used as a fallback
    /// if no metadata can be found on the page. If the input `json_ld` object contains any
    /// of the following fields, they will not be overwritten by this function:
    /// - `title`
    /// - `byline`
    /// - `excerpt`
    /// - `site_name`
    /// - `published_time`
    /// - `modified_time`
    /// - `lang`
    /// - `dir`
    /// - `image`
    /// - `url`
    ///
    /// # Returns
    ///
    /// An object containing the metadata extracted from the article.
    /// If no valid metadata can be found, this method returns `null`.
    #[wasm_bindgen]
    pub fn get_article_metadata(&mut self, json_ld: JsValue) -> JsValue {
        // A `json_ld` value that does not deserialize is treated as "no JSON-LD metadata".
        let json_ld: Option<dom_smoothie::Metadata> = serde_wasm_bindgen::from_value(json_ld).ok();
        let metadata = self.0.get_article_metadata(json_ld);
        serde_wasm_bindgen::to_value(&metadata).unwrap_or(JsValue::NULL)
    }

    /// Returns true if the content is probably readable, false otherwise.
    ///
    /// This method is useful for quickly determining whether content is
    /// readable without having to parse the content.
    #[wasm_bindgen]
    pub fn is_probably_readable(&mut self) -> bool {
        self.0.is_probably_readable()
    }
}

/// Extracts the readable article from an HTML string in a single call.
///
/// Shorthand for building a `Readability` with neither a document URL nor a
/// configuration and invoking its `parse` method.
///
/// # Returns
///
/// An object containing the content and the metadata.
///
/// # Errors
///
/// Returns a `JsError` if the instance cannot be created, if the document
/// fails to parse, or if the article cannot be converted to a JS value.
#[wasm_bindgen]
pub fn parse(content: &str) -> Result<JsValue, JsError> {
    let mut readability = dom_smoothie::Readability::new(content, None, None)
        .map_err(|err| JsError::new(&err.to_string()))?;

    let article = readability
        .parse()
        .map_err(|err| JsError::new(&err.to_string()))?;

    serde_wasm_bindgen::to_value(&article).map_err(|err| JsError::new(&err.to_string()))
}


================================================
FILE: crates/js/src/utils.rs
================================================
#![allow(dead_code)]
/// Installs the `console_error_panic_hook` panic hook when the
/// `console_error_panic_hook` feature is enabled; otherwise does nothing.
pub fn set_panic_hook() {
    // When the `console_error_panic_hook` feature is enabled, we can call the
    // `set_panic_hook` function at least once during initialization, and then
    // we will get better error messages if our code ever panics.
    //
    // For more details see
    // https://github.com/rustwasm/console_error_panic_hook#readme
    #[cfg(feature = "console_error_panic_hook")]
    console_error_panic_hook::set_once();
}


================================================
FILE: crates/js/tests/web.rs
================================================
//! Test suite for the Web and headless browsers.

#![cfg(target_arch = "wasm32")]

extern crate wasm_bindgen_test;
use wasm_bindgen::JsValue;
use wasm_bindgen_test::*;

//wasm_bindgen_test_configure!(run_in_browser);

/// The free `parse` function succeeds on the bundled Wikipedia page.
#[wasm_bindgen_test]
fn test_parse() {
    let html = include_str!("../test-pages/rustwiki_2024.html");
    assert!(dom_smoothie_js::parse(html).is_ok());
}

/// Building a `Readability` with a `null` config and parsing it succeeds.
#[wasm_bindgen_test]
fn test_parse_constructor() {
    let html = String::from(include_str!("../test-pages/rustwiki_2024.html"));

    let mut readability =
        dom_smoothie_js::Readability::new(html, None, JsValue::null()).unwrap();

    assert!(readability.parse().is_ok());
}


================================================
FILE: crates/lua/Cargo.toml
================================================
[package]
name = "dom-smoothie-lua"
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
repository.workspace = true
authors.workspace = true
publish = false

[lib]
crate-type = ["cdylib"]

[dependencies]
dom_smoothie = { path = "../..", features = ["serde"] }
mlua = { version = "0.11.5", features = ["lua54", "module", "serde"] }

================================================
FILE: crates/lua/src/lib.rs
================================================
use dom_smoothie;
use mlua::prelude::*;
use mlua::LuaSerdeExt;

/// Newtype around `dom_smoothie::Readability` that is handed to Lua as userdata;
/// its Lua-visible methods are registered in the `LuaUserData` implementation.
pub struct Readability(dom_smoothie::Readability);

impl LuaUserData for Readability {
    /// No fields are exposed to Lua.
    fn add_fields<F: LuaUserDataFields<Self>>(_fields: &mut F) {}

    /// Registers the Lua-visible methods; each one forwards to the wrapped
    /// `dom_smoothie::Readability` and converts the result to a Lua value.
    fn add_methods<M: LuaUserDataMethods<Self>>(methods: &mut M) {
        // Read-only accessors.
        methods.add_method("get_article_title", |_, ra, ()| {
            Ok(ra.0.get_article_title().to_string())
        });

        methods.add_method("parse_json_ld", |lua, ra, ()| {
            let json_ld = ra.0.parse_json_ld();
            lua.to_value(&json_ld)
        });

        methods.add_method("get_article_metadata", |lua, ra, json_ld: LuaValue| {
            let base: Option<dom_smoothie::Metadata> = lua.from_value(json_ld)?;
            let metadata = ra.0.get_article_metadata(base);
            lua.to_value(&metadata)
        });

        methods.add_method("is_probably_readable", |_, ra, ()| {
            let readable = ra.0.is_probably_readable();
            Ok(readable)
        });

        // Parsing needs mutable access to the wrapped instance.
        methods.add_method_mut("parse", |lua, ra, ()| {
            let article = ra.0.parse().map_err(LuaError::external)?;
            lua.to_value(&article)
        });

        methods.add_method_mut("parse_with_policy", |lua, ra, policy: LuaValue| {
            let policy: dom_smoothie::ParsePolicy = lua.from_value(policy)?;
            let article = ra.0.parse_with_policy(policy).map_err(LuaError::external)?;
            lua.to_value(&article)
        });
    }
}

#[mlua::lua_module(name = "dom_smoothie")]
fn dom_smoothie_module(lua: &'_ Lua) -> LuaResult<LuaTable> {
    let exports = lua.create_table()?;

    let readability_ctor = lua.create_function(
        |lua_vm, (html, doc_url, config): (String, Option<String>, Option<LuaTable>)| {
            let mut cfg = dom_smoothie::Config::default();

            if let Some(opts) = config {
                if let Some(v) = opts.get::<Option<bool>>("keep_classes")? {
                    cfg.keep_classes = v;
                }
                if let Some(v) = opts.get::<Option<Vec<String>>>("classes_to_preserve")? {
                    cfg.classes_to_preserve = v;
                }
                if let Some(v) = opts.get::<Option<usize>>("max_elements_to_parse")? {
                    cfg.max_elements_to_parse = v;
                }
                if let Some(v) = opts.get::<Option<bool>>("disable_json_ld")? {
                    cfg.disable_json_ld = v;
                }
                if let Some(v) = opts.get::<Option<usize>>("n_top_candidates")? {
                    cfg.n_top_candidates = v;
                }
                if let Some(v) = opts.get::<Option<usize>>("char_threshold")? {
                    cfg.char_threshold = v;
                }
                if let Some(v) = opts.get::<Option<f32>>("min_score_to_adjust")? {
                    cfg.min_score_to_adjust = v;
                }
                if let Some(v) = opts.get::<Option<f32>>("readable_min_score")? {
                    cfg.readable_min_score = v;
                }
                if let Some(v) = opts.get::<Option<usize>>("readable_min_content_length")? {
                    cfg.readable_min_content_length = v;
                }
                if let Some(val) = opts.get::<Option<LuaValue>>("candidate_select_mode")? {
                    cfg.candidate_select_mode = lua_vm.from_value(val)?;
                }
                if let Some(val) = opts.get::<Option<LuaValue>>("text_mode")? {
                    cfg.text_mode = lua_vm.from_value(val)?;
                }
            }

            let readability = dom_smoothie::Readability::new(html, doc_url.as_deref(), Some(cfg))
                .map_err(|e| LuaError::external(e))?;
            Ok(Readability(readability))
        },
    )?;

    exports.set("Readability", readability_ctor)?;
    Ok(exports)
}


================================================
FILE: deny.toml
================================================
# This template contains all of the possible sections and their default values

# Note that all fields that take a lint level have these possible values:
# * deny - An error will be produced and the check will fail
# * warn - A warning will be produced, but the check will not fail
# * allow - No warning or error will be produced, though in some cases a note
# will be emitted

# The values provided in this template are the default values that will be used
# when any section or field is not specified in your own configuration

# Root options

# The graph table configures how the dependency graph is constructed and thus
# which crates the checks are performed against
[graph]
# If 1 or more target triples (and optionally, target_features) are specified,
# only the specified targets will be checked when running `cargo deny check`.
# This means, if a particular package is only ever used as a target specific
# dependency, such as, for example, the `nix` crate only being used via the
# `target_family = "unix"` configuration, that only having windows targets in
# this list would mean the nix crate, as well as any of its exclusive
# dependencies not shared by any other crates, would be ignored, as the target
# list here is effectively saying which targets you are building for.
targets = [
    # The triple can be any string, but only the target triples built in to
    # rustc (as of 1.40) can be checked against actual config expressions
    #"x86_64-unknown-linux-musl",
    # You can also specify which target_features you promise are enabled for a
    # particular target. target_features are currently not validated against
    # the actual valid features supported by the target architecture.
    #{ triple = "wasm32-unknown-unknown", features = ["atomics"] },
]
# When creating the dependency graph used as the source of truth when checks are
# executed, this field can be used to prune crates from the graph, removing them
# from the view of cargo-deny. This is an extremely heavy hammer, as if a crate
# is pruned from the graph, all of its dependencies will also be pruned unless
# they are connected to another crate in the graph that hasn't been pruned,
# so it should be used with care. The identifiers are [Package ID Specifications]
# (https://doc.rust-lang.org/cargo/reference/pkgid-spec.html)
#exclude = []
# If true, metadata will be collected with `--all-features`. Note that this can't
# be toggled off if true, if you want to conditionally enable `--all-features` it
# is recommended to pass `--all-features` on the cmd line instead
all-features = false
# If true, metadata will be collected with `--no-default-features`. The same
# caveat with `all-features` applies
no-default-features = false
# If set, these features will be enabled when collecting metadata. If `--features`
# is specified on the cmd line they will take precedence over this option.
#features = []

# The output table provides options for how/if diagnostics are outputted
[output]
# When outputting inclusion graphs in diagnostics that include features, this
# option can be used to specify the depth at which feature edges will be added.
# This option is included since the graphs can be quite large and the addition
# of features from the crate(s) to all of the graph roots can be far too verbose.
# This option can be overridden via `--feature-depth` on the cmd line
feature-depth = 1

# This section is considered when running `cargo deny check advisories`
# More documentation for the advisories section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
[advisories]
# The path where the advisory databases are cloned/fetched into
#db-path = "$CARGO_HOME/advisory-dbs"
# The url(s) of the advisory databases to use
#db-urls = ["https://github.com/rustsec/advisory-db"]
# A list of advisory IDs to ignore. Note that ignored advisories will still
# output a note when they are encountered.
ignore = [
    #"RUSTSEC-0000-0000",
    #{ id = "RUSTSEC-0000-0000", reason = "you can specify a reason the advisory is ignored" },
    #"a-crate-that-is-yanked@0.1.1", # you can also ignore yanked crate versions if you wish
    #{ crate = "a-crate-that-is-yanked@0.1.1", reason = "you can specify why you are ignoring the yanked crate" },

]
# If this is true, then cargo deny will use the git executable to fetch advisory database.
# If this is false, then it uses a built-in git library.
# Setting this to true can be helpful if you have special authentication requirements that cargo-deny does not support.
# See Git Authentication for more information about setting up git authentication.
#git-fetch-with-cli = true

# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
# List of explicitly allowed licenses
# See https://spdx.org/licenses/ for list of possible licenses
# [possible values: any SPDX 3.11 short identifier (+ optional exception)].
allow = [
    #"MIT",
    #"Apache-2.0",
    #"Apache-2.0 WITH LLVM-exception",
]
# The confidence threshold for detecting a license from license text.
# The higher the value, the more closely the license text must be to the
# canonical license text of a valid SPDX license file.
# [possible values: any between 0.0 and 1.0].
confidence-threshold = 0.8
# Allow 1 or more licenses on a per-crate basis, so that particular licenses
# aren't accepted for every possible crate as with the normal allow list
exceptions = [
    # Each entry is the crate and version constraint, and its specific allow
    # list
    #{ allow = ["Zlib"], crate = "adler32" },
]

# Some crates don't have (easily) machine readable licensing information,
# adding a clarification entry for it allows you to manually specify the
# licensing information
#[[licenses.clarify]]
# The package spec the clarification applies to
#crate = "ring"
# The SPDX expression for the license requirements of the crate
#expression = "MIT AND ISC AND OpenSSL"
# One or more files in the crate's source used as the "source of truth" for
# the license expression. If the contents match, the clarification will be used
# when running the license check, otherwise the clarification will be ignored
# and the crate will be checked normally, which may produce warnings or errors
# depending on the rest of your configuration
#license-files = [
# Each entry is a crate relative path, and the (opaque) hash of its contents
#{ path = "LICENSE", hash = 0xbd0eed23 }
#]

[licenses.private]
# If true, ignores workspace crates that aren't published, or are only
# published to private registries.
# To see how to mark a crate as unpublished (to the official registry),
# visit https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field.
ignore = false
# One or more private registries that you might publish crates to, if a crate
# is only published to private registries, and ignore is true, the crate will
# not have its license(s) checked
registries = [
    #"https://sekretz.com/registry"
]

# This section is considered when running `cargo deny check bans`.
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
# Lint level for when multiple versions of the same crate are detected
multiple-versions = "warn"
# Lint level for when a crate version requirement is `*`
wildcards = "allow"
# The graph highlighting used when creating dotgraphs for crates
# with multiple versions
# * lowest-version - The path to the lowest versioned duplicate is highlighted
# * simplest-path - The path to the version with the fewest edges is highlighted
# * all - Both lowest-version and simplest-path are used
highlight = "all"
# The default lint level for `default` features for crates that are members of
# the workspace that is being checked. This can be overridden by allowing/denying
# `default` on a crate-by-crate basis if desired.
workspace-default-features = "allow"
# The default lint level for `default` features for external crates that are not
# members of the workspace. This can be overridden by allowing/denying `default`
# on a crate-by-crate basis if desired.
external-default-features = "allow"
# List of crates that are allowed. Use with care!
allow = [
    #"ansi_term@0.11.0",
    #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" },
]
# List of crates to deny
deny = [
    #"ansi_term@0.11.0",
    #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" },
    # Wrapper crates can optionally be specified to allow the crate when it
    # is a direct dependency of the otherwise banned crate
    #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] },
]

# List of features to allow/deny
# Each entry is the name of a crate and a version range. If version is
# not specified, all versions will be matched.
#[[bans.features]]
#crate = "reqwest"
# Features to not allow
#deny = ["json"]
# Features to allow
#allow = [
#    "rustls",
#    "__rustls",
#    "__tls",
#    "hyper-rustls",
#    "rustls",
#    "rustls-pemfile",
#    "rustls-tls-webpki-roots",
#    "tokio-rustls",
#    "webpki-roots",
#]
# If true, the allowed features must exactly match the enabled feature set. If
# this is set there is no point setting `deny`
#exact = true

# Certain crates/versions that will be skipped when doing duplicate detection.
skip = [
    #"ansi_term@0.11.0",
    #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" },
]
# Similarly to `skip` allows you to skip certain crates during duplicate
# detection. Unlike skip, it also includes the entire tree of transitive
# dependencies starting at the specified crate, up to a certain depth, which is
# by default infinite.
skip-tree = [
    #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies
    #{ crate = "ansi_term@0.11.0", depth = 20 },
]

# This section is considered when running `cargo deny check sources`.
# More documentation about the 'sources' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
[sources]
# Lint level for what to happen when a crate from a crate registry that is not
# in the allow list is encountered
unknown-registry = "warn"
# Lint level for what to happen when a crate from a git repository that is not
# in the allow list is encountered
unknown-git = "warn"
# List of URLs for allowed crate registries. Defaults to the crates.io index
# if not specified. If it is specified but empty, no registries are allowed.
allow-registry = ["https://github.com/rust-lang/crates.io-index"]
# List of URLs for allowed Git repositories
allow-git = []

[sources.allow-org]
# github.com organizations to allow git sources for
github = []
# gitlab.com organizations to allow git sources for
gitlab = []
# bitbucket.org organizations to allow git sources for
bitbucket = []


================================================
FILE: src/ac_automat.rs
================================================
use aho_corasick::{AhoCorasick, AhoCorasickKind};
use once_cell::sync::Lazy;

use crate::glob::{CLASSES_NEGATIVE, CLASSES_POSITIVE, MAYBE_CANDIDATES, UNLIKELY_CANDIDATES};

// Aho-Corasick automatons over the pattern tables from `crate::glob`.
// Each one is built by `ac_automaton` on first access.

/// Automaton over [`UNLIKELY_CANDIDATES`].
pub(crate) static AC_UNLIKELY: Lazy<AhoCorasick> = Lazy::new(|| ac_automaton(UNLIKELY_CANDIDATES));
/// Automaton over [`MAYBE_CANDIDATES`].
pub(crate) static AC_MAYBE: Lazy<AhoCorasick> = Lazy::new(|| ac_automaton(MAYBE_CANDIDATES));
/// Automaton over [`CLASSES_NEGATIVE`].
pub(crate) static AC_CLASSES_NEGATIVE: Lazy<AhoCorasick> =
    Lazy::new(|| ac_automaton(CLASSES_NEGATIVE));
/// Automaton over [`CLASSES_POSITIVE`].
pub(crate) static AC_CLASSES_POSITIVE: Lazy<AhoCorasick> =
    Lazy::new(|| ac_automaton(CLASSES_POSITIVE));

/// Builds an Aho-Corasick automaton for `patterns`, explicitly requesting the
/// contiguous NFA implementation.
///
/// # Panics
///
/// Panics if the automaton cannot be built from the given patterns.
fn ac_automaton(patterns: &[&str]) -> AhoCorasick {
    let mut builder = AhoCorasick::builder();
    builder.kind(Some(AhoCorasickKind::ContiguousNFA));
    let automaton = builder.build(patterns);
    automaton.unwrap()
}


================================================
FILE: src/config.rs
================================================
use flagset::FlagSet;

use crate::{
    glob::{MIN_CONTENT_LENGTH, MIN_SCORE},
    grab_flags::GrabFlags,
};

/// Default value of `Config::n_top_candidates`.
pub(crate) static DEFAULT_N_TOP_CANDIDATES: usize = 5;
/// Default value of `Config::char_threshold`.
pub(crate) static DEFAULT_CHAR_THRESHOLD: usize = 500;
/// Default value of `Config::min_score_to_adjust`.
pub(crate) static DEFAULT_MIN_SCORE_TO_ADJUST: f32 = 5.0;

/// Selects how the top candidate is adjusted (see `Config::candidate_select_mode`).
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Default, Clone, Copy)]
pub enum CandidateSelectMode {
    /// Adjustment based on [Readability.js](https://github.com/mozilla/readability).
    /// This is the default.
    #[default]
    Readability,
    /// The crate's own implementation.
    DomSmoothie,
}

/// Selects the text mode of the output (see `Config::text_mode`).
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Default, Clone, Copy)]
pub enum TextMode {
    /// Raw text, as-is. This is the default.
    #[default]
    Raw,
    /// Formatted text.
    Formatted,
    /// Markdown text.
    Markdown,
}

/// Configuration options for [`crate::Readability`]
///
/// With the `serde` feature enabled the struct can be (de)serialized; because of
/// `serde(default)`, fields missing from the input take their value from
/// [`Config::default`].
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct Config {
    /// Set to `true` to keep all classes in the document. Defaults to `false`.
    pub keep_classes: bool,
    /// List of classes that will be preserved and not removed during the post-process.
    /// Empty by default.
    pub classes_to_preserve: Vec<String>,
    /// Maximum number of elements to parse. Defaults to 0.
    // NOTE(review): presumably 0 means "no limit" — confirm against the parser.
    pub max_elements_to_parse: usize,
    /// Disable JSON-LD extracting. Defaults to `false`.
    pub disable_json_ld: bool,
    /// Number of top candidates to handle. Defaults to 5.
    pub n_top_candidates: usize,
    /// Character threshold for content extraction. Defaults to 500.
    pub char_threshold: usize,
    /// The minimum score required for a node to be adjusted during scoring. Defaults to 5.0.
    /// The higher this value, the faster the node scoring process, as link density calculations are performed less frequently.
    /// A value between 5 and 10 is usually enough to yield good results.
    pub min_score_to_adjust: f32,
    /// The minimum score required for the document to be considered readable. Defaults to 20.0.
    /// Used only for [`crate::Readability::is_probably_readable`].
    pub readable_min_score: f32,
    /// The minimum content length required for the document to be considered readable. Defaults to 140.
    /// Used only for [`crate::Readability::is_probably_readable`].
    pub readable_min_content_length: usize,
    /// Determines whether the top candidate is adjusted
    /// based on [Readability.js](https://github.com/mozilla/readability)
    /// or uses the crate's exclusive implementation.
    /// Defaults to [`CandidateSelectMode::Readability`].
    pub candidate_select_mode: CandidateSelectMode,
    /// Allows to set the text mode, whether it should be raw (as-is), formatted or markdown.
    /// Defaults to [`TextMode::Raw`].
    pub text_mode: TextMode,
}

impl Default for Config {
    /// Returns the baseline configuration: class stripping enabled, no element
    /// limit value set (0), JSON-LD enabled, crate-level default thresholds,
    /// Readability.js-style candidate selection and raw text output.
    fn default() -> Self {
        // Thresholds come from the crate-level default constants.
        let n_top_candidates = DEFAULT_N_TOP_CANDIDATES;
        let char_threshold = DEFAULT_CHAR_THRESHOLD;
        let min_score_to_adjust = DEFAULT_MIN_SCORE_TO_ADJUST;

        // Modes are spelled out explicitly rather than derived.
        let candidate_select_mode = CandidateSelectMode::Readability;
        let text_mode = TextMode::Raw;

        Self {
            keep_classes: false,
            classes_to_preserve: vec![],
            max_elements_to_parse: 0,
            disable_json_ld: false,
            n_top_candidates,
            char_threshold,
            min_score_to_adjust,
            readable_min_score: MIN_SCORE,
            readable_min_content_length: MIN_CONTENT_LENGTH,
            candidate_select_mode,
            text_mode,
        }
    }
}

#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Default, Clone, Copy)]
/// `ParsePolicy` defines how scoring, content extraction, and cleaning should be performed.
///
/// Each policy converts into a `FlagSet<GrabFlags>` through the `From` impl in this module.
pub enum ParsePolicy {
    /// Strict policy (the default)
    /// - removes unlikely elements before determining the elements score;
    /// - uses `id` and `class` attributes of the element to determine its score;
    /// - applies additional content cleaning after identifying the main content.
    ///
    /// Converts into the full flag set.
    #[default]
    Strict,
    /// Moderate policy
    /// - uses `id` and `class` attributes of the element to determine its score;
    /// - applies additional content cleaning after identifying the main content.
    ///
    /// Converts into `WeightClasses | CleanConditionally`.
    Moderate,
    /// Clean policy
    /// - applies additional content cleaning after identifying the main content.
    ///
    /// Converts into `CleanConditionally` only.
    Clean,
    /// Raw policy
    /// - applies no cleaning heuristics.
    ///
    /// Converts into the empty flag set.
    Raw,
}

impl From<ParsePolicy> for FlagSet<GrabFlags> {
    /// Translates a parse policy into the set of grab flags it enables,
    /// listed here from the most permissive policy to the strictest.
    fn from(policy: ParsePolicy) -> Self {
        match policy {
            // No heuristics at all.
            ParsePolicy::Raw => FlagSet::default(),
            // Conditional cleaning only.
            ParsePolicy::Clean => GrabFlags::CleanConditionally.into(),
            // Class weighting plus conditional cleaning.
            ParsePolicy::Moderate => GrabFlags::WeightClasses | GrabFlags::CleanConditionally,
            // Every flag enabled.
            ParsePolicy::Strict => FlagSet::full(),
        }
    }
}


================================================
FILE: src/glob.rs
================================================
use dom_query::{mini_selector::MiniSelector, Matcher};
use once_cell::sync::Lazy;
use phf::phf_set;

/// Expands to a `Lazy` initializer that compiles `$selector` into a [`Matcher`]
/// on first access. A selector that fails to compile panics at that point.
macro_rules! lazy_matcher {
    ($selector:expr) => {
        Lazy::new(|| {
            let compiled = Matcher::new($selector);
            compiled.unwrap()
        })
    };
}

/// Element id `readability-page-1`; [`MATCHER_CONTENT_ID`] selects the same id.
pub(crate) static CONTENT_ID: &str = "readability-page-1";
// NOTE(review): presumably the minimum number of shared ancestors considered
// during candidate selection — confirm against usage.
pub(crate) static MIN_COMMON_ANCESTORS: usize = 3;
/// Attribute name `data-readability-score`.
pub(crate) static SCORE_ATTR: &str = "data-readability-score";
/// Default value of `Config::readable_min_score`.
pub(crate) static MIN_SCORE: f32 = 20.0;
/// Default value of `Config::readable_min_content_length`.
pub(crate) static MIN_CONTENT_LENGTH: usize = 140;

/// The `;base64,` marker string.
pub(crate) static BASE64_MARKER: &str = ";base64,";
/// Byte length of [`BASE64_MARKER`].
pub(crate) static BASE64_MARKER_LEN: usize = BASE64_MARKER.len();

pub(crate) static SCHEMA_ORG_SFX: &str = "://schema.org";
pub(crate) static HTTP_PFX: &str = "http://";
pub(crate) static HTTPS_PFX: &str = "https://";

/// The `//` prefix.
pub(crate) static PROTOCOL_PFX: &str = "//";
/// Byte length of [`PROTOCOL_PFX`].
pub(crate) static PROTOCOL_PFX_LEN: usize = PROTOCOL_PFX.len();
/// The `//www.` prefix.
pub(crate) static WWW_PFX: &str = "//www.";
/// Byte length of [`WWW_PFX`].
pub(crate) static WWW_PFX_LEN: usize = WWW_PFX.len();

// --- Matchers ---

// Every matcher below is compiled from its CSS selector on first access via
// `lazy_matcher!`; a selector that fails to compile panics at that point.

/// Selects the element whose id is `readability-page-1` (see [`CONTENT_ID`]).
pub(crate) static MATCHER_CONTENT_ID: Lazy<Matcher> = lazy_matcher!("#readability-page-1");
pub(crate) static MATCHER_LI_P: Lazy<Matcher> = lazy_matcher!("li p");
pub(crate) static MATCHER_TITLE: Lazy<Matcher> = lazy_matcher!("head title");
/// Selects both `script` and `noscript` elements.
pub(crate) static MATCHER_SCRIPT: Lazy<Matcher> = lazy_matcher!("script, noscript");
pub(crate) static MATCHER_HTML_LANG: Lazy<Matcher> = lazy_matcher!("html[lang]");
pub(crate) static MATCHER_STYLE: Lazy<Matcher> = lazy_matcher!("style");
pub(crate) static MATCHER_FONT: Lazy<Matcher> = lazy_matcher!("font");
pub(crate) static MATCHER_BR: Lazy<Matcher> = lazy_matcher!("br");
pub(crate) static MATCHER_IMG: Lazy<Matcher> = lazy_matcher!("img");
/// Selects only `meta` elements that carry a `content` attribute.
pub(crate) static MATCHER_META: Lazy<Matcher> = lazy_matcher!("meta[content]");
/// Selects anchors whose `href` starts with `javascript:`.
pub(crate) static MATCHER_JS_LINK: Lazy<Matcher> = lazy_matcher!(r#"a[href^="javascript:"]"#);
pub(crate) static MATCHER_JSONLD: Lazy<Matcher> =
    lazy_matcher!(r#"script[type="application/ld+json"]"#);
/// Selects `h1` and `h2` only.
pub(crate) static MATCHER_HEADING: Lazy<Matcher> = lazy_matcher!(r"h1,h2");
/// Selects `dialog` elements and elements with both `aria-modal="true"` and `role="dialog"`.
pub(crate) static MATCHER_DIALOGS: Lazy<Matcher> =
    lazy_matcher!(r#"dialog,[aria-modal="true"][role="dialog"]"#);
/// Selects elements with `rel="author"` or an `itemprop` containing `author`.
pub(crate) static MATCHER_BYLINE: Lazy<Matcher> =
    lazy_matcher!(r#"[rel="author"],[itemprop*="author"]"#);
pub(crate) static MATCHER_SOURCES: Lazy<Matcher> =
    lazy_matcher!("img,picture,figure,video,audio,source");
pub(crate) static MATCHER_P: Lazy<Matcher> = lazy_matcher!("p");
/// Same element names as [`EMBED_ELEMENTS`].
pub(crate) static MATCHER_EMBEDS: Lazy<Matcher> = lazy_matcher!("object,embed,iframe");
pub(crate) static MATCHER_CLEAN: Lazy<Matcher> =
    lazy_matcher!("object,embed,footer,link,aside,iframe,input,textarea,select,button");
/// Selects tables carrying the `data-readability-table` attribute.
pub(crate) static MATCHER_DATA_TABLE: Lazy<Matcher> =
    lazy_matcher!("table[data-readability-table]");
pub(crate) static MATCHER_TABLE: Lazy<Matcher> = lazy_matcher!("table");
pub(crate) static MATCHER_TABLE_MEMBERS: Lazy<Matcher> =
    lazy_matcher!("caption,col,colgroup,tfoot,thead,th");
/// Selects any element whose class contains `lazy`, and `img` elements with `loading="lazy"`.
pub(crate) static MATCHER_LAZY_IMG: Lazy<Matcher> =
    lazy_matcher!(r#"[class*="lazy"],img[loading="lazy"]"#);
/// Selects `link` elements whose `rel` is `icon`, `shortcut icon` or `apple-touch-icon`.
pub(crate) static MATCHER_FAVICON: Lazy<Matcher> =
    lazy_matcher!(r#"link[rel="icon"], link[rel="shortcut icon"], link[rel="apple-touch-icon"]"#);

// --- Mini matchers ---

// Each mini selector is compiled on first access; a selector that fails to
// compile panics at that point.

/// Matches elements whose class contains `fallback-image`.
pub(crate) static MINI_FALLBACK_IMG: Lazy<MiniSelector> =
    Lazy::new(|| MiniSelector::new(r#"[class*="fallback-image"]"#).unwrap());
/// Matches elements with `aria-hidden="true"`.
pub(crate) static MINI_ARIA_HIDDEN: Lazy<MiniSelector> =
    Lazy::new(|| MiniSelector::new(r#"[aria-hidden="true"]"#).unwrap());
/// Matches elements with `role="presentation"`.
pub(crate) static MINI_PRESENTATION: Lazy<MiniSelector> =
    Lazy::new(|| MiniSelector::new(r#"[role="presentation"]"#).unwrap());
/// Matches elements with `datatable="0"`.
pub(crate) static MINI_AINT_DATA_TABLE: Lazy<MiniSelector> =
    Lazy::new(|| MiniSelector::new(r#"[datatable="0"]"#).unwrap());

/// Comma-separated tag list; a superset of [`BLOCK_ELEMS`] that adds `span`, `li` and `td`.
pub(crate) static TEXTISH_TAGS: &str = "blockquote,dl,div,img,ol,p,pre,table,ul,span,li,td";

// Metadata key tables. NOTE(review): presumably each table lists the `<meta>`
// keys consulted for the named field — whether the order is a priority order
// is decided by the caller; confirm there.

/// Keys associated with the article title.
pub(crate) static META_TITLE_KEYS: &[&str] = &[
    "dc:title",
    "dcterms:title",
    "og:title",
    "weibo:article:title",
    "weibo:webpage:title",
    "title",
    "twitter:title",
    "parsely-title",
];
/// Keys associated with the article image.
pub(crate) static META_IMAGE_KEYS: &[&str] = &["og:image", "image", "twitter:image"];
/// Keys associated with the modification time.
pub(crate) static META_MOD_TIME_KEYS: &[&str] = &["article:modified_time", "dcterms.modified"];
/// Keys associated with the publication time.
pub(crate) static META_PUB_TIME_KEYS: &[&str] = &[
    "article:published_time",
    "dcterms.available",
    "dcterms.created",
    "dcterms.issued",
    "parsely-pub-date",
    "weibo:article:create_at",
];
/// Keys associated with the byline (author).
pub(crate) static META_BYLINE_KEYS: &[&str] =
    &["dc:creator", "dcterms:creator", "author", "parsely-author"];
/// Keys associated with the excerpt (description).
pub(crate) static META_EXCERPT_KEYS: &[&str] = &[
    "dc:description",
    "dcterms:description",
    "og:description",
    "weibo:article:description",
    "weibo:webpage:description",
    "description",
    "twitter:description",
];

/// Names of presentational HTML attributes.
#[rustfmt::skip]
pub(crate) static PRESENTATIONAL_ATTRIBUTES: &[&str] = &[
    "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", 
    "frame", "hspace", "rules", "style", "valign", "vspace",
];

/// Patterns backing the `AC_UNLIKELY` automaton in `ac_automat.rs`.
#[rustfmt::skip]
pub(crate) static UNLIKELY_CANDIDATES: &[&str] = &[
    "-ad-", "ai2html", "banner", "breadcrumbs", "combx", "comment", "community",
    "cover-wrap", "disqus", "extra", "footer", "gdpr", "header", "legends", "menu",
    "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper",
    "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", 
    "pager", "popup", "yom-remote",
];

/// Patterns backing the `AC_MAYBE` automaton in `ac_automat.rs`.
pub(crate) static MAYBE_CANDIDATES: &[&str] = &[
    "and", "article", "body", "column", "content", "layout", "main", "mathjax", "shadow",
];

// NOTE(review): presumably substrings looked for in attribute values to detect
// a byline element — confirm against usage.
pub(crate) static BYLINE_PATTERNS: &[&str] =
    &["byline", "author", "dateline", "writtenby", "p-author"];

/// Type names treated as article types in JSON-LD data.
// NOTE(review): presumably compared against the JSON-LD `@type` value — confirm against usage.
pub(crate) static JSONLD_ARTICLE_TYPES: &[&str] = &[
    "Article",
    "AdvertiserContentArticle",
    "NewsArticle",
    "AnalysisNewsArticle",
    "AskPublicNewsArticle",
    "BackgroundNewsArticle",
    "OpinionNewsArticle",
    "ReportageNewsArticle",
    "ReviewNewsArticle",
    "Report",
    "SatiricalArticle",
    "ScholarlyArticle",
    "MedicalScholarlyArticle",
    "SocialMediaPosting",
    "BlogPosting",
    "LiveBlogPosting",
    "DiscussionForumPosting",
    "TechArticle",
    "APIReference",
];

/// Domains of known video hosts.
pub(crate) static VIDEO_DOMAINS: &[&str] = &[
    "dailymotion.com",
    "youtube.com",
    "youtube-nocookie.com",
    "player.vimeo.com",
    "v.qq.com",
    "archive.org",
    "upload.wikimedia.org",
    "player.twitch.tv",
    "bilibili.com",
    "live.bilibili.com",
];

/// Comma characters: the ASCII comma (U+002C) plus several other Unicode commas.
pub(crate) static COMMAS: &[char] = &[
    '\u{002C}', '\u{060C}', '\u{FE50}', '\u{FE10}', '\u{FE11}', '\u{2E41}', '\u{2E34}', '\u{2E32}',
    '\u{FF0C}',
];

/// Characters treated as title separators.
pub(crate) static TITLE_SEPARATORS: &[char] = &['|', '-', '–', '—', '\\', '/', '>', '»'];
/// The last four entries of [`TITLE_SEPARATORS`].
pub(crate) static TITLE_HIERARCHY_SEP: &[char] = &['\\', '/', '>', '»'];
/// Image file extensions, each including the leading dot.
pub(crate) static IMG_EXT: &[&str] = &[".jpg", ".jpeg", ".png", ".webp", ".avif", ".gif"];

// NOTE(review): the META_NAME_* and META_PROPERTY_* tables presumably describe
// `<prefix><separator><key>` combinations accepted for a meta element's `name`
// and `property` attributes — confirm against the metadata parser.

#[rustfmt::skip]
pub(crate) static META_NAME_PREFIXES: &[&str] = &[
    "article", "dc", "dcterms", "og", "twitter", "parsely", "weibo:article", "weibo:webpage",
];

#[rustfmt::skip]
pub(crate) static META_NAME_KEYS: &[&str] = &[
    "author", "creator", "pub-date", "description", "title", "site_name",
];

pub(crate) static META_NAME_SEP: &[char] = &['-', '.', ':'];
pub(crate) static META_PROPERTY_PREFIXES: &[&str] = &["article", "dc", "dcterms", "og", "twitter"];

#[rustfmt::skip]
pub(crate) static META_PROPERTY_KEYS: &[&str] = &[
    "author", "creator", "description", "published_time", "title", "site_name", "image"
];

pub(crate) static SHARE_WORDS: &[&str] = &["share", "sharedaddy"];

/// Patterns backing the `AC_CLASSES_NEGATIVE` automaton in `ac_automat.rs`.
#[rustfmt::skip]
pub(crate) static CLASSES_NEGATIVE: &[&str] = &[
    "-ad-", "hidden", "banner", "combx", "comment", "com-", "contact", "footer",
    "gdpr", "masthead", "media", "meta", "outbrain", "promo", "related", "scroll",
    "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags",
    "widget"
];

/// Patterns backing the `AC_CLASSES_POSITIVE` automaton in `ac_automat.rs`.
#[rustfmt::skip]
pub(crate) static CLASSES_POSITIVE: &[&str] = &[
    "article", "body", "content", "entry", "hentry", "h-entry", "main", "page",
    "post", "text", "blog", "story",
];

// NOTE(review): presumably matched as a whole word rather than as a substring,
// unlike the entries of `CLASSES_NEGATIVE` — confirm against usage.
pub(crate) static CLASSES_NEGATIVE_WORDS: &[&str] = &["hid"];

// ---phf sets ---

// The sets below are compile-time `phf` sets of lowercase strings.

// NOTE(review): presumably tag names exempt from being converted to `div` — confirm against usage.
pub(crate) static ALTER_TO_DIV_EXCEPTIONS: phf::Set<&'static str> =
    phf_set!("article", "section", "p", "ol", "ul");
pub(crate) static DEFAULT_TAGS_TO_SCORE: phf::Set<&'static str> =
    phf_set!("section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre");
pub(crate) static TAGS_WITH_CONTENT: phf::Set<&'static str> =
    phf_set!("div", "section", "header", "h1", "h2", "h3", "h4", "h5", "h6");

/// Block-level tag names; the same names also open the [`TEXTISH_TAGS`] list.
#[rustfmt::skip]
pub(crate) static BLOCK_ELEMS: phf::Set<&'static str> = phf_set!(
    "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul",
);

/// Same element names as the [`MATCHER_EMBEDS`] selector.
pub(crate) static EMBED_ELEMENTS: phf::Set<&'static str> = phf_set!("object", "embed", "iframe");

/// `role` attribute values.
// NOTE(review): presumably marks an element as an unlikely content candidate — confirm against usage.
#[rustfmt::skip]
pub(crate) static UNLIKELY_ROLES: phf::Set<&'static str> = phf_set!(
    "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog"
);

/// Tag names of phrasing elements.
#[rustfmt::skip]
pub(crate) static PHRASING_ELEMS: phf::Set<&'static str> = phf_set!(
    "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
    "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
    "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
    "sub", "sup", "textarea", "time", "var", "wbr"
);

// NOTE(review): presumably the elements whose size attributes are treated as
// deprecated and stripped — confirm against usage.
pub(crate) static DEPRECATED_SIZE_ATTRIBUTE_ELEMS: phf::Set<&'static str> =
    phf_set!("table", "th", "td", "hr", "pre");

/// Lowercase words for "advertisement" in several languages.
#[rustfmt::skip]
pub(crate) static AD_WORDS: phf::Set<&'static str> = phf_set!(
    "ad", "advertising", "advertisement", "pub", "publicité", 
    "werb", "werbung", "广告", "реклама", "anuncio"
);
/// Lowercase words for "loading" in several languages.
#[rustfmt::skip]
pub(crate) static LOADING_WORDS: phf::Set<&'static str> = phf_set!(
    "loading", "正在加载", "загрузка", "chargement", "cargando"
);


================================================
FILE: src/grab.rs
================================================
use dom_query::Tree;
use foldhash::{HashMap, HashSet};
use std::vec;

use dom_query::{Document, NodeId, NodeRef};
use flagset::FlagSet;

use crate::config::CandidateSelectMode;
#[allow(clippy::wildcard_imports)]
use crate::glob::*;
use crate::grab_flags::GrabFlags;
#[allow(clippy::wildcard_imports)]
use crate::helpers::*;
#[allow(clippy::wildcard_imports)]
use crate::matching::*;
use crate::prep_article::prep_article;
#[allow(clippy::wildcard_imports)]
use crate::score::*;
use crate::Config;
use crate::Metadata;
use crate::Readability;

impl Readability {
    pub(crate) fn grab_article(&self, metadata: &Metadata) -> Option<Document> {
        let mut flags: FlagSet<GrabFlags> = FlagSet::full();
        let mut best_attempt: Option<(Document, usize)> = None;
        loop {
            let doc = self.doc.clone();
            let article_node = self.attempt_grab_article(&doc, &flags, metadata);
            // Now that we've gone through the full algorithm, check to see if
            // we got any meaningful content. If we didn't, we may need to re-run
            // `grab_article` with different flags set. This gives us a higher likelihood of
            // finding the content, and the sieve approach gives us a higher likelihood of
            // finding the -right- content.

            if let Some(ref article_node) = article_node {
                let text_length = article_node.normalized_char_count();
                if text_length >= self.config.char_threshold {
                    return Some(doc);
                }

                if let Some((_, best_text_length)) = best_attempt {
                    if text_length > best_text_length {
                        best_attempt = Some((doc, text_length));
                    }
                } else {
                    best_attempt = Some((doc, text_length));
                }
            }
            if flags.contains(GrabFlags::StripUnlikelys) {
                flags -= GrabFlags::StripUnlikelys;
            } else if flags.contains(GrabFlags::WeightClasses) {
                flags -= GrabFlags::WeightClasses;
            } else if flags.contains(GrabFlags::CleanConditionally) {
                flags -= GrabFlags::CleanConditionally;
            } else {
                // No luck after removing flags,
                // just return the longest text we found during the different loops
                let (best_doc, _) = best_attempt?;
                return Some(best_doc);
            }
        }
    }

    pub(crate) fn attempt_grab_article<'a>(
        &self,
        doc: &'a Document,
        flags: &FlagSet<GrabFlags>,
        metadata: &Metadata,
    ) -> Option<NodeRef<'a>> {
        let selection = doc.select_single("body");
        let body_node = selection.nodes().first()?;
        let strip_unlikely = flags.contains(GrabFlags::StripUnlikelys);
        let elements_to_score = collect_elements_to_score(body_node, strip_unlikely, metadata);
        let article_node = self.handle_candidates(&elements_to_score, body_node, flags);
        article_node.map(|n| NodeRef::new(n.id, &doc.tree))
    }

    fn handle_candidates<'a>(
        &self,
        elements_to_score: &[NodeRef<'a>],
        body_node: &'a NodeRef,
        flags: &FlagSet<GrabFlags>,
    ) -> Option<NodeRef<'a>> {
        let tree = body_node.tree;
        let weigh_class = flags.contains(GrabFlags::WeightClasses);
        let top_candidates = score_elements(elements_to_score, tree, &self.config, flags);

        let mut top_candidate = top_candidates.first().copied();

        let mut top_candidate_is_created = false;

        if top_candidate.is_none() || top_candidate.as_ref().is_some_and(|n| n.has_name("body")) {
            top_candidate_is_created = true;
            let tc = tree.new_element("div");

            tree.reparent_children_of(&body_node.id, Some(tc.id));
            body_node.append_child(&tc);
            init_node_score(&tc, weigh_class);
            top_candidate = Some(tc);
        } else if let Some(mut tc) = top_candidate {
            if matches!(
                self.config.candidate_select_mode,
                CandidateSelectMode::DomSmoothie
            ) {
                tc = find_common_candidate_alt(tc, &top_candidates, weigh_class);
            } else {
                // Find a better top candidate node if it contains (at least three) nodes which belong to `top_candidates` array
                // and whose scores are quite closed with current `top_candidate` node.
                tc = find_common_candidate(tc, &top_candidates, weigh_class);
            }

            // If the top candidate is the only child, use parent instead. This will help sibling
            // joining logic when adjacent content is actually located in parent's sibling node.
            let mut parent_of_top_candidate = tc.parent();

            while let Some(ref tc_parent) = parent_of_top_candidate {
                if tc_parent.has_name("body") {
                    break;
                }

                if tc_parent.element_children().len() != 1 {
                    break;
                }
                tc = *tc_parent;
                parent_of_top_candidate = tc_parent.parent();
            }
            top_candidate = Some(tc);
        }

        let tc = top_candidate.as_ref()?;

        if !has_node_score(tc) {
            init_node_score(tc, weigh_class);
        }
        // Now that we have the top candidate, look through its siblings for content
        // that might also be related. Things like preambles, content split by ads
        // that we removed, etc.
        let article_content = tree.new_element("div");
        assign_article_node(tc, &article_content);

        //prepare the article
        prep_article(&article_content, flags, &self.config);

        if top_candidate_is_created {
            tc.set_attr("id", CONTENT_ID);
            tc.set_attr("class", "page");
        } else {
            // this code does the same thing as mozilla's implementation, but it is more simpler.
            article_content.set_attr("id", CONTENT_ID);
            article_content.set_attr("class", "page");
        }

        Some(article_content)
    }
}

/// Tells whether `node` looks like something that is unlikely to hold article content.
///
/// Links (`<a>`) are never unlikely. Otherwise the node's matching string must be
/// non-empty and satisfy `match_unlikely`, and the node must not sit inside a
/// `<table>` or `<code>` ancestor.
fn is_unlikely_candidate(node: &NodeRef) -> bool {
    // It is assumed that the `<body>` node never reaches this function.
    if node.has_name("a") {
        return false;
    }

    let match_string = get_node_matching_string(node);
    let looks_unlikely = !match_string.is_empty() && match_unlikely(&match_string);
    if !looks_unlikely {
        return false;
    }

    // `Some(0)` is turned into a depth of 3 by `has_ancestor`.
    let inside_table_or_code = has_ancestor(node, Some(0), |ancestor| {
        ancestor
            .qual_name_ref()
            .is_some_and(|name| matches!(name.local.as_ref(), "table" | "code"))
    });
    !inside_table_or_code
}

/// Wraps every run of phrasing content among the children of `node` into a `<p>`.
fn div_into_p(node: &NodeRef) {
    // `wrap_phrasing_content` returns the node from which the walk continues.
    let mut cursor = node.first_child();
    while let Some(current) = cursor {
        cursor = wrap_phrasing_content(&current);
    }
}

/// If `node` is non-whitespace phrasing content, moves it and the phrasing siblings
/// that follow it into a new `<p>` inserted at its position.
///
/// Returns the sibling from which the caller should continue the walk.
fn wrap_phrasing_content<'a>(node: &NodeRef<'a>) -> Option<NodeRef<'a>> {
    if !is_phrasing_content(node) || is_whitespace(node) {
        return node.next_sibling();
    }

    // Remember the follower before `node` is moved into the paragraph.
    let mut cursor = node.next_sibling();
    let paragraph = node.tree.new_element("p");
    node.insert_before(&paragraph);
    paragraph.append_child(node);

    while let Some(candidate) = cursor {
        cursor = candidate.next_sibling();
        if !is_phrasing_content(&candidate) {
            break;
        }
        paragraph.append_child(&candidate);
    }

    // The paragraph starts with non-whitespace content, so only trailing
    // whitespace has to be trimmed.
    while let Some(tail) = paragraph.last_child().filter(|n| is_whitespace(n)) {
        tail.remove_from_parent();
    }

    cursor
}

/// Returns `true` when any descendant of `node` is an element listed in `BLOCK_ELEMS`.
fn has_child_block_element(node: &NodeRef) -> bool {
    for descendant in node.descendants_it() {
        if let Some(el) = descendant.element_ref() {
            if BLOCK_ELEMS.contains(&el.name.local) {
                return true;
            }
        }
    }
    false
}

/// Distributes content scores from `elements_to_score` to their ancestors and returns
/// the best-scoring ancestors, highest score first.
///
/// Elements with fewer than 25 counted characters are ignored. Each remaining element
/// contributes to up to five ancestors; the contribution is divided by 1 for the
/// parent, 2 for the grandparent and `level * 3` further up. Final scores above
/// `cfg.min_score_to_adjust` are scaled by `1 - link density`. At most
/// `cfg.n_top_candidates` nodes are returned, and every returned node has its score
/// stored via `set_node_score`.
fn score_elements<'a>(
    elements_to_score: &[NodeRef<'a>],
    tree: &'a Tree,
    cfg: &Config,
    flags: &FlagSet<GrabFlags>,
) -> Vec<NodeRef<'a>> {
    let weigh_class = flags.contains(GrabFlags::WeightClasses);
    let mut scores: HashMap<NodeId, f32> = HashMap::default();
    let mut cc_cache = CharCounterCache::default();

    for element in elements_to_score {
        let content_len = cc_cache.char_count(element);
        if content_len < 25 {
            continue;
        }
        // These elements have at least one ancestor -- their parent.
        let ancestors = element.ancestors(Some(5));

        // Base points, plus the text-content score, plus one point per 100 characters
        // (capped at 3).
        let content_score =
            2 + score_text_content(element) + std::cmp::min(content_len / 100, 3);

        for (level, ancestor) in ancestors.iter().enumerate() {
            if !ancestor.is_element() || ancestor.parent().is_none() {
                continue;
            }

            let score_divider: f32 = match level {
                0 => 1.0,
                1 => 2.0,
                _ => (level * 3) as f32,
            };

            // An ancestor seen for the first time starts from its own node score.
            let slot = scores
                .entry(ancestor.id)
                .or_insert_with(|| determine_node_score(ancestor, weigh_class));
            *slot += content_score as f32 / score_divider;

            if ancestor.has_name("body") {
                break;
            }
        }
    }

    // Scale the final candidates' scores by link density. Good content should have a
    // relatively small link density (5% or less) and be mostly unaffected by this.
    let mut ranked: Vec<(NodeRef<'a>, f32)> = scores
        .into_iter()
        .filter_map(|(node_id, raw_score)| {
            if raw_score > 0.0 {
                let candidate = NodeRef::new(node_id, tree);
                // Low scores are left unadjusted.
                let final_score = if raw_score > cfg.min_score_to_adjust {
                    let density = link_density_fn(&candidate, None, |n| cc_cache.char_count(n));
                    raw_score * (1.0 - density)
                } else {
                    raw_score
                };
                set_node_score(&candidate, final_score);
                Some((candidate, final_score))
            } else {
                None
            }
        })
        .collect();

    ranked.sort_by(|lhs, rhs| rhs.1.partial_cmp(&lhs.1).unwrap());
    ranked.truncate(cfg.n_top_candidates);

    ranked.into_iter().map(|(node, _)| node).collect()
}

/// Moves the top candidate `tc` and its related siblings into `article_content`, then
/// appends `article_content` to the parent of `tc`.
///
/// A sibling is taken when it is `tc` itself, when its score (plus a bonus for sharing
/// `tc`'s class) reaches the threshold, or when it is an unscored `<p>` that looks
/// like real text.
fn assign_article_node(tc: &NodeRef, article_content: &NodeRef) {
    let tc_node_score = get_node_score(tc);
    // The threshold is 20% of the candidate's score, but never below 10.
    let scaled_threshold = tc_node_score * 0.2;
    let sibling_score_threshold = if scaled_threshold < 10.0 {
        10.0
    } else {
        scaled_threshold
    };

    let Some(tc_parent) = tc.parent() else {
        unreachable!("Top candidate must have a parent")
    };

    let tc_class = tc.attr_or("class", "");
    let siblings: Vec<NodeRef> = tc_parent.element_children();

    for sibling in &siblings {
        let append = if sibling.id == tc.id {
            true
        } else {
            // Siblings sharing the candidate's (non-empty) class get a bonus.
            let content_bonus: f32 =
                if !tc_class.is_empty() && sibling.attr_or("class", "") == tc_class {
                    tc_node_score * 0.2
                } else {
                    0.0
                };

            let sibling_score = get_node_score(sibling);
            if sibling_score > 0.0 {
                sibling_score + content_bonus >= sibling_score_threshold
            } else if sibling.has_name("p") {
                let node_content = normalize_spaces(&sibling.text());
                let node_length = node_content.chars().count();
                let density = link_density(sibling, Some(node_length));

                let long_with_few_links = node_length > 80 && density < 0.25;
                let short_plain_sentence = node_length < 80
                    && node_length > 0
                    && density == 0.0
                    && is_sentence(&node_content);
                long_with_few_links || short_plain_sentence
            } else {
                false
            }
        };

        if append {
            if !node_name_in(sibling, &ALTER_TO_DIV_EXCEPTIONS) {
                // The node isn't a common block-level element (e.g. a form or td tag).
                // Turn it into a div so it doesn't get filtered out later by accident.
                sibling.rename("div");
            }
            article_content.append_child(&sibling.id);
        }
    }
    tc_parent.append_child(article_content);
}

/// Find a better top candidate across other candidates in a way that `mozilla/readability` does.
fn find_common_candidate<'a>(
    mut top_candidate: NodeRef<'a>,
    top_candidates: &[NodeRef<'a>],
    weigh_class: bool,
) -> NodeRef<'a> {
    let tc = &mut top_candidate;
    let tc_score = get_node_score(tc);

    let mut alternative_candidate_ancestors = vec![];
    for alt in top_candidates.iter().skip(1) {
        if get_node_score(alt) / tc_score >= 0.75 {
            alternative_candidate_ancestors.push(alt.ancestors(Some(0)));
        }
    }
    // MIN_COMMON_ANCESTORS (in mozilla/readability.js -- MINIMUM_TOPCANDIDATES)
    // represents the number of top candidates' ancestors that may be common.
    // The idea is good, but this magic number doesn't always work very well.
    // For example, imagine we have only two candidates, and both are significant.
    // So, we end up with one top candidate and another candidate.
    // However, the second candidate will be excluded in the end because we require
    // at least three (!) lists of ancestors,
    // which is impossible to derive from just one candidate.
    // To adjust the top candidate to share a common ancestor with other candidates,
    // we would need at least three other candidates.
    // Currently, I consider this approach to be flawed...

    if alternative_candidate_ancestors.len() > MIN_COMMON_ANCESTORS {
        let mut parent_of_top_candidate = tc.parent();
        while let Some(ref tc_parent) = parent_of_top_candidate {
            if tc_parent.has_name("body") {
                break;
            }

            let mut lists_containing_this_ancestor = 0;

            for alt_ancestor in &alternative_candidate_ancestors {
                if alt_ancestor.iter().any(|n| n.id == tc_parent.id) {
                    lists_containing_this_ancestor += 1;
                }
            }

            if lists_containing_this_ancestor >= MIN_COMMON_ANCESTORS {
                top_candidate = *tc_parent;
                break;
            }

            parent_of_top_candidate = tc_parent.parent();
        }
    }

    top_candidate = adjust_top_candidate_by_parent(top_candidate, weigh_class);

    top_candidate
}

/// Finds a better top candidate across the other candidates (alternative approach).
///
/// For every other candidate scoring at least 75% of the top candidate and not located
/// inside it, the scored ancestors shared with the top candidate are counted. One of
/// those shared ancestors replaces the top candidate if its score exceeds a third of
/// the top candidate's score; otherwise the top candidate goes through
/// `adjust_top_candidate_by_parent`.
fn find_common_candidate_alt<'a>(
    top_candidate: NodeRef<'a>,
    top_candidates: &[NodeRef<'a>],
    weigh_class: bool,
) -> NodeRef<'a> {
    if top_candidates.len() < 2 {
        return top_candidate;
    }

    let tc_ancestors = get_node_ancestors_set(&top_candidate);
    let tc_score = get_node_score(&top_candidate);

    let mut shared_counts: HashMap<NodeId, usize> = HashMap::default();

    let close_rivals = top_candidates
        .iter()
        .skip(1)
        .filter(|alt| get_node_score(alt) / tc_score >= 0.75);

    for alt in close_rivals {
        let alt_ancestors = get_node_ancestors_set(alt);
        // A rival nested inside the top candidate tells nothing about common ancestors.
        if alt_ancestors.contains(&top_candidate.id) {
            continue;
        }
        for shared_id in tc_ancestors.intersection(&alt_ancestors) {
            *shared_counts.entry(*shared_id).or_insert(0) += 1;
        }
    }

    // Pick the shared ancestor with the greatest node id. The ids are unique map keys,
    // so the secondary comparison by match count never breaks a tie.
    // NOTE(review): this relies on node id order reflecting closeness to the top
    // candidate -- confirm.
    let preferred = shared_counts
        .into_iter()
        .max_by(|x, y| x.0.cmp(&y.0).then(x.1.cmp(&y.1)))
        .map(|(id, _)| NodeRef::new(id, top_candidate.tree))
        .filter(|best| get_node_score(best) > tc_score / 3.0);

    match preferred {
        Some(best) => best,
        None => adjust_top_candidate_by_parent(top_candidate, weigh_class),
    }
}

fn get_node_ancestors_set(node: &NodeRef) -> HashSet<NodeId> {
    // only elements, no html or body, and have a score
    node.ancestors(Some(0))
        .iter()
        .filter(|n| {
            n.is_element()
                && !matches!(n.node_name().as_deref(), Some("html" | "body"))
                && has_node_score(n)
        })
        .map(|n| n.id)
        .collect::<HashSet<_>>()
}

fn adjust_top_candidate_by_parent(
    mut top_candidate: NodeRef<'_>,
    weigh_class: bool,
) -> NodeRef<'_> {
    let tc = &mut top_candidate;
    if !has_node_score(tc) {
        init_node_score(tc, weigh_class);
    }
    // Because of our bonus system, parents of candidates might have scores
    // themselves. They get half of the node. There won't be nodes with higher
    // scores than our `top_candidate`, but if we see the score going *up* in the first
    // few steps up the tree, that's a decent sign that there might be more content
    // lurking in other places that we want to unify in. The sibling stuff
    // below does some of that - but only if we've looked high enough up the DOM
    // tree.
    let mut last_score = get_node_score(tc);
    let score_threshold = last_score / 3.0;
    let mut parent_of_top_candidate = tc.parent();
    while let Some(ref tc_parent) = parent_of_top_candidate {
        if tc_parent.has_name("body") {
            break;
        }

        if !has_node_score(tc_parent) {
            parent_of_top_candidate = tc_parent.parent();
            continue;
        }

        let parent_score = get_node_score(tc_parent);
        if parent_score < score_threshold {
            break;
        }
        if parent_score > last_score {
            top_candidate = *tc_parent;
            break;
        }
        last_score = parent_score;
        parent_of_top_candidate = tc_parent.parent();
    }
    top_candidate
}

/// Collects the nodes to score. It also removes unlikely candidates and elements without content.
///
/// Walks the tree below `root_node` with `next_child_or_sibling` and mutates it on the way:
/// - nodes that are not probably visible, dialog matches, the first heading similar to
///   `metadata.title`, unlikely candidates (only when `strip_unlikely` is set) and
///   content-less elements from `TAGS_WITH_CONTENT` are detached;
/// - `<svg>` subtrees are skipped without being modified;
/// - `<div>` nodes get their phrasing content wrapped into `<p>` and may be replaced by
///   their single `<p>` child or renamed to `<p>`.
///
/// Returns the elements that should be scored, in the order they were collected.
fn collect_elements_to_score<'a>(
    root_node: &'a NodeRef,
    strip_unlikely: bool,
    metadata: &Metadata,
) -> Vec<NodeRef<'a>> {
    let tree = &root_node.tree;
    // Ids are collected first and turned into node references at the very end.
    let mut elements_id_to_score: Vec<NodeId> = vec![];
    // Only one title-like heading is removed, and only when a title is known.
    let mut should_remove_title_header = !metadata.title.is_empty();
    let mut next_node = next_child_or_sibling(root_node, false);
    while let Some(mut node) = next_node {
        // In every removal branch below the next node is determined *before*
        // the current node is detached from its parent.
        if !is_probably_visible(&node) {
            next_node = next_child_or_sibling(&node, true);
            node.remove_from_parent();
            continue;
        }

        // `<svg>` is kept as is; the walk continues without descending into it.
        if node.has_name("svg") {
            next_node = next_child_or_sibling(&node, true);
            continue;
        }

        if MATCHER_DIALOGS.match_element(&node) {
            next_node = next_child_or_sibling(&node, true);
            node.remove_from_parent();
            continue;
        }

        // Remove a heading whose text is similar to the title (similarity above 0.75).
        if should_remove_title_header
            && MATCHER_HEADING.match_element(&node)
            && text_similarity(&metadata.title, &node.text()) > 0.75
        {
            should_remove_title_header = false;
            next_node = next_child_or_sibling(&node, true);
            node.remove_from_parent();
            continue;
        }

        if strip_unlikely {
            // Strip by class/id heuristics or by an unlikely `role` attribute.
            let strip = is_unlikely_candidate(&node)
                || node
                    .attr("role")
                    .is_some_and(|role| UNLIKELY_ROLES.contains(&role));
            if strip {
                next_node = next_child_or_sibling(&node, true);
                node.remove_from_parent();
                continue;
            }
        }
        // Drop elements that are expected to carry content but have none.
        if node_name_in(&node, &TAGS_WITH_CONTENT) && is_element_without_content(&node) {
            next_node = next_child_or_sibling(&node, true);
            node.remove_from_parent();
            continue;
        }

        if node_name_in(&node, &DEFAULT_TAGS_TO_SCORE) {
            elements_id_to_score.push(node.id);
        }

        if node.has_name("div") {
            div_into_p(&node);

            // Sites like http://mobile.slate.com enclose each paragraph in a DIV
            // element. DIVs with only a P element inside and no text content can be
            // safely converted into plain P elements to avoid confusing the scoring
            // algorithm with DIVs which are, in practice, paragraphs.

            // Check `p` first (cheap), then link density (expensive).
            let single_p: Option<NodeRef<'_>> =
                single_child_element(&node, "p").filter(|_| link_density(&node, None) < 0.25);
            if let Some(new_node) = single_p {
                node.replace_with(&new_node);
                elements_id_to_score.push(new_node.id);
                // Continue the walk from the paragraph that took the div's place.
                node = new_node;
            } else if !has_child_block_element(&node) {
                // A div without block-level descendants is effectively a paragraph.
                node.rename("p");
                elements_id_to_score.push(node.id);
            }
        }
        next_node = next_child_or_sibling(&node, false);
    }
    elements_id_to_score
        .iter()
        .map(|n| NodeRef::new(*n, tree))
        .collect()
}

/// Returns `true` when `haystack` contains an "unlikely candidate" pattern and no
/// "maybe candidate" pattern (byte-pattern implementation).
#[cfg(not(feature = "aho-corasick"))]
fn match_unlikely(haystack: &str) -> bool {
    let check = BytePatternCheck::new(haystack);

    check.contains_any(UNLIKELY_CANDIDATES) && !check.contains_any(MAYBE_CANDIDATES)
}

/// Returns `true` when `haystack` matches the "unlikely" automaton and does not match
/// the "maybe" automaton (Aho-Corasick implementation).
#[cfg(feature = "aho-corasick")]
fn match_unlikely(haystack: &str) -> bool {
    crate::ac_automat::AC_UNLIKELY.is_match(haystack)
        && !crate::ac_automat::AC_MAYBE.is_match(haystack)
}

// Unit tests for the node collection / cleanup performed before scoring.
#[cfg(test)]
mod tests {

    use super::*;
    use crate::readability::Readability;

    // Paragraphs hidden through `hidden`, `aria-hidden` or an invisible inline style
    // are removed; six paragraphs are expected to remain.
    #[test]
    fn test_removing_probably_invisible_nodes() {
        let contents = r#"<!DOCTYPE>
        <html>
            <head><title>Test</title></head>
            <body>
                 <p hidden>This paragraph should be hidden.</p>
                 <p aria-hidden="true">This paragraph should be hidden.</p>
                 <p style="display:none">This paragraph should be hidden.</p>
                 <p style="display: none !important">This paragraph should be hidden.</p>
                 <p style="display: none!important">This paragraph should be visible.</p>
                 <p style="display:">This paragraph should be visible.</p>
                 <p style="display">This paragraph should be visible.</p>
                 <p style=":">This paragraph should be visible.</p>
                 <p style="visibility:hidden">This paragraph should be hidden.</p>
                 <p aria-hidden="true" class="mwe-math-fallback-image-inline">123*123</p>
                 <p>This paragraph is visible</p>
                 <p style="DISPLAY: NONE">This paragraph should be hidden.</p>
                 <p style="display: none; visibility: visible">This paragraph should be hidden.</p>
                 <p style="font-family: 'Times New Roman'; display: none">This paragraph should be hidden.</p>
            </body>
        </html>"#;

        let doc = Document::from(contents);
        let body = doc.body().unwrap();
        collect_elements_to_score(&body, true, &Metadata::default());
        assert_eq!(6, doc.select("p").length());
    }

    // A modal dialog is removed together with everything inside it.
    #[test]
    fn test_remove_dialog() {
        let contents = r#"<!DOCTYPE>
        <html>
            <head><title>Test</title></head>
            <body>
                 <div id="dialog1" role="dialog" aria-modal="true">
                    <h2>Test dialog<h2>
                    <button id="close1">Close</button>
                 </div>
            </body>
        </html>"#;

        let doc = Document::from(contents);
        assert!(doc.select("#dialog1").exists());
        let body = doc.body().unwrap();
        collect_elements_to_score(&body, true, &Metadata::default());

        assert!(!doc.select("#dialog1").exists());
        assert!(!doc.select("#close1").exists());
    }

    // After collection with stripping enabled, no element with a `role` attribute remains.
    #[test]
    fn test_unlikely_roles() {
        let contents = r#"<!DOCTYPE>
        <html>
            <head><title>Test</title></head>
            <body>
                 <div id="dialog1" role="dialog">
                    <h2>Test dialog<h2>
                    <button id="close1">Close</button>
                 </div>
                 <nav id="nav1" role="navigation"></nav>
            </body>
        </html>"#;

        let doc = Document::from(contents);
        assert!(doc.select("*[role]").exists());
        let body = doc.body().unwrap();

        collect_elements_to_score(&body, true, &Metadata::default());
        assert!(!doc.select("*[role]").exists());
    }

    // Empty container and heading elements are dropped; a single element child
    // of `<body>` is expected after grabbing the article.
    #[test]
    fn test_remove_empty() {
        let contents = r"<!DOCTYPE>
        <html>
            <head><title>Test</title></head>
            <body>
                 <p>This paragraph is visible</p>
                 <header></header>
                 <section></section>
                 <div></div>
                 <h1></h1>
                 <h2></h2>
                 <h3></h3>
                 <h4></h4>
                 <h5></h5>
                 <h6></h6>
            </body>
        </html>";

        let ra = Readability::new(contents, None, None).unwrap();
        let sel = ra.doc.select("body > *");
        let count_before = sel.nodes().iter().filter(|n| n.is_element()).count();

        assert_eq!(count_before, 10);

        let clean_doc = ra.grab_article(&Metadata::default()).unwrap();
        let sel = clean_doc.select("body > *");
        let count_after = sel.nodes().iter().filter(|n| n.is_element()).count();
        assert_eq!(count_after, 1);
    }

    // A heading that duplicates the article title is removed.
    #[test]
    fn test_remove_title_duplicates() {
        let contents = r"<!DOCTYPE>
        <html>
            <head><title>Rust (programming language) - Wikipedia</title></head>
            <body>
                 <h1>Rust (programming language)</h1>
            </body>
        </html>";

        let readability = Readability::from(contents);
        let metadata = readability.get_article_metadata(None);
        let body = readability.doc.body().unwrap();

        assert!(readability.doc.select("h1").exists());
        collect_elements_to_score(&body, true, &metadata);
        assert!(!readability.doc.select("h1").exists());
    }

    // A `div` with the class `banner` is removed as an unlikely candidate.
    #[test]
    fn test_remove_unlikely_candidates() {
        let contents = r#"<!DOCTYPE>
        <html>
            <head><title>Test</title></head>
            <body>
                 <h1>Test</h1>
                 <div class="banner">Some annoying content</div>
            </body>
        </html>"#;

        let doc = Document::from(contents);
        assert!(doc.select("div.banner").exists());
        let body = doc.body().unwrap();
        collect_elements_to_score(&body, true, &Metadata::default());
        assert!(!doc.select("div.banner").exists());
    }
    // The same class on a link is not removed: `<a>` is never an unlikely candidate.
    #[test]
    fn test_skip_ok_maybe_candidates() {
        let contents = r#"<!DOCTYPE>
        <html>
            <head><title>Test</title></head>
            <body>
                 <h1>Test</h1>
                 <a class="banner">Some annoying content</a>
            </body>
        </html>"#;

        let doc = Document::from(contents);
        assert!(doc.select("a.banner").exists());
        let body = doc.body().unwrap();
        collect_elements_to_score(&body, true, &Metadata::default());
        assert!(doc.select("a.banner").exists());
    }
}


================================================
FILE: src/grab_flags.rs
================================================
use flagset::flags;

// Declares the `GrabFlags` enum through the `flagset::flags!` macro, with `u8` as
// the underlying representation, so the variants can be combined in a `FlagSet`.
flags! {
    /// Flags for the grab function, controlling different heuristics for content extraction.
    pub enum GrabFlags: u8 {
        /// Removes elements that are unlikely to be part of the main content.
        StripUnlikelys,
        /// Considers element class and id attributes when calculating content scores.
        WeightClasses,
        /// Applies additional content cleaning after identifying the main content.
        CleanConditionally,
    }
}

#[cfg(test)]
mod tests {
    use flagset::FlagSet;

    use super::*;

    // A full flag set contains `StripUnlikelys` until that flag is subtracted.
    #[test]
    fn test_grab_flags() {
        let mut active = FlagSet::<GrabFlags>::full();
        assert!(active.contains(GrabFlags::StripUnlikelys));

        active -= GrabFlags::StripUnlikelys;
        assert!(!active.contains(GrabFlags::StripUnlikelys));
    }
}


================================================
FILE: src/helpers.rs
================================================
use std::collections::HashSet;

use foldhash::HashMap;
use unicode_segmentation::UnicodeSegmentation;

use dom_query::{NodeId, NodeRef, Selection};

use tendril::StrTendril;

use crate::glob::{MINI_ARIA_HIDDEN, MINI_FALLBACK_IMG, PHRASING_ELEMS};
use crate::matching::is_invisible_style;

/// Measures how much of `text_b` is covered by `text_a`, as a value between 0.0 and 1.0.
///
/// Both texts are lowercased first. The result is
/// - `0.0` when either text is empty, or when `text_b` contains no word tokens;
/// - `1.0` when `text_a` contains `text_b` as a substring;
/// - otherwise `1 - (characters of the words of text_b missing from text_a) /
///   (characters of all words of text_b)`, with words obtained via Unicode word
///   segmentation.
pub(crate) fn text_similarity(text_a: &str, text_b: &str) -> f64 {
    if text_a.is_empty() || text_b.is_empty() {
        return 0.0;
    }

    let a = text_a.to_lowercase();
    let b = text_b.to_lowercase();

    if a.contains(&b) {
        return 1.0;
    }

    let tokens_a: HashSet<&str> = a.unicode_words().collect();
    let tokens_b: Vec<&str> = b.unicode_words().collect();

    let b_chars_total: usize = tokens_b.iter().map(|s| s.chars().count()).sum();
    // `text_b` may consist of punctuation or whitespace only. Without this guard the
    // division below would be `0.0 / 0.0` and the function would return NaN.
    if b_chars_total == 0 {
        return 0.0;
    }

    let b_chars_unique: usize = tokens_b
        .iter()
        .filter(|&&s| !tokens_a.contains(s))
        .map(|s| s.chars().count())
        .sum();

    let distance_b = b_chars_unique as f64 / b_chars_total as f64;
    1.0 - distance_b
}

/// Tells whether `node` is phrasing (inline) content.
///
/// Text nodes and elements listed in `PHRASING_ELEMS` always qualify. `<a>`, `<del>`
/// and `<ins>` qualify only when they have at least one child and all of their
/// children are phrasing content themselves.
pub(crate) fn is_phrasing_content(node: &NodeRef) -> bool {
    if node.is_text() {
        return true;
    }

    // Only elements have a qualified name.
    let Some(qual_name) = node.qual_name_ref() else {
        return false;
    };
    let node_name = qual_name.local.as_ref();
    if PHRASING_ELEMS.contains(node_name) {
        return true;
    }

    match node_name {
        "a" | "del" | "ins" => {
            // A childless link-like element is deliberately not treated as phrasing content.
            let children = node.children();
            !children.is_empty() && children.into_iter().all(|n| is_phrasing_content(&n))
        }
        _ => false,
    }
}

/// Returns `true` for text nodes that are not non-empty text and for `<br>` elements.
pub(crate) fn is_whitespace(node: &NodeRef) -> bool {
    let blank_text = node.is_text() && !node.is_nonempty_text();
    // Only an element node can have the name `br`.
    blank_text || node.has_name("br")
}

/// Checks whether any ancestor of `node` satisfies `filter_fn`.
///
/// `max_depth` is passed on to `ancestors_it`, except that `Some(0)` is replaced
/// by the default depth of 3. `None` is passed through unchanged.
pub(crate) fn has_ancestor<F>(node: &NodeRef, max_depth: Option<usize>, filter_fn: F) -> bool
where
    F: Fn(&NodeRef) -> bool,
{
    let depth_limit = match max_depth {
        Some(0) => Some(3),
        other => other,
    };
    node.ancestors_it(depth_limit)
        .any(|ancestor| filter_fn(&ancestor))
}

/// Ratio between the normalized character count of the nodes matched by `selector`
/// under `node` and the normalized character count of `node` itself.
///
/// `char_count`, when given, is used as the character count of `node`. Returns `0.0`
/// when either count is zero.
pub(crate) fn text_density(node: &NodeRef, selector: &str, char_count: Option<usize>) -> f32 {
    let matched = Selection::from(*node).select(selector);

    let matched_chars: usize = matched
        .nodes()
        .iter()
        .map(|n| n.normalized_char_count())
        .sum();
    let children_length = matched_chars as f32;
    if children_length == 0.0 {
        return 0.0;
    }

    let total_chars = match char_count {
        Some(count) => count,
        None => node.normalized_char_count(),
    };
    let text_length = total_chars as f32;
    if text_length == 0.0 {
        return 0.0;
    }

    children_length / text_length
}

/// Collapses every run of whitespace in `text` into a single space and drops leading
/// and trailing whitespace.
pub(crate) fn normalize_spaces(text: &str) -> String {
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ")
}

/// Computes the share of the text of `node` that belongs to `<a>` descendants, using
/// `count_fn` to measure text length.
///
/// Links whose `href` is a fragment reference (starts with `#` and is longer than one
/// character) are weighted by 0.3. `char_count`, when given, is used as the text
/// length of `node`. Returns `0.0` when there is no link text or no text at all.
pub(crate) fn link_density_fn<F>(node: &NodeRef, char_count: Option<usize>, mut count_fn: F) -> f32
where
    F: FnMut(&NodeRef) -> usize,
{
    let mut link_length = 0f32;

    for anchor in node.find_descendants("a") {
        let href = anchor.attr_or("href", "");
        let is_fragment_link = href.len() > 1 && href.starts_with('#');
        let weight: f32 = if is_fragment_link { 0.3 } else { 1.0 };
        link_length += count_fn(&anchor) as f32 * weight;
    }

    if link_length == 0.0 {
        return 0.0;
    }

    let total_chars = match char_count {
        Some(count) => count,
        None => count_fn(node),
    };
    let text_length = total_chars as f32;
    if text_length == 0.0 {
        return 0.0;
    }

    link_length / text_length
}

/// Link density of `node`, measured with `normalized_char_count`.
/// See `link_density_fn` for the meaning of `char_count`.
pub(crate) fn link_density(node: &NodeRef, char_count: Option<usize>) -> f32 {
    let count_normalized = |n: &NodeRef| n.normalized_char_count();
    link_density_fn(node, char_count, count_normalized)
}

/// Returns the only element child of `node` when that child is named `tag`
/// and `node` has no non-empty text node among its children.
///
/// Returns `None` if the number of element children differs from one, if the
/// child has another name, or if any non-empty text child is present.
pub(crate) fn single_child_element<'a>(node: &NodeRef<'a>, tag: &str) -> Option<NodeRef<'a>> {
    let element_children = node.element_children();
    let only_child = match (element_children.len(), element_children.first()) {
        (1, Some(only)) => *only,
        _ => return None,
    };

    // The name check is cheaper, so it runs before scanning the text children.
    if !only_child.has_name(tag) || node.children_it(false).any(|n| n.is_nonempty_text()) {
        return None;
    }
    Some(only_child)
}

/// Reports whether `node` carries no content: no non-empty text anywhere
/// below it, and either no element children at all or exactly as many element
/// children as there are `<br>` plus `<hr>` descendants.
///
/// The caller is expected to pass an element; this is not re-checked here.
pub(crate) fn is_element_without_content(node: &NodeRef) -> bool {
    if node.descendants_it().any(|n| n.is_nonempty_text()) {
        return false;
    }

    let element_children = node.children_it(false).filter(|n| n.is_element()).count();

    element_children == 0
        || element_children
            == node.find_descendants("br").len() + node.find_descendants("hr").len()
}

/// Looks up a `dir` attribute starting from the first child of `node`.
///
/// The first child's own `dir` attribute wins; otherwise the first `dir`
/// attribute found while walking that child's ancestors is returned.
/// Returns `None` when `node` has no children or no `dir` is found.
pub(crate) fn get_dir_attr(node: &NodeRef) -> Option<String> {
    let first_child = node.first_child()?;
    first_child
        .attr("dir")
        .or_else(|| first_child.ancestors_it(None).find_map(|a| a.attr("dir")))
        .map(|dir| dir.to_string())
}

/// Reports whether the local name of `node` is a member of `names`.
///
/// Returns `false` when the node has no qualified name.
pub(crate) fn node_name_in(node: &NodeRef, names: &phf::Set<&str>) -> bool {
    match node.qual_name_ref() {
        Some(name) => names.contains(name.local.as_ref()),
        None => false,
    }
}

/// Heuristic visibility check for `node`.
///
/// A node is treated as invisible when it has a `hidden` attribute, when
/// `is_invisible_style` reports an invisible inline style, or when it matches
/// `MINI_ARIA_HIDDEN` without also matching `MINI_FALLBACK_IMG`.
pub(crate) fn is_probably_visible(node: &NodeRef) -> bool {
    !node.has_attr("hidden")
        && !is_invisible_style(node)
        && (!MINI_ARIA_HIDDEN.match_node(node) || MINI_FALLBACK_IMG.match_node(node))
}

/// Builds the lowercase string used for class/id matching of `node`.
///
/// The values of every `class` and `id` attribute are appended in attribute
/// order, each followed by a single space, and the result is ASCII-lowercased.
/// An empty tendril is returned when `node` is not an element.
pub(crate) fn get_node_matching_string(node: &NodeRef) -> StrTendril {
    let Some(el) = node.element_ref() else {
        return StrTendril::new();
    };

    let mut matching = StrTendril::new();
    let relevant = el
        .attrs
        .iter()
        .filter(|attr| matches!(attr.name.local.as_ref(), "class" | "id"));
    for attr in relevant {
        matching.push_tendril(&attr.value);
        matching.push_char(' ');
    }

    matching.make_ascii_lowercase();
    matching
}

/// Returns the next element in document order after `node`.
///
/// Unless `ignore_child` is set, the first element child is preferred.
/// Otherwise the next element sibling is returned, and failing that, the next
/// element sibling of the closest ancestor that has one. `None` means the
/// traversal ran out of nodes.
pub(crate) fn next_child_or_sibling<'a>(
    node: &NodeRef<'a>,
    ignore_child: bool,
) -> Option<NodeRef<'a>> {
    if !ignore_child {
        if let Some(child) = node.first_element_child() {
            return Some(child);
        }
    }

    // Climb from the node itself towards the root until some level offers a
    // following element sibling.
    let mut current = *node;
    loop {
        if let Some(sibling) = current.next_element_sibling() {
            return Some(sibling);
        }
        current = current.parent()?;
    }
}

#[cfg(not(feature = "aho-corasick"))]
/// Substring matcher with a cheap per-byte pre-filter.
///
/// On construction it records which byte values occur in the haystack, so a
/// pattern containing a byte that never occurs can be rejected without
/// running a substring search.
pub(crate) struct BytePatternCheck<'a> {
    haystack: &'a str,
    /// `char_map[b]` is non-zero iff byte `b` occurs in `haystack`.
    char_map: [u8; 256],
}

#[cfg(not(feature = "aho-corasick"))]
impl<'a> BytePatternCheck<'a> {
    /// Creates a checker for `haystack`, indexing the bytes it contains.
    pub(crate) fn new(haystack: &'a str) -> Self {
        let mut char_map = [0u8; 256];
        haystack
            .bytes()
            .for_each(|byte| char_map[usize::from(byte)] = 1);
        Self { haystack, char_map }
    }

    /// Returns `false` as soon as `pat` contains a byte absent from the haystack.
    #[inline]
    fn pre_check(&self, pat: &str) -> bool {
        pat.bytes()
            .all(|byte| self.char_map[usize::from(byte)] != 0)
    }

    /// Reports whether the haystack contains at least one of `pats`.
    ///
    /// Each pattern goes through the byte pre-check before `str::contains`.
    pub(crate) fn contains_any(&self, pats: &[&str]) -> bool {
        for pat in pats {
            if self.pre_check(pat) && self.haystack.contains(pat) {
                return true;
            }
        }
        false
    }
}

/// Memoizes `normalized_char_count` results, keyed by node id.
#[derive(Default)]
pub(crate) struct CharCounterCache {
    inner: HashMap<NodeId, usize>,
}

impl CharCounterCache {
    /// Returns the normalized character count of `node`, computing and storing
    /// it on first request and serving the stored value afterwards.
    pub(crate) fn char_count(&mut self, node: &NodeRef) -> usize {
        if let Some(&cached) = self.inner.get(&node.id) {
            return cached;
        }
        let counted = node.normalized_char_count();
        self.inner.insert(node.id, counted);
        counted
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the string helpers of this module.

    use super::*;

    // Maximum distance from 1.0 at which a similarity is treated as "equal to 1".
    const FLOAT_TOLERANCE: f64 = 0.00001;

    /// Dropping one of four words must still yield a similarity above 0.75.
    #[test]
    fn test_text_similarity() {
        let text_a = "The quick brown fox";
        let text_b = "The quick fox";
        let similarity = text_similarity(text_a, text_b);
        assert!(similarity > 0.75);
    }

    /// A second text whose words (in a different letter case) all occur in the
    /// first one must score (almost) 1.0.
    #[test]
    fn test_text_similarity_contains() {
        let text_a = "the quick brown fox jumps over the lazy dog";
        let text_b = "The Quick Brown Fox";
        let similarity = text_similarity(text_a, text_b);
        assert!(1.0 - similarity < FLOAT_TOLERANCE);
    }

    /// Same expectation for non-Latin text where the second string is a prefix
    /// of the first.
    #[test]
    fn test_text_similarity_similar() {
        let text_a = "DeepMind新电脑已可利用记忆自学 人工智能迈上新台阶_科技_腾讯网";
        let text_b = "DeepMind新电脑已可利用记忆自学 人工智能迈上新台阶";
        let similarity = text_similarity(text_a, text_b);
        assert!(1.0 - similarity < FLOAT_TOLERANCE);
    }

    /// Leading/trailing whitespace is removed and inner runs (including the
    /// newline) collapse into single spaces.
    #[test]
    fn test_normalize_spaces() {
        let text = "    The quick  brown  fox\n jumps over the lazy dog. ";
        let normalized_text = normalize_spaces(text);
        let expected = "The quick brown fox jumps over the lazy dog.";
        assert_eq!(expected, normalized_text);
    }

    /// Exercises `BytePatternCheck` with ASCII and multi-byte patterns; the
    /// last pattern differs from the haystack word only by an umlaut.
    #[test]
    #[cfg(not(feature = "aho-corasick"))]
    fn test_ascii_pattern_check() {
        let class = "article primary main äußerlich konnen";
        let check = BytePatternCheck::new(class);
        assert!(check.contains_any(&["äußerlich"]));
        assert!(check.contains_any(&["article", "post"]));
        assert!(!check.contains_any(&["können"]));
    }
}


================================================
FILE: src/lib.rs
================================================
mod config;
mod glob;
mod grab;
mod grab_flags;
mod helpers;
mod matching;
mod prep_article;
mod readability;
mod readable;
mod score;
#[cfg(feature = "serde")]
mod serde_helpers;
mod url_helpers;

#[cfg(feature = "aho-corasick")]
mod ac_automat;

pub use config::{CandidateSelectMode, Config, ParsePolicy, TextMode};
pub use readability::Article;
pub use readability::Metadata;
pub use readability::Readability;
pub use readable::is_probably_readable;

use thiserror::Error;

/// Errors reported by this crate.
#[derive(Error, Debug)]
pub enum ReadabilityError {
    /// The document URL is not absolute.
    #[error("the document URL must be absolute")]
    BadDocumentURL,
    /// The article could not be grabbed from the document.
    #[error("failed to grab the article")]
    GrabFailed,
    /// The document has more elements than allowed.
    /// The fields are, in order, the number found and the maximum.
    #[error("too many elements in the document to parse (found {0}, maximum {1})")]
    TooManyElements(usize, usize),
}


================================================
FILE: src/matching.rs
================================================
//! Functions below replace regex-based validation with explicit string matching
//! for better maintainability and performance

use tendril::StrTendril;

use dom_query::NodeRef;

#[allow(clippy::wildcard_imports)]
use crate::glob::*;

/// Reports whether the inline `style` attribute of `node` declares
/// `display: none` or `visibility: hidden` (compared case-insensitively for
/// ASCII letters). Nodes without a `style` attribute are never invisible here.
pub(crate) fn is_invisible_style(node: &NodeRef) -> bool {
    let Some(mut style) = node.attr("style") else {
        return false;
    };
    style.make_ascii_lowercase();

    [("display", "none"), ("visibility", "hidden")]
        .iter()
        .any(|(key, val)| style_has_kv(&style, key, val))
}

/// Reports whether an inline CSS `style` string assigns `val` to `key`.
///
/// For every occurrence of `key`, the text between the next `:` and the
/// following `;` (or the end of the string) is taken as the declaration value,
/// and its first whitespace-delimited token is compared with `val`. Trailing
/// tokens such as `!important` are therefore ignored.
///
/// Every occurrence of `key` is examined, not just the first one, so a longer
/// property name that merely contains `key` (for example `content-visibility`
/// when looking for `visibility`) cannot mask a later real declaration.
///
/// The comparison is case-sensitive; callers are expected to lowercase `style`.
fn style_has_kv(style: &str, key: &str, val: &str) -> bool {
    style.match_indices(key).any(|(pos, _)| {
        // Without a colon after this occurrence there is no declaration.
        let Some((_, after_colon)) = style[pos..].split_once(':') else {
            return false;
        };
        // The declaration value ends at the next `;` or at the end of the string.
        let declaration = after_colon.split(';').next().unwrap_or(after_colon);
        declaration.split_whitespace().next().unwrap_or("") == val
    })
}

/// Unwraps a CDATA section.
///
/// When `content`, ignoring leading whitespace, starts with `<![CDATA[`, the
/// text up to the first `]]>` (or to the end if the terminator is missing) is
/// returned. Otherwise `content` is returned unchanged.
pub(crate) fn strip_cdata(content: &StrTendril) -> &str {
    let Some(body) = content.trim_start().strip_prefix("<![CDATA[") else {
        return content;
    };
    body.split_once("]]>").map_or(body, |(inner, _)| inner)
}

/// Reports whether `url`, with any trailing slashes removed, ends with
/// `SCHEMA_ORG_SFX` and starts with either `HTTP_PFX` or `HTTPS_PFX`.
pub(crate) fn is_schema_org_url(url: &str) -> bool {
    let base = url.trim_end_matches('/');
    if !base.ends_with(SCHEMA_ORG_SFX) {
        return false;
    }
    base.starts_with(HTTP_PFX) || base.starts_with(HTTPS_PFX)
}

/// Reports whether `haystack` references one of the known video domains.
///
/// For each entry of `VIDEO_DOMAINS`, the first occurrence in `haystack` is
/// located and accepted only if it is immediately preceded by `PROTOCOL_PFX`
/// or by `WWW_PFX`.
///
/// The text in front of the match is fetched with `str::get`, never by direct
/// slicing: `haystack` comes from attribute values and inner HTML of arbitrary
/// documents, so the byte offset `pos - LEN` may fall inside a multi-byte
/// character (for example `"中youtube.com"`), where a direct slice would panic.
pub(crate) fn is_video_url(haystack: &str) -> bool {
    // `true` when the `pfx_len` bytes right before `pos` are exactly `pfx`.
    // Yields `false` when there are too few bytes before `pos` or when the
    // range does not start on a character boundary.
    let preceded_by = |pos: usize, pfx: &str, pfx_len: usize| -> bool {
        pos.checked_sub(pfx_len)
            .and_then(|start| haystack.get(start..pos))
            == Some(pfx)
    };

    VIDEO_DOMAINS.iter().any(|&p| {
        haystack.find(p).is_some_and(|pos| {
            preceded_by(pos, PROTOCOL_PFX, PROTOCOL_PFX_LEN)
                || preceded_by(pos, WWW_PFX, WWW_PFX_LEN)
        })
    })
}

/// Rough sentence detector: `text` either ends with a period or contains a
/// period followed by a space.
pub(crate) fn is_sentence(text: &str) -> bool {
    if text.ends_with('.') {
        return true;
    }
    text.contains(". ")
}

/// Reports whether any whitespace-separated token of `haystack` is exactly
/// equal to one of `words`. Partial matches inside a token do not count.
pub(crate) fn contains_one_of_words(haystack: &str, words: &[&str]) -> bool {
    for token in haystack.split_whitespace() {
        if words.iter().any(|candidate| *candidate == token) {
            return true;
        }
    }
    false
}

/// Reports whether `s` looks like a `srcset` value: somewhere in it, one of
/// the `IMG_EXT` entries is directly followed by a single ASCII whitespace
/// byte and then an ASCII digit.
#[inline]
pub(crate) fn is_img_attr_to_srcset(s: &str) -> bool {
    let bytes = s.as_bytes();
    IMG_EXT.iter().any(|ext| {
        s.match_indices(ext).any(|(pos, found)| {
            let after = pos + found.len();
            // Two bytes must follow the extension: whitespace, then a digit.
            match bytes.get(after..after + 2) {
                Some([ws, digit]) => ws.is_ascii_whitespace() && digit.is_ascii_digit(),
                _ => false,
            }
        })
    })
}

/// Reports whether `s` looks like an image URL suitable for `src`: after
/// trimming, some dot-separated segment other than the first starts with one
/// of the `IMG_EXT` entries minus its first character.
#[inline]
pub(crate) fn is_img_attr_to_src(s: &str) -> bool {
    let mut segments = s.trim().split('.');
    // The text before the first dot can never be an extension.
    segments.next();

    for segment in segments {
        for ext in IMG_EXT.iter() {
            if segment.starts_with(&ext[1..]) {
                return true;
            }
        }
    }
    false
}

/// Cuts `title` at its last separator.
///
/// Finds the last character contained in `TITLE_SEPARATORS`; if it is
/// directly followed by a space, returns the trimmed text in front of it.
/// Returns `None` when there is no separator, nothing follows it, or the
/// following character is not a space.
pub(crate) fn truncate_title_last(title: &str) -> Option<&str> {
    // Not a perfect replacement, but behaves like RX_TITLE_W_LAST.
    let (delim_pos, sep) = title
        .char_indices()
        .rev()
        .find(|(_, c)| TITLE_SEPARATORS.contains(c))?;

    let after_sep = title.get(delim_pos + sep.len_utf8()..)?;
    if after_sep.chars().next()? != ' ' {
        return None;
    }
    title.get(..delim_pos).map(str::trim)
}

/// Cuts `title` at its first separator.
///
/// Finds the first character contained in `TITLE_SEPARATORS`; if it is
/// directly followed by a space, returns the trimmed text after it.
/// Returns `None` when there is no separator, nothing follows it, or the
/// following character is not a space.
pub(crate) fn truncate_title_first(title: &str) -> Option<&str> {
    // Not a perfect replacement for the regex-based title matching it stands in for.
    let (delim_pos, sep) = title
        .char_indices()
        .find(|(_, c)| TITLE_SEPARATORS.contains(c))?;

    let after_sep = title.get(delim_pos + sep.len_utf8()..)?;
    if after_sep.chars().next()? != ' ' {
        return None;
    }
    Some(after_sep.trim())
}

/// Reports whether `name` is a recognized `<meta name>` value.
///
/// A name that splits on `META_NAME_SEP` must have its prefix in
/// `META_NAME_PREFIXES` and its key in `META_NAME_KEYS`; a name without the
/// separator must itself be in `META_NAME_KEYS`.
pub(crate) fn is_meta_name(name: &str) -> bool {
    match name.split_once(META_NAME_SEP) {
        Some((prefix, key)) => {
            META_NAME_PREFIXES.contains(&prefix) && META_NAME_KEYS.contains(&key)
        }
        None => META_NAME_KEYS.contains(&name),
    }
}

/// Extracts a recognized `prefix:key` name from a `<meta property>` value.
///
/// Each whitespace-separated part is split at its last `:`. The text after it
/// must be in `META_PROPERTY_KEYS`, and the text between the first `:` of the
/// remainder (or the start of the part) and that last `:` must be in
/// `META_PROPERTY_PREFIXES`. The first part that qualifies is returned,
/// starting at the prefix; `None` if no part qualifies.
pub(crate) fn meta_property_name(property: &str) -> Option<&str> {
    for part in property.split_whitespace() {
        let Some((head, key)) = part.rsplit_once(':') else {
            continue;
        };
        if !META_PROPERTY_KEYS.contains(&key) {
            continue;
        }

        // Skip everything up to and including the first colon of the head, if any.
        let pre_pos = head.find(':').map_or(0, |pos_l| pos_l + 1);
        let pre = &head[pre_pos..];
        if META_PROPERTY_PREFIXES.contains(&pre) {
            return Some(&part[pre_pos..]);
        }
    }
    None
}

/// Reports whether `text`, after stripping trailing `…` and `.` characters,
/// is a member of `LOADING_WORDS`.
pub(crate) fn is_loading_word(text: &str) -> bool {
    let without_dots = text.trim_end_matches(|c: char| matches!(c, '…' | '.'));
    LOADING_WORDS.contains(without_dots)
}

/// Reports whether any piece of `value`, split on spaces and underscores,
/// equals one of `SHARE_WORDS` ignoring ASCII case.
pub(crate) fn contains_share_elements(value: &str) -> bool {
    for part in value.split(|c: char| c == ' ' || c == '_') {
        if SHARE_WORDS.iter().any(|&w| part.eq_ignore_ascii_case(w)) {
            return true;
        }
    }
    false
}

/// Splits a `data:` URL into `(media type, payload)`.
///
/// `src` must start with `data:` and contain `BASE64_MARKER`; the media type
/// is the text between them and the payload is everything after the marker.
/// Returns `None` if either piece of syntax is missing or the payload is empty.
pub(crate) fn split_base64_url(src: &str) -> Option<(&str, &str)> {
    let rest = src.strip_prefix("data:")?;
    let marker_pos = rest.find(BASE64_MARKER)?;

    let image_data = &rest[marker_pos + BASE64_MARKER_LEN..];
    if image_data.is_empty() {
        None
    } else {
        Some((&rest[..marker_pos], image_data))
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the string-matching helpers of this module.

    use super::*;

    /// The returned name starts at the recognized prefix, and unrecognized
    /// parts of a multi-valued property are skipped.
    #[test]
    fn test_meta_property_name() {
        assert_eq!(
            meta_property_name("og:article:author"),
            Some("article:author")
        );
        assert_eq!(meta_property_name("x:title og:title"), Some("og:title"));
    }

    /// Bare keys and prefixed keys are accepted; unknown keys are rejected.
    #[test]
    fn test_is_meta_name() {
        assert!(is_meta_name("author"));
        assert!(is_meta_name("dc:title"));
        assert!(is_meta_name("dc:title"));
        assert!(!is_meta_name("dc:mod-date"));
        assert!(is_meta_name("dc:pub-date"));
    }

    /// A separator only counts when it is followed by a space.
    #[test]
    fn test_truncate_title_first() {
        let title1 = "Lazy Load with Alt includes jpg/png/webp extensions";
        assert_eq!(truncate_title_first(title1), None);
        let title2 = "Some Title | Some Extra Info ";
        assert_eq!(truncate_title_first(title2), Some("Some Extra Info"));
    }

    /// Slashes without surrounding spaces are not separators; with spaces, the
    /// title is cut at the last one.
    #[test]
    fn test_truncate_title_last() {
        let orig_title = "Lazy Load with Alt includes jpg/png/webp extensions";
        assert_eq!(truncate_title_last(orig_title), None,);

        let orig_title = "Lazy Load with Alt includes jpg / png / webp extensions";
        assert_eq!(
            truncate_title_last(orig_title),
            Some("Lazy Load with Alt includes jpg / png")
        );
    }

    /// An image URL is recognized even when the value contains line breaks and
    /// a query string after the extension.
    #[test]
    fn test_is_img_attr_to_src() {
        let val = "https://static01.nyt.com/images/2019/02/15/nyregion/
        00winterutilitiesOAK11/merlin_94083158_9e622a52-ec2f-4fbd-845c-
        5d530e94bc82-articleLarge.jpg?quality=90&amp;auto=webp";
        assert!(is_img_attr_to_src(val));
    }

    /// Only whole whitespace-separated words match, regardless of position.
    #[test]
    fn test_contains_one_of_words() {
        assert!(contains_one_of_words(
            "something hid",
            CLASSES_NEGATIVE_WORDS
        ));
        assert!(contains_one_of_words(
            "something hid another",
            CLASSES_NEGATIVE_WORDS
        ));
        assert!(contains_one_of_words(
            "hid something",
            CLASSES_NEGATIVE_WORDS
        ));
        assert!(!contains_one_of_words(
            "something hidden",
            CLASSES_NEGATIVE_WORDS
        ));
    }

    #[test]
    fn test_strip_cdata() {
        // Test valid CDATA
        let content = StrTendril::from_slice("<![CDATA[test content]]>");
        assert_eq!(strip_cdata(&content), "test content");

        // Test missing closing marker
        let content = StrTendril::from_slice("<![CDATA[test content");
        assert_eq!(strip_cdata(&content), "test content");

        // Test no CDATA
        let content = StrTendril::from_slice("test content");
        assert_eq!(strip_cdata(&content), "test content");

        // Test empty content
        let content = StrTendril::from_slice("");
        assert_eq!(strip_cdata(&content), "");

        // Test leading whitespace before the CDATA marker
        let content = StrTendril::from_slice("  <![CDATA[test content]]>");
        assert_eq!(strip_cdata(&content), "test content");
    }

    #[test]
    fn test_is_schema_org_url() {
        // Valid URLs
        assert!(is_schema_org_url("http://schema.org"));
        assert!(is_schema_org_url("https://schema.org"));
        assert!(is_schema_org_url("http://schema.org/"));
        assert!(is_schema_org_url("https://schema.org/"));
        assert!(is_schema_org_url("http://schema.org////")); // multiple trailing slashes

        // Invalid URLs
        assert!(!is_schema_org_url("ftp://schema.org"));
        assert!(!is_schema_org_url("//schema.org"));
        assert!(!is_schema_org_url("schema.org"));
        assert!(!is_schema_org_url("http://schemaXorg"));
        assert!(!is_schema_org_url(""));
    }

    #[test]
    fn test_is_video_url() {
        // Valid URLs with protocol prefix
        assert!(is_video_url("//youtube.com/watch?v=123"));
        assert!(is_video_url("//player.vimeo.com/video/123"));
        assert!(is_video_url("//dailymotion.com/video/123"));
        assert!(is_video_url("//youtube-nocookie.com/embed/123"));
        assert!(is_video_url("//v.qq.com/video/123"));
        assert!(is_video_url("//archive.org/video/123"));
        assert!(is_video_url("//upload.wikimedia.org/video/123"));
        assert!(is_video_url("//player.twitch.tv/video/123"));

        // Valid URLs with www prefix
        assert!(is_video_url("//www.youtube.com/watch?v=123"));
        assert!(is_video_url("//www.dailymotion.com/video/123"));

        // Invalid URLs
        assert!(!is_video_url("youtube.com/watch?v=123")); // missing prefix
        assert!(!is_video_url("http://notvideo.com/youtube.com")); // video domain in path
        assert!(!is_video_url("//youtubeXcom/watch?v=123")); // invalid domain
        assert!(!is_video_url("//www.notvideo.com")); // non-video domain
        assert!(!is_video_url("")); // empty string
    }

    #[test]
    fn test_split_base64_url() {
        // Well-formed data URL: media type and payload are split apart
        let src = "data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==";
        let (image_type, image_data) = split_base64_url(src).unwrap();
        assert_eq!(image_type, "image/gif");
        assert_eq!(
            image_data,
            "R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="
        );

        // Test empty base64 data
        let src = "data:image/gif;base64,";
        assert!(split_base64_url(src).is_none());

        // Test invalid data URL format
        let src = "invalid:image/gif;base64,data";
        assert!(split_base64_url(src).is_none());

        // Test missing base64 marker
        let src = "data:image/gif,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==";
        assert!(split_base64_url(src).is_none());
    }
}


================================================
FILE: src/prep_article.rs
================================================
use dom_query::{NodeRef, Selection};
use flagset::FlagSet;

#[allow(clippy::wildcard_imports)]
use crate::glob::*;
use crate::grab_flags::GrabFlags;
#[allow(clippy::wildcard_imports)]
use crate::helpers::*;
#[allow(clippy::wildcard_imports)]
use crate::matching::*;
use crate::score::get_class_weight;
use crate::Config;

/// Removes every node matched by `MATCHER_CLEAN` below `root_sel`, except
/// embed-like elements that reference a video.
///
/// An element named in `EMBED_ELEMENTS` is kept when any of its attribute
/// values is a video URL, or, for `<object>`, when its inner HTML contains one.
fn clean(root_sel: &Selection) {
    for node in root_sel.select_matcher(&MATCHER_CLEAN).nodes() {
        // Allow youtube and vimeo videos through as people usually want to see those.
        if node_name_in(node, &EMBED_ELEMENTS) {
            if node.attrs().iter().any(|attr| is_video_url(&attr.value)) {
                continue;
            }

            // For embed with <object> tag, check inner HTML as well.
            if node.has_name("object") && is_video_url(&node.inner_html()) {
                continue;
            }
        }

        node.remove_from_parent();
    }
}

/// Recursively strips presentational attributes from `n` and its element
/// descendants.
///
/// `<svg>` subtrees are left untouched. Elements named in
/// `DEPRECATED_SIZE_ATTRIBUTE_ELEMS` additionally lose `width` and `height`.
fn clean_styles(n: &NodeRef) {
    if !n.has_name("svg") {
        n.remove_attrs(PRESENTATIONAL_ATTRIBUTES);

        if node_name_in(n, &DEPRECATED_SIZE_ATTRIBUTE_ELEMS) {
            n.remove_attrs(&["width", "height"]);
        }

        n.element_children()
            .iter()
            .for_each(|child| clean_styles(child));
    }
}

/// Decides whether `node` looks like non-content and should be removed.
///
/// Returns `false` (keep) for anything related to a data table, for nodes with
/// a `<code>` ancestor within the depth limit, and for nodes embedding a known
/// video. Returns `true` (remove) for a negative class weight and for text
/// that is an ad or loading placeholder. Otherwise, if the text has fewer than
/// 10 commas, a set of ratio heuristics (images vs. paragraphs, list items,
/// inputs, link density, embeds, text density) makes the decision; with 10 or
/// more commas the node is kept.
///
/// `flags` controls whether class names contribute to the weight.
fn should_clean_conditionally(node: &NodeRef, flags: &FlagSet<GrabFlags>) -> bool {
    let sel = Selection::from(*node);
    // keep the element if it contains a data table
    if sel.select_single_matcher(&MATCHER_DATA_TABLE).exists() {
        return false;
    }

    let is_data_table = |n: &NodeRef| n.has_name("table") && n.has_attr("data-readability-table");

    if is_data_table(node) {
        return false;
    }
    // Next check if we're inside a data table, in which case don't remove it as well.
    if has_ancestor(node, None, is_data_table) {
        return false;
    }

    // TODO: This is a rare case, probably it should be `pre` instead of `code`.
    if has_ancestor(node, Some(0), |n| n.has_name("code")) {
        return false;
    }

    let weight = get_class_weight(node, flags.contains(GrabFlags::WeightClasses));

    if weight < 0.0 {
        return true;
    }

    let node_text = node.text();

    if node_text.matches(',').count() < 10 {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.

        let mut embed_count = 0;

        let embeds_sel = sel.select_matcher(&MATCHER_EMBEDS);

        // Any embed pointing at a known video keeps the whole node;
        // the remaining embeds are only counted.
        for embed in embeds_sel.nodes() {
            for attr in &embed.attrs() {
                if is_video_url(&attr.value) {
                    return false;
                }
            }
            if embed.has_name("object") && is_video_url(&embed.inner_html()) {
                return false;
            }
            embed_count += 1;
        }
        // Text that is nothing but an ad marker or a loading placeholder is removed.
        let text_low = node_text.trim().to_lowercase();
        if AD_WORDS.contains(&text_low) || is_loading_word(&text_low) {
            return true;
        }

        let char_count = node.normalized_char_count();
        let Some(qual_name) = node.qual_name_ref() else {
            return false;
        };
        let tag = qual_name.local.as_ref();
        // A node counts as a list if it is one, or if more than 90% of its
        // text sits inside list elements.
        let mut is_list = matches!(tag, "ul" | "ol");
        if !is_list {
            let list_density = text_density(node, "ul,ol", Some(char_count));
            is_list = list_density > 0.9;
        }

        let img = node.find_descendants("img").len();
        let img_f32 = img as f32;
        // Evaluates the removal heuristics in order; the first one that fires wins.
        let should_remove = || {
            let is_figure_child = has_ancestor(node, None, |n| n.has_name("figure"));
            let p = node.find_descendants("p").len() as f32;

            // More than one image and fewer than half as many paragraphs as images.
            if !is_figure_child && img_f32 > 1.0 && p / img_f32 < 0.5 {
                return true;
            }

            // TODO: this check can probably be removed.
            // If this is a large menu block, article_content will likely not include it,
            // because it contains too little meaningful content.
            // Otherwise, the magic number '100' seems way too high; the flow usually works fine with 10–20.
            let li = node.find_descendants("li").len() as f32 - 100.0;
            if !is_list && li > p {
                return true;
            }

            // More inputs than a third of the paragraph count (rounded down).
            let input = node.find_descendants("input").len() as f32;
            if input > (p / 3.0).floor() {
                return true;
            }

            let link_density = link_density(node, Some(char_count));

            // Very short, linked, mostly non-heading text with either no
            // images or more than two.
            if !is_list
                && !is_figure_child
                && char_count < 25
                && (img == 0 || img > 2)
                && link_density > 0.0
                && text_density(node, "h1,h2,h3,h4,h5,h6", Some(char_count)) < 0.9
            {
                return true;
            }
            // The tolerated link density depends on the class weight.
            if !is_list && weight < 25.0 && link_density > 0.2 {
                return true;
            }

            if weight >= 25.0 && link_density > 0.5 {
                return true;
            }

            // A single embed with little text, or several embeds.
            if (embed_count == 1 && char_count < 75) || embed_count > 1 {
                return true;
            }

            // No images and no text inside any of the `TEXTISH_TAGS` elements.
            if img == 0 && text_density(node, TEXTISH_TAGS, Some(char_count)) == 0.0 {
                return true;
            }
            false
        };
        let have_to_remove = should_remove();

        if is_list && have_to_remove {
            // Lists having a child with more than one element child are
            // removed as decided above.
            for child in node.children_it(false) {
                if child.element_children().len() > 1 {
                    return have_to_remove;
                }
            }
            // Otherwise the list survives only when it has exactly as many
            // images as list items.
            let li_count = node.find_descendants("li").len();
            return img != li_count;
        }

        return have_to_remove;
    }
    false
}

/// Removes the descendants of `sel` matching the CSS selector `tags` for
/// which `should_clean_conditionally` returns `true`.
///
/// Does nothing unless `GrabFlags::CleanConditionally` is set in `flags`.
fn clean_conditionally(sel: &Selection, tags: &str, flags: &FlagSet<GrabFlags>) {
    if flags.contains(GrabFlags::CleanConditionally) {
        let candidates = sel.select(tags);
        // Walk the matches back to front so that child nodes are judged
        // before their parents.
        for candidate in candidates.nodes().iter().rev() {
            if !should_clean_conditionally(candidate, flags) {
                continue;
            }
            candidate.remove_from_parent();
        }
    }
}

/// Marks or unmarks `n` as a data table: sets `data-readability-table="true"`
/// when `is_data_table` is `true`, removes the attribute otherwise.
fn set_data_readability_table(n: &NodeRef, is_data_table: bool) {
    if !is_data_table {
        n.remove_attr("data-readability-table");
        return;
    }
    n.set_attr("data-readability-table", "true");
}

/// Returns `(rows, cols)` for `table`.
///
/// `rows` is the number of `<tr>` descendants (row spans are deliberately not
/// taken into account). `cols` is the largest per-row sum of the `colspan`
/// values of the row's `<td>` cells, where a missing or unparsable `colspan`
/// counts as 1.
///
/// `colspan` comes straight from the document, so the per-row sum uses
/// saturating addition: a few cells with huge values must not overflow
/// `usize` (a panic in debug builds, a wrapped and therefore bogus column
/// count in release builds).
fn get_row_and_col_count(table: &Selection) -> (usize, usize) {
    let mut rows = 0usize;
    let mut cols = 0usize;
    for tr in table.select("tr").iter() {
        // No need to adjust row count by the `row span` at all
        rows += 1;

        // Now look for column-related info
        let mut columns_in_this_row = 0usize;

        for cell in tr.select("td").iter() {
            let colspan = cell.attr_or("colspan", "1");
            let span = colspan.parse::<usize>().unwrap_or(1);
            columns_in_this_row = columns_in_this_row.saturating_add(span);
        }
        cols = std::cmp::max(cols, columns_in_this_row);
    }

    (rows, cols)
}

fn mark_data_tables(base_sel: &Selection) {
    for table_node in base_sel.select_matcher(&MATCHER_TABLE).nodes() {
        if MINI_PRESENTATION.match_node(table_node) {
            set_data_readability_table(table_node, false);
            continue;
        }

        // TODO: Pretty rare case (not covered by tests). Probably should be removed.
        if MINI_AINT_DATA_TABLE.match_node(table_node) {
            set_data_readability_table(table_node, false);
            continue;
        }

        if table_node.has_attr("summary") {
            set_data_readability_table(table_node, true);
            continue;
        }

        let sel = Selection::from(*table_node);

        if sel.select_single_matcher(&MATCHER_TABLE_MEMBERS).exists() {
            set_data_readability_table(table_node, true);
            continue;
        }

        // nested tables indicate a layout table
        if sel.select_single_matcher(&MATCHER_TABLE).exists() {
            set_data_readability_table(table_node, false);
            continue;
        }

        let (rows, cols) = get_row_and_col_count(&sel);
        if rows == 1 || cols == 1 {
            set_data_readability_table(table_node, false);
            continue;
        }

        if rows >= 10 || cols > 4 {
            set_data_readability_table(table_node, true);
            continue;
        }
        set_data_readability_table(table_node, (rows * cols) > 10);
    }
}

/// Repairs lazily loaded images among the `img`, `picture` and `figure`
/// descendants of `sel`.
///
/// Two passes run per node:
/// 1. A small base64 placeholder in `src` is dropped when another attribute
///    appears to hold a real image URL.
/// 2. Unless the node already has `src`/`srcset` and does not match
///    `MATCHER_LAZY_IMG`, every other attribute whose value looks like an
///    image URL is copied into `src` or `srcset`. For a `<figure>` without an
///    inner `img`/`picture`, a new `<img>` child is created instead.
fn fix_lazy_images(sel: &Selection) {
    for node in sel.select("img,picture,figure").nodes() {
        // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
        // So, here we check if the data uri is too short, just might as well remove it.
        if let Some(src) = node.attr("src") {
            if let Some((image_type, base64_data)) = split_base64_url(&src) {
                // SVG data URIs are left alone regardless of their size.
                if image_type == "image/svg+xml" {
                    continue;
                }

                // Make sure this element has other attributes which contains image.
                // If it doesn't, then this src is important and shouldn't be removed.
                let mut src_could_be_removed = false;

                for attr in &node.attrs() {
                    if &attr.name.local == "src" {
                        continue;
                    }

                    if IMG_EXT.iter().any(|p| attr.value.contains(p)) {
                        src_could_be_removed = true;
                        break;
                    }
                }
                // Here we assume if image is less than 100 bytes (or 133 after encoded to base64)
                // it will be too small, therefore it might be placeholder image.
                if src_could_be_removed && base64_data.len() < 133 {
                    node.remove_attr("src");
                }
            }
        }

        // Nodes that already have a source and do not match the lazy-image
        // matcher need no further fixing.
        if (node.has_attr("src") || node.has_attr("srcset")) && !node.is_match(&MATCHER_LAZY_IMG) {
            continue;
        }

        for attr in &node.attrs() {
            if matches!(attr.name.local.as_ref(), "src" | "srcset" | "alt") {
                continue;
            }

            // Decide from the (lowercased) value which attribute it belongs in;
            // the srcset shape is checked first.
            let mut copy_to: Option<&str> = None;
            let val = attr.value.to_ascii_lowercase();
            if is_img_attr_to_srcset(&val) {
                copy_to = Some("srcset");
            } else if is_img_attr_to_src(&val) {
                copy_to = Some("src");
            }

            if let Some(copy_to) = copy_to {
                //if this is an img or picture, set the attribute directly
                let Some(tag_name) = node.node_name() else {
                    continue;
                };
                if matches!(tag_name.as_ref(), "img" | "picture") {
                    // The original (not lowercased) value is what gets copied.
                    node.set_attr(copy_to, &attr.value);
                } else if tag_name.as_ref() == "figure" {
                    let figure_sel = Selection::from(*node);
                    if !figure_sel.select("img, picture").exists() {
                        //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
                        //see the nytimes-3 testcase for an example
                        let img_node = node.tree.new_element("img");
                        img_node.set_attr(copy_to, &attr.value);
                        node.append_child(&img_node.id);
                    }
                }
            }
        }
    }
}

/// Removes every heading matched by `MATCHER_HEADING` below `sel` whose class
/// weight is negative. Class names only contribute to the weight when
/// `GrabFlags::WeightClasses` is set in `flags`.
fn clean_headers(sel: &Selection, flags: &FlagSet<GrabFlags>) {
    let weigh_classes = flags.contains(GrabFlags::WeightClasses);
    for heading in sel.select_matcher(&MATCHER_HEADING).nodes() {
        let weight = get_class_weight(heading, weigh_classes);
        if weight < 0.0 {
            heading.remove_from_parent();
        }
    }
}

/// Prepares the extracted article subtree rooted at `article_node` for output.
///
/// Runs the cleaning passes below in a fixed order, mutating the tree in
/// place. `flags` gates the conditional cleaning and class weighting;
/// `cfg.char_threshold` is the text-length limit below which share elements
/// are removed.
pub(crate) fn prep_article(article_node: &NodeRef, flags: &FlagSet<GrabFlags>, cfg: &Config) {
    let article_sel = Selection::from(*article_node);
    // *Important*: Currently the order in which the 'cleaning' functions are called matters.
    // It shouldn't, but it does.

    // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
    // which means we don't remove the top candidates even if they have "share".
    remove_share_elements(&article_sel, cfg.char_threshold);

    // Check for data tables before we continue, to avoid removing items in
    // those tables, which will often be isolated even though they're
    // visually linked to other content-ful elements (text, images, etc.).
    mark_data_tables(&article_sel);

    fix_lazy_images(&article_sel);

    clean_conditionally(&article_sel, "form,fieldset", flags);

    // Clean out junk from the article content
    clean(&article_sel);

    clean_headers(&article_sel, flags);

    // Do these last as the previous stuff may have removed junk
    // that will affect these
    clean_conditionally(&article_sel, "table,ul,div", flags);

    // replace H1 with H2 as H1 should be only title that is displayed separately
    article_sel.select("h1").rename("h2");
    // remove all presentational attributes
    clean_styles(article_node);

    // Remove extra paragraphs

    // At this point, nasty iframes have been removed; only embedded video
    // ones remain.
    for p_node in article_sel.select("p").nodes() {
        let p_sel = Selection::from(*p_node);
        // A paragraph is dropped only if it has neither media elements nor text.
        let content_el_count = p_sel.select("img,object,embed,iframe").length();
        if content_el_count == 0 && p_node.normalized_char_count() == 0 {
            p_sel.remove();
        }
    }

    // Drop each <br> whose next element sibling is a <p>.
    for br_node in article_node.find_descendants("br") {
        if let Some(next_node) = br_node.next_element_sibling() {
            if next_node.has_name("p") {
                br_node.remove_from_parent();
            }
        }
    }

    fix_single_cell_tables(&article_sel);
}

/// Replaces single-cell tables with the cell itself.
///
/// For each `table` under `sel`, the lookup goes table (or the `tbody`
/// returned by `single_child_element`) -> `tr` -> `td`, each step using
/// `single_child_element`. When all steps succeed, the cell is renamed to `p`
/// if all of its children are phrasing content, otherwise to `div`, and the
/// table is replaced with it.
fn fix_single_cell_tables(sel: &Selection) {
    let tables = sel.select("table");
    for table in tables.nodes() {
        // Rows are looked up on the `tbody` when there is one, else on the table itself.
        let rows_parent = match single_child_element(table, "tbody") {
            Some(tbody) => tbody,
            None => *table,
        };
        let Some(only_row) = single_child_element(&rows_parent, "tr") else {
            continue;
        };
        let Some(only_cell) = single_child_element(&only_row, "td") else {
            continue;
        };

        let inline_only = only_cell
            .children_it(false)
            .all(|child| is_phrasing_content(&child));
        let replacement_tag = if inline_only { "p" } else { "div" };

        only_cell.rename(replacement_tag);
        table.replace_with(&only_cell);
    }
}

/// Removes small "share" widgets from the subtree under `root_sel`.
///
/// An element carrying a `class` or `id` attribute is detached from its parent
/// when `contains_share_elements` accepts its `class` value (or, failing that,
/// its `id` value) and its normalized character count is below
/// `share_element_threshold`.
fn remove_share_elements(root_sel: &Selection, share_element_threshold: usize) {
    let candidates = root_sel.select("*[class],*[id]");
    for node in candidates.nodes() {
        // The element borrow is confined to this block so that it is released
        // before the node is detached below.
        let looks_like_share = {
            let Some(element) = node.element_ref() else {
                continue;
            };
            let attr_signals_share = |attr_name: &str| {
                element
                    .attrs
                    .iter()
                    .find(|a| a.name.local.as_ref() == attr_name)
                    .is_some_and(|a| contains_share_elements(&a.value))
            };
            // `class` is checked first; `id` is only consulted when `class` gave no match.
            attr_signals_share("class") || attr_signals_share("id")
        };

        if !looks_like_share {
            continue;
        }
        if node.normalized_char_count() < share_element_threshold {
            node.remove_from_parent();
        }
    }
}


================================================
FILE: src/readability.rs
================================================
use dom_query::local_name;
use dom_query::{Document, NodeRef, Selection};
use foldhash::HashMap;
use tendril::StrTendril;

use crate::config::ParsePolicy;
use crate::config::TextMode;
#[allow(clippy::wildcard_imports)]
use crate::glob::*;
#[allow(clippy::wildcard_imports)]
use crate::helpers::*;
use crate::is_probably_readable;
#[allow(clippy::wildcard_imports)]
use crate::matching::*;
use crate::url_helpers::{is_absolute_url, to_absolute_url, url_join};
use crate::Config;
use crate::ReadabilityError;

/// The result of a successful parse: the extracted article content together
/// with the metadata collected for the page.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Article {
    /// The article title
    pub title: String,
    /// The author (byline), if one was found
    pub byline: Option<String>,
    #[cfg_attr(
        feature = "serde",
        serde(
            serialize_with = "crate::serde_helpers::serialize_str_tendril",
            deserialize_with = "crate::serde_helpers::deserialize_str_tendril"
        )
    )]
    /// The relevant content, serialized as HTML
    pub content: StrTendril,
    #[cfg_attr(
        feature = "serde",
        serde(
            serialize_with = "crate::serde_helpers::serialize_str_tendril",
            deserialize_with = "crate::serde_helpers::deserialize_str_tendril"
        )
    )]
    /// The relevant content as text; its form (raw, formatted or Markdown)
    /// depends on the configured `TextMode`
    pub text_content: StrTendril,
    /// The length of `text_content`, counted in characters
    pub length: usize,
    /// The excerpt (short description) of the article
    pub excerpt: Option<String>,
    /// The name of the site
    pub site_name: Option<String>,
    /// The text direction
    pub dir: Option<String>,
    /// The document language
    pub lang: Option<String>,
    /// The published time of the document
    pub published_time: Option<String>,
    /// The modified time of the document
    pub modified_time: Option<String>,
    /// The image of the document
    pub image: Option<String>,
    /// The favicon of the document
    pub favicon: Option<String>,
    /// The URL found in the document's metadata
    pub url: Option<String>,
}

/// This struct represents the metadata extracted from the document.
///
/// With the `serde` feature enabled, field names are (de)serialized in camelCase.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))]
pub struct Metadata {
    /// The title; an empty string when no title has been found
    pub title: String,
    /// The author (byline)
    pub byline: Option<String>,
    /// The excerpt (short description)
    pub excerpt: Option<String>,
    /// The name of the site
    pub site_name: Option<String>,
    /// The published time
    pub published_time: Option<String>,
    /// The modified time
    pub modified_time: Option<String>,
    /// The image
    pub image: Option<String>,
    /// The favicon
    pub favicon: Option<String>,
    /// The document language
    pub lang: Option<String>,
    /// The URL declared in the metadata
    pub url: Option<String>,
    /// The text direction
    pub dir: Option<String>,
}

impl Metadata {
    /// Returns `true` when nothing has been collected: the title is an empty
    /// string and every optional field is `None`.
    fn is_empty(&self) -> bool {
        let optional_fields = [
            &self.byline,
            &self.excerpt,
            &self.site_name,
            &self.published_time,
            &self.modified_time,
            &self.image,
            &self.favicon,
            &self.lang,
            &self.url,
            &self.dir,
        ];
        self.title.is_empty() && optional_fields.iter().all(|field| field.is_none())
    }

    /// Decodes HTML entities in place for the title, byline, excerpt, site
    /// name, published/modified time, image and favicon. The `lang`, `url`
    /// and `dir` fields are left untouched.
    fn unescape_html_entities(&mut self) {
        decode_html_entities(&mut self.title);

        for field in [
            &mut self.byline,
            &mut self.excerpt,
            &mut self.site_name,
            &mut self.published_time,
            &mut self.modified_time,
            &mut self.image,
            &mut self.favicon,
        ] {
            decode_opt_html_entities(field);
        }
    }
}

/// The entry point of the crate: holds a parsed document together with the
/// settings used to extract a readable [`Article`] from it.
pub struct Readability {
    /// The [Document] to be processed
    pub doc: Document,
    /// The absolute URL of the document, if one was supplied
    pub doc_url: Option<String>,
    /// Configuration options for the readability
    pub config: Config,
}

impl<T: Into<StrTendril>> From<T> for Readability {
    /// Parses `html` into a [`Document`] and wraps it with the default
    /// [`Config`] and no document URL.
    fn from(html: T) -> Self {
        let doc = Document::from(html);
        let config = Config::default();
        Self {
            doc,
            doc_url: None,
            config,
        }
    }
}

impl Readability {
    /// Builds a `Readability` instance from raw HTML.
    ///
    /// The HTML is parsed into a [`Document`] and handed over to
    /// [`Readability::with_document`].
    ///
    /// # Arguments
    ///
    /// - `html` -- HTML content
    /// - `document_url` -- an absolute URL of the page
    /// - `cfg` -- an optional `Config` instance
    ///
    /// # Returns
    ///
    /// A new [`Readability`] instance
    ///
    /// # Errors
    ///
    /// Returns [`ReadabilityError::BadDocumentURL`] if `document_url` is not an absolute URL
    pub fn new<T: Into<StrTendril>>(
        html: T,
        document_url: Option<&str>,
        cfg: Option<Config>,
    ) -> Result<Self, ReadabilityError> {
        let document = Document::from(html);
        Self::with_document(document, document_url, cfg)
    }

    /// Builds a `Readability` instance around an already parsed `dom_query::Document`.
    ///
    /// # Arguments
    ///
    /// - `document` -- a `dom_query::Document` instance
    /// - `document_url` -- the (absolute) URL of the page
    /// - `cfg` -- an optional `Config` instance; the default configuration is used when `None`
    ///
    /// # Returns
    ///
    /// A new [`Readability`] instance
    ///
    /// # Errors
    ///
    /// Returns [`ReadabilityError::BadDocumentURL`] if `document_url` is not an absolute URL
    pub fn with_document(
        document: dom_query::Document,
        document_url: Option<&str>,
        cfg: Option<Config>,
    ) -> Result<Self, ReadabilityError> {
        // A URL that was supplied but is not absolute is rejected; a missing URL is fine.
        let doc_url = match document_url {
            Some(url) if is_absolute_url(url, true) => Some(url.to_string()),
            Some(_) => return Err(ReadabilityError::BadDocumentURL),
            None => None,
        };
        let config = cfg.unwrap_or_default();

        Ok(Self {
            doc: document,
            doc_url,
            config,
        })
    }
}

impl Readability {
    /// Prepares the document for parsing. The steps run in this order:
    ///
    /// 1. Remove empty images
    /// 2. Unwrap images inside `noscript` tags
    /// 3. Remove `script` tags
    /// 4. Remove `style` tags
    /// 5. Replace multiple `br` tags with a single `br` tag
    /// 6. Replace `font` tags with `span` tags
    /// 7. Remove comments
    fn prepare(&mut self) {
        // drop images without any usable source
        self.remove_empty_imgs();

        // swap placeholder images for the ones kept inside `noscript`
        self.unwrap_noscript_images();

        // remove scripts
        self.doc.select_matcher(&MATCHER_SCRIPT).remove();

        // remove styles
        self.doc.select_matcher(&MATCHER_STYLE).remove();

        // replace chains of consecutive br elements
        self.replace_brs();

        // replace fonts with spans
        self.replace_fonts();

        // remove comments
        self.remove_comments();
    }

    /// Returns the title of the article as a `StrTendril`.
    ///
    /// The starting point is the trimmed text of the element matched by
    /// `MATCHER_TITLE`. Exactly one of the following heuristics is then applied:
    ///
    /// - If the title contains one of `TITLE_SEPARATORS`, it is truncated with
    ///   `truncate_title_last`; when the result has fewer than 3 words,
    ///   `truncate_title_first` is tried instead.
    /// - Otherwise, if the title contains `": "` and no heading matched by
    ///   `MATCHER_HEADING` has exactly the same text, the part after the last
    ///   colon is used; when that has fewer than 3 words, the part after the
    ///   first colon is used; otherwise, when the part up to and including the
    ///   first colon has more than 5 words, the original title is kept.
    /// - Otherwise, if the title is shorter than 15 or longer than 150
    ///   characters, the text of the `h1` selected by `select_single("h1")`
    ///   is used, if there is one.
    ///
    /// The candidate then has its whitespace normalized. Finally, if the
    /// candidate has 4 words or fewer and either no hierarchical separator
    /// was seen or the word count did not drop by exactly one word, the
    /// original (trimmed) title is returned instead.
    pub fn get_article_title(&self) -> StrTendril {
        let title = self.doc.select_single_matcher(&MATCHER_TITLE).text();
        let orig_title = title.trim();
        // Holds the `h1` text so that `cur_title` can borrow from it below.
        let mut h1: Option<StrTendril> = None;

        let mut cur_title = orig_title;
        let char_count = orig_title.chars().count();
        let mut has_hierarchy_sep = false;
        if orig_title.chars().any(|c| TITLE_SEPARATORS.contains(&c)) {
            // The title contains a separator: remember whether any of them is
            // a hierarchical one, it is needed for the final check.
            has_hierarchy_sep = orig_title.chars().any(|c| TITLE_HIERARCHY_SEP.contains(&c));
            if let Some(title_part) = truncate_title_last(orig_title) {
                cur_title = title_part;
            }

            // If the resulting title is too short, truncate from the other side instead.
            if cur_title.split_whitespace().count() < 3 {
                if let Some(title_part) = truncate_title_first(orig_title) {
                    cur_title = title_part;
                }
            }
            // The colon handling below is admittedly convoluted.
        } else if cur_title.contains(": ") {
            // Check whether some heading has exactly this text.
            // If so, the title is used as is.
            let matched = self.doc.select_matcher(&MATCHER_HEADING).iter().any(|h| {
                let text = h.text();
                text.trim() == cur_title
            });

            if !matched {
                // Start with the part after the last colon.
                if let Some(tmp_title) = orig_title
                    .rfind(':')
                    .map(|idx| orig_title[idx + 1..].trim())
                {
                    cur_title = tmp_title;
                    if cur_title.split_whitespace().count() < 3 {
                        // Too short: use the part after the first colon instead.
                        if let Some(tmp_title) =
                            orig_title.find(':').map(|idx| orig_title[idx + 1..].trim())
                        {
                            cur_title = tmp_title;
                        }
                    } else if orig_title
                        .find(':')
                        .map_or(0, |idx| orig_title[0..=idx].split_whitespace().count())
                        > 5
                    {
                        // Too many words before the colon: keep the original title.
                        cur_title = orig_title;
                    }
                }
            }
        } else if !(15..=150).contains(&char_count) {
            // The title is suspiciously short or long: fall back to an `h1`.
            let h1_sel = self.doc.select_single("h1");
            if !h1_sel.is_empty() {
                h1 = Some(h1_sel.text());
            }
        }
        if let Some(ref h1) = h1 {
            cur_title = h1;
        }
        let normalized_title = normalize_spaces(cur_title);
        cur_title = &normalized_title;

        // If we now have 4 words or fewer as our title, and either no
        // 'hierarchical' separators (\, /, > or ») were found in the original
        // title or we decreased the number of words by more than 1 word, use
        // the original title.
        let cur_title_wc = cur_title.split_whitespace().count();

        // Word count of the original title once it has been split on the separators.
        let orig_wc = orig_title
            .split(TITLE_SEPARATORS)
            .flat_map(str::split_whitespace)
            .count();
        if cur_title_wc <= 4 && (!has_hierarchy_sep || cur_title_wc + 1 != orig_wc) {
            cur_title = orig_title;
        }

        cur_title.into()
    }

    /// Turns every element matched by `MATCHER_FONT` into a `span` and strips
    /// all of its attributes.
    fn replace_fonts(&mut self) {
        let font_elements = self.doc.select_matcher(&MATCHER_FONT);
        // Rename first, then drop the attributes of the renamed elements.
        font_elements.rename("span");
        font_elements.remove_all_attrs();
    }

    /// Replaces chains of two or more `br` elements with a `p` element.
    ///
    /// For every `br` matched by `MATCHER_BR`, the following `br` elements
    /// (as reported by `next_significant_node`) are removed. If at least one
    /// was removed, the remaining `br` is replaced with a new `p`, and the
    /// siblings that follow are moved into that `p` until either another `br`
    /// chain or a node that is not phrasing content is reached. Trailing
    /// whitespace nodes are then trimmed from the `p`, and a `p` parent, if
    /// any, is renamed to `div`.
    fn replace_brs(&mut self) {
        let sel = self.doc.select_matcher(&MATCHER_BR);

        for br in sel.nodes() {
            let mut next_sibling = br.next_sibling();
            // Whether at least one following `br` has been found and removed.
            let mut replaced = false;

            // Remove the `br` elements that follow this one, stopping at the
            // first significant node that is not a `br`.
            while let Some(next) = next_significant_node(next_sibling) {
                if !next.has_name("br") {
                    break;
                }

                replaced = true;
                // Fetch the following sibling before detaching the node.
                next_sibling = next.next_sibling();
                next.remove_from_parent();
            }
            if replaced {
                // A chain was removed: the remaining `br` becomes a `p`.
                let p = br.tree.new_element("p");
                br.replace_with(&p);

                // Move the following siblings into the new `p`.
                let mut next_sibling = p.next_sibling();
                while let Some(next) = next_sibling {
                    // Stop when another `br` chain starts here.
                    if next.has_name("br") {
                        if let Some(next_elem) = next_significant_node(next.next_sibling()) {
                            if next_elem.has_name("br") {
                                break;
                            }
                        }
                    }

                    // Only phrasing content may be moved into the paragraph.
                    if !is_phrasing_content(&next) {
                        break;
                    }

                    // Fetch the following sibling before moving the node.
                    next_sibling = next.next_sibling();
                    p.append_child(&next);
                }

                // Trim trailing whitespace nodes from the paragraph.
                while let Some(last) = p.last_child() {
                    if is_whitespace(&last) {
                        last.remove_from_parent();
                    } else {
                        break;
                    }
                }

                // The new `p` must not sit inside another `p`: turn the parent into a `div`.
                if let Some(parent) = p.parent() {
                    if parent.has_name("p") {
                        parent.rename("div");
                    }
                }
            }
        }
    }

    /// Removes every image matched by `MATCHER_IMG` that has no usable source.
    ///
    /// An image is kept when it has an attribute named `src`, `data-src`,
    /// `data-srcset` or `srcset`, or when any of its attribute values contains
    /// one of the `IMG_EXT` patterns.
    fn remove_empty_imgs(&mut self) {
        let images = self.doc.select_matcher(&MATCHER_IMG);
        for img in images.nodes() {
            let has_source = img.query_or(false, |data| {
                let Some(element) = data.as_element() else {
                    return false;
                };
                element.attrs.iter().any(|attr| {
                    let is_source_attr = matches!(
                        attr.name.local.as_ref(),
                        "src" | "data-src" | "data-srcset" | "srcset"
                    );
                    is_source_attr || IMG_EXT.iter().any(|ext| attr.value.contains(ext))
                })
            });

            if has_source {
                continue;
            }
            img.remove_from_parent();
        }
    }

    fn unwrap_noscript_images(&self) {
        let noscript_sel = self.doc.select("noscript:has(img:only-child)");
        for noscript_node in noscript_sel.nodes() {
            let Some(prev_sibling) = noscript_node.prev_element_sibling() else {
                continue;
            };
            let prev_sel = Selection::from(prev_sibling);
            let prev_img: NodeRef;
            if prev_sel.is("img") {
                prev_img = prev_sibling;
            } else if prev_sel.is("*:has( > img:only-child)") {
                let prev_sel_img = prev_sel.select("img:only-child");
                prev_img = prev_sel_img.nodes()[0];
            } else {
                continue;
            }
            let noscript_img_sel = Selection::from(*noscript_node).select("img");
            // at this point noscript_img_sel always has one element
            let new_img = &noscript_img_sel.nodes()[0];

            for attr in prev_img.attrs() {
                if attr.value.as_ref() == "" {
                    continue;
                }

                if matches!(attr.name.local.as_ref(), "src" | "srcset")
                    || IMG_EXT.iter().any(|p| attr.value.contains(p))
                {
                    if new_img.attr_or(&attr.name.local, "") == attr.value {
                        continue;
                    }
                    if new_img.has_attr(&attr.name.local) {
                        let attr_name = format!("data-old-{}", attr.name.local);
                        new_img.set_attr(&attr_name, &attr.value);
                    } else {
                        new_img.set_attr(&attr.name.local, &attr.value);
                    }
                }
            }
            prev_img.replace_with(new_img);
        }
    }

    /// Shared implementation behind [`Readability::parse`] and
    /// [`Readability::parse_with_policy`].
    ///
    /// # Arguments
    ///
    /// - `policy` -- when `Some`, a single `attempt_grab_article` call is made
    ///   against `self.doc` with that policy; when `None`, `grab_article` is
    ///   used and the document it returns replaces `self.doc`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `verify_doc`, and returns
    /// [`ReadabilityError::GrabFailed`] when no content could be extracted.
    fn parse_impl(&mut self, policy: Option<ParsePolicy>) -> Result<Article, ReadabilityError> {
        self.verify_doc()?;

        // Metadata is collected before `prepare` modifies the document.
        let ld_meta = if self.config.disable_json_ld {
            None
        } else {
            self.parse_json_ld()
        };
        let mut metadata = self.get_article_metadata(ld_meta);

        if metadata.byline.is_none() {
            metadata.byline = self.byline_adjustment();
        }

        self.prepare();

        if let Some(policy) = policy {
            // When using a specific policy, make a single attempt to extract content
            if self
                .attempt_grab_article(&self.doc, &policy.into(), &metadata)
                .is_none()
            {
                return Err(ReadabilityError::GrabFailed);
            }
        } else {
            // When no policy is specified, use the multi-attempt approach for better results
            let Some(doc) = self.grab_article(&metadata) else {
                return Err(ReadabilityError::GrabFailed);
            };
            self.doc = doc;
        }

        let root_sel = self.doc.select_single_matcher(&MATCHER_CONTENT_ID);

        let Some(root_node) = root_sel.nodes().first() else {
            // After `grab_article` has successfully returned a document,
            // `#readability-page-1` is expected to exist, so this is a safety net.
            return Err(ReadabilityError::GrabFailed);
        };

        metadata.dir = get_dir_attr(root_node);

        self.post_process_content(&root_sel);

        self.fix_relative_uris(&root_sel);

        // If we haven't found an excerpt in the article's metadata, use the article's
        // first paragraph as the excerpt. This is used for displaying a preview of
        // the article's content.

        if metadata.excerpt.is_none() {
            // TODO: Although this matches readability.js,
            // the procedure is far from perfect and requires improvement.
            metadata.excerpt = extract_excerpt(&root_sel);
        }

        // The text representation depends on the configured text mode.
        let text_content = match self.config.text_mode {
            TextMode::Raw => root_node.text(),
            TextMode::Formatted => root_node.formatted_text(),
            TextMode::Markdown => root_node.md(None),
        };
        // The length is counted in characters, not bytes.
        let text_length = text_content.chars().count();

        Ok(Article {
            title: metadata.title,
            byline: metadata.byline,
            dir: metadata.dir,
            lang: metadata.lang,
            content: root_sel.html(),
            text_content,
            length: text_length,
            excerpt: metadata.excerpt,
            site_name: metadata.site_name,
            published_time: metadata.published_time,
            modified_time: metadata.modified_time,
            image: metadata.image,
            favicon: metadata.favicon,
            url: metadata.url,
        })
    }

    /// Extracts the relevant content from the document and provides it as an [`Article`] object.
    ///
    /// This is the primary method of the crate. It performs the following steps:
    ///
    /// - Verifies the document (number of element nodes)
    /// - Extracts the metadata
    /// - Cleans the document
    /// - Extracts the main content of the document
    /// - Post-processes the content
    /// - Returns the content and the metadata as an [`Article`] object
    ///
    /// # Returns
    ///
    /// An [`Article`] object containing the content and the metadata.
    ///
    /// # Errors
    /// If `config.max_elements_to_parse` is > 0 and the document's number of element nodes exceeds this limit,
    /// a [`ReadabilityError::TooManyElements`] error is returned.
    /// If the content cannot be extracted from the document, a [`ReadabilityError::GrabFailed`] error is returned.
    pub fn parse(&mut self) -> Result<Article, ReadabilityError> {
        // No policy: let the implementation use the multi-attempt approach.
        self.parse_impl(None)
    }

    /// Extracts the relevant content from the document and provides it as an [`Article`] object.
    ///
    /// This method performs the same steps as [`Readability::parse`], but makes only one attempt with the specified [`ParsePolicy`].
    /// The results of this method are likely to be worse than those of [`Readability::parse`], but it consumes significantly
    /// less memory because it does not need to keep the best attempt.
    /// If you need more precise results, use [`Readability::parse`],
    /// as it sequentially applies all policies, from strict to raw.
    ///
    /// # Errors
    /// If `config.max_elements_to_parse` is > 0 and the document's number of element nodes exceeds this limit,
    /// a [`ReadabilityError::TooManyElements`] error is returned.
    /// If the content cannot be extracted from the document, a [`ReadabilityError::GrabFailed`] error is returned.
    pub fn parse_with_policy(&mut self, policy: ParsePolicy) -> Result<Article, ReadabilityError> {
        // A fixed policy: the implementation makes a single extraction attempt.
        self.parse_impl(Some(policy))
    }

    /// Searches the page for a JSON-LD block and extracts the metadata from it.
    ///
    /// Every element matched by `MATCHER_JSONLD` is examined in turn. A block
    /// is used when its `@context` (or `@context.@vocab`) passes
    /// `is_schema_org_url` and its `@type` contains one of
    /// `JSONLD_ARTICLE_TYPES`. The first block that yields non-empty metadata wins.
    ///
    /// # Returns
    ///
    /// A [Metadata] object containing the metadata extracted from the JSON-LD block.
    /// If no valid JSON-LD block is found, this method returns `None`.
    #[allow(clippy::too_many_lines)]
    pub fn parse_json_ld(&self) -> Option<Metadata> {
        for sel in self.doc.select_matcher(&MATCHER_JSONLD).iter() {
            let text = sel.text();
            let content = strip_cdata(&text);

            /*
               Because `gjson` reserves the "@" symbol for its own modifiers,
               it has to be replaced with another symbol before `gjson` can be used.
               The alternative is to drop `gjson` and use another crate.
               Note that every `"@` sequence is replaced, not only those in keys.
               TODO: don't leave it like this!
            */

            let content = content.trim().replace(r#""@"#, r#""^"#);

            let mut parsed = gjson::parse(&content);
            // Declared outside the loop below so that it outlives the re-parsed value
            // (presumably `parsed` borrows from the string it was parsed from).
            let clipped_content: String;

            // For a top-level array, narrow `parsed` down to the first item
            // whose `@type` is a string containing one of the article types.
            if parsed.kind() == gjson::Kind::Array {
                for it in &parsed.array() {
                    let typ = it.get("^type");
                    if typ.kind() == gjson::Kind::String
                        && JSONLD_ARTICLE_TYPES.iter().any(|p| typ.str().contains(p))
                    {
                        clipped_content = it.to_string();
                        parsed = gjson::parse(&clipped_content);
                        break;
                    }
                }
            }

            let mut context_matched = false;

            let context_val = parsed.get("^context");
            if context_val.kind() == gjson::Kind::String && is_schema_org_url(context_val.str()) {
                // validating @context
                context_matched = true;
            }

            let context_vocab = parsed.get("^context.^vocab");
            if context_vocab.kind() == gjson::Kind::String && is_schema_org_url(context_vocab.str())
            {
                // validating @context.@vocab
                context_matched = true;
            }

            // Neither form of the context refers to schema.org: skip this block.
            if !context_matched {
                continue;
            }

            // validating @type

            let mut article_type: Option<String> = None;

            let type_val = parsed.get("^type");
            // There are no examples with an @graph array, so it is not clear how to check it.
            // TODO: implement the same @graph logic as mozilla, once there are examples.
            if type_val.exists() {
                article_type = Some(type_val.str().to_string());
            } else {
                // NOTE(review): a `#.` path presumably yields an array, in which case
                // the `String` check below would not pass -- confirm against gjson.
                let type_val = parsed.get("^graph.#.^type");
                if matches!(type_val.kind(), gjson::Kind::String) {
                    article_type = Some(type_val.str().to_string());
                }
            }

            let Some(article_type) = article_type else {
                continue;
            };

            // Only article-like types are of interest.
            if !JSONLD_ARTICLE_TYPES
                .iter()
                .any(|p| article_type.contains(p))
            {
                continue;
            }

            // Title
            let name_val = parsed.get("name");
            let headline_val = parsed.get("headline");
            let name_is_string = matches!(name_val.kind(), gjson::Kind::String);
            let headline_is_string = matches!(headline_val.kind(), gjson::Kind::String);

            let name = if name_is_string {
                name_val.str().trim().to_string()
            } else {
                String::new()
            };

            let headline = if headline_is_string {
                headline_val.str().trim().to_string()
            } else {
                String::new()
            };

            let mut ld_meta = Metadata::default();

            if name_is_string && headline_is_string && name != headline {
                // Both are present and differ: prefer the headline only when it is
                // similar to the document title while the name is not.
                let title = self.get_article_title();
                let name_matches = text_similarity(&name, &title) > 0.75;
                let headline_matches = text_similarity(&headline, &title) > 0.75;
                if headline_matches && !name_matches {
                    ld_meta.title = headline;
                } else {
                    ld_meta.title = name;
                }
            } else if name_is_string {
                ld_meta.title = name;
            } else if headline_is_string {
                ld_meta.title = headline;
            }

            // Author

            let author_val = parsed.get("author");

            // A single author object contributes its `name`;
            // an array of authors contributes all names joined with ", ".
            let byline = match author_val.kind() {
                gjson::Kind::Object => Some(author_val.get("name").str().trim().to_string()),
                gjson::Kind::Array => {
                    let names: Vec<String> = author_val
                        .get("#.name")
                        .array()
                        .iter()
                        .map(|v| v.str().trim().to_string())
                        .collect();
                    Some(names.join(", "))
                }
                _ => None,
            };

            // An empty byline is treated as missing.
            if let Some(byline) = byline.filter(|s| !s.is_empty()) {
                ld_meta.byline = Some(byline);
            }

            // Description
            ld_meta.excerpt = get_json_ld_string_value(&parsed, "description");

            // Publisher
            ld_meta.site_name = get_json_ld_string_value(&parsed, "publisher.name");

            // DatePublished
            ld_meta.published_time = get_json_ld_string_value(&parsed, "datePublished");

            // DateModified
            ld_meta.modified_time = get_json_ld_string_value(&parsed, "dateModified");

            // Url
            ld_meta.url = get_json_ld_string_value(&parsed, "url");

            // Image
            ld_meta.image = get_json_ld_string_value(&parsed, "image");

            // Return the first block that yielded anything at all;
            // otherwise move on to the next JSON-LD block.
            if !ld_meta.is_empty() {
                return Some(ld_meta);
            }
        }
        None
    }

    /// Extracts metadata from a web page.
    ///
    /// This function takes into account standard metadata formats like `OpenGraph`, Dublin Core,
    /// schema.org, and also tries to extract some metadata from HTML tags like `<title>`.
    ///
    /// The function takes an optional `Metadata` object as input, which is used as a fallback
    /// if no metadata can be found on the page. If the input `Metadata` object contains any
    /// of the following fields, they will not be overwritten by this function:
    /// - `title`
    /// - `byline`
    /// - `excerpt`
    /// - `site_name`
    /// - `published_time`
    /// - `lang`
    ///
    /// # Arguments
    ///
    /// - `json_ld` -- An optional [`Metadata`] object, containing metadata extracted from JSON-LD.
    ///
    /// # Returns
    ///
    /// A [`Metadata`] object containing the extracted metadata.
    pub fn get_article_metadata(&self, json_ld: Option<Metadata>) -> Metadata {
        let mut values: HashMap<String, String> = HashMap::default();
        let mut metadata = json_ld.unwrap_or_default();

        let selection = self.doc.select_matcher(&MATCHER_META);

        for node in selection.nodes() {
            if let Some(content) = node.attr("content") {
                let content = content.trim();
                if content.is_empty() {
                    continue;
                }
                if let Some(mut property) = node.attr("property") {
                    property.make_ascii_lowercase();
                    if let Some(property_name) = meta_property_name(&property) {
                        let k = property_name.trim().to_string();
                        values.insert(k, content.into());
                    }
                    continue;
                }
                if let Some(mut name) = node.attr("name") {
                    name.make_ascii_lowercase();
                    if is_meta_name(&name) {
                        values.insert(normalize_meta_key(&name), content.into());
                    }
                }
            }
        }

        // title
        if metadata.title.is_empty() {
            if let Some(val) = get_map_any_value(&values, META_TITLE_KEYS) {
                metadata.title = val;
            }
        }

        if metadata.title.is_empty() {
            metadata.title = self.get_article_title().to_string();
        }

        // author
        if metadata.byline.is_none() {
            metadata.byline = get_map_any_value(&values, META_BYLINE_KEYS);
        }
        // if metadata is still none
        if metadata.byline.is_none() {
            if let Some(v) = values.get("article:author") {
                if !is_absolute_url(v, true) {
                    metadata.byline = Some(v.clone());
                }
            }
        }

        // description
        if metadata.excerpt.is_none() {
            metadata.excerpt = get_map_any_value(&values, META_EXCERPT_KEYS);
        }

        //site name
        if metadata.site_name.is_none() {
            metadata.site_name = values.get("og:site_name").cloned();
        }

        //published time
        if metadata.published_time.is_none() {
            metadata.published_time = get_map_any_value(&values, META_PUB_TIME_KEYS);
        }

        self.assign_extra_article_metadat

Download .txt

gitextract_hsrmhacy/

├── .cargo/
│   └── config.toml
├── .gitattributes
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── audit.yml
│       ├── benchmark.yml
│       ├── coverage.yml
│       ├── release.yml
│       ├── rust.yml
│       └── wasm.yml
├── .gitignore
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE
├── README.md
├── crates/
│   ├── bench/
│   │   ├── Cargo.toml
│   │   └── benches/
│   │       └── parse.rs
│   ├── cli/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── main.rs
│   ├── js/
│   │   ├── .gitignore
│   │   ├── Cargo.toml
│   │   ├── LICENSE_MIT
│   │   ├── README.md
│   │   ├── src/
│   │   │   ├── lib.rs
│   │   │   └── utils.rs
│   │   └── tests/
│   │       └── web.rs
│   └── lua/
│       ├── Cargo.toml
│       └── src/
│           └── lib.rs
├── deny.toml
├── src/
│   ├── ac_automat.rs
│   ├── config.rs
│   ├── glob.rs
│   ├── grab.rs
│   ├── grab_flags.rs
│   ├── helpers.rs
│   ├── lib.rs
│   ├── matching.rs
│   ├── prep_article.rs
│   ├── readability.rs
│   ├── readable.rs
│   ├── score.rs
│   ├── serde_helpers.rs
│   └── url_helpers.rs
├── test-pages/
│   ├── aclu_ld_meta.json
│   ├── alice-two-paragraphs.html
│   ├── alt/
│   │   ├── arstechnica/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   ├── hacker-news/
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   ├── mozilla_readability/
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   └── rust-blog/
│   │       ├── expected.md
│   │       ├── expected_alt.txt
│   │       └── source.html
│   ├── ld.json
│   ├── not-matching/
│   │   ├── empty-links/
│   │   │   ├── google-sre-book-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── lazy-image-2/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── yahoo-3/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   ├── redundant-class-page/
│   │   │   ├── nytimes-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── nytimes-2/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   ├── redundant-div/
│   │   │   ├── citylab-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── la-nacion/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── lwn-1/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── wapo-2/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   ├── redundant-font-attrs/
│   │   │   ├── clean-links/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── gmw/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── hukumusume/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── keep-tabular-data/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   ├── replace-font-tags/
│   │   │   │   ├── expected-metadata.json
│   │   │   │   ├── expected.html
│   │   │   │   └── source.html
│   │   │   └── table-style-attributes/
│   │   │       ├── expected-metadata.json
│   │   │       ├── expected.html
│   │   │       └── source.html
│   │   └── urls/
│   │       ├── 002/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── ietf-1/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── toc-missing/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── v8-blog/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── videos-1/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── wikia/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       ├── wikipedia/
│   │       │   ├── expected-metadata.json
│   │       │   ├── expected.html
│   │       │   └── source.html
│   │       └── wikipedia-2/
│   │           ├── expected-metadata.json
│   │           ├── expected.html
│   │           └── source.html
│   ├── ok/
│   │   ├── 001/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 002/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 003-metadata-preferred/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 004-metadata-space-separated-properties/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── 005-unescape-html-entities/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── aclu/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── aktualne/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── archive-of-our-own/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ars-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── base-url/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── base-url-base-element-relative/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── breitbart/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── citylab-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── clean-links/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── cnn/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── dev418/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ehow-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── engadget/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── gmw/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── hukumusume/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ietf-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── js-link-replacement/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── keep-tabular-data/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── la-nacion/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lwn-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medicalnewstoday/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medium-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── qq/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── replace-brs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── replace-font-tags/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── social-buttons/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── table-style-attributes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── tmz-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── toc-missing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── v8-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── videos-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikia/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikipedia/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikipedia-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   └── wikipedia-3/
│   │       ├── expected-metadata.json
│   │       ├── expected.html
│   │       └── source.html
│   ├── readability/
│   │   ├── article-author-tag/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── arxiv/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   ├── expected.md
│   │   │   ├── expected_alt.txt
│   │   │   └── source.html
│   │   ├── base-url-base-element/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── basic-tags-cleaning/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── bbc-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── blogger/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── bug-1255978/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── buzzfeed-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── cnet/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── cnet-svg-classes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── comment-inside-script-parsing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── daringfireball-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── data-url-image/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── dropbox-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ebb-org/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ehow-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── embedded-videos/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── firefox-nightly-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── folha/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── gitlab-blog/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── google-sre-book-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── guardian-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── heise/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── herald-sun-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── hidden-nodes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── iab-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── invalid-attributes/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── keep-images/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lazy-image-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lazy-image-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lazy-image-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lemonde-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── liberation-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lifehacker-post-comment-load/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── lifehacker-working/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── links-in-tables/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mathjax/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medium-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── medium-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mercurial/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── metadata-content-missing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── missing-paragraphs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mozilla-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── mozilla-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── msn/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── normalize-spaces/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── nytimes-5/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── ol/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── parsely-metadata/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── pixnet/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── quanta-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-aria-hidden/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-extra-brs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-extra-paragraphs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── remove-script-tags/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── reordering-paragraphs/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── royal-road/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── rtl-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── salon-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── schema-org-context-object/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── seattletimes-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── simplyfound-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── spiceworks/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── style-tags-removal/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── svg-parsing/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── telegraph/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── theverge/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── title-and-h1-discrepancy/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── title-en-dash/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── topicseed-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── tumblr/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── videos-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── visibility-hidden/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wapo-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wapo-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── webmd-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── webmd-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wikipedia-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── wordpress/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-1/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-2/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-3/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   ├── yahoo-4/
│   │   │   ├── expected-metadata.json
│   │   │   ├── expected.html
│   │   │   └── source.html
│   │   └── youth/
│   │       ├── expected-metadata.json
│   │       ├── expected.html
│   │       └── source.html
│   ├── rustwiki_2024.html
│   ├── rustwiki_2024_result.html
│   └── rustwiki_2024_result.txt
└── tests/
    ├── alt.rs
    ├── bad.rs
    ├── candidate_modes.rs
    ├── common.rs
    ├── favicon.rs
    ├── metadata.rs
    ├── parse_policy.rs
    ├── readability.rs
    ├── readability_ok.rs
    └── wasm.rs

Download .txt

SYMBOL INDEX (277 symbols across 29 files)

FILE: crates/bench/benches/parse.rs
  function dom_smoothie_parse (line 6) | fn dom_smoothie_parse(contents: &str, cfg: &Config) -> Result<Article, R...
  function bench_dom_smoothie_parse (line 11) | fn bench_dom_smoothie_parse(c: &mut Criterion) {
  function configure_criterion (line 44) | fn configure_criterion() -> Criterion {

FILE: crates/cli/src/main.rs
  type OutputFormat (line 37) | enum OutputFormat {
  type Cli (line 49) | struct Cli {
  type Metadata (line 95) | struct Metadata {
    method from (line 108) | fn from(value: &Article) -> Self {
  function read_input (line 125) | fn read_input(input: &Option<PathBuf>) -> Result<(String, Option<OsStrin...
  function main (line 147) | fn main() -> Result<(), Box<dyn Error>> {

FILE: crates/js/src/lib.rs
  type ParsePolicy (line 17) | pub enum ParsePolicy {
  function from (line 37) | fn from(val: ParsePolicy) -> Self {
  type Readability (line 49) | pub struct Readability(dom_smoothie::Readability);
    method new (line 73) | pub fn new(
    method parse (line 110) | pub fn parse(&mut self) -> Result<JsValue, JsError> {
    method parse_with_policy (line 127) | pub fn parse_with_policy(&mut self, policy: ParsePolicy) -> Result<JsV...
    method get_article_title (line 138) | pub fn get_article_title(&mut self) -> String {
    method parse_json_ld (line 149) | pub fn parse_json_ld(&mut self) -> JsValue {
    method get_article_metadata (line 177) | pub fn get_article_metadata(&mut self, json_ld: JsValue) -> JsValue {
    method is_probably_readable (line 190) | pub fn is_probably_readable(&mut self) -> bool {
  function parse (line 208) | pub fn parse(content: &str) -> Result<JsValue, JsError> {

FILE: crates/js/src/utils.rs
  function set_panic_hook (line 2) | pub fn set_panic_hook() {

FILE: crates/js/tests/web.rs
  function test_parse (line 12) | fn test_parse() {
  function test_parse_constructor (line 19) | fn test_parse_constructor() {

FILE: crates/lua/src/lib.rs
  type Readability (line 5) | pub struct Readability(dom_smoothie::Readability);
  method add_fields (line 8) | fn add_fields<F: LuaUserDataFields<Self>>(_fields: &mut F) {}
  method add_methods (line 10) | fn add_methods<M: LuaUserDataMethods<Self>>(methods: &mut M) {
  function dom_smoothie_module (line 46) | fn dom_smoothie_module(lua: &'_ Lua) -> LuaResult<LuaTable> {

FILE: src/ac_automat.rs
  function ac_automaton (line 13) | fn ac_automaton(patterns: &[&str]) -> AhoCorasick {

FILE: src/config.rs
  type CandidateSelectMode (line 14) | pub enum CandidateSelectMode {
  type TextMode (line 22) | pub enum TextMode {
  type Config (line 33) | pub struct Config {
  method default (line 65) | fn default() -> Self {
  type ParsePolicy (line 85) | pub enum ParsePolicy {
  function from (line 105) | fn from(val: ParsePolicy) -> Self {

FILE: src/grab.rs
  method grab_article (line 24) | pub(crate) fn grab_article(&self, metadata: &Metadata) -> Option<Documen...
  method attempt_grab_article (line 65) | pub(crate) fn attempt_grab_article<'a>(
  method handle_candidates (line 79) | fn handle_candidates<'a>(
  function is_unlikely_candidate (line 158) | fn is_unlikely_candidate(node: &NodeRef) -> bool {
  function div_into_p (line 181) | fn div_into_p(node: &NodeRef) {
  function wrap_phrasing_content (line 190) | fn wrap_phrasing_content<'a>(node: &NodeRef<'a>) -> Option<NodeRef<'a>> {
  function has_child_block_element (line 222) | fn has_child_block_element(node: &NodeRef) -> bool {
  function score_elements (line 229) | fn score_elements<'a>(
  function assign_article_node (line 308) | fn assign_article_node(tc: &NodeRef, article_content: &NodeRef) {
  function find_common_candidate (line 367) | fn find_common_candidate<'a>(
  function find_common_candidate_alt (line 423) | fn find_common_candidate_alt<'a>(
  function get_node_ancestors_set (line 472) | fn get_node_ancestors_set(node: &NodeRef) -> HashSet<NodeId> {
  function adjust_top_candidate_by_parent (line 485) | fn adjust_top_candidate_by_parent(
  function collect_elements_to_score (line 528) | fn collect_elements_to_score<'a>(
  function match_unlikely (line 615) | fn match_unlikely(haystack: &str) -> bool {
  function match_unlikely (line 628) | fn match_unlikely(haystack: &str) -> bool {
  function test_removing_probably_invisible_nodes (line 645) | fn test_removing_probably_invisible_nodes() {
  function test_remove_dialog (line 674) | fn test_remove_dialog() {
  function test_unlikely_roles (line 696) | fn test_unlikely_roles() {
  function test_remove_empty (line 718) | fn test_remove_empty() {
  function test_remove_title_duplicates (line 749) | fn test_remove_title_duplicates() {
  function test_remove_unlikely_candidates (line 768) | fn test_remove_unlikely_candidates() {
  function test_skip_ok_maybe_candidates (line 785) | fn test_skip_ok_maybe_candidates() {

FILE: src/grab_flags.rs
  function test_grab_flags (line 22) | fn test_grab_flags() {

FILE: src/helpers.rs
  function text_similarity (line 13) | pub(crate) fn text_similarity(text_a: &str, text_b: &str) -> f64 {
  function is_phrasing_content (line 39) | pub(crate) fn is_phrasing_content(node: &NodeRef) -> bool {
  function is_whitespace (line 61) | pub(crate) fn is_whitespace(node: &NodeRef) -> bool {
  function has_ancestor (line 69) | pub(crate) fn has_ancestor<F>(node: &NodeRef, max_depth: Option<usize>, ...
  function text_density (line 77) | pub(crate) fn text_density(node: &NodeRef, selector: &str, char_count: O...
  function normalize_spaces (line 97) | pub(crate) fn normalize_spaces(text: &str) -> String {
  function link_density_fn (line 111) | pub(crate) fn link_density_fn<F>(node: &NodeRef, char_count: Option<usiz...
  function link_density (line 139) | pub(crate) fn link_density(node: &NodeRef, char_count: Option<usize>) ->...
  function single_child_element (line 145) | pub(crate) fn single_child_element<'a>(node: &NodeRef<'a>, tag: &str) ->...
  function is_element_without_content (line 163) | pub(crate) fn is_element_without_content(node: &NodeRef) -> bool {
  function get_dir_attr (line 179) | pub(crate) fn get_dir_attr(node: &NodeRef) -> Option<String> {
  function node_name_in (line 193) | pub(crate) fn node_name_in(node: &NodeRef, names: &phf::Set<&str>) -> bo...
  function is_probably_visible (line 198) | pub(crate) fn is_probably_visible(node: &NodeRef) -> bool {
  function get_node_matching_string (line 208) | pub(crate) fn get_node_matching_string(node: &NodeRef) -> StrTendril {
  function next_child_or_sibling (line 226) | pub(crate) fn next_child_or_sibling<'a>(
  type BytePatternCheck (line 252) | pub(crate) struct BytePatternCheck<'a> {
  function new (line 259) | pub(crate) fn new(haystack: &'a str) -> Self {
  function pre_check (line 268) | fn pre_check(&self, pat: &str) -> bool {
  function contains_any (line 278) | pub(crate) fn contains_any(&self, pats: &[&str]) -> bool {
  type CharCounterCache (line 285) | pub(crate) struct CharCounterCache {
    method char_count (line 290) | pub(crate) fn char_count(&mut self, node: &NodeRef) -> usize {
  constant FLOAT_TOLERANCE (line 303) | const FLOAT_TOLERANCE: f64 = 0.00001;
  function test_text_similarity (line 306) | fn test_text_similarity() {
  function test_text_similarity_contains (line 314) | fn test_text_similarity_contains() {
  function test_text_similarity_similar (line 322) | fn test_text_similarity_similar() {
  function test_normalize_spaces (line 330) | fn test_normalize_spaces() {
  function test_ascii_pattern_check (line 339) | fn test_ascii_pattern_check() {

FILE: src/lib.rs
  type ReadabilityError (line 27) | pub enum ReadabilityError {

FILE: src/matching.rs
  function is_invisible_style (line 11) | pub(crate) fn is_invisible_style(node: &NodeRef) -> bool {
  function style_has_kv (line 20) | fn style_has_kv(style: &str, key: &str, val: &str) -> bool {
  function strip_cdata (line 40) | pub(crate) fn strip_cdata(content: &StrTendril) -> &str {
  function is_schema_org_url (line 47) | pub(crate) fn is_schema_org_url(url: &str) -> bool {
  function is_video_url (line 53) | pub(crate) fn is_video_url(haystack: &str) -> bool {
  function is_sentence (line 66) | pub(crate) fn is_sentence(text: &str) -> bool {
  function contains_one_of_words (line 70) | pub(crate) fn contains_one_of_words(haystack: &str, words: &[&str]) -> b...
  function is_img_attr_to_srcset (line 77) | pub(crate) fn is_img_attr_to_srcset(s: &str) -> bool {
  function is_img_attr_to_src (line 95) | pub(crate) fn is_img_attr_to_src(s: &str) -> bool {
  function truncate_title_last (line 102) | pub(crate) fn truncate_title_last(title: &str) -> Option<&str> {
  function truncate_title_first (line 117) | pub(crate) fn truncate_title_first(title: &str) -> Option<&str> {
  function is_meta_name (line 131) | pub(crate) fn is_meta_name(name: &str) -> bool {
  function meta_property_name (line 138) | pub(crate) fn meta_property_name(property: &str) -> Option<&str> {
  function is_loading_word (line 155) | pub(crate) fn is_loading_word(text: &str) -> bool {
  function contains_share_elements (line 160) | pub(crate) fn contains_share_elements(value: &str) -> bool {
  function split_base64_url (line 166) | pub(crate) fn split_base64_url(src: &str) -> Option<(&str, &str)> {
  function test_meta_property_name (line 186) | fn test_meta_property_name() {
  function test_is_meta_name (line 195) | fn test_is_meta_name() {
  function test_truncate_title_first (line 204) | fn test_truncate_title_first() {
  function test_truncate_title_last (line 212) | fn test_truncate_title_last() {
  function test_is_img_attr_to_src (line 224) | fn test_is_img_attr_to_src() {
  function test_contains_one_of_words (line 232) | fn test_contains_one_of_words() {
  function test_strip_cdata (line 252) | fn test_strip_cdata() {
  function test_is_schema_org_url (line 275) | fn test_is_schema_org_url() {
  function test_is_video_url (line 292) | fn test_is_video_url() {
  function test_split_base64_url (line 316) | fn test_split_base64_url() {

FILE: src/prep_article.rs
  function clean (line 14) | fn clean(root_sel: &Selection) {
  function clean_styles (line 35) | fn clean_styles(n: &NodeRef) {
  function should_clean_conditionally (line 51) | fn should_clean_conditionally(node: &NodeRef, flags: &FlagSet<GrabFlags>...
  function clean_conditionally (line 186) | fn clean_conditionally(sel: &Selection, tags: &str, flags: &FlagSet<Grab...
  function set_data_readability_table (line 201) | fn set_data_readability_table(n: &NodeRef, is_data_table: bool) {
  function get_row_and_col_count (line 209) | fn get_row_and_col_count(table: &Selection) -> (usize, usize) {
  function mark_data_tables (line 229) | fn mark_data_tables(base_sel: &Selection) {
  function fix_lazy_images (line 274) | fn fix_lazy_images(sel: &Selection) {
  function clean_headers (line 345) | fn clean_headers(sel: &Selection, flags: &FlagSet<GrabFlags>) {
  function prep_article (line 353) | pub(crate) fn prep_article(article_node: &NodeRef, flags: &FlagSet<GrabF...
  function fix_single_cell_tables (line 408) | fn fix_single_cell_tables(sel: &Selection) {
  function remove_share_elements (line 427) | fn remove_share_elements(root_sel: &Selection, share_element_threshold: ...

FILE: src/readability.rs
  type Article (line 22) | pub struct Article {
  type Metadata (line 71) | pub struct Metadata {
    method is_empty (line 86) | fn is_empty(&self) -> bool {
    method unescape_html_entities (line 100) | fn unescape_html_entities(&mut self) {
  type Readability (line 113) | pub struct Readability {
    method from (line 123) | fn from(html: T) -> Self {
    method new (line 148) | pub fn new<T: Into<StrTendril>>(
    method with_document (line 171) | pub fn with_document(
    method prepare (line 203) | fn prepare(&mut self) {
    method get_article_title (line 233) | pub fn get_article_title(&self) -> StrTendril {
    method replace_fonts (line 309) | fn replace_fonts(&mut self) {
    method replace_brs (line 315) | fn replace_brs(&mut self) {
    method remove_empty_imgs (line 370) | fn remove_empty_imgs(&mut self) {
    method unwrap_noscript_images (line 389) | fn unwrap_noscript_images(&self) {
    method parse_impl (line 432) | fn parse_impl(&mut self, policy: Option<ParsePolicy>) -> Result<Articl...
    method parse (line 532) | pub fn parse(&mut self) -> Result<Article, ReadabilityError> {
    method parse_with_policy (line 548) | pub fn parse_with_policy(&mut self, policy: ParsePolicy) -> Result<Art...
    method parse_json_ld (line 560) | pub fn parse_json_ld(&self) -> Option<Metadata> {
    method get_article_metadata (line 740) | pub fn get_article_metadata(&self, json_ld: Option<Metadata>) -> Metad...
    method assign_extra_article_metadata (line 816) | fn assign_extra_article_metadata(
    method byline_adjustment (line 834) | fn byline_adjustment(&self) -> Option<String> {
    method remove_comments (line 855) | fn remove_comments(&self) {
    method get_html_lang (line 868) | fn get_html_lang(&self) -> Option<StrTendril> {
    method post_process_content (line 873) | fn post_process_content(&self, root_sel: &Selection) {
    method clean_classes (line 889) | fn clean_classes(&self, sel: &Selection) {
    method verify_doc (line 925) | fn verify_doc(&self) -> Result<(), ReadabilityError> {
    method is_probably_readable (line 950) | pub fn is_probably_readable(&self) -> bool {
    method parse_base_url (line 961) | fn parse_base_url(&self) -> Option<String> {
    method fix_relative_uris (line 975) | fn fix_relative_uris(&self, root_sel: &Selection) {
  function get_map_any_value (line 1024) | fn get_map_any_value(map: &HashMap<String, String>, keys: &[&str]) -> Op...
  function next_significant_node (line 1028) | fn next_significant_node(node: Option<NodeRef>) -> Option<NodeRef> {
  function fix_links (line 1040) | fn fix_links(root_sel: &Selection) {
  function simplify_nested_elements (line 1069) | fn simplify_nested_elements(root_sel: &Selection) {
  function simplify_nested_divs (line 1081) | fn simplify_nested_divs(root_sel: &Selection) {
  function extract_excerpt (line 1107) | fn extract_excerpt(sel: &Selection) -> Option<String> {
  function normalize_meta_key (line 1116) | fn normalize_meta_key(raw_key: &str) -> String {
  function get_json_ld_string_value (line 1124) | fn get_json_ld_string_value(value: &gjson::Value, path: &str) -> Option<...
  function extract_favicon (line 1135) | fn extract_favicon(root_node: &Document, base_url: Option<String>) -> Op...
  function decode_html_entities (line 1192) | fn decode_html_entities(s: &mut String) {
  function decode_opt_html_entities (line 1199) | fn decode_opt_html_entities(opt: &mut Option<String>) {
  function is_valid_byline (line 1205) | fn is_valid_byline(node: &NodeRef) -> bool {
  function test_simplify_nested_divs (line 1225) | fn test_simplify_nested_divs() {
  function test_replace_font_tags (line 1247) | fn test_replace_font_tags() {
  function test_remove_unwanted_urls (line 1267) | fn test_remove_unwanted_urls() {
  function test_get_title (line 1282) | fn test_get_title() {
  function test_normalize_spaces (line 1293) | fn test_normalize_spaces() {
  function test_parse_json_ld (line 1301) | fn test_parse_json_ld() {
  function test_disable_sparse_json_ld (line 1314) | fn test_disable_sparse_json_ld() {
  function test_max_elements (line 1339) | fn test_max_elements() {
  function test_get_article_metadata_without_json_ld (line 1363) | fn test_get_article_metadata_without_json_ld() {
  function test_base_uri (line 1386) | fn test_base_uri() {
  function test_get_title_with_separator_only (line 1402) | fn test_get_title_with_separator_only() {
  function test_bad_document_url (line 1414) | fn test_bad_document_url() {
  function test_parse_base_url_with_doc_and_base (line 1428) | fn test_parse_base_url_with_doc_and_base() {
  function test_fix_relative_uris_srcset_without_descriptor (line 1445) | fn test_fix_relative_uris_srcset_without_descriptor() {
  function test_metadata_is_empty (line 1472) | fn test_metadata_is_empty() {
  function test_consume_byline (line 1489) | fn test_consume_byline() {
  function test_skipping_byline (line 1509) | fn test_skipping_byline() {

FILE: src/readable.rs
  function is_probably_readable (line 19) | pub fn is_probably_readable(

FILE: src/score.rs
  function get_node_score (line 7) | pub(crate) fn get_node_score(node: &NodeRef) -> f32 {
  function has_node_score (line 13) | pub(crate) fn has_node_score(node: &NodeRef) -> bool {
  function set_node_score (line 17) | pub(crate) fn set_node_score(node: &NodeRef, score: f32) {
  function init_node_score (line 21) | pub(crate) fn init_node_score(node: &NodeRef, weigh_classes: bool) -> f32 {
  function determine_node_score (line 27) | pub(crate) fn determine_node_score(node: &NodeRef, weigh_classes: bool) ...
  function get_class_weight (line 43) | pub(crate) fn get_class_weight(node: &NodeRef, weigh_classes: bool) -> f...
  function determine_attr_weight (line 66) | fn determine_attr_weight(attr: &str) -> f32 {
  function determine_attr_weight (line 81) | fn determine_attr_weight(attr: &str) -> f32 {
  function score_text_content (line 94) | pub(crate) fn score_text_content(node: &NodeRef) -> usize {

FILE: src/serde_helpers.rs
  function serialize_str_tendril (line 4) | pub fn serialize_str_tendril<S>(value: &StrTendril, serializer: S) -> Re...
  function deserialize_str_tendril (line 11) | pub fn deserialize_str_tendril<'de, D>(deserializer: D) -> Result<StrTen...

FILE: src/url_helpers.rs
  function delimiter (line 1) | fn delimiter(s: &str) -> &str {
  function is_absolute_url (line 22) | pub(crate) fn is_absolute_url(s: &str, strict: bool) -> bool {
  function to_absolute_url (line 42) | pub(crate) fn to_absolute_url(raw_url: &str, base_uri: &str) -> String {
  function url_join (line 51) | pub(crate) fn url_join(base: &str, relative: &str) -> String {
  function test_valid_urls (line 123) | fn test_valid_urls() {
  function test_invalid_urls (line 133) | fn test_invalid_urls() {
  function test_url_join (line 148) | fn test_url_join() {

FILE: tests/alt.rs
  function test_alt_formatted_last_fail (line 9) | fn test_alt_formatted_last_fail() {
  function table_test_alt_formatted_text (line 15) | fn table_test_alt_formatted_text() {
  function table_test_alt_markdown (line 25) | fn table_test_alt_markdown() {

FILE: tests/bad.rs
  function test_skip_body_ancestor (line 8) | fn test_skip_body_ancestor() {
  function test_skip_body_ancestor_fragment (line 37) | fn test_skip_body_ancestor_fragment() {
  function test_fragments (line 57) | fn test_fragments() {
  function test_frameset_fail (line 81) | fn test_frameset_fail() {

FILE: tests/candidate_modes.rs
  function test_candidates (line 8) | fn test_candidates() {

FILE: tests/common.rs
  type ExpectedMetadata (line 47) | struct ExpectedMetadata {
  type TestData (line 59) | pub struct TestData {
    method new (line 67) | pub fn new<P>(test_path: P, source_contents: String, expected_contents...
    method from_path (line 84) | pub fn from_path<P>(
  function test_alt_text (line 102) | pub(crate) fn test_alt_text(data: TestData, text_mode: TextMode) {
  function test_readability (line 118) | pub(crate) fn test_readability(data: TestData) {
  function test_metadata (line 155) | pub fn test_metadata(data: TestData, host: Option<&str>) {
  function test_favicon (line 188) | pub fn test_favicon<P>(test_path: P, host: Option<&str>, expected: Optio...

FILE: tests/favicon.rs
  function test_favicon_aclu (line 6) | fn test_favicon_aclu() {
  function test_favicon_aktualne (line 15) | fn test_favicon_aktualne() {
  function test_favicon_breitbart (line 24) | fn test_favicon_breitbart() {
  function test_favicon_engadget (line 34) | fn test_favicon_engadget() {
  function test_favicon_folha (line 43) | fn test_favicon_folha() {
  function test_favicon_gitlab_blog (line 52) | fn test_favicon_gitlab_blog() {
  function test_favicon_gitlab_nytimes_2 (line 61) | fn test_favicon_gitlab_nytimes_2() {
  function test_favicon_schema_org_context (line 70) | fn test_favicon_schema_org_context() {
  function test_favicon_topicseed_1 (line 79) | fn test_favicon_topicseed_1() {
  function test_favicon_yahoo_1 (line 88) | fn test_favicon_yahoo_1() {

FILE: tests/metadata.rs
  function test_metadata_last_fail (line 9) | fn test_metadata_last_fail() {
  function table_test_metadata (line 20) | fn table_test_metadata() {

FILE: tests/parse_policy.rs
  function hash_text (line 10) | fn hash_text<T: Hash>(text: &T) -> u64 {
  function test_parse_with_policy (line 18) | pub(crate) fn test_parse_with_policy() -> Result<(), Box<dyn Error>> {
  function test_parse_with_policy_fail (line 44) | pub(crate) fn test_parse_with_policy_fail() -> Result<(), Box<dyn Error>> {

FILE: tests/readability.rs
  function table_test_readability (line 9) | fn table_test_readability() {
  function test_serde (line 21) | fn test_serde() {

FILE: tests/readability_ok.rs
  function test_001 (line 7) | fn test_001() {
  function test_002 (line 12) | fn test_002() {
  function test_003 (line 17) | fn test_003() {
  function test_004 (line 25) | fn test_004() {
  function test_005 (line 33) | fn test_005() {
  function test_aclu (line 41) | fn test_aclu() {
  function test_aktualne (line 46) | fn test_aktualne() {
  function test_archive_of_our_own (line 51) | fn test_archive_of_our_own() {
  function test_ars_1 (line 59) | fn test_ars_1() {
  function test_base_url (line 64) | fn test_base_url() {
  function test_base_url_base_element_relative (line 69) | fn test_base_url_base_element_relative() {
  function test_breitbart (line 77) | fn test_breitbart() {
  function test_clean_links (line 82) | fn test_clean_links() {
  function test_cnn (line 87) | fn test_cnn() {
  function test_ehow_1 (line 92) | fn test_ehow_1() {
  function test_js_link_replacement (line 97) | fn test_js_link_replacement() {
  function test_keep_tabular_data (line 105) | fn test_keep_tabular_data() {
  function test_medicalnewstoday (line 113) | fn test_medicalnewstoday() {
  function test_medium_3 (line 121) | fn test_medium_3() {
  function test_qq (line 126) | fn test_qq() {
  function test_replace_brs (line 131) | fn test_replace_brs() {
  function test_social_buttons (line 136) | fn test_social_buttons() {
  function test_tmz_1 (line 141) | fn test_tmz_1() {
  function test_wikia (line 146) | fn test_wikia() {
  function test_wikipedia (line 151) | fn test_wikipedia() {
  function test_gmw (line 156) | fn test_gmw() {
  function test_videos_1 (line 161) | fn test_videos_1() {
  function test_v8_blog (line 166) | fn test_v8_blog() {
  function test_lwn_1 (line 171) | fn test_lwn_1() {
  function test_ietf_1 (line 176) | fn test_ietf_1() {
  function test_toc_missing (line 181) | fn test_toc_missing() {
  function test_table_style_attributes (line 186) | fn test_table_style_attributes() {
  function test_dev418 (line 194) | fn test_dev418() {
  function test_citylab_1 (line 199) | fn test_citylab_1() {
  function test_lemonde_1 (line 204) | fn test_lemonde_1() {
  function test_hukumusume (line 212) | fn test_hukumusume() {
  function test_engadget (line 217) | fn test_engadget() {
  function test_la_nacion (line 223) | fn test_la_nacion() {
  function test_wikipedia_3 (line 229) | fn test_wikipedia_3() {
  function test_wikipedia_2 (line 235) | fn test_wikipedia_2() {
  function arstechnica (line 241) | fn arstechnica() {

FILE: tests/wasm.rs
  function test_readability_wikipedia_2 (line 11) | fn test_readability_wikipedia_2() {
  function test_metadata_wikipedia_2 (line 21) | fn test_metadata_wikipedia_2() {

Copy disabled (too large) Download .json

Condensed preview — 536 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (39,187K chars).

[
  {
    "path": ".cargo/config.toml",
    "chars": 68,
    "preview": "[target.wasm32-unknown-unknown]\nrunner = 'wasm-bindgen-test-runner'\n"
  },
  {
    "path": ".gitattributes",
    "chars": 39,
    "preview": "\ntest-pages/** linguist-generated=true\n"
  },
  {
    "path": ".github/dependabot.yml",
    "chars": 527,
    "preview": "# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where "
  },
  {
    "path": ".github/workflows/audit.yml",
    "chars": 860,
    "preview": "name: Rust Audit\n\non:\n  push:\n    branches: [ \"main\" ]\n  pull_request:\n    branches: [ \"main\" ]\n\nenv:\n  CARGO_TERM_COLOR"
  },
  {
    "path": ".github/workflows/benchmark.yml",
    "chars": 1805,
    "preview": "name: Benchmark\n\non:\n  push:\n    branches: [main]\n\npermissions:\n  # deployments permission to deploy GitHub pages websit"
  },
  {
    "path": ".github/workflows/coverage.yml",
    "chars": 669,
    "preview": "name: Coverage\n\non:\n  push:\n    branches: [ \"main\" ]\n  pull_request:\n    branches: [ \"main\" ]\n\njobs:\n  coverage:\n    run"
  },
  {
    "path": ".github/workflows/release.yml",
    "chars": 1561,
    "preview": "name: Build and Release Binaries\n\non:\n  release:\n    types: [published]\n\n  workflow_dispatch:\n\npermissions:\n  contents: "
  },
  {
    "path": ".github/workflows/rust.yml",
    "chars": 869,
    "preview": "name: Rust CI\n\non:\n  push:\n    branches: [ \"main\" ]\n  pull_request:\n    branches: [ \"main\" ]\n\nenv:\n  CARGO_TERM_COLOR: a"
  },
  {
    "path": ".github/workflows/wasm.yml",
    "chars": 874,
    "preview": "name: wasm ci\n\non:\n  push:\n    branches: [ \"main\", \"feature/*\" ]\n  pull_request:\n    branches: [ \"main\" ]\n\nenv:\n  CARGO_"
  },
  {
    "path": ".gitignore",
    "chars": 31,
    "preview": "/target\n/examples\n*.js\n**/draft"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 12787,
    "preview": "# Changelog\n\nAll notable changes to the `dom_smoothie` crate will be documented in this file.\n\n## [Unreleased]\n### Chang"
  },
  {
    "path": "Cargo.toml",
    "chars": 2001,
    "preview": "\n[workspace.package]\nversion = \"0.17.0\"\nedition = \"2021\"\nlicense = \"MIT\"\nrust-version = \"1.75\"\nrepository = \"https://git"
  },
  {
    "path": "LICENSE",
    "chars": 1071,
    "preview": "MIT License\n\nCopyright (c) 2024 Mykola Humanov\n\nPermission is hereby granted, free of charge, to any person obtaining a "
  },
  {
    "path": "README.md",
    "chars": 14406,
    "preview": "# DOM_SMOOTHIE\n\n[![Crates.io version](https://img.shields.io/crates/v/dom_smoothie.svg?style=flat)](https://crates.io/cr"
  },
  {
    "path": "crates/bench/Cargo.toml",
    "chars": 358,
    "preview": "[package]\nname = \"dom-smoothie-bench\"\nversion.workspace = true\nedition.workspace = true\nlicense.workspace = true\nrust-ve"
  },
  {
    "path": "crates/bench/benches/parse.rs",
    "chars": 1747,
    "preview": "use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};\nuse std::hint::black_box;\n\nuse dom_smoothie::{"
  },
  {
    "path": "crates/cli/Cargo.toml",
    "chars": 444,
    "preview": "[package]\nname = \"dom_smoothie_cli\"\nversion.workspace = true\nedition.workspace = true\nlicense.workspace = true\nrust-vers"
  },
  {
    "path": "crates/cli/src/main.rs",
    "chars": 8480,
    "preview": "//! This is a reference implementation of a CLI tool for the `dom_smoothie` crate.\n//!\n//! The tool processes an HTML do"
  },
  {
    "path": "crates/js/.gitignore",
    "chars": 54,
    "preview": "/target\n**/*.rs.bk\nCargo.lock\nbin/\npkg/\nwasm-pack.log\n"
  },
  {
    "path": "crates/js/Cargo.toml",
    "chars": 1316,
    "preview": "[package]\nname = \"dom-smoothie-js\"\nversion.workspace = true\nedition.workspace = true\nlicense.workspace = true\nrust-versi"
  },
  {
    "path": "crates/js/LICENSE_MIT",
    "chars": 1071,
    "preview": "MIT License\n\nCopyright (c) 2025 Mykola Humanov\n\nPermission is hereby granted, free of charge, to any person obtaining a "
  },
  {
    "path": "crates/js/README.md",
    "chars": 9713,
    "preview": "# DOM-SMOOTHIE-JS\n> `dom-smoothie-js` is a nodejs package for extracting readable content from web pages. \n> It is a wra"
  },
  {
    "path": "crates/js/src/lib.rs",
    "chars": 7677,
    "preview": "mod utils;\n\nuse cfg_if::cfg_if;\nuse wasm_bindgen::prelude::*;\n\ncfg_if! {\n    if #[cfg(all(feature = \"lol_alloc\", target_"
  },
  {
    "path": "crates/js/src/utils.rs",
    "chars": 466,
    "preview": "#![allow(dead_code)]\npub fn set_panic_hook() {\n    // When the `console_error_panic_hook` feature is enabled, we can cal"
  },
  {
    "path": "crates/js/tests/web.rs",
    "chars": 702,
    "preview": "//! Test suite for the Web and headless browsers.\n\n#![cfg(target_arch = \"wasm32\")]\n\nextern crate wasm_bindgen_test;\nuse "
  },
  {
    "path": "crates/lua/Cargo.toml",
    "chars": 383,
    "preview": "[package]\nname = \"dom-smoothie-lua\"\nversion.workspace = true\nedition.workspace = true\nlicense.workspace = true\nrust-vers"
  },
  {
    "path": "crates/lua/src/lib.rs",
    "chars": 3873,
    "preview": "use dom_smoothie;\nuse mlua::prelude::*;\nuse mlua::LuaSerdeExt;\n\npub struct Readability(dom_smoothie::Readability);\n\nimpl"
  },
  {
    "path": "deny.toml",
    "chars": 11051,
    "preview": "# This template contains all of the possible sections and their default values\n\n# Note that all fields that take a lint "
  },
  {
    "path": "src/ac_automat.rs",
    "chars": 762,
    "preview": "use aho_corasick::{AhoCorasick, AhoCorasickKind};\nuse once_cell::sync::Lazy;\n\nuse crate::glob::{CLASSES_NEGATIVE, CLASSE"
  },
  {
    "path": "src/config.rs",
    "chars": 4464,
    "preview": "use flagset::FlagSet;\n\nuse crate::{\n    glob::{MIN_CONTENT_LENGTH, MIN_SCORE},\n    grab_flags::GrabFlags,\n};\n\npub(crate)"
  },
  {
    "path": "src/glob.rs",
    "chars": 9967,
    "preview": "use dom_query::{mini_selector::MiniSelector, Matcher};\nuse once_cell::sync::Lazy;\nuse phf::phf_set;\n\nmacro_rules! lazy_m"
  },
  {
    "path": "src/grab.rs",
    "chars": 28604,
    "preview": "use dom_query::Tree;\nuse foldhash::{HashMap, HashSet};\nuse std::vec;\n\nuse dom_query::{Document, NodeId, NodeRef};\nuse fl"
  },
  {
    "path": "src/grab_flags.rs",
    "chars": 831,
    "preview": "use flagset::flags;\n\nflags! {\n    /// Flags for the grab function, controlling different heuristics for content extracti"
  },
  {
    "path": "src/helpers.rs",
    "chars": 9836,
    "preview": "use std::collections::HashSet;\n\nuse foldhash::HashMap;\nuse unicode_segmentation::UnicodeSegmentation;\n\nuse dom_query::{N"
  },
  {
    "path": "src/lib.rs",
    "chars": 779,
    "preview": "mod config;\nmod glob;\nmod grab;\nmod grab_flags;\nmod helpers;\nmod matching;\nmod prep_article;\nmod readability;\nmod readab"
  },
  {
    "path": "src/matching.rs",
    "chars": 10982,
    "preview": "//! Functions below replace regex-based validation with explicit string matching\n//! for better maintainability and perf"
  },
  {
    "path": "src/prep_article.rs",
    "chars": 15481,
    "preview": "use dom_query::{NodeRef, Selection};\nuse flagset::FlagSet;\n\n#[allow(clippy::wildcard_imports)]\nuse crate::glob::*;\nuse c"
  },
  {
    "path": "src/readability.rs",
    "chars": 51663,
    "preview": "use dom_query::local_name;\nuse dom_query::{Document, NodeRef, Selection};\nuse foldhash::HashMap;\nuse tendril::StrTendril"
  },
  {
    "path": "src/readable.rs",
    "chars": 1955,
    "preview": "use dom_query::Document;\n\n#[allow(clippy::wildcard_imports)]\nuse crate::glob::*;\nuse crate::helpers::{get_node_matching_"
  },
  {
    "path": "src/score.rs",
    "chars": 2923,
    "preview": "use dom_query::{NodeData, NodeRef};\n\n#[allow(clippy::wildcard_imports)]\nuse crate::glob::*;\nuse crate::matching::contain"
  },
  {
    "path": "src/serde_helpers.rs",
    "chars": 454,
    "preview": "use serde::{Deserializer, Serializer};\nuse tendril::StrTendril;\n\npub fn serialize_str_tendril<S>(value: &StrTendril, ser"
  },
  {
    "path": "src/url_helpers.rs",
    "chars": 6901,
    "preview": "fn delimiter(s: &str) -> &str {\n    let mut count = 0;\n    for (i, c) in s.char_indices() {\n        if c == ':' || c == "
  },
  {
    "path": "test-pages/aclu_ld_meta.json",
    "chars": 491,
    "preview": "{\n  \"title\": \"Facebook Is Tracking Me Even Though I’m Not on Facebook\",\n  \"byline\": \"Daniel Kahn Gillmor\",\n  \"excerpt\": "
  },
  {
    "path": "test-pages/alice-two-paragraphs.html",
    "chars": 1735,
    "preview": "<!doctype html>\n<html>\n    <head>\n        <title>ALICE'S ADVENTURES IN WONDERLAND</title>\n    </head>\n    <body>\n       "
  },
  {
    "path": "test-pages/alt/arstechnica/expected-metadata.json",
    "chars": 332,
    "preview": "{\n  \"title\": \"Camera owner asks Canon, skies: Why is it $5/month for webcam software?\",\n  \"byline\": \"Kevin Purdy\",\n  \"di"
  },
  {
    "path": "test-pages/alt/arstechnica/expected.html",
    "chars": 3684,
    "preview": "<div id=\"readability-page-1\" class=\"page\"><div>\n          \n          \n  <figure>\n      <div>\n              <p><a data-ps"
  },
  {
    "path": "test-pages/alt/arstechnica/expected.md",
    "chars": 4181,
    "preview": "Ownership, now that's a tricky word\n\nJust because it's a good rig doesn't mean you can use it on Zoom\\.\n\nThe Canon Power"
  },
  {
    "path": "test-pages/alt/arstechnica/expected_alt.txt",
    "chars": 3663,
    "preview": "Ownership, now that's a tricky word\n\nJust because it's a good rig doesn't mean you can use it on Zoom.\n\nThe Canon PowerS"
  },
  {
    "path": "test-pages/alt/arstechnica/source.html",
    "chars": 133829,
    "preview": "<!doctype html>\n<html lang=\"en-US\" class=\"view-grid\">\n\n<head>\n  <meta charset=\"utf-8\">\n  <meta name=\"viewport\" content=\""
  },
  {
    "path": "test-pages/alt/hacker-news/expected.md",
    "chars": 9412,
    "preview": "1\\. [Steam Brick: No screen, no controller, just a power button and a USB port](https://crastinator-pro.github.io/steam-"
  },
  {
    "path": "test-pages/alt/hacker-news/expected_alt.txt",
    "chars": 3801,
    "preview": "1. Steam Brick: No screen, no controller, just a power button and a USB port (crastinator-pro.github.io)\n568 points by s"
  },
  {
    "path": "test-pages/alt/hacker-news/source.html",
    "chars": 36171,
    "preview": "<html lang=\"en\" op=\"news\"><head><meta name=\"referrer\" content=\"origin\"><meta name=\"viewport\" content=\"width=device-width"
  },
  {
    "path": "test-pages/alt/mozilla_readability/expected.md",
    "chars": 7447,
    "preview": "## Readability\\.js\n\nA standalone version of the readability library used for [Firefox Reader View](https://support.mozil"
  },
  {
    "path": "test-pages/alt/mozilla_readability/expected_alt.txt",
    "chars": 6646,
    "preview": "Readability.js\n\nA standalone version of the readability library used for Firefox Reader View.\n\nInstallation\n\nReadability"
  },
  {
    "path": "test-pages/alt/mozilla_readability/source.html",
    "chars": 348048,
    "preview": "\n\n\n\n\n\n\n<!DOCTYPE html>\n<html\n  lang=\"en\"\n  \n  data-color-mode=\"auto\" data-light-theme=\"light\" data-dark-theme=\"dark\"\n  d"
  },
  {
    "path": "test-pages/alt/rust-blog/expected.md",
    "chars": 11564,
    "preview": "The Rust team is happy to announce a new version of Rust, 1\\.84\\.0\\. Rust is a programming language empowering everyone "
  },
  {
    "path": "test-pages/alt/rust-blog/expected_alt.txt",
    "chars": 6666,
    "preview": "The Rust team is happy to announce a new version of Rust, 1.84.0. Rust is a programming language empowering everyone to "
  },
  {
    "path": "test-pages/alt/rust-blog/source.html",
    "chars": 21669,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"utf-8\">\n    <title>Announcing Rust 1.84.0 | Rust Blog</titl"
  },
  {
    "path": "test-pages/ld.json",
    "chars": 868,
    "preview": "{\n  \"@context\": \"https://schema.org\",\n  \"@type\": \"Article\",\n  \"name\": \"Rust (programming language)\",\n  \"url\": \"https://e"
  },
  {
    "path": "test-pages/not-matching/empty-links/google-sre-book-1/expected-metadata.json",
    "chars": 539,
    "preview": "{\n  \"title\": \"Google - Site Reliability Engineering\",\n  \"byline\": \"Written by Rob Ewaschuk\\n                            "
  },
  {
    "path": "test-pages/not-matching/empty-links/google-sre-book-1/expected.html",
    "chars": 41915,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <section data-type=\"chapter\" id=\"maia-main\" role=\"main\">\n        <h2> Mon"
  },
  {
    "path": "test-pages/not-matching/empty-links/google-sre-book-1/source.html",
    "chars": 69880,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test-pages/not-matching/empty-links/lazy-image-2/expected-metadata.json",
    "chars": 668,
    "preview": "{\n  \"title\": \"The Spectacular Story Of Metroid, One Of Gaming's Richest Universes\",\n  \"byline\": \"Mama Robotnik\",\n  \"dir\""
  },
  {
    "path": "test-pages/not-matching/empty-links/lazy-image-2/expected.html",
    "chars": 152915,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <figure data-id=\"18zu12g5xzyxojpg\" data-recommend-id=\"image"
  },
  {
    "path": "test-pages/not-matching/empty-links/lazy-image-2/source.html",
    "chars": 729852,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-us\" data-reactroot=\"\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en-us\">\n    <head>\n "
  },
  {
    "path": "test-pages/not-matching/empty-links/yahoo-3/expected-metadata.json",
    "chars": 853,
    "preview": "{\n  \"title\": \"Veteran Wraps Baby in American Flag, Photo Sparks Controversy\",\n  \"byline\": \"By GILLIAN MOHNEY\\n          "
  },
  {
    "path": "test-pages/not-matching/empty-links/yahoo-3/expected.html",
    "chars": 6174,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"Main\" tabindex=\"0\" role=\"main\">\n        <section id=\"mediaconten"
  },
  {
    "path": "test-pages/not-matching/empty-links/yahoo-3/source.html",
    "chars": 1285633,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"Stencil\" xmlns:og=\"http://ogp.me/ns#\" xmlns:fb=\"http://ww"
  },
  {
    "path": "test-pages/not-matching/redundant-class-page/nytimes-1/expected-metadata.json",
    "chars": 308,
    "preview": "{\n  \"title\": \"United States to Lift Sudan Sanctions\",\n  \"byline\": \"Jeffrey Gettleman\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  "
  },
  {
    "path": "test-pages/not-matching/redundant-class-page/nytimes-1/expected.html",
    "chars": 9251,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"page\" class=\"page\">\n        <main id=\"main\" role=\"main\">\n       "
  },
  {
    "path": "test-pages/not-matching/redundant-class-page/nytimes-1/source.html",
    "chars": 309092,
    "preview": "<!DOCTYPE html>\n<!--[if (gt IE 9)|!(IE)]> <!-->\n<html xmlns=\"http://www.w3.org/1999/xhtml\" class=\"flag-limitFabrikSave-o"
  },
  {
    "path": "test-pages/not-matching/redundant-class-page/nytimes-2/expected-metadata.json",
    "chars": 373,
    "preview": "{\n  \"title\": \"Yahoo’s Sale to Verizon Leaves Shareholders With Little Say\",\n  \"byline\": \"Steven Davidoff Solomon\",\n  \"di"
  },
  {
    "path": "test-pages/not-matching/redundant-class-page/nytimes-2/expected.html",
    "chars": 11471,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"page\" class=\"page\">\n        <main id=\"main\" role=\"main\">\n       "
  },
  {
    "path": "test-pages/not-matching/redundant-class-page/nytimes-2/source.html",
    "chars": 299376,
    "preview": "<!DOCTYPE html>\n<!--[if (gt IE 9)|!(IE)]> <!-->\n<html xmlns=\"http://www.w3.org/1999/xhtml\" class=\"flag-limitFabrikSave-o"
  },
  {
    "path": "test-pages/not-matching/redundant-div/citylab-1/expected-metadata.json",
    "chars": 333,
    "preview": "{\n  \"title\": \"The Modern Ambitions Behind Neon\",\n  \"byline\": \"Sarah Archer\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"excerpt\":"
  },
  {
    "path": "test-pages/not-matching/redundant-div/citylab-1/expected.html",
    "chars": 14005,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article itemscope=\"itemscope\" itemtype=\"https://schema.org/NewsArticle\">"
  },
  {
    "path": "test-pages/not-matching/redundant-div/citylab-1/source.html",
    "chars": 173759,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" class=\"no-js\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <me"
  },
  {
    "path": "test-pages/not-matching/redundant-div/la-nacion/expected-metadata.json",
    "chars": 317,
    "preview": "{\n  \"title\": \"Una solución no violenta para la cuestión mapuche\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Los pue"
  },
  {
    "path": "test-pages/not-matching/redundant-div/la-nacion/expected.html",
    "chars": 8282,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article id=\"nota\" itemscope=\"\" itemtype=\"http://schema.org/NewsArticle\" "
  },
  {
    "path": "test-pages/not-matching/redundant-div/la-nacion/source.html",
    "chars": 63203,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></meta>\n    <link "
  },
  {
    "path": "test-pages/not-matching/redundant-div/lwn-1/expected-metadata.json",
    "chars": 619,
    "preview": "{\n  \"title\": \"LWN.net Weekly Edition for March 26, 2015 [LWN.net]\",\n  \"byline\": \"By Nathan Willis\\n                     "
  },
  {
    "path": "test-pages/not-matching/redundant-div/lwn-1/expected.html",
    "chars": 59198,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <td>\n            <div>\n                <h2><a href=\"http://"
  },
  {
    "path": "test-pages/not-matching/redundant-div/lwn-1/source.html",
    "chars": 87105,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n<html>\n\n<head>\n  "
  },
  {
    "path": "test-pages/not-matching/redundant-div/wapo-2/expected-metadata.json",
    "chars": 354,
    "preview": "{\n  \"title\": \"Where do strained U.S.-Israeli relations go after Netanyahu’s victory?\",\n  \"byline\": \"By Steven Mufson\",\n "
  },
  {
    "path": "test-pages/not-matching/redundant-div/wapo-2/expected.html",
    "chars": 8286,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p><a name=\"1c164a7079bfe20ebd611d79f96418b5a225cbc6\"></a>\n        <img s"
  },
  {
    "path": "test-pages/not-matching/redundant-div/wapo-2/source.html",
    "chars": 125653,
    "preview": "<!DOCTYPE html>\n<html>\n    \n    <head>\n        <meta name=\"eomportal-uuid\" content=\"ca175d64-cd76-11e4-a2a7-9517a3a70506"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/clean-links/expected-metadata.json",
    "chars": 495,
    "preview": "{\n  \"title\": \"Bartleby the Scrivener Web Study Text\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Ere introducing the"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/clean-links/expected.html",
    "chars": 85843,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <h3>Study Webtext</h3>\n        <h2><span>\"Bartleby the Scri"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/clean-links/source.html",
    "chars": 182350,
    "preview": "<!DOCTYPE html>\n<html>\n    \n    <head>\n        <title>Bartleby the Scrivener Web Study Text</title>\n        <meta http-e"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/gmw/expected-metadata.json",
    "chars": 220,
    "preview": "{\n  \"title\": \"宇航员在太空中喝酒会怎么样？后果很严重 _探索者 _光明网\",\n  \"byline\": \"肖春芳\",\n  \"dir\": null,\n  \"excerpt\": \"不幸的是，对于希望能喝上一杯的太空探险者，那些将他们"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/gmw/expected.html",
    "chars": 5013,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"contentMain\">\n        <p>　　翱翔于距地球数千公里的太空中，进入广袤漆黑的未知领域，是一项艰苦卓绝的工作"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/gmw/source.html",
    "chars": 134867,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n\n<head>\n    <script type=\"text/javascript\" async=\"\" src=\"htt"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/hukumusume/expected-metadata.json",
    "chars": 199,
    "preview": "{\n  \"title\": \"欲張りなイヌ　＜福娘童話集　きょうのイソップ童話＞\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"福娘童話集 > きょうのイソップ童話 > １月のイソップ童話 "
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/hukumusume/expected.html",
    "chars": 12332,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <td>\n            <table>\n                <tbody>\n          "
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/hukumusume/source.html",
    "chars": 22824,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <title>\n            欲張りなイヌ　＜福娘童話集　きょうのイソッ"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/keep-tabular-data/expected-metadata.json",
    "chars": 256,
    "preview": "{\n  \"title\": \"Friday Facts #282 - 0.17 in sight | Factorio\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Posted by ko"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/keep-tabular-data/expected.html",
    "chars": 43079,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>Posted by kovarex, TOGos, Ernestas, Albert on 2019-02-15"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/keep-tabular-data/source.html",
    "chars": 64201,
    "preview": "<html>\n\n<head>\n    <title>Friday Facts #282 - 0.17 in sight | Factorio</title>\n    <meta property=\"og:title\" content=\"Fr"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/replace-font-tags/expected-metadata.json",
    "chars": 638,
    "preview": "{\n  \"title\": \"Replace font tags test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consec"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/replace-font-tags/expected.html",
    "chars": 1152,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p><span face=\"Arial\" size=\"2\"><"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/replace-font-tags/source.html",
    "chars": 1293,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Replace font tags test</title>\n</head>\n<body>\n  <articl"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/table-style-attributes/expected-metadata.json",
    "chars": 227,
    "preview": "{\n  \"title\": \"linux video\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"linux usability\\n    ...or, why do I bother. "
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/table-style-attributes/expected.html",
    "chars": 12236,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p>\n        <span size=\"+3\"><b>linux usability <span size=\"4\"><br />...or"
  },
  {
    "path": "test-pages/not-matching/redundant-font-attrs/table-style-attributes/source.html",
    "chars": 14416,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n<html>\n\n<head>\n  "
  },
  {
    "path": "test-pages/not-matching/urls/002/expected-metadata.json",
    "chars": 375,
    "preview": "{\n  \"title\": \"This API is so Fetching!\",\n  \"byline\": \"Nikhil Marathe\",\n  \"dir\": null,\n  \"lang\": \"en-US\",\n  \"excerpt\": \"F"
  },
  {
    "path": "test-pages/not-matching/urls/002/expected.html",
    "chars": 30017,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"content-main\">\n        <article role=\"article\">\n            <p>F"
  },
  {
    "path": "test-pages/not-matching/urls/002/source.html",
    "chars": 142048,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-US\" id=\"hacks-mozilla-org\">\n    \n    <head>\n        <meta name=\"viewport\" content=\"width="
  },
  {
    "path": "test-pages/not-matching/urls/ietf-1/expected-metadata.json",
    "chars": 161,
    "preview": "{\n  \"title\": \"remoteStorage\",\n  \"byline\": \"Jong, Michiel de\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"siteName\": null,\n  \"publ"
  },
  {
    "path": "test-pages/not-matching/urls/ietf-1/expected.html",
    "chars": 56074,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <span>[<a href=\"http://fakehost/html/\" title=\"Document search and retriev"
  },
  {
    "path": "test-pages/not-matching/urls/ietf-1/source.html",
    "chars": 64711,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n  \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dt"
  },
  {
    "path": "test-pages/not-matching/urls/toc-missing/expected-metadata.json",
    "chars": 546,
    "preview": "{\n  \"title\": \"Simple Anomaly Detection Using Plain SQL\",\n  \"byline\": \"Haki Benita\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"ex"
  },
  {
    "path": "test-pages/not-matching/urls/toc-missing/expected.html",
    "chars": 69612,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article data-progress-indicator=\"\">\n        <hr />\n        <p> Many deve"
  },
  {
    "path": "test-pages/not-matching/urls/toc-missing/source.html",
    "chars": 125401,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" class=\"nojs\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <tit"
  },
  {
    "path": "test-pages/not-matching/urls/v8-blog/expected-metadata.json",
    "chars": 280,
    "preview": "{\n  \"title\": \"standalone WebAssembly binaries using Emscripten · V8\",\n  \"byline\": null,\n  \"dir\": null,\n  \"lang\": \"en\",\n "
  },
  {
    "path": "test-pages/not-matching/urls/v8-blog/expected.html",
    "chars": 21688,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div itemprop=\"articleBody\">\n        <p> Emscripten has always focused fi"
  },
  {
    "path": "test-pages/not-matching/urls/v8-blog/source.html",
    "chars": 33065,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test-pages/not-matching/urls/videos-1/expected-metadata.json",
    "chars": 251,
    "preview": "{\n  \"title\": \"How to watch the 21 best films of 2017\",\n  \"byline\": \"Alissa Wilkinson\",\n  \"dir\": null,\n  \"excerpt\": \"It w"
  },
  {
    "path": "test-pages/not-matching/urls/videos-1/expected.html",
    "chars": 43858,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p id=\"oFNvY2\"> In the introduction to her review anthology"
  },
  {
    "path": "test-pages/not-matching/urls/videos-1/source.html",
    "chars": 209874,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <title>\n            How to watch the 21 b"
  },
  {
    "path": "test-pages/not-matching/urls/wikia/expected-metadata.json",
    "chars": 413,
    "preview": "{\n  \"title\": \"'Star Wars' Original Cuts Might Get Released for 40th Anniversary\",\n  \"byline\": \"James Akinaka\",\n  \"dir\": "
  },
  {
    "path": "test-pages/not-matching/urls/wikia/expected.html",
    "chars": 8806,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>Although Lucasfilm is already planning a birthday bash f"
  },
  {
    "path": "test-pages/not-matching/urls/wikia/source.html",
    "chars": 854552,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" class=\"js non-touch-device sliding-icons\" data-region=\"united"
  },
  {
    "path": "test-pages/not-matching/urls/wikipedia/expected-metadata.json",
    "chars": 548,
    "preview": "{\n  \"title\": \"Mozilla - Wikipedia\",\n  \"byline\": null,\n  \"dir\": \"ltr\",\n  \"lang\": \"en\",\n  \"excerpt\": \"Mozilla is a free-so"
  },
  {
    "path": "test-pages/not-matching/urls/wikipedia/expected.html",
    "chars": 114417,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"mw-content-text\" lang=\"en\" dir=\"ltr\">\n        <table>\n          "
  },
  {
    "path": "test-pages/not-matching/urls/wikipedia/source.html",
    "chars": 243907,
    "preview": "<!DOCTYPE html>\n<html class=\"client-nojs\" lang=\"en\" dir=\"ltr\">\n\n<head>\n    <meta charset=\"UTF-8\" />\n    <title>Mozilla -"
  },
  {
    "path": "test-pages/not-matching/urls/wikipedia-2/expected-metadata.json",
    "chars": 275,
    "preview": "{\n  \"title\": \"New Zealand\",\n  \"byline\": \"Contributors to Wikimedia projects\",\n  \"dir\": \"ltr\",\n  \"lang\": \"en\",\n  \"excerpt"
  },
  {
    "path": "test-pages/not-matching/urls/wikipedia-2/expected.html",
    "chars": 418017,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"mw-content-text\" lang=\"en\" dir=\"ltr\" xml:lang=\"en\">\n        <p>\n"
  },
  {
    "path": "test-pages/not-matching/urls/wikipedia-2/source.html",
    "chars": 1037081,
    "preview": "<!DOCTYPE html>\n<html class=\"client-nojs\" lang=\"en\" dir=\"ltr\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <h"
  },
  {
    "path": "test-pages/ok/001/expected-metadata.json",
    "chars": 243,
    "preview": "{\n  \"title\": \"Get your Frontend JavaScript Code Covered | Code\",\n  \"byline\": \"Nicolas Perriault\",\n  \"dir\": null,\n  \"lang"
  },
  {
    "path": "test-pages/ok/001/expected.html",
    "chars": 5902,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <section>\n        <p><strong>So finally you're <a href=\"http://fakehost/c"
  },
  {
    "path": "test-pages/ok/001/source.html",
    "chars": 12504,
    "preview": "<!DOCTYPE html>\n<html class=\"no-js\" lang=\"en\">\n    \n    <head>\n        <meta charset=\"utf-8\"/>\n        <meta http-equiv="
  },
  {
    "path": "test-pages/ok/002/expected-metadata.json",
    "chars": 375,
    "preview": "{\n  \"title\": \"This API is so Fetching!\",\n  \"byline\": \"Nikhil Marathe\",\n  \"dir\": null,\n  \"lang\": \"en-US\",\n  \"excerpt\": \"F"
  },
  {
    "path": "test-pages/ok/002/expected.html",
    "chars": 30015,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"content-main\">\n        <article role=\"article\">\n            <p>F"
  },
  {
    "path": "test-pages/ok/002/source.html",
    "chars": 142048,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-US\" id=\"hacks-mozilla-org\">\n    \n    <head>\n        <meta name=\"viewport\" content=\"width="
  },
  {
    "path": "test-pages/ok/003-metadata-preferred/expected-metadata.json",
    "chars": 218,
    "preview": "{\n  \"title\": \"Dublin Core property title\",\n  \"byline\": \"Dublin Core property author\",\n  \"dir\": null,\n  \"excerpt\": \"Dubli"
  },
  {
    "path": "test-pages/ok/003-metadata-preferred/expected.html",
    "chars": 1043,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Test document title</h2>\n        <p> Lorem ipsum do"
  },
  {
    "path": "test-pages/ok/003-metadata-preferred/source.html",
    "chars": 2408,
    "preview": "<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset=\"utf-8\"/>\n    <title>Title Element</title>\n    <meta name=\"title\" cont"
  },
  {
    "path": "test-pages/ok/004-metadata-space-separated-properties/expected-metadata.json",
    "chars": 181,
    "preview": "{\n  \"title\": \"Preferred title\",\n  \"byline\": \"Creator Name\",\n  \"dir\": null,\n  \"excerpt\": \"Preferred description\",\n  \"site"
  },
  {
    "path": "test-pages/ok/004-metadata-space-separated-properties/expected.html",
    "chars": 1043,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Test document title</h2>\n        <p> Lorem ipsum do"
  },
  {
    "path": "test-pages/ok/004-metadata-space-separated-properties/source.html",
    "chars": 1725,
    "preview": "<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset=\"utf-8\"/>\n    <title>Title Element</title>\n    <meta property=\"x:title"
  },
  {
    "path": "test-pages/ok/005-unescape-html-entities/expected-metadata.json",
    "chars": 165,
    "preview": "{\n  \"title\": \"\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"&#xg; 😭 😭 &#xFFFFFFFF; \\u0000\",\n  \"siteName\": null,\n  \"p"
  },
  {
    "path": "test-pages/ok/005-unescape-html-entities/expected.html",
    "chars": 54,
    "preview": "<div id=\"readability-page-1\" class=\"page\"> Test </div>"
  },
  {
    "path": "test-pages/ok/005-unescape-html-entities/source.html",
    "chars": 221,
    "preview": "<!DOCTYPE html>\n<html>\n    <head>\n        <meta property=\"dc:description og:description\" content=\"&amp;#xg; &amp;#x1F62D"
  },
  {
    "path": "test-pages/ok/aclu/expected-metadata.json",
    "chars": 561,
    "preview": "{\n  \"title\": \"Facebook Is Tracking Me Even Though I’m Not on Facebook\",\n  \"byline\": \"Daniel Kahn Gillmor\",\n  \"dir\": \"ltr"
  },
  {
    "path": "test-pages/ok/aclu/expected.html",
    "chars": 13782,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p> I don't use Facebook. I'm not technophobic — I'm a geek"
  },
  {
    "path": "test-pages/ok/aclu/source.html",
    "chars": 205096,
    "preview": "<!DOCTYPE html>\n<!--[if IEMobile 7]><html class=\"iem7\"  lang=\"en\" dir=\"ltr\"><![endif]--><!--[if lte IE 6]><html class=\"l"
  },
  {
    "path": "test-pages/ok/aktualne/expected-metadata.json",
    "chars": 422,
    "preview": "{\n  \"title\": \"West Ham hrozí gigantům, okouzlil i Linekera. Součka je snadné přehlédnout\",\n  \"byline\": \"Aleš Vávra\",\n  \""
  },
  {
    "path": "test-pages/ok/aktualne/expected.html",
    "chars": 5379,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p> Zázrak jedné sezony? West Ham dává pochybovačům stále pádnější odpově"
  },
  {
    "path": "test-pages/ok/aktualne/source.html",
    "chars": 298889,
    "preview": "<!DOCTYPE html>\n<html lang=\"cs\" prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article"
  },
  {
    "path": "test-pages/ok/archive-of-our-own/expected-metadata.json",
    "chars": 367,
    "preview": "{\n  \"title\": \"Conversations with a Cryptid - Chapter 1 - AMournfulHowlInTheNight - 僕のヒーローアカデミア | Boku no Hero Academia\","
  },
  {
    "path": "test-pages/ok/archive-of-our-own/expected.html",
    "chars": 24223,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div role=\"article\" id=\"chapters\">\n        <h3 id=\"work\"> Chapter Text </"
  },
  {
    "path": "test-pages/ok/archive-of-our-own/source.html",
    "chars": 265146,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <meta charset=\"ut"
  },
  {
    "path": "test-pages/ok/ars-1/expected-metadata.json",
    "chars": 399,
    "preview": "{\n  \"title\": \"Just-released Minecraft exploit makes it easy to crash game servers\",\n  \"byline\": \"Dan Goodin\",\n  \"dir\": n"
  },
  {
    "path": "test-pages/ok/ars-1/expected.html",
    "chars": 5976,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <header>\n            <h4> Biz &amp; IT — </h4>\n            "
  },
  {
    "path": "test-pages/ok/ars-1/source.html",
    "chars": 55962,
    "preview": "<!DOCTYPE html>\n<html lang=\"en-us\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en-us\">\n    <head>\n        <title>\n   "
  },
  {
    "path": "test-pages/ok/base-url/expected-metadata.json",
    "chars": 629,
    "preview": "{\n  \"title\": \"Base URL test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consectetur adi"
  },
  {
    "path": "test-pages/ok/base-url/expected.html",
    "chars": 1844,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p> Lorem ipsum dolor sit amet, "
  },
  {
    "path": "test-pages/ok/base-url/source.html",
    "chars": 1764,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Base URL test</title>\n</head>\n<body>\n  <article>\n    <h"
  },
  {
    "path": "test-pages/ok/base-url-base-element-relative/expected-metadata.json",
    "chars": 648,
    "preview": "{\n  \"title\": \"Base URL with base relative test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit am"
  },
  {
    "path": "test-pages/ok/base-url-base-element-relative/expected.html",
    "chars": 1895,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article>\n        <h2>Lorem</h2>\n        <p> Lorem ipsum dolor sit amet, "
  },
  {
    "path": "test-pages/ok/base-url-base-element-relative/source.html",
    "chars": 1806,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <base href=\"base/\"/>\n  <title>Base URL with base relative test"
  },
  {
    "path": "test-pages/ok/breitbart/expected-metadata.json",
    "chars": 506,
    "preview": "{\n  \"title\": \"'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?' - Breitbart\",\n  \"byl"
  },
  {
    "path": "test-pages/ok/breitbart/expected.html",
    "chars": 3773,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <figure>\n            <div>\n                <p><img itemprop"
  },
  {
    "path": "test-pages/ok/breitbart/source.html",
    "chars": 854445,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" dir=\"ltr\" prefix=\"og: http://ogp.me/ns# fb: htt"
  },
  {
    "path": "test-pages/ok/citylab-1/expected-metadata.json",
    "chars": 425,
    "preview": "{\n  \"title\": \"The Modern Ambitions Behind Neon\",\n  \"byline\": \"Sarah Archer\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"excerpt\":"
  },
  {
    "path": "test-pages/ok/citylab-1/expected.html",
    "chars": 13960,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article itemscope=\"itemscope\" itemtype=\"https://schema.org/NewsArticle\">"
  },
  {
    "path": "test-pages/ok/citylab-1/source.html",
    "chars": 173759,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" class=\"no-js\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n    <head>\n        <me"
  },
  {
    "path": "test-pages/ok/clean-links/expected-metadata.json",
    "chars": 495,
    "preview": "{\n  \"title\": \"Bartleby the Scrivener Web Study Text\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Ere introducing the"
  },
  {
    "path": "test-pages/ok/clean-links/expected.html",
    "chars": 85843,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <h3>Study Webtext</h3>\n        <h2><span>\"Bartleby the Scri"
  },
  {
    "path": "test-pages/ok/clean-links/source.html",
    "chars": 182350,
    "preview": "<!DOCTYPE html>\n<html>\n    \n    <head>\n        <title>Bartleby the Scrivener Web Study Text</title>\n        <meta http-e"
  },
  {
    "path": "test-pages/ok/cnn/expected-metadata.json",
    "chars": 417,
    "preview": "{\n  \"title\": \"The 'birth lottery' and economic mobility\",\n  \"byline\": \"Ahiza Garcia\",\n  \"dir\": null,\n  \"excerpt\": \"A rec"
  },
  {
    "path": "test-pages/ok/cnn/expected.html",
    "chars": 3265,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"storytext\">\n        <h2>The U.S. has long been heralded as a lan"
  },
  {
    "path": "test-pages/ok/cnn/source.html",
    "chars": 258652,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "test-pages/ok/dev418/expected-metadata.json",
    "chars": 382,
    "preview": "{\n  \"title\": \"Readability Test\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Lorem ipsum dolor sit amet, consectetur "
  },
  {
    "path": "test-pages/ok/dev418/expected.html",
    "chars": 2937,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusm"
  },
  {
    "path": "test-pages/ok/dev418/source.html",
    "chars": 3815,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <meta charset=\"utf-8\" />\n        <title>\n"
  },
  {
    "path": "test-pages/ok/ehow-1/expected-metadata.json",
    "chars": 667,
    "preview": "{\n  \"title\": \"How to Build a Terrarium | eHow\",\n  \"byline\": \"Lucy Akins\",\n  \"dir\": null,\n  \"lang\": \"en-US\",\n  \"excerpt\":"
  },
  {
    "path": "test-pages/ok/ehow-1/expected.html",
    "chars": 7374,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <header>\n            <div>\n                <p><span></span>"
  },
  {
    "path": "test-pages/ok/ehow-1/source.html",
    "chars": 67355,
    "preview": "<!DOCTYPE html>\n<!--[if IE]><![endif]-->\n<html class=\"Crafts en-US\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.com/2008/"
  },
  {
    "path": "test-pages/ok/engadget/expected-metadata.json",
    "chars": 911,
    "preview": "{\n  \"title\": \"Xbox One X review:  A console that keeps up with gaming PCs\",\n  \"byline\": \"Devindra Hardawar\",\n  \"dir\": nu"
  },
  {
    "path": "test-pages/ok/engadget/expected.html",
    "chars": 18717,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>The <a href=\"https://www.engadget.com/2017/06/13/the-xbo"
  },
  {
    "path": "test-pages/ok/engadget/source.html",
    "chars": 350100,
    "preview": "<html lang=\"en\">\n\n<head>\n    <meta charset=\"UTF-8\"></meta>\n    <meta http-equiv=\"cache-control\" content=\"no-cache\"></met"
  },
  {
    "path": "test-pages/ok/gmw/expected-metadata.json",
    "chars": 220,
    "preview": "{\n  \"title\": \"宇航员在太空中喝酒会怎么样？后果很严重 _探索者 _光明网\",\n  \"byline\": \"肖春芳\",\n  \"dir\": null,\n  \"excerpt\": \"不幸的是，对于希望能喝上一杯的太空探险者，那些将他们"
  },
  {
    "path": "test-pages/ok/gmw/expected.html",
    "chars": 4982,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div id=\"contentMain\">\n        <p>　　翱翔于距地球数千公里的太空中，进入广袤漆黑的未知领域，是一项艰苦卓绝的工作"
  },
  {
    "path": "test-pages/ok/gmw/source.html",
    "chars": 134867,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n\n<head>\n    <script type=\"text/javascript\" async=\"\" src=\"htt"
  },
  {
    "path": "test-pages/ok/hukumusume/expected-metadata.json",
    "chars": 199,
    "preview": "{\n  \"title\": \"欲張りなイヌ　＜福娘童話集　きょうのイソップ童話＞\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"福娘童話集 > きょうのイソップ童話 > １月のイソップ童話 "
  },
  {
    "path": "test-pages/ok/hukumusume/expected.html",
    "chars": 11890,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <td>\n            <table>\n                <tbody>\n          "
  },
  {
    "path": "test-pages/ok/hukumusume/source.html",
    "chars": 22824,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n    <head>\n        <title>\n            欲張りなイヌ　＜福娘童話集　きょうのイソッ"
  },
  {
    "path": "test-pages/ok/ietf-1/expected-metadata.json",
    "chars": 161,
    "preview": "{\n  \"title\": \"remoteStorage\",\n  \"byline\": \"Jong, Michiel de\",\n  \"dir\": null,\n  \"lang\": \"en\",\n  \"siteName\": null,\n  \"publ"
  },
  {
    "path": "test-pages/ok/ietf-1/expected.html",
    "chars": 56058,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <span>[<a href=\"http://fakehost/html/\" title=\"Document search and retriev"
  },
  {
    "path": "test-pages/ok/ietf-1/source.html",
    "chars": 64711,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n  \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dt"
  },
  {
    "path": "test-pages/ok/js-link-replacement/expected-metadata.json",
    "chars": 164,
    "preview": "{\n  \"title\": \"Replace javascript: links\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"abc\",\n  \"siteName\": null,\n  \"pu"
  },
  {
    "path": "test-pages/ok/js-link-replacement/expected.html",
    "chars": 114,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <span>\n        <p>abc</p>\n        <p>def</p> ghi\n    </span>\n</div>"
  },
  {
    "path": "test-pages/ok/js-link-replacement/source.html",
    "chars": 209,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=\"utf-8\"/>\n  <title>Replace javascript: links</title>\n</head>\n<body>  \n  <a"
  },
  {
    "path": "test-pages/ok/keep-tabular-data/expected-metadata.json",
    "chars": 324,
    "preview": "{\n  \"title\": \"Friday Facts #282 - 0.17 in sight | Factorio\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Posted by ko"
  },
  {
    "path": "test-pages/ok/keep-tabular-data/expected.html",
    "chars": 43043,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <div>\n        <p>Posted by kovarex, TOGos, Ernestas, Albert on 2019-02-15"
  },
  {
    "path": "test-pages/ok/keep-tabular-data/source.html",
    "chars": 64201,
    "preview": "<html>\n\n<head>\n    <title>Friday Facts #282 - 0.17 in sight | Factorio</title>\n    <meta property=\"og:title\" content=\"Fr"
  },
  {
    "path": "test-pages/ok/la-nacion/expected-metadata.json",
    "chars": 387,
    "preview": "{\n  \"title\": \"Una solución no violenta para la cuestión mapuche\",\n  \"byline\": null,\n  \"dir\": null,\n  \"excerpt\": \"Los pue"
  },
  {
    "path": "test-pages/ok/la-nacion/expected.html",
    "chars": 8282,
    "preview": "<div id=\"readability-page-1\" class=\"page\">\n    <article id=\"nota\" itemscope=\"\" itemtype=\"http://schema.org/NewsArticle\" "
  },
  {
    "path": "test-pages/ok/la-nacion/source.html",
    "chars": 63203,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></meta>\n    <link "
  }
]

// ... and 336 more files (download for full content)

About this extraction

This page contains the full source code of the niklak/dom_smoothie GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 536 files (35.8 MB), approximately 9.4M tokens, and a symbol index with 277 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo