Full Code of codelucas/newspaper for AI

master 648fb2a18bcc cached
417 files
14.5 MB
3.8M tokens
405 symbols
1 requests
Copy disabled (too large) Download .txt
Showing preview only (15,267K chars total). Download the full file to get everything.
Repository: codelucas/newspaper
Branch: master
Commit: 648fb2a18bcc
Files: 417
Total size: 14.5 MB

Directory structure:
gitextract_g2lraicp/

├── .gitattributes
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── GOOSE-LICENSE.txt
├── LICENSE
├── MANIFEST.in
├── README.rst
├── docs/
│   ├── Makefile
│   ├── _templates/
│   │   ├── sidebarintro.html
│   │   └── sidebarlogo.html
│   ├── _themes/
│   │   ├── .gitignore
│   │   ├── LICENSE
│   │   ├── README.rst
│   │   ├── flask_theme_support.py
│   │   ├── kr/
│   │   │   ├── layout.html
│   │   │   ├── relations.html
│   │   │   ├── static/
│   │   │   │   └── flasky.css_t
│   │   │   └── theme.conf
│   │   └── kr_small/
│   │       ├── layout.html
│   │       ├── static/
│   │       │   └── flasky.css_t
│   │       └── theme.conf
│   ├── conf.py
│   ├── index.rst
│   ├── make.bat
│   └── user_guide/
│       ├── advanced.rst
│       ├── api.rst
│       ├── contributors.rst
│       └── quickstart.rst
├── download_corpora.py
├── newspaper/
│   ├── __init__.py
│   ├── api.py
│   ├── article.py
│   ├── cleaners.py
│   ├── configuration.py
│   ├── extractors.py
│   ├── images.py
│   ├── mthreading.py
│   ├── network.py
│   ├── nlp.py
│   ├── outputformatters.py
│   ├── parsers.py
│   ├── resources/
│   │   ├── misc/
│   │   │   ├── google_sources.txt
│   │   │   ├── popular_sources.txt
│   │   │   ├── stopwords-nlp-en.txt
│   │   │   └── useragents.txt
│   │   └── text/
│   │       ├── stopwords-ar.txt
│   │       ├── stopwords-be.txt
│   │       ├── stopwords-bg.txt
│   │       ├── stopwords-da.txt
│   │       ├── stopwords-de.txt
│   │       ├── stopwords-el.txt
│   │       ├── stopwords-en.txt
│   │       ├── stopwords-es.txt
│   │       ├── stopwords-et.txt
│   │       ├── stopwords-fa.txt
│   │       ├── stopwords-fi.txt
│   │       ├── stopwords-fr.txt
│   │       ├── stopwords-he.txt
│   │       ├── stopwords-hi.txt
│   │       ├── stopwords-hr.txt
│   │       ├── stopwords-hu.txt
│   │       ├── stopwords-id.txt
│   │       ├── stopwords-it.txt
│   │       ├── stopwords-ja.txt
│   │       ├── stopwords-ko.txt
│   │       ├── stopwords-lt.txt
│   │       ├── stopwords-mk.txt
│   │       ├── stopwords-nb.txt
│   │       ├── stopwords-nl.txt
│   │       ├── stopwords-no.txt
│   │       ├── stopwords-pl.txt
│   │       ├── stopwords-pt.txt
│   │       ├── stopwords-ro.txt
│   │       ├── stopwords-ru.txt
│   │       ├── stopwords-sl.txt
│   │       ├── stopwords-sr.txt
│   │       ├── stopwords-sv.txt
│   │       ├── stopwords-sw.txt
│   │       ├── stopwords-th.txt
│   │       ├── stopwords-tr.txt
│   │       ├── stopwords-uk.txt
│   │       ├── stopwords-vi.txt
│   │       └── stopwords-zh.txt
│   ├── settings.py
│   ├── source.py
│   ├── text.py
│   ├── urls.py
│   ├── utils.py
│   ├── version.py
│   └── videos/
│       ├── __init__.py
│       ├── extractors.py
│       └── videos.py
├── requirements.txt
├── setup.py
└── tests/
    ├── __init__.py
    ├── benchmarks.py
    ├── data/
    │   ├── fulltext_domain_list.txt
    │   ├── fulltext_url_list.txt
    │   ├── html/
    │   │   ├── 247wallst.com1.html
    │   │   ├── 247wallst.com2.html
    │   │   ├── about.com1.html
    │   │   ├── about.com2.html
    │   │   ├── adoption.com1.html
    │   │   ├── al.com1.html
    │   │   ├── al.com2.html
    │   │   ├── ap_meta_refresh.html
    │   │   ├── apartmenttherapy.com1.html
    │   │   ├── apartmenttherapy.com2.html
    │   │   ├── arabic_article.html
    │   │   ├── architecturaldigest.com1.html
    │   │   ├── architecturaldigest.com2.html
    │   │   ├── avclub.com1.html
    │   │   ├── avclub.com2.html
    │   │   ├── backstage.com1.html
    │   │   ├── backstage.com2.html
    │   │   ├── bhg.com1.html
    │   │   ├── bhg.com2.html
    │   │   ├── bloomberg.com1.html
    │   │   ├── bostonherald.com1.html
    │   │   ├── bostonherald.com2.html
    │   │   ├── businessinsider.com1.html
    │   │   ├── businessinsider.com2.html
    │   │   ├── businessweek.com1.html
    │   │   ├── businessweek.com2.html
    │   │   ├── chinese_article.html
    │   │   ├── cleveland.com1.html
    │   │   ├── cleveland.com2.html
    │   │   ├── cnn_article.html
    │   │   ├── cnn_main_site.html
    │   │   ├── cntraveler.com1.html
    │   │   ├── cntraveler.com2.html
    │   │   ├── coolhunting.com1.html
    │   │   ├── coolhunting.com2.html
    │   │   ├── cricket.com.au1.html
    │   │   ├── cricket.com.au2.html
    │   │   ├── dailycaller.com1.html
    │   │   ├── dailycaller.com2.html
    │   │   ├── dailystar.co.uk1.html
    │   │   ├── dailystar.co.uk2.html
    │   │   ├── dallasnews.com1.html
    │   │   ├── dallasnews.com2.html
    │   │   ├── details.com1.html
    │   │   ├── details.com2.html
    │   │   ├── elle.com1.html
    │   │   ├── elle.com2.html
    │   │   ├── flavorwire.com1.html
    │   │   ├── flavorwire.com2.html
    │   │   ├── fool.com1.html
    │   │   ├── fool.com2.html
    │   │   ├── foxbusiness.com1.html
    │   │   ├── foxbusiness.com2.html
    │   │   ├── foxnews.com1.html
    │   │   ├── foxnews.com2.html
    │   │   ├── glamour.com1.html
    │   │   ├── glamour.com2.html
    │   │   ├── globalnews.ca1.html
    │   │   ├── globalnews.ca2.html
    │   │   ├── google_meta_refresh.html
    │   │   ├── gq.com1.html
    │   │   ├── gq.com2.html
    │   │   ├── graziadaily.co.uk1.html
    │   │   ├── graziadaily.co.uk2.html
    │   │   ├── gulflive.com1.html
    │   │   ├── gulflive.com2.html
    │   │   ├── huffingtonpost.com1.html
    │   │   ├── japanese_article.html
    │   │   ├── japanese_article2.html
    │   │   ├── lifebuzz.com1.html
    │   │   ├── lifebuzz.com2.html
    │   │   ├── livescience.com1.html
    │   │   ├── livescience.com2.html
    │   │   ├── mashable.com1.html
    │   │   ├── mashable.com2.html
    │   │   ├── mlive.com1.html
    │   │   ├── mlive.com2.html
    │   │   ├── newyorker.com1.html
    │   │   ├── nj.com1.html
    │   │   ├── nola.com1.html
    │   │   ├── nydailynews.com1.html
    │   │   ├── nypost.com1.html
    │   │   ├── nypost.com2.html
    │   │   ├── ok.co.uk1.html
    │   │   ├── ok.co.uk2.html
    │   │   ├── oregonlive.com1.html
    │   │   ├── oregonlive.com2.html
    │   │   ├── parsely.com1.html
    │   │   ├── parsely.com2.html
    │   │   ├── pe.com1.html
    │   │   ├── pewresearch.org1.html
    │   │   ├── pewresearch.org2.html
    │   │   ├── pixable.com1.html
    │   │   ├── pixable.com2.html
    │   │   ├── pixelmonkey.org1.html
    │   │   ├── pixelmonkey.org2.html
    │   │   ├── readwrite.com1.html
    │   │   ├── recipe.com1.html
    │   │   ├── recipe.com2.html
    │   │   ├── reuters.com1.html
    │   │   ├── reuters.com2.html
    │   │   ├── reuters.com3.html
    │   │   ├── reuters.com4.html
    │   │   ├── self.com1.html
    │   │   ├── self.com2.html
    │   │   ├── sitepoint.com1.html
    │   │   ├── sitepoint.com2.html
    │   │   ├── slate.com1.html
    │   │   ├── slate.com2.html
    │   │   ├── space.com1.html
    │   │   ├── space.com2.html
    │   │   ├── spanish_article.html
    │   │   ├── syracuse.com1.html
    │   │   ├── syracuse.com2.html
    │   │   ├── talkingpointsmemo.com1.html
    │   │   ├── technologyreview.com1.html
    │   │   ├── technologyreview.com2.html
    │   │   ├── teenvogue.com1.html
    │   │   ├── teenvogue.com2.html
    │   │   ├── telegraph.co.uk1.html
    │   │   ├── telegraph.co.uk2.html
    │   │   ├── thai_article.html
    │   │   ├── theatlantic.com1.html
    │   │   ├── theatlantic.com2.html
    │   │   ├── theatlanticcities.com1.html
    │   │   ├── theatlanticcities.com2.html
    │   │   ├── thedailybeast.com1.html
    │   │   ├── thedailybeast.com2.html
    │   │   ├── thedebrief.co.uk1.html
    │   │   ├── thedebrief.co.uk2.html
    │   │   ├── theglobeandmail.com1.html
    │   │   ├── theglobeandmail.com2.html
    │   │   ├── thekitchn.com1.html
    │   │   ├── thekitchn.com2.html
    │   │   ├── thenextweb.com1.html
    │   │   ├── theonion.com1.html
    │   │   ├── theroot.com1.html
    │   │   ├── tnr.com1.html
    │   │   ├── tnr.com2.html
    │   │   ├── uproxx.com1.html
    │   │   ├── uproxx.com2.html
    │   │   ├── upworthy.com1.html
    │   │   ├── upworthy.com2.html
    │   │   ├── usnews.com1.html
    │   │   ├── usnews.com2.html
    │   │   ├── vanityfair.com1.html
    │   │   ├── vogue.de1.html
    │   │   ├── vogue.de2.html
    │   │   ├── wetpaint.com1.html
    │   │   ├── wetpaint.com2.html
    │   │   ├── wired.com1.html
    │   │   ├── wired.com2.html
    │   │   ├── wnet.org1.html
    │   │   ├── wnet.org2.html
    │   │   ├── yahoo_main_site.html
    │   │   ├── youbeauty.com1.html
    │   │   └── youbeauty.com2.html
    │   ├── test_prepare_urls.txt
    │   ├── test_urls.txt
    │   ├── test_urls_pubdate.txt
    │   └── text/
    │       ├── 247wallst.com1.txt
    │       ├── 247wallst.com2.txt
    │       ├── about.com1.txt
    │       ├── about.com2.txt
    │       ├── adoption.com1.txt
    │       ├── al.com1.txt
    │       ├── al.com2.txt
    │       ├── apartmenttherapy.com1.txt
    │       ├── apartmenttherapy.com2.txt
    │       ├── arabic.txt
    │       ├── architecturaldigest.com1.txt
    │       ├── architecturaldigest.com2.txt
    │       ├── avclub.com1.txt
    │       ├── avclub.com2.txt
    │       ├── backstage.com1.txt
    │       ├── backstage.com2.txt
    │       ├── bhg.com1.txt
    │       ├── bhg.com2.txt
    │       ├── bloomberg.com1.txt
    │       ├── bostonherald.com1.txt
    │       ├── bostonherald.com2.txt
    │       ├── businessinsider.com1.txt
    │       ├── businessinsider.com2.txt
    │       ├── businessweek.com1.txt
    │       ├── businessweek.com2.txt
    │       ├── chinese.txt
    │       ├── cleveland.com1.txt
    │       ├── cleveland.com2.txt
    │       ├── cnn.txt
    │       ├── cnn_summary.txt
    │       ├── cntraveler.com1.txt
    │       ├── cntraveler.com2.txt
    │       ├── coolhunting.com1.txt
    │       ├── cricket.com.au1.txt
    │       ├── cricket.com.au2.txt
    │       ├── dailycaller.com1.txt
    │       ├── dailycaller.com2.txt
    │       ├── dailystar.co.uk1.txt
    │       ├── dailystar.co.uk2.txt
    │       ├── dallasnews.com1.txt
    │       ├── dallasnews.com2.txt
    │       ├── details.com1.txt
    │       ├── details.com2.txt
    │       ├── elle.com1.txt
    │       ├── elle.com2.txt
    │       ├── flavorwire.com1.txt
    │       ├── flavorwire.com2.txt
    │       ├── fool.com1.txt
    │       ├── fool.com2.txt
    │       ├── foxbusiness.com1.txt
    │       ├── foxbusiness.com2.txt
    │       ├── foxnews.com1.txt
    │       ├── foxnews.com2.txt
    │       ├── foxnews.com3.txt
    │       ├── foxnews.com4.txt
    │       ├── glamour.com1.txt
    │       ├── glamour.com2.txt
    │       ├── globalnews.ca1.txt
    │       ├── globalnews.ca2.txt
    │       ├── gq.com1.txt
    │       ├── gq.com2.txt
    │       ├── graziadaily.co.uk1.txt
    │       ├── graziadaily.co.uk2.txt
    │       ├── gulflive.com1.txt
    │       ├── gulflive.com2.txt
    │       ├── huffingtonpost.com1.txt
    │       ├── japanese.txt
    │       ├── japanese2.txt
    │       ├── lifebuzz.com1.txt
    │       ├── lifebuzz.com2.txt
    │       ├── livescience.com1.txt
    │       ├── livescience.com2.txt
    │       ├── mashable.com1.txt
    │       ├── mashable.com2.txt
    │       ├── mlive.com1.txt
    │       ├── mlive.com2.txt
    │       ├── newyorker.com1.txt
    │       ├── nj.com1.txt
    │       ├── nola.com1.txt
    │       ├── nydailynews.com1.txt
    │       ├── nypost.com1.txt
    │       ├── nypost.com2.txt
    │       ├── ok.co.uk1.txt
    │       ├── ok.co.uk2.txt
    │       ├── oregonlive.com1.txt
    │       ├── oregonlive.com2.txt
    │       ├── parsely.com1.txt
    │       ├── parsely.com2.txt
    │       ├── pe.com1.txt
    │       ├── pewresearch.org1.txt
    │       ├── pewresearch.org2.txt
    │       ├── pixable.com1.txt
    │       ├── pixable.com2.txt
    │       ├── pixelmonkey.org1.txt
    │       ├── pixelmonkey.org2.txt
    │       ├── readwrite.com1.txt
    │       ├── recipe.com1.txt
    │       ├── recipe.com2.txt
    │       ├── reuters.com1.txt
    │       ├── reuters.com2.txt
    │       ├── reuters.com3.txt
    │       ├── reuters.com4.txt
    │       ├── reuters.com5.txt
    │       ├── reuters.com6.txt
    │       ├── self.com1.txt
    │       ├── self.com2.txt
    │       ├── sitepoint.com1.txt
    │       ├── sitepoint.com2.txt
    │       ├── slate.com1.txt
    │       ├── slate.com2.txt
    │       ├── space.com1.txt
    │       ├── space.com2.txt
    │       ├── spanish.txt
    │       ├── syracuse.com1.txt
    │       ├── syracuse.com2.txt
    │       ├── talkingpointsmemo.com1.txt
    │       ├── technologyreview.com1.txt
    │       ├── technologyreview.com2.txt
    │       ├── teenvogue.com1.txt
    │       ├── teenvogue.com2.txt
    │       ├── telegraph.co.uk1.txt
    │       ├── telegraph.co.uk2.txt
    │       ├── thai.txt
    │       ├── theatlantic.com1.txt
    │       ├── theatlantic.com2.txt
    │       ├── theatlanticcities.com1.txt
    │       ├── theatlanticcities.com2.txt
    │       ├── thedailybeast.com1.txt
    │       ├── thedailybeast.com2.txt
    │       ├── thedebrief.co.uk1.txt
    │       ├── thedebrief.co.uk2.txt
    │       ├── theglobeandmail.com1.txt
    │       ├── theglobeandmail.com2.txt
    │       ├── thekitchn.com1.txt
    │       ├── thekitchn.com2.txt
    │       ├── thenextweb.com1.txt
    │       ├── theonion.com1.txt
    │       ├── theroot.com1.txt
    │       ├── tnr.com1.txt
    │       ├── tnr.com2.txt
    │       ├── uproxx.com1.txt
    │       ├── uproxx.com2.txt
    │       ├── upworthy.com1.txt
    │       ├── upworthy.com2.txt
    │       ├── usnews.com1.txt
    │       ├── usnews.com2.txt
    │       ├── vanityfair.com1.txt
    │       ├── vogue.de1.txt
    │       ├── vogue.de2.txt
    │       ├── wetpaint.com1.txt
    │       ├── wetpaint.com2.txt
    │       ├── wired.com1.txt
    │       ├── wired.com2.txt
    │       ├── wnet.org1.txt
    │       ├── wnet.org2.txt
    │       ├── youbeauty.com1.txt
    │       └── youbeauty.com2.txt
    └── unit_tests.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
docs/* linguist-documentation
tests/* linguist-vendored


================================================
FILE: .gitignore
================================================
*.pyc

.DS_Store
.idea
.pypirc

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
_build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
venv


================================================
FILE: .travis.yml
================================================
language: python
python:
 - "3.5"
 - "3.6"
 - "3.7"
install:
 - pip install -r requirements.txt coverage coveralls
 - python download_corpora.py
script:
  - coverage run --source newspaper tests/unit_tests.py
after_success:
  coveralls


================================================
FILE: CHANGELOG.md
================================================
# Change Log

## [0.1.7](https://github.com/codelucas/newspaper/tree/0.1.7) (2016-01-30)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.6...0.1.7)

**Closed issues:**

- ImportError: cannot import name 'Image' [\#183](https://github.com/codelucas/newspaper/issues/183)
- Won't let me import [\#182](https://github.com/codelucas/newspaper/issues/182)
- Install on Mac - El Capitan Failed - "Operation not permitted"  [\#181](https://github.com/codelucas/newspaper/issues/181)
- Downgrades to old versions of required packages upon installation [\#174](https://github.com/codelucas/newspaper/issues/174)
- Handling 404, 500, and other non-200 http response codes to prevent scraping error pages [\#142](https://github.com/codelucas/newspaper/issues/142)
- Library downgrading in installation [\#138](https://github.com/codelucas/newspaper/issues/138)

**Merged pull requests:**

- Don't scrape error pages [\#190](https://github.com/codelucas/newspaper/pull/190) ([yprez](https://github.com/yprez))
- Added Hebrew stop words for language support [\#188](https://github.com/codelucas/newspaper/pull/188) ([alon7](https://github.com/alon7))
- Fix installation and build [\#187](https://github.com/codelucas/newspaper/pull/187) ([yprez](https://github.com/yprez))
- Fix installation docs [\#184](https://github.com/codelucas/newspaper/pull/184) ([yprez](https://github.com/yprez))
- Travis CI integration [\#180](https://github.com/codelucas/newspaper/pull/180) ([yprez](https://github.com/yprez))
- requirements.txt - Use minimal instead of exact versions [\#179](https://github.com/codelucas/newspaper/pull/179) ([yprez](https://github.com/yprez))
- Handle lxml raising ValueError on node.itertext\(\) - Python 3 [\#178](https://github.com/codelucas/newspaper/pull/178) ([yprez](https://github.com/yprez))
- Handle lxml raising ValueError on node.itertext\(\) [\#144](https://github.com/codelucas/newspaper/pull/144) ([yprez](https://github.com/yprez))
- Parse byline fix [\#132](https://github.com/codelucas/newspaper/pull/132) ([davecrumbacher](https://github.com/davecrumbacher))

## [0.1.6](https://github.com/codelucas/newspaper/tree/0.1.6) (2016-01-10)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.5...0.1.6)

**Closed issues:**

- Critical leak in newspaper.mthreading.Worker [\#177](https://github.com/codelucas/newspaper/issues/177)
- HTMLParseError [\#165](https://github.com/codelucas/newspaper/issues/165)
- Take local paths to .html files [\#153](https://github.com/codelucas/newspaper/issues/153)
- Wall Street Journal Full Text is not Correctly Scraped [\#150](https://github.com/codelucas/newspaper/issues/150)
- Article HTML Returning Null [\#131](https://github.com/codelucas/newspaper/issues/131)
- No articles [\#130](https://github.com/codelucas/newspaper/issues/130)
- Loading Pages that use heavy javascript [\#127](https://github.com/codelucas/newspaper/issues/127)
- Login handling for premium websites [\#126](https://github.com/codelucas/newspaper/issues/126)
- Installation of nltk is failing [\#121](https://github.com/codelucas/newspaper/issues/121)

**Merged pull requests:**

- Support urls with dots [\#176](https://github.com/codelucas/newspaper/pull/176) ([alexanderlukanin13](https://github.com/alexanderlukanin13))
- upgrade beautifulsoup4 to 4.4.1 for python 3.5 [\#171](https://github.com/codelucas/newspaper/pull/171) ([AlJohri](https://github.com/AlJohri))
- Updated requests version [\#170](https://github.com/codelucas/newspaper/pull/170) ([adrienthiery](https://github.com/adrienthiery))
- Turkish Language added [\#169](https://github.com/codelucas/newspaper/pull/169) ([muratcorlu](https://github.com/muratcorlu))
- Add macedonian stopwords [\#166](https://github.com/codelucas/newspaper/pull/166) ([dimitrovskif](https://github.com/dimitrovskif))
- Issue\#95 added graceful string concatenation [\#157](https://github.com/codelucas/newspaper/pull/157) ([surajssd](https://github.com/surajssd))
- fix for "jpeg error with PIL, Can't convert 'NoneType' object to str implicitly" [\#154](https://github.com/codelucas/newspaper/pull/154) ([hnykda](https://github.com/hnykda))
- bugfix in article.py, is\_valid\_body [\#149](https://github.com/codelucas/newspaper/pull/149) ([ms8r](https://github.com/ms8r))
- Fixed typo [\#139](https://github.com/codelucas/newspaper/pull/139) ([Eleonore9](https://github.com/Eleonore9))
- Correct link for the Python 3 branch [\#136](https://github.com/codelucas/newspaper/pull/136) ([jtpio](https://github.com/jtpio))
- Add python3-pip install step for Ubuntu [\#135](https://github.com/codelucas/newspaper/pull/135) ([irnc](https://github.com/irnc))

## [0.1.5](https://github.com/codelucas/newspaper/tree/0.1.5) (2015-03-04)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.4...0.1.5)

**Closed issues:**

- is there any kind of documentation on centos 7? [\#114](https://github.com/codelucas/newspaper/issues/114)
- Add extraction of publishing date from article. [\#3](https://github.com/codelucas/newspaper/issues/3)

**Merged pull requests:**

- bumping nltk to 2.0.5 - see \#824 in nltk [\#125](https://github.com/codelucas/newspaper/pull/125) ([hexelon](https://github.com/hexelon))

## [0.1.4](https://github.com/codelucas/newspaper/tree/0.1.4) (2015-02-04)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.3...0.1.4)

**Closed issues:**

- Getting rate limiting issue? [\#116](https://github.com/codelucas/newspaper/issues/116)
- newspaper.build\( \) error [\#111](https://github.com/codelucas/newspaper/issues/111)
- Allow lists in Parser.clean\_article\_html\(\) [\#108](https://github.com/codelucas/newspaper/issues/108)

**Merged pull requests:**

- Fix incorrect log call while generating articles [\#115](https://github.com/codelucas/newspaper/pull/115) ([curita](https://github.com/curita))
- Allow lists in clean\_article\_html\(\) - fixes \#108 [\#112](https://github.com/codelucas/newspaper/pull/112) ([ecesena](https://github.com/ecesena))
- Fixed nodeToString\(\) to return valid HTML [\#110](https://github.com/codelucas/newspaper/pull/110) ([ecesena](https://github.com/ecesena))
- Fixed empty return in top\_meta\_image [\#109](https://github.com/codelucas/newspaper/pull/109) ([ecesena](https://github.com/ecesena))

## [0.1.3](https://github.com/codelucas/newspaper/tree/0.1.3) (2015-01-15)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.2...0.1.3)

**Implemented enhancements:**

- Fulltext extraction improvement \#1 [\#105](https://github.com/codelucas/newspaper/issues/105)

**Closed issues:**

- Tags h1 in article\_html - intended behavior? [\#107](https://github.com/codelucas/newspaper/issues/107)

**Merged pull requests:**

- Fulltext extraction improvement \#1 [\#106](https://github.com/codelucas/newspaper/pull/106) ([codelucas](https://github.com/codelucas))

## [0.1.2](https://github.com/codelucas/newspaper/tree/0.1.2) (2015-01-01)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.1...0.1.2)

**Closed issues:**

- Metatags on Vice.com [\#103](https://github.com/codelucas/newspaper/issues/103)
- Can't extract images from german newspapers [\#96](https://github.com/codelucas/newspaper/issues/96)
- article\_html misses many of the images [\#89](https://github.com/codelucas/newspaper/issues/89)

**Merged pull requests:**

- Integrate UnicodeDammit, deprecate parser\_class, deprecate encodeValue, refactor, scaffolding for more unit tests [\#104](https://github.com/codelucas/newspaper/pull/104) ([codelucas](https://github.com/codelucas))

## [0.1.1](https://github.com/codelucas/newspaper/tree/0.1.1) (2014-12-27)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.1.0...0.1.1)

**Closed issues:**

- UnicodeDecodeError: 'utf8' codec can't decode byte 0xcc [\#99](https://github.com/codelucas/newspaper/issues/99)
- TypeError: Can't convert 'bytes' object to str implicitly [\#98](https://github.com/codelucas/newspaper/issues/98)
- \[Parse lxml ERR\] Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration. [\#78](https://github.com/codelucas/newspaper/issues/78)
- UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 11: ordinal not in range\(128\) [\#77](https://github.com/codelucas/newspaper/issues/77)
- article.text  and keywords error [\#47](https://github.com/codelucas/newspaper/issues/47)

**Merged pull requests:**

- Huge bugfix to aid lxml DOM parsing + remove unhelpful and excess exception messages and added tracebacks to exception logging [\#102](https://github.com/codelucas/newspaper/pull/102) ([codelucas](https://github.com/codelucas))
- Decode bytestring returned from lxml's `toString` early on before sending it out to outer code [\#101](https://github.com/codelucas/newspaper/pull/101) ([codelucas](https://github.com/codelucas))
- Fixed \#78: Remove encoding tag because lxml won't accept it for unicode [\#97](https://github.com/codelucas/newspaper/pull/97) ([mhall1](https://github.com/mhall1))

## [0.1.0](https://github.com/codelucas/newspaper/tree/0.1.0) (2014-12-17)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.9...0.1.0)

## [0.0.9](https://github.com/codelucas/newspaper/tree/0.0.9) (2014-12-17)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.8...0.0.9)

**Closed issues:**

- object has no attribute clean Error when using parse method [\#90](https://github.com/codelucas/newspaper/issues/90)
- Questions [\#85](https://github.com/codelucas/newspaper/issues/85)
- \[nltk\_data\] Error loading brown: \<urlopen error \[Errno -2\] Name or \[nltk\_data\]     service not known\> [\#84](https://github.com/codelucas/newspaper/issues/84)
- newspaper unable to find embedded youtube video [\#82](https://github.com/codelucas/newspaper/issues/82)
- Bound for memory usage [\#81](https://github.com/codelucas/newspaper/issues/81)
- Hosted demo [\#80](https://github.com/codelucas/newspaper/issues/80)
- Having issues installing due to lxml [\#79](https://github.com/codelucas/newspaper/issues/79)
- Add a BeautifulSoup4 parser. [\#44](https://github.com/codelucas/newspaper/issues/44)
- python 3 support request [\#36](https://github.com/codelucas/newspaper/issues/36)

**Merged pull requests:**

- update jieba to 0.35 [\#94](https://github.com/codelucas/newspaper/pull/94) ([WingGao](https://github.com/WingGao))
- Parse was breaking in the method clean\_article\_html when keep\_article\_ht... [\#88](https://github.com/codelucas/newspaper/pull/88) ([phoenixwizard](https://github.com/phoenixwizard))
- split title with \_  [\#87](https://github.com/codelucas/newspaper/pull/87) ([deweydu](https://github.com/deweydu))
- Update to support python3 [\#86](https://github.com/codelucas/newspaper/pull/86) ([log0ymxm](https://github.com/log0ymxm))
- Added link to basic demo [\#83](https://github.com/codelucas/newspaper/pull/83) ([iwasrobbed](https://github.com/iwasrobbed))
- Add splitting of slash-separated titles [\#75](https://github.com/codelucas/newspaper/pull/75) ([igor-shevchenko](https://github.com/igor-shevchenko))

## [0.0.8](https://github.com/codelucas/newspaper/tree/0.0.8) (2014-10-13)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.7...0.0.8)

**Closed issues:**

- Parsing Raw HTML [\#74](https://github.com/codelucas/newspaper/issues/74)
- Can't install newspaper [\#72](https://github.com/codelucas/newspaper/issues/72)
- Refactor codebase so newspaper is actually pythonic [\#70](https://github.com/codelucas/newspaper/issues/70)
- Article.top\_node == Article.clean\_top\_node [\#65](https://github.com/codelucas/newspaper/issues/65)
- article.movies missing 'http:' [\#64](https://github.com/codelucas/newspaper/issues/64)
- KeyError when calling newspaper.languages\(\) [\#62](https://github.com/codelucas/newspaper/issues/62)
- Memoize Articles - Not Printing [\#61](https://github.com/codelucas/newspaper/issues/61)
- Add URL headers while building a "paper" [\#60](https://github.com/codelucas/newspaper/issues/60)
- AttributeError: 'module' object has no attribute 'build' [\#59](https://github.com/codelucas/newspaper/issues/59)
- Typo in newspaper.build argument "memoize\_articles" [\#58](https://github.com/codelucas/newspaper/issues/58)
- issue with stopwords-tr.txt [\#51](https://github.com/codelucas/newspaper/issues/51)
- Other language support.  [\#34](https://github.com/codelucas/newspaper/issues/34)
- Character encoding detection [\#2](https://github.com/codelucas/newspaper/issues/2)

**Merged pull requests:**

- Huge refactor: entire codebase in PEP8, imports alphabetized, bugfixes, core changes [\#71](https://github.com/codelucas/newspaper/pull/71) ([codelucas](https://github.com/codelucas))
- Meta tag extraction fixes [\#69](https://github.com/codelucas/newspaper/pull/69) ([karls](https://github.com/karls))
- Test suite improvements [\#68](https://github.com/codelucas/newspaper/pull/68) ([karls](https://github.com/karls))
- Test suite fixes [\#67](https://github.com/codelucas/newspaper/pull/67) ([karls](https://github.com/karls))
- Revert "Added published date to the extractor+article" [\#66](https://github.com/codelucas/newspaper/pull/66) ([codelucas](https://github.com/codelucas))
- Added published date to the extractor+article [\#63](https://github.com/codelucas/newspaper/pull/63) ([parhammmm](https://github.com/parhammmm))

## [0.0.7](https://github.com/codelucas/newspaper/tree/0.0.7) (2014-06-17)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.6...0.0.7)

**Closed issues:**

- no document on how to add language [\#57](https://github.com/codelucas/newspaper/issues/57)
- Retain \<a\> tags in top article node? [\#56](https://github.com/codelucas/newspaper/issues/56)
- DocumentCleaner is missing clean\_body\_classes [\#55](https://github.com/codelucas/newspaper/issues/55)
- You must download and parse an article before parsing it [\#52](https://github.com/codelucas/newspaper/issues/52)
- Not extracting UL LI text [\#50](https://github.com/codelucas/newspaper/issues/50)
- article does not release\_resources\(\) [\#42](https://github.com/codelucas/newspaper/issues/42)
- Doesn't work on http://www.le360.ma/fr [\#40](https://github.com/codelucas/newspaper/issues/40)
- How to assign html content without downloading it? [\#37](https://github.com/codelucas/newspaper/issues/37)
- Python venv only? [\#32](https://github.com/codelucas/newspaper/issues/32)
- .nlp\(\) could not work [\#27](https://github.com/codelucas/newspaper/issues/27)
- Doesn't work with Arabic news sites [\#23](https://github.com/codelucas/newspaper/issues/23)
- SyntaxError: invalid syntax [\#19](https://github.com/codelucas/newspaper/issues/19)
- Retain HTML markup for extracted article [\#18](https://github.com/codelucas/newspaper/issues/18)
- Portuguese is misspelled [\#14](https://github.com/codelucas/newspaper/issues/14)
- Multi-threading article downloads not working [\#12](https://github.com/codelucas/newspaper/issues/12)
- Timegm error? [\#10](https://github.com/codelucas/newspaper/issues/10)
- Problem in Brazilian sites [\#9](https://github.com/codelucas/newspaper/issues/9)
- Brazilian portuguese support [\#6](https://github.com/codelucas/newspaper/issues/6)

**Merged pull requests:**

- Fix typo in code and documentation [\#54](https://github.com/codelucas/newspaper/pull/54) ([jacquerie](https://github.com/jacquerie))
- removed quotes of 'filename' in utils\\_\_init\_\_.py [\#53](https://github.com/codelucas/newspaper/pull/53) ([jay8688](https://github.com/jay8688))
- Fixed long-form article issue w/ calculate\_best\_node [\#49](https://github.com/codelucas/newspaper/pull/49) ([jeffnappi](https://github.com/jeffnappi))
- Use first image from article top\_node [\#35](https://github.com/codelucas/newspaper/pull/35) ([otemnov](https://github.com/otemnov))
- Add a section with links to related projects [\#33](https://github.com/codelucas/newspaper/pull/33) ([cantino](https://github.com/cantino))
- Original [\#30](https://github.com/codelucas/newspaper/pull/30) ([otemnov](https://github.com/otemnov))
- Fix reddit top image [\#29](https://github.com/codelucas/newspaper/pull/29) ([otemnov](https://github.com/otemnov))
- Extract Meta Tags in structured way [\#28](https://github.com/codelucas/newspaper/pull/28) ([voidfiles](https://github.com/voidfiles))
- Replace instances of 'Portugease' with 'Portuguese' [\#26](https://github.com/codelucas/newspaper/pull/26) ([WheresWardy](https://github.com/WheresWardy))
- It's The Changelog not The ChangeLog :\) [\#24](https://github.com/codelucas/newspaper/pull/24) ([adamstac](https://github.com/adamstac))
- syntax errors [\#22](https://github.com/codelucas/newspaper/pull/22) ([arjun024](https://github.com/arjun024))
- Support for more HTML tags in parsers.py [\#21](https://github.com/codelucas/newspaper/pull/21) ([WheresWardy](https://github.com/WheresWardy))
- Fixed syntax error [\#20](https://github.com/codelucas/newspaper/pull/20) ([damilare](https://github.com/damilare))
- Minor Performance tweaks [\#17](https://github.com/codelucas/newspaper/pull/17) ([techaddict](https://github.com/techaddict))
- Update README.rst [\#15](https://github.com/codelucas/newspaper/pull/15) ([girasquid](https://github.com/girasquid))
- Minor Typo candiate\_words -\> candidate\_words [\#13](https://github.com/codelucas/newspaper/pull/13) ([techaddict](https://github.com/techaddict))

## [0.0.6](https://github.com/codelucas/newspaper/tree/0.0.6) (2014-01-18)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.5...0.0.6)

**Closed issues:**

- Port to Ruby [\#8](https://github.com/codelucas/newspaper/issues/8)
- Huge internationalization / API revamp underway! [\#7](https://github.com/codelucas/newspaper/issues/7)
- Multithread & gevent framework built into newspaper [\#4](https://github.com/codelucas/newspaper/issues/4)

**Merged pull requests:**

- Add article html extraction [\#11](https://github.com/codelucas/newspaper/pull/11) ([voidfiles](https://github.com/voidfiles))

## [0.0.5](https://github.com/codelucas/newspaper/tree/0.0.5) (2014-01-09)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.4...0.0.5)

## [0.0.4](https://github.com/codelucas/newspaper/tree/0.0.4) (2013-12-31)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.3...0.0.4)

**Closed issues:**

- Calling nlp\(\) on an article causes 'tokenizers/punkt/english.pickle' Not Found Error [\#1](https://github.com/codelucas/newspaper/issues/1)

**Merged pull requests:**

- Fix for keyword arg usage in print\(\) on Python 2.7 [\#5](https://github.com/codelucas/newspaper/pull/5) ([michaelhood](https://github.com/michaelhood))

## [0.0.3](https://github.com/codelucas/newspaper/tree/0.0.3) (2013-12-22)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.2...0.0.3)

## [0.0.2](https://github.com/codelucas/newspaper/tree/0.0.2) (2013-12-21)
[Full Changelog](https://github.com/codelucas/newspaper/compare/0.0.1...0.0.2)

## [0.0.1](https://github.com/codelucas/newspaper/tree/0.0.1) (2013-12-21)


\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)*

================================================
FILE: GOOSE-LICENSE.txt
================================================

                              Apache License
                        Version 2.0, January 2004
                     http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

   "License" shall mean the terms and conditions for use, reproduction,
   and distribution as defined by Sections 1 through 9 of this document.

   "Licensor" shall mean the copyright owner or entity authorized by
   the copyright owner that is granting the License.

   "Legal Entity" shall mean the union of the acting entity and all
   other entities that control, are controlled by, or are under common
   control with that entity. For the purposes of this definition,
   "control" means (i) the power, direct or indirect, to cause the
   direction or management of such entity, whether by contract or
   otherwise, or (ii) ownership of fifty percent (50%) or more of the
   outstanding shares, or (iii) beneficial ownership of such entity.

   "You" (or "Your") shall mean an individual or Legal Entity
   exercising permissions granted by this License.

   "Source" form shall mean the preferred form for making modifications,
   including but not limited to software source code, documentation
   source, and configuration files.

   "Object" form shall mean any form resulting from mechanical
   transformation or translation of a Source form, including but
   not limited to compiled object code, generated documentation,
   and conversions to other media types.

   "Work" shall mean the work of authorship, whether in Source or
   Object form, made available under the License, as indicated by a
   copyright notice that is included in or attached to the work
   (an example is provided in the Appendix below).

   "Derivative Works" shall mean any work, whether in Source or Object
   form, that is based on (or derived from) the Work and for which the
   editorial revisions, annotations, elaborations, or other modifications
   represent, as a whole, an original work of authorship. For the purposes
   of this License, Derivative Works shall not include works that remain
   separable from, or merely link (or bind by name) to the interfaces of,
   the Work and Derivative Works thereof.

   "Contribution" shall mean any work of authorship, including
   the original version of the Work and any modifications or additions
   to that Work or Derivative Works thereof, that is intentionally
   submitted to Licensor for inclusion in the Work by the copyright owner
   or by an individual or Legal Entity authorized to submit on behalf of
   the copyright owner. For the purposes of this definition, "submitted"
   means any form of electronic, verbal, or written communication sent
   to the Licensor or its representatives, including but not limited to
   communication on electronic mailing lists, source code control systems,
   and issue tracking systems that are managed by, or on behalf of, the
   Licensor for the purpose of discussing and improving the Work, but
   excluding communication that is conspicuously marked or otherwise
   designated in writing by the copyright owner as "Not a Contribution."

   "Contributor" shall mean Licensor and any individual or Legal Entity
   on behalf of whom a Contribution has been received by Licensor and
   subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   copyright license to reproduce, prepare Derivative Works of,
   publicly display, publicly perform, sublicense, and distribute the
   Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   (except as stated in this section) patent license to make, have made,
   use, offer to sell, sell, import, and otherwise transfer the Work,
   where such license applies only to those patent claims licensable
   by such Contributor that are necessarily infringed by their
   Contribution(s) alone or by combination of their Contribution(s)
   with the Work to which such Contribution(s) was submitted. If You
   institute patent litigation against any entity (including a
   cross-claim or counterclaim in a lawsuit) alleging that the Work
   or a Contribution incorporated within the Work constitutes direct
   or contributory patent infringement, then any patent licenses
   granted to You under this License for that Work shall terminate
   as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
   Work or Derivative Works thereof in any medium, with or without
   modifications, and in Source or Object form, provided that You
   meet the following conditions:

   (a) You must give any other recipients of the Work or
       Derivative Works a copy of this License; and

   (b) You must cause any modified files to carry prominent notices
       stating that You changed the files; and

   (c) You must retain, in the Source form of any Derivative Works
       that You distribute, all copyright, patent, trademark, and
       attribution notices from the Source form of the Work,
       excluding those notices that do not pertain to any part of
       the Derivative Works; and

   (d) If the Work includes a "NOTICE" text file as part of its
       distribution, then any Derivative Works that You distribute must
       include a readable copy of the attribution notices contained
       within such NOTICE file, excluding those notices that do not
       pertain to any part of the Derivative Works, in at least one
       of the following places: within a NOTICE text file distributed
       as part of the Derivative Works; within the Source form or
       documentation, if provided along with the Derivative Works; or,
       within a display generated by the Derivative Works, if and
       wherever such third-party notices normally appear. The contents
       of the NOTICE file are for informational purposes only and
       do not modify the License. You may add Your own attribution
       notices within Derivative Works that You distribute, alongside
       or as an addendum to the NOTICE text from the Work, provided
       that such additional attribution notices cannot be construed
       as modifying the License.

   You may add Your own copyright statement to Your modifications and
   may provide additional or different license terms and conditions
   for use, reproduction, or distribution of Your modifications, or
   for any such Derivative Works as a whole, provided Your use,
   reproduction, and distribution of the Work otherwise complies with
   the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
   any Contribution intentionally submitted for inclusion in the Work
   by You to the Licensor shall be under the terms and conditions of
   this License, without any additional terms or conditions.
   Notwithstanding the above, nothing herein shall supersede or modify
   the terms of any separate license agreement you may have executed
   with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
   names, trademarks, service marks, or product names of the Licensor,
   except as required for reasonable and customary use in describing the
   origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
   agreed to in writing, Licensor provides the Work (and each
   Contributor provides its Contributions) on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
   implied, including, without limitation, any warranties or conditions
   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
   PARTICULAR PURPOSE. You are solely responsible for determining the
   appropriateness of using or redistributing the Work and assume any
   risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
   whether in tort (including negligence), contract, or otherwise,
   unless required by applicable law (such as deliberate and grossly
   negligent acts) or agreed to in writing, shall any Contributor be
   liable to You for damages, including any direct, indirect, special,
   incidental, or consequential damages of any character arising as a
   result of this License or out of the use or inability to use the
   Work (including but not limited to damages for loss of goodwill,
   work stoppage, computer failure or malfunction, or any and all
   other commercial damages or losses), even if such Contributor
   has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
   the Work or Derivative Works thereof, You may choose to offer,
   and charge a fee for, acceptance of support, warranty, indemnity,
   or other liability obligations and/or rights consistent with this
   License. However, in accepting such obligations, You may act only
   on Your own behalf and on Your sole responsibility, not on behalf
   of any other Contributor, and only if You agree to indemnify,
   defend, and hold each Contributor harmless for any liability
   incurred by, or claims asserted against, such Contributor by reason
   of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

   To apply the Apache License to your work, attach the following
   boilerplate notice, with the fields enclosed by brackets "[]"
   replaced with your own identifying information. (Don't include
   the brackets!)  The text should be enclosed in the appropriate
   comment syntax for the file format. We also recommend that a
   file or class name and description of purpose be included on the
   same "printed page" as the copyright notice for easier
   identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

================================================
FILE: LICENSE
================================================
The MIT License (MIT)

Copyright (c) 2013 Lucas Ou-Yang

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: MANIFEST.in
================================================
include requirements.txt README.rst LICENSE
recursive-include newspaper *
recursive-exclude * __pycache__
recursive-exclude * *.py[co]

================================================
FILE: README.rst
================================================
Newspaper3k: Article scraping & curation
========================================

.. image:: https://badge.fury.io/py/newspaper3k.svg
        :target: http://badge.fury.io/py/newspaper3k.svg
        :alt: Latest version

.. image:: https://travis-ci.org/codelucas/newspaper.svg
        :target: http://travis-ci.org/codelucas/newspaper/
        :alt: Build status

.. image:: https://coveralls.io/repos/github/codelucas/newspaper/badge.svg?branch=master
        :target: https://coveralls.io/github/codelucas/newspaper
        :alt: Coverage status


Inspired by `requests`_ for its simplicity and powered by `lxml`_ for its speed:

    "Newspaper is an amazing python library for extracting & curating articles."
    -- `tweeted by`_ Kenneth Reitz, Author of `requests`_

    "Newspaper delivers Instapaper style article extraction." -- `The Changelog`_

.. _`tweeted by`: https://twitter.com/kennethreitz/status/419520678862548992
.. _`The Changelog`: http://thechangelog.com/newspaper-delivers-instapaper-style-article-extraction/

**Newspaper is a Python3 library**! Or, view our **deprecated and buggy** `Python2 branch`_

.. _`Python2 branch`: https://github.com/codelucas/newspaper/tree/python-2-head

A Glance:
---------

.. code-block:: pycon

    >>> from newspaper import Article

    >>> url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    >>> article = Article(url)

.. code-block:: pycon

    >>> article.download()

    >>> article.html
    '<!DOCTYPE HTML><html itemscope itemtype="http://...'

.. code-block:: pycon

    >>> article.parse()

    >>> article.authors
    ['Leigh Ann Caldwell', 'John Honway']

    >>> article.publish_date
    datetime.datetime(2013, 12, 30, 0, 0)

    >>> article.text
    "Washington (CNN) -- Not everyone subscribes to a New Year's resolution..."

    >>> article.top_image
    'http://someCDN.com/blah/blah/blah/file.png'

    >>> article.movies
    ['http://youtube.com/path/to/link.com', ...]

.. code-block:: pycon

    >>> article.nlp()

    >>> article.keywords
    ['New Years', 'resolution', ...]

    >>> article.summary
    'The study shows that 93% of people ...'

.. code-block:: pycon

    >>> import newspaper

    >>> cnn_paper = newspaper.build('http://cnn.com')

    >>> for article in cnn_paper.articles:
    ...     print(article.url)
    http://www.cnn.com/2013/11/27/justice/tucson-arizona-captive-girls/
    http://www.cnn.com/2013/12/11/us/texas-teen-dwi-wreck/index.html
    ...

    >>> for category in cnn_paper.category_urls():
    ...     print(category)

    http://lifestyle.cnn.com
    http://cnn.com/world
    http://tech.cnn.com
    ...

    >>> cnn_article = cnn_paper.articles[0]
    >>> cnn_article.download()
    >>> cnn_article.parse()
    >>> cnn_article.nlp()
    ...

.. code-block:: pycon

    >>> import requests
    >>> from newspaper import fulltext

    >>> html = requests.get(...).text
    >>> text = fulltext(html)


Newspaper can extract and detect languages *seamlessly*.
If no language is specified, Newspaper will attempt to auto detect a language.

.. code-block:: pycon

    >>> from newspaper import Article
    >>> url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'

    >>> a = Article(url, language='zh') # Chinese

    >>> a.download()
    >>> a.parse()

    >>> print(a.text[:150])
    香港行政长官梁振英在各方压力下就其大宅的违章建
    筑(僭建)问题到立法会接受质询,并向香港民众道歉。
    梁振英在星期二(12月10日)的答问大会开始之际
    在其演说中道歉,但强调他在违章建筑问题上没有隐瞒的
    意图和动机。 一些亲北京阵营议员欢迎梁振英道歉,
    且认为应能获得香港民众接受,但这些议员也质问梁振英有

    >>> print(a.title)
    港特首梁振英就住宅违建事件道歉

Multi-lingual
=============

If you are certain that an *entire* news source is in one language, **go ahead and use the same API :)**

.. code-block:: pycon

    >>> import newspaper
    >>> sina_paper = newspaper.build('http://www.sina.com.cn/', language='zh')

    >>> for category in sina_paper.category_urls():
    ...     print(category)
    http://health.sina.com.cn
    http://eladies.sina.com.cn
    http://english.sina.com
    ...

    >>> article = sina_paper.articles[0]
    >>> article.download()
    >>> article.parse()

    >>> print(article.text)
    新浪武汉汽车综合 随着汽车市场的日趋成熟,
    传统的“集全家之力抱得爱车归”的全额购车模式已然过时,
    另一种轻松的新兴 车模式――金融购车正逐步成为时下消费者购
    买爱车最为时尚的消费理念,他们认为,这种新颖的购车
    模式既能在短期内
    ...

    >>> print(article.title)
    两年双免0手续0利率 科鲁兹掀背金融轻松购_武汉车市_武汉汽
    车网_新浪汽车_新浪网


Docs
----

Check out `The Docs`_ for full and detailed guides using newspaper.

Interested in adding a new language for us? Refer to: `Docs - Adding new languages <https://newspaper.readthedocs.io/en/latest/user_guide/advanced.html#adding-new-languages>`_

Features
--------

- Multi-threaded article download framework
- News url identification
- Text extraction from html
- Top image extraction from html
- All image extraction from html
- Keyword extraction from text
- Summary extraction from text
- Author extraction from text
- Google trending terms extraction
- Works in 10+ languages (English, Chinese, German, Arabic, ...)

.. code-block:: pycon

    >>> import newspaper
    >>> newspaper.languages()

    Your available languages are:
    input code      full name

      ar              Arabic
      be              Belarusian
      bg              Bulgarian
      da              Danish
      de              German
      el              Greek
      en              English
      es              Spanish
      et              Estonian
      fa              Persian
      fi              Finnish
      fr              French
      he              Hebrew
      hi              Hindi
      hr              Croatian
      hu              Hungarian
      id              Indonesian
      it              Italian
      ja              Japanese
      ko              Korean
      lt              Lithuanian
      mk              Macedonian
      nb              Norwegian (Bokmål)
      nl              Dutch
      no              Norwegian
      pl              Polish
      pt              Portuguese
      ro              Romanian
      ru              Russian
      sl              Slovenian
      sr              Serbian
      sv              Swedish
      sw              Swahili
      th              Thai
      tr              Turkish
      uk              Ukrainian
      vi              Vietnamese
      zh              Chinese

Get it now
----------

Run ✅ ``pip3 install newspaper3k`` ✅

NOT ⛔ ``pip3 install newspaper`` ⛔

On Python 3 you must install ``newspaper3k``, **not** ``newspaper``. ``newspaper`` is our Python 2 library.
Although installing newspaper is simple with `pip <http://www.pip-installer.org/>`_, you will
run into fixable issues if you are trying to install on Ubuntu.

**If you are on Debian / Ubuntu**, install using the following:

- Install the ``pip3`` command, needed to install the ``newspaper3k`` package::

    $ sudo apt-get install python3-pip

- Python development version, needed for Python.h::

    $ sudo apt-get install python-dev

- lxml requirements::

    $ sudo apt-get install libxml2-dev libxslt-dev

- For PIL to recognize .jpg images::

    $ sudo apt-get install libjpeg-dev zlib1g-dev libpng12-dev

NOTE: If you have problems installing ``libpng12-dev``, try installing ``libpng-dev`` instead.

- Download NLP related corpora::

    $ curl https://raw.githubusercontent.com/codelucas/newspaper/master/download_corpora.py | python3

- Install the distribution via pip::

    $ pip3 install newspaper3k

**If you are on OSX**, install using the following (you may use either Homebrew or MacPorts):

::

    $ brew install libxml2 libxslt

    $ brew install libtiff libjpeg webp little-cms2

    $ pip3 install newspaper3k

    $ curl https://raw.githubusercontent.com/codelucas/newspaper/master/download_corpora.py | python3


**Otherwise**, install with the following:

NOTE: You will most likely still need to install the following libraries via your package manager:

- PIL: ``libjpeg-dev`` ``zlib1g-dev`` ``libpng12-dev``
- lxml: ``libxml2-dev`` ``libxslt-dev``
- Python Development version: ``python-dev``

::

    $ pip3 install newspaper3k

    $ curl https://raw.githubusercontent.com/codelucas/newspaper/master/download_corpora.py | python3

Development
-----------

If you'd like to contribute and hack on the newspaper project, feel free to clone
a development version of this repository locally::

    git clone https://github.com/codelucas/newspaper.git

Once you have a copy of the source, you can embed it in your Python package,
or install it into your site-packages easily::

    $ pip3 install -r requirements.txt
    $ python3 setup.py install

Feel free to give our test suite a shot; everything is mocked!::

    $ python3 tests/unit_tests.py

Planning on tweaking our full-text algorithm? Add the ``fulltext`` parameter::

    $ python3 tests/unit_tests.py fulltext

Demo
----

View a working online demo here: http://newspaper-demo.herokuapp.com

This is another working online demo: http://newspaper.chinazt.cc/


Interested in proxies?
======================

Stay private, fast, and fully in control
----------------------------------------
`Click here to explore BestProxy`_, your go-to solution for premium residential proxies. BestProxy's proxies ensure smooth browsing, fast speeds, and total anonymity. `Get Started`_ today and experience the difference!

.. image:: https://github.com/user-attachments/assets/1c6ef38c-f0c0-4db0-aad2-3ed9d6adf0b5
        :target: https://bestproxy.com/?keyword=b2vgzl0r
        :alt: Experience BestProxy, smooth browsing, fast speeds, and total anonymity.

.. _`Click here to explore BestProxy`: https://bestproxy.com/?keyword=b2vgzl0r
.. _`Get Started`: https://bestproxy.com/?keyword=b2vgzl0r


Unlock the Web — the Smart Way
------------------------------
`Click here to see SerpApi, scrape search engines easily with SerpApi - Search API`_. 
Scrape Google Search, Google News, Google Maps, and more!

.. image:: https://github.com/user-attachments/assets/9a80eeb4-72a8-43f1-9413-93c7a47b2bf6
        :target: https://serpapi.com/google-news-api?utm_source=newspaper3k_github
        :alt: Scrape search engines easily with SerpApi - Search API. 

.. _`Click here to see SerpApi, scrape search engines easily with SerpApi - Search API`: https://serpapi.com?utm_source=newspaper3k_github

LICENSE
-------

Authored and maintained by `Lucas Ou-Yang`_.

`Parse.ly`_ sponsored some work on newspaper, specifically focused on
automatic extraction.

Newspaper uses a lot of `python-goose's`_ parsing code. View their license `here`_.

Please feel free to `email & contact me`_ if you run into issues or would just like
to talk about the future of this library and news extraction in general!

.. _`Lucas Ou-Yang`: http://codelucas.com
.. _`email & contact me`: mailto:lucasyangpersonal@gmail.com
.. _`python-goose's`: https://github.com/grangier/python-goose
.. _`here`: https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt

.. _`https://www.paypal.me/codelucas`: https://www.paypal.me/codelucas
.. _`Venmo`: https://www.venmo.com/Lucas-Ou-Yang

.. _`Quickstart guide`: https://newspaper.readthedocs.io/en/latest/
.. _`The Docs`: https://newspaper.readthedocs.io
.. _`lxml`: http://lxml.de/
.. _`requests`: https://github.com/kennethreitz/requests
.. _`Parse.ly`: http://parse.ly
.. _`It takes only one click`: https://tracking.gitads.io/?campaign=gitads&repo=newspaper&redirect=gitads.io


================================================
FILE: docs/Makefile
================================================
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
# SPHINXOPTS: extra options appended to every sphinx-build invocation.
SPHINXOPTS    =
# SPHINXBUILD: the sphinx-build executable that every builder target runs.
SPHINXBUILD   = sphinx-build
# PAPER: selects PAPEROPT_a4 or PAPEROPT_letter (empty selects neither).
PAPER         =
# BUILDDIR: directory under which all build output is written.
BUILDDIR      = _build

# User-friendly check for sphinx-build.
# `which` runs while the Makefile is being parsed; when its exit status is 1
# make stops immediately with the explanatory error message below.
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
# PAPEROPT_<size>: the -D override picked up via $(PAPEROPT_$(PAPER)).
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
# ALLSPHINXOPTS: shared doctree cache dir, paper option, user options, and the
# source directory (".").
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# Every target in this file names an action rather than a file, so all of them
# are declared phony; otherwise a stray file or directory with the same name
# (e.g. "info" or "xml") would make the target look up to date.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest xml pseudoxml

# help: print the list of builder targets with a one-line description each.
# As the first target in the file it is what a bare `make` runs.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

# clean: delete everything inside $(BUILDDIR) (the directory itself is kept).
# NOTE(review): if BUILDDIR is overridden with an empty value this expands to
# `rm -rf /*` -- consider guarding against an empty BUILDDIR.
clean:
	rm -rf $(BUILDDIR)/*

# html: run the Sphinx "html" builder, writing to $(BUILDDIR)/html.
html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

# dirhtml: run the Sphinx "dirhtml" builder, writing to $(BUILDDIR)/dirhtml.
dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

# singlehtml: run the Sphinx "singlehtml" builder, writing to
# $(BUILDDIR)/singlehtml.
singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

# pickle: run the Sphinx "pickle" builder, writing to $(BUILDDIR)/pickle.
pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

# json: run the Sphinx "json" builder, writing to $(BUILDDIR)/json.
json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

# htmlhelp: run the Sphinx "htmlhelp" builder, writing to $(BUILDDIR)/htmlhelp,
# then point the user at the generated .hhp project file.
htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

# qthelp: run the Sphinx "qthelp" builder, writing to $(BUILDDIR)/qthelp, then
# print the follow-up commands for building and viewing the Qt help collection.
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@# The inner quotes are escaped so the shell prints them instead of treating
	@# them as the end/start of adjacent string literals.
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/newspaper.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/newspaper.qhc"

# devhelp: run the Sphinx "devhelp" builder, writing to $(BUILDDIR)/devhelp,
# then print the commands for registering the output with Devhelp.
# `$$HOME` is escaped so that the literal text $HOME reaches the shell.
devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/newspaper"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/newspaper"
	@echo "# devhelp"

# epub: run the Sphinx "epub" builder, writing to $(BUILDDIR)/epub.
epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

# latex: run the Sphinx "latex" builder, writing to $(BUILDDIR)/latex.
# Only the LaTeX sources are produced; see `latexpdf` for the PDF step.
latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

# latexpdf: generate the LaTeX sources, then run the `all-pdf` target of the
# Makefile inside $(BUILDDIR)/latex via a sub-make.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

# latexpdfja: generate the LaTeX sources, then run the `all-pdf-ja` target of
# the Makefile inside $(BUILDDIR)/latex via a sub-make (platex + dvipdfmx).
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "platex/dvipdfmx finished; the PDF files are in $(BUILDDIR)/latex."

# text: run the Sphinx "text" builder, writing to $(BUILDDIR)/text.
text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# Build the Texinfo sources, then run the "info" target of the Makefile in the
# Texinfo output directory. $(MAKE) (not a literal "make") is used for the
# recursive call so command-line flags, the jobserver and -n/-t/-q handling
# propagate, matching the latexpdf and latexpdfja targets.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."


================================================
FILE: docs/_templates/sidebarintro.html
================================================
{# Sidebar fragment: project logo, GitHub "watch" button and project links.
   External resources use https so they are not blocked as mixed content
   when the documentation itself is served over HTTPS. #}
<p class="logo">
  <a href="{{ pathto(master_doc) }}">
    <img class="logo" style="margin-right:40px;" src="{{ pathto('_static/newspaper.jpg', 1) }}" alt="Newspaper logo" title="Newspaper logo."/>
  </a>
</p>

<p>
  <iframe src="https://ghbtns.com/github-btn.html?user=codelucas&amp;repo=newspaper&amp;type=watch&amp;count=true&amp;size=large"
    allowtransparency="true" frameborder="0" scrolling="0" width="200px" height="35px"></iframe>
</p>

<h3>Useful Links</h3>
<ul>
  <li><a href="https://github.com/codelucas/newspaper">Newspaper @ GitHub</a></li>
  <li><a href="https://pypi.python.org/pypi/newspaper">Newspaper @ PyPI</a></li>
  <li><a href="https://github.com/codelucas/newspaper/issues">Issue Tracker</a></li>
</ul>


================================================
FILE: docs/_templates/sidebarlogo.html
================================================
{# Sidebar fragment: project logo, GitHub "watch" button and project links.
   External resources use https so they are not blocked as mixed content
   when the documentation itself is served over HTTPS. #}
<p class="logo">
  <a href="{{ pathto(master_doc) }}">
    <img class="logo" style="margin-right:40px;" src="{{ pathto('_static/newspaper.jpg', 1) }}" alt="Newspaper logo" title="Newspaper logo."/>
  </a>
</p>

<p>
  <iframe src="https://ghbtns.com/github-btn.html?user=codelucas&amp;repo=newspaper&amp;type=watch&amp;count=true&amp;size=large"
    allowtransparency="true" frameborder="0" scrolling="0" width="200px" height="35px"></iframe>
</p>

<h3>Useful Links</h3>
<ul>
  <li><a href="https://github.com/codelucas/newspaper">Newspaper @ GitHub</a></li>
  <li><a href="https://pypi.python.org/pypi/newspaper">Newspaper @ PyPI</a></li>
  <li><a href="https://github.com/codelucas/newspaper/issues">Issue Tracker</a></li>
</ul>

================================================
FILE: docs/_themes/.gitignore
================================================
*.pyc
*.pyo
.DS_Store


================================================
FILE: docs/_themes/LICENSE
================================================
Modifications:

Copyright (c) 2011 Kenneth Reitz.


Original Project:

Copyright (c) 2010 by Armin Ronacher.


Some rights reserved.

Redistribution and use in source and binary forms of the theme, with or
without modification, are permitted provided that the following conditions
are met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above
  copyright notice, this list of conditions and the following
  disclaimer in the documentation and/or other materials provided
  with the distribution.

* The names of the contributors may not be used to endorse or
  promote products derived from this software without specific
  prior written permission.

We kindly ask you to only use these themes in an unmodified manner just
for Flask and Flask-related products, not for unrelated projects.  If you
like the visual style and want to use it for your own projects, please
consider making some larger changes to the themes (such as changing
font faces, sizes, colors or margins).

THIS THEME IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS THEME, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: docs/_themes/README.rst
================================================
krTheme Sphinx Style
====================

This repository contains sphinx styles Kenneth Reitz uses in most of
his projects. It is a derivative of Mitsuhiko's themes for Flask and Flask related
projects.  To use this style in your Sphinx documentation, follow
this guide:

1. put this folder as _themes into your docs folder.  Alternatively
   you can also use git submodules to check out the contents there.

2. add this to your conf.py: ::

    sys.path.append(os.path.abspath('_themes'))
    html_theme_path = ['_themes']
    html_theme = 'kr'  # or 'kr_small'

The following themes exist:

**kr**
    the standard flask documentation theme for large projects

**kr_small**
    small one-page theme.  Intended to be used by very small addon libraries.



================================================
FILE: docs/_themes/flask_theme_support.py
================================================
# flasky extensions.  flasky pygments style based on tango style
from pygments.style import Style
from pygments.token import Keyword, Name, Comment, String, Error, \
     Number, Operator, Generic, Whitespace, Punctuation, Other, Literal


class FlaskyStyle(Style):
    """Pygments style for the "flasky" look, based on the tango style.

    The ``kr`` and ``kr_small`` theme.conf files select this class through
    ``pygments_style = flask_theme_support.FlaskyStyle``.

    ``styles`` maps Pygments token types to style-definition strings (colour
    plus optional modifiers such as ``bold``, ``italic``, ``noitalic``,
    ``underline`` or ``border:<colour>``).  The trailing ``class:`` comment on
    each entry records the short class name the original author associated
    with that token type.
    """

    # Class-level settings read by the Pygments ``Style`` base class.
    background_color = "#f8f8f8"
    default_style = ""

    styles = {
        # No corresponding class for the following:
        #Text:                     "", # class:  ''
        Whitespace:                "underline #f8f8f8",      # class: 'w'
        Error:                     "#a40000 border:#ef2929", # class: 'err'
        Other:                     "#000000",                # class: 'x'

        Comment:                   "italic #8f5902", # class: 'c'
        Comment.Preproc:           "noitalic",       # class: 'cp'

        # All keyword flavours share one bold dark-blue style.
        Keyword:                   "bold #004461",   # class: 'k'
        Keyword.Constant:          "bold #004461",   # class: 'kc'
        Keyword.Declaration:       "bold #004461",   # class: 'kd'
        Keyword.Namespace:         "bold #004461",   # class: 'kn'
        Keyword.Pseudo:            "bold #004461",   # class: 'kp'
        Keyword.Reserved:          "bold #004461",   # class: 'kr'
        Keyword.Type:              "bold #004461",   # class: 'kt'

        Operator:                  "#582800",   # class: 'o'
        Operator.Word:             "bold #004461",   # class: 'ow' - like keywords

        Punctuation:               "bold #000000",   # class: 'p'

        # because special names such as Name.Class, Name.Function, etc.
        # are not recognized as such later in the parsing, we choose them
        # to look the same as ordinary variables.
        Name:                      "#000000",        # class: 'n'
        Name.Attribute:            "#c4a000",        # class: 'na' - to be revised
        Name.Builtin:              "#004461",        # class: 'nb'
        Name.Builtin.Pseudo:       "#3465a4",        # class: 'bp'
        Name.Class:                "#000000",        # class: 'nc' - to be revised
        Name.Constant:             "#000000",        # class: 'no' - to be revised
        Name.Decorator:            "#888",           # class: 'nd' - to be revised
        Name.Entity:               "#ce5c00",        # class: 'ni'
        Name.Exception:            "bold #cc0000",   # class: 'ne'
        Name.Function:             "#000000",        # class: 'nf'
        Name.Property:             "#000000",        # class: 'py'
        Name.Label:                "#f57900",        # class: 'nl'
        Name.Namespace:            "#000000",        # class: 'nn' - to be revised
        Name.Other:                "#000000",        # class: 'nx'
        Name.Tag:                  "bold #004461",   # class: 'nt' - like a keyword
        Name.Variable:             "#000000",        # class: 'nv' - to be revised
        Name.Variable.Class:       "#000000",        # class: 'vc' - to be revised
        Name.Variable.Global:      "#000000",        # class: 'vg' - to be revised
        Name.Variable.Instance:    "#000000",        # class: 'vi' - to be revised

        Number:                    "#990000",        # class: 'm'

        Literal:                   "#000000",        # class: 'l'
        Literal.Date:              "#000000",        # class: 'ld'

        # Every string flavour is green except docstrings, which are styled
        # like comments.
        String:                    "#4e9a06",        # class: 's'
        String.Backtick:           "#4e9a06",        # class: 'sb'
        String.Char:               "#4e9a06",        # class: 'sc'
        String.Doc:                "italic #8f5902", # class: 'sd' - like a comment
        String.Double:             "#4e9a06",        # class: 's2'
        String.Escape:             "#4e9a06",        # class: 'se'
        String.Heredoc:            "#4e9a06",        # class: 'sh'
        String.Interpol:           "#4e9a06",        # class: 'si'
        String.Other:              "#4e9a06",        # class: 'sx'
        String.Regex:              "#4e9a06",        # class: 'sr'
        String.Single:             "#4e9a06",        # class: 's1'
        String.Symbol:             "#4e9a06",        # class: 'ss'

        Generic:                   "#000000",        # class: 'g'
        Generic.Deleted:           "#a40000",        # class: 'gd'
        Generic.Emph:              "italic #000000", # class: 'ge'
        Generic.Error:             "#ef2929",        # class: 'gr'
        Generic.Heading:           "bold #000080",   # class: 'gh'
        Generic.Inserted:          "#00A000",        # class: 'gi'
        Generic.Output:            "#888",           # class: 'go'
        Generic.Prompt:            "#745334",        # class: 'gp'
        Generic.Strong:            "bold #000000",   # class: 'gs'
        Generic.Subheading:        "bold #800080",   # class: 'gu'
        Generic.Traceback:         "bold #a40000",   # class: 'gt'
    }


================================================
FILE: docs/_themes/kr/layout.html
================================================
{%- extends "basic/layout.html" %}

{%- block extrahead %}
  {{ super() }}
  {% if theme_touch_icon %}
    <link rel="apple-touch-icon" href="{{ pathto('_static/' ~ theme_touch_icon, 1) }}" />
  {% endif %}
  <meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9">
{% endblock %}

{%- block relbar2 %}{% endblock %}

{%- block footer %}
    <div class="footer">
      &copy; Copyright {{ copyright }}.
    </div>
{%- endblock %}


================================================
FILE: docs/_themes/kr/relations.html
================================================
<h3>Related Topics</h3>
<ul>
  <li><a href="{{ pathto(master_doc) }}">Documentation overview</a><ul>
  {%- for parent in parents %}
  <li><a href="{{ parent.link|e }}">{{ parent.title }}</a><ul>
  {%- endfor %}
    {%- if prev %}
      <li>Previous: <a href="{{ prev.link|e }}" title="{{ _('previous chapter')
        }}">{{ prev.title }}</a></li>
    {%- endif %}
    {%- if next %}
      <li>Next: <a href="{{ next.link|e }}" title="{{ _('next chapter')
        }}">{{ next.title }}</a></li>
    {%- endif %}
  {%- for parent in parents %}
  </ul></li>
  {%- endfor %}
  </ul></li>
</ul>


================================================
FILE: docs/_themes/kr/static/flasky.css_t
================================================
/*
 * flasky.css_t
 * ~~~~~~~~~~~~
 *
 * :copyright: Copyright 2010 by Armin Ronacher. Modifications by Kenneth Reitz.
 * :license: Flask Design License, see LICENSE for details.
 */

{% set page_width = '940px' %}
{% set sidebar_width = '220px' %}

@import url("basic.css");

/* -- page layout ----------------------------------------------------------- */

body {
    font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro';
    font-size: 17px;
    background-color: white;
    color: #000;
    margin: 0;
    padding: 0;
}

div.document {
    width: {{ page_width }};
    margin: 30px auto 0 auto;
}

div.documentwrapper {
    float: left;
    width: 100%;
}

div.bodywrapper {
    margin: 0 0 0 {{ sidebar_width }};
}

div.sphinxsidebar {
    width: {{ sidebar_width }};
}

hr {
    border: 1px solid #B1B4B6;
}

div.body {
    background-color: #ffffff;
    color: #3E4349;
    padding: 0 30px 0 30px;
}

img.floatingflask {
    padding: 0 0 10px 10px;
    float: right;
}

div.footer {
    width: {{ page_width }};
    margin: 20px auto 30px auto;
    font-size: 14px;
    color: #888;
    text-align: right;
}

div.footer a {
    color: #888;
}

div.related {
    display: none;
}

div.sphinxsidebar a {
    color: #444;
    text-decoration: none;
    border-bottom: 1px dotted #999;
}

div.sphinxsidebar a:hover {
    border-bottom: 1px solid #999;
}

div.sphinxsidebar {
    font-size: 14px;
    line-height: 1.5;
}

div.sphinxsidebarwrapper {
    padding: 18px 10px;
}

div.sphinxsidebarwrapper p.logo {
    padding: 0;
    margin: -10px 0 0 -20px;
    text-align: center;
}

div.sphinxsidebar h3,
div.sphinxsidebar h4 {
    font-family: 'Garamond', 'Georgia', serif;
    color: #444;
    font-size: 24px;
    font-weight: normal;
    margin: 0 0 5px 0;
    padding: 0;
}

div.sphinxsidebar h4 {
    font-size: 20px;
}

div.sphinxsidebar h3 a {
    color: #444;
}

div.sphinxsidebar p.logo a,
div.sphinxsidebar h3 a,
div.sphinxsidebar p.logo a:hover,
div.sphinxsidebar h3 a:hover {
    border: none;
}

div.sphinxsidebar p {
    color: #555;
    margin: 10px 0;
}

div.sphinxsidebar ul {
    margin: 10px 0;
    padding: 0;
    color: #000;
}

div.sphinxsidebar input {
    border: 1px solid #ccc;
    font-family: 'Georgia', serif;
    font-size: 1em;
}

/* -- body styles ----------------------------------------------------------- */

a {
    color: #004B6B;
    text-decoration: underline;
}

a:hover {
    color: #6D4100;
    text-decoration: underline;
}

div.body h1,
div.body h2,
div.body h3,
div.body h4,
div.body h5,
div.body h6 {
    font-family: 'Garamond', 'Georgia', serif;
    font-weight: normal;
    margin: 30px 0px 10px 0px;
    padding: 0;
}

div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; }
div.body h2 { font-size: 180%; }
div.body h3 { font-size: 150%; }
div.body h4 { font-size: 130%; }
div.body h5 { font-size: 100%; }
div.body h6 { font-size: 100%; }

a.headerlink {
    color: #ddd;
    padding: 0 4px;
    text-decoration: none;
}

a.headerlink:hover {
    color: #444;
    background: #eaeaea;
}

div.body p, div.body dd, div.body li {
    line-height: 1.4em;
}

div.admonition {
    background: #fafafa;
    margin: 20px -30px;
    padding: 10px 30px;
    border-top: 1px solid #ccc;
    border-bottom: 1px solid #ccc;
}

div.admonition tt.xref, div.admonition a tt {
    border-bottom: 1px solid #fafafa;
}

dd div.admonition {
    margin-left: -60px;
    padding-left: 60px;
}

div.admonition p.admonition-title {
    font-family: 'Garamond', 'Georgia', serif;
    font-weight: normal;
    font-size: 24px;
    margin: 0 0 10px 0;
    padding: 0;
    line-height: 1;
}

div.admonition p.last {
    margin-bottom: 0;
}

div.highlight {
    background-color: white;
}

dt:target, .highlight {
    background: #FAF3E8;
}

div.note {
    background-color: #eee;
    border: 1px solid #ccc;
}

div.seealso {
    background-color: #ffc;
    border: 1px solid #ff6;
}

div.topic {
    background-color: #eee;
}

p.admonition-title {
    display: inline;
}

p.admonition-title:after {
    content: ":";
}

pre, tt {
    font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
    font-size: 0.9em;
}

img.screenshot {
}

tt.descname, tt.descclassname {
    font-size: 0.95em;
}

tt.descname {
    padding-right: 0.08em;
}

img.screenshot {
    -moz-box-shadow: 2px 2px 4px #eee;
    -webkit-box-shadow: 2px 2px 4px #eee;
    box-shadow: 2px 2px 4px #eee;
}

table.docutils {
    border: 1px solid #888;
    -moz-box-shadow: 2px 2px 4px #eee;
    -webkit-box-shadow: 2px 2px 4px #eee;
    box-shadow: 2px 2px 4px #eee;
}

table.docutils td, table.docutils th {
    border: 1px solid #888;
    padding: 0.25em 0.7em;
}

table.field-list, table.footnote {
    border: none;
    -moz-box-shadow: none;
    -webkit-box-shadow: none;
    box-shadow: none;
}

table.footnote {
    margin: 15px 0;
    width: 100%;
    border: 1px solid #eee;
    background: #fdfdfd;
    font-size: 0.9em;
}

table.footnote + table.footnote {
    margin-top: -15px;
    border-top: none;
}

table.field-list th {
    padding: 0 0.8em 0 0;
}

table.field-list td {
    padding: 0;
}

table.footnote td.label {
    width: 0px;
    padding: 0.3em 0 0.3em 0.5em;
}

table.footnote td {
    padding: 0.3em 0.5em;
}

dl {
    margin: 0;
    padding: 0;
}

dl dd {
    margin-left: 30px;
}

blockquote {
    margin: 0 0 0 30px;
    padding: 0;
}

ul, ol {
    margin: 10px 0 10px 30px;
    padding: 0;
}

pre {
    background: #eee;
    padding: 7px 30px;
    margin: 15px -30px;
    line-height: 1.3em;
}

dl pre, blockquote pre, li pre {
    margin-left: -60px;
    padding-left: 60px;
}

dl dl pre {
    margin-left: -90px;
    padding-left: 90px;
}

tt {
    background-color: #ecf0f3;
    color: #222;
    /* padding: 1px 2px; */
}

tt.xref, a tt {
    background-color: #FBFBFB;
    border-bottom: 1px solid white;
}

a.reference {
    text-decoration: none;
    border-bottom: 1px dotted #004B6B;
}

a.reference:hover {
    border-bottom: 1px solid #6D4100;
}

a.footnote-reference {
    text-decoration: none;
    font-size: 0.7em;
    vertical-align: top;
    border-bottom: 1px dotted #004B6B;
}

a.footnote-reference:hover {
    border-bottom: 1px solid #6D4100;
}

a:hover tt {
    background: #EEE;
}


/*
 * Narrow screens (<= 870px): drop fixed widths and margins so the document
 * uses the full viewport width.
 *
 * NOTE(review): the later "max-width: 875px" query also matches these widths
 * and sets div.sphinxsidebar to display: block with equal specificity, so the
 * display: none below appears to be overridden -- confirm which is intended.
 */
@media screen and (max-width: 870px) {

    div.sphinxsidebar {
        display: none;
    }

    div.document {
        width: 100%;
    }

    div.documentwrapper {
        margin: 0;
    }

    div.bodywrapper {
        margin: 0;
    }

    ul {
        margin-left: 0;
    }

    .document {
        width: auto;
    }

    .footer {
        width: auto;
    }

    .bodywrapper {
        margin: 0;
    }

    .github {
        display: none;
    }
}



/*
 * Small screens (<= 875px): un-float the document wrapper and sidebar, restyle
 * the sidebar as a dark block (#333 background, light text) with its logo
 * hidden, show div.related, and hide the footer.
 */
@media screen and (max-width: 875px) {

    body {
        margin: 0;
        padding: 20px 30px;
    }

    div.documentwrapper {
        float: none;
        background: white;
    }

    div.sphinxsidebar {
        display: block;
        float: none;
        width: 102.5%;
        margin: 50px -30px -20px -30px;
        padding: 10px 20px;
        background: #333;
        color: white;
    }

    div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p,
    div.sphinxsidebar h3 a {
        color: white;
    }

    div.sphinxsidebar a {
        color: #aaa;
    }

    div.sphinxsidebar p.logo {
        display: none;
    }

    div.document {
        width: 100%;
        margin: 0;
    }

    div.related {
        display: block;
        margin: 0;
        padding: 10px 0 20px 0;
    }

    div.related ul,
    div.related ul li {
        margin: 0;
        padding: 0;
    }

    div.footer {
        display: none;
    }

    div.bodywrapper {
        margin: 0;
    }

    div.body {
        min-height: 0;
        padding: 0;
    }

    .rtd_doc_footer {
        display: none;
    }

    .document {
        width: auto;
    }

    .footer {
        width: auto;
    }

    .github {
        display: none;
    }
}


/* misc. */

.revsys-inline {
    display: none!important;
}

================================================
FILE: docs/_themes/kr/theme.conf
================================================
[theme]
inherit = basic
stylesheet = flasky.css
pygments_style = flask_theme_support.FlaskyStyle

[options]
touch_icon =


================================================
FILE: docs/_themes/kr_small/layout.html
================================================
{% extends "basic/layout.html" %}
{% block header %}
  {{ super() }}
  {% if pagename == 'index' %}
  <div class=indexwrapper>
  {% endif %}
{% endblock %}
{% block footer %}
  {% if pagename == 'index' %}
  </div>
  {% endif %}
{% endblock %}
{# do not display relbars #}
{% block relbar1 %}{% endblock %}
{% block relbar2 %}
  {# Optional "Fork me on GitHub" ribbon, rendered only when the github_fork
     theme option is set. https is used for both the link and the image so the
     ribbon is not flagged as mixed content on HTTPS-hosted docs. #}
  {% if theme_github_fork %}
    <a href="https://github.com/{{ theme_github_fork }}"><img style="position: fixed; top: 0; right: 0; border: 0;"
    src="https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png" alt="Fork me on GitHub" /></a>
  {% endif %}
{% endblock %}
{% block sidebar1 %}{% endblock %}
{% block sidebar2 %}{% endblock %}


================================================
FILE: docs/_themes/kr_small/static/flasky.css_t
================================================
/*
 * flasky.css_t
 * ~~~~~~~~~~~~
 *
 * Sphinx stylesheet -- flasky theme based on nature theme.
 *
 * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
 * :license: BSD, see LICENSE for details.
 *
 */

@import url("basic.css");

/* -- page layout ----------------------------------------------------------- */

body {
    font-family: 'Georgia', serif;
    font-size: 17px;
    color: #000;
    background: white;
    margin: 0;
    padding: 0;
}

div.documentwrapper {
    float: left;
    width: 100%;
}

div.bodywrapper {
    margin: 40px auto 0 auto;
    width: 700px;
}

hr {
    border: 1px solid #B1B4B6;
}

div.body {
    background-color: #ffffff;
    color: #3E4349;
    padding: 0 30px 30px 30px;
}

img.floatingflask {
    padding: 0 0 10px 10px;
    float: right;
}

div.footer {
    text-align: right;
    color: #888;
    padding: 10px;
    font-size: 14px;
    width: 650px;
    margin: 0 auto 40px auto;
}

div.footer a {
    color: #888;
    text-decoration: underline;
}

div.related {
    line-height: 32px;
    color: #888;
}

div.related ul {
    padding: 0 0 0 10px;
}

div.related a {
    color: #444;
}

/* -- body styles ----------------------------------------------------------- */

a {
    color: #004B6B;
    text-decoration: underline;
}

a:hover {
    color: #6D4100;
    text-decoration: underline;
}

div.body {
    padding-bottom: 40px; /* saved for footer */
}

div.body h1,
div.body h2,
div.body h3,
div.body h4,
div.body h5,
div.body h6 {
    font-family: 'Garamond', 'Georgia', serif;
    font-weight: normal;
    margin: 30px 0px 10px 0px;
    padding: 0;
}

{% if theme_index_logo %}
div.indexwrapper h1 {
    text-indent: -999999px;
    background: url({{ theme_index_logo }}) no-repeat center center;
    height: {{ theme_index_logo_height }};
}
{% endif %}

div.body h2 { font-size: 180%; }
div.body h3 { font-size: 150%; }
div.body h4 { font-size: 130%; }
div.body h5 { font-size: 100%; }
div.body h6 { font-size: 100%; }

a.headerlink {
    color: white;
    padding: 0 4px;
    text-decoration: none;
}

a.headerlink:hover {
    color: #444;
    background: #eaeaea;
}

div.body p, div.body dd, div.body li {
    line-height: 1.4em;
}

div.admonition {
    background: #fafafa;
    margin: 20px -30px;
    padding: 10px 30px;
    border-top: 1px solid #ccc;
    border-bottom: 1px solid #ccc;
}

div.admonition p.admonition-title {
    font-family: 'Garamond', 'Georgia', serif;
    font-weight: normal;
    font-size: 24px;
    margin: 0 0 10px 0;
    padding: 0;
    line-height: 1;
}

div.admonition p.last {
    margin-bottom: 0;
}

div.highlight{
    background-color: white;
}

dt:target, .highlight {
    background: #FAF3E8;
}

div.note {
    background-color: #eee;
    border: 1px solid #ccc;
}

div.seealso {
    background-color: #ffc;
    border: 1px solid #ff6;
}

div.topic {
    background-color: #eee;
}

div.warning {
    background-color: #ffe4e4;
    border: 1px solid #f66;
}

p.admonition-title {
    display: inline;
}

p.admonition-title:after {
    content: ":";
}

pre, tt {
    font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
    font-size: 0.85em;
}

img.screenshot {
}

tt.descname, tt.descclassname {
    font-size: 0.95em;
}

tt.descname {
    padding-right: 0.08em;
}

img.screenshot {
    -moz-box-shadow: 2px 2px 4px #eee;
    -webkit-box-shadow: 2px 2px 4px #eee;
    box-shadow: 2px 2px 4px #eee;
}

table.docutils {
    border: 1px solid #888;
    -moz-box-shadow: 2px 2px 4px #eee;
    -webkit-box-shadow: 2px 2px 4px #eee;
    box-shadow: 2px 2px 4px #eee;
}

table.docutils td, table.docutils th {
    border: 1px solid #888;
    padding: 0.25em 0.7em;
}

table.field-list, table.footnote {
    border: none;
    -moz-box-shadow: none;
    -webkit-box-shadow: none;
    box-shadow: none;
}

table.footnote {
    margin: 15px 0;
    width: 100%;
    border: 1px solid #eee;
}

table.field-list th {
    padding: 0 0.8em 0 0;
}

table.field-list td {
    padding: 0;
}

table.footnote td {
    padding: 0.5em;
}

dl {
    margin: 0;
    padding: 0;
}

dl dd {
    margin-left: 30px;
}

/*
 * Code blocks. The -30px horizontal margin cancels div.body's 30px side
 * padding, and the 30px horizontal padding restores the text inset.
 * Only the effective padding is kept (earlier duplicate declarations were
 * overridden); the unprefixed border-radius comes last so it wins over the
 * vendor-prefixed forms.
 */
pre {
    margin: 15px -30px;
    padding: 7px 30px;
    line-height: 1.3em;
    background: #eee;
    -moz-border-radius: 2px;
    -webkit-border-radius: 2px;
    border-radius: 2px;
}

dl pre {
    margin-left: -60px;
    padding-left: 60px;
}

tt {
    background-color: #ecf0f3;
    color: #222;
    /* padding: 1px 2px; */
}

tt.xref, a tt {
    background-color: #FBFBFB;
}

a:hover tt {
    background: #EEE;
}


================================================
FILE: docs/_themes/kr_small/theme.conf
================================================
[theme]
inherit = basic
stylesheet = flasky.css
nosidebar = true
pygments_style = flask_theme_support.FlaskyStyle

[options]
# Defaults are left empty (as kr/theme.conf does for touch_icon). Option
# values are read verbatim, so a quoted '' would be a non-empty string and
# would pass the templates' "if theme_index_logo" / "if theme_github_fork"
# checks even though nothing was configured.
index_logo =
index_logo_height = 120px
github_fork =


================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# newspaper documentation build configuration file, created by
# sphinx-quickstart on Sat Dec 21 22:26:51 2013.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import sys
import os

sys.path.append(os.path.abspath('_themes'))


# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.todo',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'newspaper'
copyright = '2013, <a href="http://codelucas.com">Lucas Ou-Yang</a>'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.0.2'
# The full version, including alpha/beta/rc tags.
release = '0.0.2'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']

# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = ['_themes']

html_theme = 'kr'

# The name for this set of Sphinx documents.  If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar.  Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
html_sidebars = {
    # Sidebar templates for the 'index' document.
    'index': [
        'sidebarintro.html',
        'sourcelink.html',
        'searchbox.html',
    ],
    # Sidebar templates for the '**' pattern (all documents).
    '**': [
        'sidebarlogo.html',
        'localtoc.html',
        'relations.html',
        'sourcelink.html',
        'searchbox.html',
    ],
}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it.  The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'newspaperdoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # Paper size ('letterpaper' or 'a4paper').
    # 'papersize': 'letterpaper',

    # Font size ('10pt', '11pt' or '12pt').
    # 'pointsize': '10pt',

    # Extra material for the LaTeX preamble.
    # 'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        'index',                    # source start file
        'newspaper.tex',            # target name
        'newspaper Documentation',  # title
        'Lucas Ou-Yang',            # author
        'manual',                   # documentclass
    ),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (
        'index',                    # source start file
        'newspaper',                # name
        'newspaper Documentation',  # description
        ['Lucas Ou-Yang'],          # authors
        1,                          # manual section
    ),
]

# If true, show URL addresses after external links.
#man_show_urls = False


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        'index',                           # source start file
        'newspaper',                       # target name
        'newspaper Documentation',         # title
        'Lucas Ou-Yang',                   # author
        'newspaper',                       # dir menu entry
        # Replaces the sphinx-quickstart placeholder
        # ('One line description of project.') with a real description.
        'Article scraping and curation.',  # description
        'Miscellaneous',                   # category
    ),
]

# Documents to append as an appendix to all manuals.
#texinfo_appendices = []

# If false, no module index is generated.
#texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False


================================================
FILE: docs/index.rst
================================================
Newspaper3k: Article scraping & curation
========================================

.. image:: https://badge.fury.io/py/newspaper3k.svg
        :target: https://badge.fury.io/py/newspaper3k
        :alt: Latest version

.. image:: https://travis-ci.org/codelucas/newspaper.svg
        :target: http://travis-ci.org/codelucas/newspaper/
        :alt: Build status

.. image:: https://coveralls.io/repos/github/codelucas/newspaper/badge.svg?branch=master
        :target: https://coveralls.io/github/codelucas/newspaper
        :alt: Coverage status


Inspired by `requests`_ for its simplicity and powered by `lxml`_ for its speed:

    "Newspaper is an amazing python library for extracting & curating articles."
    -- `tweeted by`_ Kenneth Reitz, Author of `requests`_

    "Newspaper delivers Instapaper style article extraction." -- `The Changelog`_

.. _`tweeted by`: https://twitter.com/kennethreitz/status/419520678862548992
.. _`The Changelog`: http://thechangelog.com/newspaper-delivers-instapaper-style-article-extraction/

**Newspaper is a Python3 library**! `View on Github here`_, or, view our **deprecated and buggy** `Python2 branch`_

.. _`Python2 branch`: https://github.com/codelucas/newspaper/tree/python-2-head
.. _`View on Github here`: https://github.com/codelucas/newspaper

A Glance:
---------

.. code-block:: pycon

    >>> from newspaper import Article

    >>> url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    >>> article = Article(url)

.. code-block:: pycon

    >>> article.download()

    >>> article.html
    '<!DOCTYPE HTML><html itemscope itemtype="http://...'

.. code-block:: pycon

    >>> article.parse()

    >>> article.authors
    ['Leigh Ann Caldwell', 'John Honway']

    >>> article.publish_date
    datetime.datetime(2013, 12, 30, 0, 0)

    >>> article.text
    "Washington (CNN) -- Not everyone subscribes to a New Year's resolution..."

    >>> article.top_image
    'http://someCDN.com/blah/blah/blah/file.png'

    >>> article.movies
    ['http://youtube.com/path/to/link.com', ...]

.. code-block:: pycon

    >>> article.nlp()

    >>> article.keywords
    ['New Years', 'resolution', ...]

    >>> article.summary
    'The study shows that 93% of people ...'

.. code-block:: pycon

    >>> import newspaper

    >>> cnn_paper = newspaper.build('http://cnn.com')

    >>> for article in cnn_paper.articles:
    ...     print(article.url)
    http://www.cnn.com/2013/11/27/justice/tucson-arizona-captive-girls/
    http://www.cnn.com/2013/12/11/us/texas-teen-dwi-wreck/index.html
    ...

    >>> for category in cnn_paper.category_urls():
    ...     print(category)

    http://lifestyle.cnn.com
    http://cnn.com/world
    http://tech.cnn.com
    ...

    >>> cnn_article = cnn_paper.articles[0]
    >>> cnn_article.download()
    >>> cnn_article.parse()
    >>> cnn_article.nlp()
    ...

.. code-block:: pycon

    >>> import requests
    >>> from newspaper import fulltext

    >>> html = requests.get(...).text
    >>> text = fulltext(html)


Newspaper has *seamless* language extraction and detection.
If no language is specified, Newspaper will attempt to auto detect a language.

.. code-block:: pycon

    >>> from newspaper import Article
    >>> url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'

    >>> a = Article(url, language='zh') # Chinese

    >>> a.download()
    >>> a.parse()

    >>> print(a.text[:150])
    香港行政长官梁振英在各方压力下就其大宅的违章建
    筑(僭建)问题到立法会接受质询,并向香港民众道歉。
    梁振英在星期二(12月10日)的答问大会开始之际
    在其演说中道歉,但强调他在违章建筑问题上没有隐瞒的
    意图和动机。 一些亲北京阵营议员欢迎梁振英道歉,
    且认为应能获得香港民众接受,但这些议员也质问梁振英有

    >>> print(a.title)
    港特首梁振英就住宅违建事件道歉


If you are certain that an *entire* news source is in one language, **go ahead and use the same api :)**

.. code-block:: pycon

    >>> import newspaper
    >>> sina_paper = newspaper.build('http://www.sina.com.cn/', language='zh')

    >>> for category in sina_paper.category_urls():
    ...     print(category)
    http://health.sina.com.cn
    http://eladies.sina.com.cn
    http://english.sina.com
    ...

    >>> article = sina_paper.articles[0]
    >>> article.download()
    >>> article.parse()

    >>> print(article.text)
    新浪武汉汽车综合 随着汽车市场的日趋成熟,
    传统的“集全家之力抱得爱车归”的全额购车模式已然过时,
    另一种轻松的新兴 车模式――金融购车正逐步成为时下消费者购
    买爱车最为时尚的消费理念,他们认为,这种新颖的购车
    模式既能在短期内
    ...

    >>> print(article.title)
    两年双免0手续0利率 科鲁兹掀背金融轻松购_武汉车市_武汉汽
    车网_新浪汽车_新浪网

Documentation
-------------

Check out `The Documentation`_ for full and detailed guides using newspaper.

Interested in adding a new language for us? Refer to: `Docs - Adding new languages <https://newspaper.readthedocs.io/en/latest/user_guide/advanced.html#adding-new-languages>`_

Features
--------

- Multi-threaded article download framework
- News url identification
- Text extraction from html
- Top image extraction from html
- All image extraction from html
- Keyword extraction from text
- Summary extraction from text
- Author extraction from text
- Google trending terms extraction
- Works in 10+ languages (English, Chinese, German, Arabic, ...)

.. code-block:: pycon

    >>> import newspaper
    >>> newspaper.languages()

    Your available languages are:
    input code      full name

      ar              Arabic
      be              Belarusian
      bg              Bulgarian
      da              Danish
      de              German
      el              Greek
      en              English
      es              Spanish
      et              Estonian
      fa              Persian
      fi              Finnish
      fr              French
      he              Hebrew
      hi              Hindi
      hr              Croatian
      hu              Hungarian
      id              Indonesian
      it              Italian
      ja              Japanese
      ko              Korean
      lt              Lithuanian
      mk              Macedonian
      nb              Norwegian (Bokmål)
      nl              Dutch
      no              Norwegian
      pl              Polish
      pt              Portuguese
      ro              Romanian
      ru              Russian
      sl              Slovenian
      sr              Serbian
      sv              Swedish
      sw              Swahili
      th              Thai
      tr              Turkish
      uk              Ukrainian
      vi              Vietnamese
      zh              Chinese


Get it now
----------

Run ✅ ``pip3 install newspaper3k`` ✅

NOT ⛔ ``pip3 install newspaper`` ⛔

On Python 3 you must install ``newspaper3k``, **not** ``newspaper``. ``newspaper`` is our Python 2 library.
Although installing newspaper is simple with `pip <https://pip.pypa.io/>`_, you may
run into (fixable) issues if you are installing on Ubuntu.

**If you are on Debian / Ubuntu**, install using the following:

- Install ``pip3`` command needed to install ``newspaper3k`` package::

    $ sudo apt-get install python3-pip

- Python 3 development headers, needed for ``Python.h``::

    $ sudo apt-get install python3-dev

- lxml requirements::

    $ sudo apt-get install libxml2-dev libxslt-dev

- For PIL to recognize .jpg images::

    $ sudo apt-get install libjpeg-dev zlib1g-dev libpng12-dev

NOTE: If you have problems installing ``libpng12-dev``, try installing ``libpng-dev`` instead.

- Download NLP related corpora::

    $ curl https://raw.githubusercontent.com/codelucas/newspaper/master/download_corpora.py | python3

- Install the distribution via pip::

    $ pip3 install newspaper3k

**If you are on OSX**, install using the following (you may use either Homebrew or MacPorts; the commands below use Homebrew):

::

    $ brew install libxml2 libxslt

    $ brew install libtiff libjpeg webp little-cms2

    $ pip3 install newspaper3k

    $ curl https://raw.githubusercontent.com/codelucas/newspaper/master/download_corpora.py | python3


**Otherwise**, install with the following:

NOTE: You will still most likely need to install the following libraries via your package manager

- PIL: ``libjpeg-dev`` ``zlib1g-dev`` ``libpng12-dev``
- lxml: ``libxml2-dev`` ``libxslt-dev``
- Python 3 development headers: ``python3-dev``

::

    $ pip3 install newspaper3k

    $ curl https://raw.githubusercontent.com/codelucas/newspaper/master/download_corpora.py | python3

Using Python 2.X? We still support Python 2; however, development on the 2.X branch stopped a
few years ago, so it is behind in features and has more bugs. `See python 2 installation instructions here <https://github.com/codelucas/newspaper/blob/python-2-head/README.rst>`_

Development
-----------

If you'd like to contribute and hack on the newspaper project, feel free to clone
a development version of this repository locally::

    git clone https://github.com/codelucas/newspaper.git

Once you have a copy of the source, you can embed it in your Python package,
or install it into your site-packages easily::

    $ pip3 install -r requirements.txt
    $ python3 setup.py install

Feel free to give our testing suite a shot, everything is mocked!::

    $ python3 tests/unit_tests.py

Planning on tweaking our full-text algorithm? Add the ``fulltext`` parameter::

    $ python3 tests/unit_tests.py fulltext

User Guide
----------

.. toctree::
   :maxdepth: 2

   user_guide/quickstart
   user_guide/advanced

Demo
----

- View a working online demo here: http://newspaper-demo.herokuapp.com
- This is another working online demo: http://newspaper.chinazt.cc/

LICENSE
-------

Authored and maintained by `Lucas Ou-Yang`_.

`Parse.ly`_ sponsored some work on newspaper, specifically focused on
automatic extraction.

Newspaper uses a lot of `python-goose's`_ parsing code. View their license `here`_.

Please feel free to `email & contact me`_ if you run into issues or just would like
to talk about the future of this library and news extraction in general!

.. _`Lucas Ou-Yang`: http://codelucas.com
.. _`email & contact me`: mailto:lucasyangpersonal@gmail.com
.. _`python-goose's`: https://github.com/grangier/python-goose
.. _`here`: https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt

.. _`Quickstart guide`: https://newspaper.readthedocs.io/en/latest/
.. _`The Documentation`: https://newspaper.readthedocs.io
.. _`lxml`: http://lxml.de/
.. _`requests`: https://github.com/kennethreitz/requests
.. _`Parse.ly`: http://parse.ly


================================================
FILE: docs/make.bat
================================================
@ECHO OFF

REM Command file for Sphinx documentation
REM Usage: make TARGET, where TARGET is one of the builders listed by "make help".

REM Default to the sphinx-build found on PATH unless SPHINXBUILD is already set.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
REM All build output goes below BUILDDIR, relative to the current directory.
set BUILDDIR=_build
REM Options shared by every builder: doctree cache dir, user options, source dir.
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
REM The gettext builder uses its own option set without the doctree cache dir.
set I18NSPHINXOPTS=%SPHINXOPTS% .
REM When PAPER is set, pass it through as the latex_paper_size config value.
if NOT "%PAPER%" == "" (
	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)

REM No target given: show the help text.
if "%1" == "" goto help

REM help: list every target this script understands, then stop.
REM The latexpdf and latexpdfja targets are handled further down in this
REM script, so they are listed here as well.
if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and a HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  latexpdf   to make LaTeX files and run them through pdflatex
	echo.  latexpdfja to make LaTeX files and run them through platex/dvipdfmx
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  texinfo    to make Texinfo files
	echo.  gettext    to make PO message catalogs
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  xml        to make Docutils-native XML files
	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	goto end
)

REM clean: remove every subdirectory of BUILDDIR, then any remaining files in it.
if "%1" == "clean" (
	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
	del /q /s %BUILDDIR%\*
	goto end
)


REM Probe for sphinx-build before running any builder. The command is run with
REM its error output discarded; errorlevel 9009 or higher is treated as
REM "command not found" and the script exits with code 1 after printing help.
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Each target below follows the same pattern: run the matching Sphinx
REM builder into its own folder under BUILDDIR, exit with code 1 if the
REM build fails, otherwise print a short summary and jump to the end label.

REM html: standalone HTML pages.
if "%1" == "html" (
	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
	goto end
)

REM dirhtml: HTML pages built with the dirhtml builder.
if "%1" == "dirhtml" (
	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
	goto end
)

REM singlehtml: HTML built with the singlehtml builder.
if "%1" == "singlehtml" (
	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
	goto end
)

REM pickle: pickle files for further processing.
if "%1" == "pickle" (
	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the pickle files.
	goto end
)

REM json: JSON files for further processing.
if "%1" == "json" (
	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can process the JSON files.
	goto end
)

REM htmlhelp: HTML files plus an .hhp project file for HTML Help Workshop.
if "%1" == "htmlhelp" (
	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
	goto end
)

REM qthelp: build the Qt help project, then print the follow-up commands.
REM qcollectiongenerator turns the .qhcp project into a .qhc collection file,
REM which is what "assistant -collectionFile" expects, so the hint below
REM names newspaper.qhc.
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\newspaper.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\newspaper.qhc
	goto end
)

REM devhelp: run the devhelp builder; exit with code 1 on failure.
if "%1" == "devhelp" (
	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished.
	goto end
)

REM epub: run the epub builder; exit with code 1 on failure.
if "%1" == "epub" (
	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The epub file is in %BUILDDIR%/epub.
	goto end
)

REM latex: generate LaTeX sources only; no PDF is produced by this target.
if "%1" == "latex" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
	goto end
)

REM latexpdf: generate LaTeX sources, then run "make all-pdf" inside the
REM latex output folder. pushd and popd are used so the caller's working
REM directory is restored afterwards; BUILDDIR is a relative path, so a
REM second relative cd issued from inside the latex folder cannot get back.
REM A failed Sphinx run aborts with exit code 1, like the other targets.
if "%1" == "latexpdf" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	pushd %BUILDDIR%\latex
	make all-pdf
	popd
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)

REM latexpdfja: same as latexpdf, but runs the "all-pdf-ja" make target.
if "%1" == "latexpdfja" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	if errorlevel 1 exit /b 1
	pushd %BUILDDIR%\latex
	make all-pdf-ja
	popd
	echo.
	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
	goto end
)

REM text: run the text builder; exit with code 1 on failure.
if "%1" == "text" (
	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The text files are in %BUILDDIR%/text.
	goto end
)

REM man: run the man builder; exit with code 1 on failure.
if "%1" == "man" (
	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The manual pages are in %BUILDDIR%/man.
	goto end
)

REM texinfo: run the texinfo builder; exit with code 1 on failure.
if "%1" == "texinfo" (
	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
	goto end
)

REM gettext: uses I18NSPHINXOPTS instead of ALLSPHINXOPTS and writes to the
REM locale folder under BUILDDIR.
if "%1" == "gettext" (
	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
	goto end
)

REM changes: run the changes builder; exit with code 1 on failure.
if "%1" == "changes" (
	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
	if errorlevel 1 exit /b 1
	echo.
	echo.The overview file is in %BUILDDIR%/changes.
	goto end
)

REM linkcheck: run the linkcheck builder; results go to output.txt.
if "%1" == "linkcheck" (
	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
	if errorlevel 1 exit /b 1
	echo.
	echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
	goto end
)

REM doctest: run the doctest builder; results go to output.txt.
if "%1" == "doctest" (
	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
	if errorlevel 1 exit /b 1
	echo.
	echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
	goto end
)

REM xml: run the xml builder; exit with code 1 on failure.
if "%1" == "xml" (
	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The XML files are in %BUILDDIR%/xml.
	goto end
)

REM pseudoxml: run the pseudoxml builder; exit with code 1 on failure.
if "%1" == "pseudoxml" (
	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
	goto end
)

:end


================================================
FILE: docs/user_guide/advanced.rst
================================================
.. _advanced:

Advanced
========

This section of the docs shows how to do some useful but advanced things
with newspaper.

Multi-threading article downloads
---------------------------------

**Downloading articles one at a time is slow.** But spamming a single news source
like cnn.com with tons of threads or with ASYNC-IO will get you rate limited,
and it is also inconsiderate to the site.

We solve this problem by allocating 1-2 threads per news source to both greatly
speed up the download time while being respectful.

.. code-block:: pycon

    >>> import newspaper
    >>> from newspaper import news_pool

    >>> slate_paper = newspaper.build('http://slate.com')
    >>> tc_paper = newspaper.build('http://techcrunch.com')
    >>> espn_paper = newspaper.build('http://espn.com')

    >>> papers = [slate_paper, tc_paper, espn_paper]
    >>> news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total
    >>> news_pool.join()

    At this point, you can safely assume that download() has been
    called on every single article for all 3 sources.

    >>> print(slate_paper.articles[10].html)
    u'<html> ...'

Keeping Html of main body article
---------------------------------

Keeping the html of just an article's body text is helpful because it allows you
to retain some of the semantic information in the html. It also helps if you
end up displaying the extracted article somehow.

Here is how to do so:

.. code-block:: pycon

    >>> from newspaper import Article

    >>> a = Article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html'
        , keep_article_html=True)

    >>> a.download()
    >>> a.parse()

    >>> a.article_html
    u'<div> \n<p><strong>(CNN)</strong> -- Charles Smith insisted Sunda...'

The lxml (dom object) and top_node (chunk of dom that contains our 'Article') are also
cached in case users would like to use them.

Access **after parsing()** with:

.. code-block:: pycon

    >>> a.download()
    >>> a.parse()
    >>> a.clean_dom
    <lxml object ...  >

    >>> a.clean_top_node
    <lxml object ...  >


Adding new languages
--------------------

First, please reference this file and read from the highlighted line all the way
down to the end of the file.

`https://github.com/codelucas/newspaper/blob/master/newspaper/text.py#L57 <https://github.com/codelucas/newspaper/blob/master/newspaper/text.py#L57>`_

One aspect of our text extraction algorithm revolves around counting the number of
**stopwords** present in a text. Stopwords are: *some of the most common, short
function words, such as the, is, at, which, and on* in a language.

Reference this line to see it in action:
`https://github.com/codelucas/newspaper/blob/master/newspaper/extractors.py#L668 <https://github.com/codelucas/newspaper/blob/master/newspaper/extractors.py#L668>`_

**So for Latin languages**, it is pretty basic. We first provide a list of
stopwords in ``stopwords-<language-code>.txt`` form. We then take some input text and
tokenize it into words by splitting the white space. After that we perform some
bookkeeping and then proceed to count the number of stopwords present.

**For non-Latin languages**, as you may have noticed in the code above, we need to
tokenize the words in a different way, *splitting by whitespace simply won't work for
languages like Chinese or Arabic*. For the Chinese language we are using a whole new
open source library called *jieba* to split the text into words. For Arabic we are
using a special nltk tokenizer to do the same job.

**So, to add full text extraction to a new (non-Latin) language, we need:**

1. Push up a stopwords file in the format of ``stopwords-<2-char-language-code>.txt``
   in ``newspaper/resources/text/``.

2. Provide a way of splitting/tokenizing text in that foreign language into words.
   `Here are some examples for Chinese, Arabic, English <https://github.com/codelucas/newspaper/blob/master/newspaper/text.py#L105>`_

**For Latin languages:**

1. Push up a stopwords file in the format of ``stopwords-<2-char-language-code>.txt``
   in ``newspaper/resources/text/`` and we are done!

**Finally, add the new language to the list of available languages in the following files:**

* README.rst
* docs/index.rst
* docs/user_guide/quickstart.rst
* newspaper/utils.py


Explicitly building a news source
---------------------------------

Instead of using the ``newspaper.build(..)`` api, we can take one step lower
into newspaper's ``Source`` api.

.. code-block:: pycon

    >>> from newspaper import Source
    >>> cnn_paper = Source('http://cnn.com')

    >>> print(cnn_paper.size()) # no articles, we have not built the source
    0

    >>> cnn_paper.build()
    >>> print(cnn_paper.size())
    3100

Note the ``build()`` method above. You may go lower level and de-abstract it
for absolute control over how your sources are constructed.

.. code-block:: pycon

    >>> cnn_paper = Source('http://cnn.com')
    >>> cnn_paper.download()
    >>> cnn_paper.parse()
    >>> cnn_paper.set_categories()
    >>> cnn_paper.download_categories()
    >>> cnn_paper.parse_categories()
    >>> cnn_paper.set_feeds()
    >>> cnn_paper.download_feeds()
    >>> cnn_paper.generate_articles()

    >>> print(cnn_paper.size())
    3100

And voila, we have mimicked the ``build()`` method. In the above sequence,
every method is dependent on the method above it. Stop whenever you wish.

Parameters and Configurations
-----------------------------

Newspaper provides two APIs for users to configure their ``Article`` and
``Source`` objects. One is via named parameter passing (**recommended**) and
the other is via ``Config`` objects.

Here are some named parameter passing examples:

.. code-block:: pycon

    >>> import newspaper
    >>> from newspaper import Article, Source

    >>> cnn = newspaper.build('http://cnn.com', language='en', memoize_articles=False)

    >>> article = Article(url='http://cnn.com/french/...', language='fr', fetch_images=False)

    >>> cnn = Source(url='http://latino.cnn.com/...', language='es', request_timeout=10,
                                                                number_threads=20)


Here are some examples of how Config objects are passed.

.. code-block:: pycon

    >>> import newspaper
    >>> from newspaper import Config, Article, Source

    >>> config = Config()
    >>> config.memoize_articles = False

    >>> cbs_paper = newspaper.build('http://cbs.com', config=config)

    >>> article_1 = Article(url='http://espn/2013/09/...', config=config)

    >>> cbs_paper = Source('http://cbs.com', config=config)


Here is a full list of the configuration options:

``keep_article_html``, default False, "set to True if you want to preserve html of body text"

``http_success_only``, default True, "set to False to capture non 2XX responses as well"

``MIN_WORD_COUNT``, default 300, "num of word tokens in article text"

``MIN_SENT_COUNT``, default 7, "num of sentence tokens"

``MAX_TITLE``, default 200, "num of chars in article title"

``MAX_TEXT``, default 100000, "num of chars in article text"

``MAX_KEYWORDS``, default 35, "num of keywords in article"

``MAX_AUTHORS``, default 10, "num of author names in article"

``MAX_SUMMARY``, default 5000, "num of chars of the summary"

``MAX_SUMMARY_SENT``, default 5, "num of sentences in summary"

``MAX_FILE_MEMO``, default 20000, "max number of urls we cache for each news source"

``memoize_articles``, default True, "cache and save articles run after run"

``fetch_images``, default True, "set this to false if you don't care about getting images"

``follow_meta_refresh``, default False, "follows a redirect url in a meta refresh html tag"

``image_dimension_ration``, default 16/9.0, "max ratio for height/width, we ignore if greater"

``language``, default 'en', "run ``newspaper.languages()`` to see available options."

``browser_user_agent``, default 'newspaper/%s' % __version__

``request_timeout``, default 7

``number_threads``, default 10, "number of threads when mthreading"

``verbose``, default False, "turn this on when debugging"

You may notice other config options in the ``newspaper/configuration.py`` file,
however, they are private, **please do not toggle them**.

Caching
-------

TODO

Specifications
--------------

Here, we will define exactly *how* newspaper handles a lot of the data extraction.

TODO


================================================
FILE: docs/user_guide/api.rst
================================================
.. _api:

Newspaper API
=============


================================================
FILE: docs/user_guide/contributors.rst
================================================
.. _contributors:

Contributors
============

Maintained and authored by:
---------------------------
Lucas Ou-Yang -- http://codelucas.com, lucasyangpersonal@gmail.com

Thanks to the following contributors:
-------------------------------------
https://github.com/codelucas/newspaper/graphs/contributors

Newspaper relied on some code of a few other open source projects:
------------------------------------------------------------------
Thanks to all who have contributed to python-goose.
You can find the contributors list here:
https://github.com/grangier/python-goose/graphs/contributors

Thanks to all who have contributed to PyTeaser.
You can find the contributors list here:
https://github.com/xiaoxu193/PyTeaser/graphs/contributors

Thanks to all who have contributed to gravity-goose.
You can find the contributors list here:
https://github.com/GravityLabs/goose/graphs/contributors

Thanks to all who have contributed to jieba.
You can find the contributors list here:
https://github.com/fxsjy/jieba/graphs/contributors

Thanks to all who have contributed to nltk.
You can find the contributors list here:
https://github.com/nltk/nltk/graphs/contributors

Thanks to all who have contributed to lxml.
You can find the contributors list here:
http://lxml.de/credits.html

Thanks to all who have contributed to requests.
You can find the contributors list here:
https://github.com/kennethreitz/requests/graphs/contributors



================================================
FILE: docs/user_guide/quickstart.rst
================================================
.. _quickstart:

Quickstart
==========

Eager to get started? This page gives a good introduction to getting started
with newspaper. This assumes you already have newspaper installed. If you do not,
head over to the :ref:`Installation <install>` section.

Building a news source
----------------------

Source objects are an abstraction of online news media websites like CNN or ESPN.
You can initialize them in two *different* ways.

Building a ``Source`` will extract its categories, feeds, articles, brand, and description for you.

You may also provide configuration parameters like ``language``, ``browser_user_agent``, etc. seamlessly. Navigate to the :ref:`advanced <advanced>` section for details.

.. code-block:: pycon

    >>> import newspaper
    >>> cnn_paper = newspaper.build('http://cnn.com')

    >>> lemonde_paper = newspaper.build('http://www.lemonde.fr/', language='fr')

However, if needed, you may also play with the lower level ``Source`` object as described
in the :ref:`advanced <advanced>` section.

Extracting articles
-------------------

Every news source has a set of *recent* articles.

The following examples assume that a news source has been
initialized and built.

.. code-block:: pycon

    >>> for article in cnn_paper.articles:
    ...     print(article.url)

    u'http://www.cnn.com/2013/11/27/justice/tucson-arizona-captive-girls/'
    u'http://www.cnn.com/2013/12/11/us/texas-teen-dwi-wreck/index.html'
    ...

    >>> print(cnn_paper.size()) # cnn has 3100 articles
    3100

Article caching
---------------

By default, newspaper caches all previously extracted articles and **eliminates any
article which it has already extracted**.

This feature exists to prevent duplicate articles and to increase extraction speed.

.. code-block:: pycon

    >>> cbs_paper = newspaper.build('http://cbs.com')
    >>> cbs_paper.size()
    1030

    >>> cbs_paper = newspaper.build('http://cbs.com')
    >>> cbs_paper.size()
    2

The return value of ``cbs_paper.size()`` changes from 1030 to 2 because when we first
crawled cbs we found 1030 articles. However, on our second crawl, we eliminate all
articles which have already been crawled.

This means **2** new articles have been published since our first extraction.

You may opt out of this feature with the ``memoize_articles`` parameter.

You may also pass in the lower level ``Config`` objects as covered in the :ref:`advanced <advanced>` section.

.. code-block:: pycon

    >>> import newspaper

    >>> cbs_paper = newspaper.build('http://cbs.com', memoize_articles=False)
    >>> cbs_paper.size()
    1030

    >>> cbs_paper = newspaper.build('http://cbs.com', memoize_articles=False)
    >>> cbs_paper.size()
    1030


Extracting Source categories
----------------------------

.. code-block:: pycon

    >>> for category in cnn_paper.category_urls():
    ...     print(category)

    u'http://lifestyle.cnn.com'
    u'http://cnn.com/world'
    u'http://tech.cnn.com'
    ...

Extracting Source feeds
-----------------------

.. code-block:: pycon

    >>> for feed_url in cnn_paper.feed_urls():
    ...     print(feed_url)

    u'http://rss.cnn.com/rss/cnn_crime.rss'
    u'http://rss.cnn.com/rss/cnn_tech.rss'
    ...

Extracting Source brand & description
-------------------------------------

.. code-block:: pycon

    >>> print(cnn_paper.brand)
    u'cnn'

    >>> print(cnn_paper.description)
    u'CNN.com delivers the latest breaking news and information on the latest...'

News Articles
-------------

Article objects are abstractions of news articles. For example, a news ``Source``
would be CNN while a news ``Article`` would be a specific CNN article.
You may reference an ``Article`` from an existing news ``Source`` or initialize
one by itself.

Referencing it from a ``Source``.

.. code-block:: pycon

    >>> first_article = cnn_paper.articles[0]

Initializing an ``Article`` by itself.

.. code-block:: pycon

    >>> from newspaper import Article
    >>> first_article = Article(url="http://www.lemonde.fr/...", language='fr')


Note the similar ``language=`` named parameter above. All the config parameters described for ``Source`` objects also apply to ``Article`` objects! **Source and Article objects have a very similar api**.

Initializing an ``Article`` that ignores particular content types.

There is an option to skip loading articles with particular content types,
which is useful when you do not want delays caused by long PDF resources.
A default html value can be provided for each such content type; it is then used as the article's html and so identifies the actual content type of the article.

.. code-block:: pycon

    >>> from newspaper import Article
    >>> pdf_defaults = {"application/pdf": "%PDF-",
                      "application/x-pdf": "%PDF-",
                      "application/x-bzpdf": "%PDF-",
                      "application/x-gzpdf": "%PDF-"}
    >>> pdf_article = Article(url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf',
                                            ignored_content_types_defaults=pdf_defaults)
    >>> pdf_article.download()
    >>> print(pdf_article.html)
    %PDF-

There are endless possibilities on how we can manipulate and build articles.

Downloading an Article
----------------------

We begin by calling ``download()`` on an article. If you are interested in how to
quickly download articles concurrently with multi-threading check out the
:ref:`advanced <advanced>` section.

.. code-block:: pycon

    >>> first_article = cnn_paper.articles[0]

    >>> first_article.download()

    >>> print(first_article.html)
    u'<!DOCTYPE HTML><html itemscope itemtype="http://...'

    >>> print(cnn_paper.articles[7].html)
    u''  # fail, not downloaded yet

Parsing an Article
------------------

You may also extract meaningful content from the html, like authors and body-text.
You **must** have called ``download()`` on an article before calling ``parse()``.

.. code-block:: pycon

    >>> first_article.parse()

    >>> print(first_article.text)
    u'Three sisters who were imprisoned for possibly...'

    >>> print(first_article.top_image)
    u'http://some.cdn.com/3424hfd4565sdfgdg436/'

    >>> print(first_article.authors)
    [u'Eliott C. McLaughlin', u'Some CoAuthor']

    >>> print(first_article.title)
    u'Police: 3 sisters imprisoned in Tucson home'

    >>> print(first_article.images)
    ['url_to_img_1', 'url_to_img_2', 'url_to_img_3', ...]

    >>> print(first_article.movies)
    ['url_to_youtube_link_1', ...] # youtube, vimeo, etc


Performing NLP on an Article
----------------------------

Finally, you may extract out natural language properties from the text.
You **must** have called both ``download()`` and ``parse()`` on the article
before calling ``nlp()``.

**As of the current build, nlp() features only work on western languages.**

.. code-block:: pycon

    >>> first_article.nlp()

    >>> print(first_article.summary)
    u'...imprisoned for possibly a constant barrage...'

    >>> print(first_article.keywords)
    [u'music', u'Tucson', ... ]

    >>> print(cnn_paper.articles[100].nlp()) # fail, not been downloaded yet
    Traceback (...
    ArticleException: You must parse an article before you try to..


``nlp()`` is expensive, as is ``parse()``, so make sure you actually need them before calling them on
all of your articles! In some cases, if you just need urls, even ``download()`` is not necessary.

Easter Eggs
-----------

Here are random but hopefully useful features! ``hot()`` returns a list of the top
trending terms on Google using a public api. ``popular_urls()`` returns a list
of popular news source urls, in case you need help choosing a news source!

.. code-block:: pycon

    >>> import newspaper

    >>> newspaper.hot()
    ['Ned Vizzini', 'Brian Boitano', 'Crossword Inventor', 'Alex & Sierra', ... ]

    >>> newspaper.popular_urls()
    ['http://slate.com', 'http://cnn.com', 'http://huffingtonpost.com', ... ]

    >>> newspaper.languages()

    Your available languages are:
    input code      full name

      ar              Arabic
      de              German
      en              English
      es              Spanish
      fr              French
      he              Hebrew
      it              Italian
      ko              Korean
      no              Norwegian
      fa              Persian
      pl              Polish
      pt              Portuguese
      sv              Swedish
      zh              Chinese
      uk              Ukrainian
      sw              Swahili
      bg              Bulgarian
      hr              Croatian
      ro              Romanian
      sl              Slovenian
      sr              Serbian
      et              Estonian
      ja              Japanese
      be              Belarusian
      lt              Lithuanian


================================================
FILE: download_corpora.py
================================================
# -*- coding: utf-8 -*-
"""
Downloads the necessary NLTK models and corpora required to support
all of newspaper's features. Modify for your own needs.
"""
import nltk

# NLTK resource identifiers that main() passes, one by one, to nltk.download().
# NOTE(review): the per-entry comments name consumer classes (FastNPExtractor,
# WordTokenizer, NLTKTagger, NaiveBayesAnalyzer) that are not defined in this
# script -- confirm they are still the relevant consumers. 'stopwords' has no
# stated consumer.
REQUIRED_CORPORA = [
    'brown',  # Required for FastNPExtractor
    'punkt',  # Required for WordTokenizer
    'maxent_treebank_pos_tagger',  # Required for NLTKTagger
    'movie_reviews',  # Required for NaiveBayesAnalyzer
    'wordnet',  # Required for lemmatization and Wordnet
    'stopwords'
]

def main():
    """Download every entry of REQUIRED_CORPORA through nltk.

    Each resource name is announced on stdout before it is fetched, and a
    final "Finished." line is printed once the loop completes.
    """
    for corpus_name in REQUIRED_CORPORA:
        announcement = 'Downloading "{0}"'.format(corpus_name)
        print(announcement)
        nltk.download(corpus_name)
    print("Finished.")


if __name__ == '__main__':
    main()


================================================
FILE: newspaper/__init__.py
================================================
# -*- coding: utf-8 -*-
"""
Wherever smart people work, doors are unlocked. -- Steve Wozniak
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

from .api import (build, build_article, fulltext, hot, languages,
                  popular_urls, Configuration as Config)
from .article import Article, ArticleException
from .mthreading import NewsPool
from .source import Source
from .version import __version__

# Package-level NewsPool instance, created at import time and reachable as
# `newspaper.news_pool`.
news_pool = NewsPool()

# Attach a no-op handler to the package logger so that applications which
# have not configured logging do not see "No handler found" warnings.
import logging
# This package already relies on Python 3-only syntax and modules (function
# annotations, urllib.parse), where logging.NullHandler always exists, so the
# former hand-written fallback class could never be reached. The name
# `NullHandler` stays bound at module level for any code importing it from
# this package.
from logging import NullHandler

logging.getLogger(__name__).addHandler(NullHandler())


================================================
FILE: newspaper/api.py
================================================
# -*- coding: utf-8 -*-
"""
Ignore the unused imports, this file's purpose is to make visible
anything which a user might need to import from newspaper.
View newspaper/__init__.py for its usage.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import feedparser

from .article import Article
from .configuration import Configuration
from .settings import POPULAR_URLS, TRENDING_URL
from .source import Source
from .utils import extend_config, print_available_languages


def build(url='', dry=False, config=None, **kwargs) -> Source:
    """Create a ``Source`` for ``url`` and, unless ``dry`` is set, build it.

    The configuration used is ``config`` (a fresh ``Configuration`` when it
    is falsy) extended with any extra keyword arguments. A falsy ``url`` is
    replaced by the empty string. ``Source.build()`` is only invoked when
    ``dry`` is falsy; the source object is returned either way.
    """
    base_config = config if config else Configuration()
    merged_config = extend_config(base_config, kwargs)
    source = Source(url if url else '', config=merged_config)
    if dry:
        return source
    source.build()
    return source


def build_article(url='', config=None, **kwargs) -> Article:
    """Create an ``Article`` for ``url`` without calling any of its methods.

    The configuration used is ``config`` (a fresh ``Configuration`` when it
    is falsy) extended with any extra keyword arguments. A falsy ``url`` is
    replaced by the empty string.
    """
    base_config = config if config else Configuration()
    merged_config = extend_config(base_config, kwargs)
    return Article(url if url else '', config=merged_config)


def languages():
    """Print the supported languages via ``print_available_languages()``.

    Nothing is returned by this function; the listing is produced by the
    helper it delegates to.
    """
    print_available_languages()


def popular_urls():
    """Return the pre-extracted popular source urls as a list of strings.

    Every line of the file at ``POPULAR_URLS`` is stripped of surrounding
    whitespace and prefixed with ``http://``.
    """
    with open(POPULAR_URLS) as source_file:
        return ['http://' + line.strip() for line in source_file]


def hot():
    """Return the titles of the entries in the feed at ``TRENDING_URL``.

    Any exception raised while fetching or reading the feed is reported on
    stdout and ``None`` is returned instead of a list.
    """
    try:
        entries = feedparser.parse(TRENDING_URL)['entries']
        return [entry['title'] for entry in entries]
    except Exception as err:
        print('ERR hot terms failed!', str(err))
        return None


def fulltext(html, language='en'):
    """Take an article HTML string and return its extracted body text.

    ``language`` is stored on a fresh ``Configuration`` that is shared by the
    extractor, the cleaner and the output formatter. When the extractor finds
    no best node, an empty string is returned.

    NOTE(review): the previous docstring stated that the input is decoded via
    UnicodeDammit if needed; that would happen inside the parser rather than
    in this function -- confirm.
    """
    from .cleaners import DocumentCleaner
    from .configuration import Configuration
    from .extractors import ContentExtractor
    from .outputformatters import OutputFormatter

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)

    top_node = extractor.calculate_best_node(doc)
    if top_node is None:
        # Article.parse() guards this same call against None; mirror that
        # here rather than handing None to post_cleanup().
        return ''
    top_node = extractor.post_cleanup(top_node)
    # Only the plain text is wanted; the formatted article HTML is discarded.
    text, _ = output_formatter.get_formatted(top_node)
    return text


================================================
FILE: newspaper/article.py
================================================
# -*- coding: utf-8 -*-
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import logging
import copy
import os
import glob
from urllib.parse import urlparse

import requests

from . import images
from . import network
from . import nlp
from . import settings
from . import urls

from .cleaners import DocumentCleaner
from .configuration import Configuration
from .extractors import ContentExtractor
from .outputformatters import OutputFormatter
from .utils import (URLHelper, RawHelper, extend_config,
                    get_available_languages, extract_meta_refresh)
from .videos.extractors import VideoExtractor

log = logging.getLogger(__name__)


class ArticleDownloadState(object):
    """Integer codes for the download status tracked on an ``Article``."""
    # 0: no download attempted, 1: download attempt failed, 2: html stored
    NOT_STARTED, FAILED_RESPONSE, SUCCESS = range(3)


class ArticleException(Exception):
    """Error raised by ``Article`` for bad constructor input and for
    operations attempted before the required download/parse step.
    """


class Article(object):
    """Article objects abstract an online news article page
    """
    def __init__(self, url, title='', source_url='', config=None, **kwargs):
        """Create an article for ``url`` with every extracted field blank.

        Extra keyword arguments are treated as config values and merged into
        the config object through ``extend_config``. When ``source_url`` is
        the empty string it is derived from the scheme of ``url`` (``http``
        when none is found) and its domain.

        Raises ``ArticleException`` when a ``Configuration`` was passed in the
        ``title`` or ``source_url`` position, or when the source url ends up
        being ``None`` or empty.
        """
        if any(isinstance(arg, Configuration) for arg in (title, source_url)):
            raise ArticleException(
                'Configuration object being passed incorrectly as title or '
                'source_url! Please verify `Article`s __init__() fn.')

        self.config = config or Configuration()
        self.config = extend_config(self.config, kwargs)

        self.extractor = ContentExtractor(self.config)

        if source_url == '':
            found_scheme = urls.get_scheme(url)
            prefix = 'http' if found_scheme is None else found_scheme
            source_url = prefix + '://' + urls.get_domain(url)

        if source_url is None or source_url == '':
            raise ArticleException('input url bad format')

        # Main page of the news source this article belongs to
        self.source_url = source_url

        self.url = urls.prepare_url(url, self.source_url)

        self.title = title

        # "Best image" url for the article, exposed under two names
        self.top_img = ''
        self.top_image = ''

        # Image url taken from the page metadata
        self.meta_img = ''

        # Every image url of the article; both names share one list object
        shared_images = []
        self.imgs = shared_images
        self.images = shared_images

        # Video urls of the article: youtube, vimeo, etc
        self.movies = []

        # Article body text
        self.text = ''

        # Filled by nlp() from the body text
        self.keywords = []

        # Filled by parse() from <meta> tags
        self.meta_keywords = []

        # Also filled by parse() from <meta> tags
        self.tags = set()

        # Authors of the article, filled by parse()
        self.authors = []

        self.publish_date = ''

        # Summary produced from the body text
        self.summary = ''

        # Raw, unmodified HTML of the article
        self.html = ''

        # HTML of the article's main node only
        self.article_html = ''

        # Download / parse bookkeeping
        self.is_parsed = False
        self.download_state = ArticleDownloadState.NOT_STARTED
        self.download_exception_msg = None

        # Values of individual meta fields in the HTML source
        self.meta_description = ""
        self.meta_lang = ""
        self.meta_favicon = ""
        self.meta_site_name = ""

        # All structured meta tag data, e.g. OpenGraph
        self.meta_data = {}

        # Canonical link of the article when the meta data provides one
        self.canonical_link = ""

        # DOM element chosen as the candidate for the main article body
        self.top_node = None

        # Deep copy of `top_node` taken before heavy parsing, so users can
        # query the "most important part of the page"
        self.clean_top_node = None

        # lxml DOM object built from the HTML
        self.doc = None

        # Deep copy of `doc` taken before heavy cleaning, kept as an API for
        # users who need to query the DOM
        self.clean_doc = None

        # Free-form dict for user-defined data
        self.additional_data = {}

    def build(self):
        """Build a lone article from a URL independent of the source (newspaper).
        Don't normally call this method b/c it's good to multithread articles
        on a source (newspaper) level.
        """
        self.download()
        self.parse()
        self.nlp()

    def _parse_scheme_file(self, path):
        try:
            with open(path, "r") as fin:
                return fin.read()
        except OSError as e:
            self.download_state = ArticleDownloadState.FAILED_RESPONSE
            self.download_exception_msg = e.strerror
            return None

    def _parse_scheme_http(self):
        """Fetch ``self.url`` through ``network.get_html_2XX_only``.

        When a ``requests`` ``RequestException`` is raised, the download state
        is set to ``FAILED_RESPONSE``, the exception text is kept in
        ``download_exception_msg`` and ``None`` is returned.
        """
        try:
            fetched = network.get_html_2XX_only(self.url, self.config)
        except requests.exceptions.RequestException as err:
            self.download_state = ArticleDownloadState.FAILED_RESPONSE
            self.download_exception_msg = str(err)
            return None
        return fetched

    def download(self, input_html=None, title=None, recursion_counter=0):
        """Downloads the link's HTML content, don't use if you are batch async
        downloading articles

        recursion_counter (currently 1) stops refreshes that are potentially
        infinite
        """
        if input_html is None:
            parsed_url = urlparse(self.url)
            if parsed_url.scheme == "file":
                html = self._parse_scheme_file(parsed_url.path)
            else:
                html = self._parse_scheme_http()
            if html is None:
                log.debug('Download failed on URL %s because of %s' %
                          (self.url, self.download_exception_msg))
                return
        else:
            html = input_html

        if self.config.follow_meta_refresh:
            meta_refresh_url = extract_meta_refresh(html)
            if meta_refresh_url and recursion_counter < 1:
                return self.download(
                    input_html=network.get_html(meta_refresh_url),
                    recursion_counter=recursion_counter + 1)

        self.set_html(html)
        self.set_title(title)

    def parse(self):
        """Parse the downloaded HTML and fill in the article's fields.

        Raises ``ArticleException`` (through
        ``throw_if_not_downloaded_verbose``) when the article has not been
        downloaded successfully. Returns early, leaving ``is_parsed`` unset,
        when the parser yields no document.

        Metadata (title, authors, meta tags, publish date, ...) is read from
        ``clean_doc``, a deep copy taken before cleaning; the body text and
        videos come from the cleaned ``doc``.
        """
        self.throw_if_not_downloaded_verbose()

        self.doc = self.config.get_parser().fromstring(self.html)
        self.clean_doc = copy.deepcopy(self.doc)

        if self.doc is None:
            # `parse` call failed, return nothing
            return

        # TODO: Fix this, sync in our fix_url() method
        parse_candidate = self.get_parse_candidate()
        # link_hash is first assigned here; get_resource_path() reads it
        self.link_hash = parse_candidate.link_hash  # MD5

        document_cleaner = DocumentCleaner(self.config)
        output_formatter = OutputFormatter(self.config)

        title = self.extractor.get_title(self.clean_doc)
        self.set_title(title)

        authors = self.extractor.get_authors(self.clean_doc)
        self.set_authors(authors)

        meta_lang = self.extractor.get_meta_lang(self.clean_doc)
        self.set_meta_language(meta_lang)

        # Uses self.meta_lang as stored by set_meta_language(), which may
        # still be the initial value if the extracted one was rejected
        if self.config.use_meta_language:
            self.extractor.update_language(self.meta_lang)
            output_formatter.update_language(self.meta_lang)

        meta_favicon = self.extractor.get_favicon(self.clean_doc)
        self.set_meta_favicon(meta_favicon)

        meta_site_name = self.extractor.get_meta_site_name(self.clean_doc)
        self.set_meta_site_name(meta_site_name)

        meta_description = \
            self.extractor.get_meta_description(self.clean_doc)
        self.set_meta_description(meta_description)

        canonical_link = self.extractor.get_canonical_link(
            self.url, self.clean_doc)
        self.set_canonical_link(canonical_link)

        tags = self.extractor.extract_tags(self.clean_doc)
        self.set_tags(tags)

        meta_keywords = self.extractor.get_meta_keywords(
            self.clean_doc)
        self.set_meta_keywords(meta_keywords)

        meta_data = self.extractor.get_meta_data(self.clean_doc)
        self.set_meta_data(meta_data)

        self.publish_date = self.extractor.get_publishing_date(
            self.url,
            self.clean_doc)

        # Before any computations on the body, clean DOM object
        self.doc = document_cleaner.clean(self.doc)

        self.top_node = self.extractor.calculate_best_node(self.doc)
        if self.top_node is not None:
            # Videos are collected from the top node before post_cleanup()
            video_extractor = VideoExtractor(self.config, self.top_node)
            self.set_movies(video_extractor.get_videos())

            self.top_node = self.extractor.post_cleanup(self.top_node)
            # Snapshot taken before the output formatter works on top_node
            self.clean_top_node = copy.deepcopy(self.top_node)

            text, article_html = output_formatter.get_formatted(
                self.top_node)
            self.set_article_html(article_html)
            self.set_text(text)

        self.fetch_images()

        self.is_parsed = True
        self.release_resources()

    def fetch_images(self):
        if self.clean_doc is not None:
            meta_img_url = self.extractor.get_meta_img_url(
                self.url, self.clean_doc)
            self.set_meta_img(meta_img_url)

            imgs = self.extractor.get_img_urls(self.url, self.clean_doc)
            if self.meta_img:
                imgs.add(self.meta_img)
            self.set_imgs(imgs)

        if self.clean_top_node is not None and not self.has_top_image():
            first_img = self.extractor.get_first_img_url(
                self.url, self.clean_top_node)
            if self.config.fetch_images:
                self.set_top_img(first_img)
            else:
                self.set_top_img_no_check(first_img)

        if not self.has_top_image() and self.config.fetch_images:
            self.set_reddit_top_img()

    def has_top_image(self):
        return self.top_img is not None and self.top_img != ''

    def is_valid_url(self):
        """Check, via ``urls.valid_url``, whether this article's url looks
        like a real news article url.
        """
        verdict = urls.valid_url(self.url)
        return verdict

    def is_valid_body(self):
        """Decide whether the parsed article should be kept as a real article.

        Checks, in order:
          1. meta type is 'article' and the word count exceeds
             ``config.MIN_WORD_COUNT`` -> True
          2. not media news and no text -> False
          3. title missing or shorter than two words -> False
          4. word count below ``config.MIN_WORD_COUNT`` -> False
          5. sentence count below ``config.MIN_SENT_COUNT`` -> False
          6. no html -> False
          7. otherwise -> True

        Words are counted by splitting the text on single spaces and
        sentences by splitting on '.'.

        Raises ``ArticleException`` when the article has not been parsed.
        """
        if not self.is_parsed:
            # The message used to be written with a backslash continuation
            # inside the literal, which embedded a long run of spaces in it.
            raise ArticleException(
                'must parse article before checking if its body is valid!')
        meta_type = self.extractor.get_meta_type(self.clean_doc)
        words = self.text.split(' ')
        sentences = self.text.split('.')

        if (meta_type == 'article' and
                len(words) > self.config.MIN_WORD_COUNT):
            log.debug('%s verified for article and wc', self.url)
            return True

        if not self.is_media_news() and not self.text:
            log.debug('%s caught for no media no text', self.url)
            return False

        if self.title is None or len(self.title.split(' ')) < 2:
            log.debug('%s caught for bad title', self.url)
            return False

        if len(words) < self.config.MIN_WORD_COUNT:
            log.debug('%s caught for word cnt', self.url)
            return False

        if len(sentences) < self.config.MIN_SENT_COUNT:
            log.debug('%s caught for sent cnt', self.url)
            return False

        if self.html is None or self.html == '':
            log.debug('%s caught for no html', self.url)
            return False

        log.debug('%s verified for default true', self.url)
        return True

    def is_media_news(self):
        """If the article is related heavily to media:
        gallery, video, big pictures, etc
        """
        safe_urls = ['/video', '/slide', '/gallery', '/powerpoint',
                     '/fashion', '/glamour', '/cloth']
        for s in safe_urls:
            if s in self.url:
                return True
        return False

    def nlp(self):
        """Compute keywords and a summary for a downloaded, parsed article.

        Keywords are the de-duplicated union of those found in the title and
        in the body text; the summary is the newline-joined output of
        ``nlp.summarize`` limited to ``config.MAX_SUMMARY_SENT`` sentences.

        Raises ``ArticleException`` when the article was not downloaded or
        not parsed first.
        """
        self.throw_if_not_downloaded_verbose()
        self.throw_if_not_parsed_verbose()

        nlp.load_stopwords(self.config.get_language())
        body_keywords = list(nlp.keywords(self.text).keys())
        headline_keywords = list(nlp.keywords(self.title).keys())
        self.set_keywords(list(set(headline_keywords + body_keywords)))

        sentences = nlp.summarize(
            title=self.title, text=self.text,
            max_sents=self.config.MAX_SUMMARY_SENT)
        self.set_summary('\n'.join(sentences))

    def get_parse_candidate(self):
        """Return the parse candidate wrapper for this article.

        ``RawHelper`` is used when HTML is already present (it receives both
        the url and the html); otherwise ``URLHelper`` builds the candidate
        from the url alone.
        """
        if not self.html:
            return URLHelper.get_parsing_candidate(self.url)
        return RawHelper.get_parsing_candidate(self.url, self.html)

    def build_resource_path(self):
        """Must be called after computing HTML/final URL
        """
        res_path = self.get_resource_path()
        if not os.path.exists(res_path):
            os.mkdir(res_path)

    def get_resource_path(self):
        """Return the path of this article's private resource directory.

        The shared ``article_resources`` directory under
        ``settings.TOP_DIRECTORY`` is created as a side effect when missing;
        the per-article path itself is only computed, not created.

        Reads ``self.link_hash``, which is assigned by ``parse()``; calling
        this earlier raises ``AttributeError``.
        """
        resource_directory = os.path.join(settings.TOP_DIRECTORY,
                                          'article_resources')
        try:
            # Attempt the creation directly rather than testing for
            # existence first, so concurrent callers cannot collide between
            # the check and the mkdir call.
            os.mkdir(resource_directory)
        except FileExistsError:
            pass
        return os.path.join(resource_directory, '%s_' % self.link_hash)

    def release_resources(self):
        # TODO: implement in entirety
        path = self.get_resource_path()
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                pass
        # os.remove(path)

    def set_reddit_top_img(self):
        """Set the top image to the largest image found by ``images.Scraper``.

        Never raises: every failure is logged instead. ``TypeError`` is
        logged at debug level for the "no pictures" and "timed out" cases
        and at critical level otherwise; any other exception is logged at
        critical level.
        """
        try:
            scraper = images.Scraper(self)
            self.set_top_img(scraper.largest_image_url())
        except TypeError as e:
            # Match on str(e) rather than e.args[0]: indexing args raises
            # IndexError for an argument-less exception, and `in` raises
            # TypeError for a non-string first argument -- either would
            # escape from inside this handler.
            reason = str(e)
            if "Can't convert 'NoneType' object to str implicitly" in reason:
                log.debug('No pictures found. Top image not set, %s' % e)
            elif 'timed out' in reason:
                log.debug('Download of picture timed out. Top image not set, %s' % e)
            else:
                log.critical('TypeError other than None type error. '
                             'Cannot set top image using the Reddit '
                             'algorithm. Possible error with PIL., %s' % e)
        except Exception as e:
            log.critical('Other error with setting top image using the '
                         'Reddit algorithm. Possible error with PIL, %s' % e)

    def set_title(self, input_title):
        if input_title:
            self.title = input_title[:self.config.MAX_TITLE]

    def set_text(self, text):
        text = text[:self.config.MAX_TEXT]
        if text:
            self.text = text

    def set_html(self, html):
        """Encode HTML before setting it
        """
        if html:
            if isinstance(html, bytes):
                html = self.config.get_parser().get_unicode_html(html)
            self.html = html
            self.download_state = ArticleDownloadState.SUCCESS

    def set_article_html(self, article_html):
        """Sets the HTML of just the article's `top_node`
        """
        if article_html:
            self.article_html = article_html

    def set_meta_img(self, src_url):
        self.meta_img = src_url
        self.set_top_img_no_check(src_url)

    def set_top_img(self, src_url):
        if src_url is not None:
            s = images.Scraper(self)
            if s.satisfies_requirements(src_url):
                self.set_top_img_no_check(src_url)

    def set_top_img_no_check(self, src_url):
        """Provide 2 APIs for images. One at "top_img", "imgs"
        and one at "top_image", "images"
        """
        self.top_img = src_url
        self.top_image = src_url

    def set_imgs(self, imgs):
        """The motive for this method is the same as above, provide APIs
        for both `article.imgs` and `article.images`
        """
        self.images = imgs
        self.imgs = imgs

    def set_keywords(self, keywords):
        """Keys are stored in list format
        """
        if not isinstance(keywords, list):
            raise Exception("Keyword input must be list!")
        if keywords:
            self.keywords = keywords[:self.config.MAX_KEYWORDS]

    def set_authors(self, authors):
        """Authors are in ["firstName lastName", "firstName lastName"] format
        """
        if not isinstance(authors, list):
            raise Exception("authors input must be list!")
        if authors:
            self.authors = authors[:self.config.MAX_AUTHORS]

    def set_summary(self, summary):
        """Summary here refers to a paragraph of text from the
        title text and body text
        """
        self.summary = summary[:self.config.MAX_SUMMARY]

    def set_meta_language(self, meta_lang):
        """Save the page language in its two-character form.

        The value is accepted when either the full tag or its two-character
        prefix is among ``get_available_languages()``, so a region-qualified
        tag such as ``en-US`` is stored as ``en`` instead of being dropped.
        Empty, ``None`` or one-character input is ignored.
        """
        if not meta_lang or len(meta_lang) < 2:
            return
        short_code = meta_lang[:2]
        available = get_available_languages()
        # Previously only the full tag was checked even though the stored
        # value is the two-character prefix.
        if meta_lang in available or short_code in available:
            self.meta_lang = short_code

    def set_meta_keywords(self, meta_keywords):
        """Store the keys in list form
        """
        self.meta_keywords = [k.strip() for k in meta_keywords.split(',')]

    def set_meta_favicon(self, meta_favicon):
        self.meta_favicon = meta_favicon

    def set_meta_site_name(self, meta_site_name):
        self.meta_site_name = meta_site_name

    def set_meta_description(self, meta_description):
        self.meta_description = meta_description

    def set_meta_data(self, meta_data):
        self.meta_data = meta_data

    def set_canonical_link(self, canonical_link):
        self.canonical_link = canonical_link

    def set_tags(self, tags):
        self.tags = tags

    def set_movies(self, movie_objects):
        """Trim video objects into just urls
        """
        movie_urls = [o.src for o in movie_objects if o and o.src]
        self.movies = movie_urls

    def throw_if_not_downloaded_verbose(self):
        """Raise ``ArticleException`` with a readable message unless the
        download has succeeded.

        One message covers a download that was never started, another a
        download that failed (including the stored failure reason and url).
        """
        state = self.download_state
        if state == ArticleDownloadState.NOT_STARTED:
            raise ArticleException('You must `download()` an article first!')
        if state == ArticleDownloadState.FAILED_RESPONSE:
            raise ArticleException('Article `download()` failed with %s on URL %s' %
                  (self.download_exception_msg, self.url))

    def throw_if_not_parsed_verbose(self):
        """Parse `is_parsed` status -> log readable status
        -> maybe throw ArticleException
        """
        if not self.is_parsed:
            raise ArticleException('You must `parse()` an article first!')


================================================
FILE: newspaper/cleaners.py
================================================
# -*- coding: utf-8 -*-
"""
Holds the code for cleaning out unwanted tags from the lxml
dom xpath.
"""
import copy
from .utils import ReplaceSequence


class DocumentCleaner(object):
    """Removes boilerplate from a parsed document before extraction.

    Every DOM operation is delegated to the parser object obtained from
    `config.get_parser()`. `clean()` is the entry point and runs the
    individual passes in a fixed order; each pass returns the document
    it was given.
    """

    def __init__(self, config):
        """Set appropriate tag names and regexes of tags to remove
        from the HTML

        :param config: object exposing `get_parser()`
        """
        self.config = config
        self.parser = self.config.get_parser()
        # Alternation tested against id / class / name attribute values
        # by the three XPath expressions built below.
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|links|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies"
        )
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        # "nauthy" is a misspelling of "naughty"; the attribute names are
        # kept because they are reachable from outside the class.
        self.nauthy_ids_re = ("//*[re:test(@id, '%s', 'i')]" %
                              self.remove_nodes_re)
        self.nauthy_classes_re = ("//*[re:test(@class, '%s', 'i')]" %
                                  self.remove_nodes_re)
        self.nauthy_names_re = ("//*[re:test(@name, '%s', 'i')]" %
                                self.remove_nodes_re)
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        # NOTE(review): `[^entry-]` is a character class (one character
        # that is none of e, n, t, r, y, -), not a "does not start with
        # entry-" test. Left untouched because changing it alters which
        # nodes get removed - confirm the intent before editing.
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_broadcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")
        # A matched node containing any of these is never removed by
        # `clean_bad_tags`.
        self.contains_article = './/article|.//*[@id="article"]|.//*[@itemprop="articleBody"]'

    def clean(self, doc_to_clean):
        """Remove chunks of the DOM as specified

        Runs every cleaning pass in order and returns the document
        handed back by the last pass.
        """
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        for pattern in (self.caption_re, self.google_re, self.entries_re,
                        self.facebook_re, self.facebook_broadcasting_re,
                        self.twitter_re):
            doc_to_clean = self.remove_nodes_regex(doc_to_clean, pattern)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        for dom_type in ('div', 'span', 'section'):
            doc_to_clean = self.div_to_para(doc_to_clean, dom_type)
        return doc_to_clean

    def clean_body_classes(self, doc):
        """Removes the `class` attribute from the <body> tag because
        if there is a bad match, the entire DOM will be empty!
        """
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        """Strip `id`, `name` and `class` from every <article> element so
        the attribute based removal passes cannot match them.
        """
        for article in self.parser.getElementsByTag(doc, tag='article'):
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        """Unwrap (`drop_tag`) every <em> that contains no <img>.
        """
        for node in self.parser.getElementsByTag(doc, tag='em'):
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        """Unwrap <span> elements whose class list contains `dropcap`
        or `drop_cap`.
        """
        items = self.parser.css_select(doc, 'span[class~=dropcap], '
                                       'span[class~=drop_cap]')
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        """Remove all <script> and <style> elements, then all comments.
        """
        for tag in ('script', 'style'):
            for item in self.parser.getElementsByTag(doc, tag=tag):
                self.parser.remove(item)
        for item in self.parser.getComments(doc):
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        """Remove nodes whose id, class or name matches
        `remove_nodes_re`, unless the node contains article content
        (see `contains_article`).
        """
        # Order matters only in that each query runs after the removals
        # of the previous one: ids, then classes, then names.
        for expression in (self.nauthy_ids_re,
                           self.nauthy_classes_re,
                           self.nauthy_names_re):
            for node in self.parser.xpath_re(doc, expression):
                if not node.xpath(self.contains_article):
                    self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        """Remove every node whose `id` or `class` matches `pattern`.
        """
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            for node in self.parser.xpath_re(doc, reg):
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        """Unwrap every <span> nested inside a <p>.
        """
        for item in self.parser.css_select(doc, 'p span'):
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        """Turn buffered text into a paragraph node via the parser.
        `doc` is accepted for interface compatibility and is not used.
        """
        return self.parser.textToPara(replacement_text)

    def _absorb_adjacent_links(self, start, step, replacement_text,
                               nodes_to_remove):
        """Walk from `start` with `step` (previous/next sibling lookup)
        while the sibling is an unused <a>; buffer its outer HTML, mark
        it as used and schedule it for removal.
        """
        node = step(start)
        while node is not None \
                and self.parser.getTag(node) == "a" \
                and self.parser.getAttribute(
                    node, 'grv-usedalready') != 'yes':
            replacement_text.append(
                " " + self.parser.outerHtml(node) + " ")
            nodes_to_remove.append(node)
            self.parser.setAttribute(node, attr='grv-usedalready',
                                     value='yes')
            node = step(node)

    def replace_walk_left_right(self, kid, kid_text,
                                replacement_text, nodes_to_remove):
        """Buffer the cleaned text of `kid` together with the <a>
        siblings directly before and after it.

        Nothing is buffered when the cleaned text is at most one
        character long. `replacement_text` and `nodes_to_remove` are
        extended in place.
        """
        replace_text = self.tablines_replacements.replaceAll(kid_text)
        if len(replace_text) <= 1:
            return
        # Links before the text go first, then the text, then the links
        # that follow it.
        self._absorb_adjacent_links(kid, self.parser.previousSibling,
                                    replacement_text, nodes_to_remove)
        replacement_text.append(replace_text)
        self._absorb_adjacent_links(kid, self.parser.nextSibling,
                                    replacement_text, nodes_to_remove)

    def get_replacement_nodes(self, doc, div):
        """Build the list of child nodes that should replace the
        children of `div`: loose text (plus adjacent links) is wrapped
        into new paragraph nodes, other children are kept as they are.
        """
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        for kid in self.parser.childNodesWithText(div):
            # The node is a <p> and already has some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                nodes_to_return.append(self.get_flushed_buffer(
                    ''.join(replacement_text), doc))
                replacement_text = []
                nodes_to_return.append(kid)
            # The node is a text node
            elif self.parser.isTextNode(kid):
                kid_text = self.parser.getText(kid)
                self.replace_walk_left_right(kid, kid_text, replacement_text,
                                             nodes_to_remove)
            else:
                nodes_to_return.append(kid)

        # flush out anything still remaining
        if len(replacement_text) > 0:
            nodes_to_return.append(self.get_flushed_buffer(
                ''.join(replacement_text), doc))

        for n in nodes_to_remove:
            self.parser.remove(n)

        return nodes_to_return

    def replace_with_para(self, doc, div):
        """Rename `div` to a <p> element in place.
        """
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        """Normalise every `dom_type` element of the document.

        Elements without any block-ish descendant (see `tags`) are
        renamed to <p>. All others get their children rebuilt from
        `get_replacement_nodes` while keeping their attributes.
        """
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p',
                'pre', 'table', 'ul']
        for div in self.parser.getElementsByTag(doc, tag=dom_type):
            items = self.parser.getElementsByTags(div, tags)
            if len(items) == 0:
                self.replace_with_para(doc, div)
                continue
            replace_nodes = self.get_replacement_nodes(doc, div)
            replace_nodes = [n for n in replace_nodes if n is not None]
            # clear() also drops the attributes, so save and restore them
            attrib = copy.deepcopy(div.attrib)
            div.clear()
            for i, node in enumerate(replace_nodes):
                div.insert(i, node)
            for name, value in attrib.items():
                div.set(name, value)
        return doc


================================================
FILE: newspaper/configuration.py
================================================
# -*- coding: utf-8 -*-
"""
This class holds configuration objects, which can be thought of
as settings.py but dynamic and changing for whatever parent object
holds them. For example, pass in a config object to an Article
object, Source object, or even network methods, and it just works.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import logging

from .parsers import Parser
from .text import (StopWords, StopWordsArabic, StopWordsChinese,
                   StopWordsKorean, StopWordsHindi, StopWordsJapanese, StopWordsThai)
from .version import __version__

log = logging.getLogger(__name__)


class Configuration(object):
    """Mutable settings object shared by articles, sources and the
    network helpers.

    `language` is a property: assigning it validates the code, stops
    meta-language detection and swaps `stopwords_class` accordingly.
    """

    def __init__(self):
        """
        Modify any of these Article / Source properties
        TODO: Have a separate ArticleConfig and SourceConfig extend this!
        """
        self.MIN_WORD_COUNT = 300  # num of word tokens in text
        self.MIN_SENT_COUNT = 7  # num of sentence tokens
        self.MAX_TITLE = 200  # num of chars
        self.MAX_TEXT = 100000  # num of chars
        self.MAX_KEYWORDS = 35  # num of strings in list
        self.MAX_AUTHORS = 10  # num strings in list
        self.MAX_SUMMARY = 5000  # num of chars
        self.MAX_SUMMARY_SENT = 5  # num of sentences

        # max number of urls we cache for each news source
        self.MAX_FILE_MEMO = 20000

        # Cache and save articles run after run
        self.memoize_articles = True

        # Set this to false if you don't care about getting images
        self.fetch_images = True
        # ("ration" is a misspelling of "ratio"; the attribute name is
        # kept because callers may read it)
        self.image_dimension_ration = 16 / 9.0

        # Follow meta refresh redirect when downloading
        self.follow_meta_refresh = False

        # Don't toggle this variable, done internally
        # (`set_language` switches it off)
        self.use_meta_language = True

        # You may keep the html of just the main article body
        self.keep_article_html = False

        # Fail for error responses (e.g. 404 page)
        self.http_success_only = True

        # English is the fallback
        self._language = 'en'

        # Language specific stopword class, don't toggle; it is kept in
        # sync with the language by `set_language`
        self.stopwords_class = StopWords

        self.browser_user_agent = 'newspaper/%s' % __version__
        self.headers = {}
        self.request_timeout = 7
        self.proxies = {}
        self.number_threads = 10

        self.verbose = False  # for debugging

        self.thread_timeout_seconds = 1
        self.ignored_content_types_defaults = {}
        # Set this to False if you want to recompute the categories
        # *every* time you build a `Source` object
        # TODO: Actually make this work
        # self.use_cached_categories = True

    def get_language(self):
        """Return the currently configured language code.
        """
        return self._language

    def del_language(self):
        """Deleting the language is not supported; always raises.
        """
        raise Exception('wtf are you doing?')

    def set_language(self, language):
        """Language setting must be set in this method b/c non-occidental
        (western) languages require a separate stopwords class.

        :param language: 2 character language code, e.g. 'en' or 'de'
        :raises Exception: when `language` is empty or not 2 chars long
        """
        if not language or len(language) != 2:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which used to embed the source indentation in
            # the message.
            raise Exception(
                "Your input language must be a 2 char language code, "
                "for example: english-->en \n and german-->de")

        # If explicitly set language, don't use meta
        self.use_meta_language = False

        # Pick the stopword class that matches the language
        self._language = language
        self.stopwords_class = self.get_stopwords_class(language)

    language = property(get_language, set_language,
                        del_language, "language prop")

    @staticmethod
    def get_stopwords_class(language):
        """Return the stopwords class to use for `language`; languages
        without a dedicated class fall back to `StopWords`.
        """
        if language == 'ko':
            return StopWordsKorean
        elif language == 'hi':
            return StopWordsHindi
        elif language == 'zh':
            return StopWordsChinese
        # Persian and Arabic Share an alphabet
        # There is a persian parser https://github.com/sobhe/hazm, but nltk is likely sufficient
        elif language == 'ar' or language == 'fa':
            return StopWordsArabic
        elif language == 'ja':
            return StopWordsJapanese
        elif language == 'th':
            return StopWordsThai
        return StopWords

    @staticmethod
    def get_parser():
        """Return the parser class used for all DOM access.
        """
        return Parser


class ArticleConfiguration(Configuration):
    """Configuration subclass for articles; currently adds nothing to
    `Configuration`.
    """
    pass


class SourceConfiguration(Configuration):
    """Configuration subclass for sources; currently adds nothing to
    `Configuration`.
    """
    pass


================================================
FILE: newspaper/extractors.py
================================================
# -*- coding: utf-8 -*-
"""
Newspaper uses much of python-goose's extraction code. View their license:
https://github.com/codelucas/newspaper/blob/master/GOOSE-LICENSE.txt

Keep all html page extraction code within this file. Abstract any
lxml or soup parsing code in the parsers.py file!
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import copy
import logging
import re
import re
from collections import defaultdict

from dateutil.parser import parse as date_parser
from tldextract import tldextract
from urllib.parse import urljoin, urlparse, urlunparse

from . import urls
from .utils import StringReplacement, StringSplitter

log = logging.getLogger(__name__)

# Applied to the final title text in `get_title`; "&#65533;" is the
# numeric character reference for U+FFFD (the replacement character).
MOTLEY_REPLACEMENT = StringReplacement("&#65533;", "")
# NOTE(review): presumably rewrites "#!" urls into the
# "_escaped_fragment_" form; not used in this part of the file - confirm.
ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(
    "#!", "?_escaped_fragment_=")
# Applied by `split_title` to the title piece it selects.
TITLE_REPLACEMENTS = StringReplacement("&raquo;", "»")
# Splitters handed to `split_title` by `get_title`, one per delimiter.
# NOTE(review): the escaped pipe suggests StringSplitter takes a regex
# pattern - confirm against utils.StringSplitter.
PIPE_SPLITTER = StringSplitter("\\|")
DASH_SPLITTER = StringSplitter(" - ")
UNDERSCORE_SPLITTER = StringSplitter("_")
SLASH_SPLITTER = StringSplitter("/")
ARROWS_SPLITTER = StringSplitter(" » ")
# NOTE(review): the names below, up to A_HREF_TAG_SELECTOR, are not
# referenced in this part of the file.
COLON_SPLITTER = StringSplitter(":")
SPACE_SPLITTER = StringSplitter(' ')
NO_STRINGS = set()
A_REL_TAG_SELECTOR = "a[rel=tag]"
A_HREF_TAG_SELECTOR = ("a[href*='/tag/'], a[href*='/tags/'], "
                       "a[href*='/topic/'], a[href*='?keyword=']")
# Matched by `get_meta_lang` against the first two characters of the
# language value it found.
RE_LANG = r'^[A-Za-z]{2}$'

# NOTE(review): the three lists below are not referenced in this part of
# the file; judging by their names they classify url paths and domains -
# verify against their users before relying on that.
good_paths = ['story', 'article', 'feature', 'featured', 'slides',
              'slideshow', 'gallery', 'news', 'video', 'media',
              'v', 'radio', 'press']
bad_chunks = ['careers', 'contact', 'about', 'faq', 'terms', 'privacy',
              'advert', 'preferences', 'feedback', 'info', 'browse', 'howto',
              'account', 'subscribe', 'donate', 'shop', 'admin']
bad_domains = ['amazon', 'doubleclick', 'twitter']


class ContentExtractor(object):
    def __init__(self, config):
        """Keep the config and take the parser, language and stopwords
        class from it.
        """
        self.config = config
        self.parser = self.config.get_parser()
        # Copies of the config values; `update_language` may replace both
        # later without touching the config itself.
        self.language = config.language
        self.stopwords_class = config.stopwords_class

    def update_language(self, meta_lang):
        """Switch the extractor to `meta_lang` and to the stopwords class
        the config maps it to. Has to happen before extraction in some
        cases, because languages that are not latin based need their own
        stopwords class. A falsy `meta_lang` changes nothing.
        """
        if not meta_lang:
            return
        self.language = meta_lang
        self.stopwords_class = self.config.get_stopwords_class(meta_lang)

    def get_authors(self, doc):
        """Fetch the authors of the article, return as a list
        Only works for english articles
        """
        _digits = re.compile('\d')

        def contains_digits(d):
            return bool(_digits.search(d))

        def uniqify_list(lst):
            """Remove duplicates from provided list but maintain original order.
              Derived from http://www.peterbe.com/plog/uniqifiers-benchmark
            """
            seen = {}
            result = []
            for item in lst:
                if item.lower() in seen:
                    continue
                seen[item.lower()] = 1
                result.append(item.title())
            return result

        def parse_byline(search_str):
            """
            Takes a candidate line of html or text and
            extracts out the name(s) in list form:
            >>> parse_byline('<div>By: <strong>Lucas Ou-Yang</strong>,<strong>Alex Smith</strong></div>')
            ['Lucas Ou-Yang', 'Alex Smith']
            """
            # Remove HTML boilerplate
            search_str = re.sub('<[^<]+?>', '', search_str)

            # Remove original By statement
            search_str = re.sub('[bB][yY][\:\s]|[fF]rom[\:\s]', '', search_str)

            search_str = search_str.strip()

            # Chunk the line by non alphanumeric tokens (few name exceptions)
            # >>> re.split("[^\w\'\-\.]", "Tyler G. Jones, Lucas Ou, Dean O'Brian and Ronald")
            # ['Tyler', 'G.', 'Jones', '', 'Lucas', 'Ou', '', 'Dean', "O'Brian", 'and', 'Ronald']
            name_tokens = re.split("[^\w\'\-\.]", search_str)
            name_tokens = [s.strip() for s in name_tokens]

            _authors = []
            # List of first, last name tokens
            curname = []
            delimiters = ['and', ',', '']

            for token in name_tokens:
                if token in delimiters:
                    if len(curname) > 0:
                        _authors.append(' '.join(curname))
                        curname = []

                elif not contains_digits(token):
                    curname.append(token)

            # One last check at end
            valid_name = (len(curname) >= 2)
            if valid_name:
                _authors.append(' '.join(curname))

            return _authors

        # Try 1: Search popular author tags for authors

        ATTRS = ['name', 'rel', 'itemprop', 'class', 'id']
        VALS = ['author', 'byline', 'dc.creator', 'byl']
        matches = []
        authors = []

        for attr in ATTRS:
            for val in VALS:
                # found = doc.xpath('//*[@%s="%s"]' % (attr, val))
                found = self.parser.getElementsByTag(doc, attr=attr, value=val)
                matches.extend(found)

        for match in matches:
            content = ''
            if match.tag == 'meta':
                mm = match.xpath('@content')
                if len(mm) > 0:
                    content = mm[0]
            else:
                content = match.text_content() or ''
            if len(content) > 0:
                authors.extend(parse_byline(content))

        return uniqify_list(authors)

        # TODO Method 2: Search raw html for a by-line
        # match = re.search('By[\: ].*\\n|From[\: ].*\\n', html)
        # try:
        #    # Don't let zone be too long
        #    line = match.group(0)[:100]
        #    authors = parse_byline(line)
        # except:
        #    return [] # Failed to find anything
        # return authors

    def get_publishing_date(self, url, doc):
        """Find the publishing date, trying the more reliable source
        first and returning the first value that parses:

        1. Pubdate from URL
        2. Pubdate from metadata (first element matching each known tag)

        Returns None when neither yields a parseable date.
        """

        def to_datetime(raw):
            if not raw:
                return None
            try:
                return date_parser(raw)
            except (ValueError, OverflowError, AttributeError, TypeError):
                # near all parse failures are due to URL dates without a day
                # specifier, e.g. /2014/04/
                return None

        url_match = re.search(urls.STRICT_DATE_REGEX, url)
        if url_match:
            from_url = to_datetime(url_match.group(0))
            if from_url:
                return from_url

        # Where to look ('attribute' == 'value') and which attribute of
        # the matching element carries the date ('content').
        PUBLISH_DATE_TAGS = [
            {'attribute': 'property', 'value': 'rnews:datePublished',
             'content': 'content'},
            {'attribute': 'property', 'value': 'article:published_time',
             'content': 'content'},
            {'attribute': 'name', 'value': 'OriginalPublicationDate',
             'content': 'content'},
            {'attribute': 'itemprop', 'value': 'datePublished',
             'content': 'datetime'},
            {'attribute': 'property', 'value': 'og:published_time',
             'content': 'content'},
            {'attribute': 'name', 'value': 'article_date_original',
             'content': 'content'},
            {'attribute': 'name', 'value': 'publication_date',
             'content': 'content'},
            {'attribute': 'name', 'value': 'sailthru.date',
             'content': 'content'},
            {'attribute': 'name', 'value': 'PublishDate',
             'content': 'content'},
            {'attribute': 'pubdate', 'value': 'pubdate',
             'content': 'datetime'},
            {'attribute': 'name', 'value': 'publish_date',
             'content': 'content'},
        ]
        for spec in PUBLISH_DATE_TAGS:
            found = self.parser.getElementsByTag(
                doc, attr=spec['attribute'], value=spec['value'])
            if not found:
                continue
            raw_date = self.parser.getAttribute(found[0], spec['content'])
            from_meta = to_datetime(raw_date)
            if from_meta:
                return from_meta

        return None

    def get_title(self, doc):
        """Fetch the article title and analyze it

        Assumptions:
        - title tag is the most reliable (inherited from Goose)
        - h1, if properly detected, is the best (visible to users)
        - og:title and h1 can help improve the title extraction
        - python == is too strict, often we need to compare filtered
          versions, i.e. lowercase and ignoring special chars

        Explicit rules:
        1. title == h1, no need to split
        2. h1 similar to og:title, use h1
        3. title contains h1, title contains og:title, len(h1) > len(og:title), use h1
        4. title starts with og:title, use og:title
        5. use title, after splitting

        Returns '' when the document has no <title> element.
        """
        title = ''
        title_element = self.parser.getElementsByTag(doc, tag='title')
        # no title found
        if title_element is None or len(title_element) == 0:
            return title

        # title elem found
        title_text = self.parser.getText(title_element[0])
        used_delimeter = False

        # title from h1
        # - extract the longest text from all h1 elements
        # - too short texts (2 space separated pieces or fewer) are discarded
        # - clean double spaces
        title_text_h1 = ''
        title_element_h1_list = self.parser.getElementsByTag(doc,
                                                             tag='h1') or []
        title_text_h1_list = [self.parser.getText(tag) for tag in
                              title_element_h1_list]
        if title_text_h1_list:
            # sort by len and set the longest
            title_text_h1_list.sort(key=len, reverse=True)
            title_text_h1 = title_text_h1_list[0]
            # discard too short texts
            if len(title_text_h1.split(' ')) <= 2:
                title_text_h1 = ''
            # clean double spaces
            title_text_h1 = ' '.join([x for x in title_text_h1.split() if x])

        # title from og:title
        title_text_fb = (
            self.get_meta_content(doc, 'meta[property="og:title"]') or
            self.get_meta_content(doc, 'meta[name="og:title"]') or '')

        # create filtered versions of title_text, title_text_h1, title_text_fb
        # for finer comparison
        filter_regex = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\ ]')
        filter_title_text = filter_regex.sub('', title_text).lower()
        filter_title_text_h1 = filter_regex.sub('', title_text_h1).lower()
        filter_title_text_fb = filter_regex.sub('', title_text_fb).lower()

        # check for better alternatives for title_text and possibly skip splitting
        if title_text_h1 == title_text:
            used_delimeter = True
        elif filter_title_text_h1 and filter_title_text_h1 == filter_title_text_fb:
            title_text = title_text_h1
            used_delimeter = True
        elif filter_title_text_h1 and filter_title_text_h1 in filter_title_text \
                and filter_title_text_fb and filter_title_text_fb in filter_title_text \
                and len(title_text_h1) > len(title_text_fb):
            title_text = title_text_h1
            used_delimeter = True
        elif filter_title_text_fb and filter_title_text_fb != filter_title_text \
                and filter_title_text.startswith(filter_title_text_fb):
            title_text = title_text_fb
            used_delimeter = True

        # split title with |
        if not used_delimeter and '|' in title_text:
            title_text = self.split_title(title_text, PIPE_SPLITTER,
                                          title_text_h1)
            used_delimeter = True

        # split title with -
        if not used_delimeter and '-' in title_text:
            title_text = self.split_title(title_text, DASH_SPLITTER,
                                          title_text_h1)
            used_delimeter = True

        # split title with _
        if not used_delimeter and '_' in title_text:
            title_text = self.split_title(title_text, UNDERSCORE_SPLITTER,
                                          title_text_h1)
            used_delimeter = True

        # split title with /
        if not used_delimeter and '/' in title_text:
            title_text = self.split_title(title_text, SLASH_SPLITTER,
                                          title_text_h1)
            used_delimeter = True

        # split title with »
        if not used_delimeter and ' » ' in title_text:
            title_text = self.split_title(title_text, ARROWS_SPLITTER,
                                          title_text_h1)
            used_delimeter = True

        title = MOTLEY_REPLACEMENT.replaceAll(title_text)

        # in some cases the final title is quite similar to title_text_h1
        # (either it differs for case, for special chars, or it's truncated)
        # in these cases, we prefer the title_text_h1
        #
        # The filtered h1 must be non-empty: otherwise a title made only
        # of characters the filter strips (e.g. a one word Cyrillic title)
        # compares equal to a missing h1 and would be replaced by ''.
        filter_title = filter_regex.sub('', title).lower()
        if filter_title_text_h1 and filter_title_text_h1 == filter_title:
            title = title_text_h1

        return title

    def split_title(self, title, splitter, hint=None):
        """Pick the most title-like piece of `title` after splitting it
        with `splitter`.

        The first piece that contains the normalised `hint` wins; without
        a usable hint the longest piece (first one on ties) is taken.
        """
        pieces = splitter.split(title)

        normalizer = None
        if hint:
            # NOTE(review): unlike the filter in get_title this one has
            # no CJK range, so a purely CJK hint normalises to ''.
            normalizer = re.compile(r'[^a-zA-Z0-9\ ]')
            hint = normalizer.sub('', hint).lower()

        best_index = 0
        best_length = 0
        for index, piece in enumerate(pieces):
            candidate = piece.strip()
            if hint and hint in normalizer.sub('', candidate).lower():
                best_index = index
                break
            if len(candidate) > best_length:
                best_length, best_index = len(candidate), index

        # replace content
        chosen = pieces[best_index]
        return TITLE_REPLACEMENTS.replaceAll(chosen).strip()

    def get_feed_urls(self, source_url, categories):
        """Takes a source url and a list of category objects and returns
        a list of feed urls

        Every element of each category document whose `type` matches the
        rss mime type contributes its `href`. At most the first 50 hrefs
        are kept, each is prepared against `source_url`, and duplicates
        are dropped (the order of the result is not defined).
        """
        # Raw string: `\/` and `\+` are not valid string escapes, so the
        # plain literal only worked by accident and triggers a warning on
        # recent Python versions. The value passed on is unchanged.
        feed_selector = {'attr': 'type', 'value': r'application\/rss\+xml'}

        total_feed_urls = []
        for category in categories:
            feed_elements = self.parser.getElementsByTag(
                category.doc, **feed_selector)
            total_feed_urls.extend(
                e.get('href') for e in feed_elements if e.get('href'))

        total_feed_urls = total_feed_urls[:50]
        total_feed_urls = [urls.prepare_url(f, source_url)
                           for f in total_feed_urls]
        total_feed_urls = list(set(total_feed_urls))
        return total_feed_urls

    def get_favicon(self, doc):
        """Return the `href` of the first <link> whose `rel` matches
        'icon', or '' when the document has none.
        See http://en.wikipedia.org/wiki/Favicon
        <link rel="shortcut icon" type="image/png" href="favicon.png" />
        <link rel="icon" type="image/png" href="favicon.png" />
        """
        icon_links = self.parser.getElementsByTag(
            doc, tag='link', attr='rel', value='icon')
        if not icon_links:
            return ''
        return self.parser.getAttribute(icon_links[0], 'href')

    def get_meta_lang(self, doc):
        """Extract the content language.

        The `lang` attribute of `doc` is used when present; otherwise the
        `content` of the first available language <meta> tag. The result
        is the lowercased first two characters, or None when nothing
        usable was found.
        """
        # we have a lang attribute in html
        lang = self.parser.getAttribute(doc, attr='lang')
        if lang is None:
            # look up for a Content-Language in meta
            fallbacks = [
                {'tag': 'meta', 'attr': 'http-equiv',
                 'value': 'content-language'},
                {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
            ]
            for lookup in fallbacks:
                found = self.parser.getElementsByTag(doc, **lookup)
                if not found:
                    continue
                # only the first kind of tag that exists is consulted
                lang = self.parser.getAttribute(found[0], attr='content')
                break

        if not lang:
            return None

        code = lang[:2]
        if re.search(RE_LANG, code):
            return code.lower()
        return None

    def get_meta_content(self, doc, metaname):
        """Return the stripped `content` attribute of the first element
        matching the css selector `metaname`, or '' when there is no
        match or no content.
        Example metaNames:
            "meta[name=description]"
            "meta[name=keywords]"
            "meta[property=og:type]"
        """
        matches = self.parser.css_select(doc, metaname)
        if matches is None or len(matches) == 0:
            return ''
        value = self.parser.getAttribute(matches[0], 'content')
        return value.strip() if value else ''

    def get_meta_img_url(self, article_url, doc):
        """Returns the 'top img' as specified by the website, made absolute
        against `article_url`, or '' when the page declares none.

        Sources are tried in this order, stopping at the first hit:
        1. meta[property="og:image"]
        2. <link> with rel matching img_src|image_src
        3. meta[name="og:image"]
        4. <link> with rel looked up as 'icon'
        """
        image = self.get_meta_content(doc, 'meta[property="og:image"]')

        if not image:
            links = self.parser.getElementsByTag(
                doc, use_regex=True,
                tag='link', attr='rel', value='img_src|image_src')
            image = links[0].get('href') if links else None

        if not image:
            image = self.get_meta_content(doc, 'meta[name="og:image"]')

        if not image:
            icons = self.parser.getElementsByTag(
                doc, tag='link', attr='rel', value='icon')
            image = icons[0].get('href') if icons else None

        if not image:
            return ''
        return urljoin(article_url, image)

    def get_meta_type(self, doc):
        """Open graph type of the article, looked up through
        get_meta_content with the og:type selector.
        """
        selector = 'meta[property="og:type"]'
        return self.get_meta_content(doc, selector)

    def get_meta_site_name(self, doc):
        """Open graph site name of the article, looked up through
        get_meta_content with the og:site_name selector.
        """
        selector = 'meta[property="og:site_name"]'
        return self.get_meta_content(doc, selector)

    def get_meta_description(self, doc):
        """Meta description declared in the page source, looked up through
        get_meta_content.
        """
        selector = "meta[name=description]"
        return self.get_meta_content(doc, selector)

    def get_meta_keywords(self, doc):
        """Meta keywords declared in the page source, looked up through
        get_meta_content.
        """
        selector = "meta[name=keywords]"
        return self.get_meta_content(doc, selector)

    def get_meta_data(self, doc):
        """Collect the `<meta>` tags of `doc` into a (nested) dict.

        The key of each tag is its `property` or `name` attribute, the value
        its `content` or `value` attribute; tags missing either are skipped.
        Values made only of decimal digits are converted to int.

        Keys containing ':' are split and stored as nested dicts, e.g.
        `og:image:width` ends up in data['og']['image']['width']. When a
        scalar already sits where a nested dict is needed it is kept under
        the 'identifier' key (or under the head key at the top level).

        Returns a defaultdict(dict).
        """
        data = defaultdict(dict)
        properties = self.parser.css_select(doc, 'meta')
        for prop in properties:
            key = prop.attrib.get('property') or prop.attrib.get('name')
            value = prop.attrib.get('content') or prop.attrib.get('value')

            if not key or not value:
                continue

            key, value = key.strip(), value.strip()
            # isdecimal(), not isdigit(): isdigit() is also true for
            # characters such as superscript digits which int() rejects,
            # so one odd meta value would raise ValueError for the page.
            if value.isdecimal():
                value = int(value)

            if ':' not in key:
                data[key] = value
                continue

            key = key.split(':')
            key_head = key.pop(0)
            ref = data[key_head]

            # A plain value was stored under the head key earlier, wrap it
            # so the nested parts have a dict to live in
            if isinstance(ref, (str, int)):
                data[key_head] = {key_head: ref}
                ref = data[key_head]

            last = len(key) - 1
            for idx, part in enumerate(key):
                if idx == last:
                    # NOTE(review): this overwrites whatever is stored under
                    # `part`, including a nested dict built by earlier tags
                    ref[part] = value
                    break
                existing = ref.get(part)
                if not existing:
                    ref[part] = dict()
                elif isinstance(existing, (str, int)):
                    # Not clear what to do in this scenario,
                    # it's not always a URL, but an ID of some sort
                    ref[part] = {'identifier': existing}
                ref = ref[part]
        return data

    def get_canonical_link(self, article_url, doc):
        """
        Return the article's canonical URL

        Gets the first available value of:
        1. The rel=canonical tag
        2. The og:url tag

        When that value has no hostname it is rebuilt on top of the scheme
        and hostname of `article_url`; a hostname that was written into the
        path (e.g. 'example.com/article.html') is stripped from it first.
        Only the path of the meta url is kept in that case. Returns '' when
        the page declares neither tag.
        """
        links = self.parser.getElementsByTag(doc, tag='link', attr='rel',
                                             value='canonical')

        canonical = self.parser.getAttribute(links[0], 'href') if links else ''
        og_url = self.get_meta_content(doc, 'meta[property="og:url"]')
        meta_url = canonical or og_url or ''
        if meta_url:
            meta_url = meta_url.strip()
            parsed_meta_url = urlparse(meta_url)
            if not parsed_meta_url.hostname:
                # MIGHT not have a hostname in meta_url
                # parsed_url.path might be 'example.com/article.html' where
                # clearly example.com is the hostname
                parsed_article_url = urlparse(article_url)
                hostname = parsed_article_url.hostname
                true_path = parsed_meta_url.path
                if hostname:
                    # The hostname is matched literally: unescaped, its dots
                    # would match any character and strip unrelated paths.
                    # Without a hostname there is nothing to strip at all.
                    stripped = re.match(
                        ".*{}(?=/)/(.*)".format(re.escape(hostname)),
                        parsed_meta_url.path)
                    if stripped is not None:
                        true_path = stripped.group(1)

                meta_url = urlunparse((parsed_article_url.scheme,
                                       hostname, true_path,
                                       '', '', ''))

        return meta_url

    def get_img_urls(self, article_url, doc):
        """Return the set of image urls found under `doc` (lxml root),
        each `src` made absolute against `article_url`. Tags without a
        `src` are ignored.
        """
        absolute = set()
        for img in self.parser.getElementsByTag(doc, tag='img'):
            src = img.get('src')
            if src:
                absolute.add(urljoin(article_url, src))
        return absolute

    def get_first_img_url(self, article_url, top_node):
        """Retrieves the first image in the 'top_node'
        The top node is essentially the HTML markup where the main
        article lies and the first image in that area is probably significant.

        Returns the `src` of the first `<img>` that has one, made absolute
        against `article_url`, or '' when there is no such image.
        """
        # Walk the tags in the order the parser returns them. Going through
        # get_img_urls() would hand back a set, whose iteration order is
        # arbitrary, so the "first" image would be a random one.
        for img_tag in self.parser.getElementsByTag(top_node, tag='img'):
            src = img_tag.get('src')
            if src:
                return urljoin(article_url, src)
        return ''

    def _get_urls(self, doc, titles):
        """Return a list of urls or a list of (url, title_text) tuples
        if specified.
        """
        if doc is None:
            return []

        a_kwargs = {'tag': 'a'}
        a_tags = self.parser.getElementsByTag(doc, **a_kwargs)

        # TODO: this should be refactored! We should have a separate
        # method which siphones the titles our of a list of <a> tags.
        if titles:
            return [(a.get('href'), a.text) for a in a_tags if a.get('href')]
        return [a.get('href') for a in a_tags if a.get('href')]

    def get_urls(self, doc_or_html, titles=False, regex=False):
        """Extract urls from an html string or an already parsed doc.

        `doc_or_html`: html text or a parsed root; None logs a critical
            message and yields [].
        `titles`: when truthy (and `regex` is falsy) return
            (url, anchor_text) tuples instead of plain urls.
        `regex`: when truthy the input is not parsed via lxml; markup is
            stripped from its string form and http(s) urls are searched in
            the remaining text.
        """
        if doc_or_html is None:
            log.critical('Must extract urls from either html, text or doc!')
            return []
        # If we are extracting from raw text
        if regex:
            text = re.sub('<[^<]+?>', ' ', str(doc_or_html))
            # Raw strings: the pattern contains `\(` and `\)`, which are
            # invalid escape sequences in a normal string literal.
            found = re.findall(
                r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
                r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
            return [url.strip() for url in found]
        # If the doc_or_html is html, parse it into a root
        if isinstance(doc_or_html, str):
            doc = self.parser.fromstring(doc_or_html)
        else:
            doc = doc_or_html
        return self._get_urls(doc, titles)

    def get_category_urls(self, source_url, doc):
        """Inputs source lxml root and source url, extracts domain and
        finds all of the top level urls, we are assuming that these are
        the category urls.
        cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia]

        The work happens in four passes:
        1. every url on the page is filtered by scheme, domain and path
           shape into candidate categories,
        2. candidates whose path or subdomain contains a stopword are dropped,
        3. the survivors (plus the root '/') get a missing scheme prepended
           and a trailing slash removed, then are de-duplicated,
        4. each one is passed through urls.prepare_url against `source_url`;
           None results are discarded.

        Returns a list of urls; its order is not defined because of the
        set() based de-duplication.
        """
        page_urls = self.get_urls(doc)
        valid_categories = []
        for p_url in page_urls:
            scheme = urls.get_scheme(p_url, allow_fragments=False)
            domain = urls.get_domain(p_url, allow_fragments=False)
            path = urls.get_path(p_url, allow_fragments=False)

            if not domain and not path:
                if self.config.verbose:
                    print('elim category url %s for no domain and path'
                          % p_url)
                continue
            if path and path.startswith('#'):
                if self.config.verbose:
                    print('elim category url %s path starts with #' % p_url)
                continue
            if scheme and (scheme != 'http' and scheme != 'https'):
                if self.config.verbose:
                    print(('elim category url %s for bad scheme, '
                           'not http nor https' % p_url))
                continue

            if domain:
                # Absolute url: keep it only if it belongs to the source's
                # domain, either directly or through its subdomain parts
                child_tld = tldextract.extract(p_url)
                domain_tld = tldextract.extract(source_url)
                child_subdomain_parts = child_tld.subdomain.split('.')
                subdomain_contains = False
                for part in child_subdomain_parts:
                    if part == domain_tld.domain:
                        if self.config.verbose:
                            print(('subdomain contains at %s and %s' %
                                   (str(part), str(domain_tld.domain))))
                        subdomain_contains = True
                        break

                # Ex. microsoft.com is definitely not related to
                # espn.com, but espn.go.com is probably related to espn.com
                if not subdomain_contains and \
                        (child_tld.domain != domain_tld.domain):
                    if self.config.verbose:
                        print(('elim category url %s for domain '
                               'mismatch' % p_url))
                        # NOTE(review): this `continue` sits inside the
                        # verbose check. Nothing follows the if/elif/else
                        # chain in the loop body, so the url is skipped
                        # either way.
                        continue
                elif child_tld.subdomain in ['m', 'i']:
                    if self.config.verbose:
                        print(('elim category url %s for mobile '
                               'subdomain' % p_url))
                    continue
                else:
                    # Only scheme and domain are kept, the path is dropped
                    valid_categories.append(scheme + '://' + domain)
                    # TODO account for case where category is in form
                    # http://subdomain.domain.tld/category/ <-- still legal!
            else:
                # we want a path with just one subdir
                # cnn.com/world and cnn.com/world/ are both valid_categories
                path_chunks = [x for x in path.split('/') if len(x) > 0]
                if 'index.html' in path_chunks:
                    path_chunks.remove('index.html')

                # A single path chunk shorter than 14 characters qualifies
                if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
                    # `domain` is falsy in this branch, so this is the path
                    valid_categories.append(domain + path)
                else:
                    if self.config.verbose:
                        print(('elim category url %s for >1 path chunks '
                               'or size path chunks' % p_url))
        # Substrings that disqualify a candidate when found in its path or
        # subdomain (the list contains a few duplicates, which is harmless)
        stopwords = [
            'about', 'help', 'privacy', 'legal', 'feedback', 'sitemap',
            'profile', 'account', 'mobile', 'sitemap', 'facebook', 'myspace',
            'twitter', 'linkedin', 'bebo', 'friendster', 'stumbleupon',
            'youtube', 'vimeo', 'store', 'mail', 'preferences', 'maps',
            'password', 'imgur', 'flickr', 'search', 'subscription', 'itunes',
            'siteindex', 'events', 'stop', 'jobs', 'careers', 'newsletter',
            'subscribe', 'academy', 'shopping', 'purchase', 'site-map',
            'shop', 'donate', 'newsletter', 'product', 'advert', 'info',
            'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
            'howto', 'how to', 'faq', 'terms', 'charts', 'services',
            'contact', 'plus', 'admin', 'login', 'signup', 'register',
            'developer', 'proxy']

        _valid_categories = []

        # TODO Stop spamming urlparse and tldextract calls...

        for p_url in valid_categories:
            path = urls.get_path(p_url)
            subdomain = tldextract.extract(p_url).subdomain
            # Path and subdomain are checked together, case-insensitively
            conjunction = path + ' ' + subdomain
            bad = False
            for badword in stopwords:
                if badword.lower() in conjunction.lower():
                    if self.config.verbose:
                        print(('elim category url %s for subdomain '
                               'contain stopword!' % p_url))
                    bad = True
                    break
            if not bad:
                _valid_categories.append(p_url)

        _valid_categories.append('/')  # add the root

        # Normalise: add a missing scheme and drop one trailing slash
        for i, p_url in enumerate(_valid_categories):
            if p_url.startswith('://'):
                p_url = 'http' + p_url
                _valid_categories[i] = p_url

            elif p_url.startswith('//'):
                p_url = 'http:' + p_url
                _valid_categories[i] = p_url

            if p_url.endswith('/'):
                p_url = p_url[:-1]
                _valid_categories[i] = p_url

        # De-duplicate; ordering is lost here
        _valid_categories = list(set(_valid_categories))

        category_urls = [urls.prepare_url(p_url, source_url)
                         for p_url in _valid_categories]
        category_urls = [c for c in category_urls if c is not None]
        return category_urls

    def extract_tags(self, doc):
        """Return the set of non-empty tag texts found in `doc`.

        Elements are selected with A_REL_TAG_SELECTOR first and with
        A_HREF_TAG_SELECTOR only if that gave nothing. NO_STRINGS is
        returned when `doc` has no children or neither selector matches.
        """
        if not list(doc):
            return NO_STRINGS

        elements = self.parser.css_select(doc, A_REL_TAG_SELECTOR) or \
            self.parser.css_select(doc, A_HREF_TAG_SELECTOR)
        if not elements:
            return NO_STRINGS

        texts = (self.parser.getText(element) for element in elements)
        return {text for text in texts if text}

    def calculate_best_node(self, doc):
        """Pick the node of `doc` most likely to hold the article body.

        Candidate nodes (from self.nodes_to_check) that have more than two
        stopwords and no high link density each contribute a score, based on
        their stopword count plus a positional boost, to their parent (full
        score) and grandparent (half score). The parent/grandparent with the
        highest accumulated score is returned; None if there were no
        candidates.
        """
        top_node = None
        nodes_to_check = self.nodes_to_check(doc)
        starting_boost = float(1.0)
        cnt = 0
        i = 0
        parent_nodes = []
        nodes_with_text = []

        # Pass 1: keep only nodes that look like real text
        for node in nodes_to_check:
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        # Never modified below, so it always contributes 0
        negative_scoring = 0
        # Size of the trailing quarter of the nodes, see the penalty below
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        # Pass 2: score each text node and credit its ancestors
        for node in nodes_with_text:
            boost_score = float(0)
            # boost
            if self.is_boostable(node):
                # cnt starts at 0 and only grows, so this is always true
                if cnt >= 0:
                    # 50 for the first boostable node, then 25, 16.6, ...
                    boost_score = float((1.0 / starting_boost) * 50)
                    starting_boost += 1
            # nodes_number
            if nodes_number > 15:
                # With more than 15 nodes, those in the last quarter get a
                # negative boost that grows quadratically towards the end;
                # this replaces any positive boost computed above
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    # Penalties above 40 are swapped for a flat +5
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            # int() truncates the fractional part of the boost
            upscore = int(word_stats.get_stopword_count() + boost_score)

            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # Parent of parent node
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            cnt += 1
            i += 1

        # Pass 3: highest score wins; ties keep the earlier node
        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            # Fallback so that some node is returned even when no score
            # is above 0
            if top_node is None:
                top_node = e
        return top_node

    def is_boostable(self, node):
        """Tell whether `node` is connected to a substantial paragraph.

        The first paragraph is often just an image caption, so a node is
        only boosted when one of the siblings yielded by walk_siblings is a
        <p> with more than 5 stopwords. The search gives up once 3 weaker
        <p> siblings have been passed; non-<p> siblings are not counted.
        """
        max_weak_paragraphs = 3
        min_stopwords = 5
        weak_paragraphs = 0

        for sibling in self.walk_siblings(node):
            if self.parser.getTag(sibling) != "p":
                continue
            if weak_paragraphs >= max_weak_paragraphs:
                return False
            text = self.parser.getText(sibling)
            counter = self.stopwords_class(language=self.language)
            stats = counter.get_stopword_count(text)
            if stats.get_stopword_count() > min_stopwords:
                return True
            weak_paragraphs += 1
        return False

    def walk_siblings(self, node):
        """Return whatever the parser reports as the previous siblings
        of `node`.
        """
        siblings = self.parser.previousSiblings(node)
        return siblings

    def add_siblings(self, top_node):
        """Pull worthwhile paragraphs from the siblings of `top_node` into it.

        Every sibling yielded by walk_siblings is handed to
        get_siblings_content together with the baseline score of
        `top_node`; each paragraph that comes back is inserted at the
        front of `top_node`. Returns `top_node` (modified in place).
        """
        baseline_score_siblings_para = self.get_siblings_score(top_node)
        for current_node in self.walk_siblings(top_node):
            ps = self.get_siblings_content(
                current_node, baseline_score_siblings_para)
            # get_siblings_content may return None instead of a list;
            # treat that as "nothing to add" rather than crashing the loop
            for p in ps or ():
                top_node.insert(0, p)
        return top_node

    def get_siblings_content(
            self, current_sibling, baseline_score_siblings_para):
        """Return the paragraphs of a sibling that deserve to be added to
        the top node.

        A sibling that is itself a non-empty <p> is returned as the only
        item (as a copy with an empty tail if it had tail text). Otherwise
        its <p> descendants are examined: a new <p> element is created for
        each one with text, no high link density, and a stopword count
        above 30% of `baseline_score_siblings_para`. Returns None if the
        parser returns None for the descendant lookup.
        """
        is_paragraph = current_sibling.tag == 'p'
        if is_paragraph and len(self.parser.getText(current_sibling)) > 0:
            kept = current_sibling
            if kept.tail:
                # Work on a copy so the original keeps its tail text
                kept = copy.deepcopy(kept)
                kept.tail = ''
            return [kept]

        candidates = self.parser.getElementsByTag(current_sibling, tag='p')
        if candidates is None:
            return None

        accepted = []
        for candidate in candidates:
            text = self.parser.getText(candidate)
            if len(text) == 0:
                continue
            counter = self.stopwords_class(language=self.language)
            stopwords_found = counter.get_stopword_count(text). \
                get_stopword_count()
            link_heavy = self.is_highlink_density(candidate)
            threshold = float(baseline_score_siblings_para * float(.30))
            if threshold < stopwords_found and not link_heavy:
                accepted.append(self.parser.createElement(
                    tag='p', text=text, tail=None))
        return accepted

    def get_siblings_score(self, top_node):
        """Baseline paragraph score of `top_node`.

        Long articles have many paragraphs, so comparing a sibling against
        the total score of the top node would be unfair. The baseline is
        therefore the average stopword count of the <p> elements under
        `top_node` that have more than 2 stopwords and no high link density
        (e.g. 10 paragraphs scoring 1000 in total give a base of 100).
        When no paragraph qualifies the base is 100000.
        """
        counted = 0
        total = 0

        for paragraph in self.parser.getElementsByTag(top_node, tag='p'):
            text = self.parser.getText(paragraph)
            counter = self.stopwords_class(language=self.language)
            stats = counter.get_stopword_count(text)
            link_heavy = self.is_highlink_density(paragraph)
            stopwords_found = stats.get_stopword_count()
            if stopwords_found > 2 and not link_heavy:
                counted += 1
                total += stopwords_found

        if counted > 0:
            return total / counted
        return 100000

    def update_score(self, node, add_to_score):
        """Add `add_to_score` to the score kept in the node's gravityScore
        attribute. A missing or empty attribute counts as 0; the new total
        is written back as a string.
        """
        stored = self.parser.getAttribute(node, 'gravityScore')
        base = float(stored) if stored else 0
        self.parser.setAttribute(
            node, "gravityScore", str(base + add_to_score))

    def update_node_count(self, node, add_to_count):
        """Stores how many decent nodes are under
Download .txt
gitextract_g2lraicp/

├── .gitattributes
├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── GOOSE-LICENSE.txt
├── LICENSE
├── MANIFEST.in
├── README.rst
├── docs/
│   ├── Makefile
│   ├── _templates/
│   │   ├── sidebarintro.html
│   │   └── sidebarlogo.html
│   ├── _themes/
│   │   ├── .gitignore
│   │   ├── LICENSE
│   │   ├── README.rst
│   │   ├── flask_theme_support.py
│   │   ├── kr/
│   │   │   ├── layout.html
│   │   │   ├── relations.html
│   │   │   ├── static/
│   │   │   │   └── flasky.css_t
│   │   │   └── theme.conf
│   │   └── kr_small/
│   │       ├── layout.html
│   │       ├── static/
│   │       │   └── flasky.css_t
│   │       └── theme.conf
│   ├── conf.py
│   ├── index.rst
│   ├── make.bat
│   └── user_guide/
│       ├── advanced.rst
│       ├── api.rst
│       ├── contributors.rst
│       └── quickstart.rst
├── download_corpora.py
├── newspaper/
│   ├── __init__.py
│   ├── api.py
│   ├── article.py
│   ├── cleaners.py
│   ├── configuration.py
│   ├── extractors.py
│   ├── images.py
│   ├── mthreading.py
│   ├── network.py
│   ├── nlp.py
│   ├── outputformatters.py
│   ├── parsers.py
│   ├── resources/
│   │   ├── misc/
│   │   │   ├── google_sources.txt
│   │   │   ├── popular_sources.txt
│   │   │   ├── stopwords-nlp-en.txt
│   │   │   └── useragents.txt
│   │   └── text/
│   │       ├── stopwords-ar.txt
│   │       ├── stopwords-be.txt
│   │       ├── stopwords-bg.txt
│   │       ├── stopwords-da.txt
│   │       ├── stopwords-de.txt
│   │       ├── stopwords-el.txt
│   │       ├── stopwords-en.txt
│   │       ├── stopwords-es.txt
│   │       ├── stopwords-et.txt
│   │       ├── stopwords-fa.txt
│   │       ├── stopwords-fi.txt
│   │       ├── stopwords-fr.txt
│   │       ├── stopwords-he.txt
│   │       ├── stopwords-hi.txt
│   │       ├── stopwords-hr.txt
│   │       ├── stopwords-hu.txt
│   │       ├── stopwords-id.txt
│   │       ├── stopwords-it.txt
│   │       ├── stopwords-ja.txt
│   │       ├── stopwords-ko.txt
│   │       ├── stopwords-lt.txt
│   │       ├── stopwords-mk.txt
│   │       ├── stopwords-nb.txt
│   │       ├── stopwords-nl.txt
│   │       ├── stopwords-no.txt
│   │       ├── stopwords-pl.txt
│   │       ├── stopwords-pt.txt
│   │       ├── stopwords-ro.txt
│   │       ├── stopwords-ru.txt
│   │       ├── stopwords-sl.txt
│   │       ├── stopwords-sr.txt
│   │       ├── stopwords-sv.txt
│   │       ├── stopwords-sw.txt
│   │       ├── stopwords-th.txt
│   │       ├── stopwords-tr.txt
│   │       ├── stopwords-uk.txt
│   │       ├── stopwords-vi.txt
│   │       └── stopwords-zh.txt
│   ├── settings.py
│   ├── source.py
│   ├── text.py
│   ├── urls.py
│   ├── utils.py
│   ├── version.py
│   └── videos/
│       ├── __init__.py
│       ├── extractors.py
│       └── videos.py
├── requirements.txt
├── setup.py
└── tests/
    ├── __init__.py
    ├── benchmarks.py
    ├── data/
    │   ├── fulltext_domain_list.txt
    │   ├── fulltext_url_list.txt
    │   ├── html/
    │   │   ├── 247wallst.com1.html
    │   │   ├── 247wallst.com2.html
    │   │   ├── about.com1.html
    │   │   ├── about.com2.html
    │   │   ├── adoption.com1.html
    │   │   ├── al.com1.html
    │   │   ├── al.com2.html
    │   │   ├── ap_meta_refresh.html
    │   │   ├── apartmenttherapy.com1.html
    │   │   ├── apartmenttherapy.com2.html
    │   │   ├── arabic_article.html
    │   │   ├── architecturaldigest.com1.html
    │   │   ├── architecturaldigest.com2.html
    │   │   ├── avclub.com1.html
    │   │   ├── avclub.com2.html
    │   │   ├── backstage.com1.html
    │   │   ├── backstage.com2.html
    │   │   ├── bhg.com1.html
    │   │   ├── bhg.com2.html
    │   │   ├── bloomberg.com1.html
    │   │   ├── bostonherald.com1.html
    │   │   ├── bostonherald.com2.html
    │   │   ├── businessinsider.com1.html
    │   │   ├── businessinsider.com2.html
    │   │   ├── businessweek.com1.html
    │   │   ├── businessweek.com2.html
    │   │   ├── chinese_article.html
    │   │   ├── cleveland.com1.html
    │   │   ├── cleveland.com2.html
    │   │   ├── cnn_article.html
    │   │   ├── cnn_main_site.html
    │   │   ├── cntraveler.com1.html
    │   │   ├── cntraveler.com2.html
    │   │   ├── coolhunting.com1.html
    │   │   ├── coolhunting.com2.html
    │   │   ├── cricket.com.au1.html
    │   │   ├── cricket.com.au2.html
    │   │   ├── dailycaller.com1.html
    │   │   ├── dailycaller.com2.html
    │   │   ├── dailystar.co.uk1.html
    │   │   ├── dailystar.co.uk2.html
    │   │   ├── dallasnews.com1.html
    │   │   ├── dallasnews.com2.html
    │   │   ├── details.com1.html
    │   │   ├── details.com2.html
    │   │   ├── elle.com1.html
    │   │   ├── elle.com2.html
    │   │   ├── flavorwire.com1.html
    │   │   ├── flavorwire.com2.html
    │   │   ├── fool.com1.html
    │   │   ├── fool.com2.html
    │   │   ├── foxbusiness.com1.html
    │   │   ├── foxbusiness.com2.html
    │   │   ├── foxnews.com1.html
    │   │   ├── foxnews.com2.html
    │   │   ├── glamour.com1.html
    │   │   ├── glamour.com2.html
    │   │   ├── globalnews.ca1.html
    │   │   ├── globalnews.ca2.html
    │   │   ├── google_meta_refresh.html
    │   │   ├── gq.com1.html
    │   │   ├── gq.com2.html
    │   │   ├── graziadaily.co.uk1.html
    │   │   ├── graziadaily.co.uk2.html
    │   │   ├── gulflive.com1.html
    │   │   ├── gulflive.com2.html
    │   │   ├── huffingtonpost.com1.html
    │   │   ├── japanese_article.html
    │   │   ├── japanese_article2.html
    │   │   ├── lifebuzz.com1.html
    │   │   ├── lifebuzz.com2.html
    │   │   ├── livescience.com1.html
    │   │   ├── livescience.com2.html
    │   │   ├── mashable.com1.html
    │   │   ├── mashable.com2.html
    │   │   ├── mlive.com1.html
    │   │   ├── mlive.com2.html
    │   │   ├── newyorker.com1.html
    │   │   ├── nj.com1.html
    │   │   ├── nola.com1.html
    │   │   ├── nydailynews.com1.html
    │   │   ├── nypost.com1.html
    │   │   ├── nypost.com2.html
    │   │   ├── ok.co.uk1.html
    │   │   ├── ok.co.uk2.html
    │   │   ├── oregonlive.com1.html
    │   │   ├── oregonlive.com2.html
    │   │   ├── parsely.com1.html
    │   │   ├── parsely.com2.html
    │   │   ├── pe.com1.html
    │   │   ├── pewresearch.org1.html
    │   │   ├── pewresearch.org2.html
    │   │   ├── pixable.com1.html
    │   │   ├── pixable.com2.html
    │   │   ├── pixelmonkey.org1.html
    │   │   ├── pixelmonkey.org2.html
    │   │   ├── readwrite.com1.html
    │   │   ├── recipe.com1.html
    │   │   ├── recipe.com2.html
    │   │   ├── reuters.com1.html
    │   │   ├── reuters.com2.html
    │   │   ├── reuters.com3.html
    │   │   ├── reuters.com4.html
    │   │   ├── self.com1.html
    │   │   ├── self.com2.html
    │   │   ├── sitepoint.com1.html
    │   │   ├── sitepoint.com2.html
    │   │   ├── slate.com1.html
    │   │   ├── slate.com2.html
    │   │   ├── space.com1.html
    │   │   ├── space.com2.html
    │   │   ├── spanish_article.html
    │   │   ├── syracuse.com1.html
    │   │   ├── syracuse.com2.html
    │   │   ├── talkingpointsmemo.com1.html
    │   │   ├── technologyreview.com1.html
    │   │   ├── technologyreview.com2.html
    │   │   ├── teenvogue.com1.html
    │   │   ├── teenvogue.com2.html
    │   │   ├── telegraph.co.uk1.html
    │   │   ├── telegraph.co.uk2.html
    │   │   ├── thai_article.html
    │   │   ├── theatlantic.com1.html
    │   │   ├── theatlantic.com2.html
    │   │   ├── theatlanticcities.com1.html
    │   │   ├── theatlanticcities.com2.html
    │   │   ├── thedailybeast.com1.html
    │   │   ├── thedailybeast.com2.html
    │   │   ├── thedebrief.co.uk1.html
    │   │   ├── thedebrief.co.uk2.html
    │   │   ├── theglobeandmail.com1.html
    │   │   ├── theglobeandmail.com2.html
    │   │   ├── thekitchn.com1.html
    │   │   ├── thekitchn.com2.html
    │   │   ├── thenextweb.com1.html
    │   │   ├── theonion.com1.html
    │   │   ├── theroot.com1.html
    │   │   ├── tnr.com1.html
    │   │   ├── tnr.com2.html
    │   │   ├── uproxx.com1.html
    │   │   ├── uproxx.com2.html
    │   │   ├── upworthy.com1.html
    │   │   ├── upworthy.com2.html
    │   │   ├── usnews.com1.html
    │   │   ├── usnews.com2.html
    │   │   ├── vanityfair.com1.html
    │   │   ├── vogue.de1.html
    │   │   ├── vogue.de2.html
    │   │   ├── wetpaint.com1.html
    │   │   ├── wetpaint.com2.html
    │   │   ├── wired.com1.html
    │   │   ├── wired.com2.html
    │   │   ├── wnet.org1.html
    │   │   ├── wnet.org2.html
    │   │   ├── yahoo_main_site.html
    │   │   ├── youbeauty.com1.html
    │   │   └── youbeauty.com2.html
    │   ├── test_prepare_urls.txt
    │   ├── test_urls.txt
    │   ├── test_urls_pubdate.txt
    │   └── text/
    │       ├── 247wallst.com1.txt
    │       ├── 247wallst.com2.txt
    │       ├── about.com1.txt
    │       ├── about.com2.txt
    │       ├── adoption.com1.txt
    │       ├── al.com1.txt
    │       ├── al.com2.txt
    │       ├── apartmenttherapy.com1.txt
    │       ├── apartmenttherapy.com2.txt
    │       ├── arabic.txt
    │       ├── architecturaldigest.com1.txt
    │       ├── architecturaldigest.com2.txt
    │       ├── avclub.com1.txt
    │       ├── avclub.com2.txt
    │       ├── backstage.com1.txt
    │       ├── backstage.com2.txt
    │       ├── bhg.com1.txt
    │       ├── bhg.com2.txt
    │       ├── bloomberg.com1.txt
    │       ├── bostonherald.com1.txt
    │       ├── bostonherald.com2.txt
    │       ├── businessinsider.com1.txt
    │       ├── businessinsider.com2.txt
    │       ├── businessweek.com1.txt
    │       ├── businessweek.com2.txt
    │       ├── chinese.txt
    │       ├── cleveland.com1.txt
    │       ├── cleveland.com2.txt
    │       ├── cnn.txt
    │       ├── cnn_summary.txt
    │       ├── cntraveler.com1.txt
    │       ├── cntraveler.com2.txt
    │       ├── coolhunting.com1.txt
    │       ├── cricket.com.au1.txt
    │       ├── cricket.com.au2.txt
    │       ├── dailycaller.com1.txt
    │       ├── dailycaller.com2.txt
    │       ├── dailystar.co.uk1.txt
    │       ├── dailystar.co.uk2.txt
    │       ├── dallasnews.com1.txt
    │       ├── dallasnews.com2.txt
    │       ├── details.com1.txt
    │       ├── details.com2.txt
    │       ├── elle.com1.txt
    │       ├── elle.com2.txt
    │       ├── flavorwire.com1.txt
    │       ├── flavorwire.com2.txt
    │       ├── fool.com1.txt
    │       ├── fool.com2.txt
    │       ├── foxbusiness.com1.txt
    │       ├── foxbusiness.com2.txt
    │       ├── foxnews.com1.txt
    │       ├── foxnews.com2.txt
    │       ├── foxnews.com3.txt
    │       ├── foxnews.com4.txt
    │       ├── glamour.com1.txt
    │       ├── glamour.com2.txt
    │       ├── globalnews.ca1.txt
    │       ├── globalnews.ca2.txt
    │       ├── gq.com1.txt
    │       ├── gq.com2.txt
    │       ├── graziadaily.co.uk1.txt
    │       ├── graziadaily.co.uk2.txt
    │       ├── gulflive.com1.txt
    │       ├── gulflive.com2.txt
    │       ├── huffingtonpost.com1.txt
    │       ├── japanese.txt
    │       ├── japanese2.txt
    │       ├── lifebuzz.com1.txt
    │       ├── lifebuzz.com2.txt
    │       ├── livescience.com1.txt
    │       ├── livescience.com2.txt
    │       ├── mashable.com1.txt
    │       ├── mashable.com2.txt
    │       ├── mlive.com1.txt
    │       ├── mlive.com2.txt
    │       ├── newyorker.com1.txt
    │       ├── nj.com1.txt
    │       ├── nola.com1.txt
    │       ├── nydailynews.com1.txt
    │       ├── nypost.com1.txt
    │       ├── nypost.com2.txt
    │       ├── ok.co.uk1.txt
    │       ├── ok.co.uk2.txt
    │       ├── oregonlive.com1.txt
    │       ├── oregonlive.com2.txt
    │       ├── parsely.com1.txt
    │       ├── parsely.com2.txt
    │       ├── pe.com1.txt
    │       ├── pewresearch.org1.txt
    │       ├── pewresearch.org2.txt
    │       ├── pixable.com1.txt
    │       ├── pixable.com2.txt
    │       ├── pixelmonkey.org1.txt
    │       ├── pixelmonkey.org2.txt
    │       ├── readwrite.com1.txt
    │       ├── recipe.com1.txt
    │       ├── recipe.com2.txt
    │       ├── reuters.com1.txt
    │       ├── reuters.com2.txt
    │       ├── reuters.com3.txt
    │       ├── reuters.com4.txt
    │       ├── reuters.com5.txt
    │       ├── reuters.com6.txt
    │       ├── self.com1.txt
    │       ├── self.com2.txt
    │       ├── sitepoint.com1.txt
    │       ├── sitepoint.com2.txt
    │       ├── slate.com1.txt
    │       ├── slate.com2.txt
    │       ├── space.com1.txt
    │       ├── space.com2.txt
    │       ├── spanish.txt
    │       ├── syracuse.com1.txt
    │       ├── syracuse.com2.txt
    │       ├── talkingpointsmemo.com1.txt
    │       ├── technologyreview.com1.txt
    │       ├── technologyreview.com2.txt
    │       ├── teenvogue.com1.txt
    │       ├── teenvogue.com2.txt
    │       ├── telegraph.co.uk1.txt
    │       ├── telegraph.co.uk2.txt
    │       ├── thai.txt
    │       ├── theatlantic.com1.txt
    │       ├── theatlantic.com2.txt
    │       ├── theatlanticcities.com1.txt
    │       ├── theatlanticcities.com2.txt
    │       ├── thedailybeast.com1.txt
    │       ├── thedailybeast.com2.txt
    │       ├── thedebrief.co.uk1.txt
    │       ├── thedebrief.co.uk2.txt
    │       ├── theglobeandmail.com1.txt
    │       ├── theglobeandmail.com2.txt
    │       ├── thekitchn.com1.txt
    │       ├── thekitchn.com2.txt
    │       ├── thenextweb.com1.txt
    │       ├── theonion.com1.txt
    │       ├── theroot.com1.txt
    │       ├── tnr.com1.txt
    │       ├── tnr.com2.txt
    │       ├── uproxx.com1.txt
    │       ├── uproxx.com2.txt
    │       ├── upworthy.com1.txt
    │       ├── upworthy.com2.txt
    │       ├── usnews.com1.txt
    │       ├── usnews.com2.txt
    │       ├── vanityfair.com1.txt
    │       ├── vogue.de1.txt
    │       ├── vogue.de2.txt
    │       ├── wetpaint.com1.txt
    │       ├── wetpaint.com2.txt
    │       ├── wired.com1.txt
    │       ├── wired.com2.txt
    │       ├── wnet.org1.txt
    │       ├── wnet.org2.txt
    │       ├── youbeauty.com1.txt
    │       └── youbeauty.com2.txt
    └── unit_tests.py
Download .txt
SYMBOL INDEX (405 symbols across 22 files)

FILE: docs/_themes/flask_theme_support.py
  class FlaskyStyle (line 7) | class FlaskyStyle(Style):

FILE: download_corpora.py
  function main (line 17) | def main():

FILE: newspaper/__init__.py
  class NullHandler (line 25) | class NullHandler(logging.Handler):
    method emit (line 26) | def emit(self, record):

FILE: newspaper/api.py
  function build (line 21) | def build(url='', dry=False, config=None, **kwargs) -> Source:
  function build_article (line 34) | def build_article(url='', config=None, **kwargs) -> Article:
  function languages (line 45) | def languages():
  function popular_urls (line 51) | def popular_urls():
  function hot (line 59) | def hot():
  function fulltext (line 71) | def fulltext(html, language='en'):

FILE: newspaper/article.py
  class ArticleDownloadState (line 32) | class ArticleDownloadState(object):
  class ArticleException (line 38) | class ArticleException(Exception):
  class Article (line 42) | class Article(object):
    method __init__ (line 45) | def __init__(self, url, title='', source_url='', config=None, **kwargs):
    method build (line 156) | def build(self):
    method _parse_scheme_file (line 165) | def _parse_scheme_file(self, path):
    method _parse_scheme_http (line 174) | def _parse_scheme_http(self):
    method download (line 182) | def download(self, input_html=None, title=None, recursion_counter=0):
    method parse (line 212) | def parse(self):
    method fetch_images (line 291) | def fetch_images(self):
    method has_top_image (line 313) | def has_top_image(self):
    method is_valid_url (line 316) | def is_valid_url(self):
    method is_valid_body (line 322) | def is_valid_body(self):
    method is_media_news (line 361) | def is_media_news(self):
    method nlp (line 372) | def nlp(self):
    method get_parse_candidate (line 390) | def get_parse_candidate(self):
    method build_resource_path (line 398) | def build_resource_path(self):
    method get_resource_path (line 405) | def get_resource_path(self):
    method release_resources (line 416) | def release_resources(self):
    method set_reddit_top_img (line 426) | def set_reddit_top_img(self):
    method set_title (line 446) | def set_title(self, input_title):
    method set_text (line 450) | def set_text(self, text):
    method set_html (line 455) | def set_html(self, html):
    method set_article_html (line 464) | def set_article_html(self, article_html):
    method set_meta_img (line 470) | def set_meta_img(self, src_url):
    method set_top_img (line 474) | def set_top_img(self, src_url):
    method set_top_img_no_check (line 480) | def set_top_img_no_check(self, src_url):
    method set_imgs (line 487) | def set_imgs(self, imgs):
    method set_keywords (line 494) | def set_keywords(self, keywords):
    method set_authors (line 502) | def set_authors(self, authors):
    method set_summary (line 510) | def set_summary(self, summary):
    method set_meta_language (line 516) | def set_meta_language(self, meta_lang):
    method set_meta_keywords (line 523) | def set_meta_keywords(self, meta_keywords):
    method set_meta_favicon (line 528) | def set_meta_favicon(self, meta_favicon):
    method set_meta_site_name (line 531) | def set_meta_site_name(self, meta_site_name):
    method set_meta_description (line 534) | def set_meta_description(self, meta_description):
    method set_meta_data (line 537) | def set_meta_data(self, meta_data):
    method set_canonical_link (line 540) | def set_canonical_link(self, canonical_link):
    method set_tags (line 543) | def set_tags(self, tags):
    method set_movies (line 546) | def set_movies(self, movie_objects):
    method throw_if_not_downloaded_verbose (line 552) | def throw_if_not_downloaded_verbose(self):
    method throw_if_not_parsed_verbose (line 562) | def throw_if_not_parsed_verbose(self):

FILE: newspaper/cleaners.py
  class DocumentCleaner (line 10) | class DocumentCleaner(object):
    method __init__ (line 12) | def __init__(self, config):
    method clean (line 52) | def clean(self, doc_to_clean):
    method clean_body_classes (line 74) | def clean_body_classes(self, doc):
    method clean_article_tags (line 83) | def clean_article_tags(self, doc):
    method clean_em_tags (line 90) | def clean_em_tags(self, doc):
    method remove_drop_caps (line 98) | def remove_drop_caps(self, doc):
    method remove_scripts_styles (line 105) | def remove_scripts_styles(self, doc):
    method clean_bad_tags (line 121) | def clean_bad_tags(self, doc):
    method remove_nodes_regex (line 139) | def remove_nodes_regex(self, doc, pattern):
    method clean_para_spans (line 147) | def clean_para_spans(self, doc):
    method get_flushed_buffer (line 153) | def get_flushed_buffer(self, replacement_text, doc):
    method replace_walk_left_right (line 156) | def replace_walk_left_right(self, kid, kid_text,
    method get_replacement_nodes (line 186) | def get_replacement_nodes(self, doc, div):
    method replace_with_para (line 218) | def replace_with_para(self, doc, div):
    method div_to_para (line 221) | def div_to_para(self, doc, dom_type):

FILE: newspaper/configuration.py
  class Configuration (line 23) | class Configuration(object):
    method __init__ (line 24) | def __init__(self):
    method get_language (line 81) | def get_language(self):
    method del_language (line 84) | def del_language(self):
    method set_language (line 87) | def set_language(self, language):
    method get_stopwords_class (line 106) | def get_stopwords_class(language):
    method get_parser (line 124) | def get_parser():
  class ArticleConfiguration (line 128) | class ArticleConfiguration(Configuration):
  class SourceConfiguration (line 132) | class SourceConfiguration(Configuration):

FILE: newspaper/extractors.py
  class ContentExtractor (line 55) | class ContentExtractor(object):
    method __init__ (line 56) | def __init__(self, config):
    method update_language (line 62) | def update_language(self, meta_lang):
    method get_authors (line 72) | def get_authors(self, doc):
    method get_publishing_date (line 172) | def get_publishing_date(self, url, doc):
    method get_title (line 237) | def get_title(self, doc):
    method split_title (line 352) | def split_title(self, title, splitter, hint=None):
    method get_feed_urls (line 377) | def get_feed_urls(self, source_url, categories):
    method get_favicon (line 395) | def get_favicon(self, doc):
    method get_meta_lang (line 407) | def get_meta_lang(self, doc):
    method get_meta_content (line 432) | def get_meta_content(self, doc, metaname):
    method get_meta_img_url (line 447) | def get_meta_img_url(self, article_url, doc):
    method get_meta_type (line 472) | def get_meta_type(self, doc):
    method get_meta_site_name (line 477) | def get_meta_site_name(self, doc):
    method get_meta_description (line 482) | def get_meta_description(self, doc):
    method get_meta_keywords (line 487) | def get_meta_keywords(self, doc):
    method get_meta_data (line 492) | def get_meta_data(self, doc):
    method get_canonical_link (line 531) | def get_canonical_link(self, article_url, doc):
    method get_img_urls (line 569) | def get_img_urls(self, article_url, doc):
    method get_first_img_url (line 580) | def get_first_img_url(self, article_url, top_node):
    method _get_urls (line 591) | def _get_urls(self, doc, titles):
    method get_urls (line 607) | def get_urls(self, doc_or_html, titles=False, regex=False):
    method get_category_urls (line 629) | def get_category_urls(self, source_url, doc):
    method extract_tags (line 755) | def extract_tags(self, doc):
    method calculate_best_node (line 773) | def calculate_best_node(self, doc):
    method is_boostable (line 845) | def is_boostable(self, node):
    method walk_siblings (line 872) | def walk_siblings(self, node):
    method add_siblings (line 875) | def add_siblings(self, top_node):
    method get_siblings_content (line 885) | def get_siblings_content(
    method get_siblings_score (line 921) | def get_siblings_score(self, top_node):
    method update_score (line 949) | def update_score(self, node, add_to_score):
    method update_node_count (line 962) | def update_node_count(self, node, add_to_count):
    method is_highlink_density (line 973) | def is_highlink_density(self, e):
    method get_score (line 1001) | def get_score(self, node):
    method get_node_gravity_score (line 1006) | def get_node_gravity_score(self, node):
    method nodes_to_check (line 1012) | def nodes_to_check(self, doc):
    method is_table_and_no_para_exist (line 1022) | def is_table_and_no_para_exist(self, e):
    method is_nodescore_threshold_met (line 1034) | def is_nodescore_threshold_met(self, node, e):
    method post_cleanup (line 1043) | def post_cleanup(self, top_node):

FILE: newspaper/images.py
  function image_to_str (line 29) | def image_to_str(image):
  function str_to_image (line 36) | def str_to_image(s):
  function prepare_image (line 43) | def prepare_image(image):
  function image_entropy (line 49) | def image_entropy(img):
  function square_image (line 58) | def square_image(img):
  function clean_url (line 77) | def clean_url(url):
  function fetch_url (line 86) | def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
  function fetch_image_dimension (line 166) | def fetch_image_dimension(url, useragent, referer=None, retries=1):
  class Scraper (line 170) | class Scraper:
    method __init__ (line 172) | def __init__(self, article):
    method largest_image_url (line 179) | def largest_image_url(self):
    method calculate_area (line 198) | def calculate_area(self, img_url, dimension):
    method satisfies_requirements (line 222) | def satisfies_requirements(self, img_url):
    method thumbnail (line 228) | def thumbnail(self):

FILE: newspaper/mthreading.py
  class ConcurrencyException (line 23) | class ConcurrencyException(Exception):
  class Worker (line 27) | class Worker(Thread):
    method __init__ (line 31) | def __init__(self, tasks, timeout_seconds):
    method run (line 38) | def run(self):
  class ThreadPool (line 53) | class ThreadPool:
    method __init__ (line 54) | def __init__(self, num_threads, timeout_seconds):
    method add_task (line 59) | def add_task(self, func, *args, **kargs):
    method wait_completion (line 62) | def wait_completion(self):
  class NewsPool (line 66) | class NewsPool(object):
    method __init__ (line 68) | def __init__(self, config=None):
    method join (line 95) | def join(self):
    method set (line 106) | def set(self, news_list, threads_per_source=1, override_threads=None):

FILE: newspaper/network.py
  function get_request_kwargs (line 24) | def get_request_kwargs(timeout, useragent, proxies, headers):
  function get_html (line 37) | def get_html(url, config=None, response=None):
  function get_html_2XX_only (line 47) | def get_html_2XX_only(url, config=None, response=None):
  function _get_html_from_response (line 74) | def _get_html_from_response(response, config):
  class MRequest (line 91) | class MRequest(object):
    method __init__ (line 97) | def __init__(self, url, config=None):
    method send (line 107) | def send(self):
  function multithread_request (line 117) | def multithread_request(urls, config=None):

FILE: newspaper/nlp.py
  function load_stopwords (line 22) | def load_stopwords(language):
  function summarize (line 40) | def summarize(url='', title='', text='', max_sents=5):
  function score (line 57) | def score(sentences, titleWords, keywords):
  function sbs (line 77) | def sbs(words, keywords):
  function dbs (line 87) | def dbs(words, keywords):
  function split_words (line 109) | def split_words(text):
  function keywords (line 119) | def keywords(text):
  function split_sentences (line 153) | def split_sentences(text):
  function length_score (line 164) | def length_score(sentence_len):
  function title_score (line 168) | def title_score(title, sentence):
  function sentence_position (line 180) | def sentence_position(i, size):

FILE: newspaper/outputformatters.py
  class OutputFormatter (line 19) | class OutputFormatter(object):
    method __init__ (line 21) | def __init__(self, config):
    method update_language (line 28) | def update_language(self, meta_lang):
    method get_top_node (line 38) | def get_top_node(self):
    method get_formatted (line 41) | def get_formatted(self, top_node):
    method convert_to_text (line 63) | def convert_to_text(self):
    method convert_to_html (line 79) | def convert_to_html(self):
    method add_newline_to_br (line 83) | def add_newline_to_br(self):
    method add_newline_to_li (line 87) | def add_newline_to_li(self):
    method links_to_text (line 95) | def links_to_text(self):
    method remove_negativescores_nodes (line 101) | def remove_negativescores_nodes(self):
    method replace_with_text (line 113) | def replace_with_text(self):
    method remove_empty_tags (line 123) | def remove_empty_tags(self):
    method remove_trailing_media_div (line 141) | def remove_trailing_media_div(self):

FILE: newspaper/parsers.py
  class Parser (line 25) | class Parser(object):
    method xpath_re (line 28) | def xpath_re(cls, node, expression):
    method drop_tag (line 34) | def drop_tag(cls, nodes):
    method css_select (line 42) | def css_select(cls, node, selector):
    method get_unicode_html (line 46) | def get_unicode_html(cls, html):
    method fromstring (line 60) | def fromstring(cls, html):
    method clean_article_html (line 75) | def clean_article_html(cls, node):
    method nodeToString (line 88) | def nodeToString(cls, node):
    method replaceTag (line 95) | def replaceTag(cls, node, tag):
    method stripTags (line 99) | def stripTags(cls, node, *tags):
    method getElementById (line 103) | def getElementById(cls, node, idd):
    method getElementsByTag (line 111) | def getElementsByTag(
    method appendChild (line 131) | def appendChild(cls, node, child):
    method childNodes (line 135) | def childNodes(cls, node):
    method childNodesWithText (line 139) | def childNodesWithText(cls, node):
    method textToPara (line 162) | def textToPara(cls, text):
    method getChildren (line 166) | def getChildren(cls, node):
    method getElementsByTags (line 170) | def getElementsByTags(cls, node, tags):
    method createElement (line 177) | def createElement(cls, tag='p', text=None, tail=None):
    method getComments (line 185) | def getComments(cls, node):
    method getParent (line 189) | def getParent(cls, node):
    method remove (line 193) | def remove(cls, node):
    method getTag (line 210) | def getTag(cls, node):
    method getText (line 214) | def getText(cls, node):
    method previousSiblings (line 219) | def previousSiblings(cls, node):
    method previousSibling (line 226) | def previousSibling(cls, node):
    method nextSibling (line 230) | def nextSibling(cls, node):
    method isTextNode (line 234) | def isTextNode(cls, node):
    method getAttribute (line 238) | def getAttribute(cls, node, attr=None):
    method delAttribute (line 246) | def delAttribute(cls, node, attr=None):
    method setAttribute (line 253) | def setAttribute(cls, node, attr=None, value=None):
    method outerHtml (line 258) | def outerHtml(cls, node):

FILE: newspaper/source.py
  class Category (line 27) | class Category(object):
    method __init__ (line 28) | def __init__(self, url):
  class Feed (line 34) | class Feed(object):
    method __init__ (line 35) | def __init__(self, url):
  class Source (line 44) | class Source(object):
    method __init__ (line 54) | def __init__(self, url, config=None, **kwargs):
    method build (line 87) | def build(self):
    method purge_articles (line 104) | def purge_articles(self, reason, articles):
    method _get_category_urls (line 120) | def _get_category_urls(self, domain):
    method set_categories (line 127) | def set_categories(self):
    method set_feeds (line 131) | def set_feeds(self):
    method set_description (line 170) | def set_description(self):
    method download (line 177) | def download(self):
    method download_categories (line 182) | def download_categories(self):
    method download_feeds (line 199) | def download_feeds(self):
    method parse (line 216) | def parse(self):
    method parse_categories (line 227) | def parse_categories(self):
    method _map_title_to_feed (line 238) | def _map_title_to_feed(self, feed):
    method parse_feeds (line 248) | def parse_feeds(self):
    method feeds_to_articles (line 255) | def feeds_to_articles(self):
    method categories_to_articles (line 284) | def categories_to_articles(self):
    method _generate_articles (line 319) | def _generate_articles(self):
    method generate_articles (line 329) | def generate_articles(self, limit=5000):
    method download_articles (line 337) | def download_articles(self, threads=1):
    method parse_articles (line 370) | def parse_articles(self):
    method size (line 379) | def size(self):
    method clean_memo_cache (line 386) | def clean_memo_cache(self):
    method feed_urls (line 391) | def feed_urls(self):
    method category_urls (line 396) | def category_urls(self):
    method article_urls (line 401) | def article_urls(self):
    method print_summary (line 406) | def print_summary(self):

FILE: newspaper/text.py
  function innerTrim (line 19) | def innerTrim(value):
  class WordStats (line 28) | class WordStats(object):
    method __init__ (line 30) | def __init__(self):
    method get_stop_words (line 40) | def get_stop_words(self):
    method set_stop_words (line 43) | def set_stop_words(self, words):
    method get_stopword_count (line 46) | def get_stopword_count(self):
    method set_stopword_count (line 49) | def set_stopword_count(self, wordcount):
    method get_word_count (line 52) | def get_word_count(self):
    method set_word_count (line 55) | def set_word_count(self, cnt):
  class StopWords (line 59) | class StopWords(object):
    method __init__ (line 64) | def __init__(self, language='en'):
    method remove_punctuation (line 71) | def remove_punctuation(self, content):
    method candidate_words (line 82) | def candidate_words(self, stripped_input):
    method get_stopword_count (line 85) | def get_stopword_count(self, content):
  class StopWordsChinese (line 104) | class StopWordsChinese(StopWords):
    method __init__ (line 107) | def __init__(self, language='zh'):
    method candidate_words (line 110) | def candidate_words(self, stripped_input):
  class StopWordsArabic (line 117) | class StopWordsArabic(StopWords):
    method __init__ (line 120) | def __init__(self, language='ar'):
    method remove_punctuation (line 124) | def remove_punctuation(self, content):
    method candidate_words (line 127) | def candidate_words(self, stripped_input):
  class StopWordsKorean (line 136) | class StopWordsKorean(StopWords):
    method __init__ (line 139) | def __init__(self, language='ko'):
    method get_stopword_count (line 142) | def get_stopword_count(self, content):
  class StopWordsHindi (line 162) | class StopWordsHindi(StopWords):
    method __init__ (line 165) | def __init__(self, language='hi'):
    method get_stopword_count (line 168) | def get_stopword_count(self, content):
  class StopWordsJapanese (line 187) | class StopWordsJapanese(StopWords):
    method __init__ (line 190) | def __init__(self, language='ja'):
    method candidate_words (line 193) | def candidate_words(self, stripped_input):
  class StopWordsThai (line 200) | class StopWordsThai(StopWords):
    method __init__ (line 203) | def __init__(self, language='th'):
    method candidate_words (line 206) | def candidate_words(self, stripped_input):

FILE: newspaper/urls.py
  function remove_args (line 41) | def remove_args(url, keep_params=(), frags=False):
  function redirect_back (line 58) | def redirect_back(url, source_domain):
  function prepare_url (line 81) | def prepare_url(url, source_url=None):
  function valid_url (line 102) | def valid_url(url, verbose=False, test=False):
  function url_to_filetype (line 242) | def url_to_filetype(abs_url):
  function get_domain (line 264) | def get_domain(abs_url, **kwargs):
  function get_scheme (line 274) | def get_scheme(abs_url, **kwargs):
  function get_path (line 282) | def get_path(abs_url, **kwargs):
  function is_abs_url (line 290) | def is_abs_url(url):

FILE: newspaper/utils.py
  class FileHelper (line 33) | class FileHelper(object):
    method loadResourceFile (line 35) | def loadResourceFile(filename):
  class ParsingCandidate (line 50) | class ParsingCandidate(object):
    method __init__ (line 52) | def __init__(self, url, link_hash):
  class RawHelper (line 57) | class RawHelper(object):
    method get_parsing_candidate (line 59) | def get_parsing_candidate(url, raw_html):
  class URLHelper (line 66) | class URLHelper(object):
    method get_parsing_candidate (line 68) | def get_parsing_candidate(url_to_crawl):
  class StringSplitter (line 76) | class StringSplitter(object):
    method __init__ (line 77) | def __init__(self, pattern):
    method split (line 80) | def split(self, string):
  class StringReplacement (line 86) | class StringReplacement(object):
    method __init__ (line 87) | def __init__(self, pattern, replaceWith):
    method replaceAll (line 91) | def replaceAll(self, string):
  class ReplaceSequence (line 97) | class ReplaceSequence(object):
    method __init__ (line 98) | def __init__(self):
    method create (line 101) | def create(self, firstPattern, replaceWith=None):
    method append (line 106) | def append(self, pattern, replaceWith=None):
    method replaceAll (line 109) | def replaceAll(self, string):
  class TimeoutError (line 119) | class TimeoutError(Exception):
  function timelimit (line 123) | def timelimit(timeout):
  function domain_to_filename (line 153) | def domain_to_filename(domain):
  function filename_to_domain (line 164) | def filename_to_domain(filename):
  function is_ascii (line 170) | def is_ascii(word):
  function extract_meta_refresh (line 184) | def extract_meta_refresh(html):
  function to_valid_filename (line 207) | def to_valid_filename(s):
  function cache_disk (line 215) | def cache_disk(seconds=(86400 * 5), cache_folder="/tmp"):
  function print_duration (line 244) | def print_duration(method):
  function chunks (line 256) | def chunks(l, n):
  function purge (line 265) | def purge(fn, pattern):
  function clear_memo_cache (line 273) | def clear_memo_cache(source):
  function memoize_articles (line 283) | def memoize_articles(source, articles):
  function get_useragent (line 333) | def get_useragent():
  function get_available_languages (line 343) | def get_available_languages():
  function print_available_languages (line 354) | def print_available_languages():
  function extend_config (line 406) | def extend_config(config, config_items):

FILE: newspaper/videos/extractors.py
  class VideoExtractor (line 8) | class VideoExtractor(object):
    method __init__ (line 11) | def __init__(self, config, top_node):
    method get_embed_code (line 18) | def get_embed_code(self, node):
    method get_embed_type (line 23) | def get_embed_type(self, node):
    method get_width (line 26) | def get_width(self, node):
    method get_height (line 29) | def get_height(self, node):
    method get_src (line 32) | def get_src(self, node):
    method get_provider (line 35) | def get_provider(self, src):
    method get_video (line 42) | def get_video(self, node):
    method get_iframe_tag (line 54) | def get_iframe_tag(self, node):
    method get_video_tag (line 57) | def get_video_tag(self, node):
    method get_embed_tag (line 62) | def get_embed_tag(self, node):
    method get_object_tag (line 73) | def get_object_tag(self, node):
    method get_videos (line 100) | def get_videos(self):

FILE: newspaper/videos/videos.py
  class Video (line 4) | class Video(object):
    method __init__ (line 7) | def __init__(self):

FILE: tests/benchmarks.py
  class NullHandler (line 22) | class NullHandler(logging.Handler):
    method emit (line 23) | def emit(self, record):
  function naive_run (line 37) | def naive_run(urls):
  function mthread_run (line 47) | def mthread_run(urls):
  function asyncio_run (line 55) | def asyncio_run(urls):
  function benchmark (line 64) | def benchmark():

FILE: tests/unit_tests.py
  function print_test (line 33) | def print_test(method):
  function mock_resource_with (line 49) | def mock_resource_with(filename, resource_type):
  function get_base_domain (line 64) | def get_base_domain(url):
  function check_url (line 78) | def check_url(*args, **kwargs):
  class ExhaustiveFullTextCase (line 83) | class ExhaustiveFullTextCase(unittest.TestCase):
    method check_url (line 85) | def check_url(args):
    method test_exhaustive (line 119) | def test_exhaustive(self):
  class ArticleTestCase (line 146) | class ArticleTestCase(unittest.TestCase):
    method setup_stage (line 147) | def setup_stage(self, stage_name):
    method setUp (line 162) | def setUp(self):
    method test_url (line 170) | def test_url(self):
    method test_download_html (line 177) | def test_download_html(self):
    method test_meta_refresh_redirect (line 186) | def test_meta_refresh_redirect(self):
    method test_meta_refresh_no_url_redirect (line 199) | def test_meta_refresh_no_url_redirect(self):
    method test_pre_download_parse (line 210) | def test_pre_download_parse(self):
    method test_parse_html (line 217) | def test_parse_html(self):
    method test_meta_type_extraction (line 248) | def test_meta_type_extraction(self):
    method test_meta_extraction (line 255) | def test_meta_extraction(self):
    method test_pre_download_nlp (line 304) | def test_pre_download_nlp(self):
    method test_pre_parse_nlp (line 312) | def test_pre_parse_nlp(self):
    method test_nlp_body (line 319) | def test_nlp_body(self):
  class TestDownloadScheme (line 330) | class TestDownloadScheme(unittest.TestCase):
    method test_download_file_success (line 332) | def test_download_file_success(self):
    method test_download_file_failure (line 341) | def test_download_file_failure(self):
  class ContentExtractorTestCase (line 350) | class ContentExtractorTestCase(unittest.TestCase):
    method setUp (line 353) | def setUp(self):
    method _get_title (line 357) | def _get_title(self, html):
    method test_get_title_basic (line 361) | def test_get_title_basic(self):
    method test_get_title_split (line 365) | def test_get_title_split(self):
    method test_get_title_split_escaped (line 369) | def test_get_title_split_escaped(self):
    method test_get_title_quotes (line 373) | def test_get_title_quotes(self):
    method _get_canonical_link (line 378) | def _get_canonical_link(self, article_url, html):
    method test_get_canonical_link_rel_canonical (line 382) | def test_get_canonical_link_rel_canonical(self):
    method test_get_canonical_link_rel_canonical_absolute_url (line 387) | def test_get_canonical_link_rel_canonical_absolute_url(self):
    method test_get_canonical_link_og_url_absolute_url (line 393) | def test_get_canonical_link_og_url_absolute_url(self):
    method test_get_canonical_link_hostname_og_url_absolute_url (line 399) | def test_get_canonical_link_hostname_og_url_absolute_url(self):
    method test_get_top_image_from_meta (line 405) | def test_get_top_image_from_meta(self):
  class SourceTestCase (line 448) | class SourceTestCase(unittest.TestCase):
    method test_source_url_input_none (line 450) | def test_source_url_input_none(self):
    method test_source_build (line 456) | def test_source_build(self):
    method test_cache_categories (line 509) | def test_cache_categories(self):
  class UrlTestCase (line 525) | class UrlTestCase(unittest.TestCase):
    method test_valid_urls (line 527) | def test_valid_urls(self):
    method test_pubdate (line 549) | def test_pubdate(self):
    method test_prepare_url (line 574) | def test_prepare_url(self):
  class APITestCase (line 593) | class APITestCase(unittest.TestCase):
    method test_hot_trending (line 595) | def test_hot_trending(self):
    method test_popular_urls (line 601) | def test_popular_urls(self):
  class MThreadingTestCase (line 608) | class MThreadingTestCase(unittest.TestCase):
    method test_download_works (line 610) | def test_download_works(self):
  class ConfigBuildTestCase (line 633) | class ConfigBuildTestCase(unittest.TestCase):
    method test_article_default_params (line 639) | def test_article_default_params(self):
    method test_article_custom_params (line 648) | def test_article_custom_params(self):
    method test_source_default_params (line 657) | def test_source_default_params(self):
    method test_source_custom_params (line 665) | def test_source_custom_params(self):
  class MultiLanguageTestCase (line 674) | class MultiLanguageTestCase(unittest.TestCase):
    method test_chinese_fulltext_extract (line 676) | def test_chinese_fulltext_extract(self):
    method test_arabic_fulltext_extract (line 687) | def test_arabic_fulltext_extract(self):
    method test_spanish_fulltext_extract (line 700) | def test_spanish_fulltext_extract(self):
    method test_japanese_fulltext_extract (line 712) | def test_japanese_fulltext_extract(self):
    method test_japanese_fulltext_extract2 (line 723) | def test_japanese_fulltext_extract2(self):
    method test_thai_fulltext_extract (line 734) | def test_thai_fulltext_extract(self):
  class TestNewspaperLanguagesApi (line 745) | class TestNewspaperLanguagesApi(unittest.TestCase):
    method test_languages_api_call (line 747) | def test_languages_api_call(self):
  class TestDownloadPdf (line 751) | class TestDownloadPdf(unittest.TestCase):
    method test_article_pdf_ignoring (line 754) | def test_article_pdf_ignoring(self):
    method test_article_pdf_fetching (line 765) | def test_article_pdf_fetching(self):
Copy disabled (too large) Download .json
Condensed preview — 417 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,209K chars).
[
  {
    "path": ".gitattributes",
    "chars": 56,
    "preview": "docs/* linguist-documentation\ntests/* linguist-vendored\n"
  },
  {
    "path": ".gitignore",
    "chars": 348,
    "preview": "*.pyc\n\n.DS_Store\n.idea\n.pypirc\n\n# C extensions\n*.so\n\n# Packages\n*.egg\n*.egg-info\ndist\nbuild\n_build\neggs\nparts\nbin\nvar\nsd"
  },
  {
    "path": ".travis.yml",
    "chars": 236,
    "preview": "language: python\npython:\n - \"3.5\"\n - \"3.6\"\n - \"3.7\"\ninstall:\n - pip install -r requirements.txt coverage coveralls\n - py"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 19266,
    "preview": "# Change Log\n\n## [0.1.7](https://github.com/codelucas/newspaper/tree/0.1.7) (2016-01-30)\n[Full Changelog](https://github"
  },
  {
    "path": "GOOSE-LICENSE.txt",
    "chars": 10850,
    "preview": "\n                              Apache License\n                        Version 2.0, January 2004\n                     htt"
  },
  {
    "path": "LICENSE",
    "chars": 1080,
    "preview": "The MIT License (MIT)\n\nCopyright (c) 2013 Lucas Ou-Yang\n\nPermission is hereby granted, free of charge, to any person obt"
  },
  {
    "path": "MANIFEST.in",
    "chars": 135,
    "preview": "include requirements.txt README.rst LICENSE\rrecursive-include newspaper *\rrecursive-exclude * __pycache__\rrecursive-excl"
  },
  {
    "path": "README.rst",
    "chars": 11371,
    "preview": "Newspaper3k: Article scraping & curation\n========================================\n\n.. image:: https://badge.fury.io/py/n"
  },
  {
    "path": "docs/Makefile",
    "chars": 6774,
    "preview": "# Makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD "
  },
  {
    "path": "docs/_templates/sidebarintro.html",
    "chars": 689,
    "preview": "<p class=\"logo\">\n  <a href=\"{{ pathto(master_doc) }}\">\n    <img class=\"logo\" style=\"margin-right:40px;\" src=\"{{ pathto('"
  },
  {
    "path": "docs/_templates/sidebarlogo.html",
    "chars": 688,
    "preview": "<p class=\"logo\">\n  <a href=\"{{ pathto(master_doc) }}\">\n    <img class=\"logo\" style=\"margin-right:40px;\" src=\"{{ pathto('"
  },
  {
    "path": "docs/_themes/.gitignore",
    "chars": 22,
    "preview": "*.pyc\n*.pyo\n.DS_Store\n"
  },
  {
    "path": "docs/_themes/LICENSE",
    "chars": 1861,
    "preview": "Modifications:\n\nCopyright (c) 2011 Kenneth Reitz.\n\n\nOriginal Project:\n\nCopyright (c) 2010 by Armin Ronacher.\n\n\nSome righ"
  },
  {
    "path": "docs/_themes/README.rst",
    "chars": 743,
    "preview": "krTheme Sphinx Style\n====================\n\nThis repository contains sphinx styles Kenneth Reitz uses in most of\nhis proj"
  },
  {
    "path": "docs/_themes/flask_theme_support.py",
    "chars": 4875,
    "preview": "# flasky extensions.  flasky pygments style based on tango style\nfrom pygments.style import Style\nfrom pygments.token im"
  },
  {
    "path": "docs/_themes/kr/layout.html",
    "chars": 463,
    "preview": "{%- extends \"basic/layout.html\" %}\n\n{%- block extrahead %}\n  {{ super() }}\n  {% if theme_touch_icon %}\n    <link rel=\"ap"
  },
  {
    "path": "docs/_themes/kr/relations.html",
    "chars": 590,
    "preview": "<h3>Related Topics</h3>\n<ul>\n  <li><a href=\"{{ pathto(master_doc) }}\">Documentation overview</a><ul>\n  {%- for parent in"
  },
  {
    "path": "docs/_themes/kr/static/flasky.css_t",
    "chars": 8272,
    "preview": "/*\n * flasky.css_t\n * ~~~~~~~~~~~~\n *\n * :copyright: Copyright 2010 by Armin Ronacher. Modifications by Kenneth Reitz.\n "
  },
  {
    "path": "docs/_themes/kr/theme.conf",
    "chars": 121,
    "preview": "[theme]\ninherit = basic\nstylesheet = flasky.css\npygments_style = flask_theme_support.FlaskyStyle\n\n[options]\ntouch_icon ="
  },
  {
    "path": "docs/_themes/kr_small/layout.html",
    "chars": 683,
    "preview": "{% extends \"basic/layout.html\" %}\n{% block header %}\n  {{ super() }}\n  {% if pagename == 'index' %}\n  <div class=indexwr"
  },
  {
    "path": "docs/_themes/kr_small/static/flasky.css_t",
    "chars": 4587,
    "preview": "/*\n * flasky.css_t\n * ~~~~~~~~~~~~\n *\n * Sphinx stylesheet -- flasky theme based on nature theme.\n *\n * :copyright: Copy"
  },
  {
    "path": "docs/_themes/kr_small/theme.conf",
    "chars": 184,
    "preview": "[theme]\ninherit = basic\nstylesheet = flasky.css\nnosidebar = true\npygments_style = flask_theme_support.FlaskyStyle\n\n[opti"
  },
  {
    "path": "docs/conf.py",
    "chars": 8470,
    "preview": "# -*- coding: utf-8 -*-\n#\n# newspaper documentation build configuration file, created by\n# sphinx-quickstart on Sat Dec "
  },
  {
    "path": "docs/index.rst",
    "chars": 10304,
    "preview": "Newspaper3k: Article scraping & curation\n========================================\n\n.. image:: https://badge.fury.io/py/n"
  },
  {
    "path": "docs/make.bat",
    "chars": 6707,
    "preview": "@ECHO OFF\r\n\r\nREM Command file for Sphinx documentation\r\n\r\nif \"%SPHINXBUILD%\" == \"\" (\r\n\tset SPHINXBUILD=sphinx-build\r\n)\r\n"
  },
  {
    "path": "docs/user_guide/advanced.rst",
    "chars": 8325,
    "preview": ".. _advanced:\n\nAdvanced\n========\n\nThis section of the docs shows how to do some useful but advanced things\nwith newspape"
  },
  {
    "path": "docs/user_guide/api.rst",
    "chars": 38,
    "preview": ".. _api:\n\nNewspaper API\n=============\n"
  },
  {
    "path": "docs/user_guide/contributors.rst",
    "chars": 1433,
    "preview": ".. _contributors:\n\nContributors\n============\n\nMaintained and authored by:\n---------------------------\nLucas Ou-Yang -- h"
  },
  {
    "path": "docs/user_guide/quickstart.rst",
    "chars": 8848,
    "preview": ".. _quickstart:\n\nQuickstart\n==========\n\nEager to get started? This page gives a good introduction in how to get started\n"
  },
  {
    "path": "download_corpora.py",
    "chars": 657,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nDownloads the necessary NLTK models and corpora required to support\nall of newspaper's featu"
  },
  {
    "path": "newspaper/__init__.py",
    "chars": 811,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nWherever smart people work, doors are unlocked. -- Steve Wozniak\n\"\"\"\n__title__ = 'newspaper'"
  },
  {
    "path": "newspaper/api.py",
    "chars": 2631,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nIgnore the unused imports, this file's purpose is to make visible\nanything which a user migh"
  },
  {
    "path": "newspaper/article.py",
    "chars": 19563,
    "preview": "# -*- coding: utf-8 -*-\n__title__ = 'newspaper'\n__author__ = 'Lucas Ou-Yang'\n__license__ = 'MIT'\n__copyright__ = 'Copyri"
  },
  {
    "path": "newspaper/cleaners.py",
    "chars": 10419,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nHolds the code for cleaning out unwanted tags from the lxml\ndom xpath.\n\"\"\"\nimport copy\nfrom "
  },
  {
    "path": "newspaper/configuration.py",
    "chars": 4401,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nThis class holds configuration objects, which can be thought of\nas settings.py but dynamic a"
  },
  {
    "path": "newspaper/extractors.py",
    "chars": 42039,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nNewspaper uses much of python-goose's extraction code. View their license:\nhttps://github.co"
  },
  {
    "path": "newspaper/images.py",
    "chars": 7575,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nThe following image extraction implementation was taken from an old\ncopy of Reddit's source "
  },
  {
    "path": "newspaper/mthreading.py",
    "chars": 4074,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nAnything that has to do with threading in this library\nmust be abstracted in this file. If w"
  },
  {
    "path": "newspaper/network.py",
    "chars": 4291,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nAll code involving requests and responses over the http network\nmust be abstracted in this f"
  },
  {
    "path": "newspaper/nlp.py",
    "chars": 5911,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nAnything natural language related should be abstracted into this file.\n\"\"\"\n__title__ = 'news"
  },
  {
    "path": "newspaper/outputformatters.py",
    "chars": 6092,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nOutput formatting to text via lxml xpath nodes abstracted in this file.\n\"\"\"\n__title__ = 'new"
  },
  {
    "path": "newspaper/parsers.py",
    "chars": 7803,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nNewspaper uses a lot of python-goose's parsing code. View theirlicense:\nhttps://github.com/c"
  },
  {
    "path": "newspaper/resources/misc/google_sources.txt",
    "chars": 349280,
    "preview": "10news.com\r\n10tv.com\r\n11alive.com\r\n121carhire.com\r\n1340wgau.com\r\n13wham.com\r\n14wfie.com\r\n234next.com\r\n247wallst.com\r\n24d"
  },
  {
    "path": "newspaper/resources/misc/popular_sources.txt",
    "chars": 4264,
    "preview": "www.huffingtonpost.com\r\ncnn.com\r\nwww.time.com\r\nwww.ted.com\r\npandodaily.com\r\nwww.cnbc.com\r\nwww.mlb.com\r\nwww.pcmag.com\r\nww"
  },
  {
    "path": "newspaper/resources/misc/stopwords-nlp-en.txt",
    "chars": 2389,
    "preview": "-\n \n,\n.\na\ne\ni\no\nu\nt\nabout\nabove\nabove\nacross\nafter\nafterwards\nagain\nagainst\nall\nalmost\nalone\nalong\nalready\nalso\nalthough"
  },
  {
    "path": "newspaper/resources/misc/useragents.txt",
    "chars": 4241,
    "preview": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36\nMozilla"
  },
  {
    "path": "newspaper/resources/text/stopwords-ar.txt",
    "chars": 886,
    "preview": "فى\r\nفي\r\nكل\r\nلم\r\nلن\r\nله\r\nمن\r\nهو\r\nهي\r\nقوة\r\nكما\r\nلها\r\nمنذ\r\nوقد\r\nولا\r\nنفسه\r\nلقاء\r\nمقابل\r\nهناك\r\nوقال\r\nوكان\r\nنهاية\r\nوقالت\r\nوك"
  },
  {
    "path": "newspaper/resources/text/stopwords-be.txt",
    "chars": 516,
    "preview": "без\nболее\nбыл\nбыла\nбыли\nбыло\nбыть\nвам\nвас\nвесь\nвот\nвсе\nвсего\nвсех\nгде\nгэтыя\nдаже\nдля\nего\nесли\nесть\nеще\nже\nздесь\nили\nкак\n"
  },
  {
    "path": "newspaper/resources/text/stopwords-bg.txt",
    "chars": 1335,
    "preview": "а\nавтентичен\nаз\nако\nала\nбе\nбез\nбеше\nби\nбивш\nбивша\nбившо\nбил\nбила\nбили\nбило\nблагодаря\nблизо\nбъдат\nбъде\nбяха\nв\nвас\nваш\nваш"
  },
  {
    "path": "newspaper/resources/text/stopwords-da.txt",
    "chars": 477,
    "preview": "af\nalle\nandet\nandre\nat\nbegge\nda\nde\nden\ndenne\nder\nderes\ndet\ndette\ndig\ndin\ndog\ndu\nej\neller\nen\nend\nene\neneste\nenhver\net\nfem"
  },
  {
    "path": "newspaper/resources/text/stopwords-de.txt",
    "chars": 5877,
    "preview": "der\ndie\nund\nin\nden\nvon\nzu\nmit\nist\ndas\ndes\nim\nfür\nauf\nsich\ndem\nDie\nnicht\nein\neine\nals\nauch\nan\nes\ner\naus\nbei\nwerden\nsie\nna"
  },
  {
    "path": "newspaper/resources/text/stopwords-el.txt",
    "chars": 7464,
    "preview": "αγάπη\nαγιάζι\nαγορά\nαγώνα\nαγώνας\nαγώνες\nαγωνιστική\nαγωνιστικής\nάδεια\nαίμα\nαίτημα\nαίτηση\nαιτία\nαιώνα\nακόμα\nακόμη\nακριβώς\nα"
  },
  {
    "path": "newspaper/resources/text/stopwords-en.txt",
    "chars": 3585,
    "preview": "a's\nable\nabout\nabove\naccording\naccordingly\nacross\nactually\nafter\nafterwards\nagain\nagainst\nain't\nall\nallow\nallows\nalmost\n"
  },
  {
    "path": "newspaper/resources/text/stopwords-es.txt",
    "chars": 2101,
    "preview": "de\nla\nque\nel\nen\ny\na\nlos\ndel\nse\nlas\npor\nun\npara\ncon\nno\nuna\nsu\nal\nlo\ncomo\nmás\npero\nsus\nle\nya\no\neste\nsí\nporque\nesta\nentre\nc"
  },
  {
    "path": "newspaper/resources/text/stopwords-et.txt",
    "chars": 186,
    "preview": "ei\noma\njah\nnad\nkes\nmind\nning\nnii\nsa\nolen\nka\nmulle\nvõi\nte\nära\noled\nsest\nole\nmis\nolema\nsee\nseda\nmida\noli\nmul\non\nkõik\nminu\n"
  },
  {
    "path": "newspaper/resources/text/stopwords-fa.txt",
    "chars": 4257,
    "preview": "!\n,\n.\n:\n;\n،\n؛\n؟\nآباد\nآره\nآری\nآمد\nآمده\nآن\nآنان\nآنجا\nآنطور\nآنقدر\nآنكه\nآنها\nآنچه\nآنکه\nآورد\nآورده\nآيد\nآی\nآیا\nآیند\nاتفاقا\nاثر"
  },
  {
    "path": "newspaper/resources/text/stopwords-fi.txt",
    "chars": 441,
    "preview": "alla\nansiosta\nehkä\nei\nenemmän\nennen\netessa\nf\nhaikki\nhe\nhitaasti\nhoikein\nhyvin\nhän\nilman\nja\njos\njälkeen\nkanssa\nkaukana\nke"
  },
  {
    "path": "newspaper/resources/text/stopwords-fr.txt",
    "chars": 1978,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one or more\n# contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "newspaper/resources/text/stopwords-he.txt",
    "chars": 1032,
    "preview": "אני\nאת\nאתה\nאנחנו\nאתן\nאתם\nהם\nהן\nהיא\nהוא\nשלי\nשלו\nשלך\nשלה\nשלנו\nשלכם\nשלכן\nשלהם\nשלהן\nלי\nלו\nלה\nלנו\nלכם\nלכן\nלהם\nלהן\nאותה\nאותו\nז"
  },
  {
    "path": "newspaper/resources/text/stopwords-hi.txt",
    "chars": 1088,
    "preview": "अंदर\nअत\nअदि\nअप\nअपना\nअपनि\nअपनी\nअपने\nअभि\nअभी\nआदि\nइंहिं\nइंहें\nइंहों\nइतयादि\nइत्यादि\nइन\nइनका\nइन्हीं\nइन्हें\nइन्हों\nइस\nइसका\nइसक"
  },
  {
    "path": "newspaper/resources/text/stopwords-hr.txt",
    "chars": 840,
    "preview": "a\nako\nali\nbi\nbih\nbila\nbili\nbilo\nbio\nbismo\nbiste\nbiti\nbumo\nda\ndo\nduž\nga\nhoće\nhoćemo\nhoćete\nhoćeš\nhoću\ni\niako\nih\nili\niz\nja"
  },
  {
    "path": "newspaper/resources/text/stopwords-hu.txt",
    "chars": 2187,
    "preview": "a\ná\nahogy\nahol\naki\nakik\nakkor\nalatt\náltal\náltalában\namely\namelyek\namelyekben\namelyeket\namelyet\namelynek\nami\namit\namolyan"
  },
  {
    "path": "newspaper/resources/text/stopwords-id.txt",
    "chars": 10499,
    "preview": "a\nabad\nacara\naceh\nada\nadalah\nadanya\nadapun\nagak\nagaknya\nagama\nagar\nagustus\nair\nakan\nakankah\nakhir\nakhiri\nakhirnya\nakibat"
  },
  {
    "path": "newspaper/resources/text/stopwords-it.txt",
    "chars": 1681,
    "preview": "ad\nal\nallo\nai\nagli\nall\nagl\nalla\nalle\ncon\ncol\ncoi\nda\ndal\ndallo\ndai\ndagli\ndall\ndagl\ndalla\ndalle\ndi\ndel\ndello\ndei\ndegli\nde"
  },
  {
    "path": "newspaper/resources/text/stopwords-ja.txt",
    "chars": 424,
    "preview": "あそこ\nあっ\nあの\nあのかた\nあの人\nあり\nあります\nある\nあれ\nい\nいう\nいます\nいる\nう\nうち\nえ\nお\nおよび\nおり\nおります\nか\nかつて\nから\nが\nき\nここ\nこちら\nこと\nこの\nこれ\nこれら\nさ\nさらに\nし\nしかし\nする\nず\nせ\nせる"
  },
  {
    "path": "newspaper/resources/text/stopwords-ko.txt",
    "chars": 199,
    "preview": "을\n의\n에\n이\n를\n으로\n은\n는\n가\n로\n하고\n과\n에서\n도\n와\n이다\n고\n부터\n까지\n께\n에는\n이라고\n만\n라고\n보다\n에도\n다\n토록\n에게\n나\n대로\n에서는\n이나\n이며\n요\n든\n으로써\n같이\n로는\n밖에\n과의\n며\n로부터\n처럼\n아\n라\n"
  },
  {
    "path": "newspaper/resources/text/stopwords-lt.txt",
    "chars": 712,
    "preview": "taip\njas\nbe\nkito\ntaps\njuos\ndvi\nbūti\njo\nkita\njuo\nkokį\ngero\ntie\nmes\nbei\nsavo\nmūsų\nbus\nrodo\njame\nkam\nprie\nkada\nitin\nkuo\ntie"
  },
  {
    "path": "newspaper/resources/text/stopwords-mk.txt",
    "chars": 839,
    "preview": "а\nе\nод\nдо\nбез\nсо\nза\nна\nја\nго\nги\nниз\nисто\nистото\nпод\nнад\nда\nќе\nво\nнего\nнеа\nтој\nтаа\nтоа\nние\nвие\nтие\nкој\nкоја\nкои\nдали\nсе\nн"
  },
  {
    "path": "newspaper/resources/text/stopwords-nb.txt",
    "chars": 571,
    "preview": "alle\nandre\narbeid\nav\nbegge\nbort\nbra\nbruke\nda\ndenne\nder\nderes\ndet\ndin\ndisse\ndu\neller\nen\nene\neneste\nenhver\nenn\ner\net\nfolk\n"
  },
  {
    "path": "newspaper/resources/text/stopwords-nl.txt",
    "chars": 177,
    "preview": "aan\naf\nal\nals\nbij\ndan\ndat\ndie\ndit\neen\nen\ner\nhad\nheb\nhem\nhet\nhij\nhoe\nhun\nik\nin\nis\nje\nkan\nme\nmen\nmet\nmij\nnog\nnu\nof\nons\nook"
  },
  {
    "path": "newspaper/resources/text/stopwords-no.txt",
    "chars": 497,
    "preview": "at\nav\nde\nden\nder\ndet\ndu\nen\ner\net\nfor\nfra\nfør\nmed\nog\nom\nover\npå\nsom\ntil\nved\når\nalle\nbare\nble\nbort\nbra\nda\ndeg\ndem\ndenne\nde"
  },
  {
    "path": "newspaper/resources/text/stopwords-pl.txt",
    "chars": 1929,
    "preview": "a\naby\nach\nacz\naczkolwiek\naj\nalbo\nale\nalez\należ\nani\naz\naż\nbardziej\nbardzo\nbeda\nbedzie\nbez\ndeda\nbędą\nbede\nbędę\nbędzie\nbo\nb"
  },
  {
    "path": "newspaper/resources/text/stopwords-pt.txt",
    "chars": 3519,
    "preview": "a\nacerca\nadeus\nagora\nainda\nalem\nalgmas\nalgo\nalgumas\nalguns\nali\nalém\nambas\nambos\nano\nanos\nantes\nao\naonde\naos\napenas\napoio"
  },
  {
    "path": "newspaper/resources/text/stopwords-ro.txt",
    "chars": 1796,
    "preview": "acea\r\naceasta\r\naceastă\r\naceea\r\nacei\r\naceia\r\nacel\r\nacela\r\nacele\r\nacelea\r\nacest\r\nacesta\r\naceste\r\nacestea\r\naceşti\r\naceştia\r"
  },
  {
    "path": "newspaper/resources/text/stopwords-ru.txt",
    "chars": 2900,
    "preview": "а\r\nе\r\nи\r\nж\r\nм\r\nо\r\nна\r\nне\r\nни\r\nоб\r\nно\r\nон\r\nмне\r\nмои\r\nмож\r\nона\r\nони\r\nоно\r\nмной\r\nмного\r\nмногочисленное\r\nмногочисленная\r\nмно"
  },
  {
    "path": "newspaper/resources/text/stopwords-sl.txt",
    "chars": 2397,
    "preview": "a\nali\napril\navgust\nb\nbi\nbil\nbila\nbile\nbili\nbilo\nbiti\nblizu\nbo\nbodo\nbojo\nbolj\nbom\nbomo\nboste\nbova\nboš\nbrez\nc\ncel\ncela\ncel"
  },
  {
    "path": "newspaper/resources/text/stopwords-sr.txt",
    "chars": 747,
    "preview": "baš\nbez\nbiæe\nbio\nbiti\nblizu\nbroj\ndana\ndanas\ndoæi\ndobar\ndobiti\ndok\ndole\ndošao\ndrugi\nduž\ndva\nèesto\nèiji\ngde\ngore\nhvala\niæi"
  },
  {
    "path": "newspaper/resources/text/stopwords-sv.txt",
    "chars": 3756,
    "preview": "#-----------------------------------------------------------------------\n# translated\n#---------------------------------"
  },
  {
    "path": "newspaper/resources/text/stopwords-sw.txt",
    "chars": 407,
    "preview": "akasema\nalikuwa\nalisema\nbaada\nbasi\nbila\ncha\nchini\nhadi\nhapo\nhata\nhivyo\nhiyo\nhuku\nhuo\nili\nilikuwa\njuu\nkama\nkaribu\nkatika\n"
  },
  {
    "path": "newspaper/resources/text/stopwords-th.txt",
    "chars": 550,
    "preview": "กล่าว\nกว่า\nกัน\nกับ\nการ\nก็\nก่อน\nขณะ\nขอ\nของ\nขึ้น\nคง\nครั้ง\nความ\nคือ\nจะ\nจัด\nจาก\nจึง\nช่วง\nซึ่ง\nดัง\nด้วย\nด้าน\nตั้ง\nตั้งแต่\nตาม"
  },
  {
    "path": "newspaper/resources/text/stopwords-tr.txt",
    "chars": 1279,
    "preview": "acaba\naltmış\naltı\nama\nancak\narada\naslında\nayrıca\nbana\nbazı\nbelki\nben\nbenden\nbeni\nbenim\nberi\nbeş\nbile\nbin\nbir\nbirçok\nbiri"
  },
  {
    "path": "newspaper/resources/text/stopwords-uk.txt",
    "chars": 2206,
    "preview": "a\nб\nв\nж\nз\nу\nя\nє\nі\nаж\nви\nде\nдо\nза\nзі\nми\nна\nне\nну\nні\nпо\nта\nти\nто\nту\nті\nце\nцю\nця\nці\nчи\nще\nщо\nяк\nїй\nїм\nїх\nїї\nабо\nале\nбез\nбув"
  },
  {
    "path": "newspaper/resources/text/stopwords-vi.txt",
    "chars": 533,
    "preview": "nhận\nrằng\ncao\nnhà\nquá\nriêng\ngì\nmuốn\nrồi\nsố\nthấy\nhay\nlên\nlần\nnào\nqua\nbằng\nđiều\nbiết\nlớn\nkhác\nvừa\nnếu\nthời gian\nhọ\ntừng\nđâ"
  },
  {
    "path": "newspaper/resources/text/stopwords-zh.txt",
    "chars": 373,
    "preview": "的\r\n一\r\n不\r\n在\r\n人\r\n有\r\n是\r\n为\r\n以\r\n于\r\n上\r\n他\r\n而\r\n后\r\n之\r\n来\r\n及\r\n了\r\n因\r\n下\r\n可\r\n到\r\n由\r\n这\r\n与\r\n也\r\n此\r\n但\r\n并\r\n个\r\n其\r\n已\r\n无\r\n小\r\n我\r\n们\r\n起\r\n最\r\n再\r\n今\r\n"
  },
  {
    "path": "newspaper/settings.py",
    "chars": 1649,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nUnlike configuration.py, this file is meant for static, entire project\nencompassing settings"
  },
  {
    "path": "newspaper/source.py",
    "chars": 15336,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nSource objects abstract online news source websites & domains.\nwww.cnn.com would be its own "
  },
  {
    "path": "newspaper/text.py",
    "chars": 6046,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nStopword extraction and stopword classes.\n\"\"\"\n__title__ = 'newspaper'\n__author__ = 'Lucas Ou"
  },
  {
    "path": "newspaper/urls.py",
    "chars": 10222,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nNewspaper treats urls for news articles as critical components.\nHence, we have an entire mod"
  },
  {
    "path": "newspaper/utils.py",
    "chars": 12194,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nHolds misc. utility methods which prove to be\nuseful throughout this library.\n\"\"\"\n__title__ "
  },
  {
    "path": "newspaper/version.py",
    "chars": 296,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nTo change the version of entire package, just edit this one location.\n\"\"\"\n__title__ = 'newsp"
  },
  {
    "path": "newspaper/videos/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "newspaper/videos/extractors.py",
    "chars": 3793,
    "preview": "# -*- coding: utf-8 -*-\nfrom .videos import Video\n\nVIDEOS_TAGS = ['iframe', 'embed', 'object', 'video']\nVIDEO_PROVIDERS "
  },
  {
    "path": "newspaper/videos/videos.py",
    "chars": 421,
    "preview": "# -*- coding: utf-8 -*-\n\n\nclass Video(object):\n    \"\"\"Video object\n    \"\"\"\n    def __init__(self):\n        # type of emb"
  },
  {
    "path": "requirements.txt",
    "chars": 286,
    "preview": "beautifulsoup4>=4.4.1\ncssselect>=0.9.2\nfeedfinder2>=0.0.4\nfeedparser>=5.2.1\njieba3k>=0.35.1\nlxml>=3.6.0\nnltk>=3.2.1\nPill"
  },
  {
    "path": "setup.py",
    "chars": 1656,
    "preview": "#!/bin/python2.7\n# -*- coding: utf-8 -*-\n\"\"\"\nLucas Ou-Yang 2014 -- http://codelucas.com\n\"\"\"\n\nimport sys\nimport os\nimport"
  },
  {
    "path": "tests/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "tests/benchmarks.py",
    "chars": 1694,
    "preview": "# -*- coding: utf-8 -*-\n\"\"\"\nAsync IO vs multi-threading\n\nMulti-thread:           5.9 secs (10 threads) for 100 requests\n"
  },
  {
    "path": "tests/data/fulltext_domain_list.txt",
    "chars": 1203,
    "preview": "about.com\nal.com\nannarbor.com\napartmenttherapy.com\narchitecturaldigest.com\nblog.parsely.com\npixelmonkey.org\nbostonherald"
  },
  {
    "path": "tests/data/fulltext_url_list.txt",
    "chars": 13387,
    "preview": "http://bandb.about.com/od/illinois/fl/Galena-IL-A-BampB-Kinda-Town.htm?utm_source=google&utm_medium=social&utm_campaign="
  },
  {
    "path": "tests/data/html/247wallst.com1.html",
    "chars": 74832,
    "preview": "<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=\"ie6\" class=\"no-js ie6\" xmlns:fb=\"http://ogp.me/ns/fb#\"\n lang=\"en\">\n<![endif]-->"
  },
  {
    "path": "tests/data/html/247wallst.com2.html",
    "chars": 73593,
    "preview": "<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=\"ie6\" class=\"no-js ie6\" xmlns:fb=\"http://ogp.me/ns/fb#\"\n lang=\"en\">\n<![endif]-->"
  },
  {
    "path": "tests/data/html/about.com1.html",
    "chars": 92033,
    "preview": "<!DOCTYPE html>\n\n<html class=\"no-js\"\n\tdata-ab=\"99,99,99,99\"\n\tdata-articleId=\"2f53fe24dc0064e02f005f51\" \n\tdata-ch=\"travel"
  },
  {
    "path": "tests/data/html/about.com2.html",
    "chars": 100677,
    "preview": "<!DOCTYPE html>\n\n<html class=\"no-js\"\n\tdata-ab=\"99,99,99,99\"\n\tdata-articleId=\"3152ce7a100001c2bb001a9f\" \n\tdata-ch=\"health"
  },
  {
    "path": "tests/data/html/adoption.com1.html",
    "chars": 43160,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n<title>What Not to Say to a Birth Mom or Adoptee | Adoption.com </title>\n<meta charset=\"UT"
  },
  {
    "path": "tests/data/html/al.com1.html",
    "chars": 69366,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/al.com2.html",
    "chars": 98772,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/ap_meta_refresh.html",
    "chars": 40849,
    "preview": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n<html>\n<head>\n    <title>News"
  },
  {
    "path": "tests/data/html/apartmenttherapy.com1.html",
    "chars": 68847,
    "preview": "<!DOCTYPE html>\n<html class=\"no-js\" lang=\"en\">\n<head>\n  <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\" />\n  \n\n  <t"
  },
  {
    "path": "tests/data/html/apartmenttherapy.com2.html",
    "chars": 93708,
    "preview": "<!DOCTYPE html>\n<html class=\"no-js\" lang=\"en\">\n<head>\n  <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\" />\n  \n\n  <t"
  },
  {
    "path": "tests/data/html/arabic_article.html",
    "chars": 26701,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n<html lang=\"ar\"><"
  },
  {
    "path": "tests/data/html/architecturaldigest.com1.html",
    "chars": 76535,
    "preview": "<!DOCTYPE HTML><!--[if lt IE 7 ]><html class=\"ie6\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\" xmlns:og"
  },
  {
    "path": "tests/data/html/architecturaldigest.com2.html",
    "chars": 65193,
    "preview": "<!DOCTYPE HTML><!--[if lt IE 7 ]><html class=\"ie6\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\" xmlns:og"
  },
  {
    "path": "tests/data/html/avclub.com1.html",
    "chars": 92356,
    "preview": "<!DOCTYPE html>\n<html>\n    <head>\n        <script src=\"//cdn.optimizely.com/js/742480062.js\"></script>\n        \n        "
  },
  {
    "path": "tests/data/html/avclub.com2.html",
    "chars": 61499,
    "preview": "<!DOCTYPE html>\n<html>\n    <head>\n        <script src=\"//cdn.optimizely.com/js/742480062.js\"></script>\n        \n        "
  },
  {
    "path": "tests/data/html/backstage.com1.html",
    "chars": 35693,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "tests/data/html/backstage.com2.html",
    "chars": 45156,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "tests/data/html/bhg.com1.html",
    "chars": 69103,
    "preview": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<!DOCTYPE html>\n    <!--[if lt IE 7 ]><html lang=\"en\" class=\"lt-ie9 ie6 no-js\"><![endif]-->\n "
  },
  {
    "path": "tests/data/html/bhg.com2.html",
    "chars": 53363,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n<html xmlns:fb=\"h"
  },
  {
    "path": "tests/data/html/bloomberg.com1.html",
    "chars": 130807,
    "preview": "<!DOCTYPE html>\n<html itemscope itemtype='http://schema.org/Article' lang='en' xmlns:fb='http://www.facebook.com/2008/fb"
  },
  {
    "path": "tests/data/html/bostonherald.com1.html",
    "chars": 74115,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa 1.0//EN\"\n  \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\">\n<html xmlns=\"h"
  },
  {
    "path": "tests/data/html/bostonherald.com2.html",
    "chars": 67455,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa 1.0//EN\"\n  \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\">\n<html xmlns=\"h"
  },
  {
    "path": "tests/data/html/businessinsider.com1.html",
    "chars": 117018,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\" xmlns:fb=\"http://www.facebo"
  },
  {
    "path": "tests/data/html/businessinsider.com2.html",
    "chars": 118356,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\" xmlns:fb=\"http://www.facebo"
  },
  {
    "path": "tests/data/html/businessweek.com1.html",
    "chars": 76450,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 7]> <html class=\"ie6 no-js light_layout\" lang=\"en\"> <![endif]-->\n<!--[if IE 7]>    <html c"
  },
  {
    "path": "tests/data/html/businessweek.com2.html",
    "chars": 72896,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 7]> <html class=\"ie6 no-js light_layout\" lang=\"en\"> <![endif]-->\n<!--[if IE 7]>    <html c"
  },
  {
    "path": "tests/data/html/chinese_article.html",
    "chars": 74410,
    "preview": "<!DOCTYPE html\n  PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xm"
  },
  {
    "path": "tests/data/html/cleveland.com1.html",
    "chars": 69583,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/cleveland.com2.html",
    "chars": 74967,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/cnn_article.html",
    "chars": 75475,
    "preview": "<!DOCTYPE HTML>\n<html itemscope itemtype=\"http://schema.org/NewsArticle\" lang=\"en-US\">\n<head>\n<title>After storm, foreca"
  },
  {
    "path": "tests/data/html/cnn_main_site.html",
    "chars": 138466,
    "preview": "\n<!DOCTYPE HTML>\n<html lang=\"en-US\">\n<head>\n<title>CNN.com International - Breaking, World, Business, Sports, Entertainm"
  },
  {
    "path": "tests/data/html/cntraveler.com1.html",
    "chars": 144085,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=utf-8 />\n<meta name=\"viewport\" content=\"width=device-width, initial-scale="
  },
  {
    "path": "tests/data/html/cntraveler.com2.html",
    "chars": 145959,
    "preview": "<!DOCTYPE html>\n<html>\n<head>\n  <meta charset=utf-8 />\n<meta name=\"viewport\" content=\"width=device-width, initial-scale="
  },
  {
    "path": "tests/data/html/coolhunting.com1.html",
    "chars": 67713,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns:fb=\"http://ogp.me/ns/fb#\">\n  <head>\n    <meta charset=\"utf-8\">\n<script type=\"text/"
  },
  {
    "path": "tests/data/html/coolhunting.com2.html",
    "chars": 67713,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\" xmlns:fb=\"http://ogp.me/ns/fb#\">\n  <head>\n    <meta charset=\"utf-8\">\n<script type=\"text/"
  },
  {
    "path": "tests/data/html/cricket.com.au1.html",
    "chars": 82572,
    "preview": "\r\n<!DOCTYPE html>\r\n<html lang=\"en\">\r\n<head>\r\n    <!--START NINEMSN KIT - LOADER BLOCK 1-->\r\n<script type=\"text/javascrip"
  },
  {
    "path": "tests/data/html/cricket.com.au2.html",
    "chars": 59332,
    "preview": "\r\n<!DOCTYPE html>\r\n<html lang=\"en\">\r\n<head>\r\n    <!--START NINEMSN KIT - LOADER BLOCK 1-->\r\n<script type=\"text/javascrip"
  },
  {
    "path": "tests/data/html/dailycaller.com1.html",
    "chars": 77324,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 9]> <html class=\"no-js lt-ie10 lt-ie9\" lang=\"en-US\" xmlns:fb=\"http://ogp.me/ns/fb#\"> <![en"
  },
  {
    "path": "tests/data/html/dailycaller.com2.html",
    "chars": 98624,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 9]> <html class=\"no-js lt-ie10 lt-ie9\" lang=\"en-US\" xmlns:fb=\"http://ogp.me/ns/fb#\"> <![en"
  },
  {
    "path": "tests/data/html/dailystar.co.uk1.html",
    "chars": 120227,
    "preview": "<!DOCTYPE html><!--OVOLABS_1 START-->\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en\"> <![endif]-->"
  },
  {
    "path": "tests/data/html/dailystar.co.uk2.html",
    "chars": 100794,
    "preview": "<!DOCTYPE html><!--OVOLABS_1 START-->\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en\"> <![endif]-->"
  },
  {
    "path": "tests/data/html/dallasnews.com1.html",
    "chars": 45211,
    "preview": "\r\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dt"
  },
  {
    "path": "tests/data/html/dallasnews.com2.html",
    "chars": 52935,
    "preview": "\r\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dt"
  },
  {
    "path": "tests/data/html/details.com1.html",
    "chars": 89107,
    "preview": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    \n            \n        \n\n\n\n\n\n\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Trans"
  },
  {
    "path": "tests/data/html/details.com2.html",
    "chars": 120083,
    "preview": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    \n            \n        \n\n\n\n\n\n\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Trans"
  },
  {
    "path": "tests/data/html/elle.com1.html",
    "chars": 223074,
    "preview": "<!doctype html>\n<!--[if lt IE 7 ]> <html lang=\"en\" class=\"no-js ie6\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:fb=\"http"
  },
  {
    "path": "tests/data/html/elle.com2.html",
    "chars": 145712,
    "preview": "<!doctype html>\n<!--[if lt IE 7 ]> <html lang=\"en\" class=\"no-js ie6\" xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:fb=\"http"
  },
  {
    "path": "tests/data/html/flavorwire.com1.html",
    "chars": 87756,
    "preview": "<!doctype html public>\n<!--[if lt IE 7]> <html lang=\"en-us\" class=\"lt-ie9 lt-ie8 lt-ie7\"> <![endif]-->\n<!--[if IE 7]>   "
  },
  {
    "path": "tests/data/html/flavorwire.com2.html",
    "chars": 94516,
    "preview": "<!doctype html public>\n<!--[if lt IE 7]> <html lang=\"en-us\" class=\"lt-ie9 lt-ie8 lt-ie7\"> <![endif]-->\n<!--[if IE 7]>   "
  },
  {
    "path": "tests/data/html/fool.com1.html",
    "chars": 147131,
    "preview": "\r\n\r\n<!DOCTYPE html>\r\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:fb=\"http://www.facebook.com/2008/fbml\" xmlns:og=\"h"
  },
  {
    "path": "tests/data/html/fool.com2.html",
    "chars": 749,
    "preview": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"\"http://www.w3.org/TR/html4/strict.dtd\">\r\n<HTML><HEAD><TITLE>Forbidden<"
  },
  {
    "path": "tests/data/html/foxbusiness.com1.html",
    "chars": 36653,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:og=\"http://opengraphprotocol.org/schema/\" xml:lang=\"en\""
  },
  {
    "path": "tests/data/html/foxbusiness.com2.html",
    "chars": 31182,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:og=\"http://opengraphprotocol.org/schema/\" xml:lang=\"en\""
  },
  {
    "path": "tests/data/html/foxnews.com1.html",
    "chars": 58153,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:og=\"http://opengraphprotocol.org/schema/\" xml:lang=\"en\""
  },
  {
    "path": "tests/data/html/foxnews.com2.html",
    "chars": 53040,
    "preview": "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:og=\"http://opengraphprotocol.org/schema/\" xml:lang=\"en\""
  },
  {
    "path": "tests/data/html/glamour.com1.html",
    "chars": 75008,
    "preview": "<!DOCTYPE html>\n<html class=\"no-js\" lang=\"en-us\">\n<head>\n    <meta charset=\"utf-8\">\n<meta name=\"viewport\" content=\"width"
  },
  {
    "path": "tests/data/html/glamour.com2.html",
    "chars": 76672,
    "preview": "<!DOCTYPE html>\n<html class=\"no-js\" lang=\"en-us\">\n<head>\n    <meta charset=\"utf-8\">\n<meta name=\"viewport\" content=\"width"
  },
  {
    "path": "tests/data/html/globalnews.ca1.html",
    "chars": 118496,
    "preview": "<!DOCTYPE html>\n<!--[if IE 6]><html class=\"is_ie6 is_ie\" lang=\"en\"><![endif]-->\n<!--[if IE 7]><html class=\"is_ie7 is_ie\""
  },
  {
    "path": "tests/data/html/globalnews.ca2.html",
    "chars": 114125,
    "preview": "<!DOCTYPE html>\n<!--[if IE 6]><html class=\"is_ie6 is_ie\" lang=\"en\"><![endif]-->\n<!--[if IE 7]><html class=\"is_ie7 is_ie\""
  },
  {
    "path": "tests/data/html/google_meta_refresh.html",
    "chars": 570,
    "preview": "<script>window.googleJavaScriptRedirect = 1</script>\n<script>var n = {\n    navigateTo: function (b, a, d) {\n        if ("
  },
  {
    "path": "tests/data/html/gq.com1.html",
    "chars": 118672,
    "preview": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        \n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n\n\n\n    \n    \n "
  },
  {
    "path": "tests/data/html/gq.com2.html",
    "chars": 130603,
    "preview": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        \n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n\n\n\n    \n    \n "
  },
  {
    "path": "tests/data/html/graziadaily.co.uk1.html",
    "chars": 42011,
    "preview": "\t<!DOCTYPE html>\n<!--[if IE 8]><html lang=\"en\" class=\"no-js lt-ie10 lt-ie9\"><![endif]-->\n<!--[if IE 9]><html lang=\"en\" c"
  },
  {
    "path": "tests/data/html/graziadaily.co.uk2.html",
    "chars": 45915,
    "preview": "\t<!DOCTYPE html>\n<!--[if IE 8]><html lang=\"en\" class=\"no-js lt-ie10 lt-ie9\"><![endif]-->\n<!--[if IE 9]><html lang=\"en\" c"
  },
  {
    "path": "tests/data/html/gulflive.com1.html",
    "chars": 55567,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/gulflive.com2.html",
    "chars": 89318,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "tests/data/html/huffingtonpost.com1.html",
    "chars": 190377,
    "preview": "<!DOCTYPE html>\n\t\n<!--  Mobile redirect: 80 -->\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\n\n\n\n\n\n\n\n<!--[if lt IE 7]>  "
  },
  {
    "path": "tests/data/html/japanese_article.html",
    "chars": 126778,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "tests/data/html/japanese_article2.html",
    "chars": 33812,
    "preview": "\n\n<!doctype html>\n<!--[if IE]><html class=\"ie\" lang=\"ja\" prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article:"
  },
  {
    "path": "tests/data/html/lifebuzz.com1.html",
    "chars": 57025,
    "preview": "<!doctype html>\n<html lang=\"en-US\" prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#\">\n<head>\n  <meta charset=\"utf-"
  },
  {
    "path": "tests/data/html/lifebuzz.com2.html",
    "chars": 65302,
    "preview": "<!doctype html>\n<html lang=\"en-US\" prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#\">\n<head>\n  <meta charset=\"utf-"
  },
  {
    "path": "tests/data/html/livescience.com1.html",
    "chars": 42546,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n       \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitio"
  },
  {
    "path": "tests/data/html/livescience.com2.html",
    "chars": 48930,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n       \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitio"
  },
  {
    "path": "tests/data/html/mashable.com1.html",
    "chars": 31416,
    "preview": "<!DOCTYPE html>\n<!--\no o     o     +              o\n+   +     +             o     +       +\n            +\no  +    +     "
  },
  {
    "path": "tests/data/html/mashable.com2.html",
    "chars": 36230,
    "preview": "<!DOCTYPE html>\n<!--\no o     o     +              o\n+   +     +             o     +       +\n            +\no  +    +     "
  },
  {
    "path": "tests/data/html/mlive.com1.html",
    "chars": 84772,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/mlive.com2.html",
    "chars": 74765,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/newyorker.com1.html",
    "chars": 80005,
    "preview": "<!doctype html>\n<!--[if IE 7]> <html class=\"no-js oldie ie8 ie7\" lang=\"en\"> <![endif]-->\n<!--[if IE 8]> <html class=\"no-"
  },
  {
    "path": "tests/data/html/nj.com1.html",
    "chars": 103822,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/nola.com1.html",
    "chars": 80273,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/nydailynews.com1.html",
    "chars": 68350,
    "preview": " <!DOCTYPE html>\r\n<!--NEW-->\r\n\r\n<!--- www pageHead.vm --->\r\n<!--- mode = www --->\r\n<!--- URI = /new-york/brooklyn/boyfri"
  },
  {
    "path": "tests/data/html/nypost.com1.html",
    "chars": 99230,
    "preview": "<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=\"ie6\" lang=\"en\">\n<![endif]-->\n<!--[if IE 7]>\n<html id=\"ie7\" lang=\"en\">\n<![endif]"
  },
  {
    "path": "tests/data/html/nypost.com2.html",
    "chars": 96125,
    "preview": "<!DOCTYPE html>\n<!--[if IE 6]>\n<html id=\"ie6\" lang=\"en\">\n<![endif]-->\n<!--[if IE 7]>\n<html id=\"ie7\" lang=\"en\">\n<![endif]"
  },
  {
    "path": "tests/data/html/ok.co.uk1.html",
    "chars": 60913,
    "preview": "<!DOCTYPE html>\n<!--OVOLABS_1 START-->\n<!--[if IEMobile 7]><html class=\"iem7\"  lang=\"en\" dir=\"ltr\" prefix=\"http://ogp.me"
  },
  {
    "path": "tests/data/html/ok.co.uk2.html",
    "chars": 60309,
    "preview": "<!DOCTYPE html>\n<!--OVOLABS_1 START-->\n<!--[if IEMobile 7]><html class=\"iem7\"  lang=\"en\" dir=\"ltr\" prefix=\"http://ogp.me"
  },
  {
    "path": "tests/data/html/oregonlive.com1.html",
    "chars": 80953,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/oregonlive.com2.html",
    "chars": 67962,
    "preview": "<!doctype html>\n<!--[if lt IE 7]> <html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"en-US\" xmlns:fb=\"http://www.facebook.co"
  },
  {
    "path": "tests/data/html/parsely.com1.html",
    "chars": 47555,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 7 ]><html lang=\"en-US\" class=\"no-js ie ie6 lte7 lte8 lte9\"><![endif]-->\n<!--[if IE 7 ]><ht"
  },
  {
    "path": "tests/data/html/parsely.com2.html",
    "chars": 55510,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 7 ]><html lang=\"en-US\" class=\"no-js ie ie6 lte7 lte8 lte9\"><![endif]-->\n<!--[if IE 7 ]><ht"
  },
  {
    "path": "tests/data/html/pe.com1.html",
    "chars": 56775,
    "preview": "  \n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.d"
  },
  {
    "path": "tests/data/html/pewresearch.org1.html",
    "chars": 65817,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 7 ]> <html class=\"ie ie6\" lang=\"en-US\"> <![endif]-->\n<!--[if IE 7 ]>    <html class=\"ie ie"
  },
  {
    "path": "tests/data/html/pewresearch.org2.html",
    "chars": 35093,
    "preview": "<!DOCTYPE html>\n<!--[if lt IE 7 ]> <html class=\"ie ie6\" lang=\"en-US\"> <![endif]-->\n<!--[if IE 7 ]>    <html class=\"ie ie"
  },
  {
    "path": "tests/data/html/pixable.com1.html",
    "chars": 286407,
    "preview": "<!DOCTYPE html>\n<!-- \"|| $CMSContentType === 'topic'\" was added to show something if pixable 1.0 user opens branded page"
  },
  {
    "path": "tests/data/html/pixable.com2.html",
    "chars": 313861,
    "preview": "<!DOCTYPE html>\n<!-- \"|| $CMSContentType === 'topic'\" was added to show something if pixable 1.0 user opens branded page"
  },
  {
    "path": "tests/data/html/pixelmonkey.org1.html",
    "chars": 34647,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "tests/data/html/pixelmonkey.org2.html",
    "chars": 30297,
    "preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
  },
  {
    "path": "tests/data/html/readwrite.com1.html",
    "chars": 6649,
    "preview": "<!DOCTYPE html>\n\n\n<!--[if lt IE 7]><html class=\"lt-ie9 lt-ie8 lt-ie7\" id=\"phx-wrapper\"><![endif]-->\n<!--[if IE 7]><html "
  },
  {
    "path": "tests/data/html/recipe.com1.html",
    "chars": 67451,
    "preview": "<!DOCTYPE HTML>\n        <!--[if lt IE 7 ]><html lang=\"en\" class=\"ie6 no-js\"><![endif]-->\n    <!--[if IE 7 ]><html lang=\""
  },
  {
    "path": "tests/data/html/recipe.com2.html",
    "chars": 44781,
    "preview": "\n<!DOCTYPE html>\n<!--[if lt IE 7 ]><html lang=\"en\" class=\"ie6 no-js\"><![endif]-->\n<!--[if IE 7 ]><html lang=\"en\" class=\""
  },
  {
    "path": "tests/data/html/reuters.com1.html",
    "chars": 99378,
    "preview": "<!--[if !IE]> This has been served from cache <![endif]-->\n<!--[if !IE]> Request served from apache server: S263585RGSF2"
  },
  {
    "path": "tests/data/html/reuters.com2.html",
    "chars": 108601,
    "preview": "<!--[if !IE]> This has been served from cache <![endif]-->\n<!--[if !IE]> Request served from apache server: S263585RGSF0"
  }
]

// ... and 217 more files (download for full content)

About this extraction

This page contains the full source code of the codelucas/newspaper GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 417 files (14.5 MB), approximately 3.8M tokens, and a symbol index with 405 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!