Repository: code4craft/webmagic
Branch: develop
Commit: 67816a19d68a
Files: 310
Total size: 1.0 MB
Directory structure:
gitextract_m56n222u/
├── .gitignore
├── .travis.yml
├── LICENSE
├── README-zh.md
├── README.md
├── pom.xml
├── src/
│ └── site/
│ └── site.xml
├── webmagic-core/
│ ├── README.md
│ ├── module_webmagic-core.xml
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ └── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── Page.java
│ │ ├── Request.java
│ │ ├── ResultItems.java
│ │ ├── Site.java
│ │ ├── Spider.java
│ │ ├── SpiderListener.java
│ │ ├── SpiderScheduler.java
│ │ ├── Task.java
│ │ ├── downloader/
│ │ │ ├── AbstractDownloader.java
│ │ │ ├── CustomRedirectStrategy.java
│ │ │ ├── Downloader.java
│ │ │ ├── HttpClientDownloader.java
│ │ │ ├── HttpClientGenerator.java
│ │ │ ├── HttpClientRequestContext.java
│ │ │ ├── HttpUriRequestConverter.java
│ │ │ └── package.html
│ │ ├── model/
│ │ │ └── HttpRequestBody.java
│ │ ├── package.html
│ │ ├── pipeline/
│ │ │ ├── CollectorPipeline.java
│ │ │ ├── ConsolePipeline.java
│ │ │ ├── FilePipeline.java
│ │ │ ├── Pipeline.java
│ │ │ ├── ResultItemsCollectorPipeline.java
│ │ │ └── package.html
│ │ ├── processor/
│ │ │ ├── PageProcessor.java
│ │ │ ├── SimplePageProcessor.java
│ │ │ ├── example/
│ │ │ │ ├── BaiduBaikePageProcessor.java
│ │ │ │ ├── GithubRepoPageProcessor.java
│ │ │ │ └── ZhihuPageProcessor.java
│ │ │ └── package.html
│ │ ├── proxy/
│ │ │ ├── Proxy.java
│ │ │ ├── ProxyProvider.java
│ │ │ └── SimpleProxyProvider.java
│ │ ├── scheduler/
│ │ │ ├── DuplicateRemovedScheduler.java
│ │ │ ├── MonitorableScheduler.java
│ │ │ ├── PriorityScheduler.java
│ │ │ ├── QueueScheduler.java
│ │ │ ├── Scheduler.java
│ │ │ ├── component/
│ │ │ │ ├── DuplicateRemover.java
│ │ │ │ ├── HashSetDuplicateRemover.java
│ │ │ │ └── package.html
│ │ │ └── package.html
│ │ ├── selector/
│ │ │ ├── AbstractSelectable.java
│ │ │ ├── AndSelector.java
│ │ │ ├── BaseElementSelector.java
│ │ │ ├── CssSelector.java
│ │ │ ├── ElementSelector.java
│ │ │ ├── Html.java
│ │ │ ├── HtmlNode.java
│ │ │ ├── Json.java
│ │ │ ├── JsonPathSelector.java
│ │ │ ├── LinksSelector.java
│ │ │ ├── OrSelector.java
│ │ │ ├── PlainText.java
│ │ │ ├── RegexResult.java
│ │ │ ├── RegexSelector.java
│ │ │ ├── ReplaceSelector.java
│ │ │ ├── Selectable.java
│ │ │ ├── Selector.java
│ │ │ ├── Selectors.java
│ │ │ ├── SmartContentSelector.java
│ │ │ ├── XpathSelector.java
│ │ │ └── package.html
│ │ ├── thread/
│ │ │ └── CountableThreadPool.java
│ │ └── utils/
│ │ ├── BaseSelectorUtils.java
│ │ ├── CharsetUtils.java
│ │ ├── Experimental.java
│ │ ├── FilePersistentBase.java
│ │ ├── HttpClientUtils.java
│ │ ├── HttpConstant.java
│ │ ├── NumberUtils.java
│ │ ├── ProxyUtils.java
│ │ ├── UrlUtils.java
│ │ ├── WMCollections.java
│ │ └── package.html
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── HtmlTest.java
│ │ ├── RequestTest.java
│ │ ├── ResultItemsTest.java
│ │ ├── SiteTest.java
│ │ ├── SpiderTest.java
│ │ ├── downloader/
│ │ │ ├── HttpClientDownloaderTest.java
│ │ │ ├── HttpUriRequestConverterTest.java
│ │ │ ├── MockGithubDownloader.java
│ │ │ └── SSLCompatibilityTest.java
│ │ ├── example/
│ │ │ └── GithubRepoPageProcessorTest.java
│ │ ├── pipeline/
│ │ │ └── FilePipelineTest.java
│ │ ├── processor/
│ │ │ └── PageProcessorTest.java
│ │ ├── proxy/
│ │ │ ├── ProxyTest.java
│ │ │ └── SimpleProxyProviderTest.java
│ │ ├── scheduler/
│ │ │ ├── DuplicateRemovedSchedulerTest.java
│ │ │ └── PrioritySchedulerTest.java
│ │ ├── selector/
│ │ │ ├── AndSelectorTest.java
│ │ │ ├── CssSelectorTest.java
│ │ │ ├── ExtractorsTest.java
│ │ │ ├── JsonPathSelectorTest.java
│ │ │ ├── JsonTest.java
│ │ │ ├── LinksSelectorTest.java
│ │ │ ├── OrSelectorTest.java
│ │ │ ├── RegexSelectorTest.java
│ │ │ └── SelectorTest.java
│ │ └── utils/
│ │ ├── CharsetUtilsTest.java
│ │ ├── NumberUtilsTest.java
│ │ └── UrlUtilsTest.java
│ └── resources/
│ ├── html/
│ │ └── mock-github.html
│ └── log4j2-test.xml
├── webmagic-coverage/
│ └── pom.xml
├── webmagic-extension/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ ├── MultiPageModel.java
│ │ │ ├── SimpleHttpClient.java
│ │ │ ├── configurable/
│ │ │ │ ├── ConfigurablePageProcessor.java
│ │ │ │ ├── ExpressionType.java
│ │ │ │ └── ExtractRule.java
│ │ │ ├── downloader/
│ │ │ │ └── PhantomJSDownloader.java
│ │ │ ├── example/
│ │ │ │ ├── AppStore.java
│ │ │ │ ├── BaiduBaike.java
│ │ │ │ ├── GithubRepo.java
│ │ │ │ ├── GithubRepoApi.java
│ │ │ │ ├── GithubRepoPageMapper.java
│ │ │ │ ├── MonitorExample.java
│ │ │ │ ├── OschinaBlog.java
│ │ │ │ └── PatternProcessorExample.java
│ │ │ ├── handler/
│ │ │ │ ├── CompositePageProcessor.java
│ │ │ │ ├── CompositePipeline.java
│ │ │ │ ├── PatternProcessor.java
│ │ │ │ ├── PatternRequestMatcher.java
│ │ │ │ ├── RequestMatcher.java
│ │ │ │ ├── SubPageProcessor.java
│ │ │ │ └── SubPipeline.java
│ │ │ ├── model/
│ │ │ │ ├── AfterExtractor.java
│ │ │ │ ├── ConsolePageModelPipeline.java
│ │ │ │ ├── Extractor.java
│ │ │ │ ├── FieldExtractor.java
│ │ │ │ ├── HasKey.java
│ │ │ │ ├── ModelPageProcessor.java
│ │ │ │ ├── ModelPipeline.java
│ │ │ │ ├── OOSpider.java
│ │ │ │ ├── PageMapper.java
│ │ │ │ ├── PageModelCollectorPipeline.java
│ │ │ │ ├── PageModelExtractor.java
│ │ │ │ ├── annotation/
│ │ │ │ │ ├── ComboExtract.java
│ │ │ │ │ ├── ExtractBy.java
│ │ │ │ │ ├── ExtractByUrl.java
│ │ │ │ │ ├── Formatter.java
│ │ │ │ │ ├── HelpUrl.java
│ │ │ │ │ ├── TargetUrl.java
│ │ │ │ │ └── package.html
│ │ │ │ ├── fields/
│ │ │ │ │ ├── MultipleField.java
│ │ │ │ │ ├── PageField.java
│ │ │ │ │ └── SingleField.java
│ │ │ │ ├── formatter/
│ │ │ │ │ ├── BasicClassDetector.java
│ │ │ │ │ ├── BasicTypeFormatter.java
│ │ │ │ │ ├── DateFormatter.java
│ │ │ │ │ ├── ObjectFormatter.java
│ │ │ │ │ ├── ObjectFormatterBuilder.java
│ │ │ │ │ └── ObjectFormatters.java
│ │ │ │ ├── package.html
│ │ │ │ └── sources/
│ │ │ │ ├── Source.java
│ │ │ │ └── SourceTextExtractor.java
│ │ │ ├── monitor/
│ │ │ │ ├── SpiderMonitor.java
│ │ │ │ ├── SpiderStatus.java
│ │ │ │ └── SpiderStatusMXBean.java
│ │ │ ├── pipeline/
│ │ │ │ ├── CollectorPageModelPipeline.java
│ │ │ │ ├── FilePageModelPipeline.java
│ │ │ │ ├── JsonFilePageModelPipeline.java
│ │ │ │ ├── JsonFilePipeline.java
│ │ │ │ ├── MultiPagePipeline.java
│ │ │ │ └── PageModelPipeline.java
│ │ │ ├── scheduler/
│ │ │ │ ├── BloomFilterDuplicateRemover.java
│ │ │ │ ├── FileCacheQueueScheduler.java
│ │ │ │ ├── RedisPriorityScheduler.java
│ │ │ │ └── RedisScheduler.java
│ │ │ └── utils/
│ │ │ ├── ClassUtils.java
│ │ │ ├── DoubleKeyMap.java
│ │ │ ├── ExtractorUtils.java
│ │ │ ├── IPUtils.java
│ │ │ ├── MultiKeyMapBase.java
│ │ │ └── RequestUtils.java
│ │ └── resources/
│ │ ├── crawl.js
│ │ └── spider-config-draft.xml
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── MockPageModelPipeline.java
│ │ ├── MockPipeline.java
│ │ ├── SimpleHttpClientTest.java
│ │ ├── configurable/
│ │ │ └── ConfigurablePageProcessorTest.java
│ │ ├── downloader/
│ │ │ └── MockGithubDownloader.java
│ │ ├── formatter/
│ │ │ └── DateFormatterTest.java
│ │ ├── model/
│ │ │ ├── BaseRepo.java
│ │ │ ├── GithubRepo.java
│ │ │ ├── GithubRepoApi.java
│ │ │ ├── GithubRepoTest.java
│ │ │ ├── ModelPageProcessorTest.java
│ │ │ ├── PageMapperTest.java
│ │ │ ├── PageMocker.java
│ │ │ └── PageModelExtractorTest.java
│ │ ├── monitor/
│ │ │ ├── CustomSpiderStatus.java
│ │ │ ├── CustomSpiderStatusMXBean.java
│ │ │ ├── SeedUrlWithPortTest.java
│ │ │ └── SpiderMonitorTest.java
│ │ ├── processor/
│ │ │ └── GithubRepoProcessor.java
│ │ ├── scheduler/
│ │ │ ├── BloomFilterDuplicateRemoverTest.java
│ │ │ ├── RedisPrioritySchedulerTest.java
│ │ │ └── RedisSchedulerTest.java
│ │ └── utils/
│ │ ├── IPUtilsTest.java
│ │ └── RequestUtilsTest.java
│ └── resources/
│ ├── html/
│ │ ├── mock-github.html
│ │ └── mock-webmagic.html
│ ├── json/
│ │ └── mock-githubrepo.json
│ └── log4j2-test.xml
├── webmagic-samples/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ ├── main/
│ │ │ │ └── QuickStarter.java
│ │ │ ├── model/
│ │ │ │ └── samples/
│ │ │ │ ├── BaiduNews.java
│ │ │ │ ├── Blog.java
│ │ │ │ ├── DianpingFtlDataScanner.java
│ │ │ │ ├── GithubRepo.java
│ │ │ │ ├── IteyeBlog.java
│ │ │ │ ├── JokejiModel.java
│ │ │ │ ├── Kr36NewsModel.java
│ │ │ │ ├── News163.java
│ │ │ │ ├── OschinaAnswer.java
│ │ │ │ ├── OschinaBlog.java
│ │ │ │ └── QQMeishi.java
│ │ │ ├── recover/
│ │ │ │ ├── DuplicateStorageRemover.java
│ │ │ │ ├── MmapQueueScheduler.java
│ │ │ │ └── RecoverSample.java
│ │ │ └── samples/
│ │ │ ├── AlexanderMcqueenGoodsProcessor.java
│ │ │ ├── AmanzonPageProcessor.java
│ │ │ ├── AngularJSProcessor.java
│ │ │ ├── DiandianBlogProcessor.java
│ │ │ ├── DiaoyuwengProcessor.java
│ │ │ ├── F58PageProcesser.java
│ │ │ ├── GithubRepo.java
│ │ │ ├── GithubRepoPageProcessor.java
│ │ │ ├── HuxiuProcessor.java
│ │ │ ├── InfoQMiniBookProcessor.java
│ │ │ ├── IteyeBlogProcessor.java
│ │ │ ├── KaichibaProcessor.java
│ │ │ ├── MamacnPageProcessor.java
│ │ │ ├── MeicanProcessor.java
│ │ │ ├── NjuBBSProcessor.java
│ │ │ ├── PhantomJSPageProcessor.java
│ │ │ ├── QzoneBlogProcessor.java
│ │ │ ├── SinaBlogProcessor.java
│ │ │ ├── TianyaPageProcesser.java
│ │ │ ├── ZhihuPageProcessor.java
│ │ │ ├── formatter/
│ │ │ │ └── StringTemplateFormatter.java
│ │ │ ├── pipeline/
│ │ │ │ ├── OneFilePipeline.java
│ │ │ │ └── ReplacePipeline.java
│ │ │ └── scheduler/
│ │ │ ├── DelayQueueScheduler.java
│ │ │ ├── LevelLimitScheduler.java
│ │ │ └── ZipCodePageProcessor.java
│ │ └── resources/
│ │ ├── crawl.js
│ │ └── log4j2.xml
│ └── test/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ ├── SpiderTest.java
│ ├── model/
│ │ └── ProcessorBenchmark.java
│ ├── processor/
│ │ └── SinablogProcessorTest.java
│ └── samples/
│ └── scheduler/
│ └── DelayQueueSchedulerTest.java
├── webmagic-saxon/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ └── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ └── selector/
│ │ ├── JaxpSelectorUtils.java
│ │ ├── NodeSelector.java
│ │ └── Xpath2Selector.java
│ └── test/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ └── selector/
│ └── XpathSelectorTest.java
├── webmagic-scripts/
│ ├── README.md
│ ├── deploy.sh
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── groovy/
│ │ │ └── Github.groovy
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ └── scripts/
│ │ │ ├── Params.java
│ │ │ ├── ScriptConsole.java
│ │ │ ├── ScriptEnginePool.java
│ │ │ ├── ScriptProcessor.java
│ │ │ ├── ScriptProcessorBuilder.java
│ │ │ ├── config/
│ │ │ │ ├── CommandLineOption.java
│ │ │ │ └── ConfigLogger.java
│ │ │ └── languages/
│ │ │ ├── JRuby.java
│ │ │ ├── Javascript.java
│ │ │ ├── Jython.java
│ │ │ └── Language.java
│ │ ├── kotlin/
│ │ │ └── Github.kt
│ │ └── resources/
│ │ ├── js/
│ │ │ ├── defines.js
│ │ │ ├── github.js
│ │ │ └── oschina.js
│ │ ├── python/
│ │ │ ├── defines.py
│ │ │ └── oschina.py
│ │ └── ruby/
│ │ ├── defines.rb
│ │ ├── github.rb
│ │ └── oschina.rb
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ └── scripts/
│ │ └── ScriptProcessorTest.java
│ └── resources/
│ └── log4j2-test.xml
└── webmagic-selenium/
├── README.md
├── config.ini
├── pom.xml
└── src/
├── main/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ └── downloader/
│ └── selenium/
│ ├── SeleniumDownloader.java
│ └── WebDriverPool.java
└── test/
├── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ ├── downloader/
│ │ ├── SeleniumTest.java
│ │ └── selenium/
│ │ ├── SeleniumDownloaderTest.java
│ │ └── WebDriverPoolTest.java
│ └── samples/
│ ├── GooglePlayProcessor.java
│ └── HuabanProcessor.java
└── resources/
└── config.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# https://github.com/takari/maven-wrapper#usage-without-binary-jar
.mvn/wrapper/maven-wrapper.jar
# Eclipse m2e generated files
# Eclipse Core
.project
# JDT-specific (Eclipse Java Development Tools)
.classpath
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# PyDev specific (Python IDE for Eclipse)
*.pydevproject
# CDT-specific (C/C++ Development Tooling)
.cproject
# CDT- autotools
.autotools
# Java annotation processor (APT)
.factorypath
# PDT-specific (PHP Development Tools)
.buildpath
# sbteclipse plugin
.target
# Tern plugin
.tern-project
# TeXlipse plugin
.texlipse
# STS (Spring Tool Suite)
.springBeans
# Code Recommenders
.recommenders/
# Annotation Processing
.apt_generated/
.apt_generated_test/
# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet
# Uncomment this line if you wish to ignore the project description file.
# Typically, this file would be tracked if it contains build/dependency configurations:
#.project
================================================
FILE: .travis.yml
================================================
language: java
jdk:
- openjdk9
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, "control" means (i) the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.
"Object" form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.
2. Grant of Copyright License.
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.
3. Grant of Patent License.
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.
4. Redistribution.
You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that You
changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.
5. Submission of Contributions.
Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.
6. Trademarks.
This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.
7. Disclaimer of Warranty.
Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.
8. Limitation of Liability.
In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability.
While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work
To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets "{}" replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.
Copyright 2025 code4craft
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README-zh.md
================================================

[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
[](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
官方网站[http://webmagic.io/](http://webmagic.io/)
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
webmagic的主要特色:
* 完全模块化的设计,强大的可扩展性。
* 核心简单但是涵盖爬虫的全部流程,灵活而强大,也是学习爬虫入门的好材料。
* 提供丰富的抽取页面API。
* 无配置,但是可通过POJO+注解形式实现一个爬虫。
* 支持多线程。
* 支持分布式。
* 支持爬取js动态渲染的页面。
* 无框架依赖,可以灵活的嵌入到项目中去。
webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
Java爬虫 **Spiderman** [http://git.oschina.net/l-weiwei/spiderman](http://git.oschina.net/l-weiwei/spiderman)
webmagic的github地址:[https://github.com/code4craft/webmagic](https://github.com/code4craft/webmagic)。
## 快速开始
### 使用maven
webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic:
```xml
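<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>${webmagic.version}</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>${webmagic.version}</version>
</dependency>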
```
WebMagic 使用slf4j-log4j12作为slf4j的实现。如果你自己定制了slf4j的实现,请在项目中去掉此依赖。
```xml
<exclusions>
    <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
</exclusions>
```
#### 项目结构
webmagic主要包括两个包:
* **webmagic-core**
webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
* **webmagic-extension**
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:
* **webmagic-saxon**
webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。
* **webmagic-selenium**
webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。
在项目中,你可以根据需要依赖不同的包。
### 不使用maven
在项目的**lib**目录下,有依赖的所有jar包,直接在IDE里import即可。
### 第一个爬虫
#### 定制PageProcessor
PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码:
```java
public class OschinaBlogPageProcessor implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net");
@Override
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog")
.addPipeline(new ConsolePipeline()).run();
}
}
```
这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。
Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。
执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。
#### 使用注解
webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcessor完全相同:
```java
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {
@ExtractBy("//title")
private String title;
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
private String content;
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
public static void main(String[] args) {
OOSpider.create(
Site.me(),
new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run();
}
}
```
这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。
### 详细文档
见[http://webmagic.io/docs/](http://webmagic.io/docs/)。
### 示例
webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。
webmagic的使用可以参考:[oschina openapi 应用:博客搬家](https://git.oschina.net/yashin/MoveBlog)
### 协议
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
### 邮件组:
Gmail:
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java)
QQ:
[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988)
### QQ群:
373225642(已满) 542327088
### 相关项目:
[Gather Platform](https://github.com/gsh199449/spider)
Gather Platform 数据抓取平台是一套基于Webmagic内核的,具有Web任务配置和任务管理界面的数据采集与搜索平台。
================================================
FILE: README.md
================================================

[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
[](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
>A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction and persistence. It can simplify the development of a specific crawler.
## Features:
* Simple core with high flexibility.
* Simple API for html extracting.
* Annotation with POJO to customize a crawler, no configuration.
* Multi-thread and Distribution support.
* Easy to be integrated.
## Install:
Add dependencies to your pom.xml:
```xml
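<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>${webmagic.version}</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>${webmagic.version}</version>
</dependency>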
```
WebMagic uses slf4j with the slf4j-log4j12 implementation. If you have customized your slf4j implementation, please exclude slf4j-log4j12.
```xml
<exclusions>
    <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
</exclusions>
```
## Get Started:
### First crawler:
Write a class that implements PageProcessor. For example, here is a crawler that collects GitHub repository information.
```java
public class GithubRepoPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
}
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
}
}
```
* `page.addTargetRequests(links)`
Add urls for crawling.
You can also use the annotation-based approach:
```java
@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl("https://github.com/\\w+")
public class GithubRepo {
@ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
private String author;
@ExtractBy("//div[@id='readme']/tidyText()")
private String readme;
public static void main(String[] args) {
OOSpider.create(Site.me().setSleepTime(1000)
, new ConsolePageModelPipeline(), GithubRepo.class)
.addUrl("https://github.com/code4craft").thread(5).run();
}
}
```
### Docs and samples:
Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/)
The architecture of webmagic (referred to [Scrapy](http://scrapy.org/))

There are more examples in `webmagic-samples` package.
### License:
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
### Thanks:
To write webmagic, I referred to the projects below:
* **Scrapy**
A crawler framework in Python.
[http://scrapy.org/](http://scrapy.org/)
* **Spiderman**
Another crawler framework in Java.
[http://git.oschina.net/l-weiwei/spiderman](http://git.oschina.net/l-weiwei/spiderman)
### Mail-list:
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java)
[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988)
QQ Group: 373225642 542327088
### Related Project
* [Gather Platform](https://github.com/gsh199449/spider)
A web console based on WebMagic for Spider configuration and management.
================================================
FILE: pom.xml
================================================
4.0.0org.oxerroxerr-parent2.3.1us.codecraft1.0.4-SNAPSHOTpomUTF-8UTF-811113.23.11.5.04.42.14.03.12.02.0.19.graal3.0.1332.0.0-jre2.294.5.134.4.153.7.19.4.12.12.9.05.10.21.10.22.7.32.23.12.0.2-beta1.3.01.2.012.44.14.12.0.44.0.0.RELEASE0.3.5webmagicwebmagic
A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content
extraction and persistence. It can simplify the development of a specific crawler.
https://github.com/code4craft/webmagic/code4craftYihua huangcode4crafer@gmail.comyuanyLigang Yaoligang.yao@answers.comscm:git:git@github.com:code4craft/webmagic.gitscm:git:git@github.com:code4craft/webmagic.gitgit@github.com:code4craft/webmagic.gitWebMagic-${project.version}Apache License, Version 2.0http://www.apache.org/licenses/LICENSE-2.0webmagic-corewebmagic-extension/webmagic-scripts/webmagic-seleniumwebmagic-saxonwebmagic-sampleswebmagic-coverageorg.apache.logging.log4jlog4j-coretestorg.apache.logging.log4jlog4j-slf4j2-impltestorg.junit.jupiterjunit-jupiter-enginetestorg.junit.vintagejunit-vintage-enginetestorg.junit.platformjunit-platform-launchertestorg.junit.platformjunit-platform-runnertestorg.mockitomockito-all${mockito-all.version}testorg.apache.httpcomponentshttpclient${httpclient.version}org.apache.httpcomponentshttpcore${httpcore.version}org.apache.logging.log4jlog4j-core${log4j2.version}org.apache.logging.log4jlog4j-slf4j2-impl${log4j2.version}com.google.guavaguava${guava.version}com.jayway.jsonpathjson-path${json-path.version}org.junit.jupiterjunit-jupiter-engine${junit.version}org.junit.vintagejunit-vintage-engine${junit.version}org.junit.platformjunit-platform-launcher${junit.platform.version}org.junit.platformjunit-platform-runner${junit.platform.version}org.slf4jslf4j-api${slf4j.version}us.codecraftxsoup0.3.7com.alibabafastjson${fastjson.version}com.github.dreamheadmoco-core${moco.version}testorg.slf4jslf4j-simpleorg.assertjassertj-core${assertj.version}testorg.apache.commonscommons-lang3${commons-lang3.version}org.apache.commonscommons-collections4${commons-collections4.version}commons-iocommons-io${commons-io.version}org.codehaus.groovygroovy-all${groovy-all.version}org.jrubyjruby${jruby.version}org.pythonjython${jython.version}org.seleniumhq.seleniumselenium-java${selenium-java.version}net.sf.saxonSaxon-HE${saxon-he.version}net.sourceforge.htmlcleanerhtmlcleaner${htmlcleaner.version}com.github.detrophantomjsdriver${phantomjsdriver.version}co
mmons-clicommons-cli${commons-cli.version}redis.clientsjedis${jedis.version}org.apache.maven.pluginsmaven-javadoc-pluginWebMagic ${project.version}en_USfalseaggregateaggregatesiteattach-javadocsjarorg.jacocojacoco-maven-pluginprepare-agentreportverifyreportcom.amashchenko.maven.plugingitflow-maven-pluginWebMagic-
================================================
FILE: src/site/site.xml
================================================
org.apache.maven.skinsmaven-fluido-skin1.11.1truetruetruepull-right
================================================
FILE: webmagic-core/README.md
================================================
webmagic-core
-------
webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
================================================
FILE: webmagic-core/module_webmagic-core.xml
================================================
================================================
FILE: webmagic-core/pom.xml
================================================
us.codecraftwebmagic1.0.4-SNAPSHOT4.0.0webmagic-coreorg.apache.httpcomponentshttpclientorg.apache.commonscommons-lang3us.codecraftxsoupcom.github.dreamheadmoco-coreorg.slf4jslf4j-apiorg.mockitomockito-allorg.apache.commonscommons-collections4org.assertjassertj-corecommons-iocommons-iocom.jayway.jsonpathjson-pathcom.alibabafastjson
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
================================================
package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* Object storing extracted result and urls to fetch.
* Not thread safe.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
*
* @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Page {
private Request request;
private ResultItems resultItems = new ResultItems();
private Html html;
private Json json;
private String rawText;
private Selectable url;
private Map> headers;
private int statusCode;
private boolean downloadSuccess;
private byte[] bytes;
private List targetRequests = new ArrayList<>();
private String charset;
/**
* Returns a {@link Page} with {@link #downloadSuccess} is {@code true},
* and {@link #request} is specified.
*
* @param request the request.
* @since 1.0.2
*/
public static Page ofSuccess(Request request) {
return new Page(request, true);
}
/**
* Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
* and {@link #request} is specified.
*
* @param request the request.
* @return the page, flagged as a failed download.
* @since 1.0.2
*/
public static Page ofFailure(Request request) {
return new Page(request, false);
}
public Page() {
}
/**
 * Constructs a {@link Page} with {@link #request}
 * and {@link #downloadSuccess} specified.
 * The request is also handed to the {@link ResultItems} of this page, the same
 * way {@link #setRequest(Request)} does, so that {@link #getResultItems()}
 * reports the request the page was built for.
 *
 * @param request the request.
 * @param downloadSuccess the download success flag.
 * @since 1.0.2
 */
private Page(Request request, boolean downloadSuccess) {
    this.request = request;
    // resultItems is created by its field initializer, so it is already
    // available here; keep it in step with the page's own request.
    this.resultItems.setRequest(request);
    this.downloadSuccess = downloadSuccess;
}
/**
* Returns a {@link Page} with {@link #downloadSuccess} is {@code false}.
* The returned page carries no request, because this delegates to
* {@link #fail(Request)} with {@code null}.
*
* @return the page.
* @deprecated Use {@link #ofFailure(Request)} instead; {@link #fail(Request)} is deprecated as well.
*/
@Deprecated
public static Page fail() {
return fail(null);
}
/**
* Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
* and {@link #request} is specified.
*
* @param request the {@link Request}.
* @return the page.
* @since 0.10.0
* @deprecated Use {@link #ofFailure(Request)} instead.
*/
@Deprecated(since = "1.0.2", forRemoval = true)
public static Page fail(Request request){
Page page = new Page();
page.setRequest(request);
page.setDownloadSuccess(false);
return page;
}
public Page setSkip(boolean skip) {
resultItems.setSkip(skip);
return this;
}
/**
* store extract results
*
* @param key key
* @param field field
*/
public void putField(String key, Object field) {
resultItems.put(key, field);
}
/**
* get html content of page
*
* @return html
*/
public Html getHtml() {
if (html == null) {
html = new Html(rawText, request.getUrl());
}
return html;
}
/**
* get json content of page
*
* @return json
* @since 0.5.0
*/
public Json getJson() {
if (json == null) {
json = new Json(rawText);
}
return json;
}
/**
* @param html html
* @deprecated since 0.4.0
* The HTML is parsed lazily on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
*/
@Deprecated
public void setHtml(Html html) {
this.html = html;
}
public List getTargetRequests() {
return targetRequests;
}
/**
* add urls to fetch
*
* @param requests requests
*/
public void addTargetRequests(Iterable requests) {
addTargetRequests(requests, 0); // Default priority is 0
}
/**
* add urls to fetch
*
* @param requests requests
* @param priority priority
*/
public void addTargetRequests(Iterable requests, long priority) {
if(requests == null) {
return;
}
for (String req : requests) {
addRequestIfValid(req, priority);
}
}
/**
 * Helper method to add a request if it's valid.
 * Blank urls, the bare anchor {@code "#"} and {@code javascript:} links are
 * ignored. Every other url is canonicalized against the url of this page and
 * queued as a new {@link Request} carrying the given priority.
 *
 * @param url URL to add
 * @param priority Priority for the URL; negative values are kept as given
 */
private void addRequestIfValid(String url, long priority) {
    if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
        return;
    }
    String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
    Request req = new Request(canonicalizedUrl);
    // Always apply the priority: a new Request already has priority 0, so this is
    // a no-op for the default, while a "priority > 0" guard would silently turn
    // a negative priority into 0.
    req.setPriority(priority);
    targetRequests.add(req);
}
/**
 * add url to fetch
 * The url goes through the same validation as {@link #addTargetRequests(Iterable)}:
 * blank values, {@code "#"} and {@code javascript:} links are ignored, anything
 * else is canonicalized against the url of this page and queued with the
 * default priority.
 *
 * @param requestString requestString
 */
public void addTargetRequest(String requestString) {
    // Share the single-url path of the bulk methods so both entry points
    // accept and reject exactly the same urls.
    addRequestIfValid(requestString, 0);
}
/**
* add requests to fetch
*
* @param request request
*/
public void addTargetRequest(Request request) {
targetRequests.add(request);
}
/**
* get url of current page
*
* @return url of current page
*/
public Selectable getUrl() {
return url;
}
public void setUrl(Selectable url) {
this.url = url;
}
/**
* get request of current page
*
* @return request
*/
public Request getRequest() {
return request;
}
public void setRequest(Request request) {
this.request = request;
this.resultItems.setRequest(request);
}
public ResultItems getResultItems() {
return resultItems;
}
public int getStatusCode() {
return statusCode;
}
public void setStatusCode(int statusCode) {
this.statusCode = statusCode;
}
public String getRawText() {
return rawText;
}
public Page setRawText(String rawText) {
this.rawText = rawText;
return this;
}
public Map> getHeaders() {
return headers;
}
public void setHeaders(Map> headers) {
this.headers = headers;
}
public boolean isDownloadSuccess() {
return downloadSuccess;
}
public void setDownloadSuccess(boolean downloadSuccess) {
this.downloadSuccess = downloadSuccess;
}
public byte[] getBytes() {
return bytes;
}
public void setBytes(byte[] bytes) {
this.bytes = bytes;
}
public String getCharset() {
return charset;
}
public void setCharset(String charset) {
this.charset = charset;
}
@Override
public String toString() {
return "Page{" +
"request=" + request +
", resultItems=" + resultItems +
", html=" + html +
", json=" + json +
", rawText='" + rawText + '\'' +
", url=" + url +
", headers=" + headers +
", statusCode=" + statusCode +
", downloadSuccess=" + downloadSuccess +
", targetRequests=" + targetRequests +
", charset='" + charset + '\'' +
", bytes=" + Arrays.toString(bytes) +
'}';
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
================================================
package us.codecraft.webmagic;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
/**
* Object contains url to crawl.
* It contains some additional information.
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
private String url;
private String method;
private HttpRequestBody requestBody;
/**
* The downloader to use for this request.
*/
private Downloader downloader;
/**
* Store additional information in extras.
*/
private Map extras = new HashMap<>();
/**
* cookies for current url, if not set use Site's cookies
*/
private Map cookies = new HashMap();
private Map headers = new HashMap();
/**
* Priority of the request.
* The bigger will be processed earlier.
* @see us.codecraft.webmagic.scheduler.PriorityScheduler
*/
private long priority;
/**
* When it is set to TRUE, the downloader will not try to parse response body to text.
*
*/
private boolean binaryContent = false;
private String charset;
public Request() {
}
public Request(String url) {
this.url = url;
}
public long getPriority() {
return priority;
}
/**
* Set the priority of request for sorting.
* Need a scheduler supporting priority.
* @see us.codecraft.webmagic.scheduler.PriorityScheduler
*
* @param priority priority
* @return this
*/
@Experimental
public Request setPriority(long priority) {
this.priority = priority;
return this;
}
@SuppressWarnings("unchecked")
public T getExtra(String key) {
if (extras == null) {
return null;
}
return (T) extras.get(key);
}
public Request putExtra(String key, T value) {
extras.put(key, value);
return this;
}
public String getUrl() {
return url;
}
public Map getExtras() {
return Collections.unmodifiableMap(extras);
}
public Request setExtras(Map extras) {
this.extras.putAll(extras);
return this;
}
public Request setUrl(String url) {
this.url = url;
return this;
}
/**
* The HTTP method of the request. GET is used by default.
* @return httpMethod
* @see us.codecraft.webmagic.utils.HttpConstant.Method
* @since 0.5.0
*/
public String getMethod() {
return method;
}
public Request setMethod(String method) {
this.method = method;
return this;
}
/**
 * Hash code derived from the url and the http method only, matching
 * {@link #equals(Object)}.
 */
@Override
public int hashCode() {
    int urlHash = (url == null) ? 0 : url.hashCode();
    int methodHash = (method == null) ? 0 : method.hashCode();
    return 31 * urlHash + methodHash;
}
/**
 * Two requests are equal when they are of the same class and agree on both
 * url and http method; every other field is ignored.
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (o == null || getClass() != o.getClass()) {
        return false;
    }
    Request other = (Request) o;
    boolean sameUrl = (url == null) ? other.url == null : url.equals(other.url);
    if (!sameUrl) {
        return false;
    }
    return (method == null) ? other.method == null : method.equals(other.method);
}
public Request addCookie(String name, String value) {
cookies.put(name, value);
return this;
}
public Request addHeader(String name, String value) {
headers.put(name, value);
return this;
}
public Map getCookies() {
return cookies;
}
public Map getHeaders() {
return headers;
}
public HttpRequestBody getRequestBody() {
return requestBody;
}
public void setRequestBody(HttpRequestBody requestBody) {
this.requestBody = requestBody;
}
public boolean isBinaryContent() {
return binaryContent;
}
public Downloader getDownloader() {
return downloader;
}
public void setDownloader(Downloader downloader) {
this.downloader = downloader;
}
public Request setBinaryContent(boolean binaryContent) {
this.binaryContent = binaryContent;
return this;
}
public String getCharset() {
return charset;
}
public Request setCharset(String charset) {
this.charset = charset;
return this;
}
@Override
public String toString() {
return "Request{" +
"url='" + url + '\'' +
", method='" + method + '\'' +
", extras=" + extras +
", priority=" + priority +
", headers=" + headers +
", cookies="+ cookies+
'}';
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
================================================
package us.codecraft.webmagic;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* Object contains extract results.
* It is contained in Page and will be processed in pipeline.
*
* @author code4crafter@gmail.com
* @since 0.1.0
* @see Page
* @see us.codecraft.webmagic.pipeline.Pipeline
*/
public class ResultItems {
private Map fields = new LinkedHashMap();
private Request request;
private boolean skip;
/**
 * Looks up an extracted field by key.
 *
 * @param key key of the field
 * @return the stored value cast to the caller's expected type, or {@code null}
 *         when nothing (or {@code null}) is stored under the key
 */
@SuppressWarnings("unchecked")
public T get(String key) {
    // A missing key and a key mapped to null both come back as null,
    // so one lookup is enough.
    return (T) fields.get(key);
}
public Map getAll() {
return fields;
}
public ResultItems put(String key, T value) {
fields.put(key, value);
return this;
}
public Request getRequest() {
return request;
}
public ResultItems setRequest(Request request) {
this.request = request;
return this;
}
/**
* Whether to skip the result.
* Result which is skipped will not be processed by Pipeline.
*
* @return whether to skip the result
*/
public boolean isSkip() {
return skip;
}
/**
* Set whether to skip the result.
* Result which is skipped will not be processed by Pipeline.
*
* @param skip whether to skip the result
* @return this
*/
public ResultItems setSkip(boolean skip) {
this.skip = skip;
return this;
}
@Override
public String toString() {
return "ResultItems{" +
"fields=" + fields +
", request=" + request +
", skip=" + skip +
'}';
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
================================================
package us.codecraft.webmagic;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* Object contains setting for crawler.
*
* @author code4crafter@gmail.com
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Site {
private String domain;
private String userAgent;
private Map defaultCookies = new LinkedHashMap();
private Map> cookies = new HashMap>();
private String charset;
private String defaultCharset;
private int sleepTime = 5000;
private int retryTimes = 0;
private int cycleRetryTimes = 0;
private int retrySleepTime = 1000;
private int timeOut = 5000;
private static final Set DEFAULT_STATUS_CODE_SET = new HashSet();
private Set acceptStatCode = DEFAULT_STATUS_CODE_SET;
private Map headers = new HashMap();
private boolean useGzip = true;
private boolean disableCookieManagement = false;
static {
DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
}
/**
* new a Site
*
* @return new site
*/
public static Site me() {
return new Site();
}
/**
* Add a cookie with domain {@link #getDomain()}
*
* @param name name
* @param value value
* @return this
*/
public Site addCookie(String name, String value) {
defaultCookies.put(name, value);
return this;
}
/**
 * Add a cookie with specific domain.
 * The per-domain cookie map is created on first use.
 *
 * @param domain domain
 * @param name name
 * @param value value
 * @return this
 */
public Site addCookie(String domain, String name, String value) {
    // One lookup instead of containsKey + put + get.
    cookies.computeIfAbsent(domain, key -> new HashMap<>()).put(name, value);
    return this;
}
/**
* set user agent
*
* @param userAgent userAgent
* @return this
*/
public Site setUserAgent(String userAgent) {
this.userAgent = userAgent;
return this;
}
/**
* get cookies
*
* @return get cookies
*/
public Map getCookies() {
return defaultCookies;
}
/**
* get cookies of all domains
*
* @return get cookies
*/
public Map> getAllCookies() {
return cookies;
}
/**
* get user agent
*
* @return user agent
*/
public String getUserAgent() {
return userAgent;
}
/**
* get domain
*
* @return get domain
*/
public String getDomain() {
return domain;
}
/**
* set the domain of site.
*
* @param domain domain
* @return this
*/
public Site setDomain(String domain) {
this.domain = domain;
return this;
}
/**
* Set charset of page manually.
* When charset is not set or set to null, it can be auto detected by Http header.
*
* @param charset charset
* @return this
*/
public Site setCharset(String charset) {
this.charset = charset;
return this;
}
/**
* get charset set manually
*
* @return charset
*/
public String getCharset() {
return charset;
}
/**
* Set default charset of page.
*
* When charset detection fails, this default charset is used.
*
* @param defaultCharset the default charset
* @return this
* @since 0.9.0
*/
public Site setDefaultCharset(String defaultCharset) {
this.defaultCharset = defaultCharset;
return this;
}
/**
* The default charset used when charset detection fails.
*
* @return the default charset
* @since 0.9.0
*/
public String getDefaultCharset() {
return defaultCharset;
}
public int getTimeOut() {
return timeOut;
}
/**
* set timeout for downloader in ms
*
* @param timeOut timeOut
* @return this
*/
public Site setTimeOut(int timeOut) {
this.timeOut = timeOut;
return this;
}
/**
* Set acceptStatCode.
* When status code of http response is in acceptStatCodes, it will be processed.
* {200} by default.
* It does not have to be set explicitly.
*
* @param acceptStatCode acceptStatCode
* @return this
*/
public Site setAcceptStatCode(Set acceptStatCode) {
this.acceptStatCode = acceptStatCode;
return this;
}
/**
* get acceptStatCode
*
* @return acceptStatCode
*/
public Set getAcceptStatCode() {
return acceptStatCode;
}
/**
* Set the interval between the processing of two pages.
* Time unit is milliseconds.
*
* @param sleepTime sleepTime
* @return this
*/
public Site setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
/**
* Get the interval between the processing of two pages.
* Time unit is milliseconds.
*
* @return the interval between the processing of two pages,
*/
public int getSleepTime() {
return sleepTime;
}
/**
* Get retry times immediately when download fail, 0 by default.
*
* @return retry times when download fail
*/
public int getRetryTimes() {
return retryTimes;
}
public Map getHeaders() {
return headers;
}
/**
* Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
*
* @param key key of http header, there are some keys constant in {@link HttpConstant.Header}
* @param value value of header
* @return this
*/
public Site addHeader(String key, String value) {
headers.put(key, value);
return this;
}
/**
* Set retry times when download fail, 0 by default.
*
* @param retryTimes retryTimes
* @return this
*/
public Site setRetryTimes(int retryTimes) {
this.retryTimes = retryTimes;
return this;
}
/**
* When cycleRetryTimes is more than 0, it will add back to scheduler and try download again.
*
* @return retry times when download fail
*/
public int getCycleRetryTimes() {
return cycleRetryTimes;
}
/**
* Set cycleRetryTimes times when download fail, 0 by default.
*
* @param cycleRetryTimes cycleRetryTimes
* @return this
*/
public Site setCycleRetryTimes(int cycleRetryTimes) {
this.cycleRetryTimes = cycleRetryTimes;
return this;
}
public boolean isUseGzip() {
return useGzip;
}
public int getRetrySleepTime() {
return retrySleepTime;
}
/**
* Set retry sleep times when download fail, 1000 by default.
*
* @param retrySleepTime retrySleepTime
* @return this
*/
public Site setRetrySleepTime(int retrySleepTime) {
this.retrySleepTime = retrySleepTime;
return this;
}
/**
* Whether use gzip.
* Default is true, you can set it to false to disable gzip.
*
* @param useGzip useGzip
* @return this
*/
public Site setUseGzip(boolean useGzip) {
this.useGzip = useGzip;
return this;
}
public boolean isDisableCookieManagement() {
return disableCookieManagement;
}
/**
* Downloader is supposed to store response cookie.
* Disable it to ignore all cookie fields and stay clean.
* Warning: Set cookie will still NOT work if disableCookieManagement is true.
* @param disableCookieManagement disableCookieManagement
* @return this
*/
public Site setDisableCookieManagement(boolean disableCookieManagement) {
this.disableCookieManagement = disableCookieManagement;
return this;
}
/**
 * Exposes this site as a {@link Task}.
 * The UUID of the task is the domain of the site whenever a domain is set.
 * While the domain is {@code null}, a random UUID is generated once and
 * reused, so repeated calls on the same task return the same identifier.
 *
 * @return a task backed by this site
 */
public Task toTask() {
    return new Task() {
        // Fallback identifier, created lazily the first time no domain is available.
        private String generatedUuid;

        @Override
        public synchronized String getUUID() {
            String currentDomain = Site.this.getDomain();
            if (currentDomain != null) {
                return currentDomain;
            }
            // synchronized so that two concurrent first calls cannot each
            // generate (and hand out) a different identifier.
            if (generatedUuid == null) {
                generatedUuid = UUID.randomUUID().toString();
            }
            return generatedUuid;
        }

        @Override
        public Site getSite() {
            return Site.this;
        }
    };
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Site site = (Site) o;
if (cycleRetryTimes != site.cycleRetryTimes) return false;
if (retryTimes != site.retryTimes) return false;
if (sleepTime != site.sleepTime) return false;
if (timeOut != site.timeOut) return false;
if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
return false;
if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
return false;
if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
return true;
}
@Override
public int hashCode() {
int result = domain != null ? domain.hashCode() : 0;
result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
result = 31 * result + (charset != null ? charset.hashCode() : 0);
result = 31 * result + sleepTime;
result = 31 * result + retryTimes;
result = 31 * result + cycleRetryTimes;
result = 31 * result + timeOut;
result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
result = 31 * result + (headers != null ? headers.hashCode() : 0);
return result;
}
@Override
public String toString() {
return "Site{" +
"domain='" + domain + '\'' +
", userAgent='" + userAgent + '\'' +
", cookies=" + defaultCookies +
", charset='" + charset + '\'' +
", sleepTime=" + sleepTime +
", retryTimes=" + retryTimes +
", cycleRetryTimes=" + cycleRetryTimes +
", timeOut=" + timeOut +
", acceptStatCode=" + acceptStatCode +
", headers=" + headers +
'}';
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
================================================
package us.codecraft.webmagic;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
/**
* Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and
* Pipeline.
* Every module is a field of Spider.
* The modules are defined in interface.
* You can customize a spider with various implementations of them.
* Examples:
*
* A simple crawler:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")).run();
*
* Store results to files by FilePipeline:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a
* Spider can resume the status when shutdown.
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
*
* @author code4crafter@gmail.com
* @see Downloader
* @see Scheduler
* @see PageProcessor
* @see Pipeline
* @since 0.1.0
*/
public class Spider implements Runnable, Task {
protected Downloader downloader;
protected List pipelines = new ArrayList();
protected PageProcessor pageProcessor;
protected List startRequests;
protected Site site;
protected String uuid;
protected SpiderScheduler scheduler;
protected Logger logger = LoggerFactory.getLogger(getClass());
protected CountableThreadPool threadPool;
protected ExecutorService executorService;
protected int threadNum = 1;
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
protected volatile boolean exitWhenComplete = true;
protected final static int STAT_INIT = 0;
protected final static int STAT_RUNNING = 1;
protected final static int STAT_STOPPED = 2;
protected boolean spawnUrl = true;
protected boolean destroyWhenExit = true;
private List spiderListeners;
private final AtomicLong pageCount = new AtomicLong(0);
private Date startTime;
private long emptySleepTime = 30000;
/**
* create a spider with pageProcessor.
*
* @param pageProcessor pageProcessor
* @return new spider
* @see PageProcessor
*/
public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor);
}
/**
* create a spider with pageProcessor.
*
* @param pageProcessor pageProcessor
*/
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.scheduler = new SpiderScheduler(new QueueScheduler());
}
/**
* Set startUrls of Spider.
* Prior to startUrls of Site.
*
* @param startUrls startUrls
* @return this
*/
public Spider startUrls(List startUrls) {
checkIfRunning();
this.startRequests = UrlUtils.convertToRequests(startUrls);
return this;
}
/**
* Set startUrls of Spider.
* Prior to startUrls of Site.
*
* @param startRequests startRequests
* @return this
*/
public Spider startRequest(List startRequests) {
checkIfRunning();
this.startRequests = startRequests;
return this;
}
/**
* Set an uuid for spider.
* Default uuid is domain of site.
*
* @param uuid uuid
* @return this
*/
public Spider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
/**
* set scheduler for Spider
*
* @param scheduler scheduler
* @return this
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/
@Deprecated
public Spider scheduler(Scheduler scheduler) {
return setScheduler(scheduler);
}
/**
* set scheduler for Spider
*
* @param updateScheduler scheduler
* @return this
* @see Scheduler
* @since 0.2.1
*/
public Spider setScheduler(Scheduler updateScheduler) {
checkIfRunning();
Scheduler oldScheduler = scheduler.getScheduler();
scheduler.setScheduler(updateScheduler);
if (oldScheduler != null) {
Request request;
while ((request = oldScheduler.poll(this)) != null) {
this.scheduler.push(request, this);
}
}
return this;
}
/**
* add a pipeline for Spider
*
* @param pipeline pipeline
* @return this
* @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
@Deprecated
public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}
/**
* add a pipeline for Spider
*
* @param pipeline pipeline
* @return this
* @see Pipeline
* @since 0.2.1
*/
public Spider addPipeline(Pipeline pipeline) {
checkIfRunning();
this.pipelines.add(pipeline);
return this;
}
/**
* set pipelines for Spider
*
* @param pipelines pipelines
* @return this
* @see Pipeline
* @since 0.4.1
*/
public Spider setPipelines(List pipelines) {
checkIfRunning();
this.pipelines = pipelines;
return this;
}
/**
* clear the pipelines set
*
* @return this
*/
public Spider clearPipeline() {
pipelines = new ArrayList();
return this;
}
/**
* set the downloader of spider
*
* @param downloader downloader
* @return this
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated
*/
@Deprecated
public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}
/**
* set the downloader of spider
*
* @param downloader downloader
* @return this
* @see Downloader
*/
public Spider setDownloader(Downloader downloader) {
checkIfRunning();
this.downloader = downloader;
return this;
}
protected void initComponent() {
if (downloader == null) {
this.downloader = new HttpClientDownloader();
}
if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline());
}
downloader.setThread(threadNum);
if (threadPool == null || threadPool.isShutdown()) {
if (executorService != null && !executorService.isShutdown()) {
threadPool = new CountableThreadPool(threadNum, executorService);
} else {
threadPool = new CountableThreadPool(threadNum);
}
}
if (startRequests != null) {
for (Request request : startRequests) {
addRequest(request);
}
startRequests.clear();
}
startTime = new Date();
}
@Override
public void run() {
checkRunningStat();
initComponent();
logger.info("Spider {} started!", getUUID());
// interrupt won't be necessarily detected
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request poll = scheduler.poll(this);
if (poll == null) {
if (threadPool.getThreadAlive() == 0) {
//no alive thread anymore , try again
poll = scheduler.poll(this);
if (poll == null) {
if (exitWhenComplete) {
break;
} else {
// wait
try {
Thread.sleep(emptySleepTime);
continue;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
}
} else {
// wait until new url added,
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
// if interrupted
break;
}
continue;
}
}
final Request request = poll;
//this may swallow the interruption
threadPool.execute(new Runnable() {
@Override
public void run() {
try {
processRequest(request);
onSuccess(request);
} catch (Exception e) {
onError(request, e);
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
scheduler.signalNewUrl();
}
}
});
}
stat.set(STAT_STOPPED);
// release some resources
if (destroyWhenExit) {
close();
}
logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
}
/**
* @deprecated Use {@link #onError(Request, Exception)} instead.
*/
@Deprecated
protected void onError(Request request) {
}
protected void onError(Request request, Exception e) {
this.onError(request);
if (CollectionUtils.isNotEmpty(spiderListeners)) {
for (SpiderListener spiderListener : spiderListeners) {
spiderListener.onError(request, e);
}
}
}
protected void onSuccess(Request request) {
if (CollectionUtils.isNotEmpty(spiderListeners)) {
for (SpiderListener spiderListener : spiderListeners) {
spiderListener.onSuccess(request);
}
}
}
private void checkRunningStat() {
while (true) {
int statNow = stat.get();
if (statNow == STAT_RUNNING) {
throw new IllegalStateException("Spider is already running!");
}
if (stat.compareAndSet(statNow, STAT_RUNNING)) {
break;
}
}
}
/**
 * Releases the resources held by this spider: closes the downloader, the page
 * processor, the scheduler and every pipeline that implements {@link Closeable},
 * then shuts the thread pool down.
 * Safe to call on a spider that was never started.
 */
public void close() {
    destroyEach(downloader);
    destroyEach(pageProcessor);
    destroyEach(scheduler);
    for (Pipeline pipeline : pipelines) {
        destroyEach(pipeline);
    }
    // The pool is only created by initComponent(); before the first run there
    // is nothing to shut down.
    if (threadPool != null) {
        threadPool.shutdown();
    }
}
/**
 * Closes the given component when it implements {@link Closeable}; any other
 * object (including {@code null}) is ignored.
 * A failure to close is logged and does not stop the caller from releasing the
 * remaining components.
 *
 * @param object the component to close
 */
private void destroyEach(Object object) {
    if (!(object instanceof Closeable)) {
        return;
    }
    try {
        ((Closeable) object).close();
    } catch (IOException e) {
        // Report through the spider's logger instead of printing to stderr.
        logger.warn("Failed to close {}", object.getClass().getName(), e);
    }
}
/**
 * Processes the given urls one after another in the calling thread, after
 * initializing the components. Each url is wrapped in a new {@link Request}
 * and passed straight to the processing step, bypassing the scheduler.
 *
 * @param urls urls to process
 */
public void test(String... urls) {
    initComponent();
    for (String target : urls) {
        processRequest(new Request(target));
    }
}
/**
 * Downloads one request and dispatches on the outcome.
 * A downloader attached to the request takes precedence over the downloader
 * of the spider.
 *
 * @param request the request to download and process
 */
private void processRequest(Request request) {
    Downloader requestDownloader = request.getDownloader();
    Downloader chosen = (requestDownloader != null) ? requestDownloader : downloader;
    Page page = chosen.download(request, this);
    if (!page.isDownloadSuccess()) {
        onDownloaderFail(request);
        return;
    }
    onDownloadSuccess(request, page);
}
private void onDownloadSuccess(Request request, Page page) {
if (site.getAcceptStatCode().contains(page.getStatusCode())){
pageProcessor.process(page);
extractAndAddRequests(page, spawnUrl);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
}
} else {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
}
private void onDownloaderFail(Request request) {
if (site.getCycleRetryTimes() == 0) {
sleep(site.getSleepTime());
} else {
// for cycle retry
doCycleRetry(request);
}
}
/**
* Puts a failed request back into the scheduler for another attempt.
* The number of attempts so far is tracked in the request extra
* {@link Request#CYCLE_TRIED_TIMES}. A copy of the request (priority reset to 0)
* is re-added, never the original instance. Whatever the outcome, the thread
* then sleeps for the site's retry sleep time.
*
* NOTE(review): the first retry is scheduled without comparing against
* site.getCycleRetryTimes(); only later attempts are bounded by it. The caller
* reaches this method only when cycleRetryTimes is not 0. Confirm that the
* resulting retry count is the intended one.
* NOTE(review): the copy is made with SerializationUtils.clone, and Request has a
* non-transient Downloader field; verify cloning works when a downloader is set.
*
* @param request the request whose download failed
*/
private void doCycleRetry(Request request) {
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
if (cycleTriedTimesObject == null) {
// No counter yet: this is the first failure, start counting at 1.
addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
} else {
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
cycleTriedTimes++;
// Re-queue only while the incremented counter is still below the configured limit.
if (cycleTriedTimes < site.getCycleRetryTimes()) {
addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
}
}
sleep(site.getRetrySleepTime());
}
/**
 * Sleeps the current thread. An interruption is logged and the thread's
 * interrupt flag is set again.
 *
 * @param time time to sleep, passed to {@link Thread#sleep(long)}
 */
protected void sleep(int time) {
    try {
        Thread.sleep(time);
    } catch (InterruptedException interrupted) {
        logger.error("Thread interrupted when sleep",interrupted);
        // Keep the interruption visible to callers further up the stack.
        Thread.currentThread().interrupt();
    }
}
/**
 * Queues the target requests of a page.
 *
 * @param page page whose target requests are queued
 * @param spawnUrl when {@code false} nothing is queued
 */
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
    if (!spawnUrl || !CollectionUtils.isNotEmpty(page.getTargetRequests())) {
        return;
    }
    for (Request target : page.getTargetRequests()) {
        addRequest(target);
    }
}
/**
 * Pushes a request to the scheduler. When the site has no domain yet, the domain is
 * first derived from the url of the request (if the request and its url are non-null).
 *
 * @param request request to push
 */
private void addRequest(Request request) {
    boolean domainMissing = site.getDomain() == null;
    if (domainMissing && request != null && request.getUrl() != null) {
        String domain = UrlUtils.getDomain(request.getUrl());
        site.setDomain(domain);
    }
    scheduler.push(request, this);
}
/**
 * Rejects the call when the spider state is running.
 *
 * @throws IllegalStateException if the spider is running
 */
protected void checkIfRunning() {
    boolean running = stat.get() == STAT_RUNNING;
    if (running) {
        throw new IllegalStateException("Spider is already running!");
    }
}
/**
 * Runs this spider on a newly created non-daemon thread and returns immediately.
 */
public void runAsync() {
    Thread worker = new Thread(this);
    worker.setDaemon(false);
    worker.start();
}
/**
 * Adds urls to crawl and signals the scheduler that new urls are available.
 *
 * @param urls urls
 * @return this
 */
public Spider addUrl(String... urls) {
    for (int i = 0; i < urls.length; i++) {
        addRequest(new Request(urls[i]));
    }
    scheduler.signalNewUrl();
    return this;
}
/**
 * Downloads the given urls synchronously and returns what the collector pipeline gathered.
 * <p>
 * For the duration of the run, url spawning and destroy-on-exit are switched off;
 * both flags are set to {@code true} again after {@code run()} returns.
 *
 * @param urls urls
 * @return list downloaded
 */
// NOTE(review): the type parameter of this method is not visible here — confirm the generic signature.
public List getAll(Collection urls) {
destroyWhenExit = false;
spawnUrl = false;
// Drop any previously configured start requests so only the given urls are crawled.
if (startRequests!=null){
startRequests.clear();
}
for (Request request : UrlUtils.convertToRequests(urls)) {
addRequest(request);
}
CollectorPipeline collectorPipeline = getCollectorPipeline();
// NOTE(review): the collector pipeline is not removed from pipelines afterwards — confirm intended.
pipelines.add(collectorPipeline);
// NOTE(review): the two flags below are not restored if run() throws.
run();
spawnUrl = true;
destroyWhenExit = true;
return collectorPipeline.getCollected();
}
/**
 * Creates the pipeline that getAll registers to collect results.
 * Protected, so a subclass can supply a different collector.
 *
 * @return a new {@code ResultItemsCollectorPipeline}
 */
protected CollectorPipeline getCollectorPipeline() {
return new ResultItemsCollectorPipeline();
}
/**
 * Downloads a single url through getAll and returns the first collected result.
 *
 * @param url url to download
 * @return the first collected result, or {@code null} when nothing was collected
 */
// NOTE(review): the declaration of T is not visible here — confirm the generic signature.
public T get(String url) {
List urls = WMCollections.newArrayList(url);
List resultItemses = getAll(urls);
if (resultItemses != null && resultItemses.size() > 0) {
return resultItemses.get(0);
} else {
return null;
}
}
/**
 * Adds requests to crawl and signals the scheduler that new urls are available.
 *
 * @param requests requests
 * @return this
 */
public Spider addRequest(Request... requests) {
    for (int i = 0; i < requests.length; i++) {
        addRequest(requests[i]);
    }
    scheduler.signalNewUrl();
    return this;
}
/**
 * Starts the spider asynchronously; simply delegates to {@link #runAsync()}.
 */
public void start() {
runAsync();
}
/**
 * Moves the spider state from running to stopped and logs whether that succeeded.
 * The state is left untouched when the spider is not running.
 */
public void stop() {
    boolean stopped = stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
    if (!stopped) {
        logger.info("Spider " + getUUID() + " stop fail!");
        return;
    }
    logger.info("Spider " + getUUID() + " stop success!");
}
/**
 * Stop when all tasks in the queue are completed and all worker threads are also completed.
 * Implemented by setting the exitWhenComplete flag to {@code true}.
 */
public void stopWhenComplete(){
this.exitWhenComplete = true;
}
/**
 * Sets the number of threads the spider uses.
 *
 * @param threadNum threadNum, must be greater than zero
 * @return this
 * @throws IllegalStateException if the spider is already running
 * @throws IllegalArgumentException if threadNum is zero or negative
 */
public Spider thread(int threadNum) {
    checkIfRunning();
    // Validate before assigning so a rejected value never ends up in the field.
    if (threadNum <= 0) {
        throw new IllegalArgumentException("threadNum should be more than zero!");
    }
    this.threadNum = threadNum;
    return this;
}
/**
 * Sets the number of threads the spider uses together with the executor service
 * that runs them.
 *
 * @param executorService executorService to run the spider
 * @param threadNum threadNum, must be greater than zero
 * @return this
 * @throws IllegalStateException if the spider is already running
 * @throws IllegalArgumentException if threadNum is zero or negative
 */
public Spider thread(ExecutorService executorService, int threadNum) {
    checkIfRunning();
    // Validate before assigning so a rejected value never ends up in the fields.
    if (threadNum <= 0) {
        throw new IllegalArgumentException("threadNum should be more than zero!");
    }
    this.threadNum = threadNum;
    this.executorService = executorService;
    return this;
}
/**
 * Returns the current value of the exitWhenComplete flag.
 *
 * @return exitWhenComplete
 */
public boolean isExitWhenComplete() {
return exitWhenComplete;
}
/**
 * Exit when complete.
 * True: exit when all urls of the site are downloaded.
 * False: do not exit until stop() is called manually.
 *
 * @param exitWhenComplete exitWhenComplete
 * @return this
 */
public Spider setExitWhenComplete(boolean exitWhenComplete) {
this.exitWhenComplete = exitWhenComplete;
return this;
}
/**
 * Returns the current value of the spawnUrl flag.
 *
 * @return spawnUrl
 */
public boolean isSpawnUrl() {
return spawnUrl;
}
/**
 * Get the number of pages downloaded by the spider, read from the pageCount counter.
 *
 * @return total downloaded page count
 * @since 0.4.1
 */
public long getPageCount() {
return pageCount.get();
}
/**
 * Get the running status of the spider, mapped from the internal state value.
 *
 * @return running status
 * @see Status
 * @since 0.4.1
 */
public Status getStatus() {
return Status.fromValue(stat.get());
}
/**
 * Running state of the spider; each constant is paired with an integer code.
 */
public enum Status {
    Init(0), Running(1), Stopped(2);

    private final int value;

    Status(int value) {
        this.value = value;
    }

    int getValue() {
        return value;
    }

    /**
     * Looks up the status for an integer code.
     *
     * @param value integer code
     * @return the matching status, or {@link #Init} when no constant has that code
     */
    public static Status fromValue(int value) {
        // default value
        Status match = Init;
        for (Status candidate : values()) {
            if (candidate.getValue() == value) {
                match = candidate;
                break;
            }
        }
        return match;
    }
}
/**
 * Get the number of threads currently alive in the thread pool.
 *
 * @return alive thread count, or 0 when no thread pool exists yet
 * @since 0.4.1
 */
public int getThreadAlive() {
    return threadPool == null ? 0 : threadPool.getThreadAlive();
}
/**
 * Whether to add extracted urls to the download queue.
 * When true, extracted urls are added for download; when false, only the seed urls are downloaded.
 * DO NOT set it unless you know what it means!
 *
 * @param spawnUrl spawnUrl
 * @return this
 * @since 0.4.0
 */
public Spider setSpawnUrl(boolean spawnUrl) {
this.spawnUrl = spawnUrl;
return this;
}
/**
 * Returns the identifier of this spider: the explicitly stored uuid if there is one,
 * otherwise the domain of the site, otherwise a random UUID that is generated once
 * and stored for later calls.
 *
 * @return uuid
 */
@Override
public String getUUID() {
    if (uuid == null) {
        if (site != null) {
            // The site domain is returned without being cached in uuid.
            return site.getDomain();
        }
        uuid = UUID.randomUUID().toString();
    }
    return uuid;
}
/**
 * Sets the executor service of this spider. Not allowed while the spider is running
 * (checkIfRunning throws IllegalStateException in that case).
 *
 * @param executorService executor service to use
 * @return this
 */
public Spider setExecutorService(ExecutorService executorService) {
checkIfRunning();
this.executorService = executorService;
return this;
}
/**
 * Returns the site this spider works on.
 *
 * @return site
 */
@Override
public Site getSite() {
return site;
}
/**
 * Returns the listener list held by this spider (the list itself, not a copy).
 *
 * @return spider listeners
 */
public List getSpiderListeners() {
return spiderListeners;
}
/**
 * Replaces the listener list of this spider with the given list.
 *
 * @param spiderListeners listeners to use
 * @return this
 */
public Spider setSpiderListeners(List spiderListeners) {
this.spiderListeners = spiderListeners;
return this;
}
/**
 * Returns the startTime field.
 * NOTE(review): where startTime is assigned is not visible here — presumably when the spider starts; confirm.
 *
 * @return start time
 */
public Date getStartTime() {
return startTime;
}
/**
 * Returns the {@link Scheduler} wrapped by this spider's scheduler field.
 *
 * @return the wrapped scheduler
 */
public Scheduler getScheduler() {
return scheduler.getScheduler();
}
/**
 * Set the wait time used when no url is polled.
 *
 * @param emptySleepTime in MILLISECONDS, must be greater than zero
 * @return this
 * @throws IllegalArgumentException if emptySleepTime is zero or negative
 */
public Spider setEmptySleepTime(long emptySleepTime) {
    boolean positive = emptySleepTime > 0;
    if (!positive) {
        throw new IllegalArgumentException("emptySleepTime should be more than zero!");
    }
    this.emptySleepTime = emptySleepTime;
    return this;
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
================================================
package us.codecraft.webmagic;
/**
 * Listener of Spider on page processing. Used for monitoring and so on.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public interface SpiderListener {
/**
 * Success callback.
 * NOTE(review): the call site is not visible here — presumably invoked after a request was handled successfully; confirm.
 *
 * @param request the request concerned
 */
void onSuccess(Request request);
/**
 * Error callback without exception details. Does nothing by default.
 *
 * @param request the request concerned
 * @deprecated Use {@link #onError(Request, Exception)} instead.
 */
@Deprecated
default void onError(Request request) {
}
/**
 * Error callback with the exception. The default implementation forwards to the
 * deprecated {@link #onError(Request)}, so the exception is not passed on.
 *
 * @param request the request concerned
 * @param e the exception
 */
default void onError(Request request, Exception e) {
this.onError(request);
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java
================================================
package us.codecraft.webmagic;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
/**
 * Wraps a {@link Scheduler} and adds a lock/condition pair through which a caller
 * can wait for, and be signalled about, newly added urls.
 */
public class SpiderScheduler {
    private Scheduler scheduler;
    private final ReentrantLock newUrlLock = new ReentrantLock();
    private final Condition newUrlCondition = newUrlLock.newCondition();

    /**
     * @param scheduler the scheduler to delegate to
     */
    public SpiderScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * @return the wrapped scheduler
     */
    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * @param scheduler the scheduler to delegate to from now on
     */
    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Delegates to {@link Scheduler#poll} of the wrapped scheduler.
     *
     * @param spider the spider polling
     * @return whatever the wrapped scheduler returns
     */
    public Request poll(Spider spider) {
        return scheduler.poll(spider);
    }

    /**
     * Delegates to {@link Scheduler#push} of the wrapped scheduler.
     *
     * @param request request to push
     * @param spider the spider pushing
     */
    public void push(Request request, Spider spider) {
        scheduler.push(request, spider);
    }

    /**
     * Waits for a new-url signal for at most {@code emptySleepTime} milliseconds.
     * Returns immediately, without waiting, when the pool reports no alive threads.
     *
     * @param threadPool pool whose alive-thread count is checked
     * @param emptySleepTime maximum wait in milliseconds
     * @return {@code true} only when the wait was interrupted, {@code false} otherwise
     */
    public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
        newUrlLock.lock();
        try {
            if (threadPool.getThreadAlive() == 0) {
                return false;
            }
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
            return false;
        } catch (InterruptedException e) {
            // The interruption is reported through the return value; the interrupt flag is not restored here.
            return true;
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes up every thread waiting in {@link #waitNewUrl}.
     */
    public void signalNewUrl() {
        // Acquire outside the try block: unlock() must only run once the lock is actually held.
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
================================================
package us.codecraft.webmagic;
/**
 * Identifies a crawling task and exposes the site it works on.
 *
 * @author code4crafter@gmail.com
 * @since 0.1.0
 * @see us.codecraft.webmagic.scheduler.Scheduler
 * @see us.codecraft.webmagic.pipeline.Pipeline
 */
public interface Task {

    /**
     * Unique id of the task.
     *
     * @return uuid
     */
    String getUUID();

    /**
     * Site of the task.
     *
     * @return site
     */
    Site getSite();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
================================================
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
/**
 * Base class for downloaders: offers convenience download methods and the
 * success/error hooks that subclasses may override.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public abstract class AbstractDownloader implements Downloader {

    /**
     * Downloads a url without specifying a charset.
     *
     * @param url url
     * @return html
     */
    public Html download(String url) {
        String charset = null;
        return download(url, charset);
    }

    /**
     * Downloads a url using a fresh site configured with the given charset.
     *
     * @param url url
     * @param charset charset
     * @return html
     */
    public Html download(String url, String charset) {
        Request request = new Request(url);
        Task task = Site.me().setCharset(charset).toTask();
        Page page = download(request, task);
        return (Html) page.getHtml();
    }

    /**
     * Oldest success hook; does nothing by default.
     *
     * @param request the {@link Request}.
     * @deprecated Use {@link #onSuccess(Page, Task)} instead.
     */
    @Deprecated
    protected void onSuccess(Request request) {
        // no-op by default
    }

    /**
     * Success hook; forwards to {@link #onSuccess(Request)} by default.
     *
     * @param request the {@link Request}.
     * @param task the {@link Task}.
     * @since 0.7.6
     * @deprecated Use {@link #onSuccess(Page, Task)} instead.
     */
    @Deprecated
    protected void onSuccess(Request request, Task task) {
        onSuccess(request);
    }

    /**
     * Success hook; forwards to {@link #onSuccess(Request, Task)} with the page's
     * request by default, so overrides of the deprecated hooks still run.
     *
     * @param page the {@link Page}.
     * @param task the {@link Task}.
     * @since 0.10.0
     */
    protected void onSuccess(Page page, Task task) {
        Request request = page.getRequest();
        onSuccess(request, task);
    }

    /**
     * Oldest error hook; does nothing by default.
     *
     * @param request the {@link Request}.
     * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
     */
    @Deprecated
    protected void onError(Request request) {
        // no-op by default
    }

    /**
     * Error hook; forwards to {@link #onError(Request)} by default.
     *
     * @param request the {@link Request}.
     * @param task the {@link Task}.
     * @param e the exception.
     * @since 0.7.6
     * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
     */
    @Deprecated
    protected void onError(Request request, Task task, Throwable e) {
        onError(request);
    }

    /**
     * Error hook; forwards to {@link #onError(Request, Task, Throwable)} with the
     * page's request by default, so overrides of the deprecated hooks still run.
     *
     * @param page the {@link Page}.
     * @param task the {@link Task}.
     * @param e the exception.
     * @since 0.10.0
     */
    protected void onError(Page page, Task task, Throwable e) {
        Request request = page.getRequest();
        onError(request, task, e);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
================================================
package us.codecraft.webmagic.downloader;
import java.net.URI;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.ProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestWrapper;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Redirect strategy that supports following a 302 redirect for POST requests.
 * <p>
 * HttpClient's stock alternative is
 * {@code httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy())}, but in the
 * post/redirect/post case that does not carry over the data of the original request.
 * This implementation therefore follows the redirect strategy of the SeimiCrawler project.
 * <p>
 * Original code:
 * https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java
 */
public class CustomRedirectStrategy extends LaxRedirectStrategy {
    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Builds the request that follows a redirect.
     * <p>
     * A POST request is re-sent as POST: when it is an {@link HttpRequestWrapper} the same
     * wrapper is reused with the new URI (keeping its entity), otherwise a bare
     * {@link HttpPost} is created. Every other method is followed with a {@link HttpGet}.
     *
     * @param request the request that was redirected
     * @param response the redirect response
     * @param context the execution context
     * @return the request to send to the redirect location
     * @throws ProtocolException if the redirect location cannot be determined
     */
    @Override
    public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException {
        URI uri = getLocationURI(request, response, context);
        String method = request.getRequestLine().getMethod();
        if (!"post".equalsIgnoreCase(method)) {
            return new HttpGet(uri);
        }
        // Check the type up front instead of relying on a caught ClassCastException.
        if (request instanceof HttpRequestWrapper) {
            HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request;
            httpRequestWrapper.setURI(uri);
            // Drop the stale header; it is set again when the request is re-sent.
            httpRequestWrapper.removeHeaders("Content-Length");
            return httpRequestWrapper;
        }
        logger.error("强转为HttpRequestWrapper出错");
        return new HttpPost(uri);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
================================================
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Downloader is the part that downloads web pages and stores them in Page objects.
 * Downloader has a {@link #setThread(int)} method because the downloader is usually the
 * bottleneck of a crawler: it typically uses mechanisms such as pooling, and the pool
 * size is related to the number of threads.
 *
 * @author code4crafter@gmail.com
 * @since 0.1.0
 */
public interface Downloader {

    /**
     * Downloads a web page and stores it in a Page object.
     *
     * @param request request
     * @param task task
     * @return page
     */
    Page download(Request request, Task task);

    /**
     * Tells the downloader how many threads the spider uses.
     *
     * @param threadNum number of threads
     */
    void setThread(int threadNum);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
================================================
package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
/**
 * The http downloader based on HttpClient.
 *
 * @author code4crafter@gmail.com
 * @since 0.1.0
 */
public class HttpClientDownloader extends AbstractDownloader {

    /** Http clients created so far, keyed by site domain. */
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<>();

    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();

    private ProxyProvider proxyProvider;

    private boolean responseHeader = true;

    /**
     * @param httpUriRequestConverter converter used to turn a request into an HttpClient request
     */
    public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
        this.httpUriRequestConverter = httpUriRequestConverter;
    }

    /**
     * @param proxyProvider provider asked for a proxy on every download; may be {@code null}
     */
    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * Returns the client for the domain of the given site, creating and caching it on
     * first use.
     */
    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        // NOTE(review): this first read happens outside the lock on a plain HashMap; the
        // re-check inside the synchronized block is what prevents duplicate clients.
        CloseableHttpClient httpClient = httpClients.get(domain);
        if (httpClient == null) {
            synchronized (this) {
                httpClient = httpClients.get(domain);
                if (httpClient == null) {
                    httpClient = httpClientGenerator.getClient(site);
                    httpClients.put(domain, httpClient);
                }
            }
        }
        return httpClient;
    }

    /**
     * Downloads the request. An {@link IOException} is turned into a failure page and
     * reported through {@code onError}; a successful response is reported through
     * {@code onSuccess}.
     *
     * @param request request to download
     * @param task task providing the site configuration
     * @return the downloaded page, or a failure page on I/O errors
     * @throws NullPointerException if the task or its site is {@code null}
     */
    @Override
    public Page download(Request request, Task task) {
        if (task == null || task.getSite() == null) {
            throw new NullPointerException("task or site can not be null");
        }
        CloseableHttpResponse httpResponse = null;
        CloseableHttpClient httpClient = getHttpClient(task.getSite());
        Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
        Page page = null;
        try {
            httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
            // A charset set on the request wins over the one configured on the site.
            page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
            onSuccess(page, task);
            return page;
        } catch (IOException e) {
            page = Page.ofFailure(request);
            onError(page, task, e);
            return page;
        } finally {
            if (httpResponse != null) {
                //ensure the connection is released back to pool
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
            if (proxyProvider != null && proxy != null) {
                proxyProvider.returnProxy(proxy, page, task);
            }
        }
    }

    /**
     * Uses the thread count as the pool size of the client generator.
     *
     * @param thread number of threads
     */
    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /**
     * Builds a success page from the response: raw bytes, decoded text (unless the
     * request asks for binary content), url, status code and, if enabled, headers.
     *
     * @param request the request that was executed
     * @param charset charset to decode with; detected from the response when {@code null}
     * @param httpResponse the response
     * @param task the task
     * @return the page
     * @throws IOException if the body cannot be read or decoded
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        HttpEntity entity = httpResponse.getEntity();
        byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];
        String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null;
        Page page = Page.ofSuccess(request);
        page.setBytes(bytes);
        if (!request.isBinaryContent()) {
            if (charset == null) {
                charset = getHtmlCharset(contentType, bytes, task);
            }
            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        if (responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }
        return page;
    }

    /**
     * Detects the charset from the content type and body; falls back to the site's
     * default charset and finally to the platform default charset.
     */
    private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
        if (charset == null) {
            charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
        }
        return charset;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
================================================
package us.codecraft.webmagic.downloader;
import org.apache.commons.lang3.JavaVersion;
import org.apache.commons.lang3.SystemUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
/**
* @author code4crafter@gmail.com
* @since 0.4.0
*/
public class HttpClientGenerator {
private Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() {
Registry reg = RegistryBuilder.create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory())
.build();
connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(100);
}
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
SSLContext sslContext = createIgnoreVerifySSL();
String[] supportedProtocols;
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
} else {
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
}
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
//不进行主机校验
(host, sslSession) -> true); // 优先绕过安全证书
} catch (KeyManagementException | NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
SSLContext sc = SSLContext.getInstance("TLS");
sc.init(null, new TrustManager[]{trustManager}, null);
return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
}
public CloseableHttpClient getClient(Site site) {
return generateClient(site);
}
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager);
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
httpClientBuilder.setUserAgent("");
}
if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
final HttpRequest request,
final HttpContext context) throws HttpException, IOException {
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}
}
});
}
//解决post/redirect/post 302跳转问题
httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
socketConfigBuilder.setSoTimeout(site.getTimeOut());
SocketConfig socketConfig = socketConfigBuilder.build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
connectionManager.setDefaultSocketConfig(socketConfig);
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
if (site.isDisableCookieManagement()) {
httpClientBuilder.disableCookieManagement();
return;
}
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry cookieEntry : site.getCookies().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(site.getDomain());
cookieStore.addCookie(cookie);
}
for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) {
for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(domainEntry.getKey());
cookieStore.addCookie(cookie);
}
}
httpClientBuilder.setDefaultCookieStore(cookieStore);
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
================================================
package us.codecraft.webmagic.downloader;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
/**
 * Holder for the two objects needed to execute a request with HttpClient:
 * the request itself and the client context to execute it in.
 *
 * @author code4crafter@gmail.com
 * Date: 17/4/8
 * Time: 19:43
 * @since 0.7.0
 */
public class HttpClientRequestContext {
// The HttpClient request to execute.
private HttpUriRequest httpUriRequest;
// The HttpClient context used when executing the request.
private HttpClientContext httpClientContext;
/**
 * @return the HttpClient request
 */
public HttpUriRequest getHttpUriRequest() {
return httpUriRequest;
}
/**
 * @param httpUriRequest the HttpClient request
 */
public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
this.httpUriRequest = httpUriRequest;
}
/**
 * @return the HttpClient context
 */
public HttpClientContext getHttpClientContext() {
return httpClientContext;
}
/**
 * @param httpClientContext the HttpClient context
 */
public void setHttpClientContext(HttpClientContext httpClientContext) {
this.httpClientContext = httpClientContext;
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
================================================
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthState;
import org.apache.http.auth.ChallengeState;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.cookie.BasicClientCookie;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.Map;
/**
 * Converts a {@link Request} plus site and proxy settings into the request and client
 * context that HttpClient executes.
 *
 * @author code4crafter@gmail.com
 * Date: 17/3/18
 * Time: 11:28
 *
 * @since 0.7.0
 */
public class HttpUriRequestConverter {

    /**
     * Builds the HttpClient request and context for a request.
     *
     * @param request the request to convert
     * @param site site providing headers and timeouts
     * @param proxy proxy to use, or {@code null} for none
     * @return holder with the HttpClient request and context
     */
    public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
        HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
        httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
        httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
        return httpClientRequestContext;
    }

    /**
     * Builds the client context: basic proxy authentication when the proxy has a user
     * name, and a request-scoped cookie store when the request carries cookies.
     */
    private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
        HttpClientContext httpContext = new HttpClientContext();
        if (proxy != null && proxy.getUsername() != null) {
            AuthState authState = new AuthState();
            BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
            UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
            authState.update(proxyAuthScheme, proxyCredentials);
            httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
        }
        if (request.getCookies() != null && !request.getCookies().isEmpty()) {
            CookieStore cookieStore = new BasicCookieStore();
            for (Map.Entry<String, String> cookieEntry : request.getCookies().entrySet()) {
                BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
                cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl())));
                cookieStore.addCookie(cookie1);
            }
            httpContext.setCookieStore(cookieStore);
        }
        return httpContext;
    }

    /**
     * Builds the HttpClient request: method and body, site headers, timeouts, proxy,
     * and finally the headers of the request itself.
     */
    private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
        // Guard against a null site here as well, matching the null check below.
        if (site != null && site.getHeaders() != null) {
            for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }

        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
        if (site != null) {
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut())
                    .setCookieSpec(CookieSpecs.STANDARD);
        }

        if (proxy != null) {
            requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        HttpUriRequest httpUriRequest = requestBuilder.build();
        // Request headers are added after the site headers.
        if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
            for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
                httpUriRequest.addHeader(header.getKey(), header.getValue());
            }
        }
        return httpUriRequest;
    }

    /**
     * Picks the request builder for the request's method; a missing method means GET.
     * POST and PUT also get the request body attached.
     *
     * @throws IllegalArgumentException if the method is not one of the supported ones
     */
    private RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return RequestBuilder.get();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            return addFormParams(RequestBuilder.post(),request);
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return RequestBuilder.head();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return addFormParams(RequestBuilder.put(), request);
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return RequestBuilder.delete();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return RequestBuilder.trace();
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Attaches the request body, if any, as a byte-array entity with its content type.
     */
    private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
        if (request.getRequestBody() != null) {
            ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
            entity.setContentType(request.getRequestBody().getContentType());
            requestBuilder.setEntity(entity);
        }
        return requestBuilder;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html
================================================
Downloader is the part that downloads web pages and stores them in Page objects.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
================================================
package us.codecraft.webmagic.model;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Body of an http request: the raw bytes plus the content type and encoding they
 * were produced with.
 *
 * @author code4crafter@gmail.com
 * Date: 17/4/8
 */
public class HttpRequestBody implements Serializable {

    private static final long serialVersionUID = 5659170945717023595L;

    /** Content type constants for the factory methods. */
    public static abstract class ContentType {

        public static final String JSON = "application/json";

        public static final String XML = "text/xml";

        public static final String FORM = "application/x-www-form-urlencoded";

        public static final String MULTIPART = "multipart/form-data";
    }

    private byte[] body;

    private String contentType;

    private String encoding;

    public HttpRequestBody() {
    }

    /**
     * @param body raw body bytes
     * @param contentType content type of the body
     * @param encoding name of the encoding the bytes were produced with
     */
    public HttpRequestBody(byte[] body, String contentType, String encoding) {
        this.body = body;
        this.contentType = contentType;
        this.encoding = encoding;
    }

    public String getContentType() {
        return contentType;
    }

    public String getEncoding() {
        return encoding;
    }

    public void setBody(byte[] body) {
        this.body = body;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * Creates a JSON body.
     *
     * @param json json text
     * @param encoding name of the encoding used to turn the text into bytes
     * @return the body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody json(String json, String encoding) {
        try {
            return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    /**
     * Creates an XML body.
     *
     * @param xml xml text
     * @param encoding name of the encoding used to turn the text into bytes
     * @return the body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody xml(String xml, String encoding) {
        try {
            return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    /**
     * Creates a body from already-encoded bytes and an arbitrary content type.
     *
     * @param body raw body bytes
     * @param contentType content type of the body
     * @param encoding name of the encoding the bytes were produced with
     * @return the body
     */
    public static HttpRequestBody custom(byte[] body, String contentType, String encoding) {
        return new HttpRequestBody(body, contentType, encoding);
    }

    /**
     * Creates a url-encoded form body. Each value is converted with
     * {@link String#valueOf(Object)}.
     *
     * @param params form parameters
     * @param encoding name of the encoding used for url-encoding and for the bytes
     * @return the body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody form(Map<String, ?> params, String encoding){
        List<NameValuePair> nameValuePairs = new ArrayList<>(params.size());
        for (Map.Entry<String, ?> entry : params.entrySet()) {
            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
        }
        try {
            return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    public byte[] getBody() {
        return body;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/package.html
================================================
Main class "Spider" and models.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import java.util.List;
/**
 * Pipeline that can collect and store results.
 * Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)}
 *
 * @param <T> type of the collected results
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public interface CollectorPipeline<T> extends Pipeline {

    /**
     * Get all results collected.
     *
     * @return collected results
     */
    List<T> getCollected();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.Map;
/**
* Write results in console.
* Usually used in test.
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public class ConsolePipeline implements Pipeline {

    /**
     * Prints the URL of the processed request, then one
     * {@code key:<TAB>value} line per extracted field, to standard output.
     *
     * @param resultItems extracted results of one page
     * @param task        the task the page belongs to (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.getRequest().getUrl();
        System.out.println("get page: " + url);
        resultItems.getAll().forEach((key, value) -> System.out.println(key + ":\t" + value));
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;
/**
* Store results in files.
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public class FilePipeline extends FilePersistentBase implements Pipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * create a FilePipeline with default path"/data/webmagic/"
     */
    public FilePipeline() {
        setPath("/data/webmagic/");
    }

    /**
     * Creates a FilePipeline that stores results under the given path.
     *
     * @param path base directory for the result files
     */
    public FilePipeline(String path) {
        setPath(path);
    }

    /**
     * Writes the results of one page to
     * {@code <path>/<task uuid>/<md5 of url>.html}, encoded as UTF-8.
     * Iterable values are written one element per line under their key;
     * other values are written as {@code key:<TAB>value}.
     * An {@link IOException} is logged as a warning and not rethrown.
     *
     * @param resultItems extracted results of one page
     * @param task        the task the page belongs to
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String url = resultItems.getRequest().getUrl();
        // try-with-resources closes the stream and writer even when writing fails part-way
        try (FileOutputStream fileOutput = new FileOutputStream(getFile(path + DigestUtils.md5Hex(url) + ".html"));
             PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(fileOutput, "UTF-8"))) {
            printWriter.println("url:\t" + url);
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                if (entry.getValue() instanceof Iterable) {
                    Iterable<?> value = (Iterable<?>) entry.getValue();
                    printWriter.println(entry.getKey() + ":");
                    for (Object o : value) {
                        printWriter.println(o);
                    }
                } else {
                    printWriter.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
* Pipeline is the persistent and offline process part of crawler.
* The interface Pipeline can be implemented to customize ways of persistent.
*
* @author code4crafter@gmail.com
* @since 0.1.0
* @see ConsolePipeline
* @see FilePipeline
*/
public interface Pipeline {

    /**
     * Handles the results extracted from one page.
     *
     * @param resultItems the extracted results
     * @param task        the task the results belong to
     */
    void process(ResultItems resultItems, Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
* @author code4crafter@gmail.com
* @since 0.4.0
*/
public class ResultItemsCollectorPipeline implements CollectorPipeline {

    /** Results received so far, in arrival order. */
    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Appends the results to the collected list. Synchronized so that
     * concurrent callers do not corrupt the underlying {@link ArrayList}.
     *
     * @param resultItems extracted results of one page
     * @param task        the task the page belongs to (not used here)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the live internal list (not a copy) of everything collected so far.
     *
     * @return collected results
     */
    @Override
    public List<ResultItems> getCollected() {
        return collector;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html
================================================
Pipeline is the persistence and offline-processing part of a crawler.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
================================================
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
 * Interface to be implemented to customize a crawler.
 * <p>
 * In PageProcessor, you can customize:
 * <ul>
 * <li>start URLs and other settings in {@link Site}</li>
 * <li>how the URLs to fetch are detected</li>
 * <li>how the data are extracted and stored</li>
 * </ul>
 *
 * @author code4crafter@gmail.com
 * @see Site
 * @see Page
 * @since 0.1.0
 */
public interface PageProcessor {

    /**
     * Processes the page: extracts the URLs to fetch, then extracts and stores the data.
     *
     * @param page page
     */
    void process(Page page);

    /**
     * Returns the site settings. The default implementation returns {@link Site#me()}.
     *
     * @return site
     * @see Site
     */
    default Site getSite() {
        return Site.me();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
================================================
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import java.util.List;
/**
* A simple PageProcessor.
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public class SimplePageProcessor implements PageProcessor {

    /** Regex derived from the "*" expression passed to the constructor. */
    private final String urlPattern;

    private final Site site;

    /**
     * Creates a processor that follows links matching the given pattern.
     * In the pattern every "." is taken literally and every "*" matches any
     * run of characters other than a double quote, a single quote or "#".
     *
     * @param urlPattern URL expression using "*" as wildcard
     */
    public SimplePageProcessor(String urlPattern) {
        this.site = Site.me();
        // translate the "*" expression into a regex wrapped in one capturing group
        String dotsEscaped = urlPattern.replace(".", "\\.");
        String wildcardsExpanded = dotsEscaped.replace("*", "[^\"'#]*");
        this.urlPattern = "(" + wildcardsExpanded + ")";
    }

    /**
     * Queues every link that matches the URL pattern and stores the page's
     * title, full HTML and smart-extracted content as result fields.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // schedule the matching links for crawling
        page.addTargetRequests(page.getHtml().links().regex(urlPattern).all());
        // title via XPath
        page.putField("title", page.getHtml().xpath("//title"));
        page.putField("html", page.getHtml().toString());
        // main content via the Readability-style extractor
        page.putField("content", page.getHtml().smartContent());
    }

    /**
     * @return the site settings created in the constructor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
================================================
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
* @author code4crafter@gmail.com
* @since 0.4.0
*/
public class BaiduBaikePageProcessor implements PageProcessor {

    private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888))
            .setRetryTimes(3).setSleepTime(1000).setUseGzip(true);

    /**
     * Stores the lemma title as "name" and the lemma summary as "description".
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String name = page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString();
        page.putField("name", name);
        page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Demo entry point: fetches one entry on its own, then a batch of entries.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";

        // single download
        ResultItems single = spider.get(String.format(urlTemplate, "水力发电"));
        System.out.println(single);

        // multi download; the last keyword is repeated, exactly as in the original list
        String[] keywords = {"风力发电", "太阳能", "地热发电", "地热发电"};
        List<String> urls = new ArrayList<String>();
        for (String keyword : keywords) {
            urls.add(String.format(urlTemplate, keyword));
        }
        List<ResultItems> batch = spider.getAll(urls);
        for (ResultItems items : batch) {
            System.out.println(items.getAll());
        }
        spider.close();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
================================================
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author code4crafter@gmail.com
* @since 0.3.2
*/
public class GithubRepoPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Queues repository and account links found on the page and stores the
     * "author", "name" and "readme" fields. The page is marked as skipped
     * when no repository name is found.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // repository pages: https://github.com/<owner>/<repo>
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
        // account pages: https://github.com/<owner>; the "+" is required, otherwise
        // only the first character of the owner name is captured
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
        // owner names may contain hyphens, so use the same character class as the link patterns
        page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
        page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
        if (page.getResultItems().get("name") == null) {
            //skip this page
            page.setSkip(true);
        }
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Demo entry point: crawls starting from one GitHub account page with five threads.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
================================================
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author code4crafter@gmail.com
* @since 0.6.0
*/
public class ZhihuPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Queues answer links found on the page and stores the "title",
     * "question" and "answer" fields. The page is marked as skipped when no
     * title is found.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(
                page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());

        String title = page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString();
        page.putField("title", title);
        String question = page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString();
        page.putField("question", question);
        String answer = page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString();
        page.putField("answer", answer);

        boolean titleMissing = page.getResultItems().get("title") == null;
        if (titleMissing) {
            // nothing useful was extracted from this page
            page.setSkip(true);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Demo entry point: crawls starting from the Zhihu explore page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider.create(new ZhihuPageProcessor()).addUrl("https://www.zhihu.com/explore").run();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html
================================================
PageProcessor is the customizable part of a crawler for a specific site.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
================================================
package us.codecraft.webmagic.proxy;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.StringUtils;
/**
 * Proxy settings: scheme, host, port and optional username/password.
 */
public class Proxy {

    private String scheme;

    private String host;

    private int port;

    private String username;

    private String password;

    /**
     * Builds a proxy from a URI of the form {@code scheme://user:password@host:port}.
     * The user-info part is split at its first ':' so a password may itself
     * contain colons. An empty username or password is stored as {@code null}.
     *
     * @param uri the proxy URI
     * @return the proxy described by the URI
     */
    public static Proxy create(final URI uri) {
        Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme());
        String userInfo = uri.getUserInfo();
        if (userInfo != null) {
            // indexOf instead of split(":"): split drops trailing empty parts, so ":" yields
            // an empty array, and it also cuts off everything after a second ':'
            int separator = userInfo.indexOf(':');
            String user = separator < 0 ? userInfo : userInfo.substring(0, separator);
            String pass = separator < 0 ? "" : userInfo.substring(separator + 1);
            proxy.username = user.isEmpty() ? null : user;
            proxy.password = pass.isEmpty() ? null : pass;
        }
        return proxy;
    }

    public Proxy(String host, int port) {
        this(host, port, null);
    }

    public Proxy(String host, int port, String scheme) {
        this.host = host;
        this.port = port;
        this.scheme = scheme;
    }

    public Proxy(String host, int port, String username, String password) {
        this.host = host;
        this.port = port;
        this.username = username;
        this.password = password;
    }

    public String getScheme() {
        return scheme;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }

    public String getHost() {
        return host;
    }

    public int getPort() {
        return port;
    }

    public String getUsername() {
        return username;
    }

    public String getPassword() {
        return password;
    }

    /**
     * Renders this proxy as a URI. Username and password are URL-encoded
     * (UTF-8) and joined with ':' to form the user-info part; the part is
     * omitted when both are {@code null}.
     * <p>
     * NOTE(review): the multi-argument {@link URI} constructor quotes '%' on
     * its own, so credentials with characters that URL-encoding escapes may
     * end up double-encoded -- confirm before relying on a round trip.
     *
     * @return the URI form of this proxy
     * @throws IllegalArgumentException if the parts do not form a valid URI
     */
    public URI toURI() {
        final StringBuilder userInfoBuffer = new StringBuilder();
        if (username != null) {
            userInfoBuffer.append(urlencode(username));
        }
        if (password != null) {
            userInfoBuffer.append(":").append(urlencode(password));
        }
        // a null user-info makes the URI constructor leave the part out
        final String userInfo = userInfoBuffer.length() == 0 ? null : userInfoBuffer.toString();
        try {
            return new URI(scheme, userInfo, host, port, null, null, null);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e.getMessage(), e);
        }
    }

    /** URL-encodes the text as UTF-8. */
    private String urlencode(String s) {
        String enc = StandardCharsets.UTF_8.name();
        try {
            return URLEncoder.encode(s, enc);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException(e);
        }
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        Proxy proxy = (Proxy) o;

        if (port != proxy.port) return false;
        if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
        if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false;
        if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
        return password != null ? password.equals(proxy.password) : proxy.password == null;
    }

    @Override
    public int hashCode() {
        int result = host != null ? host.hashCode() : 0;
        result = 31 * result + port;
        result = 31 * result + (scheme != null ? scheme.hashCode() : 0);
        result = 31 * result + (username != null ? username.hashCode() : 0);
        result = 31 * result + (password != null ? password.hashCode() : 0);
        return result;
    }

    /**
     * @return the string form of {@link #toURI()}, which includes the credentials
     */
    @Override
    public String toString() {
        return this.toURI().toString();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
================================================
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
* Proxy provider.
*
* @since 0.7.0
*/
public interface ProxyProvider {

    /**
     * Hands a proxy back to the provider once a download has completed.
     *
     * @param proxy the proxy config contains host,port and identify info
     * @param page  the download result
     * @param task  the download task
     */
    void returnProxy(Proxy proxy, Page page, Task task);

    /**
     * Get a proxy for task by some strategy.
     * The default implementation throws {@link UnsupportedOperationException}.
     *
     * @param task the download task
     * @return proxy
     * @deprecated Use {@link #getProxy(Request, Task)} instead.
     */
    @Deprecated
    default Proxy getProxy(Task task) {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns a proxy for the request.
     * The default implementation ignores the request and delegates to
     * {@link #getProxy(Task)}.
     *
     * @param request the request
     * @param task    the download task
     * @return proxy
     * @since 0.9.0
     */
    default Proxy getProxy(Request request, Task task) {
        return getProxy(task);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
================================================
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
/**
* A simple ProxyProvider. Provide proxy as round-robin without heartbeat and error check. It can be used when all proxies are stable.
* @author code4crafter@gmail.com
* Date: 17/4/16
* Time: 10:18
* @since 0.7.0
*/
public class SimpleProxyProvider implements ProxyProvider {

    private final List<Proxy> proxies;

    /** Index of the most recently handed-out proxy; starts at -1 so the first call yields index 0. */
    private final AtomicInteger pointer;

    /**
     * Creates a provider over the given list. The list is used as given, not copied.
     *
     * @param proxies the proxies to rotate through
     */
    public SimpleProxyProvider(List<Proxy> proxies) {
        this(proxies, new AtomicInteger(-1));
    }

    private SimpleProxyProvider(List<Proxy> proxies, AtomicInteger pointer) {
        this.proxies = proxies;
        this.pointer = pointer;
    }

    /**
     * Creates a provider over an unmodifiable copy of the given proxies.
     *
     * @param proxies the proxies to rotate through
     * @return the provider
     */
    public static SimpleProxyProvider from(Proxy... proxies) {
        List<Proxy> proxiesTemp = new ArrayList<Proxy>(proxies.length);
        Collections.addAll(proxiesTemp, proxies);
        return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp));
    }

    @Override
    public void returnProxy(Proxy proxy, Page page, Task task) {
        // nothing to do: this provider keeps no per-proxy state
    }

    /**
     * Returns the next proxy in round-robin order.
     *
     * @param request the request (not used here)
     * @param task    the download task (not used here)
     * @return the next proxy
     */
    @Override
    public Proxy getProxy(Request request, Task task) {
        return proxies.get(incrForLoop());
    }

    /**
     * Advances the pointer by one, wrapping at the list size, and returns the new index.
     * A single atomic update gives every caller a distinct successive index; the earlier
     * increment-then-compare-and-set sequence could hand the same index to two threads
     * and skip another one when they raced at the wrap-around point.
     */
    private int incrForLoop() {
        final int size = proxies.size();
        return pointer.updateAndGet(current -> (current + 1) % size);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* Remove duplicate urls and only push urls which are not duplicate.
*
* @author code4crafer@gmail.com
* @since 0.5.0
*/
public abstract class DuplicateRemovedScheduler implements Scheduler {

    protected Logger logger = LoggerFactory.getLogger(getClass());

    private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();

    public DuplicateRemover getDuplicateRemover() {
        return duplicatedRemover;
    }

    /**
     * Replaces the duplicate remover.
     *
     * @param duplicatedRemover the remover to use from now on
     * @return this scheduler, for chaining
     */
    public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
        this.duplicatedRemover = duplicatedRemover;
        return this;
    }

    /**
     * Forwards the request to {@link #pushWhenNoDuplicate(Request, Task)}
     * unless it is a duplicate that must be dropped.
     *
     * @param request the candidate request
     * @param task    the task the request belongs to
     */
    @Override
    public void push(Request request, Task task) {
        logger.trace("get a candidate url {}", request.getUrl());
        if (!accepts(request, task)) {
            return;
        }
        logger.debug("push to queue {}", request.getUrl());
        pushWhenNoDuplicate(request, task);
    }

    /**
     * Decides whether a request goes through. The checks run in this order
     * and stop at the first that passes, so the duplicate remover is only
     * consulted when the first two do not apply.
     */
    private boolean accepts(Request request, Task task) {
        if (shouldReserved(request)) {
            return true;
        }
        if (noNeedToRemoveDuplicate(request)) {
            return true;
        }
        return !duplicatedRemover.isDuplicate(request, task);
    }

    /**
     * @param request the candidate request
     * @return true when the request carries the {@link Request#CYCLE_TRIED_TIMES} extra
     */
    protected boolean shouldReserved(Request request) {
        return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
    }

    /**
     * @param request the candidate request
     * @return true for POST requests, which are never de-duplicated
     */
    protected boolean noNeedToRemoveDuplicate(Request request) {
        return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
    }

    /**
     * Hook for subclasses to store an accepted request. Does nothing by default.
     *
     * @param request the accepted request
     * @param task    the task the request belongs to
     */
    protected void pushWhenNoDuplicate(Request request, Task task) {
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Task;
/**
* The scheduler whose requests can be counted for monitor.
*
* @author code4crafter@gmail.com
* @since 0.5.0
*/
public interface MonitorableScheduler extends Scheduler {

    /**
     * Returns the number of requests still waiting to be polled.
     *
     * @param task the task being monitored
     * @return number of pending requests
     */
    int getLeftRequestsCount(Task task);

    /**
     * Returns the total number of requests counted for the task.
     *
     * @param task the task being monitored
     * @return number of total requests
     */
    int getTotalRequestsCount(Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.NumberUtils;
import java.util.Comparator;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.PriorityBlockingQueue;
/**
* Priority scheduler. Request with higher priority will poll earlier.
*
* @author code4crafter@gmail.com
* @since 0.2.1
*/
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {

    public static final int INITIAL_CAPACITY = 5;

    /** Orders requests so that the one with the highest priority value is polled first. */
    private static final Comparator<Request> HIGHEST_PRIORITY_FIRST = new Comparator<Request>() {
        @Override
        public int compare(Request o1, Request o2) {
            return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority());
        }
    };

    /** Requests with priority 0, served in FIFO order. */
    private BlockingQueue<Request> noPriorityQueue = new LinkedBlockingQueue<Request>();

    /** Requests with priority above 0. */
    private PriorityBlockingQueue<Request> priorityQueuePlus =
            new PriorityBlockingQueue<Request>(INITIAL_CAPACITY, HIGHEST_PRIORITY_FIRST);

    /** Requests with priority below 0. */
    private PriorityBlockingQueue<Request> priorityQueueMinus =
            new PriorityBlockingQueue<Request>(INITIAL_CAPACITY, HIGHEST_PRIORITY_FIRST);

    /**
     * Stores the request in the queue that matches the sign of its priority.
     *
     * @param request the accepted request
     * @param task    the task the request belongs to (not used here)
     */
    @Override
    public void pushWhenNoDuplicate(Request request, Task task) {
        if (request.getPriority() == 0) {
            noPriorityQueue.add(request);
        } else if (request.getPriority() > 0) {
            priorityQueuePlus.put(request);
        } else {
            priorityQueueMinus.put(request);
        }
    }

    /**
     * Returns the next request: positive priorities first, then priority 0,
     * then negative priorities.
     *
     * @param task the task of the spider (not used here)
     * @return the next request, or {@code null} when all queues are empty
     */
    @Override
    public synchronized Request poll(Task task) {
        Request poll = priorityQueuePlus.poll();
        if (poll != null) {
            return poll;
        }
        poll = noPriorityQueue.poll();
        if (poll != null) {
            return poll;
        }
        return priorityQueueMinus.poll();
    }

    /**
     * Returns the number of requests waiting across all three queues.
     * Counting only the zero-priority queue would hide every pending request
     * with a non-zero priority from the monitor.
     *
     * @param task the task being monitored (not used here)
     * @return number of pending requests
     */
    @Override
    public int getLeftRequestsCount(Task task) {
        return noPriorityQueue.size() + priorityQueuePlus.size() + priorityQueueMinus.size();
    }

    /**
     * @param task the task being monitored
     * @return the total count reported by the duplicate remover
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return getDuplicateRemover().getTotalRequestsCount(task);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* Basic Scheduler implementation.
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
*
* Note: if you use this {@link QueueScheduler}
* with {@link Site#getCycleRetryTimes()} enabled, you may encountered dead-lock
* when the queue is full.
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {

    private final BlockingQueue<Request> queue;

    /**
     * Creates a {@code QueueScheduler} with the default capacity of
     * {@link LinkedBlockingQueue}, which is {@link Integer#MAX_VALUE}.
     */
    public QueueScheduler() {
        this(Integer.MAX_VALUE);
    }

    /**
     * Creates a {@code QueueScheduler} with the given (fixed) capacity.
     *
     * @param capacity the capacity of this queue,
     * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)}
     * @since 0.8.0
     */
    public QueueScheduler(int capacity) {
        this.queue = new LinkedBlockingQueue<>(capacity);
    }

    /**
     * Enqueues the request, blocking while the queue is full. If the thread
     * is interrupted while waiting, the interrupt flag is restored and the
     * request is not enqueued.
     *
     * @param request the accepted request
     * @param task    the task the request belongs to (not used here)
     */
    @Override
    public void pushWhenNoDuplicate(Request request, Task task) {
        int remaining = this.queue.remainingCapacity();
        logger.trace("Remaining capacity: {}", remaining);

        try {
            queue.put(request);
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * @param task the task of the spider (not used here)
     * @return the head of the queue, or {@code null} when it is empty
     */
    @Override
    public Request poll(Task task) {
        return queue.poll();
    }

    /**
     * @param task the task being monitored (not used here)
     * @return number of requests currently in the queue
     */
    @Override
    public int getLeftRequestsCount(Task task) {
        return queue.size();
    }

    /**
     * @param task the task being monitored
     * @return the total count reported by the duplicate remover
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return getDuplicateRemover().getTotalRequestsCount(task);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
* Scheduler is the part of url management.
* You can implement interface Scheduler to do:
* manage urls to fetch
* remove duplicate urls
*
* @author code4crafter@gmail.com
* @since 0.1.0
*/
public interface Scheduler {

    /**
     * Adds a request to be fetched.
     *
     * @param request the request to schedule
     * @param task    the task the request belongs to
     */
    void push(Request request, Task task);

    /**
     * Takes the next request to crawl.
     *
     * @param task the task of spider
     * @return the request to crawl
     */
    Request poll(Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
================================================
package us.codecraft.webmagic.scheduler.component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
* Remove duplicate requests.
* @author code4crafer@gmail.com
* @since 0.5.1
*/
public interface DuplicateRemover {

    /**
     * Checks whether the request is a duplicate.
     *
     * @param request the request to check
     * @param task    the task the request belongs to
     * @return true if is duplicate
     */
    boolean isDuplicate(Request request, Task task);

    /**
     * Resets the duplicate check.
     *
     * @param task the task whose check is reset
     */
    void resetDuplicateCheck(Task task);

    /**
     * Returns the total request count, for monitoring.
     *
     * @param task the task being monitored
     * @return number of total request
     */
    int getTotalRequestsCount(Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
================================================
package us.codecraft.webmagic.scheduler.component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
* @author code4crafer@gmail.com
*/
public class HashSetDuplicateRemover implements DuplicateRemover {

    /** Keys of every request seen so far, held in a concurrent set. */
    private final Set<String> urls = ConcurrentHashMap.newKeySet();

    /**
     * Records the request's key and reports whether it was already present.
     *
     * @param request the request to check
     * @param task    the task the request belongs to (not used here)
     * @return true if the key had been recorded before
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        boolean firstTimeSeen = urls.add(getUrl(request));
        return !firstTimeSeen;
    }

    /**
     * Returns the key used for duplicate detection; the request URL by default.
     *
     * @param request the request
     * @return the de-duplication key
     */
    protected String getUrl(Request request) {
        return request.getUrl();
    }

    /**
     * Forgets every recorded key.
     *
     * @param task the task whose check is reset (not used here)
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        urls.clear();
    }

    /**
     * @param task the task being monitored (not used here)
     * @return number of distinct keys recorded
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return urls.size();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html
================================================
Component of scheduler.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html
================================================
Scheduler is the part of url management.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
/**
* @author code4crafer@gmail.com
* @since 0.5.2
*/
public abstract class AbstractSelectable implements Selectable {

    /**
     * @return the texts this selectable operates on
     */
    protected abstract List<String> getSourceTexts();

    @Override
    public Selectable css(String selector) {
        return $(selector);
    }

    @Override
    public Selectable css(String selector, String attrName) {
        return $(selector, attrName);
    }

    /**
     * Applies the selector to each text and keeps the non-null results.
     *
     * @param selector the selector to apply
     * @param strings  the texts to select from
     * @return the results wrapped as {@link PlainText}
     */
    protected Selectable select(Selector selector, List<String> strings) {
        List<String> matched = new ArrayList<String>();
        for (String source : strings) {
            String hit = selector.select(source);
            if (hit == null) {
                continue;
            }
            matched.add(hit);
        }
        return new PlainText(matched);
    }

    /**
     * Applies the selector to each text and concatenates all result lists.
     *
     * @param selector the selector to apply
     * @param strings  the texts to select from
     * @return the results wrapped as {@link PlainText}
     */
    protected Selectable selectList(Selector selector, List<String> strings) {
        List<String> matched = new ArrayList<String>();
        for (String source : strings) {
            matched.addAll(selector.selectList(source));
        }
        return new PlainText(matched);
    }

    @Override
    public List<String> all() {
        return getSourceTexts();
    }

    @Override
    public Selectable jsonPath(String jsonPath) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the first element of {@link #all()}, or {@code null} when there is none
     */
    @Override
    public String get() {
        List<String> texts = all();
        return CollectionUtils.isNotEmpty(texts) ? texts.get(0) : null;
    }

    @Override
    public Selectable select(Selector selector) {
        return select(selector, getSourceTexts());
    }

    @Override
    public Selectable selectList(Selector selector) {
        return selectList(selector, getSourceTexts());
    }

    @Override
    public Selectable regex(String regex) {
        return selectList(Selectors.regex(regex), getSourceTexts());
    }

    @Override
    public Selectable regex(String regex, int group) {
        return selectList(Selectors.regex(regex, group), getSourceTexts());
    }

    @Override
    public Selectable replace(String regex, String replacement) {
        return select(new ReplaceSelector(regex, replacement), getSourceTexts());
    }

    /**
     * @return the first source text, or {@code null} when there is none
     */
    public String getFirstSourceText() {
        List<String> texts = getSourceTexts();
        return CollectionUtils.isNotEmpty(texts) ? texts.get(0) : null;
    }

    /**
     * @return same as {@link #get()}; may be {@code null}
     */
    @Override
    public String toString() {
        return get();
    }

    /**
     * @return true when there is at least one source text
     */
    @Override
    public boolean match() {
        return CollectionUtils.isNotEmpty(getSourceTexts());
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
* All selectors will be arranged as a pipeline.
* The next selector uses the result of the previous as source.
* @author code4crafter@gmail.com
* @since 0.2.0
*/
public class AndSelector implements Selector {

    private List<Selector> selectors = new ArrayList<Selector>();

    /**
     * @param selectors the selectors to chain, in the order they are applied
     */
    public AndSelector(Selector... selectors) {
        for (Selector selector : selectors) {
            this.selectors.add(selector);
        }
    }

    /**
     * @param selectors the selectors to chain; the list is used as given, not copied
     */
    public AndSelector(List<Selector> selectors) {
        this.selectors = selectors;
    }

    /**
     * Feeds the text through every selector in turn, stopping as soon as one
     * yields {@code null}.
     *
     * @param text the source text
     * @return the result of the last selector, or {@code null} if any stage yielded {@code null}
     */
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
            if (text == null) {
                return null;
            }
            text = selector.select(text);
        }
        return text;
    }

    /**
     * Applies the first selector to the text, then each following selector to
     * every result of the previous stage. Stops early and returns the current
     * (null or empty) results as soon as a stage produces nothing.
     *
     * @param text the source text
     * @return the results of the last stage that ran; an empty list when there are no selectors
     */
    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        boolean first = true;
        for (Selector selector : selectors) {
            if (first) {
                results = selector.selectList(text);
                first = false;
            } else {
                List<String> resultsTemp = new ArrayList<String>();
                for (String result : results) {
                    List<String> selected = selector.selectList(result);
                    // a selector may report "no match" as null; treat it as no results
                    if (selected != null) {
                        resultsTemp.addAll(selected);
                    }
                }
                results = resultsTemp;
            }
            // checked after every stage, including the first, so a null result never
            // reaches the iteration of the next stage
            if (results == null || results.isEmpty()) {
                return results;
            }
        }
        return results;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
================================================
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.utils.BaseSelectorUtils;
import java.util.ArrayList;
import java.util.List;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public abstract class BaseElementSelector implements Selector, ElementSelector {
private Document parse(String text) {
// Jsoup could not parse