Showing preview only (1,150K chars total). Download the full file or copy to clipboard to get everything.
Repository: code4craft/webmagic
Branch: develop
Commit: 67816a19d68a
Files: 310
Total size: 1.0 MB
Directory structure:
gitextract_m56n222u/
├── .gitignore
├── .travis.yml
├── LICENSE
├── README-zh.md
├── README.md
├── pom.xml
├── src/
│ └── site/
│ └── site.xml
├── webmagic-core/
│ ├── README.md
│ ├── module_webmagic-core.xml
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ └── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── Page.java
│ │ ├── Request.java
│ │ ├── ResultItems.java
│ │ ├── Site.java
│ │ ├── Spider.java
│ │ ├── SpiderListener.java
│ │ ├── SpiderScheduler.java
│ │ ├── Task.java
│ │ ├── downloader/
│ │ │ ├── AbstractDownloader.java
│ │ │ ├── CustomRedirectStrategy.java
│ │ │ ├── Downloader.java
│ │ │ ├── HttpClientDownloader.java
│ │ │ ├── HttpClientGenerator.java
│ │ │ ├── HttpClientRequestContext.java
│ │ │ ├── HttpUriRequestConverter.java
│ │ │ └── package.html
│ │ ├── model/
│ │ │ └── HttpRequestBody.java
│ │ ├── package.html
│ │ ├── pipeline/
│ │ │ ├── CollectorPipeline.java
│ │ │ ├── ConsolePipeline.java
│ │ │ ├── FilePipeline.java
│ │ │ ├── Pipeline.java
│ │ │ ├── ResultItemsCollectorPipeline.java
│ │ │ └── package.html
│ │ ├── processor/
│ │ │ ├── PageProcessor.java
│ │ │ ├── SimplePageProcessor.java
│ │ │ ├── example/
│ │ │ │ ├── BaiduBaikePageProcessor.java
│ │ │ │ ├── GithubRepoPageProcessor.java
│ │ │ │ └── ZhihuPageProcessor.java
│ │ │ └── package.html
│ │ ├── proxy/
│ │ │ ├── Proxy.java
│ │ │ ├── ProxyProvider.java
│ │ │ └── SimpleProxyProvider.java
│ │ ├── scheduler/
│ │ │ ├── DuplicateRemovedScheduler.java
│ │ │ ├── MonitorableScheduler.java
│ │ │ ├── PriorityScheduler.java
│ │ │ ├── QueueScheduler.java
│ │ │ ├── Scheduler.java
│ │ │ ├── component/
│ │ │ │ ├── DuplicateRemover.java
│ │ │ │ ├── HashSetDuplicateRemover.java
│ │ │ │ └── package.html
│ │ │ └── package.html
│ │ ├── selector/
│ │ │ ├── AbstractSelectable.java
│ │ │ ├── AndSelector.java
│ │ │ ├── BaseElementSelector.java
│ │ │ ├── CssSelector.java
│ │ │ ├── ElementSelector.java
│ │ │ ├── Html.java
│ │ │ ├── HtmlNode.java
│ │ │ ├── Json.java
│ │ │ ├── JsonPathSelector.java
│ │ │ ├── LinksSelector.java
│ │ │ ├── OrSelector.java
│ │ │ ├── PlainText.java
│ │ │ ├── RegexResult.java
│ │ │ ├── RegexSelector.java
│ │ │ ├── ReplaceSelector.java
│ │ │ ├── Selectable.java
│ │ │ ├── Selector.java
│ │ │ ├── Selectors.java
│ │ │ ├── SmartContentSelector.java
│ │ │ ├── XpathSelector.java
│ │ │ └── package.html
│ │ ├── thread/
│ │ │ └── CountableThreadPool.java
│ │ └── utils/
│ │ ├── BaseSelectorUtils.java
│ │ ├── CharsetUtils.java
│ │ ├── Experimental.java
│ │ ├── FilePersistentBase.java
│ │ ├── HttpClientUtils.java
│ │ ├── HttpConstant.java
│ │ ├── NumberUtils.java
│ │ ├── ProxyUtils.java
│ │ ├── UrlUtils.java
│ │ ├── WMCollections.java
│ │ └── package.html
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── HtmlTest.java
│ │ ├── RequestTest.java
│ │ ├── ResultItemsTest.java
│ │ ├── SiteTest.java
│ │ ├── SpiderTest.java
│ │ ├── downloader/
│ │ │ ├── HttpClientDownloaderTest.java
│ │ │ ├── HttpUriRequestConverterTest.java
│ │ │ ├── MockGithubDownloader.java
│ │ │ └── SSLCompatibilityTest.java
│ │ ├── example/
│ │ │ └── GithubRepoPageProcessorTest.java
│ │ ├── pipeline/
│ │ │ └── FilePipelineTest.java
│ │ ├── processor/
│ │ │ └── PageProcessorTest.java
│ │ ├── proxy/
│ │ │ ├── ProxyTest.java
│ │ │ └── SimpleProxyProviderTest.java
│ │ ├── scheduler/
│ │ │ ├── DuplicateRemovedSchedulerTest.java
│ │ │ └── PrioritySchedulerTest.java
│ │ ├── selector/
│ │ │ ├── AndSelectorTest.java
│ │ │ ├── CssSelectorTest.java
│ │ │ ├── ExtractorsTest.java
│ │ │ ├── JsonPathSelectorTest.java
│ │ │ ├── JsonTest.java
│ │ │ ├── LinksSelectorTest.java
│ │ │ ├── OrSelectorTest.java
│ │ │ ├── RegexSelectorTest.java
│ │ │ └── SelectorTest.java
│ │ └── utils/
│ │ ├── CharsetUtilsTest.java
│ │ ├── NumberUtilsTest.java
│ │ └── UrlUtilsTest.java
│ └── resources/
│ ├── html/
│ │ └── mock-github.html
│ └── log4j2-test.xml
├── webmagic-coverage/
│ └── pom.xml
├── webmagic-extension/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ ├── MultiPageModel.java
│ │ │ ├── SimpleHttpClient.java
│ │ │ ├── configurable/
│ │ │ │ ├── ConfigurablePageProcessor.java
│ │ │ │ ├── ExpressionType.java
│ │ │ │ └── ExtractRule.java
│ │ │ ├── downloader/
│ │ │ │ └── PhantomJSDownloader.java
│ │ │ ├── example/
│ │ │ │ ├── AppStore.java
│ │ │ │ ├── BaiduBaike.java
│ │ │ │ ├── GithubRepo.java
│ │ │ │ ├── GithubRepoApi.java
│ │ │ │ ├── GithubRepoPageMapper.java
│ │ │ │ ├── MonitorExample.java
│ │ │ │ ├── OschinaBlog.java
│ │ │ │ └── PatternProcessorExample.java
│ │ │ ├── handler/
│ │ │ │ ├── CompositePageProcessor.java
│ │ │ │ ├── CompositePipeline.java
│ │ │ │ ├── PatternProcessor.java
│ │ │ │ ├── PatternRequestMatcher.java
│ │ │ │ ├── RequestMatcher.java
│ │ │ │ ├── SubPageProcessor.java
│ │ │ │ └── SubPipeline.java
│ │ │ ├── model/
│ │ │ │ ├── AfterExtractor.java
│ │ │ │ ├── ConsolePageModelPipeline.java
│ │ │ │ ├── Extractor.java
│ │ │ │ ├── FieldExtractor.java
│ │ │ │ ├── HasKey.java
│ │ │ │ ├── ModelPageProcessor.java
│ │ │ │ ├── ModelPipeline.java
│ │ │ │ ├── OOSpider.java
│ │ │ │ ├── PageMapper.java
│ │ │ │ ├── PageModelCollectorPipeline.java
│ │ │ │ ├── PageModelExtractor.java
│ │ │ │ ├── annotation/
│ │ │ │ │ ├── ComboExtract.java
│ │ │ │ │ ├── ExtractBy.java
│ │ │ │ │ ├── ExtractByUrl.java
│ │ │ │ │ ├── Formatter.java
│ │ │ │ │ ├── HelpUrl.java
│ │ │ │ │ ├── TargetUrl.java
│ │ │ │ │ └── package.html
│ │ │ │ ├── fields/
│ │ │ │ │ ├── MultipleField.java
│ │ │ │ │ ├── PageField.java
│ │ │ │ │ └── SingleField.java
│ │ │ │ ├── formatter/
│ │ │ │ │ ├── BasicClassDetector.java
│ │ │ │ │ ├── BasicTypeFormatter.java
│ │ │ │ │ ├── DateFormatter.java
│ │ │ │ │ ├── ObjectFormatter.java
│ │ │ │ │ ├── ObjectFormatterBuilder.java
│ │ │ │ │ └── ObjectFormatters.java
│ │ │ │ ├── package.html
│ │ │ │ └── sources/
│ │ │ │ ├── Source.java
│ │ │ │ └── SourceTextExtractor.java
│ │ │ ├── monitor/
│ │ │ │ ├── SpiderMonitor.java
│ │ │ │ ├── SpiderStatus.java
│ │ │ │ └── SpiderStatusMXBean.java
│ │ │ ├── pipeline/
│ │ │ │ ├── CollectorPageModelPipeline.java
│ │ │ │ ├── FilePageModelPipeline.java
│ │ │ │ ├── JsonFilePageModelPipeline.java
│ │ │ │ ├── JsonFilePipeline.java
│ │ │ │ ├── MultiPagePipeline.java
│ │ │ │ └── PageModelPipeline.java
│ │ │ ├── scheduler/
│ │ │ │ ├── BloomFilterDuplicateRemover.java
│ │ │ │ ├── FileCacheQueueScheduler.java
│ │ │ │ ├── RedisPriorityScheduler.java
│ │ │ │ └── RedisScheduler.java
│ │ │ └── utils/
│ │ │ ├── ClassUtils.java
│ │ │ ├── DoubleKeyMap.java
│ │ │ ├── ExtractorUtils.java
│ │ │ ├── IPUtils.java
│ │ │ ├── MultiKeyMapBase.java
│ │ │ └── RequestUtils.java
│ │ └── resources/
│ │ ├── crawl.js
│ │ └── spider-config-draft.xml
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── MockPageModelPipeline.java
│ │ ├── MockPipeline.java
│ │ ├── SimpleHttpClientTest.java
│ │ ├── configurable/
│ │ │ └── ConfigurablePageProcessorTest.java
│ │ ├── downloader/
│ │ │ └── MockGithubDownloader.java
│ │ ├── formatter/
│ │ │ └── DateFormatterTest.java
│ │ ├── model/
│ │ │ ├── BaseRepo.java
│ │ │ ├── GithubRepo.java
│ │ │ ├── GithubRepoApi.java
│ │ │ ├── GithubRepoTest.java
│ │ │ ├── ModelPageProcessorTest.java
│ │ │ ├── PageMapperTest.java
│ │ │ ├── PageMocker.java
│ │ │ └── PageModelExtractorTest.java
│ │ ├── monitor/
│ │ │ ├── CustomSpiderStatus.java
│ │ │ ├── CustomSpiderStatusMXBean.java
│ │ │ ├── SeedUrlWithPortTest.java
│ │ │ └── SpiderMonitorTest.java
│ │ ├── processor/
│ │ │ └── GithubRepoProcessor.java
│ │ ├── scheduler/
│ │ │ ├── BloomFilterDuplicateRemoverTest.java
│ │ │ ├── RedisPrioritySchedulerTest.java
│ │ │ └── RedisSchedulerTest.java
│ │ └── utils/
│ │ ├── IPUtilsTest.java
│ │ └── RequestUtilsTest.java
│ └── resources/
│ ├── html/
│ │ ├── mock-github.html
│ │ └── mock-webmagic.html
│ ├── json/
│ │ └── mock-githubrepo.json
│ └── log4j2-test.xml
├── webmagic-samples/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ ├── main/
│ │ │ │ └── QuickStarter.java
│ │ │ ├── model/
│ │ │ │ └── samples/
│ │ │ │ ├── BaiduNews.java
│ │ │ │ ├── Blog.java
│ │ │ │ ├── DianpingFtlDataScanner.java
│ │ │ │ ├── GithubRepo.java
│ │ │ │ ├── IteyeBlog.java
│ │ │ │ ├── JokejiModel.java
│ │ │ │ ├── Kr36NewsModel.java
│ │ │ │ ├── News163.java
│ │ │ │ ├── OschinaAnswer.java
│ │ │ │ ├── OschinaBlog.java
│ │ │ │ └── QQMeishi.java
│ │ │ ├── recover/
│ │ │ │ ├── DuplicateStorageRemover.java
│ │ │ │ ├── MmapQueueScheduler.java
│ │ │ │ └── RecoverSample.java
│ │ │ └── samples/
│ │ │ ├── AlexanderMcqueenGoodsProcessor.java
│ │ │ ├── AmanzonPageProcessor.java
│ │ │ ├── AngularJSProcessor.java
│ │ │ ├── DiandianBlogProcessor.java
│ │ │ ├── DiaoyuwengProcessor.java
│ │ │ ├── F58PageProcesser.java
│ │ │ ├── GithubRepo.java
│ │ │ ├── GithubRepoPageProcessor.java
│ │ │ ├── HuxiuProcessor.java
│ │ │ ├── InfoQMiniBookProcessor.java
│ │ │ ├── IteyeBlogProcessor.java
│ │ │ ├── KaichibaProcessor.java
│ │ │ ├── MamacnPageProcessor.java
│ │ │ ├── MeicanProcessor.java
│ │ │ ├── NjuBBSProcessor.java
│ │ │ ├── PhantomJSPageProcessor.java
│ │ │ ├── QzoneBlogProcessor.java
│ │ │ ├── SinaBlogProcessor.java
│ │ │ ├── TianyaPageProcesser.java
│ │ │ ├── ZhihuPageProcessor.java
│ │ │ ├── formatter/
│ │ │ │ └── StringTemplateFormatter.java
│ │ │ ├── pipeline/
│ │ │ │ ├── OneFilePipeline.java
│ │ │ │ └── ReplacePipeline.java
│ │ │ └── scheduler/
│ │ │ ├── DelayQueueScheduler.java
│ │ │ ├── LevelLimitScheduler.java
│ │ │ └── ZipCodePageProcessor.java
│ │ └── resources/
│ │ ├── crawl.js
│ │ └── log4j2.xml
│ └── test/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ ├── SpiderTest.java
│ ├── model/
│ │ └── ProcessorBenchmark.java
│ ├── processor/
│ │ └── SinablogProcessorTest.java
│ └── samples/
│ └── scheduler/
│ └── DelayQueueSchedulerTest.java
├── webmagic-saxon/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ └── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ └── selector/
│ │ ├── JaxpSelectorUtils.java
│ │ ├── NodeSelector.java
│ │ └── Xpath2Selector.java
│ └── test/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ └── selector/
│ └── XpathSelectorTest.java
├── webmagic-scripts/
│ ├── README.md
│ ├── deploy.sh
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── groovy/
│ │ │ └── Github.groovy
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ └── scripts/
│ │ │ ├── Params.java
│ │ │ ├── ScriptConsole.java
│ │ │ ├── ScriptEnginePool.java
│ │ │ ├── ScriptProcessor.java
│ │ │ ├── ScriptProcessorBuilder.java
│ │ │ ├── config/
│ │ │ │ ├── CommandLineOption.java
│ │ │ │ └── ConfigLogger.java
│ │ │ └── languages/
│ │ │ ├── JRuby.java
│ │ │ ├── Javascript.java
│ │ │ ├── Jython.java
│ │ │ └── Language.java
│ │ ├── kotlin/
│ │ │ └── Github.kt
│ │ └── resources/
│ │ ├── js/
│ │ │ ├── defines.js
│ │ │ ├── github.js
│ │ │ └── oschina.js
│ │ ├── python/
│ │ │ ├── defines.py
│ │ │ └── oschina.py
│ │ └── ruby/
│ │ ├── defines.rb
│ │ ├── github.rb
│ │ └── oschina.rb
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ └── scripts/
│ │ └── ScriptProcessorTest.java
│ └── resources/
│ └── log4j2-test.xml
└── webmagic-selenium/
├── README.md
├── config.ini
├── pom.xml
└── src/
├── main/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ └── downloader/
│ └── selenium/
│ ├── SeleniumDownloader.java
│ └── WebDriverPool.java
└── test/
├── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ ├── downloader/
│ │ ├── SeleniumTest.java
│ │ └── selenium/
│ │ ├── SeleniumDownloaderTest.java
│ │ └── WebDriverPoolTest.java
│ └── samples/
│ ├── GooglePlayProcessor.java
│ └── HuabanProcessor.java
└── resources/
└── config.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# https://github.com/takari/maven-wrapper#usage-without-binary-jar
.mvn/wrapper/maven-wrapper.jar
# Eclipse m2e generated files
# Eclipse Core
.project
# JDT-specific (Eclipse Java Development Tools)
.classpath
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# PyDev specific (Python IDE for Eclipse)
*.pydevproject
# CDT-specific (C/C++ Development Tooling)
.cproject
# CDT- autotools
.autotools
# Java annotation processor (APT)
.factorypath
# PDT-specific (PHP Development Tools)
.buildpath
# sbteclipse plugin
.target
# Tern plugin
.tern-project
# TeXlipse plugin
.texlipse
# STS (Spring Tool Suite)
.springBeans
# Code Recommenders
.recommenders/
# Annotation Processing
.apt_generated/
.apt_generated_test/
# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet
# Uncomment this line if you wish to ignore the project description file.
# Typically, this file would be tracked if it contains build/dependency configurations:
#.project
================================================
FILE: .travis.yml
================================================
language: java
jdk:
- openjdk9
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, "control" means (i) the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising
permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.
"Object" form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
"submitted" means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.
2. Grant of Copyright License.
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.
3. Grant of Patent License.
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.
4. Redistribution.
You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of
this License; and
You must cause any modified files to carry prominent notices stating that You
changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.
5. Submission of Contributions.
Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.
6. Trademarks.
This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.
7. Disclaimer of Warranty.
Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.
8. Limitation of Liability.
In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability.
While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work
To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets "{}" replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.
Copyright 2025 code4craft
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README-zh.md
================================================

[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
[](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
官方网站[http://webmagic.io/](http://webmagic.io/)
>webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。
webmagic的主要特色:
* 完全模块化的设计,强大的可扩展性。
* 核心简单但是涵盖爬虫的全部流程,灵活而强大,也是学习爬虫入门的好材料。
* 提供丰富的抽取页面API。
* 无配置,但是可通过POJO+注解形式实现一个爬虫。
* 支持多线程。
* 支持分布式。
* 支持爬取js动态渲染的页面。
* 无框架依赖,可以灵活的嵌入到项目中去。
webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者:
python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy)
Java爬虫 **Spiderman** [http://git.oschina.net/l-weiwei/spiderman](http://git.oschina.net/l-weiwei/spiderman)
webmagic的github地址:[https://github.com/code4craft/webmagic](https://github.com/code4craft/webmagic)。
## 快速开始
### 使用maven
webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic:
```xml
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${webmagic.version}</version>
</dependency>
```
WebMagic 使用slf4j-log4j12作为slf4j的实现。如果你自己定制了slf4j的实现,请在项目中去掉此依赖。
```xml
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
```
#### 项目结构
webmagic主要包括两个包:
* **webmagic-core**
webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
* **webmagic-extension**
webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。
webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:
* **webmagic-saxon**
webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。
* **webmagic-selenium**
webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。
在项目中,你可以根据需要依赖不同的包。
### 不使用maven
在项目的**lib**目录下,有依赖的所有jar包,直接在IDE里import即可。
### 第一个爬虫
#### 定制PageProcessor
PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码:
```java
public class OschinaBlogPageProcessor implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net");
@Override
public void process(Page page) {
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
page.addTargetRequests(links);
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString());
page.putField("content", page.getHtml().$("div.content").toString());
page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog")
.addPipeline(new ConsolePipeline()).run();
}
}
```
这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。
Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。
执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。
#### 使用注解
webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcessor完全相同:
```java
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {
@ExtractBy("//title")
private String title;
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
private String content;
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
public static void main(String[] args) {
OOSpider.create(
Site.me(),
new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run();
}
}
```
这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。
### 详细文档
见[http://webmagic.io/docs/](http://webmagic.io/docs/)。
### 示例
webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。
webmagic的使用可以参考:[oschina openapi 应用:博客搬家](https://git.oschina.net/yashin/MoveBlog)
### 协议
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
### 邮件组:
Gmail:
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java)
QQ:
[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988)
### QQ群:
373225642(已满) 542327088
### 相关项目:
[Gather Platform](https://github.com/gsh199449/spider)
Gather Platform 数据抓取平台是一套基于Webmagic内核的,具有Web任务配置和任务管理界面的数据采集与搜索平台。
================================================
FILE: README.md
================================================

[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md)
[](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/)
[](https://www.apache.org/licenses/LICENSE-2.0.html)
[](https://travis-ci.org/code4craft/webmagic)
>A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction and persistence. It can simplify the development of a specific crawler.
## Features:
* Simple core with high flexibility.
* Simple API for html extracting.
* Annotate a POJO to customize a crawler; no configuration needed.
* Multi-threading and distributed crawling support.
* Easy to integrate.
## Install:
Add dependencies to your pom.xml:
```xml
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${webmagic.version}</version>
</dependency>
```
WebMagic uses slf4j with the slf4j-log4j12 implementation. If you use your own slf4j implementation, please exclude slf4j-log4j12.
```xml
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
```
## Get Started:
### First crawler:
Write a class that implements PageProcessor. For example, here is a crawler that collects GitHub repository information.
```java
public class GithubRepoPageProcessor implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
@Override
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
}
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
}
}
```
* `page.addTargetRequests(links)`
Add urls for crawling.
You can also use the annotation-based approach:
```java
@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl("https://github.com/\\w+")
public class GithubRepo {
@ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
private String author;
@ExtractBy("//div[@id='readme']/tidyText()")
private String readme;
public static void main(String[] args) {
OOSpider.create(Site.me().setSleepTime(1000)
, new ConsolePageModelPipeline(), GithubRepo.class)
.addUrl("https://github.com/code4craft").thread(5).run();
}
}
```
### Docs and samples:
Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/)
The architecture of WebMagic (inspired by [Scrapy](http://scrapy.org/)):

There are more examples in `webmagic-samples` package.
### License:
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
### Thanks:
To write webmagic, I referred to the projects below:
* **Scrapy**
A crawler framework in Python.
[http://scrapy.org/](http://scrapy.org/)
* **Spiderman**
Another crawler framework in Java.
[http://git.oschina.net/l-weiwei/spiderman](http://git.oschina.net/l-weiwei/spiderman)
### Mail-list:
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java)
[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988)
QQ Group: 373225642 542327088
### Related Project
* <a href="https://github.com/gsh199449/spider" target="_blank">Gather Platform</a>
A web console based on WebMagic for Spider configuration and management.
================================================
FILE: pom.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Root (aggregator) POM for WebMagic.
  * <properties> holds one version property per third-party library.
  * <dependencyManagement> pins those versions (and some scopes) so that the
    modules can declare dependencies without repeating a version.
  * <dependencies> lists the test/logging libraries every module receives.
-->
<project
    xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="
        http://maven.apache.org/POM/4.0.0
        http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.oxerr</groupId>
        <artifactId>oxerr-parent</artifactId>
        <version>2.3.1</version>
        <relativePath /> <!-- lookup parent from repository -->
    </parent>

    <groupId>us.codecraft</groupId>
    <version>1.0.4-SNAPSHOT</version>
    <packaging>pom</packaging>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <!-- Library versions, referenced from <dependencyManagement> below. -->
        <assertj.version>3.23.1</assertj.version>
        <commons-cli.version>1.5.0</commons-cli.version>
        <commons-collections4.version>4.4</commons-collections4.version>
        <commons-io.version>2.14.0</commons-io.version>
        <commons-lang3.version>3.12.0</commons-lang3.version>
        <fastjson.version>2.0.19.graal</fastjson.version>
        <groovy-all.version>3.0.13</groovy-all.version>
        <guava.version>32.0.0-jre</guava.version>
        <htmlcleaner.version>2.29</htmlcleaner.version>
        <httpclient.version>4.5.13</httpclient.version>
        <httpcore.version>4.4.15</httpcore.version>
        <jedis.version>3.7.1</jedis.version>
        <jruby.version>9.4.12.1</jruby.version>
        <json-path.version>2.9.0</json-path.version>
        <junit.version>5.10.2</junit.version>
        <junit.platform.version>1.10.2</junit.platform.version>
        <jython.version>2.7.3</jython.version>
        <log4j2.version>2.23.1</log4j2.version>
        <mockito-all.version>2.0.2-beta</mockito-all.version>
        <moco.version>1.3.0</moco.version>
        <phantomjsdriver.version>1.2.0</phantomjsdriver.version>
        <saxon-he.version>12.4</saxon-he.version>
        <selenium-java.version>4.14.1</selenium-java.version>
        <slf4j.version>2.0.4</slf4j.version>
        <spring-version>4.0.0.RELEASE</spring-version>
        <!-- Kept at 0.3.7, the version that was previously hard-coded in dependencyManagement. -->
        <xsoup.version>0.3.7</xsoup.version>
    </properties>

    <artifactId>webmagic</artifactId>
    <name>webmagic</name>
    <description>
        A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content
        extraction and persistence. It can simplify the development of a specific crawler.
    </description>
    <url>https://github.com/code4craft/webmagic/</url>

    <developers>
        <developer>
            <id>code4craft</id>
            <name>Yihua huang</name>
            <email>code4crafer@gmail.com</email>
        </developer>
        <developer>
            <id>yuany</id>
            <name>Ligang Yao</name>
            <email>ligang.yao@answers.com</email>
        </developer>
    </developers>

    <scm>
        <connection>scm:git:git@github.com:code4craft/webmagic.git</connection>
        <developerConnection>scm:git:git@github.com:code4craft/webmagic.git</developerConnection>
        <url>git@github.com:code4craft/webmagic.git</url>
        <tag>WebMagic-${project.version}</tag>
    </scm>

    <licenses>
        <license>
            <name>Apache License, Version 2.0</name>
            <url>http://www.apache.org/licenses/LICENSE-2.0</url>
        </license>
    </licenses>

    <modules>
        <module>webmagic-core</module>
        <module>webmagic-extension/</module>
        <module>webmagic-scripts/</module>
        <module>webmagic-selenium</module>
        <module>webmagic-saxon</module>
        <module>webmagic-samples</module>
        <module>webmagic-coverage</module>
    </modules>

    <!-- Test-scoped logging and JUnit libraries inherited by every module;
         versions are supplied by <dependencyManagement>. -->
    <dependencies>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j2-impl</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-engine</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.vintage</groupId>
            <artifactId>junit-vintage-engine</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.platform</groupId>
            <artifactId>junit-platform-launcher</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.platform</groupId>
            <artifactId>junit-platform-runner</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <!-- Central version (and, where given, scope) definitions for all modules. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.mockito</groupId>
                <artifactId>mockito-all</artifactId>
                <version>${mockito-all.version}</version>
                <scope>test</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>${httpclient.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpcore</artifactId>
                <version>${httpcore.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-core</artifactId>
                <version>${log4j2.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-slf4j2-impl</artifactId>
                <version>${log4j2.version}</version>
            </dependency>
            <dependency>
                <groupId>com.google.guava</groupId>
                <artifactId>guava</artifactId>
                <version>${guava.version}</version>
            </dependency>
            <dependency>
                <groupId>com.jayway.jsonpath</groupId>
                <artifactId>json-path</artifactId>
                <version>${json-path.version}</version>
            </dependency>
            <dependency>
                <groupId>org.junit.jupiter</groupId>
                <artifactId>junit-jupiter-engine</artifactId>
                <version>${junit.version}</version>
            </dependency>
            <dependency>
                <groupId>org.junit.vintage</groupId>
                <artifactId>junit-vintage-engine</artifactId>
                <version>${junit.version}</version>
            </dependency>
            <dependency>
                <groupId>org.junit.platform</groupId>
                <artifactId>junit-platform-launcher</artifactId>
                <version>${junit.platform.version}</version>
            </dependency>
            <dependency>
                <groupId>org.junit.platform</groupId>
                <artifactId>junit-platform-runner</artifactId>
                <version>${junit.platform.version}</version>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
                <version>${slf4j.version}</version>
            </dependency>
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>xsoup</artifactId>
                <!-- Uses the shared property like every other managed dependency. -->
                <version>${xsoup.version}</version>
            </dependency>
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>${fastjson.version}</version>
            </dependency>
            <dependency>
                <groupId>com.github.dreamhead</groupId>
                <artifactId>moco-core</artifactId>
                <version>${moco.version}</version>
                <scope>test</scope>
                <exclusions>
                    <exclusion>
                        <groupId>org.slf4j</groupId>
                        <artifactId>slf4j-simple</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
            <dependency>
                <groupId>org.assertj</groupId>
                <artifactId>assertj-core</artifactId>
                <version>${assertj.version}</version>
                <scope>test</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>${commons-lang3.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-collections4</artifactId>
                <version>${commons-collections4.version}</version>
            </dependency>
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>${commons-io.version}</version>
            </dependency>
            <dependency>
                <groupId>org.codehaus.groovy</groupId>
                <artifactId>groovy-all</artifactId>
                <version>${groovy-all.version}</version>
            </dependency>
            <dependency>
                <groupId>org.jruby</groupId>
                <artifactId>jruby</artifactId>
                <version>${jruby.version}</version>
            </dependency>
            <dependency>
                <groupId>org.python</groupId>
                <artifactId>jython</artifactId>
                <version>${jython.version}</version>
            </dependency>
            <dependency>
                <groupId>org.seleniumhq.selenium</groupId>
                <artifactId>selenium-java</artifactId>
                <version>${selenium-java.version}</version>
            </dependency>
            <dependency>
                <groupId>net.sf.saxon</groupId>
                <artifactId>Saxon-HE</artifactId>
                <version>${saxon-he.version}</version>
            </dependency>
            <dependency>
                <groupId>net.sourceforge.htmlcleaner</groupId>
                <artifactId>htmlcleaner</artifactId>
                <version>${htmlcleaner.version}</version>
            </dependency>
            <dependency>
                <groupId>com.github.detro</groupId>
                <artifactId>phantomjsdriver</artifactId>
                <version>${phantomjsdriver.version}</version>
            </dependency>
            <dependency>
                <groupId>commons-cli</groupId>
                <artifactId>commons-cli</artifactId>
                <version>${commons-cli.version}</version>
            </dependency>
            <dependency>
                <groupId>redis.clients</groupId>
                <artifactId>jedis</artifactId>
                <version>${jedis.version}</version>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <!-- Javadoc: aggregated report during "site", plus a javadoc jar per module. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-javadoc-plugin</artifactId>
                <configuration>
                    <doctitle>WebMagic ${project.version}</doctitle>
                    <locale>en_US</locale>
                    <!-- avoid the issue: https://bugs.openjdk.java.net/browse/JDK-8212233 -->
                    <detectJavaApiLink>false</detectJavaApiLink>
                </configuration>
                <executions>
                    <execution>
                        <id>aggregate</id>
                        <goals>
                            <goal>aggregate</goal>
                        </goals>
                        <phase>site</phase>
                    </execution>
                    <execution>
                        <id>attach-javadocs</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Code coverage: attach the agent for tests, write the report at "verify". -->
            <plugin>
                <groupId>org.jacoco</groupId>
                <artifactId>jacoco-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>prepare-agent</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>report</id>
                        <phase>verify</phase>
                        <goals>
                            <goal>report</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Release tags use the "WebMagic-" prefix, matching <scm><tag> above. -->
            <plugin>
                <groupId>com.amashchenko.maven.plugin</groupId>
                <artifactId>gitflow-maven-plugin</artifactId>
                <configuration>
                    <gitFlowConfig>
                        <versionTagPrefix>WebMagic-</versionTagPrefix>
                    </gitFlowConfig>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
================================================
FILE: src/site/site.xml
================================================
<!-- Maven site descriptor: selects the site skin and the menus shown on generated pages. -->
<project xmlns="http://maven.apache.org/DECORATION/1.6.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.6.0
http://maven.apache.org/xsd/decoration-1.6.0.xsd">
<!-- Skin used to render the site. -->
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.11.1</version>
</skin>
<body>
<!-- Menus referenced by name: parent project, modules and reports. -->
<menu ref="parent" inherit="top" />
<menu ref="modules" inherit="top" />
<menu ref="reports" inherit="top" />
</body>
<custom>
<!-- Skin-specific options: top bar, side bar and source line numbers are switched on. -->
<fluidoSkin>
<topBarEnabled>true</topBarEnabled>
<sideBarEnabled>true</sideBarEnabled>
<sourceLineNumbersEnabled>true</sourceLineNumbersEnabled>
<copyrightClass>pull-right</copyrightClass>
</fluidoSkin>
</custom>
</project>
================================================
FILE: webmagic-core/README.md
================================================
webmagic-core
-------
webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。
================================================
FILE: webmagic-core/module_webmagic-core.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Ant build for the webmagic-core module. The default target compiles production and test classes.
  It relies on names that are not defined in this file (e.g. project.jdk.home, compiler.args,
  ignored.files, compiler.resources, register.custom.compilers, javac2 and the library.maven:* paths);
  presumably they come from an enclosing build file. TODO confirm.
  NOTE(review): the library versions referenced below (httpclient 4.2.4, guava 13.0.1, commons-io 1.3.2, ...)
  differ from the versions declared in the root pom.xml, so this file looks stale. Confirm before relying on it.
-->
<project name="module_webmagic-core" default="compile.module.webmagic-core">
<dirname property="module.webmagic-core.basedir" file="${ant.file.module_webmagic-core}"/>
<!-- JDK settings and compiler arguments are taken from project-level properties. -->
<property name="module.jdk.home.webmagic-core" value="${project.jdk.home}"/>
<property name="module.jdk.bin.webmagic-core" value="${project.jdk.bin}"/>
<property name="module.jdk.classpath.webmagic-core" value="${project.jdk.classpath}"/>
<property name="compiler.args.webmagic-core" value="${compiler.args}"/>
<!-- Output directories for production and test classes. -->
<property name="webmagic-core.output.dir" value="${module.webmagic-core.basedir}/target/classes"/>
<property name="webmagic-core.testoutput.dir" value="${module.webmagic-core.basedir}/target/test-classes"/>
<path id="webmagic-core.module.bootclasspath">
<!-- Paths to be included in compilation bootclasspath -->
</path>
<!-- Compile classpath for production sources: JDK plus libraries (no junit). -->
<path id="webmagic-core.module.production.classpath">
<path refid="${module.jdk.classpath.webmagic-core}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!-- Runtime classpath for production: compiled classes plus the same libraries. -->
<path id="webmagic-core.runtime.production.module.classpath">
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!-- Compile classpath for test sources: JDK, production classes, libraries and junit. -->
<path id="webmagic-core.module.classpath">
<path refid="${module.jdk.classpath.webmagic-core}"/>
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_junit:junit:4.7.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!-- Runtime classpath including test classes. -->
<path id="webmagic-core.runtime.module.classpath">
<pathelement location="${webmagic-core.testoutput.dir}"/>
<pathelement location="${webmagic-core.output.dir}"/>
<path refid="library.maven:_org.apache.httpcomponents:httpclient:4.2.4.classpath"/>
<path refid="library.maven:_org.apache.httpcomponents:httpcore:4.2.4.classpath"/>
<path refid="library.maven:_commons-logging:commons-logging:1.1.1.classpath"/>
<path refid="library.maven:_commons-codec:commons-codec:1.6.classpath"/>
<path refid="library.maven:_junit:junit:4.7.classpath"/>
<path refid="library.maven:_com.google.guava:guava:13.0.1.classpath"/>
<path refid="library.maven:_org.apache.commons:commons-lang3:3.1.classpath"/>
<path refid="library.maven:_log4j:log4j:1.2.17.classpath"/>
<path refid="library.maven:_commons-collections:commons-collections:3.2.1.classpath"/>
<path refid="library.maven:_net.sourceforge.htmlcleaner:htmlcleaner:2.4.classpath"/>
<path refid="library.maven:_org.jdom:jdom2:2.0.4.classpath"/>
<path refid="library.maven:_commons-io:commons-io:1.3.2.classpath"/>
</path>
<!-- Exclusion patterns; both delegate to the externally defined "ignored.files" set. -->
<patternset id="excluded.from.module.webmagic-core">
<patternset refid="ignored.files"/>
</patternset>
<patternset id="excluded.from.compilation.webmagic-core">
<patternset refid="excluded.from.module.webmagic-core"/>
</patternset>
<!-- Source roots for production and test code. -->
<path id="webmagic-core.module.sourcepath">
<dirset dir="${module.webmagic-core.basedir}">
<include name="src/main/java"/>
<include name="src/main/resources"/>
</dirset>
</path>
<path id="webmagic-core.module.test.sourcepath">
<dirset dir="${module.webmagic-core.basedir}">
<include name="src/test/java"/>
<include name="src/test/resources"/>
</dirset>
</path>
<!-- Default target: compiles production classes, then test classes. -->
<target name="compile.module.webmagic-core" depends="compile.module.webmagic-core.production,compile.module.webmagic-core.tests" description="Compile module webmagic-core"/>
<!-- Compiles production sources into the output dir, then copies matching resource files next to them. -->
<target name="compile.module.webmagic-core.production" depends="register.custom.compilers" description="Compile module webmagic-core; production classes">
<mkdir dir="${webmagic-core.output.dir}"/>
<javac2 destdir="${webmagic-core.output.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
<compilerarg line="${compiler.args.webmagic-core}"/>
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
<classpath refid="webmagic-core.module.production.classpath"/>
<src refid="webmagic-core.module.sourcepath"/>
<patternset refid="excluded.from.compilation.webmagic-core"/>
</javac2>
<copy todir="${webmagic-core.output.dir}">
<fileset dir="${module.webmagic-core.basedir}/src/main/java">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
<fileset dir="${module.webmagic-core.basedir}/src/main/resources">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
</copy>
</target>
<!-- Same as the production target but for test sources; skipped when the "skip.tests" property is set. -->
<target name="compile.module.webmagic-core.tests" depends="register.custom.compilers,compile.module.webmagic-core.production" description="compile module webmagic-core; test classes" unless="skip.tests">
<mkdir dir="${webmagic-core.testoutput.dir}"/>
<javac2 destdir="${webmagic-core.testoutput.dir}" debug="${compiler.debug}" nowarn="${compiler.generate.no.warnings}" memorymaximumsize="${compiler.max.memory}" fork="true" executable="${module.jdk.bin.webmagic-core}/javac">
<compilerarg line="${compiler.args.webmagic-core}"/>
<bootclasspath refid="webmagic-core.module.bootclasspath"/>
<classpath refid="webmagic-core.module.classpath"/>
<src refid="webmagic-core.module.test.sourcepath"/>
<patternset refid="excluded.from.compilation.webmagic-core"/>
</javac2>
<copy todir="${webmagic-core.testoutput.dir}">
<fileset dir="${module.webmagic-core.basedir}/src/test/java">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
<fileset dir="${module.webmagic-core.basedir}/src/test/resources">
<patternset refid="compiler.resources"/>
<type type="file"/>
</fileset>
</copy>
</target>
<!-- Deletes both output directories. -->
<target name="clean.module.webmagic-core" description="cleanup module">
<delete dir="${webmagic-core.output.dir}"/>
<delete dir="${webmagic-core.testoutput.dir}"/>
</target>
</project>
================================================
FILE: webmagic-core/pom.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- POM of the webmagic-core module; build settings are inherited from the us.codecraft:webmagic parent. -->
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-core</artifactId>
<!--
  No versions or scopes are declared here: they are expected to come from the parent's
  dependencyManagement section (which, for example, gives moco-core, mockito-all and
  assertj-core the test scope).
-->
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
</dependency>
<dependency>
<groupId>com.github.dreamhead</groupId>
<artifactId>moco-core</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</dependency>
</dependencies>
</project>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
================================================
package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
 * Object storing extracted result and urls to fetch.<br>
 * Not thread safe.<br>
 * Main method: <br>
 * {@link #getUrl()} get url of current page <br>
 * {@link #getHtml()} get content of current page <br>
 * {@link #putField(String, Object)} save extracted result <br>
 * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
 * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br>
 *
 * @author code4crafter@gmail.com <br>
 * @see us.codecraft.webmagic.downloader.Downloader
 * @see us.codecraft.webmagic.processor.PageProcessor
 * @since 0.1.0
 */
public class Page {

    /** Request that produced this page; mirrored into {@link #resultItems}. */
    private Request request;

    private ResultItems resultItems = new ResultItems();

    /** Built lazily from {@link #rawText} on the first {@link #getHtml()} call. */
    private Html html;

    /** Built lazily from {@link #rawText} on the first {@link #getJson()} call. */
    private Json json;

    private String rawText;

    private Selectable url;

    private Map<String, List<String>> headers;

    private int statusCode;

    private boolean downloadSuccess;

    private byte[] bytes;

    private List<Request> targetRequests = new ArrayList<>();

    private String charset;

    /**
     * Returns a {@link Page} with {@link #downloadSuccess} is {@code true},
     * and {@link #request} is specified.
     *
     * @param request the request.
     * @return the page.
     * @since 1.0.2
     */
    public static Page ofSuccess(Request request) {
        return new Page(request, true);
    }

    /**
     * Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
     * and {@link #request} is specified.
     *
     * @param request the request.
     * @return the page.
     * @since 1.0.2
     */
    public static Page ofFailure(Request request) {
        return new Page(request, false);
    }

    public Page() {
    }

    /**
     * Constructs a {@link Page} with {@link #request}
     * and {@link #downloadSuccess} specified.
     *
     * @param request the request.
     * @param downloadSuccess the download success flag.
     * @since 1.0.2
     */
    private Page(Request request, boolean downloadSuccess) {
        this.request = request;
        // Keep the result items in sync, exactly as setRequest(Request) does, so that
        // pages created through ofSuccess/ofFailure expose the request to pipelines too.
        this.resultItems.setRequest(request);
        this.downloadSuccess = downloadSuccess;
    }

    /**
     * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}.
     *
     * @return the page.
     * @deprecated Use {@link #ofFailure(Request)} instead.
     */
    @Deprecated
    public static Page fail() {
        return ofFailure(null);
    }

    /**
     * Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
     * and {@link #request} is specified.
     *
     * @param request the {@link Request}.
     * @return the page.
     * @since 0.10.0
     * @deprecated Use {@link #ofFailure(Request)} instead.
     */
    @Deprecated(since = "1.0.2", forRemoval = true)
    public static Page fail(Request request) {
        return ofFailure(request);
    }

    /**
     * Marks the results of this page as skipped (or not).
     *
     * @param skip whether the result items should be flagged as skipped
     * @return this
     */
    public Page setSkip(boolean skip) {
        resultItems.setSkip(skip);
        return this;
    }

    /**
     * store extract results
     *
     * @param key key
     * @param field field
     */
    public void putField(String key, Object field) {
        resultItems.put(key, field);
    }

    /**
     * get html content of page
     *
     * @return html
     */
    public Html getHtml() {
        if (html == null) {
            // Parsed on first access only; requires the request to be set.
            html = new Html(rawText, request.getUrl());
        }
        return html;
    }

    /**
     * get json content of page
     *
     * @return json
     * @since 0.5.0
     */
    public Json getJson() {
        if (json == null) {
            json = new Json(rawText);
        }
        return json;
    }

    /**
     * @param html html
     * @deprecated since 0.4.0
     * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
     */
    @Deprecated
    public void setHtml(Html html) {
        this.html = html;
    }

    public List<Request> getTargetRequests() {
        return targetRequests;
    }

    /**
     * add urls to fetch
     *
     * @param requests requests
     */
    public void addTargetRequests(Iterable<String> requests) {
        addTargetRequests(requests, 0); // Default priority is 0
    }

    /**
     * add urls to fetch
     *
     * @param requests requests; {@code null} is ignored
     * @param priority priority applied to every added request
     */
    public void addTargetRequests(Iterable<String> requests, long priority) {
        if (requests == null) {
            return;
        }
        for (String req : requests) {
            addRequestIfValid(req, priority);
        }
    }

    /**
     * Helper method to add a request if it's valid.
     * Blank urls, {@code "#"} and {@code javascript:} pseudo-links are ignored.
     *
     * @param url URL to add
     * @param priority Priority for the URL
     */
    private void addRequestIfValid(String url, long priority) {
        if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
            return;
        }
        String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
        Request req = new Request(canonicalizedUrl);
        // Always apply the priority: a new Request already defaults to 0, and
        // negative priorities must not be silently dropped.
        req.setPriority(priority);
        targetRequests.add(req);
    }

    /**
     * add url to fetch
     *
     * @param requestString requestString
     */
    public void addTargetRequest(String requestString) {
        // Same validation and canonicalization as addTargetRequests(Iterable).
        addRequestIfValid(requestString, 0);
    }

    /**
     * add requests to fetch
     *
     * @param request request
     */
    public void addTargetRequest(Request request) {
        targetRequests.add(request);
    }

    /**
     * get url of current page
     *
     * @return url of current page
     */
    public Selectable getUrl() {
        return url;
    }

    public void setUrl(Selectable url) {
        this.url = url;
    }

    /**
     * get request of current page
     *
     * @return request
     */
    public Request getRequest() {
        return request;
    }

    /**
     * Sets the request of this page and of its {@link ResultItems}.
     *
     * @param request request
     */
    public void setRequest(Request request) {
        this.request = request;
        this.resultItems.setRequest(request);
    }

    public ResultItems getResultItems() {
        return resultItems;
    }

    public int getStatusCode() {
        return statusCode;
    }

    public void setStatusCode(int statusCode) {
        this.statusCode = statusCode;
    }

    public String getRawText() {
        return rawText;
    }

    public Page setRawText(String rawText) {
        this.rawText = rawText;
        return this;
    }

    public Map<String, List<String>> getHeaders() {
        return headers;
    }

    public void setHeaders(Map<String, List<String>> headers) {
        this.headers = headers;
    }

    public boolean isDownloadSuccess() {
        return downloadSuccess;
    }

    public void setDownloadSuccess(boolean downloadSuccess) {
        this.downloadSuccess = downloadSuccess;
    }

    public byte[] getBytes() {
        return bytes;
    }

    public void setBytes(byte[] bytes) {
        this.bytes = bytes;
    }

    public String getCharset() {
        return charset;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    @Override
    public String toString() {
        return "Page{" +
                "request=" + request +
                ", resultItems=" + resultItems +
                ", html=" + html +
                ", json=" + json +
                ", rawText='" + rawText + '\'' +
                ", url=" + url +
                ", headers=" + headers +
                ", statusCode=" + statusCode +
                ", downloadSuccess=" + downloadSuccess +
                ", targetRequests=" + targetRequests +
                ", charset='" + charset + '\'' +
                ", bytes=" + Arrays.toString(bytes) +
                '}';
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
================================================
package us.codecraft.webmagic;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.Experimental;
/**
* Object contains url to crawl.<br>
* It contains some additional information.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
private String url;
private String method;
private HttpRequestBody requestBody;
/**
* this req use this downloader
*/
private Downloader downloader;
/**
* Store additional information in extras.
*/
private Map<String, Object> extras = new HashMap<>();
/**
* cookies for current url, if not set use Site's cookies
*/
private Map<String, String> cookies = new HashMap<String, String>();
private Map<String, String> headers = new HashMap<String, String>();
/**
* Priority of the request.<br>
* The bigger will be processed earlier. <br>
* @see us.codecraft.webmagic.scheduler.PriorityScheduler
*/
private long priority;
/**
* When it is set to TRUE, the downloader will not try to parse response body to text.
*
*/
private boolean binaryContent = false;
private String charset;
public Request() {
}
public Request(String url) {
this.url = url;
}
public long getPriority() {
return priority;
}
/**
* Set the priority of request for sorting.<br>
* Need a scheduler supporting priority.<br>
* @see us.codecraft.webmagic.scheduler.PriorityScheduler
*
* @param priority priority
* @return this
*/
@Experimental
public Request setPriority(long priority) {
this.priority = priority;
return this;
}
@SuppressWarnings("unchecked")
public <T> T getExtra(String key) {
if (extras == null) {
return null;
}
return (T) extras.get(key);
}
public <T> Request putExtra(String key, T value) {
extras.put(key, value);
return this;
}
public String getUrl() {
return url;
}
public Map<String, Object> getExtras() {
return Collections.unmodifiableMap(extras);
}
public Request setExtras(Map<String, Object> extras) {
this.extras.putAll(extras);
return this;
}
public Request setUrl(String url) {
this.url = url;
return this;
}
/**
* The http method of the request. Get for default.
* @return httpMethod
* @see us.codecraft.webmagic.utils.HttpConstant.Method
* @since 0.5.0
*/
public String getMethod() {
return method;
}
public Request setMethod(String method) {
this.method = method;
return this;
}
@Override
public int hashCode() {
int result = url != null ? url.hashCode() : 0;
result = 31 * result + (method != null ? method.hashCode() : 0);
return result;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Request request = (Request) o;
if (url != null ? !url.equals(request.url) : request.url != null) return false;
return method != null ? method.equals(request.method) : request.method == null;
}
public Request addCookie(String name, String value) {
cookies.put(name, value);
return this;
}
public Request addHeader(String name, String value) {
headers.put(name, value);
return this;
}
public Map<String, String> getCookies() {
return cookies;
}
public Map<String, String> getHeaders() {
return headers;
}
/**
 * Returns the body attached to this request.
 *
 * @return the request body
 */
public HttpRequestBody getRequestBody() {
    return this.requestBody;
}
/**
 * Sets the body of this request.
 *
 * @param requestBody the request body
 */
public void setRequestBody(HttpRequestBody requestBody) {
this.requestBody = requestBody;
}
/**
 * Returns the binaryContent flag of this request.
 *
 * @return the binaryContent flag
 */
public boolean isBinaryContent() {
    return this.binaryContent;
}
/**
 * Returns the downloader assigned to this request.<br>
 * When it is not null, Spider downloads this request with it instead of its own downloader.
 *
 * @return the request specific downloader
 */
public Downloader getDownloader() {
    return this.downloader;
}
/**
 * Assigns a downloader to this request.
 *
 * @param downloader the downloader for this request
 */
public void setDownloader(Downloader downloader) {
this.downloader = downloader;
}
/**
 * Sets the binaryContent flag of this request.
 *
 * @param binaryContent binaryContent
 * @return this
 */
public Request setBinaryContent(boolean binaryContent) {
this.binaryContent = binaryContent;
return this;
}
/**
 * Returns the charset set on this request.
 *
 * @return the charset, or null when none has been set
 */
public String getCharset() {
    return this.charset;
}
/**
 * Sets the charset of this request.
 *
 * @param charset charset
 * @return this
 */
public Request setCharset(String charset) {
this.charset = charset;
return this;
}
/**
 * Text form listing url, method, extras, priority, headers and cookies.
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder("Request{");
    text.append("url='").append(url).append('\'');
    text.append(", method='").append(method).append('\'');
    text.append(", extras=").append(extras);
    text.append(", priority=").append(priority);
    text.append(", headers=").append(headers);
    text.append(", cookies=").append(cookies);
    text.append('}');
    return text.toString();
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
================================================
package us.codecraft.webmagic;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* Object contains extract results.<br>
* It is contained in Page and will be processed in pipeline.
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see Page
* @see us.codecraft.webmagic.pipeline.Pipeline
*/
public class ResultItems {

    /** Extracted values keyed by field name, kept in insertion order. */
    private Map<String, Object> fields = new LinkedHashMap<String, Object>();

    /** The request these results are associated with. */
    private Request request;

    /** When true, the result is not processed by any Pipeline. */
    private boolean skip;

    /**
     * Looks up an extracted value.
     *
     * @param key field name
     * @param <T> type the caller expects; the cast is unchecked
     * @return the stored value, or null when the key is absent or mapped to null
     */
    @SuppressWarnings("unchecked")
    public <T> T get(String key) {
        return (T) fields.get(key);
    }

    /**
     * Returns all extracted fields.<br>
     * The internal map itself is returned, not a copy.
     *
     * @return all fields
     */
    public Map<String, Object> getAll() {
        return this.fields;
    }

    /**
     * Stores an extracted value, replacing any value stored under the same key.
     *
     * @param key   field name
     * @param value extracted value
     * @param <T>   type of the value
     * @return this
     */
    public <T> ResultItems put(String key, T value) {
        this.fields.put(key, value);
        return this;
    }

    /**
     * @return the request associated with these results
     */
    public Request getRequest() {
        return this.request;
    }

    /**
     * @param request the request to associate with these results
     * @return this
     */
    public ResultItems setRequest(Request request) {
        this.request = request;
        return this;
    }

    /**
     * Whether to skip the result.<br>
     * Result which is skipped will not be processed by Pipeline.
     *
     * @return whether to skip the result
     */
    public boolean isSkip() {
        return this.skip;
    }

    /**
     * Set whether to skip the result.<br>
     * Result which is skipped will not be processed by Pipeline.
     *
     * @param skip whether to skip the result
     * @return this
     */
    public ResultItems setSkip(boolean skip) {
        this.skip = skip;
        return this;
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("ResultItems{");
        text.append("fields=").append(fields);
        text.append(", request=").append(request);
        text.append(", skip=").append(skip);
        text.append('}');
        return text.toString();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
================================================
package us.codecraft.webmagic;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import us.codecraft.webmagic.utils.HttpConstant;
/**
* Object contains setting for crawler.<br>
*
* @author code4crafter@gmail.com <br>
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Site {

    /** Status codes accepted by default ({200}). Only ever copied, never handed out. */
    private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();

    static {
        DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
    }

    private String domain;

    private String userAgent;

    /** Cookies added without an explicit domain. */
    private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();

    /** Cookies added for a specific domain: domain -> (name -> value). */
    private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();

    private String charset;

    private String defaultCharset;

    private int sleepTime = 5000;

    private int retryTimes = 0;

    private int cycleRetryTimes = 0;

    private int retrySleepTime = 1000;

    private int timeOut = 5000;

    // Each Site gets its own copy of the defaults. Sharing the static set would let
    // getAcceptStatCode().add(...) on one Site silently change the defaults of all others.
    private Set<Integer> acceptStatCode = new HashSet<Integer>(DEFAULT_STATUS_CODE_SET);

    private Map<String, String> headers = new HashMap<String, String>();

    private boolean useGzip = true;

    private boolean disableCookieManagement = false;

    /**
     * new a Site
     *
     * @return new site
     */
    public static Site me() {
        return new Site();
    }

    /**
     * Add a cookie with domain {@link #getDomain()}
     *
     * @param name  name
     * @param value value
     * @return this
     */
    public Site addCookie(String name, String value) {
        defaultCookies.put(name, value);
        return this;
    }

    /**
     * Add a cookie with specific domain.
     *
     * @param domain domain
     * @param name   name
     * @param value  value
     * @return this
     */
    public Site addCookie(String domain, String name, String value) {
        // Single lookup; the per-domain map is created on first use.
        Map<String, String> domainCookies = cookies.get(domain);
        if (domainCookies == null) {
            domainCookies = new HashMap<String, String>();
            cookies.put(domain, domainCookies);
        }
        domainCookies.put(name, value);
        return this;
    }

    /**
     * set user agent
     *
     * @param userAgent userAgent
     * @return this
     */
    public Site setUserAgent(String userAgent) {
        this.userAgent = userAgent;
        return this;
    }

    /**
     * Get the cookies that were added without an explicit domain.
     *
     * @return cookies keyed by name
     */
    public Map<String, String> getCookies() {
        return defaultCookies;
    }

    /**
     * Get the cookies that were added for specific domains.
     *
     * @return cookies keyed by domain, then by name
     */
    public Map<String, Map<String, String>> getAllCookies() {
        return cookies;
    }

    /**
     * get user agent
     *
     * @return user agent
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * get domain
     *
     * @return the domain, or null when none has been set
     */
    public String getDomain() {
        return domain;
    }

    /**
     * set the domain of site.
     *
     * @param domain domain
     * @return this
     */
    public Site setDomain(String domain) {
        this.domain = domain;
        return this;
    }

    /**
     * Set charset of page manually.<br>
     * When charset is not set or set to null, it can be auto detected by Http header.
     *
     * @param charset charset
     * @return this
     */
    public Site setCharset(String charset) {
        this.charset = charset;
        return this;
    }

    /**
     * get charset set manually
     *
     * @return charset
     */
    public String getCharset() {
        return charset;
    }

    /**
     * Set default charset of page.
     *
     * When charset detect failed, use this default charset.
     *
     * @param defaultCharset the default charset
     * @return this
     * @since 0.9.0
     */
    public Site setDefaultCharset(String defaultCharset) {
        this.defaultCharset = defaultCharset;
        return this;
    }

    /**
     * The default charset if charset detection failed.
     *
     * @return the default charset
     * @since 0.9.0
     */
    public String getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * Get the timeout for downloader in ms, 5000 by default.
     *
     * @return timeOut
     */
    public int getTimeOut() {
        return timeOut;
    }

    /**
     * set timeout for downloader in ms
     *
     * @param timeOut timeOut
     * @return this
     */
    public Site setTimeOut(int timeOut) {
        this.timeOut = timeOut;
        return this;
    }

    /**
     * Set acceptStatCode.<br>
     * When status code of http response is in acceptStatCodes, it will be processed.<br>
     * {200} by default.<br>
     * It does not need to be set.<br>
     *
     * @param acceptStatCode acceptStatCode
     * @return this
     */
    public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
        this.acceptStatCode = acceptStatCode;
        return this;
    }

    /**
     * get acceptStatCode
     *
     * @return acceptStatCode
     */
    public Set<Integer> getAcceptStatCode() {
        return acceptStatCode;
    }

    /**
     * Set the interval between the processing of two pages.<br>
     * Time unit is milliseconds.<br>
     *
     * @param sleepTime sleepTime
     * @return this
     */
    public Site setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Get the interval between the processing of two pages.<br>
     * Time unit is milliseconds.<br>
     *
     * @return the interval between the processing of two pages
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * Get retry times immediately when download fail, 0 by default.<br>
     *
     * @return retry times when download fail
     */
    public int getRetryTimes() {
        return retryTimes;
    }

    /**
     * Get the http headers added with {@link #addHeader(String, String)}.
     *
     * @return headers keyed by name
     */
    public Map<String, String> getHeaders() {
        return headers;
    }

    /**
     * Put an Http header for downloader. <br>
     * Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br>
     *
     * @param key   key of http header, there are some keys constant in {@link HttpConstant.Header}
     * @param value value of header
     * @return this
     */
    public Site addHeader(String key, String value) {
        headers.put(key, value);
        return this;
    }

    /**
     * Set retry times when download fail, 0 by default.<br>
     *
     * @param retryTimes retryTimes
     * @return this
     */
    public Site setRetryTimes(int retryTimes) {
        this.retryTimes = retryTimes;
        return this;
    }

    /**
     * When cycleRetryTimes is more than 0, it will add back to scheduler and try download again. <br>
     *
     * @return retry times when download fail
     */
    public int getCycleRetryTimes() {
        return cycleRetryTimes;
    }

    /**
     * Set cycleRetryTimes times when download fail, 0 by default. <br>
     *
     * @param cycleRetryTimes cycleRetryTimes
     * @return this
     */
    public Site setCycleRetryTimes(int cycleRetryTimes) {
        this.cycleRetryTimes = cycleRetryTimes;
        return this;
    }

    /**
     * Whether gzip is used, true by default.
     *
     * @return useGzip
     */
    public boolean isUseGzip() {
        return useGzip;
    }

    /**
     * Get retry sleep time when download fail, 1000 by default.
     *
     * @return retrySleepTime
     */
    public int getRetrySleepTime() {
        return retrySleepTime;
    }

    /**
     * Set retry sleep times when download fail, 1000 by default. <br>
     *
     * @param retrySleepTime retrySleepTime
     * @return this
     */
    public Site setRetrySleepTime(int retrySleepTime) {
        this.retrySleepTime = retrySleepTime;
        return this;
    }

    /**
     * Whether use gzip. <br>
     * Default is true, you can set it to false to disable gzip.
     *
     * @param useGzip useGzip
     * @return this
     */
    public Site setUseGzip(boolean useGzip) {
        this.useGzip = useGzip;
        return this;
    }

    /**
     * Whether cookie management is disabled, false by default.
     *
     * @return disableCookieManagement
     */
    public boolean isDisableCookieManagement() {
        return disableCookieManagement;
    }

    /**
     * Downloader is supposed to store response cookie.
     * Disable it to ignore all cookie fields and stay clean.
     * Warning: Set cookie will still NOT work if disableCookieManagement is true.
     *
     * @param disableCookieManagement disableCookieManagement
     * @return this
     */
    public Site setDisableCookieManagement(boolean disableCookieManagement) {
        this.disableCookieManagement = disableCookieManagement;
        return this;
    }

    /**
     * Wraps this site in a {@link Task}.<br>
     * The task's uuid is the domain of this site when one is set; otherwise a random
     * uuid that stays the same for the lifetime of the returned task.
     *
     * @return a task backed by this site
     */
    public Task toTask() {
        return new Task() {

            // Generated once so that repeated getUUID() calls on the same task agree
            // with each other while the site has no domain.
            private final String fallbackUuid = UUID.randomUUID().toString();

            @Override
            public String getUUID() {
                String siteDomain = Site.this.getDomain();
                return siteDomain != null ? siteDomain : fallbackUuid;
            }

            @Override
            public Site getSite() {
                return Site.this;
            }
        };
    }

    // NOTE(review): equals/hashCode/toString cover domain, userAgent, defaultCookies, charset,
    // sleepTime, retryTimes, cycleRetryTimes, timeOut, acceptStatCode and headers only.
    // The remaining fields are not part of equality; kept as is to preserve behavior.
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        Site site = (Site) o;

        if (cycleRetryTimes != site.cycleRetryTimes) return false;
        if (retryTimes != site.retryTimes) return false;
        if (sleepTime != site.sleepTime) return false;
        if (timeOut != site.timeOut) return false;
        if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
            return false;
        if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
        if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
            return false;
        if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
        if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
        if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;

        return true;
    }

    @Override
    public int hashCode() {
        int result = domain != null ? domain.hashCode() : 0;
        result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
        result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
        result = 31 * result + (charset != null ? charset.hashCode() : 0);
        result = 31 * result + sleepTime;
        result = 31 * result + retryTimes;
        result = 31 * result + cycleRetryTimes;
        result = 31 * result + timeOut;
        result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
        result = 31 * result + (headers != null ? headers.hashCode() : 0);
        return result;
    }

    @Override
    public String toString() {
        return "Site{" +
                "domain='" + domain + '\'' +
                ", userAgent='" + userAgent + '\'' +
                ", cookies=" + defaultCookies +
                ", charset='" + charset + '\'' +
                ", sleepTime=" + sleepTime +
                ", retryTimes=" + retryTimes +
                ", cycleRetryTimes=" + cycleRetryTimes +
                ", timeOut=" + timeOut +
                ", acceptStatCode=" + acceptStatCode +
                ", headers=" + headers +
                '}';
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
================================================
package us.codecraft.webmagic;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
/**
* Entrance of a crawler.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and
* Pipeline.<br>
* Every module is a field of Spider. <br>
* The modules are defined in interface. <br>
* You can customize a spider with various implementations of them. <br>
* Examples: <br>
* <br>
* A simple crawler: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")).run();<br>
* <br>
* Store results to files by FilePipeline: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")) <br>
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
* <br>
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a
* Spider can resume the status when shutdown. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")) <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
*
* @author code4crafter@gmail.com <br>
* @see Downloader
* @see Scheduler
* @see PageProcessor
* @see Pipeline
* @since 0.1.0
*/
public class Spider implements Runnable, Task {

    protected final static int STAT_INIT = 0;

    protected final static int STAT_RUNNING = 1;

    protected final static int STAT_STOPPED = 2;

    protected Downloader downloader;

    protected List<Pipeline> pipelines = new ArrayList<Pipeline>();

    protected PageProcessor pageProcessor;

    protected List<Request> startRequests;

    protected Site site;

    protected String uuid;

    /** Wrapper around the actual Scheduler that also lets the main loop wait for new urls. */
    protected SpiderScheduler scheduler;

    protected Logger logger = LoggerFactory.getLogger(getClass());

    protected CountableThreadPool threadPool;

    protected ExecutorService executorService;

    protected int threadNum = 1;

    /** One of STAT_INIT, STAT_RUNNING, STAT_STOPPED. */
    protected AtomicInteger stat = new AtomicInteger(STAT_INIT);

    protected volatile boolean exitWhenComplete = true;

    protected boolean spawnUrl = true;

    protected boolean destroyWhenExit = true;

    private List<SpiderListener> spiderListeners;

    private final AtomicLong pageCount = new AtomicLong(0);

    private Date startTime;

    /** Wait time in milliseconds when no url is polled. */
    private long emptySleepTime = 30000;

    /**
     * create a spider with pageProcessor.
     *
     * @param pageProcessor pageProcessor
     * @return new spider
     * @see PageProcessor
     */
    public static Spider create(PageProcessor pageProcessor) {
        return new Spider(pageProcessor);
    }

    /**
     * create a spider with pageProcessor.<br>
     * The site is taken from the pageProcessor and a QueueScheduler is used until another
     * scheduler is set.
     *
     * @param pageProcessor pageProcessor
     */
    public Spider(PageProcessor pageProcessor) {
        this.pageProcessor = pageProcessor;
        this.site = pageProcessor.getSite();
        this.scheduler = new SpiderScheduler(new QueueScheduler());
    }

    /**
     * Set startUrls of Spider.<br>
     * Prior to startUrls of Site.
     *
     * @param startUrls startUrls
     * @return this
     */
    public Spider startUrls(List<String> startUrls) {
        checkIfRunning();
        this.startRequests = UrlUtils.convertToRequests(startUrls);
        return this;
    }

    /**
     * Set start requests of Spider.<br>
     * Prior to startUrls of Site.<br>
     * The list is copied: the spider clears its start requests once they have been
     * handed to the scheduler, which must neither empty the caller's list nor fail
     * on an unmodifiable one.
     *
     * @param startRequests startRequests, may be null
     * @return this
     */
    public Spider startRequest(List<Request> startRequests) {
        checkIfRunning();
        this.startRequests = startRequests == null ? null : new ArrayList<Request>(startRequests);
        return this;
    }

    /**
     * Set an uuid for spider.<br>
     * Default uuid is domain of site.<br>
     *
     * @param uuid uuid
     * @return this
     */
    public Spider setUUID(String uuid) {
        this.uuid = uuid;
        return this;
    }

    /**
     * set scheduler for Spider
     *
     * @param scheduler scheduler
     * @return this
     * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
     * @deprecated Use {@link #setScheduler(Scheduler)} instead.
     */
    @Deprecated
    public Spider scheduler(Scheduler scheduler) {
        return setScheduler(scheduler);
    }

    /**
     * set scheduler for Spider.<br>
     * Requests still queued in the previous scheduler are moved to the new one.
     *
     * @param updateScheduler scheduler
     * @return this
     * @see Scheduler
     * @since 0.2.1
     */
    public Spider setScheduler(Scheduler updateScheduler) {
        checkIfRunning();
        Scheduler oldScheduler = scheduler.getScheduler();
        scheduler.setScheduler(updateScheduler);
        if (oldScheduler != null) {
            // Drain the old scheduler into the new one.
            Request request;
            while ((request = oldScheduler.poll(this)) != null) {
                this.scheduler.push(request, this);
            }
        }
        return this;
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
     * @deprecated Use {@link #addPipeline(Pipeline)} instead.
     */
    @Deprecated
    public Spider pipeline(Pipeline pipeline) {
        return addPipeline(pipeline);
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see Pipeline
     * @since 0.2.1
     */
    public Spider addPipeline(Pipeline pipeline) {
        checkIfRunning();
        this.pipelines.add(pipeline);
        return this;
    }

    /**
     * set pipelines for Spider
     *
     * @param pipelines pipelines
     * @return this
     * @see Pipeline
     * @since 0.4.1
     */
    public Spider setPipelines(List<Pipeline> pipelines) {
        checkIfRunning();
        this.pipelines = pipelines;
        return this;
    }

    /**
     * clear the pipelines set
     *
     * @return this
     */
    public Spider clearPipeline() {
        pipelines = new ArrayList<Pipeline>();
        return this;
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
     * @deprecated Use {@link #setDownloader(Downloader)} instead.
     */
    @Deprecated
    public Spider downloader(Downloader downloader) {
        return setDownloader(downloader);
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see Downloader
     */
    public Spider setDownloader(Downloader downloader) {
        checkIfRunning();
        this.downloader = downloader;
        return this;
    }

    /**
     * Fills in defaults (HttpClientDownloader, ConsolePipeline, thread pool), hands the
     * start requests to the scheduler and records the start time.
     */
    protected void initComponent() {
        if (downloader == null) {
            this.downloader = new HttpClientDownloader();
        }
        if (pipelines.isEmpty()) {
            pipelines.add(new ConsolePipeline());
        }
        downloader.setThread(threadNum);
        if (threadPool == null || threadPool.isShutdown()) {
            if (executorService != null && !executorService.isShutdown()) {
                threadPool = new CountableThreadPool(threadNum, executorService);
            } else {
                threadPool = new CountableThreadPool(threadNum);
            }
        }
        if (startRequests != null) {
            for (Request request : startRequests) {
                addRequest(request);
            }
            // Already pushed to the scheduler; do not push them again on the next run.
            startRequests.clear();
        }
        startTime = new Date();
    }

    /**
     * Main loop: polls requests from the scheduler and processes each one on the thread
     * pool until the spider is stopped, the thread is interrupted, or (when
     * exitWhenComplete is true) no request is left and no worker is alive.
     *
     * @throws IllegalStateException if the spider is already running
     */
    @Override
    public void run() {
        checkRunningStat();
        initComponent();
        logger.info("Spider {} started!", getUUID());
        // interrupt won't be necessarily detected
        while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
            Request poll = scheduler.poll(this);
            if (poll == null) {
                if (threadPool.getThreadAlive() == 0) {
                    //no alive thread anymore , try again
                    poll = scheduler.poll(this);
                    if (poll == null) {
                        if (exitWhenComplete) {
                            break;
                        } else {
                            // wait
                            try {
                                Thread.sleep(emptySleepTime);
                                continue;
                            } catch (InterruptedException e) {
                                Thread.currentThread().interrupt();
                                break;
                            }
                        }
                    }
                } else {
                    // wait until new url added,
                    if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
                        // if interrupted
                        break;
                    }
                    continue;
                }
            }
            final Request request = poll;
            //this may swallow the interruption
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    try {
                        processRequest(request);
                        onSuccess(request);
                    } catch (Exception e) {
                        onError(request, e);
                        logger.error("process request " + request + " error", e);
                    } finally {
                        pageCount.incrementAndGet();
                        // Wake the main loop: this worker may have added new urls.
                        scheduler.signalNewUrl();
                    }
                }
            });
        }
        stat.set(STAT_STOPPED);
        // release some resources
        if (destroyWhenExit) {
            close();
        }
        logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
    }

    /**
     * @param request the request that failed
     * @deprecated Use {@link #onError(Request, Exception)} instead.
     */
    @Deprecated
    protected void onError(Request request) {
    }

    /**
     * Called when processing a request threw an exception; notifies the deprecated hook
     * and then every registered listener.
     *
     * @param request the request that failed
     * @param e       the exception thrown while processing it
     */
    protected void onError(Request request, Exception e) {
        this.onError(request);

        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onError(request, e);
            }
        }
    }

    /**
     * Called when a request was processed without an exception; notifies every
     * registered listener.
     *
     * @param request the processed request
     */
    protected void onSuccess(Request request) {
        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onSuccess(request);
            }
        }
    }

    /**
     * Atomically switches the state to running.
     *
     * @throws IllegalStateException if the spider is already running
     */
    private void checkRunningStat() {
        while (true) {
            int statNow = stat.get();
            if (statNow == STAT_RUNNING) {
                throw new IllegalStateException("Spider is already running!");
            }
            if (stat.compareAndSet(statNow, STAT_RUNNING)) {
                break;
            }
        }
    }

    /**
     * Closes every component that is {@link Closeable} and shuts the thread pool down.
     */
    public void close() {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        // Close the wrapped Scheduler: the SpiderScheduler wrapper itself is not Closeable,
        // so passing the wrapper would never close the scheduler.
        if (scheduler != null) {
            destroyEach(scheduler.getScheduler());
        }
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
        // The pool only exists after initComponent(); close() may be called before that.
        if (threadPool != null) {
            threadPool.shutdown();
        }
    }

    /**
     * Closes the object if it is {@link Closeable}; a failure is logged and does not
     * stop the remaining components from being closed.
     */
    private void destroyEach(Object object) {
        if (object instanceof Closeable) {
            try {
                ((Closeable) object).close();
            } catch (IOException e) {
                logger.warn("Failed to close {}", object.getClass().getName(), e);
            }
        }
    }

    /**
     * Process specific urls without url discovering.
     *
     * @param urls urls to process
     */
    public void test(String... urls) {
        initComponent();
        for (String url : urls) {
            processRequest(new Request(url));
        }
    }

    /**
     * Downloads the request, with its own downloader when it has one, and dispatches on
     * whether the download succeeded.
     */
    private void processRequest(Request request) {
        Page page;
        if (null != request.getDownloader()) {
            page = request.getDownloader().download(request, this);
        } else {
            page = downloader.download(request, this);
        }
        if (page.isDownloadSuccess()) {
            onDownloadSuccess(request, page);
        } else {
            onDownloaderFail(request);
        }
    }

    /**
     * Runs the page processor and the pipelines when the status code is accepted by the
     * site, then sleeps for the site's sleep time.
     */
    private void onDownloadSuccess(Request request, Page page) {
        if (site.getAcceptStatCode().contains(page.getStatusCode())) {
            pageProcessor.process(page);
            extractAndAddRequests(page, spawnUrl);
            if (!page.getResultItems().isSkip()) {
                for (Pipeline pipeline : pipelines) {
                    pipeline.process(page.getResultItems(), this);
                }
            }
        } else {
            logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
        }
        sleep(site.getSleepTime());
    }

    /**
     * Handles a failed download: cycle-retries when the site enables it, otherwise just
     * sleeps for the site's sleep time.
     */
    private void onDownloaderFail(Request request) {
        if (site.getCycleRetryTimes() == 0) {
            sleep(site.getSleepTime());
        } else {
            // for cycle retry
            doCycleRetry(request);
        }
    }

    /**
     * Re-queues a clone of the failed request until it has been retried
     * {@code site.getCycleRetryTimes()} times, then sleeps for the retry sleep time.<br>
     * The number of retries already scheduled travels with the request in the
     * {@code Request.CYCLE_TRIED_TIMES} extra.
     */
    private void doCycleRetry(Request request) {
        Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (cycleTriedTimesObject == null) {
            addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
        } else {
            int cycleTriedTimes = (Integer) cycleTriedTimesObject;
            cycleTriedTimes++;
            // "<=" so that N configured retries really yield N retries. With "<" a setting
            // of 2 produced only one retry, the same as a setting of 1.
            if (cycleTriedTimes <= site.getCycleRetryTimes()) {
                addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
            }
        }
        sleep(site.getRetrySleepTime());
    }

    /**
     * Sleeps for the given time; on interruption the interrupt status is restored.
     *
     * @param time time to sleep in milliseconds
     */
    protected void sleep(int time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            logger.error("Thread interrupted when sleep", e);
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Pushes the target requests of the page to the scheduler.
     *
     * @param page     the processed page
     * @param spawnUrl when false nothing is added
     */
    protected void extractAndAddRequests(Page page, boolean spawnUrl) {
        if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
            for (Request request : page.getTargetRequests()) {
                addRequest(request);
            }
        }
    }

    /**
     * Pushes the request to the scheduler; when the site has no domain yet, the domain
     * is derived from the url of the request first.
     */
    private void addRequest(Request request) {
        if (site.getDomain() == null && request != null && request.getUrl() != null) {
            site.setDomain(UrlUtils.getDomain(request.getUrl()));
        }
        scheduler.push(request, this);
    }

    /**
     * @throws IllegalStateException if the spider is running
     */
    protected void checkIfRunning() {
        if (stat.get() == STAT_RUNNING) {
            throw new IllegalStateException("Spider is already running!");
        }
    }

    /**
     * Runs the spider on a new non-daemon thread.
     */
    public void runAsync() {
        Thread thread = new Thread(this);
        thread.setDaemon(false);
        thread.start();
    }

    /**
     * Add urls to crawl. <br>
     *
     * @param urls urls
     * @return this
     */
    public Spider addUrl(String... urls) {
        for (String url : urls) {
            addRequest(new Request(url));
        }
        scheduler.signalNewUrl();
        return this;
    }

    /**
     * Downloads the given urls synchronously, without url discovering, and returns what
     * the collector pipeline gathered.
     *
     * @param urls urls
     * @param <T>  type of process result
     * @return list downloaded
     */
    public <T> List<T> getAll(Collection<String> urls) {
        destroyWhenExit = false;
        spawnUrl = false;
        CollectorPipeline collectorPipeline = getCollectorPipeline();
        try {
            if (startRequests != null) {
                startRequests.clear();
            }
            for (Request request : UrlUtils.convertToRequests(urls)) {
                addRequest(request);
            }
            pipelines.add(collectorPipeline);
            run();
        } finally {
            // Restore the flags even when run() throws, and drop the temporary collector
            // so that repeated calls do not pile up pipelines holding old results.
            pipelines.remove(collectorPipeline);
            spawnUrl = true;
            destroyWhenExit = true;
        }
        return collectorPipeline.getCollected();
    }

    /**
     * @return the pipeline {@link #getAll(Collection)} collects its results with
     */
    protected CollectorPipeline getCollectorPipeline() {
        return new ResultItemsCollectorPipeline();
    }

    /**
     * Downloads a single url synchronously.
     *
     * @param url url
     * @param <T> type of process result
     * @return the first collected result, or null when nothing was collected
     */
    public <T> T get(String url) {
        List<String> urls = WMCollections.newArrayList(url);
        List<T> resultItemses = getAll(urls);
        if (resultItemses != null && resultItemses.size() > 0) {
            return resultItemses.get(0);
        } else {
            return null;
        }
    }

    /**
     * Add urls with information to crawl.<br>
     *
     * @param requests requests
     * @return this
     */
    public Spider addRequest(Request... requests) {
        for (Request request : requests) {
            addRequest(request);
        }
        scheduler.signalNewUrl();
        return this;
    }

    /**
     * Starts the spider asynchronously; same as {@link #runAsync()}.
     */
    public void start() {
        runAsync();
    }

    /**
     * Switches a running spider to stopped; the main loop exits on its next iteration.
     */
    public void stop() {
        if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) {
            logger.info("Spider {} stop success!", getUUID());
        } else {
            logger.info("Spider {} stop fail!", getUUID());
        }
    }

    /**
     * Stop when all tasks in the queue are completed and all worker threads are also completed
     */
    public void stopWhenComplete() {
        this.exitWhenComplete = true;
    }

    /**
     * Set the number of threads the spider processes requests with.
     *
     * @param threadNum threadNum, must be greater than zero
     * @return this
     * @throws IllegalArgumentException if threadNum is not greater than zero
     */
    public Spider thread(int threadNum) {
        checkIfRunning();
        // Validate before assigning so that a rejected value does not stick.
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than zero!");
        }
        this.threadNum = threadNum;
        return this;
    }

    /**
     * Set the number of threads and the executor service the spider processes requests with.
     *
     * @param executorService executorService to run the spider
     * @param threadNum       threadNum, must be greater than zero
     * @return this
     * @throws IllegalArgumentException if threadNum is not greater than zero
     */
    public Spider thread(ExecutorService executorService, int threadNum) {
        checkIfRunning();
        // Validate before assigning so that a rejected value does not stick.
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than zero!");
        }
        this.threadNum = threadNum;
        this.executorService = executorService;
        return this;
    }

    public boolean isExitWhenComplete() {
        return exitWhenComplete;
    }

    /**
     * Exit when complete. <br>
     * True: exit when all url of the site is downloaded. <br>
     * False: not exit until call stop() manually.<br>
     *
     * @param exitWhenComplete exitWhenComplete
     * @return this
     */
    public Spider setExitWhenComplete(boolean exitWhenComplete) {
        this.exitWhenComplete = exitWhenComplete;
        return this;
    }

    public boolean isSpawnUrl() {
        return spawnUrl;
    }

    /**
     * Get page count downloaded by spider.
     *
     * @return total downloaded page count
     * @since 0.4.1
     */
    public long getPageCount() {
        return pageCount.get();
    }

    /**
     * Get running status by spider.
     *
     * @return running status
     * @see Status
     * @since 0.4.1
     */
    public Status getStatus() {
        return Status.fromValue(stat.get());
    }

    /**
     * Running status of a spider; the values mirror STAT_INIT, STAT_RUNNING and STAT_STOPPED.
     */
    public enum Status {
        Init(0), Running(1), Stopped(2);

        private final int value;

        private Status(int value) {
            this.value = value;
        }

        int getValue() {
            return value;
        }

        /**
         * @param value numeric status
         * @return the matching status, or Init when the value is unknown
         */
        public static Status fromValue(int value) {
            for (Status status : Status.values()) {
                if (status.getValue() == value) {
                    return status;
                }
            }
            //default value
            return Init;
        }
    }

    /**
     * Get thread count which is running
     *
     * @return thread count which is running, 0 when the thread pool has not been created
     * @since 0.4.1
     */
    public int getThreadAlive() {
        if (threadPool == null) {
            return 0;
        }
        return threadPool.getThreadAlive();
    }

    /**
     * Whether add urls extracted to download.<br>
     * Add urls to download when it is true, and just download seed urls when it is false. <br>
     * DO NOT set it unless you know what it means!
     *
     * @param spawnUrl spawnUrl
     * @return this
     * @since 0.4.0
     */
    public Spider setSpawnUrl(boolean spawnUrl) {
        this.spawnUrl = spawnUrl;
        return this;
    }

    /**
     * Returns the uuid set with {@link #setUUID(String)}; otherwise the domain of the
     * site, which is null until a domain is known. A random uuid is generated only when
     * there is no site.
     *
     * @return uuid
     */
    @Override
    public String getUUID() {
        if (uuid != null) {
            return uuid;
        }
        if (site != null) {
            return site.getDomain();
        }
        uuid = UUID.randomUUID().toString();
        return uuid;
    }

    public Spider setExecutorService(ExecutorService executorService) {
        checkIfRunning();
        this.executorService = executorService;
        return this;
    }

    @Override
    public Site getSite() {
        return site;
    }

    public List<SpiderListener> getSpiderListeners() {
        return spiderListeners;
    }

    public Spider setSpiderListeners(List<SpiderListener> spiderListeners) {
        this.spiderListeners = spiderListeners;
        return this;
    }

    /**
     * @return the time the components were last initialized, null before the first run
     */
    public Date getStartTime() {
        return startTime;
    }

    /**
     * @return the scheduler currently wrapped by the spider
     */
    public Scheduler getScheduler() {
        return scheduler.getScheduler();
    }

    /**
     * Set wait time when no url is polled.<br><br>
     *
     * @param emptySleepTime In MILLISECONDS.
     * @return this
     * @throws IllegalArgumentException if emptySleepTime is not greater than zero
     */
    public Spider setEmptySleepTime(long emptySleepTime) {
        if (emptySleepTime <= 0) {
            throw new IllegalArgumentException("emptySleepTime should be more than zero!");
        }
        this.emptySleepTime = emptySleepTime;
        return this;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
================================================
package us.codecraft.webmagic;
/**
* Listener of Spider on page processing. Used for monitoring and so on.
*
* @author code4crafer@gmail.com
* @since 0.5.0
*/
public interface SpiderListener {

    /**
     * Called after a request has been processed without an exception.
     *
     * @param request the processed request
     */
    void onSuccess(Request request);

    /**
     * Called when processing a request failed. Does nothing by default.
     *
     * @param request the request that failed
     * @deprecated Use {@link #onError(Request, Exception)} instead.
     */
    @Deprecated
    default void onError(Request request) {
    }

    /**
     * Called when processing a request failed.<br>
     * By default delegates to the deprecated {@link #onError(Request)}.
     *
     * @param request the request that failed
     * @param e       the exception thrown while processing it
     */
    default void onError(Request request, Exception e) {
        this.onError(request);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java
================================================
package us.codecraft.webmagic;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
/**
 * Wraps the {@link Scheduler} of a spider and adds a lock/condition pair on which a
 * thread can wait until another thread signals that new urls may be available.
 */
public class SpiderScheduler {

    private Scheduler scheduler;

    private final ReentrantLock newUrlLock = new ReentrantLock();

    private final Condition newUrlCondition = newUrlLock.newCondition();

    /**
     * @param scheduler the scheduler to delegate to
     */
    public SpiderScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * @return the wrapped scheduler
     */
    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * @param scheduler the scheduler to delegate to from now on
     */
    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Polls the next request from the wrapped scheduler.
     *
     * @param spider the spider polling
     * @return whatever the wrapped scheduler returns for this poll
     */
    public Request poll(Spider spider) {
        return scheduler.poll(spider);
    }

    /**
     * Pushes a request to the wrapped scheduler.
     *
     * @param request the request to push
     * @param spider  the spider pushing
     */
    public void push(Request request, Spider spider) {
        scheduler.push(request, spider);
    }

    /**
     * Waits until {@link #signalNewUrl()} is called or the wait time elapses.<br>
     * Returns immediately when no thread of the pool is alive.
     *
     * @param threadPool     the pool whose alive thread count is checked before waiting
     * @param emptySleepTime maximum wait time in milliseconds
     * @return true if the calling thread was interrupted while waiting, false otherwise
     */
    public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
        newUrlLock.lock();
        try {
            // double check
            if (threadPool.getThreadAlive() == 0) {
                return false;
            }
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
            return false;
        } catch (InterruptedException e) {
            // await() cleared the interrupt status; restore it so that code further up
            // the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            return true;
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes up every thread waiting in {@link #waitNewUrl(CountableThreadPool, long)}.
     */
    public void signalNewUrl() {
        // Acquire outside the try block: if lock() itself failed, the finally block
        // would otherwise call unlock() on a lock this thread does not hold.
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
================================================
package us.codecraft.webmagic;
/**
* Interface for identifying different tasks.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.scheduler.Scheduler
* @see us.codecraft.webmagic.pipeline.Pipeline
*/
public interface Task {

    /**
     * Unique id of the task.
     *
     * @return uuid
     */
    String getUUID();

    /**
     * Site the task belongs to.
     *
     * @return site
     */
    Site getSite();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
================================================
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
/**
 * Base class for downloaders: convenience {@code download} overloads plus
 * success/error callbacks that subclasses may override.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public abstract class AbstractDownloader implements Downloader {

    /**
     * Downloads a url without specifying a charset.
     *
     * @param url url
     * @return html
     */
    public Html download(String url) {
        return download(url, null);
    }

    /**
     * Downloads a url using a fresh {@link Site} configured with the given charset.
     *
     * @param url url
     * @param charset charset
     * @return html
     */
    public Html download(String url, String charset) {
        Request request = new Request(url);
        Task task = Site.me().setCharset(charset).toTask();
        Page downloaded = download(request, task);
        return (Html) downloaded.getHtml();
    }

    /**
     * Success callback; does nothing by default.
     *
     * @param request the {@link Request}.
     * @deprecated Use {@link #onSuccess(Page, Task)} instead.
     */
    @Deprecated
    protected void onSuccess(Request request) {
        // intentionally empty: extension point for subclasses
    }

    /**
     * Success callback; by default forwards to {@link #onSuccess(Request)}.
     *
     * @param request the {@link Request}.
     * @param task the {@link Task}.
     * @since 0.7.6
     * @deprecated Use {@link #onSuccess(Page, Task)} instead.
     */
    @Deprecated
    protected void onSuccess(Request request, Task task) {
        onSuccess(request);
    }

    /**
     * Success callback; by default forwards the page's request to
     * {@link #onSuccess(Request, Task)} so legacy overrides keep firing.
     *
     * @param page the {@link Page}.
     * @param task the {@link Task}.
     * @since 0.10.0
     */
    protected void onSuccess(Page page, Task task) {
        Request request = page.getRequest();
        onSuccess(request, task);
    }

    /**
     * Error callback; does nothing by default.
     *
     * @param request the {@link Request}.
     * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
     */
    @Deprecated
    protected void onError(Request request) {
        // intentionally empty: extension point for subclasses
    }

    /**
     * Error callback; by default forwards to {@link #onError(Request)}.
     *
     * @param request the {@link Request}.
     * @param task the {@link Task}.
     * @param e the exception.
     * @since 0.7.6
     * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
     */
    @Deprecated
    protected void onError(Request request, Task task, Throwable e) {
        onError(request);
    }

    /**
     * Error callback; by default forwards the page's request to
     * {@link #onError(Request, Task, Throwable)} so legacy overrides keep firing.
     *
     * @param page the {@link Page}.
     * @param task the {@link Task}.
     * @param e the exception.
     * @since 0.10.0
     */
    protected void onError(Page page, Task task, Throwable e) {
        Request request = page.getRequest();
        onError(request, task, e);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
================================================
package us.codecraft.webmagic.downloader;
import java.net.URI;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.ProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestWrapper;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Redirect strategy that supports following a 302 redirect of a POST request.
 * <p>
 * HttpClient's stock strategy, {@code httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy())},
 * does not carry the data of the original request over in the post/redirect/post case.
 * This implementation is based on the redirect strategy of the SeimiCrawler project:
 * https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java
 */
public class CustomRedirectStrategy extends LaxRedirectStrategy {
    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Builds the request used to follow a redirect.
     * <p>
     * For a POST, the original request is reused (re-targeted at the redirect location,
     * with its {@code Content-Length} header removed) when it is an
     * {@link HttpRequestWrapper}; otherwise an empty {@link HttpPost} is issued.
     * Every other method is followed with a {@link HttpGet}.
     *
     * @param request  the request that was redirected
     * @param response the redirect response
     * @param context  the execution context
     * @return the request to execute against the redirect location
     * @throws ProtocolException if the redirect location cannot be determined
     */
    @Override
    public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException {
        URI uri = getLocationURI(request, response, context);
        String method = request.getRequestLine().getMethod();
        if (!"post".equalsIgnoreCase(method)) {
            return new HttpGet(uri);
        }
        // Check the type explicitly instead of casting and catching the failure.
        if (request instanceof HttpRequestWrapper) {
            HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request;
            httpRequestWrapper.setURI(uri);
            httpRequestWrapper.removeHeaders("Content-Length");
            return httpRequestWrapper;
        }
        logger.error("强转为HttpRequestWrapper出错");
        return new HttpPost(uri);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
================================================
package us.codecraft.webmagic.downloader;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * A Downloader fetches web pages and stores them in {@link Page} objects. <br>
 * It exposes {@link #setThread(int)} because the downloader is usually the bottleneck of a crawler:
 * downloaders typically use mechanisms such as pooling, and the pool size depends on the thread count.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public interface Downloader {

    /**
     * Downloads a web page and stores it in a Page object.
     *
     * @param request request
     * @param task task
     * @return page
     */
    Page download(Request request, Task task);

    /**
     * Tells the downloader how many threads the spider uses.
     *
     * @param threadNum number of threads
     */
    void setThread(int threadNum);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
================================================
package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
/**
 * The http downloader based on HttpClient.
 * <p>
 * One {@link CloseableHttpClient} is created per site domain and cached for reuse.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class HttpClientDownloader extends AbstractDownloader {

    /**
     * Clients keyed by site domain. A plain {@link HashMap} is kept because the domain key
     * may be {@code null}; every access happens while holding the monitor of {@code this}.
     */
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();

    private ProxyProvider proxyProvider;

    /** When {@code true}, response headers are copied into the resulting {@link Page}. */
    private boolean responseHeader = true;

    public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
        this.httpUriRequestConverter = httpUriRequestConverter;
    }

    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * Returns the cached client for the site's domain, creating it on first use.
     *
     * @param site the site settings; when {@code null} the generator is asked for a client directly
     * @return the client to use for this site
     */
    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        // Both the lookup and the insertion run under the lock: reading a HashMap while
        // another thread is inserting into it is not safe.
        synchronized (this) {
            CloseableHttpClient httpClient = httpClients.get(domain);
            if (httpClient == null) {
                httpClient = httpClientGenerator.getClient(site);
                httpClients.put(domain, httpClient);
            }
            return httpClient;
        }
    }

    /**
     * Executes the request and converts the response into a {@link Page}.
     * <p>
     * An {@link IOException} during execution or response handling yields
     * {@code Page.ofFailure(request)} and triggers {@code onError}; otherwise {@code onSuccess}
     * is invoked with the resulting page.
     *
     * @param request the request to execute
     * @param task    the task providing the site settings
     * @return the resulting page, either successful or marked as failed
     * @throws NullPointerException if {@code task} or its site is {@code null}
     */
    @Override
    public Page download(Request request, Task task) {
        if (task == null || task.getSite() == null) {
            throw new NullPointerException("task or site can not be null");
        }
        CloseableHttpResponse httpResponse = null;
        CloseableHttpClient httpClient = getHttpClient(task.getSite());
        Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
        Page page = null;
        try {
            httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
            // A charset set on the request takes precedence over the site's charset.
            page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
            onSuccess(page, task);
            return page;
        } catch (IOException e) {
            page = Page.ofFailure(request);
            onError(page, task, e);
            return page;
        } finally {
            if (httpResponse != null) {
                //ensure the connection is released back to pool
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
            if (proxyProvider != null && proxy != null) {
                proxyProvider.returnProxy(proxy, page, task);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /**
     * Builds a successful {@link Page} from the response.
     *
     * @param request      the originating request
     * @param charset      charset used to decode the body; detected from the response when {@code null}
     * @param httpResponse the response to read
     * @param task         the task providing the default charset for detection
     * @return the populated page
     * @throws IOException if the response body cannot be read or decoded
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        HttpEntity entity = httpResponse.getEntity();
        byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];
        String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null;
        Page page = Page.ofSuccess(request);
        page.setBytes(bytes);
        // Binary content keeps only the raw bytes; no text decoding is attempted.
        if (!request.isBinaryContent()) {
            if (charset == null) {
                charset = getHtmlCharset(contentType, bytes, task);
            }
            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        if (responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }
        return page;
    }

    /**
     * Detects the charset from the content type and body, falling back to the site's
     * default charset and finally to the JVM default charset.
     */
    private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
        if (charset == null) {
            charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
        }
        return charset;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
================================================
package us.codecraft.webmagic.downloader;
import org.apache.commons.lang3.JavaVersion;
import org.apache.commons.lang3.SystemUtils;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
/**
* @author code4crafter@gmail.com <br>
* @since 0.4.0
*/
public class HttpClientGenerator {
private Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
public HttpClientGenerator() {
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory())
.build();
connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(100);
}
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
SSLContext sslContext = createIgnoreVerifySSL();
String[] supportedProtocols;
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
} else {
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
}
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
null,
//不进行主机校验
(host, sslSession) -> true); // 优先绕过安全证书
} catch (KeyManagementException | NoSuchAlgorithmException e) {
logger.error("ssl connection fail", e);
}
return SSLConnectionSocketFactory.getSocketFactory();
}
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
SSLContext sc = SSLContext.getInstance("TLS");
sc.init(null, new TrustManager[]{trustManager}, null);
return sc;
}
public HttpClientGenerator setPoolSize(int poolSize) {
connectionManager.setMaxTotal(poolSize);
return this;
}
public CloseableHttpClient getClient(Site site) {
return generateClient(site);
}
private CloseableHttpClient generateClient(Site site) {
HttpClientBuilder httpClientBuilder = HttpClients.custom();
httpClientBuilder.setConnectionManager(connectionManager);
if (site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent());
} else {
httpClientBuilder.setUserAgent("");
}
if (site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
final HttpRequest request,
final HttpContext context) throws HttpException, IOException {
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}
}
});
}
//解决post/redirect/post 302跳转问题
httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
socketConfigBuilder.setSoTimeout(site.getTimeOut());
SocketConfig socketConfig = socketConfigBuilder.build();
httpClientBuilder.setDefaultSocketConfig(socketConfig);
connectionManager.setDefaultSocketConfig(socketConfig);
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
if (site.isDisableCookieManagement()) {
httpClientBuilder.disableCookieManagement();
return;
}
CookieStore cookieStore = new BasicCookieStore();
for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(site.getDomain());
cookieStore.addCookie(cookie);
}
for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
cookie.setDomain(domainEntry.getKey());
cookieStore.addCookie(cookie);
}
}
httpClientBuilder.setDefaultCookieStore(cookieStore);
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
================================================
package us.codecraft.webmagic.downloader;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
/**
 * Simple holder pairing an {@link HttpUriRequest} with the {@link HttpClientContext}
 * it is executed with.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/4/8
 *         Time: 19:43
 * @since 0.7.0
 */
public class HttpClientRequestContext {

    /** The request to execute. */
    private HttpUriRequest httpUriRequest;

    /** The context to execute the request in. */
    private HttpClientContext httpClientContext;

    public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
        this.httpUriRequest = httpUriRequest;
    }

    public HttpUriRequest getHttpUriRequest() {
        return this.httpUriRequest;
    }

    public void setHttpClientContext(HttpClientContext httpClientContext) {
        this.httpClientContext = httpClientContext;
    }

    public HttpClientContext getHttpClientContext() {
        return this.httpClientContext;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
================================================
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthState;
import org.apache.http.auth.ChallengeState;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.cookie.BasicClientCookie;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.Map;
/**
 * Converts a webmagic {@link Request} into the HttpClient request and context used to execute it.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/18
 *         Time: 11:28
 *
 * @since 0.7.0
 */
public class HttpUriRequestConverter {

    /**
     * Builds the HttpClient request and context for the given request.
     *
     * @param request the request to convert
     * @param site    site settings supplying headers and timeouts; may be {@code null}
     * @param proxy   proxy to route through; may be {@code null}
     * @return holder containing the converted request and its context
     * @throws IllegalArgumentException if the request uses an unsupported HTTP method
     */
    public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
        HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
        httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
        httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
        return httpClientRequestContext;
    }

    /**
     * Creates the execution context: proxy credentials (when the proxy has a username)
     * and a cookie store holding the request's own cookies (when it has any).
     */
    private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
        HttpClientContext httpContext = new HttpClientContext();
        if (proxy != null && proxy.getUsername() != null) {
            AuthState authState = new AuthState();
            BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
            UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
            authState.update(proxyAuthScheme, proxyCredentials);
            httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
        }
        if (request.getCookies() != null && !request.getCookies().isEmpty()) {
            CookieStore cookieStore = new BasicCookieStore();
            for (Map.Entry<String, String> cookieEntry : request.getCookies().entrySet()) {
                BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
                cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl())));
                cookieStore.addCookie(cookie1);
            }
            httpContext.setCookieStore(cookieStore);
        }
        return httpContext;
    }

    /**
     * Creates the HttpClient request: method and body, site headers, timeouts, proxy
     * and finally the request's own headers.
     */
    private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
        // All site-dependent setup sits behind one null guard so a missing site
        // cannot cause a NullPointerException while reading its headers.
        if (site != null) {
            if (site.getHeaders() != null) {
                for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
                    requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
                }
            }
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut())
                    .setCookieSpec(CookieSpecs.STANDARD);
        }

        if (proxy != null) {
            requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        HttpUriRequest httpUriRequest = requestBuilder.build();
        // Request-level headers are added after the site headers.
        if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
            for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
                httpUriRequest.addHeader(header.getKey(), header.getValue());
            }
        }
        return httpUriRequest;
    }

    /**
     * Picks the request builder matching the request's HTTP method; a {@code null}
     * method is treated as GET. POST and PUT also get the request body attached.
     *
     * @throws IllegalArgumentException if the method is not one of the supported ones
     */
    private RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return RequestBuilder.get();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            return addFormParams(RequestBuilder.post(), request);
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return RequestBuilder.head();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return addFormParams(RequestBuilder.put(), request);
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return RequestBuilder.delete();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return RequestBuilder.trace();
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Attaches the request body, if any, as a byte-array entity with the body's content type.
     */
    private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
        if (request.getRequestBody() != null) {
            ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
            entity.setContentType(request.getRequestBody().getContentType());
            requestBuilder.setEntity(entity);
        }
        return requestBuilder;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html
================================================
<html>
<body>
Downloader is the part that downloads web pages and stores them in Page objects.
</body>
</html>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
================================================
package us.codecraft.webmagic.model;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Body of an HTTP request: raw bytes together with their content type and encoding.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/4/8
 */
public class HttpRequestBody implements Serializable {

    private static final long serialVersionUID = 5659170945717023595L;

    /** Content type values used by the factory methods of this class. */
    public static abstract class ContentType {

        public static final String JSON = "application/json";

        public static final String XML = "text/xml";

        public static final String FORM = "application/x-www-form-urlencoded";

        public static final String MULTIPART = "multipart/form-data";
    }

    private byte[] body;

    private String contentType;

    private String encoding;

    public HttpRequestBody() {
    }

    public HttpRequestBody(byte[] body, String contentType, String encoding) {
        this.body = body;
        this.contentType = contentType;
        this.encoding = encoding;
    }

    /**
     * Creates a JSON body from the given text.
     *
     * @param json     the JSON text
     * @param encoding name of the charset used to encode the text
     * @return the request body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody json(String json, String encoding) {
        return new HttpRequestBody(toBytes(json, encoding), ContentType.JSON, encoding);
    }

    /**
     * Creates an XML body from the given text.
     *
     * @param xml      the XML text
     * @param encoding name of the charset used to encode the text
     * @return the request body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody xml(String xml, String encoding) {
        return new HttpRequestBody(toBytes(xml, encoding), ContentType.XML, encoding);
    }

    /**
     * Creates a body from already-encoded bytes and an arbitrary content type.
     */
    public static HttpRequestBody custom(byte[] body, String contentType, String encoding) {
        return new HttpRequestBody(body, contentType, encoding);
    }

    /**
     * Creates a URL-encoded form body; each value is converted with {@link String#valueOf(Object)}.
     *
     * @param params   form fields
     * @param encoding name of the charset used to encode the form
     * @return the request body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody form(Map<String, Object> params, String encoding) {
        List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
        for (Map.Entry<String, Object> field : params.entrySet()) {
            String value = String.valueOf(field.getValue());
            pairs.add(new BasicNameValuePair(field.getKey(), value));
        }
        String formText = URLEncodedUtils.format(pairs, encoding);
        return new HttpRequestBody(toBytes(formText, encoding), ContentType.FORM, encoding);
    }

    /**
     * Encodes text with the named charset, reporting an unsupported charset
     * as an {@link IllegalArgumentException}.
     */
    private static byte[] toBytes(String text, String encoding) {
        try {
            return text.getBytes(encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    public byte[] getBody() {
        return body;
    }

    public void setBody(byte[] body) {
        this.body = body;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public String getEncoding() {
        return encoding;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/package.html
================================================
<html>
<body>
<div class="en">
Main class "Spider" and models.
</div>
</body>
</html>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import java.util.List;
/**
 * A pipeline that collects results and keeps them for later retrieval. <br>
 * Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)}
 *
 * @param <T> type of the collected results
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public interface CollectorPipeline<T> extends Pipeline {

    /**
     * Returns all results collected so far.
     *
     * @return collected results
     */
    List<T> getCollected();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.Map;
/**
 * Prints results to standard output.<br>
 * Usually used in test.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class ConsolePipeline implements Pipeline {

    /**
     * Prints the page url followed by one {@code key:<TAB>value} line per result field.
     *
     * @param resultItems the extracted results
     * @param task        the task the results belong to (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String pageUrl = resultItems.getRequest().getUrl();
        System.out.println("get page: " + pageUrl);
        resultItems.getAll().forEach((field, value) -> System.out.println(field + ":\t" + value));
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;
/**
 * Stores results in files.<br>
 * Each processed page is written to its own file, named after the MD5 hex digest of the
 * page url, inside a directory named after the task's UUID.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class FilePipeline extends FilePersistentBase implements Pipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * create a FilePipeline with default path"/data/webmagic/"
     */
    public FilePipeline() {
        setPath("/data/webmagic/");
    }

    /**
     * Creates a FilePipeline that writes below the given path.
     *
     * @param path base directory for the result files
     */
    public FilePipeline(String path) {
        setPath(path);
    }

    /**
     * Writes the url and every result field to the page's file, encoded as UTF-8.
     * Fields whose value is an {@link Iterable} are written one element per line.
     * I/O failures while opening the file are logged and otherwise ignored.
     *
     * @param resultItems the extracted results
     * @param task        the task the results belong to; its UUID names the target directory
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String url = resultItems.getRequest().getUrl();
        // try-with-resources closes the stream and writer even if writing throws.
        try (FileOutputStream out = new FileOutputStream(getFile(path + DigestUtils.md5Hex(url) + ".html"));
             PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(out, "UTF-8"))) {
            printWriter.println("url:\t" + url);
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                if (entry.getValue() instanceof Iterable) {
                    Iterable<?> value = (Iterable<?>) entry.getValue();
                    printWriter.println(entry.getKey() + ":");
                    for (Object o : value) {
                        printWriter.println(o);
                    }
                } else {
                    printWriter.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
 * A Pipeline handles persistence and offline processing of crawl results.<br>
 * Implement this interface to customize how results are persisted.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 * @see ConsolePipeline
 * @see FilePipeline
 */
public interface Pipeline {

    /**
     * Processes the extracted results.
     *
     * @param resultItems resultItems
     * @param task task
     */
    void process(ResultItems resultItems, Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java
================================================
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Collector pipeline that keeps every processed {@link ResultItems} in memory.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class ResultItemsCollectorPipeline implements CollectorPipeline<ResultItems> {

    /** Collected results; accessed only while holding the monitor of {@code this}. */
    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Appends the given results to the collection.
     *
     * @param resultItems the results to keep
     * @param task        the task the results belong to (unused)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the live backing list (not a copy). The method is synchronized on the same
     * monitor as {@link #process} so that results added by other threads are visible
     * to the caller.
     *
     * @return the collected results
     */
    @Override
    public synchronized List<ResultItems> getCollected() {
        return collector;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html
================================================
<html>
<body>
Pipeline is the part of the crawler that handles persistence and offline processing.
</body>
</html>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
================================================
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
 * Interface to be implemented to customize a crawler.
 *
 * <p>
 * In a PageProcessor, you can customize:
 * </p>
 * <ul>
 * <li>start URLs and other settings in {@link Site}</li>
 * <li>how the URLs to fetch are detected</li>
 * <li>how the data is extracted and stored</li>
 * </ul>
 *
 * @author code4crafter@gmail.com <br>
 * @see Site
 * @see Page
 * @since 0.1.0
 */
public interface PageProcessor {
/**
 * Processes the page: extracts the URLs to fetch next, and extracts and stores the data.
 *
 * @param page page
 */
void process(Page page);
/**
 * Returns the site settings. The default implementation returns the result of
 * {@link Site#me()} on every call.
 *
 * @return site
 * @see Site
 */
default Site getSite() {
return Site.me();
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
================================================
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import java.util.List;
/**
 * A simple PageProcessor that follows links matching a wildcard url pattern and
 * stores the title, html and smart content of every page.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SimplePageProcessor implements PageProcessor {

    private String urlPattern;

    private Site site;

    /**
     * Creates a processor for the given url pattern.
     *
     * @param urlPattern pattern in which {@code .} is taken literally and {@code *}
     *                   matches any run of characters other than {@code "}, {@code '} and {@code #}
     */
    public SimplePageProcessor(String urlPattern) {
        this.site = Site.me();
        // translate the "*" expression into a regex, step by step
        String dotsEscaped = urlPattern.replace(".", "\\.");
        String wildcardExpanded = dotsEscaped.replace("*", "[^\"'#]*");
        this.urlPattern = "(" + wildcardExpanded + ")";
    }

    @Override
    public void process(Page page) {
        // queue every link that matches the pattern
        page.addTargetRequests(page.getHtml().links().regex(urlPattern).all());
        // title via XPath
        page.putField("title", page.getHtml().xpath("//title"));
        page.putField("html", page.getHtml().toString());
        // main content via Readability
        page.putField("content", page.getHtml().smartContent());
    }

    @Override
    public Site getSite() {
        // the settings created in the constructor
        return this.site;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
================================================
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
 * Example processor that extracts the name and description fields from Baidu Baike pages.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcessor implements PageProcessor {

    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setUseGzip(true);

    @Override
    public void process(Page page) {
        String name = page.getHtml().css("dl.lemmaWgt-lemmaTitle h1","text").toString();
        page.putField("name", name);
        page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
    }

    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Demo: fetches one entry on its own, then a batch of entries, printing the results.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";

        // single download
        ResultItems single = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
        System.out.println(single);

        // multi download
        String[] keywords = {"风力发电", "太阳能", "地热发电", "地热发电"};
        List<String> urls = new ArrayList<String>();
        for (String keyword : keywords) {
            urls.add(String.format(urlTemplate, keyword));
        }
        List<ResultItems> batch = spider.<ResultItems>getAll(urls);
        for (ResultItems items : batch) {
            System.out.println(items.getAll());
        }
        spider.close();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
================================================
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Example processor that crawls GitHub owner and repository pages and extracts the
 * repository author, name and readme.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.2
 */
public class GithubRepoPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Queues owner and repository links found on the page and stores the extracted fields.
     * Pages without a repository name are marked as skipped.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // repository pages: https://github.com/<owner>/<repo>
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
        // owner pages: https://github.com/<owner>. The quantifier is required; without it the
        // captured url is cut off after the first character of the owner name.
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
        // owner names may contain hyphens, so use the same character class as the link patterns
        page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
        page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
        if (page.getResultItems().get("name") == null) {
            // not a repository page: skip it
            page.setSkip(true);
        }
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
================================================
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Example processor for Zhihu: follows answer links and extracts the question title,
 * the question text and the answer text.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.6.0
 */
public class ZhihuPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());

        String title = page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString();
        page.putField("title", title);

        String question = page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString();
        page.putField("question", question);

        String answer = page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString();
        page.putField("answer", answer);

        // a page without a title carries no question: skip it
        boolean titleMissing = page.getResultItems().get("title") == null;
        if (titleMissing) {
            page.setSkip(true);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new ZhihuPageProcessor()).addUrl("https://www.zhihu.com/explore").run();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html
================================================
<html>
<body>
PageProcessor is the customizable part of a crawler for a specific site.
</body>
</html>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
================================================
package us.codecraft.webmagic.proxy;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.StringUtils;
/**
 * Description of a proxy server: optional scheme, host, port and optional credentials.
 */
public class Proxy {

    private String scheme;

    private String host;

    private int port;

    private String username;

    private String password;

    /**
     * Builds a proxy from a URI such as {@code http://user:pass@host:8080}.
     * <p>
     * The user info is split at its first colon only, so a password may itself contain
     * colons. Empty user name or password parts are stored as {@code null}.
     *
     * @param uri the proxy URI
     * @return the proxy described by {@code uri}
     */
    public static Proxy create(final URI uri) {
        Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme());
        String userInfo = uri.getUserInfo();
        if (userInfo != null) {
            // String.split(":") returns an empty array for ":" and drops everything after a
            // second colon, so locate the separator explicitly instead.
            int separator = userInfo.indexOf(':');
            String user = separator < 0 ? userInfo : userInfo.substring(0, separator);
            String pass = separator < 0 ? "" : userInfo.substring(separator + 1);
            proxy.username = user.isEmpty() ? null : user;
            proxy.password = pass.isEmpty() ? null : pass;
        }
        return proxy;
    }

    /**
     * @param host proxy host
     * @param port proxy port
     */
    public Proxy(String host, int port) {
        this(host, port, null);
    }

    /**
     * @param host   proxy host
     * @param port   proxy port
     * @param scheme proxy scheme, may be {@code null}
     */
    public Proxy(String host, int port, String scheme) {
        this.host = host;
        this.port = port;
        this.scheme = scheme;
    }

    /**
     * Creates a proxy with credentials; the scheme is left {@code null}.
     *
     * @param host     proxy host
     * @param port     proxy port
     * @param username user name, may be {@code null}
     * @param password password, may be {@code null}
     */
    public Proxy(String host, int port, String username, String password) {
        this.host = host;
        this.port = port;
        this.username = username;
        this.password = password;
    }

    public String getScheme() {
        return scheme;
    }

    public void setScheme(String scheme) {
        this.scheme = scheme;
    }

    public String getHost() {
        return host;
    }

    public int getPort() {
        return port;
    }

    public String getUsername() {
        return username;
    }

    public String getPassword() {
        return password;
    }

    /**
     * Renders this proxy as a URI. User name and password are url-encoded before being
     * placed in the user-info component.
     *
     * @return the URI form of this proxy
     * @throws IllegalArgumentException if the components do not form a valid URI
     */
    public URI toURI() {
        final StringBuilder userInfoBuffer = new StringBuilder();
        if (username != null) {
            userInfoBuffer.append(urlencode(username));
        }
        if (password != null) {
            userInfoBuffer.append(":").append(urlencode(password));
        }
        // no credentials at all: pass null so the URI carries no user-info component
        final String userInfo = userInfoBuffer.length() == 0 ? null : userInfoBuffer.toString();
        try {
            return new URI(scheme, userInfo, host, port, null, null, null);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException(e.getMessage(), e);
        }
    }

    /** Url-encodes {@code s} as UTF-8. */
    private String urlencode(String s) {
        String enc = StandardCharsets.UTF_8.name();
        try {
            return URLEncoder.encode(s, enc);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException(e);
        }
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Proxy proxy = (Proxy) o;
        if (port != proxy.port) return false;
        if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false;
        if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false;
        if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false;
        return password != null ? password.equals(proxy.password) : proxy.password == null;
    }

    @Override
    public int hashCode() {
        int result = host != null ? host.hashCode() : 0;
        result = 31 * result + port;
        result = 31 * result + (scheme != null ? scheme.hashCode() : 0);
        result = 31 * result + (username != null ? username.hashCode() : 0);
        result = 31 * result + (password != null ? password.hashCode() : 0);
        return result;
    }

    /** Returns the string form of {@link #toURI()}. */
    @Override
    public String toString() {
        return this.toURI().toString();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
================================================
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Supplies proxies for downloads and receives them back once a download has completed. <br>
 *
 * @since 0.7.0
 */
public interface ProxyProvider {
/**
 * Returns a proxy to the provider after a download has completed.
 *
 * @param proxy the proxy config, containing host, port and identity info
 * @param page the download result
 * @param task the download task
 */
void returnProxy(Proxy proxy, Page page, Task task);
/**
 * Gets a proxy for the task by some strategy.
 * <p>
 * The default implementation always throws {@link UnsupportedOperationException}.
 *
 * @param task the download task
 * @return proxy
 * @deprecated Use {@link #getProxy(Request, Task)} instead.
 */
@Deprecated
default Proxy getProxy(Task task) {
throw new UnsupportedOperationException();
}
/**
 * Returns a proxy for the request.
 * <p>
 * The default implementation ignores the request and delegates to
 * {@link #getProxy(Task)}, so an implementation must override at least one of the two.
 *
 * @param request the request
 * @param task the download task
 * @return proxy
 * @since 0.9.0
 */
default Proxy getProxy(Request request, Task task) {
return this.getProxy(task);
}
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
================================================
package us.codecraft.webmagic.proxy;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * A simple ProxyProvider. Hands out proxies round-robin, without heartbeat or error checks.
 * It can be used when all proxies are stable.
 *
 * @author code4crafter@gmail.com
 * Date: 17/4/16
 * Time: 10:18
 * @since 0.7.0
 */
public class SimpleProxyProvider implements ProxyProvider {

    private final List<Proxy> proxies;

    /** Index of the proxy handed out last; starts at -1 so the first call yields index 0. */
    private final AtomicInteger pointer;

    /**
     * @param proxies the proxies to rotate through; the list is used as given, not copied
     */
    public SimpleProxyProvider(List<Proxy> proxies) {
        this(proxies, new AtomicInteger(-1));
    }

    private SimpleProxyProvider(List<Proxy> proxies, AtomicInteger pointer) {
        this.proxies = proxies;
        this.pointer = pointer;
    }

    /**
     * Creates a provider over an unmodifiable copy of the given proxies.
     *
     * @param proxies the proxies to rotate through
     * @return a new provider
     */
    public static SimpleProxyProvider from(Proxy... proxies) {
        List<Proxy> proxiesTemp = new ArrayList<Proxy>(proxies.length);
        Collections.addAll(proxiesTemp, proxies);
        return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp));
    }

    @Override
    public void returnProxy(Proxy proxy, Page page, Task task) {
        // nothing to do: proxies are never checked or removed
    }

    /**
     * Returns the next proxy in round-robin order; request and task are not inspected.
     *
     * @param request the request
     * @param task    the download task
     * @return the next proxy
     * @throws IllegalStateException if no proxies are configured
     */
    @Override
    public Proxy getProxy(Request request, Task task) {
        return proxies.get(incrForLoop());
    }

    /**
     * Advances the pointer by one, wrapping at the list size, and returns the new index.
     * The advance and the wrap happen in one atomic update, so concurrent callers never
     * receive the same step of the rotation and the counter never grows past the list size.
     */
    private int incrForLoop() {
        final int size = proxies.size();
        if (size == 0) {
            // fail with a clear message instead of an opaque "/ by zero"
            throw new IllegalStateException("No proxies configured");
        }
        return pointer.updateAndGet(current -> (current + 1) % size);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;
/**
 * Base scheduler that filters duplicate requests and only pushes the ones that pass.<br><br>
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public abstract class DuplicateRemovedScheduler implements Scheduler {

    protected Logger logger = LoggerFactory.getLogger(getClass());

    private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();

    public DuplicateRemover getDuplicateRemover() {
        return duplicatedRemover;
    }

    public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
        this.duplicatedRemover = duplicatedRemover;
        return this;
    }

    /**
     * Forwards the request to {@link #pushWhenNoDuplicate(Request, Task)} when it is a
     * cycle-retry request, a POST request, or not yet known to the duplicate remover.
     * The checks run in that order and stop at the first one that accepts the request.
     */
    @Override
    public void push(Request request, Task task) {
        logger.trace("get a candidate url {}", request.getUrl());
        boolean accepted = shouldReserved(request)
                || noNeedToRemoveDuplicate(request)
                || !duplicatedRemover.isDuplicate(request, task);
        if (!accepted) {
            return;
        }
        logger.debug("push to queue {}", request.getUrl());
        pushWhenNoDuplicate(request, task);
    }

    /** A request carrying the cycle-tried-times extra is always kept. */
    protected boolean shouldReserved(Request request) {
        Object cycleTriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
        return cycleTriedTimes != null;
    }

    /** POST requests bypass duplicate removal. */
    protected boolean noNeedToRemoveDuplicate(Request request) {
        String method = request.getMethod();
        return HttpConstant.Method.POST.equalsIgnoreCase(method);
    }

    /** Hook for subclasses to store an accepted request; does nothing by default. */
    protected void pushWhenNoDuplicate(Request request, Task task) {
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Task;
/**
 * A scheduler whose requests can be counted for monitoring.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public interface MonitorableScheduler extends Scheduler {
/**
 * @param task the task of spider
 * @return the number of requests left (still waiting) in the scheduler
 */
public int getLeftRequestsCount(Task task);
/**
 * @param task the task of spider
 * @return the total number of requests counted for the task
 */
public int getTotalRequestsCount(Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.NumberUtils;
import java.util.Comparator;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.PriorityBlockingQueue;
/**
 * Priority scheduler. Requests with a positive priority are polled first, then requests
 * with priority 0, then requests with a negative priority. <br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {

    public static final int INITIAL_CAPACITY = 5;

    /** Shared ordering for both priority queues: the negated result of NumberUtils.compareLong. */
    private static final Comparator<Request> BY_PRIORITY_DESC = new Comparator<Request>() {
        @Override
        public int compare(Request o1, Request o2) {
            return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority());
        }
    };

    private BlockingQueue<Request> noPriorityQueue = new LinkedBlockingQueue<Request>();

    private PriorityBlockingQueue<Request> priorityQueuePlus =
            new PriorityBlockingQueue<Request>(INITIAL_CAPACITY, BY_PRIORITY_DESC);

    private PriorityBlockingQueue<Request> priorityQueueMinus =
            new PriorityBlockingQueue<Request>(INITIAL_CAPACITY, BY_PRIORITY_DESC);

    /**
     * Stores the request in the queue matching the sign of its priority.
     *
     * @param request the accepted request
     * @param task    the task of spider
     */
    @Override
    public void pushWhenNoDuplicate(Request request, Task task) {
        if (request.getPriority() == 0) {
            noPriorityQueue.add(request);
        } else if (request.getPriority() > 0) {
            priorityQueuePlus.put(request);
        } else {
            priorityQueueMinus.put(request);
        }
    }

    /**
     * @param task the task of spider
     * @return the next request, or {@code null} when all three queues are empty
     */
    @Override
    public synchronized Request poll(Task task) {
        Request poll = priorityQueuePlus.poll();
        if (poll != null) {
            return poll;
        }
        poll = noPriorityQueue.poll();
        if (poll != null) {
            return poll;
        }
        return priorityQueueMinus.poll();
    }

    /**
     * @param task the task of spider
     * @return the number of requests waiting in all three queues
     */
    @Override
    public int getLeftRequestsCount(Task task) {
        // count every queue; requests with a non-zero priority are pending as well
        return noPriorityQueue.size() + priorityQueuePlus.size() + priorityQueueMinus.size();
    }

    @Override
    public int getTotalRequestsCount(Task task) {
        return getDuplicateRemover().getTotalRequestsCount(task);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
 * Basic Scheduler implementation.<br>
 * Requests to fetch are kept in a {@link LinkedBlockingQueue}; duplicates are filtered by
 * the duplicate remover inherited from {@link DuplicateRemovedScheduler}.
 *
 * Note: if you use this {@link QueueScheduler} with a bounded capacity and
 * {@link Site#getCycleRetryTimes()} enabled, you may encounter a deadlock
 * when the queue is full.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {

    private final BlockingQueue<Request> queue;

    /** Creates a {@code QueueScheduler} backed by a queue built without an explicit capacity. */
    public QueueScheduler() {
        this.queue = new LinkedBlockingQueue<>();
    }

    /**
     * Creates a {@code QueueScheduler} with the given (fixed) capacity.
     *
     * @param capacity the capacity of this queue,
     * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)}
     * @since 0.8.0
     */
    public QueueScheduler(int capacity) {
        this.queue = new LinkedBlockingQueue<>(capacity);
    }

    /**
     * Enqueues the request, blocking while the queue is full. If the thread is interrupted
     * while waiting, the interrupt flag is restored and the request is not enqueued.
     */
    @Override
    public void pushWhenNoDuplicate(Request request, Task task) {
        int remaining = this.queue.remainingCapacity();
        logger.trace("Remaining capacity: {}", remaining);
        try {
            queue.put(request);
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    @Override
    public Request poll(Task task) {
        return queue.poll();
    }

    @Override
    public int getLeftRequestsCount(Task task) {
        return queue.size();
    }

    @Override
    public int getTotalRequestsCount(Task task) {
        return getDuplicateRemover().getTotalRequestsCount(task);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java
================================================
package us.codecraft.webmagic.scheduler;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Scheduler is the url-management part of a crawler.<br>
 * Implement this interface to:
 * manage the urls to fetch;
 * remove duplicate urls.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public interface Scheduler {
/**
 * Adds a request (url) to fetch.
 *
 * @param request request
 * @param task task
 */
public void push(Request request, Task task);
/**
 * Gets the next request (url) to crawl.
 *
 * @param task the task of spider
 * @return the request to crawl
 */
public Request poll(Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
================================================
package us.codecraft.webmagic.scheduler.component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
 * Detects and removes duplicate requests.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.1
 */
public interface DuplicateRemover {
/**
 * Checks whether the request is a duplicate.
 *
 * @param request request
 * @param task task
 * @return true if the request is a duplicate
 */
public boolean isDuplicate(Request request, Task task);
/**
 * Resets the duplicate check.
 *
 * @param task task
 */
public void resetDuplicateCheck(Task task);
/**
 * Gets the total requests count, for monitoring.
 *
 * @param task task
 * @return total number of requests
 */
public int getTotalRequestsCount(Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
================================================
package us.codecraft.webmagic.scheduler.component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
 * {@link DuplicateRemover} that remembers every seen url in an in-memory concurrent set.
 * The task argument is ignored: one set is shared by all tasks using this instance.
 *
 * @author code4crafer@gmail.com
 */
public class HashSetDuplicateRemover implements DuplicateRemover {

    private Set<String> urls = ConcurrentHashMap.newKeySet();

    @Override
    public boolean isDuplicate(Request request, Task task) {
        // add() reports whether the url was new; a duplicate is the opposite
        boolean firstSeen = urls.add(getUrl(request));
        return !firstSeen;
    }

    /** Key used for duplicate detection; the request url by default. */
    protected String getUrl(Request request) {
        return request.getUrl();
    }

    @Override
    public void resetDuplicateCheck(Task task) {
        urls.clear();
    }

    @Override
    public int getTotalRequestsCount(Task task) {
        return urls.size();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html
================================================
<html>
<body>
Component of scheduler.
</body>
</html>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html
================================================
<html>
<body>
The scheduler is the url-management part of a crawler.
</body>
</html>
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
/**
 * Skeleton {@link Selectable}: subclasses provide the source texts, and the selection
 * operations defined here run over those texts.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.2
 */
public abstract class AbstractSelectable implements Selectable {

    /** @return the texts every selection in this class operates on */
    protected abstract List<String> getSourceTexts();

    @Override
    public Selectable css(String selector) {
        return $(selector);
    }

    @Override
    public Selectable css(String selector, String attrName) {
        return $(selector, attrName);
    }

    /** Applies the selector once per text and keeps every non-null result. */
    protected Selectable select(Selector selector, List<String> strings) {
        List<String> picked = new ArrayList<String>();
        for (String source : strings) {
            String single = selector.select(source);
            if (single == null) {
                continue;
            }
            picked.add(single);
        }
        return new PlainText(picked);
    }

    /** Applies the selector to each text and concatenates all result lists. */
    protected Selectable selectList(Selector selector, List<String> strings) {
        List<String> collected = new ArrayList<String>();
        for (String source : strings) {
            collected.addAll(selector.selectList(source));
        }
        return new PlainText(collected);
    }

    @Override
    public List<String> all() {
        return getSourceTexts();
    }

    @Override
    public Selectable jsonPath(String jsonPath) {
        throw new UnsupportedOperationException();
    }

    /** @return the first element of {@link #all()}, or {@code null} when there is none */
    @Override
    public String get() {
        List<String> everything = all();
        if (CollectionUtils.isEmpty(everything)) {
            return null;
        }
        return everything.get(0);
    }

    @Override
    public Selectable select(Selector selector) {
        return select(selector, getSourceTexts());
    }

    @Override
    public Selectable selectList(Selector selector) {
        return selectList(selector, getSourceTexts());
    }

    @Override
    public Selectable regex(String regex) {
        return selectList(Selectors.regex(regex), getSourceTexts());
    }

    @Override
    public Selectable regex(String regex, int group) {
        return selectList(Selectors.regex(regex, group), getSourceTexts());
    }

    @Override
    public Selectable replace(String regex, String replacement) {
        return select(new ReplaceSelector(regex,replacement), getSourceTexts());
    }

    /** @return the first source text, or {@code null} when there is none */
    public String getFirstSourceText() {
        List<String> texts = getSourceTexts();
        if (CollectionUtils.isEmpty(texts)) {
            return null;
        }
        return texts.get(0);
    }

    /** Same as {@link #get()}; may therefore return {@code null}. */
    @Override
    public String toString() {
        return get();
    }

    @Override
    public boolean match() {
        List<String> texts = getSourceTexts();
        return CollectionUtils.isNotEmpty(texts);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Chains selectors into a pipeline. <br>
 * Each selector works on the output of the one before it.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.0
 */
public class AndSelector implements Selector {

    private List<Selector> selectors = new ArrayList<Selector>();

    public AndSelector(Selector... selectors) {
        for (Selector each : selectors) {
            this.selectors.add(each);
        }
    }

    public AndSelector(List<Selector> selectors) {
        this.selectors = selectors;
    }

    /** Feeds the text through every selector in turn; stops with {@code null} once a stage yields none. */
    @Override
    public String select(String text) {
        String current = text;
        for (Selector stage : selectors) {
            if (current == null) {
                return null;
            }
            current = stage.select(current);
        }
        return current;
    }

    /**
     * The first selector runs on the text; each later selector runs on every result of the
     * previous stage. Returns early once a later stage produces nothing.
     */
    @Override
    public List<String> selectList(String text) {
        if (selectors.isEmpty()) {
            return new ArrayList<String>();
        }
        List<String> current = selectors.get(0).selectList(text);
        for (Selector stage : selectors.subList(1, selectors.size())) {
            List<String> next = new ArrayList<String>();
            for (String item : current) {
                next.addAll(stage.selectList(item));
            }
            current = next;
            if (current.isEmpty()) {
                return current;
            }
        }
        return current;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
================================================
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import us.codecraft.webmagic.utils.BaseSelectorUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for selectors that work on jsoup elements. The text-based methods parse
 * the text into a document and delegate to their element-based counterparts.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.0
 */
public abstract class BaseElementSelector implements Selector, ElementSelector {

    private Document parse(String text) {
        // Jsoup could not parse <tr></tr> or <td></td> tag directly
        // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
        String prepared = BaseSelectorUtils.preParse(text);
        return Jsoup.parse(prepared);
    }

    @Override
    public String select(String text) {
        if (text == null) {
            return null;
        }
        return select(parse(text));
    }

    @Override
    public List<String> selectList(String text) {
        if (text == null) {
            return new ArrayList<String>();
        }
        return selectList(parse(text));
    }

    /** @return the first matching element, or {@code null} for a {@code null} text */
    public Element selectElement(String text) {
        if (text == null) {
            return null;
        }
        return selectElement(parse(text));
    }

    /** @return all matching elements; an empty list for a {@code null} text */
    public List<Element> selectElements(String text) {
        if (text == null) {
            return new ArrayList<Element>();
        }
        return selectElements(parse(text));
    }

    public abstract Element selectElement(Element element);

    public abstract List<Element> selectElements(Element element);

    public abstract boolean hasAttribute();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
/**
 * CSS selector. Based on Jsoup.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class CssSelector extends BaseElementSelector {

    private String selectorText;

    private String attrName;

    public CssSelector(String selectorText) {
        this.selectorText = selectorText;
    }

    public CssSelector(String selectorText, String attrName) {
        this.selectorText = selectorText;
        this.attrName = attrName;
    }

    /**
     * Extracts the value of a matched element: its outer html when no attribute name is
     * set, otherwise the pseudo attribute ({@code innerHtml}, {@code text},
     * {@code allText}, case-insensitive) or the real attribute of that name.
     */
    private String getValue(Element element) {
        if (attrName == null) {
            return element.outerHtml();
        }
        if ("innerHtml".equalsIgnoreCase(attrName)) {
            return element.html();
        }
        if ("text".equalsIgnoreCase(attrName)) {
            return getText(element);
        }
        if ("allText".equalsIgnoreCase(attrName)) {
            return element.text();
        }
        return element.attr(attrName);
    }

    /** Concatenates the text of the element's direct text-node children only. */
    protected String getText(Element element) {
        StringBuilder ownText = new StringBuilder();
        for (Node child : element.childNodes()) {
            if (!(child instanceof TextNode)) {
                continue;
            }
            ownText.append(((TextNode) child).text());
        }
        return ownText.toString();
    }

    @Override
    public String select(Element element) {
        List<Element> matches = selectElements(element);
        if (CollectionUtils.isNotEmpty(matches)) {
            return getValue(matches.get(0));
        }
        return null;
    }

    @Override
    public List<String> selectList(Element doc) {
        List<String> values = new ArrayList<String>();
        List<Element> matches = selectElements(doc);
        if (CollectionUtils.isEmpty(matches)) {
            return values;
        }
        for (Element match : matches) {
            String value = getValue(match);
            if (value != null) {
                values.add(value);
            }
        }
        return values;
    }

    @Override
    public Element selectElement(Element element) {
        Elements matches = element.select(selectorText);
        if (CollectionUtils.isEmpty(matches)) {
            return null;
        }
        return matches.get(0);
    }

    @Override
    public List<Element> selectElements(Element element) {
        return element.select(selectorText);
    }

    @Override
    public boolean hasAttribute() {
        return attrName != null;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
================================================
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.List;
/**
 * Selector (extractor) that works on html elements.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public interface ElementSelector {
/**
 * Extracts a single result as text.<br>
 * If there is more than one result, only the first is chosen.
 *
 * @param element element
 * @return result
 */
public String select(Element element);
/**
 * Extracts all results as text.<br>
 *
 * @param element element
 * @return results
 */
public List<String> selectList(Element element);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
================================================
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.List;
/**
 * Selectable html document.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class Html extends HtmlNode {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Disable jsoup html entity escape. It can be set just before any Html instance is created.
     * @deprecated
     */
    public static boolean DISABLE_HTML_ENTITY_ESCAPE = false;

    /**
     * Store parsed document for better performance when only one text exist.
     */
    private Document document;

    /**
     * Parses the text with the given base url. A parse failure is logged and leaves the
     * document {@code null}.
     */
    public Html(String text, String url) {
        Document parsed = null;
        try {
            parsed = Jsoup.parse(text, url);
        } catch (Exception e) {
            logger.warn("parse document error ", e);
        }
        this.document = parsed;
    }

    /** Parses the text. A parse failure is logged and leaves the document {@code null}. */
    public Html(String text) {
        Document parsed = null;
        try {
            parsed = Jsoup.parse(text);
        } catch (Exception e) {
            logger.warn("parse document error ", e);
        }
        this.document = parsed;
    }

    public Html(Document document) {
        this.document = document;
    }

    public Document getDocument() {
        return document;
    }

    @Override
    protected List<Element> getElements() {
        return Collections.<Element>singletonList(getDocument());
    }

    /**
     * Runs the selector on the parsed document when it is an {@link ElementSelector},
     * otherwise on the first source text.
     *
     * @param selector selector
     * @return result
     */
    public String selectDocument(Selector selector) {
        if (!(selector instanceof ElementSelector)) {
            return selector.select(getFirstSourceText());
        }
        return ((ElementSelector) selector).select(getDocument());
    }

    /** List variant of {@link #selectDocument(Selector)}. */
    public List<String> selectDocumentForList(Selector selector) {
        if (!(selector instanceof ElementSelector)) {
            return selector.selectList(getFirstSourceText());
        }
        return ((ElementSelector) selector).selectList(getDocument());
    }

    public static Html create(String text) {
        return new Html(text);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
================================================
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
/**
 * Selectable backed by a list of jsoup elements.
 *
 * @author code4crafer@gmail.com
 */
public class HtmlNode extends AbstractSelectable {

    private final List<Element> elements;

    public HtmlNode(List<Element> elements) {
        this.elements = elements;
    }

    public HtmlNode() {
        elements = null;
    }

    protected List<Element> getElements() {
        return elements;
    }

    public Selectable smartContent() {
        return select(Selectors.smartContent(), getSourceTexts());
    }

    public Selectable smartContent(int threshold) {
        return select(Selectors.smartContent(threshold), getSourceTexts());
    }

    @Override
    public Selectable links() {
        return selectElements(new LinksSelector());
    }

    @Override
    public Selectable xpath(String xpath) {
        return selectElements(Selectors.xpath(xpath));
    }

    /** Element selectors run on the elements themselves, any other selector on the source texts. */
    @Override
    public Selectable selectList(Selector selector) {
        if (!(selector instanceof BaseElementSelector)) {
            return selectList(selector, getSourceTexts());
        }
        return selectElements((BaseElementSelector) selector);
    }

    @Override
    public Selectable select(Selector selector) {
        return selectList(selector);
    }

    /**
     * select elements
     *
     * @param elementSelector elementSelector
     * @return an {@link HtmlNode} of the matched elements, or a {@link PlainText} of the
     * extracted values when the selector targets an attribute
     */
    protected Selectable selectElements(BaseElementSelector elementSelector) {
        ListIterator<Element> cursor = getElements().listIterator();
        if (elementSelector.hasAttribute()) {
            // has attribute, consider as plaintext
            List<String> values = new ArrayList<String>();
            while (cursor.hasNext()) {
                Element root = checkElementAndConvert(cursor);
                values.addAll(elementSelector.selectList(root));
            }
            return new PlainText(values);
        }
        List<Element> matched = new ArrayList<Element>();
        while (cursor.hasNext()) {
            Element root = checkElementAndConvert(cursor);
            matched.addAll(elementSelector.selectElements(root));
        }
        return new HtmlNode(matched);
    }

    /**
     * Only document can be select
     * See: https://github.com/code4craft/webmagic/issues/113
     * <p>
     * A non-document element is cloned into a fresh document, and that document replaces
     * the element in the list being iterated.
     *
     * @param elementIterator elementIterator
     * @return element element
     */
    private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
        Element current = elementIterator.next();
        if (current instanceof Document) {
            return current;
        }
        Document wrapper = new Document(current.ownerDocument().baseUri());
        wrapper.appendChild(current.clone());
        elementIterator.set(wrapper);
        return wrapper;
    }

    @Override
    public Selectable $(String selector) {
        return selectElements(Selectors.$(selector));
    }

    @Override
    public Selectable $(String selector, String attrName) {
        return selectElements(Selectors.$(selector, attrName));
    }

    /** Wraps each element in its own single-element node. */
    @Override
    public List<Selectable> nodes() {
        List<Selectable> wrapped = new ArrayList<Selectable>();
        for (Element each : getElements()) {
            // a mutable list is needed: selection may replace the element in place
            List<Element> single = new ArrayList<Element>(1);
            single.add(each);
            wrapped.add(new HtmlNode(single));
        }
        return wrapped;
    }

    @Override
    protected List<String> getSourceTexts() {
        List<Element> current = getElements();
        List<String> texts = new ArrayList<String>(current.size());
        for (Element each : current) {
            texts.add(each.toString());
        }
        return texts;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
================================================
package us.codecraft.webmagic.selector;
import com.alibaba.fastjson.JSON;
import us.codecraft.xsoup.XTokenQueue;
import java.util.List;
/**
 * Plain text that is treated as JSON: supports JSONP padding removal,
 * conversion to objects and JSON Path extraction.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public class Json extends PlainText {

    public Json(List<String> strings) {
        super(strings);
    }

    public Json(String text) {
        super(text);
    }

    /**
     * remove padding for JSONP
     *
     * @param padding padding
     * @return json after padding removed
     */
    public Json removePadding(String padding) {
        XTokenQueue queue = new XTokenQueue(getFirstSourceText());
        queue.consumeWhitespace();
        queue.consume(padding);
        queue.consumeWhitespace();
        // what remains is expected to be the parenthesised payload
        return new Json(queue.chompBalancedNotInQuotes('(', ')'));
    }

    /**
     * Parse the first source text into an object.
     *
     * @param clazz target type
     * @param <T>   target type
     * @return parsed object, or null when there is no first source text
     */
    public <T> T toObject(Class<T> clazz) {
        String firstText = getFirstSourceText();
        if (firstText == null) {
            return null;
        }
        return JSON.parseObject(firstText, clazz);
    }

    /**
     * Parse the first source text into a list.
     *
     * @param clazz element type
     * @param <T>   element type
     * @return parsed list, or null when there is no first source text
     */
    public <T> List<T> toList(Class<T> clazz) {
        String firstText = getFirstSourceText();
        if (firstText == null) {
            return null;
        }
        return JSON.parseArray(firstText, clazz);
    }

    @Override
    public Selectable jsonPath(String jsonPath) {
        return selectList(new JsonPathSelector(jsonPath), getSourceTexts());
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.JsonPath;
/**
 * JsonPath selector.<br>
 * Used to extract content from JSON.<br>
 * The path expression is compiled once in the constructor and reused for every selection.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public class JsonPathSelector implements Selector {

    private final String jsonPathStr;

    private final JsonPath jsonPath;

    /**
     * @param jsonPathStr JSON Path expression to compile
     */
    public JsonPathSelector(String jsonPathStr) {
        this.jsonPathStr = jsonPathStr;
        this.jsonPath = JsonPath.compile(this.jsonPathStr);
    }

    @SuppressWarnings("unused")
    public String getJsonPathStr() {
        return jsonPathStr;
    }

    /**
     * Extract a single result.
     *
     * @param text JSON text
     * @return null when the path reads null; the first item when the path reads a non-empty list;
     *         otherwise the string form of the value that was read
     */
    @Override
    public String select(String text) {
        Object object = jsonPath.read(text);
        if (object == null) {
            return null;
        }
        if (object instanceof List) {
            List<?> list = (List<?>) object;
            if (!list.isEmpty()) {
                return toString(list.get(0));
            }
        }
        // Go through toString(Object) so that a single JSON object is serialized as JSON text,
        // exactly as selectList does, instead of using the map's own toString().
        return toString(object);
    }

    /**
     * Render a value as text: maps are serialized as JSON, everything else via String.valueOf.
     */
    private String toString(Object object) {
        if (object instanceof Map) {
            return JSON.toJSONString(object);
        }
        return String.valueOf(object);
    }

    /**
     * Extract all results.
     *
     * @param text JSON text
     * @return every item of a list result, a single-entry list for a non-list result,
     *         or an empty list when the path reads null
     */
    @Override
    public List<String> selectList(String text) {
        List<String> list = new ArrayList<>();
        Object object = jsonPath.read(text);
        if (object == null) {
            return list;
        }
        if (object instanceof List) {
            for (Object item : (List<?>) object) {
                list.add(toString(item));
            }
        } else {
            list.add(toString(object));
        }
        return list;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Links selector based on jsoup. Use absolute url. <br>
 * Only {@link #selectList(Element)} is supported; the other selection methods throw
 * {@link UnsupportedOperationException}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.7.0
 */
public class LinksSelector extends BaseElementSelector {

    @Override
    public String select(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Collect the href of every {@code a} element below the given element.
     *
     * @param element element to search
     * @return one entry per anchor: the {@code abs:href} attribute when the anchor has a
     *         non-blank base URI, the raw {@code href} attribute otherwise
     */
    @Override
    public List<String> selectList(Element element) {
        Elements anchors = element.select("a");
        List<String> hrefs = new ArrayList<>(anchors.size());
        for (Element anchor : anchors) {
            String attributeKey = StringUtils.isNotBlank(anchor.baseUri()) ? "abs:href" : "href";
            hrefs.add(anchor.attr(attributeKey));
        }
        return hrefs;
    }

    @Override
    public Element selectElement(Element element) {
        throw new UnsupportedOperationException();
    }

    @Override
    public List<Element> selectElements(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Always true.
     */
    @Override
    public boolean hasAttribute() {
        return true;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Combines several selectors. <br>
 * {@link #select(String)} returns the first non-null result in selector order, <br>
 * {@link #selectList(String)} concatenates the results of all selectors.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.0
 */
public class OrSelector implements Selector {

    private List<Selector> selectors = new ArrayList<Selector>();

    /**
     * @param selectors selectors to combine, copied into an internal list
     */
    public OrSelector(Selector... selectors) {
        for (int i = 0; i < selectors.length; i++) {
            this.selectors.add(selectors[i]);
        }
    }

    /**
     * @param selectors selectors to combine; the list is used as is, not copied
     */
    public OrSelector(List<Selector> selectors) {
        this.selectors = selectors;
    }

    @Override
    public String select(String text) {
        for (Selector candidate : selectors) {
            String extracted = candidate.select(text);
            if (extracted == null) {
                continue;
            }
            return extracted;
        }
        return null;
    }

    @Override
    public List<String> selectList(String text) {
        List<String> merged = new ArrayList<String>();
        for (Selector candidate : selectors) {
            merged.addAll(candidate.selectList(text));
        }
        return merged;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
================================================
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
/**
 * Selectable plain text.<br>
 * Can not be selected by XPath or CSS Selector: those operations, as well as link
 * extraction, throw {@link UnsupportedOperationException}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class PlainText extends AbstractSelectable {

    protected List<String> sourceTexts;

    /**
     * @param sourceTexts texts held by this instance; the list is used as is, not copied
     */
    public PlainText(List<String> sourceTexts) {
        this.sourceTexts = sourceTexts;
    }

    /**
     * @param text the single text held by this instance
     */
    public PlainText(String text) {
        List<String> single = new ArrayList<String>();
        single.add(text);
        this.sourceTexts = single;
    }

    /**
     * Factory equivalent of {@link #PlainText(String)}.
     *
     * @param text text
     * @return new PlainText
     */
    public static PlainText create(String text) {
        return new PlainText(text);
    }

    @Override
    public Selectable xpath(String xpath) {
        throw new UnsupportedOperationException("XPath can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
    }

    @Override
    public Selectable $(String selector) {
        throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
    }

    @Override
    public Selectable $(String selector, String attrName) {
        throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
    }

    @Override
    public Selectable links() {
        throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
    }

    /**
     * @return one PlainText per source text, in order
     */
    @Override
    public List<Selectable> nodes() {
        List<String> texts = getSourceTexts();
        List<Selectable> result = new ArrayList<Selectable>(texts.size());
        for (String text : texts) {
            result.add(PlainText.create(text));
        }
        return result;
    }

    @Override
    protected List<String> getSourceTexts() {
        return sourceTexts;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
================================================
package us.codecraft.webmagic.selector;
/**
 * Holder for the groups captured by one regex match.<br>
 * For multi group result extension.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
class RegexResult {

    /** Result without groups; {@link #get(int)} returns null for every group id. */
    public static final RegexResult EMPTY_RESULT = new RegexResult();

    private final String[] groups;

    /**
     * Creates a result without groups.
     */
    public RegexResult() {
        this(null);
    }

    /**
     * @param groups captured groups, indexed by group id
     */
    public RegexResult(String[] groups) {
        this.groups = groups;
    }

    /**
     * @param groupId index into the captured groups
     * @return the group at that index, or null when this result holds no groups
     */
    public String get(int groupId) {
        return groups == null ? null : groups[groupId];
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
================================================
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Selector in regex.<br>
 * Patterns are compiled with {@link Pattern#DOTALL} and {@link Pattern#CASE_INSENSITIVE}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class RegexSelector implements Selector {

    private String regexStr;

    private Pattern regex;

    private int group = 1;

    /**
     * @param regexStr the regular expression.
     * @param group    index of the group returned by {@link #select(String)} and {@link #selectList(String)}
     */
    public RegexSelector(String regexStr, int group) {
        this.compileRegex(regexStr);
        this.group = group;
    }

    /**
     * Validate and compile the expression, storing both the pattern and its source string.
     *
     * @throws IllegalArgumentException when the expression is blank or is not a valid regex
     */
    private void compileRegex(String regexStr) {
        if (StringUtils.isBlank(regexStr)) {
            throw new IllegalArgumentException("regex must not be empty");
        }
        try {
            this.regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
            this.regexStr = regexStr;
        } catch (PatternSyntaxException e) {
            throw new IllegalArgumentException("invalid regex "+regexStr, e);
        }
    }

    /**
     * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1.
     * @param regexStr the regular expression.
     */
    public RegexSelector(String regexStr) {
        this.compileRegex(regexStr);
        boolean hasCaptureGroup = regex.matcher("").groupCount() != 0;
        this.group = hasCaptureGroup ? 1 : 0;
    }

    @Override
    public String select(String text) {
        RegexResult firstMatch = selectGroup(text);
        return firstMatch.get(group);
    }

    @Override
    public List<String> selectList(String text) {
        List<String> values = new ArrayList<String>();
        for (RegexResult match : selectGroupList(text)) {
            values.add(match.get(group));
        }
        return values;
    }

    /**
     * @param text text to search
     * @return all groups of the first match, or {@link RegexResult#EMPTY_RESULT} when nothing matches
     */
    public RegexResult selectGroup(String text) {
        Matcher matcher = regex.matcher(text);
        if (!matcher.find()) {
            return RegexResult.EMPTY_RESULT;
        }
        return new RegexResult(copyGroups(matcher));
    }

    /**
     * @param text text to search
     * @return all groups of every match, in match order; empty when nothing matches
     */
    public List<RegexResult> selectGroupList(String text) {
        List<RegexResult> matches = new ArrayList<RegexResult>();
        Matcher matcher = regex.matcher(text);
        while (matcher.find()) {
            matches.add(new RegexResult(copyGroups(matcher)));
        }
        return matches;
    }

    /**
     * Copy group 0 (the whole match) and every capture group of the current match into an array.
     */
    private static String[] copyGroups(Matcher matcher) {
        String[] captured = new String[matcher.groupCount() + 1];
        for (int index = 0; index < captured.length; index++) {
            captured[index] = matcher.group(index);
        }
        return captured;
    }

    @Override
    public String toString() {
        return regexStr;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Replace selector.<br>
 * Replaces every match of a regular expression in the text with a replacement string.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class ReplaceSelector implements Selector {

    private final String regexStr;

    private final String replacement;

    private final Pattern regex;

    /**
     * @param regexStr    regular expression to search for
     * @param replacement replacement passed to {@link Matcher#replaceAll(String)}
     * @throws IllegalArgumentException when the expression is not a valid regex
     */
    public ReplaceSelector(String regexStr, String replacement) {
        this.regexStr = regexStr;
        this.replacement = replacement;
        try {
            this.regex = Pattern.compile(regexStr);
        } catch (PatternSyntaxException e) {
            // include the offending pattern, as RegexSelector does, so the failure can be diagnosed
            throw new IllegalArgumentException("invalid regex " + regexStr, e);
        }
    }

    /**
     * @param text text to process
     * @return the text with every match replaced
     */
    @Override
    public String select(String text) {
        Matcher matcher = regex.matcher(text);
        return matcher.replaceAll(replacement);
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<String> selectList(String text) {
        throw new UnsupportedOperationException();
    }

    @Override
    public String toString() {
        return regexStr + "_" + replacement;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
================================================
package us.codecraft.webmagic.selector;
import java.util.List;
/**
 * Selectable text.<br>
 * Selection methods return a new Selectable so that extractions can be chained.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public interface Selectable {

    /**
     * Select a list with an XPath expression.
     *
     * @param xpath xpath expression
     * @return new Selectable after extract
     */
    Selectable xpath(String xpath);

    /**
     * Select a list with a CSS selector.
     *
     * @param selector css selector expression
     * @return new Selectable after extract
     */
    Selectable $(String selector);

    /**
     * Select a list with a CSS selector and an attribute name.
     *
     * @param selector css selector expression
     * @param attrName attribute name of css selector
     * @return new Selectable after extract
     */
    Selectable $(String selector, String attrName);

    /**
     * Select a list with a CSS selector.
     *
     * @param selector css selector expression
     * @return new Selectable after extract
     */
    Selectable css(String selector);

    /**
     * Select a list with a CSS selector and an attribute name.
     *
     * @param selector css selector expression
     * @param attrName attribute name of css selector
     * @return new Selectable after extract
     */
    Selectable css(String selector, String attrName);

    /**
     * Select all links.
     *
     * @return all links
     */
    Selectable links();

    /**
     * Select a list with a regex, default group is group 1.
     *
     * @param regex regex
     * @return new Selectable after extract
     */
    Selectable regex(String regex);

    /**
     * Select a list with a regex and an explicit group.
     *
     * @param regex regex
     * @param group group
     * @return new Selectable after extract
     */
    Selectable regex(String regex, int group);

    /**
     * Replace with a regex.
     *
     * @param regex       regex
     * @param replacement replacement
     * @return new Selectable after extract
     */
    Selectable replace(String regex, String replacement);

    /**
     * Single string result.
     *
     * @return single string result
     */
    String toString();

    /**
     * Single string result.
     *
     * @return single string result
     */
    String get();

    /**
     * Whether a result exists for the selection.
     *
     * @return true if result exist
     */
    boolean match();

    /**
     * Multi string result.
     *
     * @return multi string result
     */
    List<String> all();

    /**
     * Extract by a JSON Path expression.
     *
     * @param jsonPath jsonPath
     * @return result
     */
    Selectable jsonPath(String jsonPath);

    /**
     * Extract by a custom selector.
     *
     * @param selector selector
     * @return result
     */
    Selectable select(Selector selector);

    /**
     * Extract by a custom selector.
     *
     * @param selector selector
     * @return result
     */
    Selectable selectList(Selector selector);

    /**
     * Get all nodes.
     *
     * @return result
     */
    List<Selectable> nodes();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
================================================
package us.codecraft.webmagic.selector;
import java.util.List;
/**
 * Selector(extractor) for text.<br>
 * Implementations extract either a single result or all results from a text.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public interface Selector {

    /**
     * Extract a single result from the text.<br>
     * If there are more than one result, only the first will be chosen.
     *
     * @param text text
     * @return result
     */
    String select(String text);

    /**
     * Extract all results from the text.<br>
     *
     * @param text text
     * @return results
     */
    List<String> selectList(String text);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
================================================
package us.codecraft.webmagic.selector;
/**
 * Convenient methods for selectors.<br>
 * Every method is a static factory that creates a new selector instance.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public abstract class Selectors {

    /**
     * @param expr regular expression
     * @return new regex selector
     */
    public static RegexSelector regex(String expr) {
        return new RegexSelector(expr);
    }

    /**
     * @param expr  regular expression
     * @param group group to extract
     * @return new regex selector
     */
    public static RegexSelector regex(String expr, int group) {
        return new RegexSelector(expr, group);
    }

    /**
     * @return new smart content selector with its default threshold
     */
    public static SmartContentSelector smartContent() {
        return new SmartContentSelector();
    }

    /**
     * @param threshold threshold passed to the selector
     * @return new smart content selector
     */
    public static SmartContentSelector smartContent(int threshold) {
        return new SmartContentSelector(threshold);
    }

    /**
     * @param expr css selector expression
     * @return new css selector
     */
    public static CssSelector $(String expr) {
        return new CssSelector(expr);
    }

    /**
     * @param expr     css selector expression
     * @param attrName attribute name
     * @return new css selector
     */
    public static CssSelector $(String expr, String attrName) {
        return new CssSelector(expr, attrName);
    }

    /**
     * @param expr xpath expression
     * @return new xpath selector
     */
    public static XpathSelector xpath(String expr) {
        return new XpathSelector(expr);
    }

    /**
     * @see #xpath(String)
     * @param expr expr
     * @return new selector
     * @deprecated use {@link #xpath(String)}
     */
    @Deprecated
    public static XpathSelector xsoup(String expr) {
        return xpath(expr);
    }

    /**
     * @param selectors selectors to combine
     * @return new and-selector
     */
    public static AndSelector and(Selector... selectors) {
        return new AndSelector(selectors);
    }

    /**
     * @param selectors selectors to combine
     * @return new or-selector
     */
    public static OrSelector or(Selector... selectors) {
        return new OrSelector(selectors);
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
================================================
package us.codecraft.webmagic.selector;
import us.codecraft.webmagic.utils.Experimental;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Borrowed from https://code.google.com/p/cx-extractor/
*
* @author code4crafter@gmail.com <br>
* @since 0.4.1
*
*/
@Experimental
public class SmartContentSelector implements Selector {
private int threshold = 86;
public SmartContentSelector() {
}
public SmartContentSelector(int threshold) {
this.threshold = threshold;
}
@Override
public String select(String html) {
html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
html = html.replaceAll("(?is)<!--.*?-->", ""); // remove html comment
html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript
html = html.replaceAll("(?is)<style.*?>.*?</style>", ""); // remove css
html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char
html = html.replaceAll("(?is)<.*?>", "");
List<String> lines;
int blocksWidth =3;
int start;
int end;
StringBuilder text = new StringBuilder();
ArrayList<Integer> indexDistribution = new ArrayList<Integer>();
lines = Arrays.asList(html.split("\n"));
for (int i = 0; i < lines.size() - blocksWidth; i++) {
int wordsNum = 0;
for (int j = i; j < i + blocksWidth; j++) {
lines.set(j, lines.get(j).replaceAll("\\s+", ""));
wordsNum += lines.get(j).length();
}
indexDistribution.add(wordsNum);
}
start = -1; end = -1;
boolean boolstart = false, boolend = false;
text.setLength(0);
for (int i = 0; i < indexDistribution.size() - 1; i++) {
if (indexDistribution.get(i) > threshold && ! boolstart) {
if (indexDistribution.get(i+1).intValue() != 0
|| indexDistribution.get(i+2).intValue() != 0
|| indexDistribution.get(i+3).intValue() != 0) {
boolstart = true;
start = i;
continue;
}
}
if (boolstart) {
if (indexDistribution.get(i).intValue() == 0
|| indexDistribution.get(i+1).intValue() == 0) {
end = i;
boolend = true;
}
}
StringBuilder tmp = new StringBuilder();
if (boolend) {
//System.out.println(start+1 + "\t\t" + end+1);
for (int ii = start; ii <= end; ii++) {
if (lines.get(ii).length() < 5) continue;
tmp.append(lines.get(ii) + "\n");
}
String str = tmp.toString();
//System.out.println(str);
if (str.contains("Copyright") ) continue;
text.append(str);
boolstar
gitextract_m56n222u/
├── .gitignore
├── .travis.yml
├── LICENSE
├── README-zh.md
├── README.md
├── pom.xml
├── src/
│ └── site/
│ └── site.xml
├── webmagic-core/
│ ├── README.md
│ ├── module_webmagic-core.xml
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ └── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── Page.java
│ │ ├── Request.java
│ │ ├── ResultItems.java
│ │ ├── Site.java
│ │ ├── Spider.java
│ │ ├── SpiderListener.java
│ │ ├── SpiderScheduler.java
│ │ ├── Task.java
│ │ ├── downloader/
│ │ │ ├── AbstractDownloader.java
│ │ │ ├── CustomRedirectStrategy.java
│ │ │ ├── Downloader.java
│ │ │ ├── HttpClientDownloader.java
│ │ │ ├── HttpClientGenerator.java
│ │ │ ├── HttpClientRequestContext.java
│ │ │ ├── HttpUriRequestConverter.java
│ │ │ └── package.html
│ │ ├── model/
│ │ │ └── HttpRequestBody.java
│ │ ├── package.html
│ │ ├── pipeline/
│ │ │ ├── CollectorPipeline.java
│ │ │ ├── ConsolePipeline.java
│ │ │ ├── FilePipeline.java
│ │ │ ├── Pipeline.java
│ │ │ ├── ResultItemsCollectorPipeline.java
│ │ │ └── package.html
│ │ ├── processor/
│ │ │ ├── PageProcessor.java
│ │ │ ├── SimplePageProcessor.java
│ │ │ ├── example/
│ │ │ │ ├── BaiduBaikePageProcessor.java
│ │ │ │ ├── GithubRepoPageProcessor.java
│ │ │ │ └── ZhihuPageProcessor.java
│ │ │ └── package.html
│ │ ├── proxy/
│ │ │ ├── Proxy.java
│ │ │ ├── ProxyProvider.java
│ │ │ └── SimpleProxyProvider.java
│ │ ├── scheduler/
│ │ │ ├── DuplicateRemovedScheduler.java
│ │ │ ├── MonitorableScheduler.java
│ │ │ ├── PriorityScheduler.java
│ │ │ ├── QueueScheduler.java
│ │ │ ├── Scheduler.java
│ │ │ ├── component/
│ │ │ │ ├── DuplicateRemover.java
│ │ │ │ ├── HashSetDuplicateRemover.java
│ │ │ │ └── package.html
│ │ │ └── package.html
│ │ ├── selector/
│ │ │ ├── AbstractSelectable.java
│ │ │ ├── AndSelector.java
│ │ │ ├── BaseElementSelector.java
│ │ │ ├── CssSelector.java
│ │ │ ├── ElementSelector.java
│ │ │ ├── Html.java
│ │ │ ├── HtmlNode.java
│ │ │ ├── Json.java
│ │ │ ├── JsonPathSelector.java
│ │ │ ├── LinksSelector.java
│ │ │ ├── OrSelector.java
│ │ │ ├── PlainText.java
│ │ │ ├── RegexResult.java
│ │ │ ├── RegexSelector.java
│ │ │ ├── ReplaceSelector.java
│ │ │ ├── Selectable.java
│ │ │ ├── Selector.java
│ │ │ ├── Selectors.java
│ │ │ ├── SmartContentSelector.java
│ │ │ ├── XpathSelector.java
│ │ │ └── package.html
│ │ ├── thread/
│ │ │ └── CountableThreadPool.java
│ │ └── utils/
│ │ ├── BaseSelectorUtils.java
│ │ ├── CharsetUtils.java
│ │ ├── Experimental.java
│ │ ├── FilePersistentBase.java
│ │ ├── HttpClientUtils.java
│ │ ├── HttpConstant.java
│ │ ├── NumberUtils.java
│ │ ├── ProxyUtils.java
│ │ ├── UrlUtils.java
│ │ ├── WMCollections.java
│ │ └── package.html
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── HtmlTest.java
│ │ ├── RequestTest.java
│ │ ├── ResultItemsTest.java
│ │ ├── SiteTest.java
│ │ ├── SpiderTest.java
│ │ ├── downloader/
│ │ │ ├── HttpClientDownloaderTest.java
│ │ │ ├── HttpUriRequestConverterTest.java
│ │ │ ├── MockGithubDownloader.java
│ │ │ └── SSLCompatibilityTest.java
│ │ ├── example/
│ │ │ └── GithubRepoPageProcessorTest.java
│ │ ├── pipeline/
│ │ │ └── FilePipelineTest.java
│ │ ├── processor/
│ │ │ └── PageProcessorTest.java
│ │ ├── proxy/
│ │ │ ├── ProxyTest.java
│ │ │ └── SimpleProxyProviderTest.java
│ │ ├── scheduler/
│ │ │ ├── DuplicateRemovedSchedulerTest.java
│ │ │ └── PrioritySchedulerTest.java
│ │ ├── selector/
│ │ │ ├── AndSelectorTest.java
│ │ │ ├── CssSelectorTest.java
│ │ │ ├── ExtractorsTest.java
│ │ │ ├── JsonPathSelectorTest.java
│ │ │ ├── JsonTest.java
│ │ │ ├── LinksSelectorTest.java
│ │ │ ├── OrSelectorTest.java
│ │ │ ├── RegexSelectorTest.java
│ │ │ └── SelectorTest.java
│ │ └── utils/
│ │ ├── CharsetUtilsTest.java
│ │ ├── NumberUtilsTest.java
│ │ └── UrlUtilsTest.java
│ └── resources/
│ ├── html/
│ │ └── mock-github.html
│ └── log4j2-test.xml
├── webmagic-coverage/
│ └── pom.xml
├── webmagic-extension/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ ├── MultiPageModel.java
│ │ │ ├── SimpleHttpClient.java
│ │ │ ├── configurable/
│ │ │ │ ├── ConfigurablePageProcessor.java
│ │ │ │ ├── ExpressionType.java
│ │ │ │ └── ExtractRule.java
│ │ │ ├── downloader/
│ │ │ │ └── PhantomJSDownloader.java
│ │ │ ├── example/
│ │ │ │ ├── AppStore.java
│ │ │ │ ├── BaiduBaike.java
│ │ │ │ ├── GithubRepo.java
│ │ │ │ ├── GithubRepoApi.java
│ │ │ │ ├── GithubRepoPageMapper.java
│ │ │ │ ├── MonitorExample.java
│ │ │ │ ├── OschinaBlog.java
│ │ │ │ └── PatternProcessorExample.java
│ │ │ ├── handler/
│ │ │ │ ├── CompositePageProcessor.java
│ │ │ │ ├── CompositePipeline.java
│ │ │ │ ├── PatternProcessor.java
│ │ │ │ ├── PatternRequestMatcher.java
│ │ │ │ ├── RequestMatcher.java
│ │ │ │ ├── SubPageProcessor.java
│ │ │ │ └── SubPipeline.java
│ │ │ ├── model/
│ │ │ │ ├── AfterExtractor.java
│ │ │ │ ├── ConsolePageModelPipeline.java
│ │ │ │ ├── Extractor.java
│ │ │ │ ├── FieldExtractor.java
│ │ │ │ ├── HasKey.java
│ │ │ │ ├── ModelPageProcessor.java
│ │ │ │ ├── ModelPipeline.java
│ │ │ │ ├── OOSpider.java
│ │ │ │ ├── PageMapper.java
│ │ │ │ ├── PageModelCollectorPipeline.java
│ │ │ │ ├── PageModelExtractor.java
│ │ │ │ ├── annotation/
│ │ │ │ │ ├── ComboExtract.java
│ │ │ │ │ ├── ExtractBy.java
│ │ │ │ │ ├── ExtractByUrl.java
│ │ │ │ │ ├── Formatter.java
│ │ │ │ │ ├── HelpUrl.java
│ │ │ │ │ ├── TargetUrl.java
│ │ │ │ │ └── package.html
│ │ │ │ ├── fields/
│ │ │ │ │ ├── MultipleField.java
│ │ │ │ │ ├── PageField.java
│ │ │ │ │ └── SingleField.java
│ │ │ │ ├── formatter/
│ │ │ │ │ ├── BasicClassDetector.java
│ │ │ │ │ ├── BasicTypeFormatter.java
│ │ │ │ │ ├── DateFormatter.java
│ │ │ │ │ ├── ObjectFormatter.java
│ │ │ │ │ ├── ObjectFormatterBuilder.java
│ │ │ │ │ └── ObjectFormatters.java
│ │ │ │ ├── package.html
│ │ │ │ └── sources/
│ │ │ │ ├── Source.java
│ │ │ │ └── SourceTextExtractor.java
│ │ │ ├── monitor/
│ │ │ │ ├── SpiderMonitor.java
│ │ │ │ ├── SpiderStatus.java
│ │ │ │ └── SpiderStatusMXBean.java
│ │ │ ├── pipeline/
│ │ │ │ ├── CollectorPageModelPipeline.java
│ │ │ │ ├── FilePageModelPipeline.java
│ │ │ │ ├── JsonFilePageModelPipeline.java
│ │ │ │ ├── JsonFilePipeline.java
│ │ │ │ ├── MultiPagePipeline.java
│ │ │ │ └── PageModelPipeline.java
│ │ │ ├── scheduler/
│ │ │ │ ├── BloomFilterDuplicateRemover.java
│ │ │ │ ├── FileCacheQueueScheduler.java
│ │ │ │ ├── RedisPriorityScheduler.java
│ │ │ │ └── RedisScheduler.java
│ │ │ └── utils/
│ │ │ ├── ClassUtils.java
│ │ │ ├── DoubleKeyMap.java
│ │ │ ├── ExtractorUtils.java
│ │ │ ├── IPUtils.java
│ │ │ ├── MultiKeyMapBase.java
│ │ │ └── RequestUtils.java
│ │ └── resources/
│ │ ├── crawl.js
│ │ └── spider-config-draft.xml
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ ├── MockPageModelPipeline.java
│ │ ├── MockPipeline.java
│ │ ├── SimpleHttpClientTest.java
│ │ ├── configurable/
│ │ │ └── ConfigurablePageProcessorTest.java
│ │ ├── downloader/
│ │ │ └── MockGithubDownloader.java
│ │ ├── formatter/
│ │ │ └── DateFormatterTest.java
│ │ ├── model/
│ │ │ ├── BaseRepo.java
│ │ │ ├── GithubRepo.java
│ │ │ ├── GithubRepoApi.java
│ │ │ ├── GithubRepoTest.java
│ │ │ ├── ModelPageProcessorTest.java
│ │ │ ├── PageMapperTest.java
│ │ │ ├── PageMocker.java
│ │ │ └── PageModelExtractorTest.java
│ │ ├── monitor/
│ │ │ ├── CustomSpiderStatus.java
│ │ │ ├── CustomSpiderStatusMXBean.java
│ │ │ ├── SeedUrlWithPortTest.java
│ │ │ └── SpiderMonitorTest.java
│ │ ├── processor/
│ │ │ └── GithubRepoProcessor.java
│ │ ├── scheduler/
│ │ │ ├── BloomFilterDuplicateRemoverTest.java
│ │ │ ├── RedisPrioritySchedulerTest.java
│ │ │ └── RedisSchedulerTest.java
│ │ └── utils/
│ │ ├── IPUtilsTest.java
│ │ └── RequestUtilsTest.java
│ └── resources/
│ ├── html/
│ │ ├── mock-github.html
│ │ └── mock-webmagic.html
│ ├── json/
│ │ └── mock-githubrepo.json
│ └── log4j2-test.xml
├── webmagic-samples/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ ├── main/
│ │ │ │ └── QuickStarter.java
│ │ │ ├── model/
│ │ │ │ └── samples/
│ │ │ │ ├── BaiduNews.java
│ │ │ │ ├── Blog.java
│ │ │ │ ├── DianpingFtlDataScanner.java
│ │ │ │ ├── GithubRepo.java
│ │ │ │ ├── IteyeBlog.java
│ │ │ │ ├── JokejiModel.java
│ │ │ │ ├── Kr36NewsModel.java
│ │ │ │ ├── News163.java
│ │ │ │ ├── OschinaAnswer.java
│ │ │ │ ├── OschinaBlog.java
│ │ │ │ └── QQMeishi.java
│ │ │ ├── recover/
│ │ │ │ ├── DuplicateStorageRemover.java
│ │ │ │ ├── MmapQueueScheduler.java
│ │ │ │ └── RecoverSample.java
│ │ │ └── samples/
│ │ │ ├── AlexanderMcqueenGoodsProcessor.java
│ │ │ ├── AmanzonPageProcessor.java
│ │ │ ├── AngularJSProcessor.java
│ │ │ ├── DiandianBlogProcessor.java
│ │ │ ├── DiaoyuwengProcessor.java
│ │ │ ├── F58PageProcesser.java
│ │ │ ├── GithubRepo.java
│ │ │ ├── GithubRepoPageProcessor.java
│ │ │ ├── HuxiuProcessor.java
│ │ │ ├── InfoQMiniBookProcessor.java
│ │ │ ├── IteyeBlogProcessor.java
│ │ │ ├── KaichibaProcessor.java
│ │ │ ├── MamacnPageProcessor.java
│ │ │ ├── MeicanProcessor.java
│ │ │ ├── NjuBBSProcessor.java
│ │ │ ├── PhantomJSPageProcessor.java
│ │ │ ├── QzoneBlogProcessor.java
│ │ │ ├── SinaBlogProcessor.java
│ │ │ ├── TianyaPageProcesser.java
│ │ │ ├── ZhihuPageProcessor.java
│ │ │ ├── formatter/
│ │ │ │ └── StringTemplateFormatter.java
│ │ │ ├── pipeline/
│ │ │ │ ├── OneFilePipeline.java
│ │ │ │ └── ReplacePipeline.java
│ │ │ └── scheduler/
│ │ │ ├── DelayQueueScheduler.java
│ │ │ ├── LevelLimitScheduler.java
│ │ │ └── ZipCodePageProcessor.java
│ │ └── resources/
│ │ ├── crawl.js
│ │ └── log4j2.xml
│ └── test/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ ├── SpiderTest.java
│ ├── model/
│ │ └── ProcessorBenchmark.java
│ ├── processor/
│ │ └── SinablogProcessorTest.java
│ └── samples/
│ └── scheduler/
│ └── DelayQueueSchedulerTest.java
├── webmagic-saxon/
│ ├── README.md
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ └── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ └── selector/
│ │ ├── JaxpSelectorUtils.java
│ │ ├── NodeSelector.java
│ │ └── Xpath2Selector.java
│ └── test/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ └── selector/
│ └── XpathSelectorTest.java
├── webmagic-scripts/
│ ├── README.md
│ ├── deploy.sh
│ ├── pom.xml
│ └── src/
│ ├── main/
│ │ ├── groovy/
│ │ │ └── Github.groovy
│ │ ├── java/
│ │ │ └── us/
│ │ │ └── codecraft/
│ │ │ └── webmagic/
│ │ │ └── scripts/
│ │ │ ├── Params.java
│ │ │ ├── ScriptConsole.java
│ │ │ ├── ScriptEnginePool.java
│ │ │ ├── ScriptProcessor.java
│ │ │ ├── ScriptProcessorBuilder.java
│ │ │ ├── config/
│ │ │ │ ├── CommandLineOption.java
│ │ │ │ └── ConfigLogger.java
│ │ │ └── languages/
│ │ │ ├── JRuby.java
│ │ │ ├── Javascript.java
│ │ │ ├── Jython.java
│ │ │ └── Language.java
│ │ ├── kotlin/
│ │ │ └── Github.kt
│ │ └── resources/
│ │ ├── js/
│ │ │ ├── defines.js
│ │ │ ├── github.js
│ │ │ └── oschina.js
│ │ ├── python/
│ │ │ ├── defines.py
│ │ │ └── oschina.py
│ │ └── ruby/
│ │ ├── defines.rb
│ │ ├── github.rb
│ │ └── oschina.rb
│ └── test/
│ ├── java/
│ │ └── us/
│ │ └── codecraft/
│ │ └── webmagic/
│ │ └── scripts/
│ │ └── ScriptProcessorTest.java
│ └── resources/
│ └── log4j2-test.xml
└── webmagic-selenium/
├── README.md
├── config.ini
├── pom.xml
└── src/
├── main/
│ └── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ └── downloader/
│ └── selenium/
│ ├── SeleniumDownloader.java
│ └── WebDriverPool.java
└── test/
├── java/
│ └── us/
│ └── codecraft/
│ └── webmagic/
│ ├── downloader/
│ │ ├── SeleniumTest.java
│ │ └── selenium/
│ │ ├── SeleniumDownloaderTest.java
│ │ └── WebDriverPoolTest.java
│ └── samples/
│ ├── GooglePlayProcessor.java
│ └── HuabanProcessor.java
└── resources/
└── config.ini
SYMBOL INDEX (1468 symbols across 253 files)
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
class Page (line 29) | public class Page {
method ofSuccess (line 62) | public static Page ofSuccess(Request request) {
method ofFailure (line 73) | public static Page ofFailure(Request request) {
method Page (line 77) | public Page() {
method Page (line 88) | private Page(Request request, boolean downloadSuccess) {
method fail (line 99) | @Deprecated
method fail (line 113) | @Deprecated(since = "1.0.2", forRemoval = true)
method setSkip (line 121) | public Page setSkip(boolean skip) {
method putField (line 133) | public void putField(String key, Object field) {
method getHtml (line 142) | public Html getHtml() {
method getJson (line 155) | public Json getJson() {
method setHtml (line 167) | @Deprecated
method getTargetRequests (line 172) | public List<Request> getTargetRequests() {
method addTargetRequests (line 181) | public void addTargetRequests(Iterable<String> requests) {
method addTargetRequests (line 191) | public void addTargetRequests(Iterable<String> requests, long priority) {
method addRequestIfValid (line 207) | private void addRequestIfValid(String url, long priority) {
method addTargetRequest (line 225) | public void addTargetRequest(String requestString) {
method addTargetRequest (line 238) | public void addTargetRequest(Request request) {
method getUrl (line 247) | public Selectable getUrl() {
method setUrl (line 251) | public void setUrl(Selectable url) {
method getRequest (line 260) | public Request getRequest() {
method setRequest (line 264) | public void setRequest(Request request) {
method getResultItems (line 269) | public ResultItems getResultItems() {
method getStatusCode (line 273) | public int getStatusCode() {
method setStatusCode (line 277) | public void setStatusCode(int statusCode) {
method getRawText (line 281) | public String getRawText() {
method setRawText (line 285) | public Page setRawText(String rawText) {
method getHeaders (line 290) | public Map<String, List<String>> getHeaders() {
method setHeaders (line 294) | public void setHeaders(Map<String, List<String>> headers) {
method isDownloadSuccess (line 298) | public boolean isDownloadSuccess() {
method setDownloadSuccess (line 302) | public void setDownloadSuccess(boolean downloadSuccess) {
method getBytes (line 306) | public byte[] getBytes() {
method setBytes (line 310) | public void setBytes(byte[] bytes) {
method getCharset (line 314) | public String getCharset() {
method setCharset (line 318) | public void setCharset(String charset) {
method toString (line 322) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
class Request (line 19) | public class Request implements Serializable {
method Request (line 63) | public Request() {
method Request (line 66) | public Request(String url) {
method getPriority (line 70) | public long getPriority() {
method setPriority (line 82) | @Experimental
method getExtra (line 88) | @SuppressWarnings("unchecked")
method putExtra (line 96) | public <T> Request putExtra(String key, T value) {
method getUrl (line 101) | public String getUrl() {
method getExtras (line 105) | public Map<String, Object> getExtras() {
method setExtras (line 109) | public Request setExtras(Map<String, Object> extras) {
method setUrl (line 114) | public Request setUrl(String url) {
method getMethod (line 125) | public String getMethod() {
method setMethod (line 129) | public Request setMethod(String method) {
method hashCode (line 134) | @Override
method equals (line 141) | @Override
method addCookie (line 152) | public Request addCookie(String name, String value) {
method addHeader (line 157) | public Request addHeader(String name, String value) {
method getCookies (line 162) | public Map<String, String> getCookies() {
method getHeaders (line 166) | public Map<String, String> getHeaders() {
method getRequestBody (line 170) | public HttpRequestBody getRequestBody() {
method setRequestBody (line 174) | public void setRequestBody(HttpRequestBody requestBody) {
method isBinaryContent (line 178) | public boolean isBinaryContent() {
method getDownloader (line 182) | public Downloader getDownloader() {
method setDownloader (line 186) | public void setDownloader(Downloader downloader) {
method setBinaryContent (line 190) | public Request setBinaryContent(boolean binaryContent) {
method getCharset (line 195) | public String getCharset() {
method setCharset (line 199) | public Request setCharset(String charset) {
method toString (line 204) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
class ResultItems (line 15) | public class ResultItems {
method get (line 23) | @SuppressWarnings("unchecked")
method getAll (line 32) | public Map<String, Object> getAll() {
method put (line 36) | public <T> ResultItems put(String key, T value) {
method getRequest (line 41) | public Request getRequest() {
method setRequest (line 45) | public ResultItems setRequest(Request request) {
method isSkip (line 56) | public boolean isSkip() {
method setSkip (line 68) | public ResultItems setSkip(boolean skip) {
method toString (line 73) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
class Site (line 19) | public class Site {
method me (line 62) | public static Site me() {
method addCookie (line 73) | public Site addCookie(String name, String value) {
method addCookie (line 86) | public Site addCookie(String domain, String name, String value) {
method setUserAgent (line 100) | public Site setUserAgent(String userAgent) {
method getCookies (line 110) | public Map<String, String> getCookies() {
method getAllCookies (line 119) | public Map<String,Map<String, String>> getAllCookies() {
method getUserAgent (line 128) | public String getUserAgent() {
method getDomain (line 137) | public String getDomain() {
method setDomain (line 147) | public Site setDomain(String domain) {
method setCharset (line 159) | public Site setCharset(String charset) {
method getCharset (line 169) | public String getCharset() {
method setDefaultCharset (line 182) | public Site setDefaultCharset(String defaultCharset) {
method getDefaultCharset (line 193) | public String getDefaultCharset() {
method getTimeOut (line 197) | public int getTimeOut() {
method setTimeOut (line 207) | public Site setTimeOut(int timeOut) {
method setAcceptStatCode (line 221) | public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
method getAcceptStatCode (line 231) | public Set<Integer> getAcceptStatCode() {
method setSleepTime (line 242) | public Site setSleepTime(int sleepTime) {
method getSleepTime (line 253) | public int getSleepTime() {
method getRetryTimes (line 262) | public int getRetryTimes() {
method getHeaders (line 266) | public Map<String, String> getHeaders() {
method addHeader (line 278) | public Site addHeader(String key, String value) {
method setRetryTimes (line 289) | public Site setRetryTimes(int retryTimes) {
method getCycleRetryTimes (line 299) | public int getCycleRetryTimes() {
method setCycleRetryTimes (line 309) | public Site setCycleRetryTimes(int cycleRetryTimes) {
method isUseGzip (line 314) | public boolean isUseGzip() {
method getRetrySleepTime (line 318) | public int getRetrySleepTime() {
method setRetrySleepTime (line 328) | public Site setRetrySleepTime(int retrySleepTime) {
method setUseGzip (line 340) | public Site setUseGzip(boolean useGzip) {
method isDisableCookieManagement (line 345) | public boolean isDisableCookieManagement() {
method setDisableCookieManagement (line 356) | public Site setDisableCookieManagement(boolean disableCookieManagement) {
method toTask (line 361) | public Task toTask() {
method equals (line 379) | @Override
method hashCode (line 402) | @Override
method toString (line 417) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
class Spider (line 62) | public class Spider implements Runnable, Task {
method create (line 115) | public static Spider create(PageProcessor pageProcessor) {
method Spider (line 124) | public Spider(PageProcessor pageProcessor) {
method startUrls (line 137) | public Spider startUrls(List<String> startUrls) {
method startRequest (line 150) | public Spider startRequest(List<Request> startRequests) {
method setUUID (line 163) | public Spider setUUID(String uuid) {
method scheduler (line 175) | @Deprecated
method setScheduler (line 188) | public Spider setScheduler(Scheduler updateScheduler) {
method pipeline (line 209) | @Deprecated
method addPipeline (line 222) | public Spider addPipeline(Pipeline pipeline) {
method setPipelines (line 236) | public Spider setPipelines(List<Pipeline> pipelines) {
method clearPipeline (line 247) | public Spider clearPipeline() {
method downloader (line 260) | @Deprecated
method setDownloader (line 272) | public Spider setDownloader(Downloader downloader) {
method initComponent (line 278) | protected void initComponent() {
method run (line 302) | @Override
method onError (line 366) | @Deprecated
method onError (line 370) | protected void onError(Request request, Exception e) {
method onSuccess (line 380) | protected void onSuccess(Request request) {
method checkRunningStat (line 388) | private void checkRunningStat() {
method close (line 400) | public void close() {
method destroyEach (line 410) | private void destroyEach(Object object) {
method test (line 425) | public void test(String... urls) {
method processRequest (line 434) | private void processRequest(Request request) {
method onDownloadSuccess (line 448) | private void onDownloadSuccess(Request request, Page page) {
method onDownloaderFail (line 463) | private void onDownloaderFail(Request request) {
method doCycleRetry (line 472) | private void doCycleRetry(Request request) {
method sleep (line 486) | protected void sleep(int time) {
method extractAndAddRequests (line 495) | protected void extractAndAddRequests(Page page, boolean spawnUrl) {
method addRequest (line 503) | private void addRequest(Request request) {
method checkIfRunning (line 510) | protected void checkIfRunning() {
method runAsync (line 516) | public void runAsync() {
method addUrl (line 528) | public Spider addUrl(String... urls) {
method getAll (line 543) | public <T> List<T> getAll(Collection<String> urls) {
method getCollectorPipeline (line 560) | protected CollectorPipeline getCollectorPipeline() {
method get (line 564) | public <T> T get(String url) {
method addRequest (line 580) | public Spider addRequest(Request... requests) {
method start (line 588) | public void start() {
method stop (line 592) | public void stop() {
method stopWhenComplete (line 603) | public void stopWhenComplete(){
method thread (line 613) | public Spider thread(int threadNum) {
method thread (line 629) | public Spider thread(ExecutorService executorService, int threadNum) {
method isExitWhenComplete (line 639) | public boolean isExitWhenComplete() {
method setExitWhenComplete (line 651) | public Spider setExitWhenComplete(boolean exitWhenComplete) {
method isSpawnUrl (line 656) | public boolean isSpawnUrl() {
method getPageCount (line 666) | public long getPageCount() {
method getStatus (line 677) | public Status getStatus() {
type Status (line 682) | public enum Status {
method Status (line 685) | private Status(int value) {
method getValue (line 691) | int getValue() {
method fromValue (line 695) | public static Status fromValue(int value) {
method getThreadAlive (line 712) | public int getThreadAlive() {
method setSpawnUrl (line 728) | public Spider setSpawnUrl(boolean spawnUrl) {
method getUUID (line 733) | @Override
method setExecutorService (line 745) | public Spider setExecutorService(ExecutorService executorService) {
method getSite (line 751) | @Override
method getSpiderListeners (line 756) | public List<SpiderListener> getSpiderListeners() {
method setSpiderListeners (line 760) | public Spider setSpiderListeners(List<SpiderListener> spiderListeners) {
method getStartTime (line 765) | public Date getStartTime() {
method getScheduler (line 769) | public Scheduler getScheduler() {
method setEmptySleepTime (line 779) | public Spider setEmptySleepTime(long emptySleepTime) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java
type SpiderListener (line 9) | public interface SpiderListener {
method onSuccess (line 11) | void onSuccess(Request request);
method onError (line 16) | @Deprecated
method onError (line 20) | default void onError(Request request, Exception e) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java
class SpiderScheduler (line 10) | public class SpiderScheduler {
method SpiderScheduler (line 15) | public SpiderScheduler(Scheduler scheduler) {
method getScheduler (line 19) | public Scheduler getScheduler() {
method setScheduler (line 23) | public void setScheduler(Scheduler scheduler) {
method poll (line 27) | public Request poll(Spider spider) {
method push (line 31) | public void push(Request request, Spider spider) {
method waitNewUrl (line 35) | public boolean waitNewUrl(CountableThreadPool threadPool, long emptySl...
method signalNewUrl (line 50) | public void signalNewUrl() {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Task.java
type Task (line 11) | public interface Task {
method getUUID (line 18) | public String getUUID();
method getSite (line 25) | public Site getSite();
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java
class AbstractDownloader (line 15) | public abstract class AbstractDownloader implements Downloader {
method download (line 23) | public Html download(String url) {
method download (line 34) | public Html download(String url, String charset) {
method onSuccess (line 43) | @Deprecated
method onSuccess (line 53) | @Deprecated
method onSuccess (line 63) | protected void onSuccess(Page page, Task task) {
method onError (line 71) | @Deprecated
method onError (line 82) | @Deprecated
method onError (line 93) | protected void onError(Page page, Task task, Throwable e) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
class CustomRedirectStrategy (line 23) | public class CustomRedirectStrategy extends LaxRedirectStrategy {
method getRedirect (line 26) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java
type Downloader (line 15) | public interface Downloader {
method download (line 24) | public Page download(Request request, Task task);
method setThread (line 30) | public void setThread(int threadNum);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
class HttpClientDownloader (line 32) | public class HttpClientDownloader extends AbstractDownloader {
method setHttpUriRequestConverter (line 44) | public void setHttpUriRequestConverter(HttpUriRequestConverter httpUri...
method setProxyProvider (line 48) | public void setProxyProvider(ProxyProvider proxyProvider) {
method getHttpClient (line 52) | private CloseableHttpClient getHttpClient(Site site) {
method download (line 70) | @Override
method setThread (line 100) | @Override
method handleResponse (line 105) | protected Page handleResponse(Request request, String charset, HttpRes...
method getHtmlCharset (line 127) | private String getHtmlCharset(String contentType, byte[] contentBytes,...
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
class HttpClientGenerator (line 37) | public class HttpClientGenerator {
method HttpClientGenerator (line 43) | public HttpClientGenerator() {
method buildSSLConnectionSocketFactory (line 52) | private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
method createIgnoreVerifySSL (line 72) | private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmExcep...
method setPoolSize (line 96) | public HttpClientGenerator setPoolSize(int poolSize) {
method getClient (line 101) | public CloseableHttpClient getClient(Site site) {
method generateClient (line 105) | private CloseableHttpClient generateClient(Site site) {
method generateCookie (line 140) | private void generateCookie(HttpClientBuilder httpClientBuilder, Site ...
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java
class HttpClientRequestContext (line 12) | public class HttpClientRequestContext {
method getHttpUriRequest (line 18) | public HttpUriRequest getHttpUriRequest() {
method setHttpUriRequest (line 22) | public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
method getHttpClientContext (line 26) | public HttpClientContext getHttpClientContext() {
method setHttpClientContext (line 30) | public void setHttpClientContext(HttpClientContext httpClientContext) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
class HttpUriRequestConverter (line 32) | public class HttpUriRequestConverter {
method convert (line 34) | public HttpClientRequestContext convert(Request request, Site site, Pr...
method convertHttpClientContext (line 41) | private HttpClientContext convertHttpClientContext(Request request, Si...
method convertHttpUriRequest (line 62) | private HttpUriRequest convertHttpUriRequest(Request request, Site sit...
method selectRequestMethod (line 91) | private RequestBuilder selectRequestMethod(Request request) {
method addFormParams (line 110) | private RequestBuilder addFormParams(RequestBuilder requestBuilder, Re...
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
class HttpRequestBody (line 17) | public class HttpRequestBody implements Serializable {
class ContentType (line 21) | public static abstract class ContentType {
method HttpRequestBody (line 38) | public HttpRequestBody() {
method HttpRequestBody (line 41) | public HttpRequestBody(byte[] body, String contentType, String encodin...
method getContentType (line 47) | public String getContentType() {
method getEncoding (line 51) | public String getEncoding() {
method setBody (line 55) | public void setBody(byte[] body) {
method setContentType (line 59) | public void setContentType(String contentType) {
method setEncoding (line 63) | public void setEncoding(String encoding) {
method json (line 67) | public static HttpRequestBody json(String json, String encoding) {
method xml (line 75) | public static HttpRequestBody xml(String xml, String encoding) {
method custom (line 83) | public static HttpRequestBody custom(byte[] body, String contentType, ...
method form (line 87) | public static HttpRequestBody form(Map<String,Object> params, String e...
method getBody (line 99) | public byte[] getBody() {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
type CollectorPipeline (line 12) | public interface CollectorPipeline<T> extends Pipeline {
method getCollected (line 19) | public List<T> getCollected();
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
class ConsolePipeline (line 15) | public class ConsolePipeline implements Pipeline {
method process (line 17) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
class FilePipeline (line 22) | public class FilePipeline extends FilePersistentBase implements Pipeline {
method FilePipeline (line 29) | public FilePipeline() {
method FilePipeline (line 33) | public FilePipeline(String path) {
method process (line 37) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java
type Pipeline (line 15) | public interface Pipeline {
method process (line 23) | public void process(ResultItems resultItems, Task task);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java
class ResultItemsCollectorPipeline (line 13) | public class ResultItemsCollectorPipeline implements CollectorPipeline<R...
method process (line 17) | @Override
method getCollected (line 22) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
type PageProcessor (line 23) | public interface PageProcessor {
method process (line 30) | void process(Page page);
method getSite (line 38) | default Site getSite() {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
class SimplePageProcessor (line 14) | public class SimplePageProcessor implements PageProcessor {
method SimplePageProcessor (line 20) | public SimplePageProcessor(String urlPattern) {
method process (line 27) | @Override
method getSite (line 39) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
class BaiduBaikePageProcessor (line 16) | public class BaiduBaikePageProcessor implements PageProcessor {
method process (line 21) | @Override
method getSite (line 27) | @Override
method main (line 32) | public static void main(String[] args) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
class GithubRepoPageProcessor (line 12) | public class GithubRepoPageProcessor implements PageProcessor {
method process (line 16) | @Override
method getSite (line 29) | @Override
method main (line 34) | public static void main(String[] args) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
class ZhihuPageProcessor (line 12) | public class ZhihuPageProcessor implements PageProcessor {
method process (line 16) | @Override
method getSite (line 28) | @Override
method main (line 33) | public static void main(String[] args) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
class Proxy (line 11) | public class Proxy {
method create (line 23) | public static Proxy create(final URI uri) {
method Proxy (line 38) | public Proxy(String host, int port) {
method Proxy (line 42) | public Proxy(String host, int port, String scheme) {
method Proxy (line 48) | public Proxy(String host, int port, String username, String password) {
method getScheme (line 55) | public String getScheme() {
method setScheme (line 59) | public void setScheme(String scheme) {
method getHost (line 63) | public String getHost() {
method getPort (line 67) | public int getPort() {
method getUsername (line 71) | public String getUsername() {
method getPassword (line 75) | public String getPassword() {
method toURI (line 79) | public URI toURI() {
method urlencode (line 97) | private String urlencode(String s) {
method equals (line 106) | @Override
method hashCode (line 120) | @Override
method toString (line 130) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java
type ProxyProvider (line 12) | public interface ProxyProvider {
method returnProxy (line 21) | void returnProxy(Proxy proxy, Page page, Task task);
method getProxy (line 29) | @Deprecated
method getProxy (line 42) | default Proxy getProxy(Request request, Task task) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java
class SimpleProxyProvider (line 19) | public class SimpleProxyProvider implements ProxyProvider {
method SimpleProxyProvider (line 25) | public SimpleProxyProvider(List<Proxy> proxies) {
method SimpleProxyProvider (line 29) | private SimpleProxyProvider(List<Proxy> proxies, AtomicInteger pointer) {
method from (line 34) | public static SimpleProxyProvider from(Proxy... proxies) {
method returnProxy (line 42) | @Override
method getProxy (line 47) | @Override
method incrForLoop (line 52) | private int incrForLoop() {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
class DuplicateRemovedScheduler (line 17) | public abstract class DuplicateRemovedScheduler implements Scheduler {
method getDuplicateRemover (line 23) | public DuplicateRemover getDuplicateRemover() {
method setDuplicateRemover (line 27) | public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover ...
method push (line 32) | @Override
method shouldReserved (line 41) | protected boolean shouldReserved(Request request) {
method noNeedToRemoveDuplicate (line 45) | protected boolean noNeedToRemoveDuplicate(Request request) {
method pushWhenNoDuplicate (line 49) | protected void pushWhenNoDuplicate(Request request, Task task) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java
type MonitorableScheduler (line 11) | public interface MonitorableScheduler extends Scheduler {
method getLeftRequestsCount (line 13) | public int getLeftRequestsCount(Task task);
method getTotalRequestsCount (line 15) | public int getTotalRequestsCount(Task task);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
class PriorityScheduler (line 18) | public class PriorityScheduler extends DuplicateRemovedScheduler impleme...
method compare (line 25) | @Override
method compare (line 32) | @Override
method pushWhenNoDuplicate (line 38) | @Override
method poll (line 49) | @Override
method getLeftRequestsCount (line 62) | @Override
method getTotalRequestsCount (line 67) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
class QueueScheduler (line 21) | public class QueueScheduler extends DuplicateRemovedScheduler implements...
method QueueScheduler (line 25) | public QueueScheduler() {
method QueueScheduler (line 36) | public QueueScheduler(int capacity) {
method pushWhenNoDuplicate (line 40) | @Override
method poll (line 51) | @Override
method getLeftRequestsCount (line 56) | @Override
method getTotalRequestsCount (line 61) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java
type Scheduler (line 15) | public interface Scheduler {
method push (line 23) | public void push(Request request, Task task);
method poll (line 31) | public Request poll(Task task);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
type DuplicateRemover (line 11) | public interface DuplicateRemover {
method isDuplicate (line 20) | public boolean isDuplicate(Request request, Task task);
method resetDuplicateCheck (line 26) | public void resetDuplicateCheck(Task task);
method getTotalRequestsCount (line 33) | public int getTotalRequestsCount(Task task);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
class HashSetDuplicateRemover (line 13) | public class HashSetDuplicateRemover implements DuplicateRemover {
method isDuplicate (line 17) | @Override
method getUrl (line 22) | protected String getUrl(Request request) {
method resetDuplicateCheck (line 26) | @Override
method getTotalRequestsCount (line 31) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java
class AbstractSelectable (line 13) | public abstract class AbstractSelectable implements Selectable {
method getSourceTexts (line 15) | protected abstract List<String> getSourceTexts();
method css (line 17) | @Override
method css (line 22) | @Override
method select (line 27) | protected Selectable select(Selector selector, List<String> strings) {
method selectList (line 38) | protected Selectable selectList(Selector selector, List<String> string...
method all (line 47) | @Override
method jsonPath (line 52) | @Override
method get (line 57) | @Override
method select (line 67) | @Override
method selectList (line 72) | @Override
method regex (line 77) | @Override
method regex (line 83) | @Override
method replace (line 89) | @Override
method getFirstSourceText (line 95) | public String getFirstSourceText() {
method toString (line 103) | @Override
method match (line 108) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
class AndSelector (line 12) | public class AndSelector implements Selector {
method AndSelector (line 16) | public AndSelector(Selector... selectors) {
method AndSelector (line 22) | public AndSelector(List<Selector> selectors) {
method select (line 26) | @Override
method selectList (line 37) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
class BaseElementSelector (line 15) | public abstract class BaseElementSelector implements Selector, ElementSe...
method parse (line 16) | private Document parse(String text) {
method select (line 23) | @Override
method selectList (line 31) | @Override
method selectElement (line 40) | public Element selectElement(String text) {
method selectElements (line 47) | public List<Element> selectElements(String text) {
method selectElement (line 55) | public abstract Element selectElement(Element element);
method selectElements (line 57) | public abstract List<Element> selectElements(Element element);
method hasAttribute (line 59) | public abstract boolean hasAttribute();
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
class CssSelector (line 18) | public class CssSelector extends BaseElementSelector {
method CssSelector (line 24) | public CssSelector(String selectorText) {
method CssSelector (line 28) | public CssSelector(String selectorText, String attrName) {
method getValue (line 33) | private String getValue(Element element) {
method getText (line 47) | protected String getText(Element element) {
method select (line 58) | @Override
method selectList (line 67) | @Override
method selectElement (line 82) | @Override
method selectElements (line 91) | @Override
method hasAttribute (line 96) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
type ElementSelector (line 13) | public interface ElementSelector {
method select (line 22) | public String select(Element element);
method selectList (line 30) | public List<String> selectList(Element element);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
class Html (line 18) | public class Html extends HtmlNode {
method Html (line 33) | public Html(String text, String url) {
method Html (line 42) | public Html(String text) {
method Html (line 51) | public Html(Document document) {
method getDocument (line 55) | public Document getDocument() {
method getElements (line 59) | @Override
method selectDocument (line 68) | public String selectDocument(Selector selector) {
method selectDocumentForList (line 77) | public List<String> selectDocumentForList(Selector selector) {
method create (line 86) | public static Html create(String text) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
class HtmlNode (line 13) | public class HtmlNode extends AbstractSelectable {
method HtmlNode (line 17) | public HtmlNode(List<Element> elements) {
method HtmlNode (line 21) | public HtmlNode() {
method getElements (line 25) | protected List<Element> getElements() {
method smartContent (line 29) | public Selectable smartContent() {
method smartContent (line 34) | public Selectable smartContent(int threshold) {
method links (line 39) | @Override
method xpath (line 44) | @Override
method selectList (line 50) | @Override
method select (line 58) | @Override
method selectElements (line 69) | protected Selectable selectElements(BaseElementSelector elementSelecto...
method checkElementAndConvert (line 99) | private Element checkElementAndConvert(ListIterator<Element> elementIt...
method $ (line 111) | @Override
method $ (line 117) | @Override
method nodes (line 123) | @Override
method getSourceTexts (line 134) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java
class Json (line 13) | public class Json extends PlainText {
method Json (line 15) | public Json(List<String> strings) {
method Json (line 19) | public Json(String text) {
method removePadding (line 28) | public Json removePadding(String padding) {
method toObject (line 38) | public <T> T toObject(Class<T> clazz) {
method toList (line 45) | public <T> List<T> toList(Class<T> clazz) {
method jsonPath (line 52) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java
class JsonPathSelector (line 17) | public class JsonPathSelector implements Selector {
method JsonPathSelector (line 23) | public JsonPathSelector(String jsonPathStr) {
method getJsonPathStr (line 28) | @SuppressWarnings("unused")
method select (line 33) | @Override
method toString (line 48) | private String toString(Object object) {
method selectList (line 56) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java
class LinksSelector (line 16) | public class LinksSelector extends BaseElementSelector {
method select (line 18) | @Override
method selectList (line 23) | @Override
method selectElement (line 37) | @Override
method selectElements (line 42) | @Override
method hasAttribute (line 47) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
class OrSelector (line 12) | public class OrSelector implements Selector {
method OrSelector (line 16) | public OrSelector(Selector... selectors) {
method OrSelector (line 22) | public OrSelector(List<Selector> selectors) {
method select (line 26) | @Override
method selectList (line 37) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java
class PlainText (line 13) | public class PlainText extends AbstractSelectable {
method PlainText (line 17) | public PlainText(List<String> sourceTexts) {
method PlainText (line 21) | public PlainText(String text) {
method create (line 26) | public static PlainText create(String text) {
method xpath (line 30) | @Override
method $ (line 35) | @Override
method $ (line 40) | @Override
method links (line 45) | @Override
method nodes (line 50) | @Override
method getSourceTexts (line 59) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
class RegexResult (line 10) | class RegexResult {
method RegexResult (line 16) | public RegexResult() {
method RegexResult (line 20) | public RegexResult(String[] groups) {
method get (line 24) | public String get(int groupId) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
class RegexSelector (line 17) | public class RegexSelector implements Selector {
method RegexSelector (line 25) | public RegexSelector(String regexStr, int group) {
method compileRegex (line 30) | private void compileRegex(String regexStr) {
method RegexSelector (line 46) | public RegexSelector(String regexStr) {
method select (line 55) | @Override
method selectList (line 60) | @Override
method selectGroup (line 70) | public RegexResult selectGroup(String text) {
method selectGroupList (line 82) | public List<RegexResult> selectGroupList(String text) {
method toString (line 95) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
class ReplaceSelector (line 14) | public class ReplaceSelector implements Selector {
method ReplaceSelector (line 22) | public ReplaceSelector(String regexStr, String replacement) {
method select (line 32) | @Override
method selectList (line 38) | @Override
method toString (line 43) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
type Selectable (line 11) | public interface Selectable {
method xpath (line 19) | public Selectable xpath(String xpath);
method $ (line 27) | public Selectable $(String selector);
method $ (line 36) | public Selectable $(String selector, String attrName);
method css (line 44) | public Selectable css(String selector);
method css (line 53) | public Selectable css(String selector, String attrName);
method links (line 59) | public Selectable links();
method regex (line 67) | public Selectable regex(String regex);
method regex (line 76) | public Selectable regex(String regex, int group);
method replace (line 85) | public Selectable replace(String regex, String replacement);
method toString (line 92) | public String toString();
method get (line 99) | public String get();
method match (line 106) | public boolean match();
method all (line 113) | public List<String> all();
method jsonPath (line 121) | public Selectable jsonPath(String jsonPath);
method select (line 129) | public Selectable select(Selector selector);
method selectList (line 137) | public Selectable selectList(Selector selector);
method nodes (line 143) | public List<Selectable> nodes();
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
type Selector (line 11) | public interface Selector {
method select (line 20) | public String select(String text);
method selectList (line 28) | public List<String> selectList(String text);
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
class Selectors (line 9) | public abstract class Selectors {
method regex (line 11) | public static RegexSelector regex(String expr) {
method regex (line 15) | public static RegexSelector regex(String expr, int group) {
method smartContent (line 19) | public static SmartContentSelector smartContent() {
method smartContent (line 23) | public static SmartContentSelector smartContent(int threshold) {
method $ (line 27) | public static CssSelector $(String expr) {
method $ (line 31) | public static CssSelector $(String expr, String attrName) {
method xpath (line 35) | public static XpathSelector xpath(String expr) {
method xsoup (line 44) | @Deprecated
method and (line 49) | public static AndSelector and(Selector... selectors) {
method or (line 53) | public static OrSelector or(Selector... selectors) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
class SmartContentSelector (line 16) | @Experimental
method SmartContentSelector (line 21) | public SmartContentSelector() {
method SmartContentSelector (line 24) | public SmartContentSelector(int threshold) {
method select (line 28) | @Override
method selectList (line 92) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
class XpathSelector (line 16) | public class XpathSelector extends BaseElementSelector {
method XpathSelector (line 20) | public XpathSelector(String xpathStr) {
method select (line 24) | @Override
method selectList (line 29) | @Override
method selectElement (line 34) | @Override
method selectElements (line 43) | @Override
method hasAttribute (line 48) | @Override
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/thread/CountableThreadPool.java
class CountableThreadPool (line 19) | public class CountableThreadPool {
method CountableThreadPool (line 29) | public CountableThreadPool(int threadNum) {
method CountableThreadPool (line 34) | public CountableThreadPool(int threadNum, ExecutorService executorServ...
method setExecutorService (line 39) | public void setExecutorService(ExecutorService executorService) {
method getThreadAlive (line 43) | public int getThreadAlive() {
method getThreadNum (line 47) | public int getThreadNum() {
method execute (line 53) | public void execute(final Runnable runnable) {
method isShutdown (line 88) | public boolean isShutdown() {
method shutdown (line 92) | public void shutdown() {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java
class BaseSelectorUtils (line 6) | public class BaseSelectorUtils {
method preParse (line 15) | public static String preParse(String text) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
class CharsetUtils (line 20) | public abstract class CharsetUtils {
method CharsetUtils (line 24) | private CharsetUtils() {
method detectCharset (line 28) | public static String detectCharset(String contentType, byte[] contentB...
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java
class FilePersistentBase (line 11) | public class FilePersistentBase {
method setPath (line 24) | public void setPath(String path) {
method getFile (line 31) | public File getFile(String fullName) {
method checkAndMakeParentDirecotry (line 36) | public void checkAndMakeParentDirecotry(String fullName) {
method getPath (line 47) | public String getPath() {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java
class HttpClientUtils (line 14) | public abstract class HttpClientUtils {
method convertHeaders (line 16) | public static Map<String,List<String>> convertHeaders(Header[] headers){
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java
class HttpConstant (line 8) | public abstract class HttpConstant {
class Method (line 10) | public static abstract class Method {
class StatusCode (line 28) | public static abstract class StatusCode {
class Header (line 34) | public static abstract class Header {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java
class NumberUtils (line 6) | public abstract class NumberUtils {
method compareLong (line 8) | public static int compareLong(long o1, long o2) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
class ProxyUtils (line 18) | public class ProxyUtils {
method validateProxy (line 22) | public static boolean validateProxy(Proxy p) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
class UrlUtils (line 21) | public class UrlUtils {
method canonicalizeUrl (line 32) | public static String canonicalizeUrl(String url, String refer) {
method encodeIllegalCharacterInUrl (line 58) | public static String encodeIllegalCharacterInUrl(String url) {
method fixIllegalCharacterInUrl (line 62) | public static String fixIllegalCharacterInUrl(String url) {
method getHost (line 67) | public static String getHost(String url) {
method removeProtocol (line 78) | public static String removeProtocol(String url) {
method getDomain (line 82) | public static String getDomain(String url) {
method removePort (line 91) | public static String removePort(String domain) {
method convertToRequests (line 100) | public static List<Request> convertToRequests(Collection<String> urls) {
method convertToUrls (line 108) | public static List<String> convertToUrls(Collection<Request> requests) {
method getCharset (line 118) | public static String getCharset(String contentType) {
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java
class WMCollections (line 13) | public class WMCollections {
method newHashSet (line 15) | public static <T> Set<T> newHashSet(T... t){
method newArrayList (line 23) | public static <T> List<T> newArrayList(T... t){
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java
class HtmlTest (line 15) | public class HtmlTest {
method testRegexSelector (line 17) | @Test
method testDisableJsoupHtmlEntityEscape (line 23) | @Ignore("not work in jsoup 1.8.x")
method testEnableJsoupHtmlEntityEscape (line 31) | @Test
method testAHrefExtract (line 37) | @Test
method testNthNodesGet (line 43) | @Test
method testGetHrefsByJsoup (line 51) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java
class RequestTest (line 16) | public class RequestTest {
method testEqualsAndHashCode (line 18) | @Test
method testSetExtras (line 30) | @Test
method testGetExtras (line 40) | @Test
method testGetExtrasShouldBeUnmodifiable (line 47) | @Test(expected = UnsupportedOperationException.class)
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java
class ResultItemsTest (line 11) | public class ResultItemsTest {
method testOrderOfEntries (line 13) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java
class SiteTest (line 13) | public class SiteTest {
method test (line 15) | @Test
method addCookieTest (line 21) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java
class SpiderTest (line 17) | public class SpiderTest {
method testStartAndStop (line 19) | @Ignore("long time")
method testWaitAndNotify (line 36) | @Ignore("long time")
method testRound (line 45) | private void testRound() {
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
class HttpClientDownloaderTest (line 49) | public class HttpClientDownloaderTest {
method testDownloader (line 53) | @Test
method testDownloaderInIllegalUrl (line 60) | @Test(expected = IllegalArgumentException.class)
method test_download_fail (line 66) | @Test
method testGetHtmlCharset (line 75) | @Test
method test_selectRequestMethod (line 126) | @Test
method test_set_request_cookie (line 169) | @Test
method test_disableCookieManagement (line 186) | @Test
method test_set_request_header (line 203) | @Test
method test_set_site_header (line 220) | @Test
method test_set_site_cookie (line 236) | @Test
method test_download_when_task_is_null (line 253) | @Test
method test_download_auth_by_SimpleProxyProvider (line 269) | @Test
method test_download_binary_content (line 286) | @Test
method test_download_set_charset (line 304) | @Test
method test_download_set_request_charset (line 320) | @Test
method test_no_task_download (line 337) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java
class HttpUriRequestConverterTest (line 17) | public class HttpUriRequestConverterTest {
method test_illegal_uri_correct (line 19) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
class MockGithubDownloader (line 17) | public class MockGithubDownloader implements Downloader {
method download (line 19) | @Override
method setThread (line 33) | @Override
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java
class SSLCompatibilityTest (line 16) | public class SSLCompatibilityTest {
method test_tls12 (line 18) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java
class GithubRepoPageProcessorTest (line 18) | public class GithubRepoPageProcessorTest {
method test_github (line 20) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java
class FilePipelineTest (line 15) | public class FilePipelineTest {
method before (line 20) | @BeforeClass
method testProcess (line 39) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java
class PageProcessorTest (line 10) | public class PageProcessorTest {
method testGetSite (line 12) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
class ProxyTest (line 18) | class ProxyTest {
method before (line 22) | @BeforeAll
class Fetch (line 32) | class Fetch extends Thread {
method Fetch (line 35) | public Fetch(HttpHost hp) {
method run (line 39) | @Override
method testCreate (line 50) | @Test
method testEqualsHashCode (line 88) | @Test
method testToString (line 96) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java
class SimpleProxyProviderTest (line 17) | public class SimpleProxyProviderTest {
method test_get_proxy (line 21) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java
class DuplicateRemovedSchedulerTest (line 21) | @RunWith(MockitoJUnitRunner.class)
method poll (line 25) | @Override
method test_no_duplicate_removed_for_post_request (line 31) | @Test
method test_duplicate_removed_for_get_request (line 41) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/PrioritySchedulerTest.java
class PrioritySchedulerTest (line 12) | public class PrioritySchedulerTest {
method getUUID (line 17) | @Override
method getSite (line 22) | @Override
method testDifferentPriority (line 28) | @Test
method testNoPriority (line 55) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java
class AndSelectorTest (line 10) | public class AndSelectorTest {
method testSelectList (line 12) | @Test
method testSelectList_NoResults (line 36) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java
class CssSelectorTest (line 16) | public class CssSelectorTest {
method testSelectElement (line 18) | @Test
method testSelectList (line 28) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java
class ExtractorsTest (line 11) | public class ExtractorsTest {
method testEach (line 17) | @Test
method testCombo (line 27) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java
class JsonPathSelectorTest (line 14) | public class JsonPathSelectorTest {
method testJsonPath (line 37) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java
class JsonTest (line 11) | public class JsonTest {
method testRemovePadding (line 17) | @Test
method testRemovePaddingForQuotes (line 23) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java
class LinksSelectorTest (line 13) | public class LinksSelectorTest {
method testLinks (line 17) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java
class OrSelectorTest (line 10) | public class OrSelectorTest {
method testSelectList (line 11) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
class RegexSelectorTest (line 9) | public class RegexSelectorTest {
method testRegexWithSingleLeftBracket (line 11) | @Test(expected = IllegalArgumentException.class)
method testRegexWithLeftBracketQuoted (line 17) | @Test
method testRegexWithZeroWidthAssertions (line 26) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java
class SelectorTest (line 12) | public class SelectorTest {
method testChain (line 16) | @Test
method testNodes (line 27) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java
class CharsetUtilsTest (line 9) | class CharsetUtilsTest {
method testDetectCharset (line 11) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java
class NumberUtilsTest (line 6) | public class NumberUtilsTest {
method testCompareLong (line 8) | @Test
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
class UrlUtilsTest (line 15) | public class UrlUtilsTest {
method testFixRelativeUrl (line 17) | @Test
method testGetDomain (line 38) | @Test
method testGetCharset (line 48) | @Test
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java
type MultiPageModel (line 13) | @Experimental
method getPageKey (line 21) | public String getPageKey();
method getPage (line 28) | public String getPage();
method getOtherPages (line 36) | public Collection<String> getOtherPages();
method combine (line 44) | public MultiPageModel combine(MultiPageModel multiPageModel);
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/SimpleHttpClient.java
class SimpleHttpClient (line 12) | public class SimpleHttpClient {
method SimpleHttpClient (line 18) | public SimpleHttpClient() {
method SimpleHttpClient (line 22) | public SimpleHttpClient(Site site) {
method setProxyProvider (line 27) | public void setProxyProvider(ProxyProvider proxyProvider){
method get (line 31) | public <T> T get(String url, Class<T> clazz) {
method get (line 35) | public <T> T get(Request request, Class<T> clazz) {
method get (line 43) | public Page get(String url) {
method get (line 47) | public Page get(Request request) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java
class ConfigurablePageProcessor (line 13) | @Experimental
method ConfigurablePageProcessor (line 20) | public ConfigurablePageProcessor(Site site, List<ExtractRule> extractR...
method process (line 25) | @Override
method getSite (line 46) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java
type ExpressionType (line 6) | public enum ExpressionType {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
class ExtractRule (line 11) | public class ExtractRule {
method getFieldName (line 27) | public String getFieldName() {
method setFieldName (line 31) | public void setFieldName(String fieldName) {
method getExpressionType (line 35) | public ExpressionType getExpressionType() {
method setExpressionType (line 39) | public void setExpressionType(ExpressionType expressionType) {
method getExpressionValue (line 43) | public String getExpressionValue() {
method setExpressionValue (line 47) | public void setExpressionValue(String expressionValue) {
method getExpressionParams (line 51) | public String[] getExpressionParams() {
method setExpressionParams (line 55) | public void setExpressionParams(String[] expressionParams) {
method isMulti (line 59) | public boolean isMulti() {
method setMulti (line 63) | public void setMulti(boolean multi) {
method getSelector (line 67) | public Selector getSelector() {
method compileSelector (line 78) | private Selector compileSelector() {
method setSelector (line 101) | public void setSelector(Selector selector) {
method isNotNull (line 105) | public boolean isNotNull() {
method setNotNull (line 109) | public void setNotNull(boolean notNull) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
class PhantomJSDownloader (line 19) | public class PhantomJSDownloader extends AbstractDownloader {
method PhantomJSDownloader (line 24) | public PhantomJSDownloader() {
method PhantomJSDownloader (line 38) | public PhantomJSDownloader(String phantomJsCommand) {
method PhantomJSDownloader (line 76) | public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
method initPhantomjsCrawlPath (line 81) | private void initPhantomjsCrawlPath() {
method download (line 86) | @Override
method setThread (line 110) | @Override
method getPage (line 115) | protected String getPage(Request request) throws Exception {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java
class AppStore (line 14) | @Experimental
method main (line 32) | public static void main(String[] args) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
class BaiduBaike (line 14) | public class BaiduBaike{
method toString (line 22) | @Override
method main (line 30) | public static void main(String[] args) {
method getName (line 50) | public String getName() {
method getDescription (line 54) | public String getDescription() {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java
class GithubRepo (line 18) | @TargetUrl("https://github.com/\\w+/\\w+")
method main (line 43) | public static void main(String[] args) {
method key (line 49) | @Override
method getName (line 54) | public String getName() {
method getReadme (line 58) | public String getReadme() {
method getAuthor (line 62) | public String getAuthor() {
method getLanguage (line 66) | public List<String> getLanguage() {
method getUrl (line 70) | public String getUrl() {
method getStar (line 74) | public int getStar() {
method getFork (line 78) | public int getFork() {
method toString (line 82) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java
class GithubRepoApi (line 16) | public class GithubRepoApi implements HasKey {
method main (line 36) | public static void main(String[] args) {
method key (line 42) | @Override
method getName (line 47) | public String getName() {
method getAuthor (line 51) | public String getAuthor() {
method getLanguage (line 55) | public List<String> getLanguage() {
method getUrl (line 59) | public String getUrl() {
method getStar (line 63) | public int getStar() {
method getFork (line 67) | public int getFork() {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java
class GithubRepoPageMapper (line 13) | public class GithubRepoPageMapper implements PageProcessor {
method process (line 19) | @Override
method getSite (line 32) | @Override
method main (line 37) | public static void main(String[] args) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java
class MonitorExample (line 12) | public class MonitorExample {
method main (line 14) | public static void main(String[] args) throws Exception {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java
class OschinaBlog (line 17) | @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
method main (line 32) | public static void main(String[] args) {
method getTitle (line 38) | public String getTitle() {
method getContent (line 42) | public String getContent() {
method getTags (line 46) | public List<String> getTags() {
method getDate (line 50) | public Date getDate() {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java
class PatternProcessorExample (line 18) | public class PatternProcessorExample {
method main (line 22) | public static void main(String... args) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java
class CompositePageProcessor (line 13) | public class CompositePageProcessor implements PageProcessor {
method CompositePageProcessor (line 19) | public CompositePageProcessor(Site site) {
method process (line 23) | @Override
method setSite (line 35) | public CompositePageProcessor setSite(Site site) {
method addSubPageProcessor (line 40) | public CompositePageProcessor addSubPageProcessor(SubPageProcessor sub...
method setSubPageProcessors (line 45) | public CompositePageProcessor setSubPageProcessors(SubPageProcessor......
method getSite (line 53) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java
class CompositePipeline (line 13) | public class CompositePipeline implements Pipeline {
method process (line 17) | @Override
method addSubPipeline (line 29) | public CompositePipeline addSubPipeline(SubPipeline subPipeline) {
method setSubPipeline (line 34) | public CompositePipeline setSubPipeline(SubPipeline... subPipelines) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java
class PatternProcessor (line 6) | public abstract class PatternProcessor extends PatternRequestMatcher imp...
method PatternProcessor (line 10) | public PatternProcessor(String pattern) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java
class PatternRequestMatcher (line 16) | public abstract class PatternRequestMatcher implements RequestMatcher {
method PatternRequestMatcher (line 28) | public PatternRequestMatcher(String pattern) {
method match (line 33) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java
type RequestMatcher (line 9) | public interface RequestMatcher {
method match (line 19) | public boolean match(Request page);
type MatchOther (line 21) | public enum MatchOther {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
type SubPageProcessor (line 8) | public interface SubPageProcessor extends RequestMatcher {
method processPage (line 17) | public MatchOther processPage(Page page);
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java
type SubPipeline (line 10) | public interface SubPipeline extends RequestMatcher {
method processResult (line 19) | public MatchOther processResult(ResultItems resultItems, Task task);
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java
type AfterExtractor (line 11) | public interface AfterExtractor {
method afterProcess (line 13) | public void afterProcess(Page page);
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java
class ConsolePageModelPipeline (line 13) | public class ConsolePageModelPipeline implements PageModelPipeline {
method process (line 14) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
class Extractor (line 14) | public class Extractor {
method Extractor (line 26) | public Extractor(Selector selector, Source source, boolean notNull, bo...
method isNotNull (line 33) | public boolean isNotNull() {
method isMulti (line 37) | public boolean isMulti() {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
class FieldExtractor (line 18) | public class FieldExtractor extends Extractor {
method FieldExtractor (line 29) | public FieldExtractor(Field field, Selector selector, Source source, b...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java
type HasKey (line 11) | @Experimental
method key (line 19) | public String key();
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
class ModelPageProcessor (line 20) | class ModelPageProcessor implements PageProcessor {
method create (line 28) | public static ModelPageProcessor create(Site site, Class... clazzs) {
method addPageModel (line 37) | public ModelPageProcessor addPageModel(Class clazz) {
method ModelPageProcessor (line 43) | private ModelPageProcessor(Site site) {
method process (line 47) | @Override
method extractLinks (line 66) | private void extractLinks(Page page, Selector urlRegionSelector, List<...
method postProcessPageModel (line 83) | protected void postProcessPageModel(Class clazz, Object object) {
method getSite (line 86) | @Override
method isExtractLinks (line 91) | public boolean isExtractLinks() {
method setExtractLinks (line 95) | public void setExtractLinks(boolean extractLinks) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java
class ModelPipeline (line 20) | class ModelPipeline implements Pipeline {
method ModelPipeline (line 24) | public ModelPipeline() {
method put (line 27) | public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipel...
method process (line 32) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
class OOSpider (line 41) | public class OOSpider<T> extends Spider {
method OOSpider (line 51) | protected OOSpider(ModelPageProcessor modelPageProcessor) {
method OOSpider (line 56) | public OOSpider(PageProcessor pageProcessor) {
method OOSpider (line 67) | public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class....
method getCollectorPipeline (line 79) | @Override
method create (line 84) | public static OOSpider create(Site site, Class... pageModels) {
method create (line 88) | public static OOSpider create(Site site, PageModelPipeline pageModelPi...
method addPageModel (line 92) | public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Clas...
method setIsExtractLinks (line 100) | public OOSpider setIsExtractLinks(boolean isExtractLinks){
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java
class PageMapper (line 11) | public class PageMapper<T> {
method PageMapper (line 17) | public PageMapper(Class<T> clazz) {
method get (line 22) | public T get(Page page) {
method getAll (line 26) | public List<T> getAll(Page page) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java
class PageModelCollectorPipeline (line 16) | class PageModelCollectorPipeline<T> implements CollectorPipeline<T> {
method PageModelCollectorPipeline (line 22) | PageModelCollectorPipeline(Class<?> clazz) {
method getCollected (line 26) | @Override
method process (line 31) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
class PageModelExtractor (line 34) | class PageModelExtractor {
method create (line 57) | public static PageModelExtractor create(Class clazz) {
method init (line 63) | private void init(Class clazz) {
method getAnnotationExtractByUrl (line 89) | private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field fi...
method getAnnotationExtractCombo (line 108) | private FieldExtractor getAnnotationExtractCombo(Class clazz, Field fi...
method getAnnotationExtractBy (line 134) | private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
method getSetterMethod (line 163) | public static Method getSetterMethod(Class clazz, Field field) {
method initClassExtractors (line 174) | private void initClassExtractors() {
method process (line 206) | public Object process(Page page) {
method processSingle (line 237) | private Object processSingle(Page page, String html, boolean isRaw) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java
type Op (line 24) | public static enum Op {
type Source (line 55) | public static enum Source {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
type Type (line 27) | public static enum Type {XPath, Regex, Css, JsonPath}
type Source (line 47) | public static enum Source {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java
class MultipleField (line 13) | public class MultipleField extends PageField {
method MultipleField (line 17) | public MultipleField(List<String> fieldNames) {
method operation (line 21) | public boolean operation(Object o, FieldExtractor fieldExtractor, Logg...
method convert (line 33) | private List<Object> convert(List<String> values, ObjectFormatter obje...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java
class PageField (line 10) | public abstract class PageField {
method operation (line 11) | public abstract boolean operation(Object o, FieldExtractor fieldExtrac...
method convert (line 13) | protected Object convert(String value, ObjectFormatter objectFormatter...
method setField (line 24) | protected void setField(Object o, FieldExtractor fieldExtractor, Objec...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
class SingleField (line 10) | public class SingleField extends PageField {
method SingleField (line 14) | public SingleField(String fieldName) {
method operation (line 18) | public boolean operation(Object o, FieldExtractor fieldExtractor, Logg...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java
type BasicClassDetector (line 3) | public interface BasicClassDetector {
method detectBasicClass (line 4) | Class<?> detectBasicClass(Class<?> type);
class IntegerClassDetector (line 7) | class IntegerClassDetector implements BasicClassDetector {
method detectBasicClass (line 8) | @Override
class LongClassDetector (line 17) | class LongClassDetector implements BasicClassDetector {
method detectBasicClass (line 18) | @Override
class DoubleClassDetector (line 27) | class DoubleClassDetector implements BasicClassDetector {
method detectBasicClass (line 28) | @Override
class FloatClassDetector (line 37) | class FloatClassDetector implements BasicClassDetector {
method detectBasicClass (line 38) | @Override
class ShortClassDetector (line 47) | class ShortClassDetector implements BasicClassDetector {
method detectBasicClass (line 48) | @Override
class CharacterClassDetector (line 57) | class CharacterClassDetector implements BasicClassDetector {
method detectBasicClass (line 58) | @Override
class ByteClassDetector (line 67) | class ByteClassDetector implements BasicClassDetector {
method detectBasicClass (line 68) | @Override
class BooleanClassDetector (line 77) | class BooleanClassDetector implements BasicClassDetector {
method detectBasicClass (line 78) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
class BasicTypeFormatter (line 10) | public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {
method initParam (line 12) | @Override
method format (line 17) | @Override
method formatTrimmed (line 26) | protected abstract T formatTrimmed(String raw) throws Exception;
method detectBasicClass (line 39) | public static Class<?> detectBasicClass(Class<?> type) {
class IntegerFormatter (line 49) | public static class IntegerFormatter extends BasicTypeFormatter<Intege...
method formatTrimmed (line 50) | @Override
method clazz (line 55) | @Override
class LongFormatter (line 61) | public static class LongFormatter extends BasicTypeFormatter<Long> {
method formatTrimmed (line 62) | @Override
method clazz (line 67) | @Override
class DoubleFormatter (line 73) | public static class DoubleFormatter extends BasicTypeFormatter<Double> {
method formatTrimmed (line 74) | @Override
method clazz (line 79) | @Override
class FloatFormatter (line 85) | public static class FloatFormatter extends BasicTypeFormatter<Float> {
method formatTrimmed (line 86) | @Override
method clazz (line 91) | @Override
class ShortFormatter (line 97) | public static class ShortFormatter extends BasicTypeFormatter<Short> {
method formatTrimmed (line 98) | @Override
method clazz (line 103) | @Override
class CharactorFormatter (line 109) | public static class CharactorFormatter extends BasicTypeFormatter<Char...
method formatTrimmed (line 110) | @Override
method clazz (line 115) | @Override
class ByteFormatter (line 121) | public static class ByteFormatter extends BasicTypeFormatter<Byte> {
method formatTrimmed (line 122) | @Override
method clazz (line 127) | @Override
class BooleanFormatter (line 133) | public static class BooleanFormatter extends BasicTypeFormatter<Boolea...
method formatTrimmed (line 134) | @Override
method clazz (line 139) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java
class DateFormatter (line 11) | public class DateFormatter implements ObjectFormatter<Date> {
method format (line 16) | @Override
method clazz (line 21) | @Override
method initParam (line 26) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java
type ObjectFormatter (line 6) | public interface ObjectFormatter<T> {
method format (line 8) | T format(String raw) throws Exception;
method clazz (line 10) | Class<T> clazz();
method initParam (line 12) | void initParam(String[] extra);
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java
class ObjectFormatterBuilder (line 13) | public class ObjectFormatterBuilder {
method setField (line 17) | public ObjectFormatterBuilder setField(Field field) {
method initFormatterForType (line 22) | private ObjectFormatter initFormatterForType(Class<?> fieldClazz, Stri...
method initFormatter (line 33) | private ObjectFormatter initFormatter(Class<? extends ObjectFormatter>...
method build (line 45) | public ObjectFormatter build() {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java
class ObjectFormatters (line 10) | public class ObjectFormatters {
method put (line 21) | public static void put(Class<? extends ObjectFormatter> objectFormatte...
method get (line 31) | public static Class<? extends ObjectFormatter> get(Class<?> clazz){
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
type Source (line 8) | public interface Source {
method getText (line 9) | public String getText(Page page, String html, boolean isRaw, FieldExtr...
method getTextList (line 10) | public List<String> getTextList(Page page, String html, boolean isRaw,...
class RawHtml (line 12) | public class RawHtml implements Source {
method getText (line 13) | public String getText(Page page, String html, boolean isRaw, FieldEx...
method getTextList (line 17) | public List<String> getTextList(Page page, String html, boolean isRa...
class SelectedHtml (line 22) | public class SelectedHtml implements Source {
method getText (line 23) | public String getText(Page page, String html, boolean isRaw, FieldEx...
method getTextList (line 30) | public List<String> getTextList(Page page, String html, boolean isRa...
class Url (line 38) | public class Url implements Source {
method getText (line 39) | public String getText(Page page, String html, boolean isRaw, FieldEx...
method getTextList (line 43) | public List<String> getTextList(Page page, String html, boolean isRa...
class RawText (line 48) | public class RawText implements Source {
method getText (line 49) | public String getText(Page page, String html, boolean isRaw, FieldEx...
method getTextList (line 53) | public List<String> getTextList(Page page, String html, boolean isRa...
class DefaultSource (line 58) | public class DefaultSource implements Source {
method getText (line 59) | public String getText(Page page, String html, boolean isRaw, FieldEx...
method getTextList (line 63) | public List<String> getTextList(Page page, String html, boolean isRa...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
class SourceTextExtractor (line 9) | public class SourceTextExtractor {
method getText (line 10) | public static PageField getText(Page page, String html, boolean isRaw,...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
class SpiderMonitor (line 27) | @Experimental
method SpiderMonitor (line 38) | protected SpiderMonitor() {
method register (line 50) | public synchronized SpiderMonitor register(Spider... spiders) throws J...
method getSpiderStatusMBean (line 67) | protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, Monit...
method getSpiderStatuses (line 71) | protected List<SpiderStatusMXBean> getSpiderStatuses() {
method instance (line 75) | public static SpiderMonitor instance() {
class MonitorSpiderListener (line 79) | public class MonitorSpiderListener implements SpiderListener {
method onSuccess (line 87) | @Override
method onError (line 92) | @Override
method getSuccessCount (line 98) | public AtomicInteger getSuccessCount() {
method getErrorCount (line 102) | public AtomicInteger getErrorCount() {
method getErrorUrls (line 106) | public List<String> getErrorUrls() {
method registerMBean (line 111) | protected void registerMBean(SpiderStatusMXBean spiderStatus) throws M...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
class SpiderStatus (line 15) | public class SpiderStatus implements SpiderStatusMXBean {
method SpiderStatus (line 23) | public SpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener...
method getName (line 28) | public String getName() {
method getLeftPageCount (line 32) | public int getLeftPageCount() {
method getTotalPageCount (line 40) | public int getTotalPageCount() {
method getSuccessPageCount (line 48) | @Override
method getErrorPageCount (line 53) | @Override
method getErrorPages (line 58) | public List<String> getErrorPages() {
method getStatus (line 62) | @Override
method getThread (line 67) | @Override
method start (line 72) | public void start() {
method stop (line 76) | public void stop() {
method getStartTime (line 80) | @Override
method getPagePerSecond (line 85) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java
type SpiderStatusMXBean (line 10) | public interface SpiderStatusMXBean {
method getName (line 12) | public String getName();
method getStatus (line 14) | public String getStatus();
method getThread (line 16) | public int getThread();
method getTotalPageCount (line 18) | public int getTotalPageCount();
method getLeftPageCount (line 20) | public int getLeftPageCount();
method getSuccessPageCount (line 22) | public int getSuccessPageCount();
method getErrorPageCount (line 24) | public int getErrorPageCount();
method getErrorPages (line 26) | public List<String> getErrorPages();
method start (line 28) | public void start();
method stop (line 30) | public void stop();
method getStartTime (line 32) | public Date getStartTime();
method getPagePerSecond (line 34) | public int getPagePerSecond();
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java
class CollectorPageModelPipeline (line 11) | public class CollectorPageModelPipeline<T> implements PageModelPipeline<...
method process (line 15) | @Override
method getCollected (line 20) | public List<T> getCollected() {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java
class FilePageModelPipeline (line 23) | public class FilePageModelPipeline extends FilePersistentBase implements...
method FilePageModelPipeline (line 30) | public FilePageModelPipeline() {
method FilePageModelPipeline (line 34) | public FilePageModelPipeline(String path) {
method process (line 38) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
class JsonFilePageModelPipeline (line 24) | public class JsonFilePageModelPipeline extends FilePersistentBase implem...
method JsonFilePageModelPipeline (line 31) | public JsonFilePageModelPipeline() {
method JsonFilePageModelPipeline (line 35) | public JsonFilePageModelPipeline(String path) {
method process (line 39) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
class JsonFilePipeline (line 21) | public class JsonFilePipeline extends FilePersistentBase implements Pipe...
method JsonFilePipeline (line 28) | public JsonFilePipeline() {
method JsonFilePipeline (line 32) | public JsonFilePipeline(String path) {
method process (line 36) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java
class MultiPagePipeline (line 20) | @Experimental
method process (line 27) | @Override
method handleObject (line 36) | private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java
type PageModelPipeline (line 11) | public interface PageModelPipeline<T> {
method process (line 13) | public void process(T t, Task task);
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java
class BloomFilterDuplicateRemover (line 24) | public class BloomFilterDuplicateRemover implements DuplicateRemover {
method BloomFilterDuplicateRemover (line 32) | public BloomFilterDuplicateRemover(int expectedInsertions) {
method BloomFilterDuplicateRemover (line 41) | public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
method rebuildBloomFilter (line 47) | protected BloomFilter<CharSequence> rebuildBloomFilter() {
method isDuplicate (line 54) | @Override
method getUrl (line 64) | protected String getUrl(Request request) {
method resetDuplicateCheck (line 68) | @Override
method getTotalRequestsCount (line 73) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
class FileCacheQueueScheduler (line 19) | public class FileCacheQueueScheduler extends DuplicateRemovedScheduler i...
method FileCacheQueueScheduler (line 41) | public FileCacheQueueScheduler(String filePath) {
method flush (line 49) | private void flush() {
method init (line 54) | private void init(Task task) {
method initDuplicateRemover (line 67) | private void initDuplicateRemover() {
method initFlushThread (line 72) | private void initFlushThread() {
method initWriter (line 77) | private void initWriter() {
method readFile (line 86) | private void readFile() {
method readUrlFile (line 100) | private void readUrlFile() throws IOException {
method readCursorFile (line 115) | private void readCursorFile() throws IOException {
method close (line 133) | public void close() throws IOException {
method getFileName (line 139) | private String getFileName(String filename) {
method pushWhenNoDuplicate (line 143) | @Override
method poll (line 152) | @Override
method getLeftRequestsCount (line 161) | @Override
method getTotalRequestsCount (line 166) | @Override
method serializeRequest (line 171) | protected String serializeRequest(Request request) {
method deserializeRequest (line 175) | protected Request deserializeRequest(String line) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
class RedisPriorityScheduler (line 20) | public class RedisPriorityScheduler extends RedisScheduler {
method RedisPriorityScheduler (line 32) | public RedisPriorityScheduler(String host) {
method RedisPriorityScheduler (line 36) | public RedisPriorityScheduler(JedisPool pool) {
method pushWhenNoDuplicate (line 40) | @Override
method poll (line 55) | @Override
method getRequest (line 66) | private String getRequest(Jedis jedis, Task task) {
method resetDuplicateCheck (line 85) | @Override
method getZsetPlusPriorityKey (line 92) | private String getZsetPlusPriorityKey(Task task) {
method getQueueNoPriorityKey (line 96) | private String getQueueNoPriorityKey(Task task) {
method getZsetMinusPriorityKey (line 100) | private String getZsetMinusPriorityKey(Task task) {
method setExtrasInItem (line 104) | private void setExtrasInItem(Jedis jedis,Request request, Task task) {
method getExtrasInItem (line 112) | private Request getExtrasInItem(Jedis jedis, String url, Task task) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
class RedisScheduler (line 21) | public class RedisScheduler extends DuplicateRemovedScheduler implements...
method RedisScheduler (line 31) | public RedisScheduler(String host) {
method RedisScheduler (line 35) | public RedisScheduler(JedisPool pool) {
method resetDuplicateCheck (line 40) | @Override
method isDuplicate (line 47) | @Override
method pushWhenNoDuplicate (line 55) | @Override
method checkForAdditionalInfo (line 70) | private boolean checkForAdditionalInfo(Request request) {
method poll (line 97) | @Override
method getSetKey (line 116) | protected String getSetKey(Task task) {
method getQueueKey (line 120) | protected String getQueueKey(Task task) {
method getItemKey (line 124) | protected String getItemKey(Task task) {
method getLeftRequestsCount (line 128) | @Override
method getTotalRequestsCount (line 136) | @Override
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java
class ClassUtils (line 11) | public abstract class ClassUtils {
method getFieldsIncludeSuperClass (line 13) | public static Set<Field> getFieldsIncludeSuperClass(Class clazz) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
class DoubleKeyMap (line 8) | public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
method DoubleKeyMap (line 11) | public DoubleKeyMap() {
method DoubleKeyMap (line 15) | public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
method DoubleKeyMap (line 19) | public DoubleKeyMap(Class<? extends Map> protoMapClass) {
method init (line 24) | private void init() {
method DoubleKeyMap (line 36) | @SuppressWarnings("rawtypes")
method get (line 47) | public Map<K2, V> get(K1 key) {
method get (line 56) | public V get(K1 key1, K2 key2) {
method put (line 69) | public V put(K1 key1, Map<K2, V> submap) {
method put (line 79) | public synchronized V put(K1 key1, K2 key2, V value) {
method remove (line 92) | public synchronized V remove(K1 key1, K2 key2) {
method remove (line 107) | public Map<K2, V> remove(K1 key1) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
class ExtractorUtils (line 15) | public class ExtractorUtils {
method getSelector (line 17) | public static Selector getSelector(ExtractBy extractBy) {
method getSelectors (line 39) | public static List<Selector> getSelectors(ExtractBy[] extractBies) {
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java
class IPUtils (line 13) | public abstract class IPUtils {
method getFirstNoLoopbackIPAddresses (line 15) | public static String getFirstNoLoopbackIPAddresses() throws SocketExce...
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
class MultiKeyMapBase (line 15) | public abstract class MultiKeyMapBase {
method MultiKeyMapBase (line 21) | public MultiKeyMapBase() {
method MultiKeyMapBase (line 24) | @SuppressWarnings("rawtypes")
method newMap (line 29) | @SuppressWarnings("unchecked")
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java
class RequestUtils (line 16) | public abstract class RequestUtils {
method from (line 20) | public static List<Request> from(String exp){
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java
class MockPageModelPipeline (line 9) | public class MockPageModelPipeline implements PageModelPipeline{
method process (line 10) | @Override
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java
class MockPipeline (line 8) | public class MockPipeline implements Pipeline{
method process (line 9) | @Override
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java
class SimpleHttpClientTest (line 15) | public class SimpleHttpClientTest {
class Weather (line 17) | public static class Weather implements AfterExtractor {
method afterProcess (line 30) | @Override
method getLocation (line 39) | public String getLocation() {
method setLocation (line 43) | public void setLocation(String location) {
method getLowTemperature (line 47) | public Integer getLowTemperature() {
method setLowTemperature (line 51) | public void setLowTemperature(Integer lowTemperature) {
method getHighTemperature (line 55) | public Integer getHighTemperature() {
method setHighTemperature (line 59) | public void setHighTemperature(Integer highTemperature) {
method getDesc (line 63) | public String getDesc() {
method setDesc (line 67) | public void setDesc(String desc) {
method toString (line 71) | @Override
method test (line 82) | @Ignore
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
class ConfigurablePageProcessorTest (line 17) | public class ConfigurablePageProcessorTest {
method test (line 19) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
class MockGithubDownloader (line 11) | public class MockGithubDownloader implements Downloader{
method download (line 936) | @Override
method setThread (line 947) | @Override
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java
class DateFormatterTest (line 15) | public class DateFormatterTest {
method testDateFormatter (line 17) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java
class BaseRepo (line 8) | public class BaseRepo {
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java
class GithubRepo (line 12) | @TargetUrl("https://github.com/\\w+/\\w+")
method main (line 19) | public static void main(String[] args) {
method getStar (line 25) | public int getStar() {
method getFork (line 29) | public int getFork() {
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java
class GithubRepoApi (line 10) | public class GithubRepoApi {
method getName (line 15) | public String getName() {
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java
class GithubRepoTest (line 15) | public class GithubRepoTest {
method test (line 17) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java
class ModelPageProcessorTest (line 16) | public class ModelPageProcessorTest {
class ModelFoo (line 20) | @TargetUrl("http://codecraft.us/foo")
class ModelBar (line 28) | @TargetUrl("http://codecraft.us/bar")
class MockModel (line 36) | @TargetUrl(value = "http://webmagic.io/foo/\\d+",sourceRegion = "//li[...
method testMultiModel_should_not_skip_when_match (line 42) | @Test
method testExtractLinks (line 53) | @Test
method testExtractNoLinks (line 61) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java
class PageMapperTest (line 12) | public class PageMapperTest {
method test_get (line 16) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java
class PageMocker (line 17) | public class PageMocker {
method getMockJsonPage (line 19) | public Page getMockJsonPage() throws IOException {
method getMockPage (line 27) | public Page getMockPage() throws IOException {
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
class PageModelExtractorTest (line 21) | public class PageModelExtractorTest {
class ModelDateStr (line 25) | public static class ModelDateStr {
class ModelDate (line 32) | public static class ModelDate {
class ModelInt (line 40) | public static class ModelInt {
class ModelStringList (line 47) | public static class ModelStringList {
class ModelIntList (line 54) | public static class ModelIntList {
class ModelDateList (line 62) | public static class ModelDateList {
class ModelCustomList (line 70) | public static class ModelCustomList {
class ModelJsonStr (line 78) | public static class ModelJsonStr {
class ModelUrl (line 85) | public static class ModelUrl {
method testXpath (line 92) | @Test
method testExtractDate (line 98) | @Test
method testExtractInt (line 104) | @Test
method testExtractList (line 110) | @Test
method testExtractIntList (line 116) | @Test
method testExtractDateList (line 122) | @Test
method testExtractCustomList (line 128) | @Test
method testExtractJson (line 134) | @Test
method testExtractByUrl (line 140) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java
class CustomSpiderStatus (line 8) | public class CustomSpiderStatus extends SpiderStatus implements CustomSp...
method CustomSpiderStatus (line 10) | public CustomSpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderLi...
method getSchedulerName (line 15) | @Override
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java
type CustomSpiderStatusMXBean (line 6) | public interface CustomSpiderStatusMXBean extends SpiderStatusMXBean {
method getSchedulerName (line 8) | public String getSchedulerName();
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SeedUrlWithPortTest.java
class SeedUrlWithPortTest (line 14) | public class SeedUrlWithPortTest {
method testSeedUrlWithPort (line 16) | @Test
class TempProcessor (line 24) | class TempProcessor implements PageProcessor {
method process (line 26) | @Override
method getSite (line 31) | @Override
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java
class SpiderMonitorTest (line 12) | public class SpiderMonitorTest {
method testInherit (line 14) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java
class GithubRepoProcessor (line 13) | public class GithubRepoProcessor implements PageProcessor {
method process (line 14) | @Override
method getSite (line 20) | @Override
method test (line 25) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java
class BloomFilterDuplicateRemoverTest (line 14) | public class BloomFilterDuplicateRemoverTest {
method testRemove (line 16) | @Test
method testMemory (line 30) | @Ignore("long time")
method testMissHit (line 54) | @Ignore("long time")
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisPrioritySchedulerTest.java
class RedisPrioritySchedulerTest (line 15) | public class RedisPrioritySchedulerTest
method setUp (line 20) | @Before
method test (line 26) | @Ignore("environment depended")
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java
class RedisSchedulerTest (line 15) | public class RedisSchedulerTest {
method setUp (line 19) | @Before
method test (line 24) | @Ignore("environment depended")
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java
class IPUtilsTest (line 8) | public class IPUtilsTest {
method testGetFirstNoLoopbackIPAddresses (line 10) | @Test
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java
class RequestUtilsTest (line 15) | public class RequestUtilsTest {
method test_generate_range (line 17) | @Test
method test_generate_range_when_invalid_number (line 23) | @Test
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java
class QuickStarter (line 18) | public class QuickStarter {
method init (line 24) | private static void init(){
method main (line 35) | public static void main(String[] args) {
method readKey (line 53) | private static String readKey(String key) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java
class BaiduNews (line 10) | public class BaiduNews {
method toString (line 18) | @Override
method main (line 26) | public static void main(String[] args) {
method getName (line 35) | public String getName() {
method getDescription (line 39) | public String getDescription() {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java
type Blog (line 8) | public interface Blog {
method getTitle (line 10) | public String getTitle();
method getContent (line 12) | public String getContent();
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java
class DianpingFtlDataScanner (line 17) | @TargetUrl("http://*.alpha.dp/*")
method main (line 23) | public static void main(String[] args) {
method afterProcess (line 28) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
class GithubRepo (line 18) | @TargetUrl("https://github.com/\\w+/\\w+")
method main (line 43) | public static void main(String[] args) {
method key (line 50) | @Override
method getName (line 55) | public String getName() {
method getReadme (line 59) | public String getReadme() {
method getAuthor (line 63) | public String getAuthor() {
method getLanguage (line 67) | public List<String> getLanguage() {
method getUrl (line 71) | public String getUrl() {
method getStar (line 75) | public String getStar() {
method getFork (line 79) | public String getFork() {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java
class IteyeBlog (line 13) | @TargetUrl("http://*.iteye.com/blog/*")
method toString (line 22) | @Override
method main (line 30) | public static void main(String[] args) {
method getTitle (line 34) | public String getTitle() {
method getContent (line 38) | public String getContent() {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/JokejiModel.java
class JokejiModel (line 14) | @TargetUrl("http://www.jokeji.cn/jokehtml/jy/\\d+.htm")
method main (line 24) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java
class Kr36NewsModel (line 20) | @TargetUrl("http://www.36kr.com/p/\\d+.html")
method main (line 33) | public static void main(String[] args) throws IOException, JMException {
method getTitle (line 46) | public String getTitle() {
method getContent (line 50) | public String getContent() {
method getUrl (line 54) | public String getUrl() {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java
class News163 (line 19) | @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
method getPageKey (line 38) | @Override
method getOtherPages (line 43) | @Override
method getPage (line 48) | @Override
method combine (line 56) | @Override
method toString (line 65) | @Override
method main (line 74) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java
class OschinaAnswer (line 13) | @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
method main (line 24) | public static void main(String[] args) {
method afterProcess (line 28) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java
class OschinaBlog (line 15) | @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
method main (line 27) | public static void main(String[] args) {
method getTitle (line 40) | public String getTitle() {
method getContent (line 44) | public String getContent() {
method getTags (line 48) | public List<String> getTags() {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java
class QQMeishi (line 12) | @TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*")
method main (line 22) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
class DuplicateStorageRemover (line 19) | public class DuplicateStorageRemover implements DuplicateRemover {
method DuplicateStorageRemover (line 31) | public DuplicateStorageRemover(String path) {
method isDuplicate (line 55) | @Override
method resetDuplicateCheck (line 68) | @Override
method getTotalRequestsCount (line 74) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
class MmapQueueScheduler (line 19) | public class MmapQueueScheduler extends DuplicateRemovedScheduler {
method MmapQueueScheduler (line 29) | public MmapQueueScheduler(DuplicateRemover duplicateRemover, String pa...
method poll (line 47) | @Override
method pushWhenNoDuplicate (line 58) | @Override
method toJson (line 64) | public String toJson(Object object) {
method fromJson (line 73) | public <T> T fromJson(String jsonString, Class<T> clazz) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
class RecoverSample (line 11) | public class RecoverSample {
method main (line 13) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java
class AlexanderMcqueenGoodsProcessor (line 12) | public class AlexanderMcqueenGoodsProcessor implements PageProcessor {
method process (line 21) | @Override
method getSite (line 56) | @Override
method main (line 61) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java
class AmanzonPageProcessor (line 14) | public class AmanzonPageProcessor implements PageProcessor{
method process (line 15) | public void process(Page page) {
method getSite (line 45) | @Override
method main (line 50) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
class AngularJSProcessor (line 16) | public class AngularJSProcessor implements PageProcessor {
method process (line 24) | @Override
method getSite (line 40) | @Override
method main (line 45) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java
class DiandianBlogProcessor (line 12) | public class DiandianBlogProcessor implements PageProcessor {
method process (line 16) | @Override
method getSite (line 34) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
class DiaoyuwengProcessor (line 16) | public class DiaoyuwengProcessor implements PageProcessor {
method process (line 20) | @Override
method getSite (line 34) | @Override
method main (line 43) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
class F58PageProcesser (line 16) | public class F58PageProcesser implements PageProcessor {
method process (line 18) | @Override
method getSite (line 26) | @Override
method main (line 31) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java
class GithubRepo (line 6) | public class GithubRepo {
method getName (line 14) | public String getName() {
method setName (line 18) | public void setName(String name) {
method getAuthor (line 22) | public String getAuthor() {
method setAuthor (line 26) | public void setAuthor(String author) {
method getReadme (line 30) | public String getReadme() {
method setReadme (line 34) | public void setReadme(String readme) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java
class GithubRepoPageProcessor (line 12) | public class GithubRepoPageProcessor implements PageProcessor {
method process (line 16) | @Override
method getSite (line 32) | @Override
method main (line 37) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
class HuxiuProcessor (line 13) | public class HuxiuProcessor implements PageProcessor {
method process (line 14) | @Override
method getSite (line 22) | @Override
method main (line 27) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
class InfoQMiniBookProcessor (line 14) | public class InfoQMiniBookProcessor implements PageProcessor {
method process (line 18) | @Override
method getSite (line 29) | @Override
method main (line 38) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java
class IteyeBlogProcessor (line 11) | public class IteyeBlogProcessor implements PageProcessor {
method process (line 15) | @Override
method getSite (line 22) | @Override
method main (line 30) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java
class KaichibaProcessor (line 13) | public class KaichibaProcessor implements PageProcessor {
method process (line 14) | @Override
method getSite (line 23) | @Override
method main (line 29) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java
class MamacnPageProcessor (line 18) | public class MamacnPageProcessor implements PageProcessor {
method process (line 22) | @Override
method getSite (line 37) | @Override
method main (line 42) | public static void main(String[] args) throws FileNotFoundException, U...
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java
class MeicanProcessor (line 15) | public class MeicanProcessor implements PageProcessor {
method process (line 16) | @Override
method getSite (line 29) | @Override
method main (line 35) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java
class NjuBBSProcessor (line 15) | public class NjuBBSProcessor implements PageProcessor {
method process (line 16) | @Override
method getSite (line 24) | @Override
method main (line 29) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
class PhantomJSPageProcessor (line 19) | public class PhantomJSPageProcessor implements PageProcessor {
method process (line 27) | @Override
method getSite (line 33) | @Override
method main (line 38) | public static void main(String[] args) throws Exception {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java
class QzoneBlogProcessor (line 12) | public class QzoneBlogProcessor implements PageProcessor {
method process (line 13) | @Override
method getSite (line 25) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java
class SinaBlogProcessor (line 11) | public class SinaBlogProcessor implements PageProcessor {
method process (line 24) | @Override
method getSite (line 39) | @Override
method main (line 44) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java
class TianyaPageProcesser (line 12) | public class TianyaPageProcesser implements PageProcessor {
method process (line 14) | @Override
method getSite (line 22) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZhihuPageProcessor.java
class ZhihuPageProcessor (line 16) | public class ZhihuPageProcessor implements PageProcessor {
method process (line 27) | @Override
method getSite (line 49) | @Override
method main (line 54) | public static void main(String[] args) {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java
class StringTemplateFormatter (line 8) | public class StringTemplateFormatter implements ObjectFormatter<String> {
method format (line 12) | @Override
method clazz (line 17) | @Override
method initParam (line 22) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java
class OneFilePipeline (line 16) | public class OneFilePipeline extends FilePersistentBase implements Pipel...
method OneFilePipeline (line 22) | public OneFilePipeline() throws FileNotFoundException, UnsupportedEnco...
method OneFilePipeline (line 26) | public OneFilePipeline(String path) throws FileNotFoundException, Unsu...
method process (line 31) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java
class ReplacePipeline (line 6) | public class ReplacePipeline {
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java
class DelayQueueScheduler (line 16) | public class DelayQueueScheduler extends PriorityScheduler {
class RequestWrapper (line 26) | private class RequestWrapper implements Delayed {
method RequestWrapper (line 32) | private RequestWrapper(Request request) {
method getStartTime (line 36) | private long getStartTime() {
method getRequest (line 40) | private Request getRequest() {
method getDelay (line 44) | @Override
method compareTo (line 50) | @Override
method DelayQueueScheduler (line 56) | public DelayQueueScheduler(long time, TimeUnit timeUnit) {
method push (line 61) | @Override
method poll (line 69) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java
class LevelLimitScheduler (line 10) | public class LevelLimitScheduler extends PriorityScheduler {
method LevelLimitScheduler (line 14) | public LevelLimitScheduler(int levelLimit) {
method push (line 18) | @Override
FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java
class ZipCodePageProcessor (line 20) | public class ZipCodePageProcessor implements PageProcessor {
method process (line 25) | @Override
method processCountry (line 37) | private void processCountry(Page page) {
method processProvince (line 47) | private void processProvince(Page page) {
method processDistrict (line 62) | private void processDistrict(Page page) {
method getSite (line 75) | @Override
method main (line 80) | public static void main(String[] args) {
FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java
class SpiderTest (line 15) | public class SpiderTest {
method testSpider (line 18) | @Ignore
method testGlobalSpider (line 25) | @Ignore
method test (line 40) | @Ignore
method languageSchema (line 47) | @Ignore
FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
class ProcessorBenchmark (line 15) | public class ProcessorBenchmark {
method test (line 17) | @Ignore
FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java
class SinablogProcessorTest (line 18) | public class SinablogProcessorTest {
method test (line 20) | @Ignore
FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java
class DelayQueueSchedulerTest (line 12) | public class DelayQueueSchedulerTest {
method test (line 14) | @Ignore("infinite")
FILE: webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java
class JaxpSelectorUtils (line 20) | public final class JaxpSelectorUtils {
method JaxpSelectorUtils (line 22) | private JaxpSelectorUtils() {
method NodeListToArrayList (line 26) | public static List<Node> NodeListToArrayList(NodeList nodes) {
method nodeToString (line 34) | public static String nodeToString(Node node) throws TransformerExcepti...
method nodesToStrings (line 44) | public static List<String> nodesToStrings(List<Node> nodes) throws Tra...
FILE: webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java
type NodeSelector (line 13) | public interface NodeSelector {
method select (line 22) | String select(Node node);
method selectList (line 30) | List<String> selectList(Node node);
FILE: webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
class Xpath2Selector (line 35) | public class Xpath2Selector implements Selector, NodeSelector {
method Xpath2Selector (line 43) | public Xpath2Selector(String xpathStr) {
method newInstance (line 52) | public static Xpath2Selector newInstance(String xpathStr) {
type XPath2NamespaceContext (line 56) | enum XPath2NamespaceContext implements NamespaceContext {
method put (line 64) | private void put(String prefix, String namespaceURI) {
method XPath2NamespaceContext (line 70) | XPath2NamespaceContext() {
method getNamespaceURI (line 76) | @Override
method getPrefix (line 81) | @Override
method getPrefixes (line 90) | @Override
method init (line 100) | private void init() throws XPathExpressionException {
method select (line 106) | @Override
method select (line 117) | @Override
method selectList (line 127) | @Override
method selectList (line 138) | @Override
method selectNode (line 150) | public Node selectNode(String text) {
method selectNode (line 160) | public Node selectNode(Node node) {
method selectNodes (line 169) | public List<Node> selectNodes(String text) {
method selectNodes (line 179) | public List<Node> selectNodes(Node node) {
method parse (line 189) | protected static Document parse(String text) throws ParserConfiguratio...
FILE: webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
class XpathSelectorTest (line 26) | public class XpathSelectorTest {
method test (line 928) | @Test
method testOschina (line 1367) | @Test
method testXPath2 (line 1375) | @Test
method testXpath2Selector (line 1383) | @Test
method performanceTest (line 1394) | @Ignore("take long time")
method parserPerformanceTest (line 1425) | @Ignore("take long time")
method testStringAPI (line 1493) | @Test
method testNodeAPI (line 1503) | @Test
method testUtilAPI (line 1514) | @Test
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java
class Params (line 15) | public class Params {
method Params (line 33) | public Params() {
method setLanguagefromArg (line 39) | public void setLanguagefromArg(String arg) {
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java
class ScriptConsole (line 18) | public class ScriptConsole {
method main (line 19) | public static void main(String[] args) {
method startSpider (line 24) | private static void startSpider(Params params) {
method parseCommand (line 49) | private static Params parseCommand(String[] args) {
method exit (line 68) | private static void exit() {
method readOptions (line 74) | private static Params readOptions(CommandLine commandLine) {
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java
class ScriptEnginePool (line 15) | public class ScriptEnginePool {
method ScriptEnginePool (line 21) | public ScriptEnginePool(Language language,int size) {
method getEngine (line 30) | public ScriptEngine getEngine() {
method release (line 35) | public void release(ScriptEngine scriptEngine){
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java
class ScriptProcessor (line 20) | public class ScriptProcessor implements PageProcessor {
method ScriptProcessor (line 32) | public ScriptProcessor(Language language, String script, int threadNum) {
method process (line 47) | @Override
method getSite (line 65) | @Override
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessorBuilder.java
class ScriptProcessorBuilder (line 18) | public class ScriptProcessorBuilder {
method ScriptProcessorBuilder (line 28) | private ScriptProcessorBuilder() {
method custom (line 31) | public static ScriptProcessorBuilder custom() {
method language (line 35) | public ScriptProcessorBuilder language(Language language) {
method scriptFromFile (line 40) | public ScriptProcessorBuilder scriptFromFile(String fileName) {
method scriptFromClassPathFile (line 50) | public ScriptProcessorBuilder scriptFromClassPathFile(String fileName) {
method script (line 60) | public ScriptProcessorBuilder script(String script) {
method thread (line 65) | public ScriptProcessorBuilder thread(int threadNum) {
method build (line 70) | public ScriptProcessor build(){
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/CommandLineOption.java
class CommandLineOption (line 10) | public abstract class CommandLineOption {
method CommandLineOption (line 14) | public CommandLineOption(char option) {
method addParamOption (line 18) | protected abstract void addParamOption(Params params, CommandLine comm...
method addParamOptionIfInCommandLine (line 20) | public void addParamOptionIfInCommandLine(Params params, CommandLine c...
method getAllOptions (line 25) | public static List<CommandLineOption> getAllOptions() {
class OptionL (line 30) | class OptionL extends CommandLineOption {
method OptionL (line 31) | public OptionL() {
method addParamOption (line 35) | protected void addParamOption(Params params, CommandLine commandLine) {
class OptionF (line 41) | class OptionF extends CommandLineOption {
method OptionF (line 42) | public OptionF() {
method addParamOption (line 46) | protected void addParamOption(Params params, CommandLine commandLine) {
class OptionS (line 52) | class OptionS extends CommandLineOption {
method OptionS (line 53) | public OptionS() {
method addParamOption (line 57) | protected void addParamOption(Params params, CommandLine commandLine) {
class OptionT (line 63) | class OptionT extends CommandLineOption {
method OptionT (line 64) | public OptionT() {
method addParamOption (line 68) | protected void addParamOption(Params params, CommandLine commandLine) {
class OptionG (line 74) | class OptionG extends CommandLineOption {
method OptionG (line 75) | public OptionG() {
method addParamOption (line 79) | protected void addParamOption(Params params, CommandLine commandLine) {
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java
class ConfigLogger (line 10) | public class ConfigLogger {
method configLogger (line 17) | public static void configLogger(String value) {
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java
class JRuby (line 13) | public class JRuby extends Language {
method JRuby (line 14) | public JRuby() {
method process (line 18) | public void process(ScriptEngine engine, String defines, String script...
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java
class Javascript (line 8) | public class Javascript extends Language {
method Javascript (line 9) | public Javascript() {
method process (line 13) | public void process(ScriptEngine engine, String defines, String script...
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java
class Jython (line 13) | public class Jython extends Language {
method Jython (line 14) | public Jython() {
method process (line 18) | public void process(ScriptEngine engine, String defines, String script...
FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java
class Language (line 10) | public abstract class Language {
method Language (line 11) | public Language(String engineName, String defineFile, String gatherFil...
method getEngineName (line 23) | public String getEngineName() {
method getDefineFile (line 27) | public String getDefineFile() {
method getGatherFile (line 31) | public String getGatherFile() {
method process (line 35) | public abstract void process(ScriptEngine engine, String defines, Stri...
FILE: webmagic-scripts/src/main/resources/js/defines.js
function $ (line 1) | function $(str){
function xpath (line 4) | function xpath(str){
function urls (line 7) | function urls(str){
FILE: webmagic-scripts/src/main/resources/python/defines.py
function xpath (line 1) | def xpath(str):
function css (line 4) | def css(str):
function urls (line 7) | def urls(str):
function tomap (line 11) | def tomap(key,value):
FILE: webmagic-scripts/src/main/resources/ruby/defines.rb
function xpath (line 1) | def xpath str
function css (line 4) | def css str
function urls (line 7) | def urls str
FILE: webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java
class ScriptProcessorTest (line 15) | @Ignore
method testJavaScriptProcessor (line 18) | @Test
method testRubyProcessor (line 25) | @Test
method testPythonProcessor (line 33) | @Test
FILE: webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
class SeleniumDownloader (line 32) | public class SeleniumDownloader extends AbstractDownloader implements Cl...
method SeleniumDownloader (line 49) | public SeleniumDownloader(String chromeDriverPath) {
method SeleniumDownloader (line 59) | public SeleniumDownloader() {
method setSleepTime (line 70) | public SeleniumDownloader setSleepTime(int sleepTime) {
method download (line 75) | @Override
method checkInit (line 129) | private void checkInit() {
method setThread (line 137) | @Override
method close (line 142) | @Override
FILE: webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
class WebDriverPool (line 32) | class WebDriverPool {
method configure (line 67) | public void configure() throws IOException {
method isUrl (line 153) | private boolean isUrl(String urlString) {
method WebDriverPool (line 173) | public WebDriverPool(int capacity) {
method WebDriverPool (line 177) | public WebDriverPool() {
method get (line 186) | public WebDriver get() throws InterruptedException {
method returnToPool (line 216) | public void returnToPool(WebDriver webDriver) {
method checkRunning (line 221) | protected void checkRunning() {
method closeAll (line 227) | public void closeAll() {
FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java
class SeleniumTest (line 21) | public class SeleniumTest {
method testSelenium (line 23) | @Ignore("need chrome driver")
FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java
class SeleniumDownloaderTest (line 15) | public class SeleniumDownloaderTest {
method test (line 19) | @Ignore("need chrome driver")
method testBaiduWenku (line 41) | @Ignore
FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java
class WebDriverPoolTest (line 12) | public class WebDriverPoolTest {
method test (line 16) | @Ignore("need chrome driver")
FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/GooglePlayProcessor.java
class GooglePlayProcessor (line 17) | public class GooglePlayProcessor implements PageProcessor {
method process (line 21) | @Override
method getSite (line 28) | @Override
method main (line 36) | public static void main(String[] args) {
FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java
class HuabanProcessor (line 17) | public class HuabanProcessor implements PageProcessor {
method process (line 21) | @Override
method getSite (line 31) | @Override
method main (line 39) | public static void main(String[] args) {
Condensed preview — 310 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,172K chars).
[
{
"path": ".gitignore",
"chars": 1329,
"preview": "target/\npom.xml.tag\npom.xml.releaseBackup\npom.xml.versionsBackup\npom.xml.next\nrelease.properties\ndependency-reduced-pom."
},
{
"path": ".travis.yml",
"chars": 34,
"preview": "language: java \njdk:\n - openjdk9\n"
},
{
"path": "LICENSE",
"chars": 10255,
"preview": "Apache License\nVersion 2.0, January 2004\nhttp://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AN"
},
{
"path": "README-zh.md",
"chars": 5210,
"preview": "\n\n\n[\n\n[Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/REA"
},
{
"path": "pom.xml",
"chars": 13561,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project\n xmlns=\"http://maven.apache.org/POM/4.0.0\"\n xmlns:xsi=\"http://www."
},
{
"path": "src/site/site.xml",
"chars": 860,
"preview": "<project xmlns=\"http://maven.apache.org/DECORATION/1.6.0\"\n xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n "
},
{
"path": "webmagic-core/README.md",
"chars": 85,
"preview": "webmagic-core\n-------\nwebmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。"
},
{
"path": "webmagic-core/module_webmagic-core.xml",
"chars": 8553,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project name=\"module_webmagic-core\" default=\"compile.module.webmagic-core\">\n <d"
},
{
"path": "webmagic-core/pom.xml",
"chars": 2093,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project\n xmlns=\"http://maven.apache.org/POM/4.0.0\"\n xmlns:xsi=\"http://www."
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/Page.java",
"chars": 8423,
"preview": "package us.codecraft.webmagic;\n\nimport org.apache.commons.lang3.StringUtils;\nimport us.codecraft.webmagic.selector.Html;"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/Request.java",
"chars": 5159,
"preview": "package us.codecraft.webmagic;\n\nimport java.io.Serializable;\nimport java.util.Collections;\nimport java.util.HashMap;\nimp"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java",
"chars": 1800,
"preview": "package us.codecraft.webmagic;\n\nimport java.util.LinkedHashMap;\nimport java.util.Map;\n\n/**\n * Object contains extract re"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/Site.java",
"chars": 11231,
"preview": "package us.codecraft.webmagic;\n\nimport java.util.HashMap;\nimport java.util.HashSet;\nimport java.util.LinkedHashMap;\nimpo"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java",
"chars": 22154,
"preview": "package us.codecraft.webmagic;\n\n\nimport java.io.Closeable;\nimport java.io.IOException;\nimport java.util.ArrayList;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java",
"chars": 486,
"preview": "package us.codecraft.webmagic;\n\n/**\n * Listener of Spider on page processing. Used for monitor and such on.\n *\n * @autho"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java",
"chars": 1577,
"preview": "package us.codecraft.webmagic;\n\nimport java.util.concurrent.TimeUnit;\nimport java.util.concurrent.locks.Condition;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/Task.java",
"chars": 468,
"preview": "package us.codecraft.webmagic;\n\n/**\n * Interface for identifying different tasks.<br>\n *\n * @author code4crafter@gmail.c"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java",
"chars": 2445,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Request;\nimpo"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java",
"chars": 1803,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport java.net.URI;\n\nimport org.apache.http.HttpRequest;\nimport org.apache.h"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java",
"chars": 917,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Request;\nimpo"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java",
"chars": 5247,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport java.io.IOException;\nimport java.nio.charset.Charset;\nimport java.util"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java",
"chars": 6616,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport org.apache.commons.lang3.JavaVersion;\nimport org.apache.commons.lang3."
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java",
"chars": 834,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport org.apache.http.client.methods.HttpUriRequest;\nimport org.apache.http."
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java",
"chars": 5584,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport org.apache.http.HttpHost;\nimport org.apache.http.auth.AuthState;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html",
"chars": 106,
"preview": "<html>\n\t<body>\nDownloader is the part that downloads web pages and store in Page object.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java",
"chars": 3094,
"preview": "package us.codecraft.webmagic.model;\n\nimport org.apache.http.NameValuePair;\nimport org.apache.http.client.utils.URLEncod"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/package.html",
"chars": 104,
"preview": "<html>\n\t<body>\n <div class=\"en\">\n Main class \"Spider\" and models.\n </div>\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java",
"chars": 442,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport java.util.List;\n\n/**\n * Pipeline that can collect and store results. <br"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java",
"chars": 647,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\n\ni"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java",
"chars": 1996,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport org.apache.commons.codec.digest.DigestUtils;\nimport org.slf4j.Logger;\nim"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java",
"chars": 590,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\n\n/"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java",
"chars": 615,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\n\ni"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html",
"chars": 96,
"preview": "<html>\n\t<body>\nPipeline is the persistent and offline process part of crawler.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java",
"chars": 837,
"preview": "package us.codecraft.webmagic.processor;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\n\n/**\n * "
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java",
"chars": 1121,
"preview": "package us.codecraft.webmagic.processor;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\n\nimport "
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java",
"chars": 1814,
"preview": "package us.codecraft.webmagic.processor.example;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Result"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java",
"chars": 1356,
"preview": "package us.codecraft.webmagic.processor.example;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java",
"chars": 1280,
"preview": "package us.codecraft.webmagic.processor.example;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html",
"chars": 90,
"preview": "<html>\n\t<body>\nPageProcessor custom part of a crawler for specific site.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java",
"chars": 3810,
"preview": "package us.codecraft.webmagic.proxy;\n\nimport java.io.UnsupportedEncodingException;\nimport java.net.URI;\nimport java.net."
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java",
"chars": 1103,
"preview": "package us.codecraft.webmagic.proxy;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Request;\nimport us"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java",
"chars": 1749,
"preview": "package us.codecraft.webmagic.proxy;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Request;\nimport us"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java",
"chars": 1733,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\nimport us.codecraft.w"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java",
"chars": 358,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport us.codecraft.webmagic.Task;\n\n/**\n * The scheduler whose requests can be"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java",
"chars": 2260,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport us.codecraft.webmagic.Request;\nimport us.codecraft.webmagic.Task;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java",
"chars": 1802,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport java.util.concurrent.BlockingQueue;\nimport java.util.concurrent.LinkedB"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java",
"chars": 670,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport us.codecraft.webmagic.Request;\nimport us.codecraft.webmagic.Task;\n\n/**\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java",
"chars": 770,
"preview": "package us.codecraft.webmagic.scheduler.component;\n\nimport us.codecraft.webmagic.Request;\nimport us.codecraft.webmagic.T"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java",
"chars": 840,
"preview": "package us.codecraft.webmagic.scheduler.component;\n\nimport us.codecraft.webmagic.Request;\nimport us.codecraft.webmagic.T"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html",
"chars": 56,
"preview": "<html>\n\t<body>\nComponent of scheduler.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html",
"chars": 73,
"preview": "<html>\n\t<body>\nScheduler is the part of url management.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java",
"chars": 2948,
"preview": "package us.codecraft.webmagic.selector;\n\n\nimport java.util.ArrayList;\nimport java.util.List;\n\nimport org.apache.commons."
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java",
"chars": 1631,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.ArrayList;\nimport java.util.List;\n\n/**\n * All selectors will b"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java",
"chars": 1581,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java",
"chars": 2761,
"preview": "package us.codecraft.webmagic.selector;\n\n\nimport java.util.ArrayList;\nimport java.util.List;\nimport org.apache.commons.c"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java",
"chars": 652,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.jsoup.nodes.Element;\n\nimport java.util.List;\n\n/**\n * Selector(extrac"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java",
"chars": 2329,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java",
"chars": 4433,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes.Element;\n\nimport java.u"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java",
"chars": 1487,
"preview": "package us.codecraft.webmagic.selector;\n\nimport com.alibaba.fastjson.JSON;\nimport us.codecraft.xsoup.XTokenQueue;\n\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java",
"chars": 1863,
"preview": "package us.codecraft.webmagic.selector;\n\n\nimport java.util.ArrayList;\nimport java.util.List;\nimport java.util.Map;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java",
"chars": 1300,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.ArrayList;\nimport java.util.List;\n\nimport org.apache.commons.l"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java",
"chars": 1212,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.ArrayList;\nimport java.util.List;\n\n/**\n * All extractors will "
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java",
"chars": 1930,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.ArrayList;\nimport java.util.List;\n\n/**\n * Selectable plain tex"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java",
"chars": 565,
"preview": "package us.codecraft.webmagic.selector;\n\n/**\n * Object contains regex results.<br>\n * For multi group result extension.<"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java",
"chars": 2849,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.apache.commons.lang3.StringUtils;\n\nimport java.util.ArrayList;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java",
"chars": 1118,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.List;\nimport java.util.regex.Matcher;\nimport java.util.regex.P"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java",
"chars": 3058,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.List;\n\n/**\n * Selectable text.<br>\n *\n * @author code4crafter@"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java",
"chars": 583,
"preview": "package us.codecraft.webmagic.selector;\n\nimport java.util.List;\n\n/**\n * Selector(extractor) for text.<br>\n *\n * @author "
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java",
"chars": 1372,
"preview": "package us.codecraft.webmagic.selector;\n\n/**\n * Convenient methods for selectors.<br>\n *\n * @author code4crafter@gmail.c"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java",
"chars": 3224,
"preview": "package us.codecraft.webmagic.selector;\n\nimport us.codecraft.webmagic.utils.Experimental;\n\nimport java.util.ArrayList;\ni"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java",
"chars": 1288,
"preview": "package us.codecraft.webmagic.selector;\n\n\nimport java.util.List;\nimport org.apache.commons.collections4.CollectionUtils;"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html",
"chars": 145,
"preview": "<html>\n\t<body>\nSelectors for page extraction. Core API is the interface Selectable,and internal core is the interface Se"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/thread/CountableThreadPool.java",
"chars": 2676,
"preview": "package us.codecraft.webmagic.thread;\n\nimport java.util.concurrent.ExecutorService;\nimport java.util.concurrent.Executor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java",
"chars": 657,
"preview": "package us.codecraft.webmagic.utils;\n\n/**\n * @author hooy\n */\npublic class BaseSelectorUtils {\n\n /**\n * Jsoup/Htm"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java",
"chars": 2395,
"preview": "package us.codecraft.webmagic.utils;\n\nimport org.apache.commons.lang3.StringUtils;\nimport org.jsoup.Jsoup;\nimport org.js"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java",
"chars": 153,
"preview": "package us.codecraft.webmagic.utils;\n\n/**\n * Stands for features unstable.\n * @author code4crafter@gmail.com <br>\n */\npu"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java",
"chars": 1147,
"preview": "package us.codecraft.webmagic.utils;\n\nimport java.io.File;\n\n/**\n * Base object of file persistence.\n *\n * @author code4c"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java",
"chars": 768,
"preview": "package us.codecraft.webmagic.utils;\n\nimport org.apache.http.Header;\n\nimport java.util.ArrayList;\nimport java.util.HashM"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java",
"chars": 858,
"preview": "package us.codecraft.webmagic.utils;\n\n/**\n * Some constants of Http protocal.\n * @author code4crafer@gmail.com\n * @since"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java",
"chars": 215,
"preview": "package us.codecraft.webmagic.utils;\n\n/**\n * @author yihua.huang@dianping.com\n */\npublic abstract class NumberUtils {\n\n "
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java",
"chars": 986,
"preview": "package us.codecraft.webmagic.utils;\n\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\nimport us.codecraft.webma"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java",
"chars": 3831,
"preview": "package us.codecraft.webmagic.utils;\n\nimport org.apache.commons.lang3.StringUtils;\nimport us.codecraft.webmagic.Request;"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java",
"chars": 637,
"preview": "package us.codecraft.webmagic.utils;\n\nimport java.util.ArrayList;\nimport java.util.HashSet;\nimport java.util.List;\nimpor"
},
{
"path": "webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html",
"chars": 58,
"preview": "<html>\n\t<body>\nStatic utils of webmagic.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java",
"chars": 2278,
"preview": "package us.codecraft.webmagic;\n\nimport org.junit.Ignore;\nimport org.junit.Test;\nimport us.codecraft.webmagic.selector.Ht"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java",
"chars": 1644,
"preview": "package us.codecraft.webmagic;\n\nimport static org.assertj.core.api.Assertions.assertThat;\n\nimport java.util.Collections;"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java",
"chars": 503,
"preview": "package us.codecraft.webmagic;\n\nimport org.junit.Test;\n\n\nimport static org.assertj.core.api.Assertions.assertThat;\n\n/**\n"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java",
"chars": 1491,
"preview": "package us.codecraft.webmagic;\n\nimport static org.junit.Assert.assertEquals;\nimport static org.junit.Assert.assertTrue;\n"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java",
"chars": 2633,
"preview": "package us.codecraft.webmagic;\n\nimport org.junit.Ignore;\nimport org.junit.Test;\nimport us.codecraft.webmagic.downloader."
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java",
"chars": 16619,
"preview": "package us.codecraft.webmagic.downloader;\n\n\nimport java.io.IOException;\nimport java.io.UnsupportedEncodingException;\nimp"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java",
"chars": 884,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport org.junit.Test;\nimport us.codecraft.webmagic.Request;\nimport us.codecr"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java",
"chars": 1067,
"preview": "package us.codecraft.webmagic.downloader;\n\n\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.nio.char"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java",
"chars": 790,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport org.junit.Test;\nimport us.codecraft.webmagic.Page;\nimport us.codecraft"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java",
"chars": 1100,
"preview": "package us.codecraft.webmagic.example;\n\nimport org.junit.Test;\nimport us.codecraft.webmagic.ResultItems;\nimport us.codec"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java",
"chars": 1103,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport org.junit.BeforeClass;\nimport org.junit.Test;\nimport us.codecraft.webmag"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java",
"chars": 791,
"preview": "package us.codecraft.webmagic.processor;\n\nimport static org.junit.Assert.assertEquals;\n\nimport org.junit.Test;\n\nimport u"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java",
"chars": 3676,
"preview": "package us.codecraft.webmagic.proxy;\n\nimport static org.junit.Assert.assertEquals;\nimport static org.junit.Assert.assert"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java",
"chars": 1112,
"preview": "package us.codecraft.webmagic.proxy;\n\nimport org.junit.Test;\nimport org.mockito.Mockito;\n\nimport us.codecraft.webmagic.R"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java",
"chars": 1892,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport org.junit.Test;\nimport org.junit.runner.RunWith;\nimport org.mockito.Moc"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/PrioritySchedulerTest.java",
"chars": 2064,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport junit.framework.Assert;\nimport org.junit.Test;\nimport us.codecraft.webm"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java",
"chars": 2356,
"preview": "package us.codecraft.webmagic.selector;\n\nimport static org.junit.Assert.assertEquals;\n\nimport java.util.ArrayList;\nimpor"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java",
"chars": 1442,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.jsoup.Jsoup;\nimport org.jsoup.nodes.Document;\nimport org.jsoup.nodes"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java",
"chars": 1231,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.junit.Test;\n\nimport static org.assertj.core.api.Assertions.assertTha"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java",
"chars": 2234,
"preview": "package us.codecraft.webmagic.selector;\n\nimport com.alibaba.fastjson.JSON;\nimport com.alibaba.fastjson.JSONObject;\nimpor"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java",
"chars": 788,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.junit.Test;\n\nimport static org.assertj.core.api.Assertions.assertTha"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java",
"chars": 850,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.jsoup.Jsoup;\nimport org.junit.Test;\n\nimport java.util.List;\n\n/**\n * "
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java",
"chars": 1769,
"preview": "package us.codecraft.webmagic.selector;\n\nimport static org.junit.Assert.assertEquals;\n\nimport java.util.ArrayList;\nimpor"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java",
"chars": 1241,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.assertj.core.api.Assertions;\nimport org.junit.Test;\n\n/**\n * @author "
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java",
"chars": 1133,
"preview": "package us.codecraft.webmagic.selector;\n\nimport org.junit.Test;\n\nimport java.util.List;\n\nimport static org.assertj.core."
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java",
"chars": 325,
"preview": "package us.codecraft.webmagic.utils;\n\nimport static org.junit.jupiter.api.Assertions.assertNull;\n\nimport java.io.IOExcep"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java",
"chars": 463,
"preview": "package us.codecraft.webmagic.utils;\n\nimport org.junit.Assert;\nimport org.junit.Test;\n\npublic class NumberUtilsTest {\n\n\t"
},
{
"path": "webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java",
"chars": 1911,
"preview": "package us.codecraft.webmagic.utils;\n\nimport static org.junit.Assert.assertNull;\n\nimport org.junit.Assert;\nimport org.ju"
},
{
"path": "webmagic-core/src/test/resources/html/mock-github.html",
"chars": 114347,
"preview": "\n\n\n\n\n<!DOCTYPE html>\n<html lang=\"en\" class=\" is-u2f-enabled\">\n<head prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/f"
},
{
"path": "webmagic-core/src/test/resources/log4j2-test.xml",
"chars": 505,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<Configuration>\n <Appenders>\n <Console name=\"stdout\" target=\"SYSTEM_OUT"
},
{
"path": "webmagic-coverage/pom.xml",
"chars": 2440,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project\n xmlns=\"http://maven.apache.org/POM/4.0.0\"\n xmlns:xsi=\"http://www."
},
{
"path": "webmagic-extension/README.md",
"chars": 64,
"preview": "webmagic-extension\n-------\nwebmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。"
},
{
"path": "webmagic-extension/pom.xml",
"chars": 1440,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project\n xmlns=\"http://maven.apache.org/POM/4.0.0\"\n xmlns:xsi=\"http://www."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java",
"chars": 1063,
"preview": "package us.codecraft.webmagic;\n\nimport us.codecraft.webmagic.utils.Experimental;\n\nimport java.util.Collection;\n\n/**\n * E"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/SimpleHttpClient.java",
"chars": 1347,
"preview": "package us.codecraft.webmagic;\n\nimport us.codecraft.webmagic.downloader.HttpClientDownloader;\nimport us.codecraft.webmag"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java",
"chars": 1530,
"preview": "package us.codecraft.webmagic.configurable;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\nimpor"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java",
"chars": 153,
"preview": "package us.codecraft.webmagic.configurable;\n\n/**\n * @author code4crafter@gmail.com\n */\npublic enum ExpressionType {\n\n "
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java",
"chars": 2769,
"preview": "package us.codecraft.webmagic.configurable;\n\nimport us.codecraft.webmagic.selector.JsonPathSelector;\nimport us.codecraft"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java",
"chars": 4163,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\nimport us.codecraft."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java",
"chars": 1380,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.model.OOSpider;\n"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java",
"chars": 1781,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.model.OOSpider;\n"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java",
"chars": 2581,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.model.ConsolePag"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java",
"chars": 1900,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.model.ConsolePag"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java",
"chars": 1243,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\nimport us."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java",
"chars": 869,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Spider;\nimport us.codecraft.webmagic.monitor.Spider"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java",
"chars": 1462,
"preview": "package us.codecraft.webmagic.example;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.model.OOSpider;\n"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java",
"chars": 2908,
"preview": "package us.codecraft.webmagic.example;\n\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\n\nimport us.codecraft.we"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java",
"chars": 1698,
"preview": "package us.codecraft.webmagic.handler;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Site;\nimport us."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java",
"chars": 1323,
"preview": "package us.codecraft.webmagic.handler;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\nimp"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java",
"chars": 331,
"preview": "package us.codecraft.webmagic.handler;\n\n/**\n * @author code4crafer@gmail.com\n */\npublic abstract class PatternProcessor "
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java",
"chars": 899,
"preview": "package us.codecraft.webmagic.handler;\n\nimport us.codecraft.webmagic.Request;\n\nimport java.util.regex.Pattern;\n\n/**\n * C"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java",
"chars": 456,
"preview": "package us.codecraft.webmagic.handler;\n\nimport us.codecraft.webmagic.Request;\n\n/**\n * @author code4crafer@gmail.com\n * @"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java",
"chars": 373,
"preview": "package us.codecraft.webmagic.handler;\n\nimport us.codecraft.webmagic.Page;\n\n/**\n * @author code4crafter@gmail.com\n */\npu"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java",
"chars": 507,
"preview": "package us.codecraft.webmagic.handler;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\n\n/*"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java",
"chars": 322,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.Page;\n\n/**\n * Interface to be implemented by page mod"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java",
"chars": 521,
"preview": "package us.codecraft.webmagic.model;\n\nimport org.apache.commons.lang3.builder.ToStringBuilder;\nimport us.codecraft.webma"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java",
"chars": 843,
"preview": "package us.codecraft.webmagic.model;\n\nimport lombok.Getter;\nimport lombok.Setter;\n\nimport us.codecraft.webmagic.model.so"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java",
"chars": 814,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.model.formatter.ObjectFormatter;\nimport us.codecraft."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java",
"chars": 409,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.utils.Experimental;\n\n/**\n * Interface to be implement"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java",
"chars": 3199,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Request;\nimport us"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java",
"chars": 1754,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\nimpor"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java",
"chars": 3306,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.Spider;\nimport us."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java",
"chars": 605,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.Page;\n\nimport java.util.List;\n\n/**\n * @author code4cr"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java",
"chars": 1411,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.ResultItems;\nimport us.codecraft.webmagic.Task;\nimpor"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java",
"chars": 10203,
"preview": "package us.codecraft.webmagic.model;\n\nimport org.apache.commons.lang3.StringUtils;\nimport org.slf4j.Logger;\nimport org.s"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java",
"chars": 2269,
"preview": "package us.codecraft.webmagic.model.annotation;\n\nimport java.lang.annotation.ElementType;\nimport java.lang.annotation.Re"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java",
"chars": 2030,
"preview": "package us.codecraft.webmagic.model.annotation;\n\nimport java.lang.annotation.ElementType;\nimport java.lang.annotation.Re"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java",
"chars": 1267,
"preview": "package us.codecraft.webmagic.model.annotation;\n\nimport java.lang.annotation.ElementType;\nimport java.lang.annotation.Re"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java",
"chars": 1202,
"preview": "package us.codecraft.webmagic.model.annotation;\n\nimport us.codecraft.webmagic.model.formatter.ObjectFormatter;\n\nimport j"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java",
"chars": 1159,
"preview": "package us.codecraft.webmagic.model.annotation;\n\nimport java.lang.annotation.ElementType;\nimport java.lang.annotation.Re"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java",
"chars": 1147,
"preview": "package us.codecraft.webmagic.model.annotation;\n\nimport java.lang.annotation.ElementType;\nimport java.lang.annotation.Re"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html",
"chars": 70,
"preview": "<html>\n\t<body>\nAnnotations for defining a extractor.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java",
"chars": 1450,
"preview": "package us.codecraft.webmagic.model.fields;\n\nimport java.lang.reflect.InvocationTargetException;\nimport java.util.ArrayL"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java",
"chars": 1188,
"preview": "package us.codecraft.webmagic.model.fields;\n\nimport java.lang.reflect.InvocationTargetException;\n\nimport org.slf4j.Logge"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java",
"chars": 900,
"preview": "package us.codecraft.webmagic.model.fields;\n\nimport java.lang.reflect.InvocationTargetException;\n\nimport org.slf4j.Logge"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java",
"chars": 2274,
"preview": "package us.codecraft.webmagic.model.formatter;\n\npublic interface BasicClassDetector {\n Class<?> detectBasicClass(Clas"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java",
"chars": 4185,
"preview": "package us.codecraft.webmagic.model.formatter;\n\nimport java.util.Arrays;\nimport java.util.List;\n\n/**\n * @author code4cra"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java",
"chars": 781,
"preview": "package us.codecraft.webmagic.model.formatter;\n\nimport org.apache.commons.lang3.time.DateUtils;\n\nimport java.util.Date;\n"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java",
"chars": 235,
"preview": "package us.codecraft.webmagic.model.formatter;\n\n/**\n * @author code4crafter@gmail.com\n */\npublic interface ObjectFormatt"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java",
"chars": 2060,
"preview": "package us.codecraft.webmagic.model.formatter;\n\nimport us.codecraft.webmagic.model.annotation.Formatter;\n\nimport java.la"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java",
"chars": 1057,
"preview": "package us.codecraft.webmagic.model.formatter;\n\nimport java.util.Map;\nimport java.util.concurrent.ConcurrentHashMap;\n\n/*"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html",
"chars": 88,
"preview": "<html>\n\t<body>\nPage model and annotations used to customize a crawler.\n\t</body>\n</html>\n"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java",
"chars": 2763,
"preview": "package us.codecraft.webmagic.model.sources;\n\nimport java.util.List;\n\nimport us.codecraft.webmagic.Page;\nimport us.codec"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java",
"chars": 717,
"preview": "package us.codecraft.webmagic.model.sources;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.model.Fiel"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java",
"chars": 3804,
"preview": "package us.codecraft.webmagic.monitor;\n\nimport java.lang.management.ManagementFactory;\nimport java.util.ArrayList;\nimpor"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java",
"chars": 2593,
"preview": "package us.codecraft.webmagic.monitor;\n\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\nimport us.codecraft.web"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java",
"chars": 586,
"preview": "package us.codecraft.webmagic.monitor;\n\nimport java.util.Date;\nimport java.util.List;\n\n/**\n * @author code4crafer@gmail."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java",
"chars": 474,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport us.codecraft.webmagic.Task;\n\nimport java.util.ArrayList;\nimport java.uti"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java",
"chars": 1796,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport org.apache.commons.codec.digest.DigestUtils;\nimport org.apache.commons.l"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java",
"chars": 1824,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport com.alibaba.fastjson.JSON;\nimport org.apache.commons.codec.digest.Digest"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java",
"chars": 1426,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport com.alibaba.fastjson.JSON;\nimport org.apache.commons.codec.digest.Digest"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java",
"chars": 4383,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport us.codecraft.webmagic.MultiPageModel;\nimport us.codecraft.webmagic.Resul"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java",
"chars": 291,
"preview": "package us.codecraft.webmagic.pipeline;\n\nimport us.codecraft.webmagic.Task;\n\n/**\n * Implements PageModelPipeline to pers"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java",
"chars": 2125,
"preview": "package us.codecraft.webmagic.scheduler;\n\n/**\n * @author code4crafter@gmail.com\n * Date: 16/12/18\n * Tim"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java",
"chars": 5328,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport org.apache.commons.lang3.math.NumberUtils;\nimport us.codecraft.webmagic"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java",
"chars": 3889,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport java.util.Set;\n\nimport org.apache.commons.codec.digest.DigestUtils;\nimp"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java",
"chars": 4088,
"preview": "package us.codecraft.webmagic.scheduler;\n\nimport org.apache.commons.codec.digest.DigestUtils;\nimport org.apache.commons."
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java",
"chars": 676,
"preview": "package us.codecraft.webmagic.utils;\n\nimport java.lang.reflect.Field;\nimport java.util.LinkedHashSet;\nimport java.util.S"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java",
"chars": 2359,
"preview": "package us.codecraft.webmagic.utils;\n\nimport java.util.Map;\n\n/**\n * @author code4crafter@gmail.com\n */\npublic class Doub"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java",
"chars": 1367,
"preview": "package us.codecraft.webmagic.utils;\n\nimport us.codecraft.webmagic.model.annotation.ExtractBy;\nimport us.codecraft.webma"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java",
"chars": 1204,
"preview": "package us.codecraft.webmagic.utils;\n\nimport java.net.Inet6Address;\nimport java.net.InetAddress;\nimport java.net.Network"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java",
"chars": 1089,
"preview": "package us.codecraft.webmagic.utils;\n\n/**\n * @author code4crafter@gmail.com\n */\n\nimport java.util.HashMap;\nimport java.u"
},
{
"path": "webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java",
"chars": 1100,
"preview": "package us.codecraft.webmagic.utils;\n\nimport us.codecraft.webmagic.Request;\n\nimport java.util.ArrayList;\nimport java.uti"
},
{
"path": "webmagic-extension/src/main/resources/crawl.js",
"chars": 379,
"preview": "var system = require('system');\nvar url = system.args[1];\n\nvar page = require('webpage').create();\npage.settings.loadIma"
},
{
"path": "webmagic-extension/src/main/resources/spider-config-draft.xml",
"chars": 769,
"preview": "<!--This is a draft of config file.\nIf you have any advice, go https://github.com/code4craft/webmagic/issues/106 and com"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java",
"chars": 330,
"preview": "package us.codecraft.webmagic;\n\nimport junit.framework.Assert;\nimport us.codecraft.webmagic.pipeline.PageModelPipeline;\n"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java",
"chars": 255,
"preview": "package us.codecraft.webmagic;\n\nimport us.codecraft.webmagic.pipeline.Pipeline;\n\n/**\n * @author code4crafter@gmail.com\n "
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java",
"chars": 2581,
"preview": "package us.codecraft.webmagic;\n\nimport org.junit.Ignore;\nimport org.junit.Test;\nimport us.codecraft.webmagic.model.After"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java",
"chars": 1530,
"preview": "package us.codecraft.webmagic.configurable;\n\nimport org.junit.Test;\nimport us.codecraft.webmagic.ResultItems;\nimport us."
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java",
"chars": 73619,
"preview": "package us.codecraft.webmagic.downloader;\n\nimport us.codecraft.webmagic.Page;\nimport us.codecraft.webmagic.Request;\nimpo"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java",
"chars": 831,
"preview": "package us.codecraft.webmagic.formatter;\n\nimport org.apache.commons.lang3.time.DateFormatUtils;\nimport org.apache.common"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java",
"chars": 294,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.model.annotation.ExtractBy;\n\n/**\n * @author code4craf"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java",
"chars": 978,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webmagic.model.annotation.E"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java",
"chars": 402,
"preview": "package us.codecraft.webmagic.model;\n\nimport us.codecraft.webmagic.model.annotation.ExtractBy;\n\n/**\n * @author code4craf"
},
{
"path": "webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java",
"chars": 960,
"preview": "package us.codecraft.webmagic.model;\n\nimport org.junit.Test;\nimport us.codecraft.webmagic.Site;\nimport us.codecraft.webm"
}
]
// ... and 110 more files (download for full content)
About this extraction
This page contains the full source code of the code4craft/webmagic GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 310 files (1.0 MB), approximately 281.4k tokens, and a symbol index with 1468 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.