Repository: code4craft/webmagic Branch: develop Commit: 67816a19d68a Files: 310 Total size: 1.0 MB Directory structure: gitextract_m56n222u/ ├── .gitignore ├── .travis.yml ├── LICENSE ├── README-zh.md ├── README.md ├── pom.xml ├── src/ │ └── site/ │ └── site.xml ├── webmagic-core/ │ ├── README.md │ ├── module_webmagic-core.xml │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── us/ │ │ └── codecraft/ │ │ └── webmagic/ │ │ ├── Page.java │ │ ├── Request.java │ │ ├── ResultItems.java │ │ ├── Site.java │ │ ├── Spider.java │ │ ├── SpiderListener.java │ │ ├── SpiderScheduler.java │ │ ├── Task.java │ │ ├── downloader/ │ │ │ ├── AbstractDownloader.java │ │ │ ├── CustomRedirectStrategy.java │ │ │ ├── Downloader.java │ │ │ ├── HttpClientDownloader.java │ │ │ ├── HttpClientGenerator.java │ │ │ ├── HttpClientRequestContext.java │ │ │ ├── HttpUriRequestConverter.java │ │ │ └── package.html │ │ ├── model/ │ │ │ └── HttpRequestBody.java │ │ ├── package.html │ │ ├── pipeline/ │ │ │ ├── CollectorPipeline.java │ │ │ ├── ConsolePipeline.java │ │ │ ├── FilePipeline.java │ │ │ ├── Pipeline.java │ │ │ ├── ResultItemsCollectorPipeline.java │ │ │ └── package.html │ │ ├── processor/ │ │ │ ├── PageProcessor.java │ │ │ ├── SimplePageProcessor.java │ │ │ ├── example/ │ │ │ │ ├── BaiduBaikePageProcessor.java │ │ │ │ ├── GithubRepoPageProcessor.java │ │ │ │ └── ZhihuPageProcessor.java │ │ │ └── package.html │ │ ├── proxy/ │ │ │ ├── Proxy.java │ │ │ ├── ProxyProvider.java │ │ │ └── SimpleProxyProvider.java │ │ ├── scheduler/ │ │ │ ├── DuplicateRemovedScheduler.java │ │ │ ├── MonitorableScheduler.java │ │ │ ├── PriorityScheduler.java │ │ │ ├── QueueScheduler.java │ │ │ ├── Scheduler.java │ │ │ ├── component/ │ │ │ │ ├── DuplicateRemover.java │ │ │ │ ├── HashSetDuplicateRemover.java │ │ │ │ └── package.html │ │ │ └── package.html │ │ ├── selector/ │ │ │ ├── AbstractSelectable.java │ │ │ ├── AndSelector.java │ │ │ ├── BaseElementSelector.java │ │ │ ├── CssSelector.java │ │ │ ├── 
ElementSelector.java │ │ │ ├── Html.java │ │ │ ├── HtmlNode.java │ │ │ ├── Json.java │ │ │ ├── JsonPathSelector.java │ │ │ ├── LinksSelector.java │ │ │ ├── OrSelector.java │ │ │ ├── PlainText.java │ │ │ ├── RegexResult.java │ │ │ ├── RegexSelector.java │ │ │ ├── ReplaceSelector.java │ │ │ ├── Selectable.java │ │ │ ├── Selector.java │ │ │ ├── Selectors.java │ │ │ ├── SmartContentSelector.java │ │ │ ├── XpathSelector.java │ │ │ └── package.html │ │ ├── thread/ │ │ │ └── CountableThreadPool.java │ │ └── utils/ │ │ ├── BaseSelectorUtils.java │ │ ├── CharsetUtils.java │ │ ├── Experimental.java │ │ ├── FilePersistentBase.java │ │ ├── HttpClientUtils.java │ │ ├── HttpConstant.java │ │ ├── NumberUtils.java │ │ ├── ProxyUtils.java │ │ ├── UrlUtils.java │ │ ├── WMCollections.java │ │ └── package.html │ └── test/ │ ├── java/ │ │ └── us/ │ │ └── codecraft/ │ │ └── webmagic/ │ │ ├── HtmlTest.java │ │ ├── RequestTest.java │ │ ├── ResultItemsTest.java │ │ ├── SiteTest.java │ │ ├── SpiderTest.java │ │ ├── downloader/ │ │ │ ├── HttpClientDownloaderTest.java │ │ │ ├── HttpUriRequestConverterTest.java │ │ │ ├── MockGithubDownloader.java │ │ │ └── SSLCompatibilityTest.java │ │ ├── example/ │ │ │ └── GithubRepoPageProcessorTest.java │ │ ├── pipeline/ │ │ │ └── FilePipelineTest.java │ │ ├── processor/ │ │ │ └── PageProcessorTest.java │ │ ├── proxy/ │ │ │ ├── ProxyTest.java │ │ │ └── SimpleProxyProviderTest.java │ │ ├── scheduler/ │ │ │ ├── DuplicateRemovedSchedulerTest.java │ │ │ └── PrioritySchedulerTest.java │ │ ├── selector/ │ │ │ ├── AndSelectorTest.java │ │ │ ├── CssSelectorTest.java │ │ │ ├── ExtractorsTest.java │ │ │ ├── JsonPathSelectorTest.java │ │ │ ├── JsonTest.java │ │ │ ├── LinksSelectorTest.java │ │ │ ├── OrSelectorTest.java │ │ │ ├── RegexSelectorTest.java │ │ │ └── SelectorTest.java │ │ └── utils/ │ │ ├── CharsetUtilsTest.java │ │ ├── NumberUtilsTest.java │ │ └── UrlUtilsTest.java │ └── resources/ │ ├── html/ │ │ └── mock-github.html │ └── log4j2-test.xml ├── 
webmagic-coverage/ │ └── pom.xml ├── webmagic-extension/ │ ├── README.md │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── java/ │ │ │ └── us/ │ │ │ └── codecraft/ │ │ │ └── webmagic/ │ │ │ ├── MultiPageModel.java │ │ │ ├── SimpleHttpClient.java │ │ │ ├── configurable/ │ │ │ │ ├── ConfigurablePageProcessor.java │ │ │ │ ├── ExpressionType.java │ │ │ │ └── ExtractRule.java │ │ │ ├── downloader/ │ │ │ │ └── PhantomJSDownloader.java │ │ │ ├── example/ │ │ │ │ ├── AppStore.java │ │ │ │ ├── BaiduBaike.java │ │ │ │ ├── GithubRepo.java │ │ │ │ ├── GithubRepoApi.java │ │ │ │ ├── GithubRepoPageMapper.java │ │ │ │ ├── MonitorExample.java │ │ │ │ ├── OschinaBlog.java │ │ │ │ └── PatternProcessorExample.java │ │ │ ├── handler/ │ │ │ │ ├── CompositePageProcessor.java │ │ │ │ ├── CompositePipeline.java │ │ │ │ ├── PatternProcessor.java │ │ │ │ ├── PatternRequestMatcher.java │ │ │ │ ├── RequestMatcher.java │ │ │ │ ├── SubPageProcessor.java │ │ │ │ └── SubPipeline.java │ │ │ ├── model/ │ │ │ │ ├── AfterExtractor.java │ │ │ │ ├── ConsolePageModelPipeline.java │ │ │ │ ├── Extractor.java │ │ │ │ ├── FieldExtractor.java │ │ │ │ ├── HasKey.java │ │ │ │ ├── ModelPageProcessor.java │ │ │ │ ├── ModelPipeline.java │ │ │ │ ├── OOSpider.java │ │ │ │ ├── PageMapper.java │ │ │ │ ├── PageModelCollectorPipeline.java │ │ │ │ ├── PageModelExtractor.java │ │ │ │ ├── annotation/ │ │ │ │ │ ├── ComboExtract.java │ │ │ │ │ ├── ExtractBy.java │ │ │ │ │ ├── ExtractByUrl.java │ │ │ │ │ ├── Formatter.java │ │ │ │ │ ├── HelpUrl.java │ │ │ │ │ ├── TargetUrl.java │ │ │ │ │ └── package.html │ │ │ │ ├── fields/ │ │ │ │ │ ├── MultipleField.java │ │ │ │ │ ├── PageField.java │ │ │ │ │ └── SingleField.java │ │ │ │ ├── formatter/ │ │ │ │ │ ├── BasicClassDetector.java │ │ │ │ │ ├── BasicTypeFormatter.java │ │ │ │ │ ├── DateFormatter.java │ │ │ │ │ ├── ObjectFormatter.java │ │ │ │ │ ├── ObjectFormatterBuilder.java │ │ │ │ │ └── ObjectFormatters.java │ │ │ │ ├── package.html │ │ │ │ └── sources/ │ │ │ │ ├── Source.java │ │ │ 
│ └── SourceTextExtractor.java │ │ │ ├── monitor/ │ │ │ │ ├── SpiderMonitor.java │ │ │ │ ├── SpiderStatus.java │ │ │ │ └── SpiderStatusMXBean.java │ │ │ ├── pipeline/ │ │ │ │ ├── CollectorPageModelPipeline.java │ │ │ │ ├── FilePageModelPipeline.java │ │ │ │ ├── JsonFilePageModelPipeline.java │ │ │ │ ├── JsonFilePipeline.java │ │ │ │ ├── MultiPagePipeline.java │ │ │ │ └── PageModelPipeline.java │ │ │ ├── scheduler/ │ │ │ │ ├── BloomFilterDuplicateRemover.java │ │ │ │ ├── FileCacheQueueScheduler.java │ │ │ │ ├── RedisPriorityScheduler.java │ │ │ │ └── RedisScheduler.java │ │ │ └── utils/ │ │ │ ├── ClassUtils.java │ │ │ ├── DoubleKeyMap.java │ │ │ ├── ExtractorUtils.java │ │ │ ├── IPUtils.java │ │ │ ├── MultiKeyMapBase.java │ │ │ └── RequestUtils.java │ │ └── resources/ │ │ ├── crawl.js │ │ └── spider-config-draft.xml │ └── test/ │ ├── java/ │ │ └── us/ │ │ └── codecraft/ │ │ └── webmagic/ │ │ ├── MockPageModelPipeline.java │ │ ├── MockPipeline.java │ │ ├── SimpleHttpClientTest.java │ │ ├── configurable/ │ │ │ └── ConfigurablePageProcessorTest.java │ │ ├── downloader/ │ │ │ └── MockGithubDownloader.java │ │ ├── formatter/ │ │ │ └── DateFormatterTest.java │ │ ├── model/ │ │ │ ├── BaseRepo.java │ │ │ ├── GithubRepo.java │ │ │ ├── GithubRepoApi.java │ │ │ ├── GithubRepoTest.java │ │ │ ├── ModelPageProcessorTest.java │ │ │ ├── PageMapperTest.java │ │ │ ├── PageMocker.java │ │ │ └── PageModelExtractorTest.java │ │ ├── monitor/ │ │ │ ├── CustomSpiderStatus.java │ │ │ ├── CustomSpiderStatusMXBean.java │ │ │ ├── SeedUrlWithPortTest.java │ │ │ └── SpiderMonitorTest.java │ │ ├── processor/ │ │ │ └── GithubRepoProcessor.java │ │ ├── scheduler/ │ │ │ ├── BloomFilterDuplicateRemoverTest.java │ │ │ ├── RedisPrioritySchedulerTest.java │ │ │ └── RedisSchedulerTest.java │ │ └── utils/ │ │ ├── IPUtilsTest.java │ │ └── RequestUtilsTest.java │ └── resources/ │ ├── html/ │ │ ├── mock-github.html │ │ └── mock-webmagic.html │ ├── json/ │ │ └── mock-githubrepo.json │ └── log4j2-test.xml ├── 
webmagic-samples/ │ ├── README.md │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── java/ │ │ │ └── us/ │ │ │ └── codecraft/ │ │ │ └── webmagic/ │ │ │ ├── main/ │ │ │ │ └── QuickStarter.java │ │ │ ├── model/ │ │ │ │ └── samples/ │ │ │ │ ├── BaiduNews.java │ │ │ │ ├── Blog.java │ │ │ │ ├── DianpingFtlDataScanner.java │ │ │ │ ├── GithubRepo.java │ │ │ │ ├── IteyeBlog.java │ │ │ │ ├── JokejiModel.java │ │ │ │ ├── Kr36NewsModel.java │ │ │ │ ├── News163.java │ │ │ │ ├── OschinaAnswer.java │ │ │ │ ├── OschinaBlog.java │ │ │ │ └── QQMeishi.java │ │ │ ├── recover/ │ │ │ │ ├── DuplicateStorageRemover.java │ │ │ │ ├── MmapQueueScheduler.java │ │ │ │ └── RecoverSample.java │ │ │ └── samples/ │ │ │ ├── AlexanderMcqueenGoodsProcessor.java │ │ │ ├── AmanzonPageProcessor.java │ │ │ ├── AngularJSProcessor.java │ │ │ ├── DiandianBlogProcessor.java │ │ │ ├── DiaoyuwengProcessor.java │ │ │ ├── F58PageProcesser.java │ │ │ ├── GithubRepo.java │ │ │ ├── GithubRepoPageProcessor.java │ │ │ ├── HuxiuProcessor.java │ │ │ ├── InfoQMiniBookProcessor.java │ │ │ ├── IteyeBlogProcessor.java │ │ │ ├── KaichibaProcessor.java │ │ │ ├── MamacnPageProcessor.java │ │ │ ├── MeicanProcessor.java │ │ │ ├── NjuBBSProcessor.java │ │ │ ├── PhantomJSPageProcessor.java │ │ │ ├── QzoneBlogProcessor.java │ │ │ ├── SinaBlogProcessor.java │ │ │ ├── TianyaPageProcesser.java │ │ │ ├── ZhihuPageProcessor.java │ │ │ ├── formatter/ │ │ │ │ └── StringTemplateFormatter.java │ │ │ ├── pipeline/ │ │ │ │ ├── OneFilePipeline.java │ │ │ │ └── ReplacePipeline.java │ │ │ └── scheduler/ │ │ │ ├── DelayQueueScheduler.java │ │ │ ├── LevelLimitScheduler.java │ │ │ └── ZipCodePageProcessor.java │ │ └── resources/ │ │ ├── crawl.js │ │ └── log4j2.xml │ └── test/ │ └── java/ │ └── us/ │ └── codecraft/ │ └── webmagic/ │ ├── SpiderTest.java │ ├── model/ │ │ └── ProcessorBenchmark.java │ ├── processor/ │ │ └── SinablogProcessorTest.java │ └── samples/ │ └── scheduler/ │ └── DelayQueueSchedulerTest.java ├── webmagic-saxon/ │ ├── README.md │ 
├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── us/ │ │ └── codecraft/ │ │ └── webmagic/ │ │ └── selector/ │ │ ├── JaxpSelectorUtils.java │ │ ├── NodeSelector.java │ │ └── Xpath2Selector.java │ └── test/ │ └── java/ │ └── us/ │ └── codecraft/ │ └── webmagic/ │ └── selector/ │ └── XpathSelectorTest.java ├── webmagic-scripts/ │ ├── README.md │ ├── deploy.sh │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── groovy/ │ │ │ └── Github.groovy │ │ ├── java/ │ │ │ └── us/ │ │ │ └── codecraft/ │ │ │ └── webmagic/ │ │ │ └── scripts/ │ │ │ ├── Params.java │ │ │ ├── ScriptConsole.java │ │ │ ├── ScriptEnginePool.java │ │ │ ├── ScriptProcessor.java │ │ │ ├── ScriptProcessorBuilder.java │ │ │ ├── config/ │ │ │ │ ├── CommandLineOption.java │ │ │ │ └── ConfigLogger.java │ │ │ └── languages/ │ │ │ ├── JRuby.java │ │ │ ├── Javascript.java │ │ │ ├── Jython.java │ │ │ └── Language.java │ │ ├── kotlin/ │ │ │ └── Github.kt │ │ └── resources/ │ │ ├── js/ │ │ │ ├── defines.js │ │ │ ├── github.js │ │ │ └── oschina.js │ │ ├── python/ │ │ │ ├── defines.py │ │ │ └── oschina.py │ │ └── ruby/ │ │ ├── defines.rb │ │ ├── github.rb │ │ └── oschina.rb │ └── test/ │ ├── java/ │ │ └── us/ │ │ └── codecraft/ │ │ └── webmagic/ │ │ └── scripts/ │ │ └── ScriptProcessorTest.java │ └── resources/ │ └── log4j2-test.xml └── webmagic-selenium/ ├── README.md ├── config.ini ├── pom.xml └── src/ ├── main/ │ └── java/ │ └── us/ │ └── codecraft/ │ └── webmagic/ │ └── downloader/ │ └── selenium/ │ ├── SeleniumDownloader.java │ └── WebDriverPool.java └── test/ ├── java/ │ └── us/ │ └── codecraft/ │ └── webmagic/ │ ├── downloader/ │ │ ├── SeleniumTest.java │ │ └── selenium/ │ │ ├── SeleniumDownloaderTest.java │ │ └── WebDriverPoolTest.java │ └── samples/ │ ├── GooglePlayProcessor.java │ └── HuabanProcessor.java └── resources/ └── config.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore 
================================================ target/ pom.xml.tag pom.xml.releaseBackup pom.xml.versionsBackup pom.xml.next release.properties dependency-reduced-pom.xml buildNumber.properties .mvn/timing.properties # https://github.com/takari/maven-wrapper#usage-without-binary-jar .mvn/wrapper/maven-wrapper.jar # Eclipse m2e generated files # Eclipse Core .project # JDT-specific (Eclipse Java Development Tools) .classpath .metadata bin/ tmp/ *.tmp *.bak *.swp *~.nib local.properties .settings/ .loadpath .recommenders # External tool builders .externalToolBuilders/ # Locally stored "Eclipse launch configurations" *.launch # PyDev specific (Python IDE for Eclipse) *.pydevproject # CDT-specific (C/C++ Development Tooling) .cproject # CDT- autotools .autotools # Java annotation processor (APT) .factorypath # PDT-specific (PHP Development Tools) .buildpath # sbteclipse plugin .target # Tern plugin .tern-project # TeXlipse plugin .texlipse # STS (Spring Tool Suite) .springBeans # Code Recommenders .recommenders/ # Annotation Processing .apt_generated/ .apt_generated_test/ # Scala IDE specific (Scala & Java development for Eclipse) .cache-main .scala_dependencies .worksheet # Uncomment this line if you wish to ignore the project description file. # Typically, this file would be tracked if it contains build/dependency configurations: #.project ================================================ FILE: .travis.yml ================================================ language: java jdk: - openjdk9 ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: You must give any other recipients of the Work or Derivative Works a copy of this License; and You must cause any modified files to carry prominent notices stating that You changed the files; and You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; 
within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2025 code4craft Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: README-zh.md ================================================ ![logo](http://webmagic.io/images/logo.jpeg) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) [![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) 官方网站[http://webmagic.io/](http://webmagic.io/) >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 webmagic的主要特色: * 完全模块化的设计,强大的可扩展性。 * 核心简单但是涵盖爬虫的全部流程,灵活而强大,也是学习爬虫入门的好材料。 * 提供丰富的抽取页面API。 * 无配置,但是可通过POJO+注解形式实现一个爬虫。 * 支持多线程。 * 支持分布式。 * 支持爬取js动态渲染的页面。 * 无框架依赖,可以灵活的嵌入到项目中去。 webmagic的架构和设计参考了以下两个项目,感谢以下两个项目的作者: python爬虫 **scrapy** [https://github.com/scrapy/scrapy](https://github.com/scrapy/scrapy) Java爬虫 **Spiderman** [http://git.oschina.net/l-weiwei/spiderman](http://git.oschina.net/l-weiwei/spiderman) webmagic的github地址:[https://github.com/code4craft/webmagic](https://github.com/code4craft/webmagic)。 ## 快速开始 ### 使用maven webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: ```xml us.codecraft webmagic-core ${webmagic.version} us.codecraft webmagic-extension ${webmagic.version} ``` WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 ```xml org.slf4j slf4j-log4j12 ``` #### 项目结构 webmagic主要包括两个包: * **webmagic-core** webmagic核心部分,只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 * **webmagic-extension** webmagic的扩展模块,提供一些更方便的编写爬虫的工具。包括注解格式定义爬虫、JSON、分布式等支持。 webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较重量级的工具,所以从主要包中抽离出来,这些包需要下载源码后自己编译:: * **webmagic-saxon** webmagic与Saxon结合的模块。Saxon是一个XPath、XSLT的解析工具,webmagic依赖Saxon来进行XPath2.0语法解析支持。 * **webmagic-selenium** 
webmagic与Selenium结合的模块。Selenium是一个模拟浏览器进行页面渲染的工具,webmagic依赖Selenium进行动态页面的抓取。 在项目中,你可以根据需要依赖不同的包。 ### 不使用maven 在项目的**lib**目录下,有依赖的所有jar包,直接在IDE里import即可。 ### 第一个爬虫 #### 定制PageProcessor PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: ```java public class OschinaBlogPageProcessor implements PageProcessor { private Site site = Site.me().setDomain("my.oschina.net"); @Override public void process(Page page) { List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); page.addTargetRequests(links); page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); page.putField("content", page.getHtml().$("div.content").toString()); page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog") .addPipeline(new ConsolePipeline()).run(); } } ``` 这里通过page.addTargetRequests()方法来增加要抓取的URL,并通过page.putField()来保存抽取结果。page.getHtml().xpath()则是按照某个规则对结果进行抽取,这里抽取支持链式调用。调用结束后,toString()表示转化为单个String,all()则转化为一个String列表。 Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这里ConsolePipeline表示结果输出到控制台。 执行这个main方法,即可在控制台看到抓取结果。webmagic默认有3秒抓取间隔,请耐心等待。 #### 使用注解 webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: ```java @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog { @ExtractBy("//title") private String title; @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) private String content; @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; public static void main(String[] args) { OOSpider.create( Site.me(), new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); } } ``` 
这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 ### 详细文档 见[http://webmagic.io/docs/](http://webmagic.io/docs/)。 ### 示例 webmagic-samples目录里有一些定制PageProcessor以抽取不同站点的例子。 webmagic的使用可以参考:[oschina openapi 应用:博客搬家](https://git.oschina.net/yashin/MoveBlog) ### 协议 webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) ### 邮件组: Gmail: [https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) QQ: [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) ### QQ群: 373225642(已满) 542327088 ### 相关项目: [Gather Platform](https://github.com/gsh199449/spider) Gather Platform 数据抓取平台是一套基于Webmagic内核的,具有Web任务配置和任务管理界面的数据采集与搜索平台。 ================================================ FILE: README.md ================================================ ![logo](http://webmagic.io/images/logo.jpeg) [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) [![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) >A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction and persistence. It can simplify the development of a specific crawler. ## Features: * Simple core with high flexibility. * Simple API for HTML extraction. * Customize a crawler with annotated POJOs; no configuration needed. * Multi-threading and distribution support. * Easy to integrate. 
## Install: Add dependencies to your pom.xml: ```xml <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>${webmagic.version}</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>${webmagic.version}</version> </dependency> ``` WebMagic uses slf4j with the slf4j-log4j12 implementation. If you have customized your slf4j implementation, please exclude slf4j-log4j12. ```xml <exclusions> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> ``` ## Get Started: ### First crawler: Write a class that implements PageProcessor. For example, I wrote a crawler for GitHub repository information. ```java public class GithubRepoPageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } ``` * `page.addTargetRequests(links)` Adds URLs for crawling. 
You can also use the annotation approach: ```java @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl("https://github.com/\\w+") public class GithubRepo { @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true) private String name; @ExtractByUrl("https://github\\.com/(\\w+)/.*") private String author; @ExtractBy("//div[@id='readme']/tidyText()") private String readme; public static void main(String[] args) { OOSpider.create(Site.me().setSleepTime(1000) , new ConsolePageModelPipeline(), GithubRepo.class) .addUrl("https://github.com/code4craft").thread(5).run(); } } ``` ### Docs and samples: Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) The architecture of webmagic (designed with reference to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) There are more examples in the `webmagic-samples` package. ### License: Licensed under the [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) ### Thanks: To write webmagic, I referred to the projects below: * **Scrapy** A crawler framework in Python. [http://scrapy.org/](http://scrapy.org/) * **Spiderman** Another crawler framework in Java. [http://git.oschina.net/l-weiwei/spiderman](http://git.oschina.net/l-weiwei/spiderman) ### Mailing list: [https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) [http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) QQ Group: 373225642 542327088 ### Related Project * [Gather Platform](https://github.com/gsh199449/spider) A web console based on WebMagic for Spider configuration and management. 
================================================ FILE: pom.xml ================================================ 4.0.0 org.oxerr oxerr-parent 2.3.1 us.codecraft 1.0.4-SNAPSHOT pom UTF-8 UTF-8 11 11 3.23.1 1.5.0 4.4 2.14.0 3.12.0 2.0.19.graal 3.0.13 32.0.0-jre 2.29 4.5.13 4.4.15 3.7.1 9.4.12.1 2.9.0 5.10.2 1.10.2 2.7.3 2.23.1 2.0.2-beta 1.3.0 1.2.0 12.4 4.14.1 2.0.4 4.0.0.RELEASE 0.3.5 webmagic webmagic A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. https://github.com/code4craft/webmagic/ code4craft Yihua huang code4crafer@gmail.com yuany Ligang Yao ligang.yao@answers.com scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git WebMagic-${project.version} Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 webmagic-core webmagic-extension/ webmagic-scripts/ webmagic-selenium webmagic-saxon webmagic-samples webmagic-coverage org.apache.logging.log4j log4j-core test org.apache.logging.log4j log4j-slf4j2-impl test org.junit.jupiter junit-jupiter-engine test org.junit.vintage junit-vintage-engine test org.junit.platform junit-platform-launcher test org.junit.platform junit-platform-runner test org.mockito mockito-all ${mockito-all.version} test org.apache.httpcomponents httpclient ${httpclient.version} org.apache.httpcomponents httpcore ${httpcore.version} org.apache.logging.log4j log4j-core ${log4j2.version} org.apache.logging.log4j log4j-slf4j2-impl ${log4j2.version} com.google.guava guava ${guava.version} com.jayway.jsonpath json-path ${json-path.version} org.junit.jupiter junit-jupiter-engine ${junit.version} org.junit.vintage junit-vintage-engine ${junit.version} org.junit.platform junit-platform-launcher ${junit.platform.version} org.junit.platform junit-platform-runner ${junit.platform.version} org.slf4j slf4j-api ${slf4j.version} 
us.codecraft xsoup 0.3.7 com.alibaba fastjson ${fastjson.version} com.github.dreamhead moco-core ${moco.version} test org.slf4j slf4j-simple org.assertj assertj-core ${assertj.version} test org.apache.commons commons-lang3 ${commons-lang3.version} org.apache.commons commons-collections4 ${commons-collections4.version} commons-io commons-io ${commons-io.version} org.codehaus.groovy groovy-all ${groovy-all.version} org.jruby jruby ${jruby.version} org.python jython ${jython.version} org.seleniumhq.selenium selenium-java ${selenium-java.version} net.sf.saxon Saxon-HE ${saxon-he.version} net.sourceforge.htmlcleaner htmlcleaner ${htmlcleaner.version} com.github.detro phantomjsdriver ${phantomjsdriver.version} commons-cli commons-cli ${commons-cli.version} redis.clients jedis ${jedis.version} org.apache.maven.plugins maven-javadoc-plugin WebMagic ${project.version} en_US false aggregate aggregate site attach-javadocs jar org.jacoco jacoco-maven-plugin prepare-agent report verify report com.amashchenko.maven.plugin gitflow-maven-plugin WebMagic- ================================================ FILE: src/site/site.xml ================================================ org.apache.maven.skins maven-fluido-skin 1.11.1 true true true pull-right ================================================ FILE: webmagic-core/README.md ================================================ webmagic-core ------- webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 ================================================ FILE: webmagic-core/module_webmagic-core.xml ================================================ ================================================ FILE: webmagic-core/pom.xml ================================================ us.codecraft webmagic 1.0.4-SNAPSHOT 4.0.0 webmagic-core org.apache.httpcomponents httpclient org.apache.commons commons-lang3 us.codecraft xsoup com.github.dreamhead moco-core org.slf4j slf4j-api org.mockito mockito-all org.apache.commons commons-collections4 
org.assertj assertj-core commons-io commons-io com.jayway.jsonpath json-path com.alibaba fastjson ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Page.java ================================================ package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; /** * Object storing extracted result and urls to fetch.
* Not thread safe.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader * @see us.codecraft.webmagic.processor.PageProcessor * @since 0.1.0 */ public class Page { private Request request; private ResultItems resultItems = new ResultItems(); private Html html; private Json json; private String rawText; private Selectable url; private Map> headers; private int statusCode; private boolean downloadSuccess; private byte[] bytes; private List targetRequests = new ArrayList<>(); private String charset; /** * Returns a {@link Page} whose {@link #downloadSuccess} is {@code true} * and whose {@link #request} is the given request. * * @param request the request. * @return the page. * @since 1.0.2 */ public static Page ofSuccess(Request request) { return new Page(request, true); } /** * Returns a {@link Page} whose {@link #downloadSuccess} is {@code false} * and whose {@link #request} is the given request. * * @param request the request. * @return the page. * @since 1.0.2 */ public static Page ofFailure(Request request) { return new Page(request, false); } /** * Constructs an empty {@link Page}: no request is set and {@link #downloadSuccess} keeps its default of {@code false}. */ public Page() { } /** * Constructs a {@link Page} with {@link #request} * and {@link #downloadSuccess} specified. * NOTE(review): unlike {@link #setRequest(Request)}, this constructor does not pass the request on to the result items - confirm this is intended. * * @param request the request. * @param downloadSuccess the download success flag. * @since 1.0.2 */ private Page(Request request, boolean downloadSuccess) { this.request = request; this.downloadSuccess = downloadSuccess; } /** * Returns a {@link Page} whose {@link #downloadSuccess} is {@code false} and whose request is {@code null}. * * @return the page. * @deprecated Use {@link #ofFailure(Request)} instead. */ @Deprecated public static Page fail() { return fail(null); } /** * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, * and {@link #request} is specified. * * @param request the {@link Request}. * @return the page. * @since 0.10.0 * @deprecated Use {@link #ofFailure(Request)} instead. 
*/ @Deprecated(since = "1.0.2", forRemoval = true) public static Page fail(Request request){ Page page = new Page(); page.setRequest(request); page.setDownloadSuccess(false); return page; } /** * Set the skip flag on the result items of this page. * * @param skip whether to skip the result * @return this */ public Page setSkip(boolean skip) { resultItems.setSkip(skip); return this; } /** * Store an extracted result in the result items of this page. * * @param key key * @param field field */ public void putField(String key, Object field) { resultItems.put(key, field); } /** * Get html content of page. * The {@link Html} is created from the raw text and the request url on the first call and reused afterwards. * * @return html */ public Html getHtml() { if (html == null) { html = new Html(rawText, request.getUrl()); } return html; } /** * Get json content of page. * The {@link Json} is created from the raw text on the first call and reused afterwards. * * @return json * @since 0.5.0 */ public Json getJson() { if (json == null) { json = new Json(rawText); } return json; } /** * @param html html * @deprecated since 0.4.0 * The html is parsed lazily on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ @Deprecated public void setHtml(Html html) { this.html = html; } /** * Get the requests that have been added to this page for fetching. * * @return the target requests */ public List getTargetRequests() { return targetRequests; } /** * Add urls to fetch with priority 0. * * @param requests requests, {@code null} is ignored */ public void addTargetRequests(Iterable requests) { addTargetRequests(requests, 0); // Default priority is 0 } /** * Add urls to fetch with the given priority. * Each url is handed to the validating helper below. * * @param requests requests, {@code null} is ignored * @param priority priority */ public void addTargetRequests(Iterable requests, long priority) { if(requests == null) { return; } for (String req : requests) { addRequestIfValid(req, priority); } } /** * Helper method to add a request if it's valid.
* * @param url URL to add * @param priority Priority for the URL */ private void addRequestIfValid(String url, long priority) { if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { return; } String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); Request req = new Request(canonicalizedUrl); if(priority > 0) { req.setPriority(priority); } targetRequests.add(req); } /** * add url to fetch * * @param requestString requestString */ public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); targetRequests.add(new Request(requestString)); } /** * add requests to fetch * * @param request request */ public void addTargetRequest(Request request) { targetRequests.add(request); } /** * get url of current page * * @return url of current page */ public Selectable getUrl() { return url; } public void setUrl(Selectable url) { this.url = url; } /** * get request of current page * * @return request */ public Request getRequest() { return request; } public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); } public ResultItems getResultItems() { return resultItems; } public int getStatusCode() { return statusCode; } public void setStatusCode(int statusCode) { this.statusCode = statusCode; } public String getRawText() { return rawText; } public Page setRawText(String rawText) { this.rawText = rawText; return this; } public Map> getHeaders() { return headers; } public void setHeaders(Map> headers) { this.headers = headers; } public boolean isDownloadSuccess() { return downloadSuccess; } public void setDownloadSuccess(boolean downloadSuccess) { this.downloadSuccess = downloadSuccess; } public byte[] getBytes() { return bytes; } public void setBytes(byte[] bytes) { this.bytes = bytes; } public String getCharset() { return charset; } public void 
setCharset(String charset) { this.charset = charset; } @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + ", html=" + html + ", json=" + json + ", rawText='" + rawText + '\'' + ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + ", downloadSuccess=" + downloadSuccess + ", targetRequests=" + targetRequests + ", charset='" + charset + '\'' + ", bytes=" + Arrays.toString(bytes) + '}'; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Request.java ================================================ package us.codecraft.webmagic; import java.io.Serializable; import java.util.Collections; import java.util.HashMap; import java.util.Map; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.utils.Experimental; /** * Object contains url to crawl.
* It contains some additional information.
* * @author code4crafter@gmail.com
* @since 0.1.0 */ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; /** * Key in the extras under which the number of cycle retries already tried is stored. */ public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; private String url; private String method; private HttpRequestBody requestBody; /** * Downloader to use for this request when it is set. * NOTE(review): Request is Serializable but this field is not transient - confirm Downloader implementations can be serialized. */ private Downloader downloader; /** * Store additional information in extras. */ private Map extras = new HashMap<>(); /** * Cookies for the current url; if not set, the Site's cookies are used. */ private Map cookies = new HashMap(); /** * Headers added to this request via addHeader. */ private Map headers = new HashMap(); /** * Priority of the request.
* The bigger will be processed earlier.
* @see us.codecraft.webmagic.scheduler.PriorityScheduler */ private long priority; /** * When it is set to TRUE, the downloader will not try to parse response body to text. * */ private boolean binaryContent = false; private String charset; public Request() { } public Request(String url) { this.url = url; } public long getPriority() { return priority; } /** * Set the priority of request for sorting.
* Need a scheduler supporting priority.
* @see us.codecraft.webmagic.scheduler.PriorityScheduler * * @param priority priority * @return this */ @Experimental public Request setPriority(long priority) { this.priority = priority; return this; } @SuppressWarnings("unchecked") public T getExtra(String key) { if (extras == null) { return null; } return (T) extras.get(key); } public Request putExtra(String key, T value) { extras.put(key, value); return this; } public String getUrl() { return url; } public Map getExtras() { return Collections.unmodifiableMap(extras); } public Request setExtras(Map extras) { this.extras.putAll(extras); return this; } public Request setUrl(String url) { this.url = url; return this; } /** * The http method of the request. Get for default. * @return httpMethod * @see us.codecraft.webmagic.utils.HttpConstant.Method * @since 0.5.0 */ public String getMethod() { return method; } public Request setMethod(String method) { this.method = method; return this; } @Override public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); return result; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Request request = (Request) o; if (url != null ? !url.equals(request.url) : request.url != null) return false; return method != null ? 
method.equals(request.method) : request.method == null; } public Request addCookie(String name, String value) { cookies.put(name, value); return this; } public Request addHeader(String name, String value) { headers.put(name, value); return this; } public Map getCookies() { return cookies; } public Map getHeaders() { return headers; } public HttpRequestBody getRequestBody() { return requestBody; } public void setRequestBody(HttpRequestBody requestBody) { this.requestBody = requestBody; } public boolean isBinaryContent() { return binaryContent; } public Downloader getDownloader() { return downloader; } public void setDownloader(Downloader downloader) { this.downloader = downloader; } public Request setBinaryContent(boolean binaryContent) { this.binaryContent = binaryContent; return this; } public String getCharset() { return charset; } public Request setCharset(String charset) { this.charset = charset; return this; } @Override public String toString() { return "Request{" + "url='" + url + '\'' + ", method='" + method + '\'' + ", extras=" + extras + ", priority=" + priority + ", headers=" + headers + ", cookies="+ cookies+ '}'; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java ================================================ package us.codecraft.webmagic; import java.util.LinkedHashMap; import java.util.Map; /** * Object contains extract results.
* It is contained in Page and will be processed in pipeline. * * @author code4crafter@gmail.com
* @since 0.1.0 * @see Page * @see us.codecraft.webmagic.pipeline.Pipeline */ public class ResultItems { private Map fields = new LinkedHashMap(); private Request request; private boolean skip; @SuppressWarnings("unchecked") public T get(String key) { Object o = fields.get(key); if (o == null) { return null; } return (T) fields.get(key); } public Map getAll() { return fields; } public ResultItems put(String key, T value) { fields.put(key, value); return this; } public Request getRequest() { return request; } public ResultItems setRequest(Request request) { this.request = request; return this; } /** * Whether to skip the result.
* Result which is skipped will not be processed by Pipeline. * * @return whether to skip the result */ public boolean isSkip() { return skip; } /** * Set whether to skip the result.
* Result which is skipped will not be processed by Pipeline. * * @param skip whether to skip the result * @return this */ public ResultItems setSkip(boolean skip) { this.skip = skip; return this; } @Override public String toString() { return "ResultItems{" + "fields=" + fields + ", request=" + request + ", skip=" + skip + '}'; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Site.java ================================================ package us.codecraft.webmagic; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import java.util.UUID; import us.codecraft.webmagic.utils.HttpConstant; /** * Object contains setting for crawler.
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.processor.PageProcessor * @since 0.1.0 */ public class Site { private String domain; private String userAgent; private Map defaultCookies = new LinkedHashMap(); private Map> cookies = new HashMap>(); private String charset; private String defaultCharset; private int sleepTime = 5000; private int retryTimes = 0; private int cycleRetryTimes = 0; private int retrySleepTime = 1000; private int timeOut = 5000; private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; private Map headers = new HashMap(); private boolean useGzip = true; private boolean disableCookieManagement = false; static { DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } /** * new a Site * * @return new site */ public static Site me() { return new Site(); } /** * Add a cookie with domain {@link #getDomain()} * * @param name name * @param value value * @return this */ public Site addCookie(String name, String value) { defaultCookies.put(name, value); return this; } /** * Add a cookie with specific domain. * * @param domain domain * @param name name * @param value value * @return this */ public Site addCookie(String domain, String name, String value) { if (!cookies.containsKey(domain)){ cookies.put(domain,new HashMap()); } cookies.get(domain).put(name, value); return this; } /** * set user agent * * @param userAgent userAgent * @return this */ public Site setUserAgent(String userAgent) { this.userAgent = userAgent; return this; } /** * get cookies * * @return get cookies */ public Map getCookies() { return defaultCookies; } /** * get cookies of all domains * * @return get cookies */ public Map> getAllCookies() { return cookies; } /** * get user agent * * @return user agent */ public String getUserAgent() { return userAgent; } /** * get domain * * @return get domain */ public String getDomain() { return domain; } /** * set the domain of site. 
* * @param domain domain * @return this */ public Site setDomain(String domain) { this.domain = domain; return this; } /** * Set charset of page manually.
* When charset is not set or set to null, it can be auto detected by Http header. * * @param charset charset * @return this */ public Site setCharset(String charset) { this.charset = charset; return this; } /** * Get the charset that was set manually. * * @return charset */ public String getCharset() { return charset; } /** * Set default charset of page. * * When charset detection fails, this default charset is used. * * @param defaultCharset the default charset * @return this * @since 0.9.0 */ public Site setDefaultCharset(String defaultCharset) { this.defaultCharset = defaultCharset; return this; } /** * The default charset to use if charset detection fails. * * @return the default charset * @since 0.9.0 */ public String getDefaultCharset() { return defaultCharset; } /** * Get timeout for downloader in ms. * * @return timeOut */ public int getTimeOut() { return timeOut; } /** * Set timeout for downloader in ms. * * @param timeOut timeOut * @return this */ public Site setTimeOut(int timeOut) { this.timeOut = timeOut; return this; } /** * Set acceptStatCode.
* When status code of http response is in acceptStatCodes, it will be processed.
* {200} by default.
 * It does not necessarily need to be set.
* * @param acceptStatCode acceptStatCode * @return this */ public Site setAcceptStatCode(Set acceptStatCode) { this.acceptStatCode = acceptStatCode; return this; } /** * get acceptStatCode * * @return acceptStatCode */ public Set getAcceptStatCode() { return acceptStatCode; } /** * Set the interval between the processing of two pages.
* Time unit is milliseconds.
* * @param sleepTime sleepTime * @return this */ public Site setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } /** * Get the interval between the processing of two pages.
* Time unit is milliseconds.
* * @return the interval between the processing of two pages, */ public int getSleepTime() { return sleepTime; } /** * Get retry times immediately when download fail, 0 by default.
* * @return retry times when download fail */ public int getRetryTimes() { return retryTimes; } public Map getHeaders() { return headers; } /** * Put an Http header for downloader.
* Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent.
* * @param key key of http header, there are some keys constant in {@link HttpConstant.Header} * @param value value of header * @return this */ public Site addHeader(String key, String value) { headers.put(key, value); return this; } /** * Set retry times when download fail, 0 by default.
* * @param retryTimes retryTimes * @return this */ public Site setRetryTimes(int retryTimes) { this.retryTimes = retryTimes; return this; } /** * When cycleRetryTimes is more than 0, it will add back to scheduler and try download again.
* * @return retry times when download fail */ public int getCycleRetryTimes() { return cycleRetryTimes; } /** * Set cycleRetryTimes times when download fail, 0 by default.
* * @param cycleRetryTimes cycleRetryTimes * @return this */ public Site setCycleRetryTimes(int cycleRetryTimes) { this.cycleRetryTimes = cycleRetryTimes; return this; } public boolean isUseGzip() { return useGzip; } public int getRetrySleepTime() { return retrySleepTime; } /** * Set retry sleep times when download fail, 1000 by default.
* * @param retrySleepTime retrySleepTime * @return this */ public Site setRetrySleepTime(int retrySleepTime) { this.retrySleepTime = retrySleepTime; return this; } /** * Whether use gzip.
* Default is true, you can set it to false to disable gzip. * * @param useGzip useGzip * @return this */ public Site setUseGzip(boolean useGzip) { this.useGzip = useGzip; return this; } public boolean isDisableCookieManagement() { return disableCookieManagement; } /** * Downloader is supposed to store response cookie. * Disable it to ignore all cookie fields and stay clean. * Warning: Set cookie will still NOT work if disableCookieManagement is true. * @param disableCookieManagement disableCookieManagement * @return this */ public Site setDisableCookieManagement(boolean disableCookieManagement) { this.disableCookieManagement = disableCookieManagement; return this; } public Task toTask() { return new Task() { @Override public String getUUID() { String uuid = Site.this.getDomain(); if (uuid == null) { uuid = UUID.randomUUID().toString(); } return uuid; } @Override public Site getSite() { return Site.this; } }; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Site site = (Site) o; if (cycleRetryTimes != site.cycleRetryTimes) return false; if (retryTimes != site.retryTimes) return false; if (sleepTime != site.sleepTime) return false; if (timeOut != site.timeOut) return false; if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null) return false; if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false; if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null) return false; if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false; if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false; if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false; return true; } @Override public int hashCode() { int result = domain != null ? 
domain.hashCode() : 0; result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0); result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0); result = 31 * result + (charset != null ? charset.hashCode() : 0); result = 31 * result + sleepTime; result = 31 * result + retryTimes; result = 31 * result + cycleRetryTimes; result = 31 * result + timeOut; result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0); result = 31 * result + (headers != null ? headers.hashCode() : 0); return result; } @Override public String toString() { return "Site{" + "domain='" + domain + '\'' + ", userAgent='" + userAgent + '\'' + ", cookies=" + defaultCookies + ", charset='" + charset + '\'' + ", sleepTime=" + sleepTime + ", retryTimes=" + retryTimes + ", cycleRetryTimes=" + cycleRetryTimes + ", timeOut=" + timeOut + ", acceptStatCode=" + acceptStatCode + ", headers=" + headers + '}'; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java ================================================ package us.codecraft.webmagic; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.List; import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import 
us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.QueueScheduler; import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.thread.CountableThreadPool; import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; /** * Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and * Pipeline.
* Every module is a field of Spider.
* The modules are defined in interface.
* You can customize a spider with various implementations of them.
* Examples:
*
* A simple crawler:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", * "http://my.oschina.net/*blog/*")).run();
*
* Store results to files by FilePipeline:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", * "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a * Spider can resume the status when shutdown.
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", * "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* * @author code4crafter@gmail.com
* @see Downloader * @see Scheduler * @see PageProcessor * @see Pipeline * @since 0.1.0 */ public class Spider implements Runnable, Task { protected Downloader downloader; protected List pipelines = new ArrayList(); protected PageProcessor pageProcessor; protected List startRequests; protected Site site; protected String uuid; protected SpiderScheduler scheduler; protected Logger logger = LoggerFactory.getLogger(getClass()); protected CountableThreadPool threadPool; protected ExecutorService executorService; protected int threadNum = 1; protected AtomicInteger stat = new AtomicInteger(STAT_INIT); protected volatile boolean exitWhenComplete = true; protected final static int STAT_INIT = 0; protected final static int STAT_RUNNING = 1; protected final static int STAT_STOPPED = 2; protected boolean spawnUrl = true; protected boolean destroyWhenExit = true; private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); private Date startTime; private long emptySleepTime = 30000; /** * create a spider with pageProcessor. * * @param pageProcessor pageProcessor * @return new spider * @see PageProcessor */ public static Spider create(PageProcessor pageProcessor) { return new Spider(pageProcessor); } /** * create a spider with pageProcessor. * * @param pageProcessor pageProcessor */ public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); this.scheduler = new SpiderScheduler(new QueueScheduler()); } /** * Set startUrls of Spider.
* Prior to startUrls of Site. * * @param startUrls startUrls * @return this */ public Spider startUrls(List startUrls) { checkIfRunning(); this.startRequests = UrlUtils.convertToRequests(startUrls); return this; } /** * Set startUrls of Spider.
* Prior to startUrls of Site. * * @param startRequests startRequests * @return this */ public Spider startRequest(List startRequests) { checkIfRunning(); this.startRequests = startRequests; return this; } /** * Set an uuid for spider.
* Default uuid is domain of site.
*
     * @param uuid uuid
     * @return this
     */
    public Spider setUUID(String uuid) {
        this.uuid = uuid;
        return this;
    }

    /**
     * set scheduler for Spider
     *
     * @param scheduler scheduler
     * @return this
     * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
     * @deprecated use {@link #setScheduler(Scheduler)} instead
     */
    @Deprecated
    public Spider scheduler(Scheduler scheduler) {
        return setScheduler(scheduler);
    }

    /**
     * Set the scheduler for Spider. Requests that can still be polled from the
     * previously configured scheduler (if there was one) are pushed again through
     * this spider's scheduler after the new one has been installed.
     *
     * @param updateScheduler scheduler
     * @return this
     * @throws IllegalStateException if the spider is running
     * @see Scheduler
     * @since 0.2.1
     */
    public Spider setScheduler(Scheduler updateScheduler) {
        checkIfRunning();
        Scheduler oldScheduler = scheduler.getScheduler();
        scheduler.setScheduler(updateScheduler);
        if (oldScheduler != null) {
            Request request;
            // move whatever is still pending in the old scheduler over
            while ((request = oldScheduler.poll(this)) != null) {
                this.scheduler.push(request, this);
            }
        }
        return this;
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
     * @deprecated use {@link #addPipeline(Pipeline)} instead
     */
    @Deprecated
    public Spider pipeline(Pipeline pipeline) {
        return addPipeline(pipeline);
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @throws IllegalStateException if the spider is running
     * @see Pipeline
     * @since 0.2.1
     */
    public Spider addPipeline(Pipeline pipeline) {
        checkIfRunning();
        this.pipelines.add(pipeline);
        return this;
    }

    /**
     * set pipelines for Spider
     *
     * @param pipelines pipelines
     * @return this
     * @throws IllegalStateException if the spider is running
     * @see Pipeline
     * @since 0.4.1
     */
    public Spider setPipelines(List<Pipeline> pipelines) {
        checkIfRunning();
        this.pipelines = pipelines;
        return this;
    }

    /**
     * clear the pipelines set
     *
     * @return this
     */
    public Spider clearPipeline() {
        pipelines = new ArrayList<Pipeline>();
        return this;
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
     * @deprecated use {@link #setDownloader(Downloader)} instead
     */
    @Deprecated
    public Spider downloader(Downloader downloader) {
        return setDownloader(downloader);
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @throws IllegalStateException if the spider is running
     * @see Downloader
     */
    public Spider setDownloader(Downloader downloader) {
        checkIfRunning();
        this.downloader = downloader;
        return this;
    }

    /**
     * Prepare the spider's components before processing starts: fall back to an
     * {@link HttpClientDownloader} when no downloader is set, add a
     * {@link ConsolePipeline} when no pipeline is configured, hand the thread
     * count to the downloader, create the thread pool when it is missing or shut
     * down, push the start requests, and record the start time.
     */
    protected void initComponent() {
        if (downloader == null) {
            this.downloader = new HttpClientDownloader();
        }
        if (pipelines.isEmpty()) {
            pipelines.add(new ConsolePipeline());
        }
        downloader.setThread(threadNum);
        if (threadPool == null || threadPool.isShutdown()) {
            // reuse the caller-supplied executor only while it is still usable
            if (executorService != null && !executorService.isShutdown()) {
                threadPool = new CountableThreadPool(threadNum, executorService);
            } else {
                threadPool = new CountableThreadPool(threadNum);
            }
        }
        if (startRequests != null) {
            for (Request request : startRequests) {
                addRequest(request);
            }
            startRequests.clear();
        }
        startTime = new Date();
    }

    /**
     * Main loop of the spider. Marks the spider as running, initialises the
     * components, then keeps polling the scheduler and submitting each request
     * to the thread pool until the current thread is interrupted, the state is
     * no longer {@code STAT_RUNNING}, or the queue is drained with no worker
     * alive while {@code exitWhenComplete} is set. On exit the state is set to
     * {@code STAT_STOPPED} and, if {@code destroyWhenExit} is set, resources are
     * released via {@link #close()}.
     *
     * @throws IllegalStateException if the spider is already running
     */
    @Override
    public void run() {
        checkRunningStat();
        initComponent();
        logger.info("Spider {} started!", getUUID());
        // interrupt won't be necessarily detected
        while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
            Request poll = scheduler.poll(this);
            if (poll == null) {
                if (threadPool.getThreadAlive() == 0) {
                    // no alive thread anymore, try again
                    poll = scheduler.poll(this);
                    if (poll == null) {
                        if (exitWhenComplete) {
                            break;
                        } else {
                            // wait
                            try {
                                Thread.sleep(emptySleepTime);
                                continue;
                            } catch (InterruptedException e) {
                                Thread.currentThread().interrupt();
                                break;
                            }
                        }
                    }
                } else {
                    // wait until new url added,
                    if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
                        // if interrupted
                        break;
                    }
                    continue;
                }
            }
            final Request request = poll;
            // this may swallow the interruption
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    try {
                        processRequest(request);
                        onSuccess(request);
                    } catch (Exception e) {
                        onError(request, e);
                        logger.error("process request " + request + " error", e);
                    } finally {
                        // count the page and wake the main loop whether or not processing succeeded
                        pageCount.incrementAndGet();
                        scheduler.signalNewUrl();
                    }
                }
            });
        }
        stat.set(STAT_STOPPED);
        // release some resources
        if (destroyWhenExit) {
            close();
        }
        logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
    }

    /**
     * Hook invoked when processing a request failed. Does nothing by default.
     *
     * @param request the request that failed
     * @deprecated Use {@link #onError(Request, Exception)} instead.
     */
    @Deprecated
    protected void onError(Request request) {
    }

    /**
     * Invoked when processing a request threw an exception. Calls the deprecated
     * {@link #onError(Request)} hook first, then notifies every registered
     * {@link SpiderListener}.
     *
     * @param request the request that failed
     * @param e       the exception raised while processing the request
     */
    protected void onError(Request request, Exception e) {
        this.onError(request);

        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onError(request, e);
            }
        }
    }

    /**
     * Invoked after a request was processed without an exception; notifies every
     * registered {@link SpiderListener}.
     *
     * @param request the request that was processed
     */
    protected void onSuccess(Request request) {
        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onSuccess(request);
            }
        }
    }

    /**
     * Atomically switch the state to {@code STAT_RUNNING}.
     *
     * @throws IllegalStateException if the state already is {@code STAT_RUNNING}
     */
    private void checkRunningStat() {
        while (true) {
            int statNow = stat.get();
            if (statNow == STAT_RUNNING) {
                throw new IllegalStateException("Spider is already running!");
            }
            // retry when another thread changed the state between get() and the CAS
            if (stat.compareAndSet(statNow, STAT_RUNNING)) {
                break;
            }
        }
    }

    /**
     * Release resources: closes the downloader, page processor, scheduler and
     * every pipeline that implements {@link Closeable}, then shuts the thread
     * pool down. Safe to call before the spider was ever started, in which case
     * there is no thread pool to shut down.
     */
    public void close() {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
        // the pool is only created by initComponent(), so it may still be null here
        if (threadPool != null) {
            threadPool.shutdown();
        }
    }

    /**
     * Close the given object if it implements {@link Closeable}; a failure to
     * close is logged and otherwise ignored so that the remaining components
     * still get closed.
     *
     * @param object component to close; ignored when it is not a {@link Closeable}
     */
    private void destroyEach(Object object) {
        if (object instanceof Closeable) {
            try {
                ((Closeable) object).close();
            } catch (IOException e) {
                logger.warn("close " + object.getClass().getName() + " error", e);
            }
        }
    }

    /**
     * Process specific urls without url discovering.
     *
     * @param urls urls to process
     */
    public void test(String... urls) {
        initComponent();
        for (String url : urls) {
            processRequest(new Request(url));
        }
    }

    /**
     * Download one request and dispatch on the outcome. The downloader attached
     * to the request is used when present, otherwise the spider's downloader.
     *
     * @param request request to download and process
     */
    private void processRequest(Request request) {
        Page page;
        if (null != request.getDownloader()) {
            page = request.getDownloader().download(request, this);
        } else {
            page = downloader.download(request, this);
        }
        if (page.isDownloadSuccess()) {
            onDownloadSuccess(request, page);
        } else {
            onDownloaderFail(request);
        }
    }

    /**
     * Handle a successfully downloaded page. When the status code is accepted by
     * the site, the page is processed, its target requests are scheduled and,
     * unless the result is marked as skipped, the result items are passed to
     * every pipeline. Otherwise the status code is logged. In both cases the
     * thread then sleeps for the site's sleep time.
     *
     * @param request the request that produced the page
     * @param page    the downloaded page
     */
    private void onDownloadSuccess(Request request, Page page) {
        if (site.getAcceptStatCode().contains(page.getStatusCode())) {
            pageProcessor.process(page);
            extractAndAddRequests(page, spawnUrl);
            if (!page.getResultItems().isSkip()) {
                for (Pipeline pipeline : pipelines) {
                    pipeline.process(page.getResultItems(), this);
                }
            }
        } else {
            logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
        }
        sleep(site.getSleepTime());
    }

    /**
     * Handle a failed download: just sleep when cycle retry is disabled
     * (cycle retry times is 0), otherwise schedule a retry.
     *
     * @param request the request whose download failed
     */
    private void onDownloaderFail(Request request) {
        if (site.getCycleRetryTimes() == 0) {
            sleep(site.getSleepTime());
        } else {
            // for cycle retry
            doCycleRetry(request);
        }
    }

    /**
     * Re-queue a clone of the failed request with priority 0 and an updated
     * {@link Request#CYCLE_TRIED_TIMES} extra, as long as the incremented try
     * count stays below the site's cycle retry times; then sleep for the site's
     * retry sleep time. A request without the extra is always re-queued once
     * with the count set to 1.
     *
     * @param request the request whose download failed
     */
    private void doCycleRetry(Request request) {
        Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (cycleTriedTimesObject == null) {
            addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
        } else {
            int cycleTriedTimes = (Integer) cycleTriedTimesObject;
            cycleTriedTimes++;
            if (cycleTriedTimes < site.getCycleRetryTimes()) {
                addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
            }
        }
        sleep(site.getRetrySleepTime());
    }

    /**
     * Sleep for the given time. An interruption is logged and the thread's
     * interrupt flag is set again.
     *
     * @param time time passed to {@link Thread#sleep(long)}
     */
    protected void sleep(int time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            logger.error("Thread interrupted when sleep", e);
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Schedule the target requests found on the page.
     *
     * @param page     processed page
     * @param spawnUrl when {@code false} nothing is scheduled
     */
    protected void extractAndAddRequests(Page page, boolean spawnUrl) {
        if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
            for (Request request : page.getTargetRequests()) {
                addRequest(request);
            }
        }
    }

private void addRequest(Request request) { if (site.getDomain() == null && request != null && request.getUrl() != null) { site.setDomain(UrlUtils.getDomain(request.getUrl())); } scheduler.push(request, this); } protected void checkIfRunning() { if (stat.get() == STAT_RUNNING) { throw new IllegalStateException("Spider is already running!"); } } public void runAsync() { Thread thread = new Thread(this); thread.setDaemon(false); thread.start(); } /** * Add urls to crawl.
* * @param urls urls * @return this */
    public Spider addUrl(String... urls) {
        for (String url : urls) {
            addRequest(new Request(url));
        }
        scheduler.signalNewUrl();
        return this;
    }

    /**
     * Download urls synchronizing.
     * <p>
     * Url spawning and destroy-on-exit are switched off for the duration of the run and are
     * switched back on afterwards, even when the run fails.
     *
     * @param urls urls
     * @param <T>  type of process result
     * @return list downloaded
     */
    public <T> List<T> getAll(Collection<String> urls) {
        destroyWhenExit = false;
        spawnUrl = false;
        if (startRequests != null) {
            startRequests.clear();
        }
        for (Request request : UrlUtils.convertToRequests(urls)) {
            addRequest(request);
        }
        CollectorPipeline collectorPipeline = getCollectorPipeline();
        pipelines.add(collectorPipeline);
        try {
            run();
        } finally {
            // restore the flags even if run() throws, so the spider is not left half-configured
            spawnUrl = true;
            destroyWhenExit = true;
        }
        // the collector is declared without a type argument, so the element type cannot be checked here
        @SuppressWarnings("unchecked")
        List<T> collected = collectorPipeline.getCollected();
        return collected;
    }

    /**
     * Creates the pipeline used by {@link #getAll(Collection)} to collect results.
     *
     * @return a new collector pipeline
     */
    protected CollectorPipeline getCollectorPipeline() {
        return new ResultItemsCollectorPipeline();
    }

    /**
     * Downloads a single url synchronously.
     *
     * @param url url to download
     * @param <T> type of process result
     * @return the first collected result, or {@code null} when nothing was collected
     */
    public <T> T get(String url) {
        List<String> urls = WMCollections.newArrayList(url);
        List<T> resultItemses = getAll(urls);
        if (resultItemses != null && !resultItemses.isEmpty()) {
            return resultItemses.get(0);
        }
        return null;
    }

    /** * Add urls with information to crawl.
* * @param requests requests * @return this */ public Spider addRequest(Request... requests) { for (Request request : requests) { addRequest(request); } scheduler.signalNewUrl(); return this; } public void start() { runAsync(); } public void stop() { if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { logger.info("Spider " + getUUID() + " stop success!"); } else { logger.info("Spider " + getUUID() + " stop fail!"); } } /** * Stop when all tasks in the queue are completed and all worker threads are also completed */ public void stopWhenComplete(){ this.exitWhenComplete = true; } /** * start with more than one threads * * @param threadNum threadNum * @return this */ public Spider thread(int threadNum) { checkIfRunning(); this.threadNum = threadNum; if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } return this; } /** * start with more than one threads * * @param executorService executorService to run the spider * @param threadNum threadNum * @return this */ public Spider thread(ExecutorService executorService, int threadNum) { checkIfRunning(); this.threadNum = threadNum; if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } this.executorService = executorService; return this; } public boolean isExitWhenComplete() { return exitWhenComplete; } /** * Exit when complete.
* True: exit when all url of the site is downloaded.
* False: not exit until call stop() manually.
* * @param exitWhenComplete exitWhenComplete * @return this */ public Spider setExitWhenComplete(boolean exitWhenComplete) { this.exitWhenComplete = exitWhenComplete; return this; } public boolean isSpawnUrl() { return spawnUrl; } /** * Get page count downloaded by spider. * * @return total downloaded page count * @since 0.4.1 */ public long getPageCount() { return pageCount.get(); } /** * Get running status by spider. * * @return running status * @see Status * @since 0.4.1 */ public Status getStatus() { return Status.fromValue(stat.get()); } public enum Status { Init(0), Running(1), Stopped(2); private Status(int value) { this.value = value; } private int value; int getValue() { return value; } public static Status fromValue(int value) { for (Status status : Status.values()) { if (status.getValue() == value) { return status; } } //default value return Init; } } /** * Get thread count which is running * * @return thread count which is running * @since 0.4.1 */ public int getThreadAlive() { if (threadPool == null) { return 0; } return threadPool.getThreadAlive(); } /** * Whether add urls extracted to download.
* Add urls to download when it is true, and just download seed urls when it is false.
* DO NOT set it unless you know what it means! * * @param spawnUrl spawnUrl * @return this * @since 0.4.0 */
    public Spider setSpawnUrl(boolean spawnUrl) {
        this.spawnUrl = spawnUrl;
        return this;
    }

    /**
     * Identifier of this spider: the explicitly set uuid if there is one, otherwise the site's
     * domain; a random UUID is generated and remembered only when there is no site.
     *
     * @return uuid
     */
    @Override
    public String getUUID() {
        if (uuid == null) {
            if (site != null) {
                return site.getDomain();
            }
            uuid = UUID.randomUUID().toString();
        }
        return uuid;
    }

    /**
     * Sets the executor used to run worker threads.
     *
     * @param executorService executor to use
     * @return this
     * @throws IllegalStateException if the spider is already running
     */
    public Spider setExecutorService(ExecutorService executorService) {
        checkIfRunning();
        this.executorService = executorService;
        return this;
    }

    @Override
    public Site getSite() {
        return site;
    }

    public List getSpiderListeners() {
        return spiderListeners;
    }

    public Spider setSpiderListeners(List spiderListeners) {
        this.spiderListeners = spiderListeners;
        return this;
    }

    public Date getStartTime() {
        return startTime;
    }

    /**
     * @return the scheduler wrapped by this spider's {@code SpiderScheduler}
     */
    public Scheduler getScheduler() {
        return scheduler.getScheduler();
    }

    /** * Set wait time when no url is polled.

* * @param emptySleepTime In MILLISECONDS. * @return this */ public Spider setEmptySleepTime(long emptySleepTime) { if(emptySleepTime<=0){ throw new IllegalArgumentException("emptySleepTime should be more than zero!"); } this.emptySleepTime = emptySleepTime; return this; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java ================================================ package us.codecraft.webmagic; /** * Listener of Spider on page processing. Used for monitor and such on. * * @author code4crafer@gmail.com * @since 0.5.0 */ public interface SpiderListener { void onSuccess(Request request); /** * @deprecated Use {@link #onError(Request, Exception)} instead. */ @Deprecated default void onError(Request request) { } default void onError(Request request, Exception e) { this.onError(request); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java ================================================ package us.codecraft.webmagic; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import us.codecraft.webmagic.scheduler.Scheduler; import us.codecraft.webmagic.thread.CountableThreadPool; public class SpiderScheduler { private Scheduler scheduler; private final ReentrantLock newUrlLock = new ReentrantLock(); private final Condition newUrlCondition = newUrlLock.newCondition(); public SpiderScheduler(Scheduler scheduler) { this.scheduler = scheduler; } public Scheduler getScheduler() { return scheduler; } public void setScheduler(Scheduler scheduler) { this.scheduler = scheduler; } public Request poll(Spider spider) { return scheduler.poll(spider); } public void push(Request request, Spider spider) { scheduler.push(request, spider); } public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) { newUrlLock.lock(); try { if 
(threadPool.getThreadAlive() == 0) { return false; } newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); return false; } catch (InterruptedException e) { return true; } finally { newUrlLock.unlock(); } } public void signalNewUrl() { try { newUrlLock.lock(); newUrlCondition.signalAll(); } finally { newUrlLock.unlock(); } } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/Task.java ================================================ package us.codecraft.webmagic; /** * Interface for identifying different tasks.
* * @author code4crafter@gmail.com
* @since 0.1.0 * @see us.codecraft.webmagic.scheduler.Scheduler * @see us.codecraft.webmagic.pipeline.Pipeline */ public interface Task { /** * unique id for a task. * * @return uuid */ public String getUUID(); /** * site of a task * * @return site */ public Site getSite(); } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java ================================================ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; /** * Base class of downloader with some common methods. * * @author code4crafter@gmail.com * @since 0.5.0 */ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * * @param url url * @return html */ public Html download(String url) { return download(url, null); } /** * A simple method to download a url. * * @param url url * @param charset charset * @return html */ public Html download(String url, String charset) { Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); return (Html) page.getHtml(); } /** * @param request the {@link Request}. * @deprecated Use {@link #onSuccess(Page, Task)} instead. */ @Deprecated protected void onSuccess(Request request) { } /** * @param request the {@link Request}. * @param task the {@link Task}. * @since 0.7.6 * @deprecated Use {@link #onSuccess(Page, Task)} instead. */ @Deprecated protected void onSuccess(Request request, Task task) { this.onSuccess(request); } /** * @param page the {@link Page}. * @param task the {@link Task}. * @since 0.10.0 */ protected void onSuccess(Page page, Task task) { this.onSuccess(page.getRequest(), task); } /** * @param request the {@link Request}. * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. 
*/ @Deprecated protected void onError(Request request) { } /** * @param request the {@link Request}. * @param task the {@link Task}. * @param e the exception. * @since 0.7.6 * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. */ @Deprecated protected void onError(Request request, Task task, Throwable e) { this.onError(request); } /** * @param page the {@link Page}. * @param task the {@link Task}. * @param e the exception. * @since 0.10.0 */ protected void onError(Page page, Task task, Throwable e) { this.onError(page.getRequest(), task, e); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java ================================================ package us.codecraft.webmagic.downloader; import java.net.URI; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.ProtocolException; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpRequestWrapper; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.LaxRedirectStrategy; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** *支持post 302跳转策略实现类 *HttpClient默认跳转:httpClientBuilder.setRedirectStrategy(new LaxRedirectStrategy()); *上述代码在post/redirect/post这种情况下不会传递原有请求的数据信息。所以参考了下SeimiCrawler这个项目的重定向策略。 *原代码地址:https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java */ public class CustomRedirectStrategy extends LaxRedirectStrategy { private Logger logger = LoggerFactory.getLogger(getClass()); @Override public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context) throws ProtocolException { URI uri = getLocationURI(request, response, context); String method = request.getRequestLine().getMethod(); 
if ("post".equalsIgnoreCase(method)) { try { HttpRequestWrapper httpRequestWrapper = (HttpRequestWrapper) request; httpRequestWrapper.setURI(uri); httpRequestWrapper.removeHeaders("Content-Length"); return httpRequestWrapper; } catch (Exception e) { logger.error("强转为HttpRequestWrapper出错"); } return new HttpPost(uri); } else { return new HttpGet(uri); } } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java ================================================ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** * Downloader is the part that downloads web pages and store in Page object.
* Downloader has {@link #setThread(int)} method because downloader is always the bottleneck of a crawler, * there are always some mechanisms such as pooling in downloader, and pool size is related to thread numbers. * * @author code4crafter@gmail.com
* @since 0.1.0 */
public interface Downloader {

    /**
     * Downloads a web page and stores it in a Page object.
     *
     * @param request request
     * @param task task
     * @return page
     */
    public Page download(Request request, Task task);

    /**
     * Tells the downloader how many threads the spider uses.
     *
     * @param threadNum number of threads
     */
    public void setThread(int threadNum);
}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java ================================================
package us.codecraft.webmagic.downloader;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;

/** * The http downloader based on HttpClient. * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class HttpClientDownloader extends AbstractDownloader { private final Map httpClients = new HashMap(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); private ProxyProvider proxyProvider; private boolean responseHeader = true; public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } public void setProxyProvider(ProxyProvider proxyProvider) { this.proxyProvider = proxyProvider; } private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); } String domain = site.getDomain(); CloseableHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site); httpClients.put(domain, httpClient); } } } return httpClient; } @Override public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = null; try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(page, task); return page; } catch (IOException e) { page = Page.ofFailure(request); onError(page, task, e); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { proxyProvider.returnProxy(proxy, page, task); } } } @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { HttpEntity entity = httpResponse.getEntity(); byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null; Page page = Page.ofSuccess(request); page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); } page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; } private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); } return charset; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java ================================================ package us.codecraft.webmagic.downloader; import 
org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; import org.apache.http.client.CookieStore; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Site; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; import java.io.IOException; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; /** * @author code4crafter@gmail.com
* @since 0.4.0 */
public class HttpClientGenerator {

    private Logger logger = LoggerFactory.getLogger(getClass());

    private PoolingHttpClientConnectionManager connectionManager;

    /**
     * Creates the shared pooling connection manager with plain-http and https socket factories.
     */
    public HttpClientGenerator() {
        Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", buildSSLConnectionSocketFactory())
                .build();
        connectionManager = new PoolingHttpClientConnectionManager(reg);
        connectionManager.setDefaultMaxPerRoute(100);
    }

    /**
     * Builds an https socket factory that accepts every certificate and every host name.
     * Falls back to HttpClient's default factory when the SSL context cannot be created.
     * NOTE(review): certificate and host verification are deliberately disabled here.
     */
    private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
        try {
            SSLContext sslContext = createIgnoreVerifySSL();
            String[] supportedProtocols;
            if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
                supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
            } else {
                supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
            }
            logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
            return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
                    null,
                    // do not verify the host name
                    (host, sslSession) -> true); // bypass certificate checks first
        } catch (KeyManagementException | NoSuchAlgorithmException e) {
            logger.error("ssl connection fail", e);
        }
        return SSLConnectionSocketFactory.getSocketFactory();
    }

    /**
     * Creates a TLS context whose trust manager accepts every certificate chain.
     */
    private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
        // X509TrustManager that skips verification; the check methods are intentionally empty
        X509TrustManager trustManager = new X509TrustManager() {

            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                // the interface contract requires a non-null (possibly empty) array
                return new X509Certificate[0];
            }

        };

        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, new TrustManager[]{trustManager}, null);
        return sc;
    }

    /**
     * Sets the maximum total number of pooled connections.
     *
     * @param poolSize maximum total connections
     * @return this
     */
    public HttpClientGenerator setPoolSize(int poolSize) {
        connectionManager.setMaxTotal(poolSize);
        return this;
    }

    /**
     * Builds a client configured from the site.
     *
     * @param site site configuration
     * @return a new client sharing this generator's connection manager
     */
    public CloseableHttpClient getClient(Site site) {
        return generateClient(site);
    }

    private CloseableHttpClient generateClient(Site site) {
        HttpClientBuilder httpClientBuilder = HttpClients.custom();

        httpClientBuilder.setConnectionManager(connectionManager);
        if (site.getUserAgent() != null) {
            httpClientBuilder.setUserAgent(site.getUserAgent());
        } else {
            httpClientBuilder.setUserAgent("");
        }
        if (site.isUseGzip()) {
            httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {

                @Override
                public void process(
                        final HttpRequest request,
                        final HttpContext context) throws HttpException, IOException {
                    if (!request.containsHeader("Accept-Encoding")) {
                        request.addHeader("Accept-Encoding", "gzip");
                    }
                }
            });
        }
        // work around the post/redirect/post 302 redirect problem
        httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());

        SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
        socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
        socketConfigBuilder.setSoTimeout(site.getTimeOut());
        SocketConfig socketConfig = socketConfigBuilder.build();
        httpClientBuilder.setDefaultSocketConfig(socketConfig);
        connectionManager.setDefaultSocketConfig(socketConfig);
        httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
        generateCookie(httpClientBuilder, site);
        return httpClientBuilder.build();
    }

    /**
     * Installs a cookie store pre-filled from the site's cookies, or disables cookie
     * management altogether when the site asks for that.
     */
    private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
        if (site.isDisableCookieManagement()) {
            httpClientBuilder.disableCookieManagement();
            return;
        }
        CookieStore cookieStore = new BasicCookieStore();
        // cookies without an explicit domain are bound to the site's own domain
        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
            BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
            cookie.setDomain(site.getDomain());
            cookieStore.addCookie(cookie);
        }
        for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
            for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
                BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
                cookie.setDomain(domainEntry.getKey());
                cookieStore.addCookie(cookie);
            }
        }
        httpClientBuilder.setDefaultCookieStore(cookieStore);
    }

}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientRequestContext.java ================================================
package us.codecraft.webmagic.downloader;

import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;

/**
 * Holder for the HttpClient request and the context it is executed with.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/4/8
 *         Time: 19:43
 * @since 0.7.0
 */
public class HttpClientRequestContext {

    private HttpUriRequest httpUriRequest;

    private HttpClientContext httpClientContext;

    public HttpUriRequest getHttpUriRequest() {
        return httpUriRequest;
    }

    public void setHttpUriRequest(HttpUriRequest httpUriRequest) {
        this.httpUriRequest = httpUriRequest;
    }

    public HttpClientContext getHttpClientContext() {
        return httpClientContext;
    }

    public void setHttpClientContext(HttpClientContext httpClientContext) {
        this.httpClientContext = httpClientContext;
    }

}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java ================================================
package us.codecraft.webmagic.downloader;

import org.apache.http.HttpHost;
import org.apache.http.auth.AuthState;
import org.apache.http.auth.ChallengeState;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.cookie.BasicClientCookie;
import
us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.Map;

/**
 * Converts a webmagic {@link Request} into the HttpClient request and context used to execute it.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/18
 *         Time: 11:28
 *
 * @since 0.7.0
 */
public class HttpUriRequestConverter {

    /**
     * Builds both the HttpClient request and its execution context.
     *
     * @param request the request to convert
     * @param site    site configuration (headers, timeouts)
     * @param proxy   proxy to route through, may be {@code null}
     * @return holder with the request and the context
     */
    public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
        HttpClientRequestContext httpClientRequestContext = new HttpClientRequestContext();
        httpClientRequestContext.setHttpUriRequest(convertHttpUriRequest(request, site, proxy));
        httpClientRequestContext.setHttpClientContext(convertHttpClientContext(request, site, proxy));
        return httpClientRequestContext;
    }

    /**
     * Creates the execution context: pre-set proxy authentication when the proxy has a user
     * name, and a request-scoped cookie store when the request carries cookies.
     */
    private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
        HttpClientContext httpContext = new HttpClientContext();
        if (proxy != null && proxy.getUsername() != null) {
            AuthState authState = new AuthState();
            BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
            UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
            authState.update(proxyAuthScheme, proxyCredentials);
            httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
        }
        if (request.getCookies() != null && !request.getCookies().isEmpty()) {
            CookieStore cookieStore = new BasicCookieStore();
            for (Map.Entry<String, String> cookieEntry : request.getCookies().entrySet()) {
                BasicClientCookie cookie1 = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
                cookie1.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl())));
                cookieStore.addCookie(cookie1);
            }
            httpContext.setCookieStore(cookieStore);
        }
        return httpContext;
    }

    /**
     * Creates the HttpClient request: method and entity from the request, headers from the
     * site and then from the request, timeouts from the site and the optional proxy.
     */
    private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
        // site is treated as optional further down, so guard the header lookup the same way
        if (site != null && site.getHeaders() != null) {
            for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }

        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
        if (site != null) {
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut())
                    .setCookieSpec(CookieSpecs.STANDARD);
        }

        if (proxy != null) {
            requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        HttpUriRequest httpUriRequest = requestBuilder.build();
        if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
            for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
                httpUriRequest.addHeader(header.getKey(), header.getValue());
            }
        }
        return httpUriRequest;
    }

    /**
     * Picks the request builder for the request's HTTP method; a missing method means GET.
     *
     * @throws IllegalArgumentException if the method is not one of GET, POST, HEAD, PUT, DELETE, TRACE
     */
    private RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return RequestBuilder.get();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            return addFormParams(RequestBuilder.post(), request);
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return RequestBuilder.head();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return addFormParams(RequestBuilder.put(), request);
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return RequestBuilder.delete();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return RequestBuilder.trace();
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Attaches the request body, if any, as a byte-array entity with the body's content type.
     */
    private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
        if (request.getRequestBody() != null) {
            ByteArrayEntity entity = new ByteArrayEntity(request.getRequestBody().getBody());
            entity.setContentType(request.getRequestBody().getContentType());
            requestBuilder.setEntity(entity);
        }
        return requestBuilder;
    }

}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html ================================================
Downloader is the part that downloads web pages and store in Page object.
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java ================================================
package us.codecraft.webmagic.model;

import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;

import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Body of an HTTP request: raw bytes plus content type and encoding.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/4/8
 */
public class HttpRequestBody implements Serializable {

    private static final long serialVersionUID = 5659170945717023595L;

    /** Content type constants used by the factory methods. */
    public static abstract class ContentType {

        public static final String JSON = "application/json";

        public static final String XML = "text/xml";

        public static final String FORM = "application/x-www-form-urlencoded";

        public static final String MULTIPART = "multipart/form-data";
    }

    private byte[] body;

    private String contentType;

    private String encoding;

    public HttpRequestBody() {
    }

    public HttpRequestBody(byte[] body, String contentType, String encoding) {
        this.body = body;
        this.contentType = contentType;
        this.encoding = encoding;
    }

    public String getContentType() {
        return contentType;
    }

    public String getEncoding() {
        return encoding;
    }

    public void setBody(byte[] body) {
        this.body = body;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * Creates a JSON body.
     *
     * @param json     json text
     * @param encoding charset name used to encode the text
     * @return the body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody json(String json, String encoding) {
        try {
            return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    /**
     * Creates an XML body.
     *
     * @param xml      xml text
     * @param encoding charset name used to encode the text
     * @return the body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody xml(String xml, String encoding) {
        try {
            return new HttpRequestBody(xml.getBytes(encoding), ContentType.XML, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    /**
     * Creates a body from already-encoded bytes.
     *
     * @param body        raw bytes
     * @param contentType content type
     * @param encoding    charset name the bytes are encoded in
     * @return the body
     */
    public static HttpRequestBody custom(byte[] body, String contentType, String encoding) {
        return new HttpRequestBody(body, contentType, encoding);
    }

    /**
     * Creates a url-encoded form body; each value is converted with {@link String#valueOf(Object)}.
     *
     * @param params   form parameters
     * @param encoding charset name used for url-encoding and for the bytes
     * @return the body
     * @throws IllegalArgumentException if the encoding is not supported
     */
    public static HttpRequestBody form(Map<String, Object> params, String encoding) {
        List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(params.size());
        for (Map.Entry<String, Object> entry : params.entrySet()) {
            nameValuePairs.add(new BasicNameValuePair(entry.getKey(), String.valueOf(entry.getValue())));
        }
        try {
            return new HttpRequestBody(URLEncodedUtils.format(nameValuePairs, encoding).getBytes(encoding), ContentType.FORM, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("illegal encoding " + encoding, e);
        }
    }

    public byte[] getBody() {
        return body;
    }

}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/package.html ================================================
Main class "Spider" and models.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
================================================
package us.codecraft.webmagic.pipeline;

import java.util.List;

/**
 * Pipeline that can collect and store results.
 * Besides processing results like any {@link Pipeline}, it exposes what it has
 * collected through {@link #getCollected()}.
 * Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)}
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public interface CollectorPipeline extends Pipeline {

    /**
     * Get all results collected.
     *
     * @return collected results
     */
    public List getCollected();
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
================================================
package us.codecraft.webmagic.pipeline;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

import java.util.Map;

/**
 * Write results in console.
* Usually used in test. * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class ConsolePipeline implements Pipeline { @Override public void process(ResultItems resultItems, Task task) { System.out.println("get page: " + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { System.out.println(entry.getKey() + ":\t" + entry.getValue()); } } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java ================================================ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Map; /** * Store results in files.
* * @author code4crafter@gmail.com
* @since 0.1.0 */ public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); /** * create a FilePipeline with default path"/data/webmagic/" */ public FilePipeline() { setPath("/data/webmagic/"); } public FilePipeline(String path) { setPath(path); } @Override public void process(ResultItems resultItems, Task task) { String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { Iterable value = (Iterable) entry.getValue(); printWriter.println(entry.getKey() + ":"); for (Object o : value) { printWriter.println(o); } } else { printWriter.println(entry.getKey() + ":\t" + entry.getValue()); } } printWriter.close(); } catch (IOException e) { logger.warn("write file error", e); } } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java ================================================ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; /** * Pipeline is the persistent and offline process part of crawler.
* The interface Pipeline can be implemented to customize ways of persistent. * * @author code4crafter@gmail.com
* @since 0.1.0
 * @see ConsolePipeline
 * @see FilePipeline
 */
public interface Pipeline {

    /**
     * Process extracted results.
     *
     * @param resultItems resultItems
     * @param task task
     */
    public void process(ResultItems resultItems, Task task);
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java
================================================
package us.codecraft.webmagic.pipeline;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

import java.util.ArrayList;
import java.util.List;

/**
 * A {@link CollectorPipeline} that keeps every {@link ResultItems} it receives
 * in an in-memory list.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class ResultItemsCollectorPipeline implements CollectorPipeline {

    // Every ResultItems passed to process(), in arrival order.
    private List collector = new ArrayList();

    /**
     * Appends the given results to the collected list.
     * Synchronized, so concurrent calls do not corrupt the backing ArrayList.
     *
     * @param resultItems results to remember
     * @param task task (not used here)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the internal list itself, not a copy.
     * NOTE(review): unlike {@link #process}, this method is not synchronized;
     * presumably it is only called once crawling has finished - confirm.
     *
     * @return the collected results
     */
    @Override
    public List getCollected() {
        return collector;
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html
================================================
Pipeline is the persistent and offline process part of crawler.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java
================================================
package us.codecraft.webmagic.processor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;

/**
 * Interface to be implemented to customize a crawler.
 *
 *

* In PageProcessor, you can customize:
* <ul>
* <li>start URLs and other settings in {@link Site}</li>
* <li>how the URLs to fetch are detected</li>
* <li>how the data are extracted and stored</li>
* </ul>
* * @author code4crafter@gmail.com
* @see Site
 * @see Page
 * @since 0.1.0
 */
public interface PageProcessor {

    /**
     * Processes the page, extract URLs to fetch, extract the data and store.
     *
     * @param page page
     */
    void process(Page page);

    /**
     * Returns the site settings.
     * <p>
     * The default implementation returns {@code Site.me()}; implementations
     * that need their own settings override this method.
     *
     * @return site
     * @see Site
     */
    default Site getSite() {
        return Site.me();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
================================================
package us.codecraft.webmagic.processor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;

import java.util.List;

/**
 * A simple PageProcessor.
 *
 * @author code4crafter@gmail.com
* @since 0.1.0 */ public class SimplePageProcessor implements PageProcessor { private String urlPattern; private Site site; public SimplePageProcessor(String urlPattern) { this.site = Site.me(); //compile "*" expression to regex this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"; } @Override public void process(Page page) { List requests = page.getHtml().links().regex(urlPattern).all(); //add urls to fetch page.addTargetRequests(requests); //extract by XPath page.putField("title", page.getHtml().xpath("//title")); page.putField("html", page.getHtml().toString()); //extract by Readability page.putField("content", page.getHtml().smartContent()); } @Override public Site getSite() { //settings return site; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java ================================================ package us.codecraft.webmagic.processor.example; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.ArrayList; import java.util.List; /** * @author code4crafter@gmail.com
* @since 0.4.0 */ public class BaiduBaikePageProcessor implements PageProcessor { private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888)) .setRetryTimes(3).setSleepTime(1000).setUseGzip(true); @Override public void process(Page page) { page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1","text").toString()); page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()")); } @Override public Site getSite() { return site; } public static void main(String[] args) { //single download Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2); String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; ResultItems resultItems = spider.get(String.format(urlTemplate, "水力发电")); System.out.println(resultItems); //multidownload List list = new ArrayList(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List resultItemses = spider.getAll(list); for (ResultItems resultItemse : resultItemses) { System.out.println(resultItemse.getAll()); } spider.close(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java ================================================ package us.codecraft.webmagic.processor.example; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
* @since 0.3.2 */ public class GithubRepoPageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000); @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java ================================================ package us.codecraft.webmagic.processor.example; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
* @since 0.6.0 */ public class ZhihuPageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all()); page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString()); page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString()); page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString()); if (page.getResultItems().get("title")==null){ //skip this page page.setSkip(true); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new ZhihuPageProcessor()).addUrl("https://www.zhihu.com/explore").run(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html ================================================ PageProcessor custom part of a crawler for specific site. ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java ================================================ package us.codecraft.webmagic.proxy; import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URISyntaxException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import org.apache.commons.lang3.StringUtils; public class Proxy { private String scheme; private String host; private int port; private String username; private String password; public static Proxy create(final URI uri) { Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme()); String userInfo = uri.getUserInfo(); if (userInfo != null) { String[] up = userInfo.split(":"); if (up.length == 1) { proxy.username = up[0].isEmpty() ? 
null : up[0]; } else { proxy.username = up[0].isEmpty() ? null : up[0]; proxy.password = up[1].isEmpty() ? null : up[1]; } } return proxy; } public Proxy(String host, int port) { this(host, port, null); } public Proxy(String host, int port, String scheme) { this.host = host; this.port = port; this.scheme = scheme; } public Proxy(String host, int port, String username, String password) { this.host = host; this.port = port; this.username = username; this.password = password; } public String getScheme() { return scheme; } public void setScheme(String scheme) { this.scheme = scheme; } public String getHost() { return host; } public int getPort() { return port; } public String getUsername() { return username; } public String getPassword() { return password; } public URI toURI() { final StringBuilder userInfoBuffer = new StringBuilder(); if (username != null) { userInfoBuffer.append(urlencode(username)); } if (password != null) { userInfoBuffer.append(":").append(urlencode(password)); } final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null); URI uri; try { uri = new URI(scheme, userInfo, host, port, null, null, null); } catch (URISyntaxException e) { throw new IllegalArgumentException(e.getMessage(), e); } return uri; } private String urlencode(String s) { String enc = StandardCharsets.UTF_8.name(); try { return URLEncoder.encode(s, enc); } catch (UnsupportedEncodingException e) { throw new IllegalArgumentException(e); } } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Proxy proxy = (Proxy) o; if (port != proxy.port) return false; if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false; if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; return password != null ? 
password.equals(proxy.password) : proxy.password == null; } @Override public int hashCode() { int result = host != null ? host.hashCode() : 0; result = 31 * result + port; result = 31 * result + (scheme != null ? scheme.hashCode() : 0); result = 31 * result + (username != null ? username.hashCode() : 0); result = 31 * result + (password != null ? password.hashCode() : 0); return result; } @Override public String toString() { return this.toURI().toString(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java ================================================ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** * Proxy provider.
* * @since 0.7.0 */ public interface ProxyProvider { /** * * Return proxy to Provider when complete a download. * @param proxy the proxy config contains host,port and identify info * @param page the download result * @param task the download task */ void returnProxy(Proxy proxy, Page page, Task task); /** * Get a proxy for task by some strategy. * @param task the download task * @return proxy * @deprecated Use {@link #getProxy(Request, Task)} instead. */ @Deprecated default Proxy getProxy(Task task) { throw new UnsupportedOperationException(); } /** * Returns a proxy for the request. * * @param request the request * @param task the download task * @return proxy * @since 0.9.0 */ default Proxy getProxy(Request request, Task task) { return this.getProxy(task); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java ================================================ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; /** * A simple ProxyProvider. Provide proxy as round-robin without heartbeat and error check. It can be used when all proxies are stable. * @author code4crafter@gmail.com * Date: 17/4/16 * Time: 10:18 * @since 0.7.0 */ public class SimpleProxyProvider implements ProxyProvider { private final List proxies; private final AtomicInteger pointer; public SimpleProxyProvider(List proxies) { this(proxies, new AtomicInteger(-1)); } private SimpleProxyProvider(List proxies, AtomicInteger pointer) { this.proxies = proxies; this.pointer = pointer; } public static SimpleProxyProvider from(Proxy... 
proxies) { List proxiesTemp = new ArrayList(proxies.length); for (Proxy proxy : proxies) { proxiesTemp.add(proxy); } return new SimpleProxyProvider(Collections.unmodifiableList(proxiesTemp)); } @Override public void returnProxy(Proxy proxy, Page page, Task task) { //Donothing } @Override public Proxy getProxy(Request request, Task task) { return proxies.get(incrForLoop()); } private int incrForLoop() { int p = pointer.incrementAndGet(); int size = proxies.size(); if (p < size) { return p; } while (!pointer.compareAndSet(p, p % size)) { p = pointer.get(); } return p % size; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java ================================================ package us.codecraft.webmagic.scheduler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; import us.codecraft.webmagic.utils.HttpConstant; /** * Remove duplicate urls and only push urls which are not duplicate.

* * @author code4crafer@gmail.com * @since 0.5.0 */ public abstract class DuplicateRemovedScheduler implements Scheduler { protected Logger logger = LoggerFactory.getLogger(getClass()); private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover(); public DuplicateRemover getDuplicateRemover() { return duplicatedRemover; } public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) { this.duplicatedRemover = duplicatedRemover; return this; } @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } } protected boolean shouldReserved(Request request) { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; } protected boolean noNeedToRemoveDuplicate(Request request) { return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod()); } protected void pushWhenNoDuplicate(Request request, Task task) { } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java ================================================ package us.codecraft.webmagic.scheduler; import us.codecraft.webmagic.Task; /** * The scheduler whose requests can be counted for monitor. 
*
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public interface MonitorableScheduler extends Scheduler {

    /**
     * Returns the number of requests still waiting to be polled
     * (for example, a queue-backed scheduler reports its queue size).
     *
     * @param task the task to report on
     * @return number of requests left
     */
    public int getLeftRequestsCount(Task task);

    /**
     * Returns the total number of requests seen so far.
     * NOTE(review): presumably counts distinct requests, since implementations
     * in this package delegate to their DuplicateRemover - confirm.
     *
     * @param task the task to report on
     * @return number of total requests
     */
    public int getTotalRequestsCount(Task task);

}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java
================================================
package us.codecraft.webmagic.scheduler;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.NumberUtils;

import java.util.Comparator;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.PriorityBlockingQueue;

/**
 * Priority scheduler. Request with higher priority will poll earlier.
* * @author code4crafter@gmail.com
* @since 0.2.1 */ public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public static final int INITIAL_CAPACITY = 5; private BlockingQueue noPriorityQueue = new LinkedBlockingQueue(); private PriorityBlockingQueue priorityQueuePlus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { @Override public int compare(Request o1, Request o2) { return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority()); } }); private PriorityBlockingQueue priorityQueueMinus = new PriorityBlockingQueue(INITIAL_CAPACITY, new Comparator() { @Override public int compare(Request o1, Request o2) { return -NumberUtils.compareLong(o1.getPriority(), o2.getPriority()); } }); @Override public void pushWhenNoDuplicate(Request request, Task task) { if (request.getPriority() == 0) { noPriorityQueue.add(request); } else if (request.getPriority() > 0) { priorityQueuePlus.put(request); } else { priorityQueueMinus.put(request); } } @Override public synchronized Request poll(Task task) { Request poll = priorityQueuePlus.poll(); if (poll != null) { return poll; } poll = noPriorityQueue.poll(); if (poll != null) { return poll; } return priorityQueueMinus.poll(); } @Override public int getLeftRequestsCount(Task task) { return noPriorityQueue.size(); } @Override public int getTotalRequestsCount(Task task) { return getDuplicateRemover().getTotalRequestsCount(task); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java ================================================ package us.codecraft.webmagic.scheduler; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. * * Note: if you use this {@link QueueScheduler} * with {@link Site#getCycleRetryTimes()} enabled, you may encountered dead-lock * when the queue is full. * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private final BlockingQueue queue; public QueueScheduler() { this.queue = new LinkedBlockingQueue<>(); } /** * Creates a {@code QueueScheduler} with the given (fixed) capacity. * * @param capacity the capacity of this queue, * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)} * @since 0.8.0 */ public QueueScheduler(int capacity) { this.queue = new LinkedBlockingQueue<>(capacity); } @Override public void pushWhenNoDuplicate(Request request, Task task) { logger.trace("Remaining capacity: {}", this.queue.remainingCapacity()); try { queue.put(request); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } @Override public Request poll(Task task) { return queue.poll(); } @Override public int getLeftRequestsCount(Task task) { return queue.size(); } @Override public int getTotalRequestsCount(Task task) { return getDuplicateRemover().getTotalRequestsCount(task); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java ================================================ package us.codecraft.webmagic.scheduler; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** * Scheduler is the part of url management.
* You can implement interface Scheduler to do: * manage urls to fetch * remove duplicate urls * * @author code4crafter@gmail.com
* @since 0.1.0
 */
public interface Scheduler {

    /**
     * add a url to fetch
     *
     * @param request request
     * @param task task
     */
    public void push(Request request, Task task);

    /**
     * get an url to crawl
     *
     * @param task the task of spider
     * @return the url to crawl
     */
    public Request poll(Task task);

}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java
================================================
package us.codecraft.webmagic.scheduler.component;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;

/**
 * Remove duplicate requests.
 * @author code4crafer@gmail.com
 * @since 0.5.1
 */
public interface DuplicateRemover {
    /**
     *
     * Check whether the request is duplicate.
     *
     * @param request request
     * @param task task
     * @return true if is duplicate
     */
    public boolean isDuplicate(Request request, Task task);

    /**
     * Reset duplicate check.
     * @param task task
     */
    public void resetDuplicateCheck(Task task);

    /**
     * Get TotalRequestsCount for monitor.
     * @param task task
     * @return number of total request
     */
    public int getTotalRequestsCount(Task task);

}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java
================================================
package us.codecraft.webmagic.scheduler.component;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;

import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

/**
 * {@link DuplicateRemover} that remembers request URLs in an in-memory set
 * backed by a {@link ConcurrentHashMap}. The {@code task} argument of every
 * method is ignored, so one set serves all tasks.
 *
 * @author code4crafer@gmail.com
 */
public class HashSetDuplicateRemover implements DuplicateRemover {

    // Keys seen so far, as produced by getUrl(Request).
    private Set urls = Collections.newSetFromMap(new ConcurrentHashMap());

    /**
     * Records the request's key and reports whether it had been recorded before.
     * Checking and recording happen in the single {@code Set.add} call.
     *
     * @param request request
     * @param task task (ignored)
     * @return true if the key was already present
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        return !urls.add(getUrl(request));
    }

    /**
     * Returns the key used to identify a request; by default its URL.
     * Subclasses may override this to change what counts as a duplicate.
     *
     * @param request request
     * @return the deduplication key
     */
    protected String getUrl(Request request) {
        return request.getUrl();
    }

    /**
     * Forgets all recorded keys.
     *
     * @param task task (ignored)
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        urls.clear();
    }

    /**
     * @param task task (ignored)
     * @return the number of distinct keys recorded so far
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return urls.size();
    }
}
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html
================================================
Component of scheduler.
================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html
================================================
Scheduler is the part of url management.
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; import org.apache.commons.collections4.CollectionUtils; /** * @author code4crafer@gmail.com * @since 0.5.2 */ public abstract class AbstractSelectable implements Selectable { protected abstract List getSourceTexts(); @Override public Selectable css(String selector) { return $(selector); } @Override public Selectable css(String selector, String attrName) { return $(selector, attrName); } protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); for (String string : strings) { String result = selector.select(string); if (result != null) { results.add(result); } } return new PlainText(results); } protected Selectable selectList(Selector selector, List strings) { List results = new ArrayList(); for (String string : strings) { List result = selector.selectList(string); results.addAll(result); } return new PlainText(results); } @Override public List all() { return getSourceTexts(); } @Override public Selectable jsonPath(String jsonPath) { throw new UnsupportedOperationException(); } @Override public String get() { List sourceTexts = all(); if (CollectionUtils.isNotEmpty(sourceTexts)) { return sourceTexts.get(0); } return null; } @Override public Selectable select(Selector selector) { return select(selector, getSourceTexts()); } @Override public Selectable selectList(Selector selector) { return selectList(selector, getSourceTexts()); } @Override public Selectable regex(String regex) { RegexSelector regexSelector = Selectors.regex(regex); return selectList(regexSelector, getSourceTexts()); } @Override public Selectable regex(String regex, int group) { RegexSelector regexSelector = Selectors.regex(regex, group); return selectList(regexSelector, getSourceTexts()); } 
@Override public Selectable replace(String regex, String replacement) { ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); return select(replaceSelector, getSourceTexts()); } public String getFirstSourceText() { List sourceTexts = getSourceTexts(); if (CollectionUtils.isNotEmpty(sourceTexts)) { return sourceTexts.get(0); } return null; } @Override public String toString() { return get(); } @Override public boolean match() { return CollectionUtils.isNotEmpty(getSourceTexts()); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; /** * All selectors will be arranged as a pipeline.
* The next selector uses the result of the previous as source. * @author code4crafter@gmail.com
* @since 0.2.0 */ public class AndSelector implements Selector { private List selectors = new ArrayList(); public AndSelector(Selector... selectors) { for (Selector selector : selectors) { this.selectors.add(selector); } } public AndSelector(List selectors) { this.selectors = selectors; } @Override public String select(String text) { for (Selector selector : selectors) { if (text == null) { return null; } text = selector.select(text); } return text; } @Override public List selectList(String text) { List results = new ArrayList(); boolean first = true; for (Selector selector : selectors) { if (first) { results = selector.selectList(text); first = false; } else { List resultsTemp = new ArrayList(); for (String result : results) { resultsTemp.addAll(selector.selectList(result)); } results = resultsTemp; if (results == null || results.size() == 0) { return results; } } } return results; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java ================================================ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; /** * @author code4crafter@gmail.com * @since 0.3.0 */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } @Override public String select(String text) { if (text != null) { return select(parse(text)); } return null; } @Override public List selectList(String text) { if (text != null) { return selectList(parse(text)); } else { return new ArrayList(); } } public Element selectElement(String text) { if (text != null) { return 
selectElement(parse(text)); } return null; } public List selectElements(String text) { if (text != null) { return selectElements(parse(text)); } else { return new ArrayList(); } } public abstract Element selectElement(Element element); public abstract List selectElements(Element element); public abstract boolean hasAttribute(); } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; /** * CSS selector. Based on Jsoup. * * @author code4crafter@gmail.com
* @since 0.1.0
 */
public class CssSelector extends BaseElementSelector {

    private String selectorText;

    private String attrName;

    /**
     * @param selectorText css query; results are the matched elements' outer html
     */
    public CssSelector(String selectorText) {
        this.selectorText = selectorText;
    }

    /**
     * @param selectorText css query
     * @param attrName     attribute to extract; the pseudo names {@code innerHtml},
     *                     {@code text} and {@code allText} (case-insensitive) are also accepted
     */
    public CssSelector(String selectorText, String attrName) {
        this.selectorText = selectorText;
        this.attrName = attrName;
    }

    /**
     * Extracts the configured value from a matched element: outer html when no
     * attribute name is set, otherwise inner html, own text, all text or the
     * named attribute.
     */
    private String getValue(Element element) {
        if (attrName == null) {
            return element.outerHtml();
        } else if ("innerHtml".equalsIgnoreCase(attrName)) {
            return element.html();
        } else if ("text".equalsIgnoreCase(attrName)) {
            return getText(element);
        } else if ("allText".equalsIgnoreCase(attrName)) {
            return element.text();
        } else {
            return element.attr(attrName);
        }
    }

    /**
     * Concatenates only the direct text-node children of the element, i.e. the
     * text of nested elements is not included.
     *
     * @param element element
     * @return own text of the element
     */
    protected String getText(Element element) {
        StringBuilder accum = new StringBuilder();
        for (Node node : element.childNodes()) {
            if (node instanceof TextNode) {
                TextNode textNode = (TextNode) node;
                accum.append(textNode.text());
            }
        }
        return accum.toString();
    }

    @Override
    public String select(Element element) {
        List<Element> elements = selectElements(element);
        if (CollectionUtils.isEmpty(elements)) {
            return null;
        }
        return getValue(elements.get(0));
    }

    @Override
    public List<String> selectList(Element doc) {
        List<String> strings = new ArrayList<String>();
        List<Element> elements = selectElements(doc);
        if (CollectionUtils.isNotEmpty(elements)) {
            for (Element element : elements) {
                String value = getValue(element);
                if (value != null) {
                    strings.add(value);
                }
            }
        }
        return strings;
    }

    @Override
    public Element selectElement(Element element) {
        Elements elements = element.select(selectorText);
        if (CollectionUtils.isNotEmpty(elements)) {
            return elements.get(0);
        }
        return null;
    }

    @Override
    public List<Element> selectElements(Element element) {
        return element.select(selectorText);
    }

    @Override
    public boolean hasAttribute() {
        return attrName != null;
    }
}

================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
================================================
package
us.codecraft.webmagic.selector; import org.jsoup.nodes.Element; import java.util.List; /** * Selector(extractor) for html elements.
* * @author code4crafter@gmail.com
* @since 0.3.0 */ public interface ElementSelector { /** * Extract single result in text.
* If there are more than one result, only the first will be chosen. * * @param element element * @return result */ public String select(Element element); /** * Extract all results in text.
* * @param element element * @return results */ public List selectList(Element element); } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java ================================================ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collections; import java.util.List; /** * Selectable html.
* * @author code4crafter@gmail.com
* @since 0.1.0 */ public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); /** * Disable jsoup html entity escape. It can be set just before any Html instance is created. * @deprecated */ public static boolean DISABLE_HTML_ENTITY_ESCAPE = false; /** * Store parsed document for better performance when only one text exist. */ private Document document; public Html(String text, String url) { try { this.document = Jsoup.parse(text, url); } catch (Exception e) { this.document = null; logger.warn("parse document error ", e); } } public Html(String text) { try { this.document = Jsoup.parse(text); } catch (Exception e) { this.document = null; logger.warn("parse document error ", e); } } public Html(Document document) { this.document = document; } public Document getDocument() { return document; } @Override protected List getElements() { return Collections.singletonList(getDocument()); } /** * @param selector selector * @return result */ public String selectDocument(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); } else { return selector.select(getFirstSourceText()); } } public List selectDocumentForList(Selector selector) { if (selector instanceof ElementSelector) { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); } else { return selector.selectList(getFirstSourceText()); } } public static Html create(String text) { return new Html(text); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java ================================================ package us.codecraft.webmagic.selector; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; /** * @author code4crafer@gmail.com */ public 
class HtmlNode extends AbstractSelectable { private final List elements; public HtmlNode(List elements) { this.elements = elements; } public HtmlNode() { elements = null; } protected List getElements() { return elements; } public Selectable smartContent() { SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, getSourceTexts()); } public Selectable smartContent(int threshold) { SmartContentSelector smartContentSelector = Selectors.smartContent(threshold); return select(smartContentSelector, getSourceTexts()); } @Override public Selectable links() { return selectElements(new LinksSelector()); } @Override public Selectable xpath(String xpath) { XpathSelector xpathSelector = Selectors.xpath(xpath); return selectElements(xpathSelector); } @Override public Selectable selectList(Selector selector) { if (selector instanceof BaseElementSelector) { return selectElements((BaseElementSelector) selector); } return selectList(selector, getSourceTexts()); } @Override public Selectable select(Selector selector) { return selectList(selector); } /** * select elements * * @param elementSelector elementSelector * @return result */ protected Selectable selectElements(BaseElementSelector elementSelector) { ListIterator elementIterator = getElements().listIterator(); if (!elementSelector.hasAttribute()) { List resultElements = new ArrayList(); while (elementIterator.hasNext()) { Element element = checkElementAndConvert(elementIterator); List selectElements = elementSelector.selectElements(element); resultElements.addAll(selectElements); } return new HtmlNode(resultElements); } else { // has attribute, consider as plaintext List resultStrings = new ArrayList(); while (elementIterator.hasNext()) { Element element = checkElementAndConvert(elementIterator); List selectList = elementSelector.selectList(element); resultStrings.addAll(selectList); } return new PlainText(resultStrings); } } /** * Only document can be select * See: 
https://github.com/code4craft/webmagic/issues/113 * * @param elementIterator elementIterator * @return element element */ private Element checkElementAndConvert(ListIterator elementIterator) { Element element = elementIterator.next(); if (!(element instanceof Document)) { Document root = new Document(element.ownerDocument().baseUri()); Element clone = element.clone(); root.appendChild(clone); elementIterator.set(root); return root; } return element; } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); return selectElements(cssSelector); } @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); return selectElements(cssSelector); } @Override public List nodes() { List selectables = new ArrayList(); for (Element element : getElements()) { List childElements = new ArrayList(1); childElements.add(element); selectables.add(new HtmlNode(childElements)); } return selectables; } @Override protected List getSourceTexts() { List sourceTexts = new ArrayList(getElements().size()); for (Element element : getElements()) { sourceTexts.add(element.toString()); } return sourceTexts; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java ================================================ package us.codecraft.webmagic.selector; import com.alibaba.fastjson.JSON; import us.codecraft.xsoup.XTokenQueue; import java.util.List; /** * parse json * @author code4crafter@gmail.com * @since 0.5.0 */ public class Json extends PlainText { public Json(List strings) { super(strings); } public Json(String text) { super(text); } /** * remove padding for JSONP * @param padding padding * @return json after padding removed */ public Json removePadding(String padding) { String text = getFirstSourceText(); XTokenQueue tokenQueue = new XTokenQueue(text); tokenQueue.consumeWhitespace(); tokenQueue.consume(padding); 
tokenQueue.consumeWhitespace(); String chompBalanced = tokenQueue.chompBalancedNotInQuotes('(', ')'); return new Json(chompBalanced); } public T toObject(Class clazz) { if (getFirstSourceText() == null) { return null; } return JSON.parseObject(getFirstSourceText(), clazz); } public List toList(Class clazz) { if (getFirstSourceText() == null) { return null; } return JSON.parseArray(getFirstSourceText(), clazz); } @Override public Selectable jsonPath(String jsonPath) { JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); return selectList(jsonPathSelector,getSourceTexts()); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; import java.util.Map; import com.alibaba.fastjson.JSON; import com.jayway.jsonpath.JsonPath; /** * JsonPath selector.
* Used to extract content from JSON.
* * @author code4crafter@gmail.com
* @since 0.2.1 */ public class JsonPathSelector implements Selector { private final String jsonPathStr; private final JsonPath jsonPath; public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; this.jsonPath = JsonPath.compile(this.jsonPathStr); } @SuppressWarnings("unused") public String getJsonPathStr() { return jsonPathStr; } @Override public String select(String text) { Object object = jsonPath.read(text); if (object == null) { return null; } if (object instanceof List) { List list = (List) object; if (list.size() > 0) { return toString(list.iterator().next()); } } return object.toString(); } private String toString(Object object) { if (object instanceof Map) { return JSON.toJSONString(object); } else { return String.valueOf(object); } } @Override @SuppressWarnings("unchecked") public List selectList(String text) { List list = new ArrayList<>(); Object object = jsonPath.read(text); if (object == null) { return list; } if (object instanceof List) { List items = (List) object; for (Object item : items) { list.add(toString(item)); } } else { list.add(toString(object)); } return list; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * Links selector based on jsoup. Use absolute url.
* * @author code4crafter@gmail.com
* @since 0.7.0 */ public class LinksSelector extends BaseElementSelector { @Override public String select(Element element) { throw new UnsupportedOperationException(); } @Override public List selectList(Element element) { Elements elements = element.select("a"); List links = new ArrayList<>(elements.size()); for (Element element0 : elements) { if (StringUtils.isNotBlank(element0.baseUri())) { links.add(element0.attr("abs:href")); } else { links.add(element0.attr("href")); } } return links; } @Override public Element selectElement(Element element) { throw new UnsupportedOperationException(); } @Override public List selectElements(Element element) { throw new UnsupportedOperationException(); } @Override public boolean hasAttribute() { return true; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; /** * All extractors will do extracting separately,
* and the results of the extractors will be combined as the final result. * @author code4crafter@gmail.com
* @since 0.2.0 */ public class OrSelector implements Selector { private List selectors = new ArrayList(); public OrSelector(Selector... selectors) { for (Selector selector : selectors) { this.selectors.add(selector); } } public OrSelector(List selectors) { this.selectors = selectors; } @Override public String select(String text) { for (Selector selector : selectors) { String result = selector.select(text); if (result != null) { return result; } } return null; } @Override public List selectList(String text) { List results = new ArrayList(); for (Selector selector : selectors) { List strings = selector.selectList(text); results.addAll(strings); } return results; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java ================================================ package us.codecraft.webmagic.selector; import java.util.ArrayList; import java.util.List; /** * Selectable plain text.
* Can not be selected by XPath or CSS Selector. * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class PlainText extends AbstractSelectable { protected List sourceTexts; public PlainText(List sourceTexts) { this.sourceTexts = sourceTexts; } public PlainText(String text) { this.sourceTexts = new ArrayList(); sourceTexts.add(text); } public static PlainText create(String text) { return new PlainText(text); } @Override public Selectable xpath(String xpath) { throw new UnsupportedOperationException("XPath can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } @Override public Selectable $(String selector) { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } @Override public Selectable $(String selector, String attrName) { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } @Override public Selectable links() { throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } @Override public List nodes() { List nodes = new ArrayList(getSourceTexts().size()); for (String string : getSourceTexts()) { nodes.add(PlainText.create(string)); } return nodes; } @Override protected List getSourceTexts() { return sourceTexts; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java ================================================ package us.codecraft.webmagic.selector; /** * Object contains regex results.
* For multi group result extension.
* * @author code4crafter@gmail.com
* @since 0.1.0 */ class RegexResult { private String[] groups; public static final RegexResult EMPTY_RESULT = new RegexResult(); public RegexResult() { } public RegexResult(String[] groups) { this.groups = groups; } public String get(int groupId) { if (groups == null) { return null; } return groups[groupId]; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java ================================================ package us.codecraft.webmagic.selector; import org.apache.commons.lang3.StringUtils; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** * Selector in regex.
* * @author code4crafter@gmail.com
* @since 0.1.0
 */
public class RegexSelector implements Selector {

    private String regexStr;

    private Pattern regex;

    private int group = 1;

    /**
     * @param regexStr the regular expression
     * @param group    index of the capture group to extract
     * @throws IllegalArgumentException when the expression is blank or malformed
     */
    public RegexSelector(String regexStr, int group) {
        this.compileRegex(regexStr);
        this.group = group;
    }

    /**
     * Compiles the expression with DOTALL and CASE_INSENSITIVE.
     */
    private void compileRegex(String regexStr) {
        if (StringUtils.isBlank(regexStr)) {
            throw new IllegalArgumentException("regex must not be empty");
        }
        try {
            this.regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
            this.regexStr = regexStr;
        } catch (PatternSyntaxException e) {
            throw new IllegalArgumentException("invalid regex "+regexStr, e);
        }
    }

    /**
     * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1.
     * @param regexStr the regular expression.
     */
    public RegexSelector(String regexStr) {
        this.compileRegex(regexStr);
        if (regex.matcher("").groupCount() == 0) {
            this.group = 0;
        } else {
            this.group = 1;
        }
    }

    /**
     * @param text text to search
     * @return the configured group of the first match, or {@code null} when nothing matches
     */
    @Override
    public String select(String text) {
        return selectGroup(text).get(group);
    }

    /**
     * @param text text to search
     * @return the configured group of every match, in match order
     */
    @Override
    public List<String> selectList(String text) {
        List<String> strings = new ArrayList<String>();
        for (RegexResult result : selectGroupList(text)) {
            strings.add(result.get(group));
        }
        return strings;
    }

    /**
     * @param text text to search
     * @return all groups of the first match, or {@link RegexResult#EMPTY_RESULT} when nothing matches
     */
    public RegexResult selectGroup(String text) {
        Matcher matcher = regex.matcher(text);
        if (matcher.find()) {
            return toResult(matcher);
        }
        return RegexResult.EMPTY_RESULT;
    }

    /**
     * @param text text to search
     * @return all groups of every match, in match order
     */
    public List<RegexResult> selectGroupList(String text) {
        Matcher matcher = regex.matcher(text);
        List<RegexResult> resultList = new ArrayList<RegexResult>();
        while (matcher.find()) {
            resultList.add(toResult(matcher));
        }
        return resultList;
    }

    /**
     * Copies group 0 (the whole match) and all capture groups of the current match.
     */
    private static RegexResult toResult(Matcher matcher) {
        String[] groups = new String[matcher.groupCount() + 1];
        for (int i = 0; i < groups.length; i++) {
            groups[i] = matcher.group(i);
        }
        return new RegexResult(groups);
    }

    @Override
    public String toString() {
        return regexStr;
    }

}

================================================
FILE:
webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** * Replace selector.
* * @author code4crafter@gmail.com
* @since 0.1.0
 */
public class ReplaceSelector implements Selector {

    private String regexStr;

    private String replacement;

    private Pattern regex;

    /**
     * @param regexStr    regular expression to search for
     * @param replacement replacement, in {@link Matcher#replaceAll(String)} syntax
     * @throws IllegalArgumentException when the expression is malformed
     */
    public ReplaceSelector(String regexStr, String replacement) {
        this.regexStr = regexStr;
        this.replacement = replacement;
        try {
            regex = Pattern.compile(regexStr);
        } catch (PatternSyntaxException e) {
            // Name the offending expression, as RegexSelector does.
            throw new IllegalArgumentException("invalid regex " + regexStr, e);
        }
    }

    /**
     * @param text text to rewrite
     * @return {@code text} with every match replaced
     */
    @Override
    public String select(String text) {
        Matcher matcher = regex.matcher(text);
        return matcher.replaceAll(replacement);
    }

    /**
     * Not supported: a replacement always yields exactly one text.
     */
    @Override
    public List<String> selectList(String text) {
        throw new UnsupportedOperationException();
    }

    @Override
    public String toString() {
        return regexStr + "_" + replacement;
    }

}

================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
================================================
package us.codecraft.webmagic.selector;

import java.util.List;

/**
 * Selectable text.
* * @author code4crafter@gmail.com
* @since 0.1.0
 */
public interface Selectable {

    /**
     * select list with xpath
     *
     * @param xpath xpath
     * @return new Selectable after extract
     */
    Selectable xpath(String xpath);

    /**
     * select list with css selector
     *
     * @param selector css selector expression
     * @return new Selectable after extract
     */
    Selectable $(String selector);

    /**
     * select list with css selector
     *
     * @param selector css selector expression
     * @param attrName attribute name of css selector
     * @return new Selectable after extract
     */
    Selectable $(String selector, String attrName);

    /**
     * select list with css selector
     *
     * @param selector css selector expression
     * @return new Selectable after extract
     */
    Selectable css(String selector);

    /**
     * select list with css selector
     *
     * @param selector css selector expression
     * @param attrName attribute name of css selector
     * @return new Selectable after extract
     */
    Selectable css(String selector, String attrName);

    /**
     * select all links
     *
     * @return all links
     */
    Selectable links();

    /**
     * select list with regex, default group is group 1
     * (the built-in RegexSelector uses group 0, the whole match, when the
     * expression has no capture group)
     *
     * @param regex regex
     * @return new Selectable after extract
     */
    Selectable regex(String regex);

    /**
     * select list with regex
     *
     * @param regex regex
     * @param group group
     * @return new Selectable after extract
     */
    Selectable regex(String regex, int group);

    /**
     * replace with regex
     *
     * @param regex       regex
     * @param replacement replacement
     * @return new Selectable after extract
     */
    Selectable replace(String regex, String replacement);

    /**
     * single string result
     *
     * @return single string result
     */
    String toString();

    /**
     * single string result
     *
     * @return single string result
     */
    String get();

    /**
     * if result exist for select
     *
     * @return true if result exist
     */
    boolean match();

    /**
     * multi string result
     *
     * @return multi string result
     */
    List<String> all();

    /**
     * extract by JSON Path expression
     *
     * @param jsonPath jsonPath
     * @return result
     */
    Selectable jsonPath(String jsonPath);

    /**
     * extract by custom selector
     *
     * @param selector selector
     * @return result
     */
    Selectable select(Selector selector);

    /**
     * extract by custom selector
     *
     * @param selector selector
     * @return result
     */
    Selectable selectList(Selector selector);

    /**
     * get all nodes
     * @return result
     */
    List<Selectable> nodes();

}

================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
================================================
package us.codecraft.webmagic.selector;

import java.util.List;

/**
 * Selector(extractor) for text.
* * @author code4crafter@gmail.com
* @since 0.1.0
 */
public interface Selector {

    /**
     * Extract single result in text.
     * If there are more than one result, only the first will be chosen.
     *
     * @param text text
     * @return result
     */
    String select(String text);

    /**
     * Extract all results in text.
     *
     * @param text text
     * @return results
     */
    List<String> selectList(String text);

}

================================================
FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
================================================
package us.codecraft.webmagic.selector;

/**
 * Convenient methods for selectors.
* * @author code4crafter@gmail.com
* @since 0.2.1 */ public abstract class Selectors { public static RegexSelector regex(String expr) { return new RegexSelector(expr); } public static RegexSelector regex(String expr, int group) { return new RegexSelector(expr,group); } public static SmartContentSelector smartContent() { return new SmartContentSelector(); } public static SmartContentSelector smartContent(int threshold) { return new SmartContentSelector(threshold); } public static CssSelector $(String expr) { return new CssSelector(expr); } public static CssSelector $(String expr, String attrName) { return new CssSelector(expr, attrName); } public static XpathSelector xpath(String expr) { return new XpathSelector(expr); } /** * @see #xpath(String) * @param expr expr * @return new selector */ @Deprecated public static XpathSelector xsoup(String expr) { return new XpathSelector(expr); } public static AndSelector and(Selector... selectors) { return new AndSelector(selectors); } public static OrSelector or(Selector... selectors) { return new OrSelector(selectors); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java ================================================ package us.codecraft.webmagic.selector; import us.codecraft.webmagic.utils.Experimental; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Borrowed from https://code.google.com/p/cx-extractor/ * * @author code4crafter@gmail.com
* @since 0.4.1 * */ @Experimental public class SmartContentSelector implements Selector { private int threshold = 86; public SmartContentSelector() { } public SmartContentSelector(int threshold) { this.threshold = threshold; } @Override public String select(String html) { html = html.replaceAll("(?is)", ""); html = html.replaceAll("(?is)", ""); // remove html comment html = html.replaceAll("(?is).*?", ""); // remove javascript html = html.replaceAll("(?is).*?", ""); // remove css html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char html = html.replaceAll("(?is)<.*?>", ""); List lines; int blocksWidth =3; int start; int end; StringBuilder text = new StringBuilder(); ArrayList indexDistribution = new ArrayList(); lines = Arrays.asList(html.split("\n")); for (int i = 0; i < lines.size() - blocksWidth; i++) { int wordsNum = 0; for (int j = i; j < i + blocksWidth; j++) { lines.set(j, lines.get(j).replaceAll("\\s+", "")); wordsNum += lines.get(j).length(); } indexDistribution.add(wordsNum); } start = -1; end = -1; boolean boolstart = false, boolend = false; text.setLength(0); for (int i = 0; i < indexDistribution.size() - 1; i++) { if (indexDistribution.get(i) > threshold && ! 
boolstart) { if (indexDistribution.get(i+1).intValue() != 0 || indexDistribution.get(i+2).intValue() != 0 || indexDistribution.get(i+3).intValue() != 0) { boolstart = true; start = i; continue; } } if (boolstart) { if (indexDistribution.get(i).intValue() == 0 || indexDistribution.get(i+1).intValue() == 0) { end = i; boolend = true; } } StringBuilder tmp = new StringBuilder(); if (boolend) { //System.out.println(start+1 + "\t\t" + end+1); for (int ii = start; ii <= end; ii++) { if (lines.get(ii).length() < 5) continue; tmp.append(lines.get(ii) + "\n"); } String str = tmp.toString(); //System.out.println(str); if (str.contains("Copyright") ) continue; text.append(str); boolstart = boolend = false; } } return text.toString(); } @Override public List selectList(String text) { throw new UnsupportedOperationException(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java ================================================ package us.codecraft.webmagic.selector; import java.util.List; import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; /** * XPath selector based on Xsoup.
* * @author code4crafter@gmail.com
* @since 0.3.0 */ public class XpathSelector extends BaseElementSelector { private XPathEvaluator xPathEvaluator; public XpathSelector(String xpathStr) { this.xPathEvaluator = Xsoup.compile(xpathStr); } @Override public String select(Element element) { return xPathEvaluator.evaluate(element).get(); } @Override public List selectList(Element element) { return xPathEvaluator.evaluate(element).list(); } @Override public Element selectElement(Element element) { List elements = selectElements(element); if (CollectionUtils.isNotEmpty(elements)){ return elements.get(0); } return null; } @Override public List selectElements(Element element) { return xPathEvaluator.evaluate(element).getElements(); } @Override public boolean hasAttribute() { return xPathEvaluator.hasAttribute(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html ================================================ Selectors for page extraction. Core API is the interface Selectable,and internal core is the interface Selector。 ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/thread/CountableThreadPool.java ================================================ package us.codecraft.webmagic.thread; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; /** * Thread pool for workers.

* Use {@link java.util.concurrent.ExecutorService} as inner implement.

* New feature:

* 1. Block when thread pool is full to avoid poll many urls without process.

* 2. Count of thread alive for monitor. * * @author code4crafer@gmail.com * @since 0.5.0 */ public class CountableThreadPool { private int threadNum; private AtomicInteger threadAlive = new AtomicInteger(); private ReentrantLock reentrantLock = new ReentrantLock(); private Condition condition = reentrantLock.newCondition(); public CountableThreadPool(int threadNum) { this.threadNum = threadNum; this.executorService = Executors.newFixedThreadPool(threadNum); } public CountableThreadPool(int threadNum, ExecutorService executorService) { this.threadNum = threadNum; this.executorService = executorService; } public void setExecutorService(ExecutorService executorService) { this.executorService = executorService; } public int getThreadAlive() { return threadAlive.get(); } public int getThreadNum() { return threadNum; } private ExecutorService executorService; public void execute(final Runnable runnable) { if (threadAlive.get() >= threadNum) { try { reentrantLock.lock(); while (threadAlive.get() >= threadNum) { try { condition.await(); } catch (InterruptedException e) { } } } finally { reentrantLock.unlock(); } } threadAlive.incrementAndGet(); executorService.execute(new Runnable() { @Override public void run() { try { runnable.run(); } finally { try { reentrantLock.lock(); threadAlive.decrementAndGet(); condition.signal(); } finally { reentrantLock.unlock(); } } } }); } public boolean isShutdown() { return executorService.isShutdown(); } public void shutdown() { executorService.shutdown(); } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java ================================================ package us.codecraft.webmagic.utils; /** * @author hooy */ public class BaseSelectorUtils { /** * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag * * @param text - the html string * @return text */ public static 
String preParse(String text) { if (((text.startsWith("") || text.startsWith("")) || ((text.startsWith("") || text.startsWith(""))) { text = "" + text + "
"; } return text; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java ================================================ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.charset.Charset; /** * @author code4crafter@gmail.com * Date: 17/3/11 * Time: 10:36 * @since 0.6.2 */ public abstract class CharsetUtils { private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); private CharsetUtils() { throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!"); } public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { String charset; // charset // 1、encoding in http header Content-Type charset = UrlUtils.getCharset(contentType); if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; } // use default charset to decode first time Charset defaultCharset = Charset.defaultCharset(); String content = new String(contentBytes, defaultCharset); // 2、charset in meta if (StringUtils.isNotEmpty(content)) { Document document = Jsoup.parse(content); Elements links = document.select("meta"); for (Element link : links) { // 2.1、html4.01 String metaContent = link.attr("content"); String metaCharset = link.attr("charset"); if (metaContent.indexOf("charset") != -1) { metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); charset = metaContent.split("=")[1]; break; } // 2.2、html5 else if (StringUtils.isNotEmpty(metaCharset)) { charset = metaCharset; break; } } } logger.debug("Auto get charset: {}", charset); // 3、todo use tools as cpdetector for content decode 
return charset; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java ================================================ package us.codecraft.webmagic.utils; /** * Stands for features unstable. * @author code4crafter@gmail.com
*/ public @interface Experimental { } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java ================================================ package us.codecraft.webmagic.utils; import java.io.File; /** * Base object of file persistence. * * @author code4crafter@gmail.com
* @since 0.2.0 */ public class FilePersistentBase { protected String path; public static String PATH_SEPERATOR = "/"; static { String property = System.getProperties().getProperty("file.separator"); if (property != null) { PATH_SEPERATOR = property; } } public void setPath(String path) { if (!path.endsWith(PATH_SEPERATOR)) { path += PATH_SEPERATOR; } this.path = path; } public File getFile(String fullName) { checkAndMakeParentDirecotry(fullName); return new File(fullName); } public void checkAndMakeParentDirecotry(String fullName) { int index = fullName.lastIndexOf(PATH_SEPERATOR); if (index > 0) { String path = fullName.substring(0, index); File file = new File(path); if (!file.exists()) { file.mkdirs(); } } } public String getPath() { return path; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java ================================================ package us.codecraft.webmagic.utils; import org.apache.http.Header; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * @author code4crafter@gmail.com * Date: 17/3/27 */ public abstract class HttpClientUtils { public static Map> convertHeaders(Header[] headers){ Map> results = new HashMap>(); for (Header header : headers) { List list = results.get(header.getName()); if (list == null) { list = new ArrayList(); results.put(header.getName(), list); } list.add(header.getValue()); } return results; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java ================================================ package us.codecraft.webmagic.utils; /** * Some constants of Http protocal. 
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public abstract class HttpConstant {

    /** Names of HTTP request methods. */
    public static abstract class Method {

        public static final String GET = "GET";

        public static final String HEAD = "HEAD";

        public static final String POST = "POST";

        public static final String PUT = "PUT";

        public static final String DELETE = "DELETE";

        public static final String TRACE = "TRACE";

        public static final String CONNECT = "CONNECT";

    }

    /** HTTP status codes. */
    public static abstract class StatusCode {

        public static final int CODE_200 = 200;

    }

    /** Names of HTTP request headers. */
    public static abstract class Header {

        public static final String REFERER = "Referer";

        public static final String USER_AGENT = "User-Agent";
    }

}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java ================================================
package us.codecraft.webmagic.utils;

/**
 * @author yihua.huang@dianping.com
 */
public abstract class NumberUtils {

    /**
     * Compares two {@code long} values; delegates to {@link Long#compare(long, long)}.
     *
     * @param o1 first value
     * @param o2 second value
     * @return the result of {@code Long.compare(o1, o2)}
     */
    public static int compareLong(long o1, long o2) {
        return Long.compare(o1, o2);
    }
}
================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java ================================================
package us.codecraft.webmagic.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.proxy.Proxy;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;

/**
 * Utility for checking that a proxy endpoint accepts TCP connections.
 *
 * @author yxssfxwzy@sina.com
* @since 0.5.1 */ public class ProxyUtils { private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class); public static boolean validateProxy(Proxy p) { Socket socket = null; try { socket = new Socket(); InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort()); socket.connect(endpointSocketAddr, 3000); return true; } catch (IOException e) { logger.warn("FAILRE - CAN not connect! remote: " + p); return false; } finally { if (socket != null) { try { socket.close(); } catch (IOException e) { logger.warn("Error occurred while closing socket of validating proxy", e); } } } } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java ================================================ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Request; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * url and html utils. * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class UrlUtils { /** * canonicalizeUrl *
* Borrowed from Jsoup. * * @param url url * @param refer refer * @return canonicalizeUrl */ public static String canonicalizeUrl(String url, String refer) { URL base; try { try { base = new URL(refer); } catch (MalformedURLException e) { // the base is unsuitable, but the attribute may be abs on its own, so try that URL abs = new URL(refer); return abs.toExternalForm(); } // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); return abs.toExternalForm(); } catch (MalformedURLException e) { return ""; } } /** * * @param url url * @return new url * @deprecated */ public static String encodeIllegalCharacterInUrl(String url) { return url.replace(" ", "%20"); } public static String fixIllegalCharacterInUrl(String url) { //TODO more charator support return url.replace(" ", "%20").replaceAll("#+", "#"); } public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); if (i > 0) { host = StringUtils.substring(url, 0, i); } return host; } private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); public static String removeProtocol(String url) { return patternForProtocal.matcher(url).replaceAll(""); } public static String getDomain(String url) { String domain = removeProtocol(url); int i = StringUtils.indexOf(domain, "/", 1); if (i > 0) { domain = StringUtils.substring(domain, 0, i); } return removePort(domain); } public static String removePort(String domain) { int portIndex = domain.indexOf(":"); if (portIndex != -1) { return domain.substring(0, portIndex); }else { return domain; } } public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { requestList.add(new Request(url)); } return requestList; } public static List convertToUrls(Collection requests) { List urlList = new ArrayList(requests.size()); for (Request 
request : requests) { urlList.add(request.getUrl()); } return urlList; } private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { if (contentType == null) { return null; } Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } return null; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java ================================================ package us.codecraft.webmagic.utils; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; /** * @author code4crafter@gmail.com * Date: 16/12/18 * Time: 上午10:16 */ public class WMCollections { public static Set newHashSet(T... t){ Set set = new HashSet(t.length); for (T t1 : t) { set.add(t1); } return set; } public static List newArrayList(T... t){ List list = new ArrayList(t.length); for (T t1 : t) { list.add(t1); } return list; } } ================================================ FILE: webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html ================================================ Static utils of webmagic. ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java ================================================ package us.codecraft.webmagic; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Selectable; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午8:42 */ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab"); } @Ignore("not work in jsoup 1.8.x") @Test public void testDisableJsoupHtmlEntityEscape() throws Exception { Html.DISABLE_HTML_ENTITY_ESCAPE = true; Html html = new Html("aaaaaaa&b"); assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b"); } @Test public void testEnableJsoupHtmlEntityEscape() throws Exception { Html html = new Html("aaaaaaa&b"); assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b"); } @Test public void testAHrefExtract(){ Html html = new Html("xx"); assertThat(html.links().all()).contains("/xx/xx"); } @Test public void testNthNodesGet(){ Html html = new Html("xx"); assertThat(html.xpath("//a[1]/@href").get()).isEqualTo("/xx/xx"); Selectable selectable = html.xpath("//a[1]").nodes().get(0); assertThat(selectable.xpath("/a/@href").get()).isEqualTo("/xx/xx"); } @Test public void testGetHrefsByJsoup(){ Html html = new Html("issues","https://github.com/code4craft/webmagic/"); assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues"); assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg"); html = new Html("issues"); assertThat(html.xpath("//a[1]/@abs:href").get()).isEqualTo("https://github.com/code4craft/webmagic/issues"); assertThat(html.xpath("//img/@abs:src").get()).isEqualTo("https://github.com/code4craft/webmagic/webmagic.jpg"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java ================================================ package us.codecraft.webmagic; import static org.assertj.core.api.Assertions.assertThat; import java.util.Collections; import java.util.Map; import org.junit.Test; import 
us.codecraft.webmagic.utils.HttpConstant; /** * @author code4crafter@gmail.com * Date: 17/3/11 */ public class RequestTest { @Test public void testEqualsAndHashCode() throws Exception { Request requestA = new Request("http://www.google.com/"); Request requestB = new Request("http://www.google.com/"); assertThat(requestA.hashCode()).isEqualTo(requestB.hashCode()); assertThat(requestA).isEqualTo(requestB); requestA.setMethod(HttpConstant.Method.GET); requestA.setMethod(HttpConstant.Method.POST); assertThat(requestA).isNotEqualTo(requestB); assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode()); } @Test public void testSetExtras() { Request request = new Request(); Map extras = Collections.singletonMap("a", "1"); request.setExtras(extras); request.putExtra("b", "2"); assertThat(request.getExtra("a")).isEqualTo("1"); assertThat(request.getExtra("b")).isEqualTo("2"); } @Test public void testGetExtras() { Request request = new Request(); request.putExtra("a", "1"); assertThat(request.getExtras()).containsEntry("a", "1"); } @Test(expected = UnsupportedOperationException.class) public void testGetExtrasShouldBeUnmodifiable() { Request request = new Request(); request.getExtras().put("a", "1"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java ================================================ package us.codecraft.webmagic; import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com */ public class ResultItemsTest { @Test public void testOrderOfEntries() throws Exception { ResultItems resultItems = new ResultItems(); resultItems.put("a", "a"); resultItems.put("b", "b"); resultItems.put("c", "c"); assertThat(resultItems.getAll().keySet()).containsExactly("a","b","c"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java 
================================================ package us.codecraft.webmagic; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.junit.Test; public class SiteTest { @Test public void test() { Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); } @Test public void addCookieTest(){ Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); site.addCookie("cookieDefault","cookie-webmagicDefault"); String firstDomain="example.com"; String secondDomain="exampleCopy.com"; site.addCookie(firstDomain, "cookie", "cookie-webmagic"); site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy"); site.addCookie(secondDomain, "cookie", "cookie-webmagic"); Map> allCookies = site.getAllCookies(); List domains=new ArrayList<>(); for(String key : allCookies.keySet()){ domains.add(key); } assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie")); assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy")); assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie")); assertEquals(2, domains.size()); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java ================================================ package us.codecraft.webmagic; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.scheduler.Scheduler; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; /** * @author code4crafter@gmail.com */ public class SpiderTest { @Ignore("long 
time") @Test public void testStartAndStop() throws InterruptedException { Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*")).addPipeline(new Pipeline() { @Override public void process(ResultItems resultItems, Task task) { System.out.println(1); } }).thread(1).addUrl("http://www.oschina.net/"); spider.start(); Thread.sleep(10000); spider.stop(); Thread.sleep(10000); spider.start(); Thread.sleep(10000); } @Ignore("long time") @Test public void testWaitAndNotify() throws InterruptedException { for (int i = 0; i < 10000; i++) { System.out.println("round " + i); testRound(); } } private void testRound() { Spider spider = Spider.create(new PageProcessor() { private AtomicInteger count = new AtomicInteger(); @Override public void process(Page page) { page.setSkip(true); } @Override public Site getSite() { return Site.me().setSleepTime(0); } }).setDownloader(new Downloader() { @Override public Page download(Request request, Task task) { return new Page().setRawText(""); } @Override public void setThread(int threadNum) { } }).setScheduler(new Scheduler() { private AtomicInteger count = new AtomicInteger(); private Random random = new Random(); @Override public void push(Request request, Task task) { } @Override public synchronized Request poll(Task task) { if (count.incrementAndGet() > 1000) { return null; } if (random.nextInt(100)>90){ return null; } return new Request("test"); } }).thread(10); spider.run(); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java ================================================ package us.codecraft.webmagic.downloader; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Map; import org.apache.commons.collections4.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; 
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.junit.Test; import com.github.dreamhead.moco.HttpServer; import com.github.dreamhead.moco.Runnable; import com.github.dreamhead.moco.Runner; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HttpRequestBody; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.SimpleProxyProvider; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import static com.github.dreamhead.moco.Moco.and; import static com.github.dreamhead.moco.Moco.by; import static com.github.dreamhead.moco.Moco.cookie; import static com.github.dreamhead.moco.Moco.eq; import static com.github.dreamhead.moco.Moco.form; import static com.github.dreamhead.moco.Moco.header; import static com.github.dreamhead.moco.Moco.httpServer; import static com.github.dreamhead.moco.Moco.method; import static com.github.dreamhead.moco.Moco.not; import static com.github.dreamhead.moco.Moco.query; import static com.github.dreamhead.moco.Moco.text; import static com.github.dreamhead.moco.Moco.uri; import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** * @author code4crafer@gmail.com */ public class HttpClientDownloaderTest { public static final String PAGE_ALWAYS_NOT_EXISTS = "http://localhost:13423/404"; @Test public void testDownloader() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Html html = httpClientDownloader.download("https://www.baidu.com/"); assertTrue(!html.getFirstSourceText().isEmpty()); } @Test(expected = 
IllegalArgumentException.class) public void testDownloaderInIllegalUrl() throws UnsupportedEncodingException { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); httpClientDownloader.download("http://www.oschina.net/>"); } @Test public void test_download_fail() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask(); Request request = new Request(PAGE_ALWAYS_NOT_EXISTS); Page page = httpClientDownloader.download(request, task); assertThat(page.isDownloadSuccess()).isFalse(); } @Test public void testGetHtmlCharset() throws Exception { HttpServer server = httpServer(13423); server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk")); server.get(by(uri("/meta4"))).response(with(text("\n" + " \n" + " \n" + " \n" + " \n" + "")),header("Content-Type","text/html; charset=gbk")); server.get(by(uri("/meta5"))).response(with(text("\n" + " \n" + " \n" + " \n" + " \n" + "")),header("Content-Type","text/html")); Runner.running(server, new Runnable() { @Override public void run() { String charset = getCharsetByUrl("http://127.0.0.1:13423/header"); assertEquals(charset, "gbk"); charset = getCharsetByUrl("http://127.0.0.1:13423/meta4"); assertEquals(charset, "gbk"); charset = getCharsetByUrl("http://127.0.0.1:13423/meta5"); assertEquals(charset, "gbk"); } private String getCharsetByUrl(String url) { HttpClientDownloader downloader = new HttpClientDownloader(); Site site = Site.me(); CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); // encoding in http header Content-Type Request requestGBK = new Request(url); CloseableHttpResponse httpResponse = null; try { httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null).getHttpUriRequest()); } catch (IOException e) { e.printStackTrace(); } String charset = null; try { byte[] contentBytes = 
IOUtils.toByteArray(httpResponse.getEntity().getContent()); charset = CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); } catch (IOException e) { e.printStackTrace(); } return charset; } }); } @Test public void test_selectRequestMethod() throws Exception { final int port = 13423; HttpServer server = httpServer(port); server.get(eq(query("q"), "webmagic")).response("get"); server.post(eq(form("q"), "webmagic")).response("post"); server.put(eq(form("q"), "webmagic")).response("put"); server.delete(eq(query("q"), "webmagic")).response("delete"); server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head")); server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace"); final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); final Site site = Site.me(); Runner.running(server, new Runnable() { @Override public void run() throws Exception { Request request = new Request(); request.setUrl("http://127.0.0.1:" + port + "/search?q=webmagic"); request.setMethod(HttpConstant.Method.GET); Map params = new HashedMap(); params.put("q","webmagic"); HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get"); request.setMethod(HttpConstant.Method.DELETE); httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete"); request.setMethod(HttpConstant.Method.HEAD); httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest(); assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head"); request.setMethod(HttpConstant.Method.TRACE); httpUriRequest = 
httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace");
                // Re-target the same Request at /search and replay it as POST, then PUT, with a form body.
                request.setUrl("http://127.0.0.1:" + port + "/search");
                request.setMethod(HttpConstant.Method.POST);
                request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post");
                request.setMethod(HttpConstant.Method.PUT);
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put");
            }
        });
    }

    /**
     * A cookie added to the {@link Request} must reach the server: the stub on port 13423 is
     * configured with a cookie matcher for {@code cookie=cookie-webmagic} and answers "ok".
     */
    @Test
    public void test_set_request_cookie() throws Exception {
        HttpServer server = httpServer(13423);
        server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423");
                request.addCookie("cookie","cookie-webmagic");
                Page page = httpClientDownloader.download(request, Site.me().toTask());
                assertThat(page.getRawText()).isEqualTo("ok");
            }
        });
    }

    /**
     * Same request as {@code test_set_request_cookie}, but the site is built with
     * {@code setDisableCookieManagement(true)} and the stub matcher is negated with {@code not(...)},
     * so "ok" is expected only when the cookie does NOT match on the server side.
     */
    @Test
    public void test_disableCookieManagement() throws Exception {
        HttpServer server = httpServer(13423);
        server.get(not(eq(cookie("cookie"), "cookie-webmagic"))).response("ok");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423");
                request.addCookie("cookie","cookie-webmagic");
                Page page = httpClientDownloader.download(request, Site.me().setDisableCookieManagement(true).toTask());
                assertThat(page.getRawText()).isEqualTo("ok");
            }
        });
    }

    /**
     * A header added to the {@link Request} must reach the server: the stub is configured with a
     * header matcher for {@code header=header-webmagic} and answers "ok".
     */
    @Test
    public void test_set_request_header() throws Exception {
        HttpServer server = httpServer(13423);
        server.get(eq(header("header"), "header-webmagic")).response("ok");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423");
                request.addHeader("header","header-webmagic");
                Page page = httpClientDownloader.download(request, Site.me().toTask());
                assertThat(page.getRawText()).isEqualTo("ok");
            }
        });
    }

    /**
     * Same stub as {@code test_set_request_header}, but the header is configured on the
     * {@link Site} instead of on the request.
     */
    @Test
    public void test_set_site_header() throws Exception {
        HttpServer server = httpServer(13423);
        server.get(eq(header("header"), "header-webmagic")).response("ok");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423");
                Page page = httpClientDownloader.download(request, Site.me().addHeader("header","header-webmagic").toTask());
                assertThat(page.getRawText()).isEqualTo("ok");
            }
        });
    }

    /**
     * Same stub as {@code test_set_request_cookie}, but the cookie is configured on the
     * {@link Site}, which is also given the domain {@code 127.0.0.1}.
     */
    @Test
    public void test_set_site_cookie() throws Exception {
        HttpServer server = httpServer(13423);
        server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423");
                Site site = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1");
                Page page = httpClientDownloader.download(request, site.toTask());
                assertThat(page.getRawText()).isEqualTo("ok");
            }
        });
    }

    /**
     * Downloads from a stub that always answers "foo".
     * NOTE(review): despite the name, this test passes {@code Site.me().toTask()}, not a null
     * task; the null-task case is exercised by {@code test_no_task_download}.
     */
    @Test
    public void test_download_when_task_is_null() throws Exception {
        HttpServer server = httpServer(13423);
        server.response("foo");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423/");
                Page page = httpClientDownloader.download(request, Site.me().toTask());
                assertThat(page.getRawText()).isEqualTo("foo");
            }
        });
    }

    /**
     * Uses the local stub (port 13423) as the proxy and expects it to see a
     * {@code Proxy-Authorization} header; the Base64 payload decodes to "username:password",
     * the credentials given to the {@link Proxy}.
     */
    @Test
    public void test_download_auth_by_SimpleProxyProvider() throws Exception {
        HttpServer server = httpServer(13423);
        server.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy("127.0.0.1", 13423, "username", "password")));
                Request request = new Request();
                request.setUrl("http://www.baidu.com");
                Page page = httpClientDownloader.download(request, Site.me().toTask());
                assertThat(page.getRawText()).isEqualTo("ok");
            }
        });
    }

    /**
     * With {@code setBinaryContent(true)} the page is expected to carry the raw bytes of the
     * response and a null raw text.
     */
    @Test
    public void test_download_binary_content() throws Exception {
        HttpServer server = httpServer(13423);
        server.response("binary");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setBinaryContent(true);
                request.setUrl("http://127.0.0.1:13423/");
                Page page = httpClientDownloader.download(request, Site.me().toTask());
                assertThat(page.getRawText()).isNull();
                assertThat(page.getBytes()).isEqualTo("binary".getBytes());
            }
        });
    }

    /**
     * The stub declares {@code charset=utf-8} in its Content-Type header; the downloaded page is
     * expected to report "utf-8" as its charset.
     */
    @Test
    public void test_download_set_charset() throws Exception {
        HttpServer server = httpServer(13423);
        server.response(header("Content-Type","text/html; charset=utf-8")).response("hello world!");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setUrl("http://127.0.0.1:13423/");
                Page page = httpClientDownloader.download(request, Site.me().toTask());
                assertThat(page.getCharset()).isEqualTo("utf-8");
            }
        });
    }

    /**
     * The request sets charset "utf-8" while the site sets "gbk"; the page is expected to report
     * the request-level value.
     */
    @Test
    public void test_download_set_request_charset() throws Exception {
        HttpServer server = httpServer(13423);
        server.response("hello world!");
        Runner.running(server, new Runnable() {
            @Override
            public void run() throws Exception {
                final HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
                Request request = new Request();
                request.setCharset("utf-8");
                request.setUrl("http://127.0.0.1:13423/");
                Page page = httpClientDownloader.download(request, Site.me().setCharset("gbk").toTask());
                assertThat(page.getCharset()).isEqualTo("utf-8");
            }
        });
    }

    /** Passing a null task to {@code download} is expected to raise a {@link NullPointerException}. */
    @Test
    public void test_no_task_download(){
        Request request = new Request();
        request.setUrl("http://127.0.0.1:13423/");
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null));
    }
}

================================================
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java
================================================
package us.codecraft.webmagic.downloader;

import org.junit.Test;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.utils.UrlUtils;

import java.net.URI;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * @author code4crafter@gmail.com
 *         Date: 2017/7/22
 *         Time: 5:29 PM
 */
public class HttpUriRequestConverterTest {

    /**
     * A URL ending in "##" is passed through {@code UrlUtils.fixIllegalCharacterInUrl} and then
     * converted; the resulting request URI is compared against the same URL with a single "#".
     */
    @Test
    public void test_illegal_uri_correct() throws Exception {
        HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(new Request(UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##")), Site.me(), null);
        assertThat(requestContext.getHttpUriRequest().getURI()).isEqualTo(new
URI("http://bj.zhongkao.com/beikao/yimo/#")); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java ================================================ package us.codecraft.webmagic.downloader; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; /** * @author code4crafter@gmail.com */ public class MockGithubDownloader implements Downloader { @Override public Page download(Request request, Task task) { Page page = new Page(); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); try { page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset())); } catch (IOException e) { e.printStackTrace(); } page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; } @Override public void setThread(int threadNum) { } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java ================================================ package us.codecraft.webmagic.downloader; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com * Date: 2017/11/29 * Time: 下午1:32 */ public class SSLCompatibilityTest { @Test public void test_tls12() throws Exception { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Task task = Site.me().setCycleRetryTimes(5).toTask(); Request request = new Request("https://juejin.im/"); Page page = 
httpClientDownloader.download(request, task); assertThat(page.isDownloadSuccess()).isTrue(); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java ================================================ package us.codecraft.webmagic.example; import org.junit.Test; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.MockGithubDownloader; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com * Date: 16/1/19 * Time: 上午7:27 */ public class GithubRepoPageProcessorTest { @Test public void test_github() throws Exception { Spider.create(new GithubRepoPageProcessor()).addPipeline(new Pipeline() { @Override public void process(ResultItems resultItems, Task task) { assertThat(((String) resultItems.get("name")).trim()).isEqualTo("webmagic"); assertThat(((String) resultItems.get("author")).trim()).isEqualTo("code4craft"); } }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java ================================================ package us.codecraft.webmagic.pipeline; import org.junit.BeforeClass; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import java.util.UUID; /** * Created by ywooer on 2014/5/6 0006. 
*/
public class FilePipelineTest {

    private static ResultItems resultItems;
    private static Task task;

    /** Prepares one result item and an anonymous task with a random UUID and no site. */
    @BeforeClass
    public static void before() {
        resultItems = new ResultItems();
        resultItems.put("content", "webmagic 爬虫工具");
        Request request = new Request("http://www.baidu.com");
        resultItems.setRequest(request);

        task = new Task() {
            @Override
            public String getUUID() {
                return UUID.randomUUID().toString();
            }

            @Override
            public Site getSite() {
                return null;
            }
        };
    }

    /** Smoke test: processing must complete without throwing. */
    @Test
    public void testProcess() {
        FilePipeline filePipeline = new FilePipeline();
        filePipeline.process(resultItems, task);
    }
}

================================================
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java
================================================
package us.codecraft.webmagic.processor;

import static org.junit.Assert.assertEquals;

import org.junit.Test;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;

public class PageProcessorTest {

    /**
     * A processor that does not override {@code getSite()} is expected to return a site equal to
     * {@code Site.me()}; an overriding processor returns its own site.
     */
    @Test
    public void testGetSite() {
        Site actualSite = new PageProcessor() {
            @Override
            public void process(Page page) {
            }
        }.getSite();
        assertEquals(Site.me(), actualSite);

        actualSite = new PageProcessor() {
            @Override
            public void process(Page page) {
            }

            @Override
            public Site getSite() {
                return Site.me().setTimeOut(123);
            }
        }.getSite();
        assertEquals(Site.me().setTimeOut(123), actualSite);
    }
}

================================================
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
================================================
package us.codecraft.webmagic.proxy;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpHost;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@link Proxy}: creation from a URI, equality and string form.
 *
 * @author yxssfxwzy@sina.com May 30, 2014
 */
class ProxyTest {

    /** Colon-separated proxy descriptors, split into their four parts by {@link #before()}. */
    private static List<String[]> httpProxyList = new ArrayList<>();

    /** Splits each descriptor on ':' once and stores the first four parts. */
    @BeforeAll
    static void before() {
        String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
        for (String line : source) {
            String[] parts = line.split(":");
            httpProxyList.add(new String[] { parts[0], parts[1], parts[2], parts[3] });
        }
    }

    /** Helper thread that announces the proxy it would use and then sleeps for 500 ms. */
    class Fetch extends Thread {

        HttpHost hp;

        public Fetch(HttpHost hp) {
            this.hp = hp;
        }

        @Override
        public void run() {
            try {
                System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
                sleep(500);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so the interruption is not lost.
                Thread.currentThread().interrupt();
            }
        }
    }

    /** Covers URIs with and without a scheme, and with user only, password only, both or neither. */
    @Test
    void testCreate() {
        Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
        assertNull(proxy.getScheme());
        assertNull(proxy.getUsername());
        assertNull(proxy.getPassword());
        assertEquals("127.0.0.1", proxy.getHost());
        assertEquals(8080, proxy.getPort());

        proxy = Proxy.create(URI.create("http://127.0.0.1:8080"));
        assertEquals("http", proxy.getScheme());
        assertNull(proxy.getUsername());
        assertNull(proxy.getPassword());
        assertEquals("127.0.0.1", proxy.getHost());
        assertEquals(8080, proxy.getPort());

        proxy = Proxy.create(URI.create("//username:password@127.0.0.1:8080"));
        assertNull(proxy.getScheme());
        assertEquals("username", proxy.getUsername());
        assertEquals("password", proxy.getPassword());
        assertEquals("127.0.0.1", proxy.getHost());
        assertEquals(8080, proxy.getPort());

        proxy = Proxy.create(URI.create("//username@127.0.0.1:8080"));
        assertNull(proxy.getScheme());
        assertEquals("username", proxy.getUsername());
        assertNull(proxy.getPassword());
        assertEquals("127.0.0.1", proxy.getHost());
        assertEquals(8080, proxy.getPort());

        proxy = Proxy.create(URI.create("//:password@127.0.0.1:8080"));
        assertNull(proxy.getScheme());
        assertNull(proxy.getUsername());
        assertEquals("password", proxy.getPassword());
        assertEquals("127.0.0.1", proxy.getHost());
        assertEquals(8080, proxy.getPort());
    }

    /** Two proxies built from the same host and port must be equal and share a hash code. */
    @Test
    void testEqualsHashCode() {
        var proxy0 = new Proxy("::1", 1080);
        var proxy1 = new Proxy("::1", 1080);
        assertEquals(proxy0, proxy1);
        assertEquals(proxy0.hashCode(), proxy1.hashCode());
    }

    /** The string form mirrors the URI shapes accepted by {@code Proxy.create}. */
    @Test
    void testToString() {
        assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
        assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
        assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());
        assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString());
        assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString());
    }
}

================================================
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java
================================================
package us.codecraft.webmagic.proxy;

import org.junit.Test;
import org.mockito.Mockito;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * @author code4crafter@gmail.com
 *         Date: 17/4/16
 *         Time: 10:29 AM
 */
public class SimpleProxyProviderTest {

    public static final Task TASK = Site.me().toTask();

    /** With two proxies configured, three consecutive lookups are expected to yield first, second, first. */
    @Test
    public void test_get_proxy() throws Exception {
        Proxy originProxy1 = new Proxy("127.0.0.1", 1087);
        Proxy originProxy2 = new Proxy("127.0.0.1", 1088);
        SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2);
        Request request = Mockito.mock(Request.class);

        Proxy proxy = proxyProvider.getProxy(request, TASK);
        assertThat(proxy).isEqualTo(originProxy1);
        proxy = proxyProvider.getProxy(request, TASK);
        assertThat(proxy).isEqualTo(originProxy2);
        proxy = proxyProvider.getProxy(request, TASK);
        assertThat(proxy).isEqualTo(originProxy1);
    }
}

================================================
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java
================================================
package us.codecraft.webmagic.scheduler;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mockito;
import org.mockito.runners.MockitoJUnitRunner;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.utils.HttpConstant;

import static org.mockito.Matchers.any;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

/**
 * Checks when {@link DuplicateRemovedScheduler#push} consults its {@link DuplicateRemover}:
 * never for POST requests, exactly once for GET requests.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/11
 *         Time: 11:26 AM
 */
@RunWith(MockitoJUnitRunner.class)
public class DuplicateRemovedSchedulerTest {

    /** Minimal concrete scheduler; polling is irrelevant to these tests. */
    private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() {
        @Override
        public Request poll(Task task) {
            return null;
        }
    };

    @Test
    public void test_no_duplicate_removed_for_post_request() throws Exception {
        Request postRequest = new Request("https://www.google.com/");
        postRequest.setMethod(HttpConstant.Method.POST);
        pushAndVerifyDuplicateChecks(postRequest, 0);
    }

    @Test
    public void test_duplicate_removed_for_get_request() throws Exception {
        Request getRequest = new Request("https://www.google.com/");
        getRequest.setMethod(HttpConstant.Method.GET);
        pushAndVerifyDuplicateChecks(getRequest, 1);
    }

    /**
     * Installs a fresh mock remover, pushes {@code request} with a null task and verifies how
     * many times the remover was asked about duplicates.
     */
    private void pushAndVerifyDuplicateChecks(Request request, int expectedChecks) {
        DuplicateRemover mockRemover = Mockito.mock(DuplicateRemover.class);
        duplicateRemovedScheduler.setDuplicateRemover(mockRemover);
        duplicateRemovedScheduler.push(request, null);
        verify(mockRemover, times(expectedChecks)).isDuplicate(any(Request.class), any(Task.class));
    }
}

================================================
FILE: webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/PrioritySchedulerTest.java
================================================
package us.codecraft.webmagic.scheduler;

import junit.framework.Assert;
import org.junit.Test;
import
us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
*/ public class PrioritySchedulerTest { private PriorityScheduler priorityScheduler = new PriorityScheduler(); private Task task = new Task() { @Override public String getUUID() { return "1"; } @Override public Site getSite() { return null; } }; @Test public void testDifferentPriority() { Request request = new Request("a"); request.setPriority(100); priorityScheduler.push(request,task); request = new Request("b"); request.setPriority(900); priorityScheduler.push(request,task); request = new Request("c"); priorityScheduler.push(request,task); request = new Request("d"); request.setPriority(-900); priorityScheduler.push(request,task); Request poll = priorityScheduler.poll(task); Assert.assertEquals("b",poll.getUrl()); poll = priorityScheduler.poll(task); Assert.assertEquals("a",poll.getUrl()); poll = priorityScheduler.poll(task); Assert.assertEquals("c",poll.getUrl()); poll = priorityScheduler.poll(task); Assert.assertEquals("d",poll.getUrl()); } @Test public void testNoPriority() { Request request = new Request("a"); priorityScheduler.push(request,task); request = new Request("b"); priorityScheduler.push(request,task); request = new Request("c"); priorityScheduler.push(request,task); Request poll = priorityScheduler.poll(task); Assert.assertEquals("a",poll.getUrl()); poll = priorityScheduler.poll(task); Assert.assertEquals("b",poll.getUrl()); poll = priorityScheduler.poll(task); Assert.assertEquals("c",poll.getUrl()); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; import org.junit.Test; public class AndSelectorTest { @Test public void testSelectList() { String htmlContent = "\n" + "\n" + "\n" + " \n" + " \n" + " HTML with XPath\n" + "\n" + "\n" + "
\n" + "
Item 1
\n" + "
Item 2
\n" + "
\n" + "\n" + ""; List selectors = new ArrayList(); selectors.add(new CssSelector("div")); selectors.add(new XpathSelector("//div[@class='item1']")); AndSelector andSelector = new AndSelector(selectors); List result = andSelector.selectList(htmlContent); assertEquals("
\n Item 1\n
", result.get(0)); } @Test public void testSelectList_NoResults() { String htmlContent = "\n" + "\n" + "\n" + " \n" + " \n" + " HTML with XPath\n" + "\n" + "\n" + "
\n" + "
Item 1
\n" + "
Item 2
\n" + "
\n" + "\n" + ""; List selectors = new ArrayList(); selectors.add(new CssSelector("div")); selectors.add(new XpathSelector("//div[@class='item']")); AndSelector andSelector = new AndSelector(selectors); List result = andSelector.selectList(htmlContent); assertEquals(0, result.size()); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.junit.Test; import org.junit.runner.RunWith; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.runners.MockitoJUnitRunner; import java.util.List; import static org.junit.Assert.*; public class CssSelectorTest { @Test public void testSelectElement() { CssSelector cssSelector = new CssSelector("div"); String htmlContent = "Dummy Page
Hello World!
"; Document doc = Jsoup.parse(htmlContent); Element dummyElement = doc.getElementById("dummyDiv"); Element resultElement = cssSelector.selectElement(dummyElement); assertNotNull(resultElement); } @Test public void testSelectList() { CssSelector cssSelector = new CssSelector("div"); String htmlContent = "Dummy Page
Hello World!
"; Document doc = Jsoup.parse(htmlContent); Element dummyElement = doc.getElementById("dummyDiv"); List result = cssSelector.selectList(dummyElement); assertEquals(1, result.size()); assertEquals("[
\n Hello World!\n
]", result.toString()); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java ================================================ package us.codecraft.webmagic.selector; import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; import static us.codecraft.webmagic.selector.Selectors.*; /** * @author code4crafter@gmail.com
*/ public class ExtractorsTest { String html = "

testaabbcc

"; String html2 = "aabbcc"; @Test public void testEach() { assertThat($("div h1 a").select(html)).isEqualTo("aabbcc"); assertThat($("div h1 a", "href").select(html)).isEqualTo("xxx"); assertThat($("div h1 a", "innerHtml").select(html)).isEqualTo("aabbcc"); assertThat(xpath("//a/@href").select(html)).isEqualTo("xxx"); assertThat(regex("a href=\"(.*)\"").select(html)).isEqualTo("xxx"); assertThat(regex("(a href)=\"(.*)\"", 2).select(html)).isEqualTo("xxx"); } @Test public void testCombo() { assertThat(and($("title"), regex("aa(bb)cc")).select(html2)).isEqualTo("bb"); OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title")); assertThat(or.select(html)).isEqualTo("aabbcc"); assertThat(or.select(html2)).isEqualTo("aabbcc"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonPathSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; import org.junit.Test; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmai.com
*/ public class JsonPathSelectorTest { private String text = "{ \"store\": {\n" + " \"book\": [ \n" + " { \"category\": \"reference\",\n" + " \"author\": \"Nigel Rees\",\n" + " \"title\": \"Sayings of the Century\",\n" + " \"price\": 8.95\n" + " },\n" + " { \"category\": \"fiction\",\n" + " \"author\": \"Evelyn Waugh\",\n" + " \"title\": \"Sword of Honour\",\n" + " \"price\": 12.99,\n" + " \"isbn\": \"0-553-21311-3\"\n" + " }\n" + " ],\n" + " \"bicycle\": {\n" + " \"color\": \"red\",\n" + " \"price\": 19.95\n" + " }\n" + " }\n" + "}"; @Test public void testJsonPath() { JsonPathSelector jsonPathSelector = new JsonPathSelector("$.store.book[*].author"); String select = jsonPathSelector.select(text); List list = jsonPathSelector.selectList(text); assertThat(select).isEqualTo("Nigel Rees"); assertThat(list).contains("Nigel Rees","Evelyn Waugh"); jsonPathSelector = new JsonPathSelector("$.store.book[?(@.category == 'reference')].title"); list = jsonPathSelector.selectList(text); select = jsonPathSelector.select(text); assertThat(select).isEqualTo("Sayings of the Century"); assertThat(list).contains("Sayings of the Century"); jsonPathSelector = new JsonPathSelector("$.store.book[?(@.category == 'reference')]"); select = jsonPathSelector.select(text); JSONObject object1= JSON.parseObject(select); JSONObject object2=JSON.parseObject("{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\",\"category\":\"reference\",\"price\":8.95}"); assertThat(object1).isEqualTo(object2); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java ================================================ package us.codecraft.webmagic.selector; import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmai.com * @since 0.5.0 */ public class JsonTest { private String text = "callback({\"name\":\"json\"})"; private String textWithBrackerInContent = 
"callback({\"name\":\"json)\"})"; @Test public void testRemovePadding() throws Exception { String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); assertThat(name).isEqualTo("json"); } @Test public void testRemovePaddingForQuotes() throws Exception { String name = new Json(textWithBrackerInContent).removePadding("callback").jsonPath("$.name").get(); assertThat(name).isEqualTo("json)"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/LinksSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.junit.Test; import java.util.List; /** * @author code4crafter@gmail.com * Date: 17/4/8 * Time: 下午9:41 */ public class LinksSelectorTest { private String html = "
"; @Test public void testLinks() throws Exception { LinksSelector linksSelector = new LinksSelector(); List links = linksSelector.selectList(html); System.out.println(links); html = "
"; links = linksSelector.selectList(Jsoup.parse(html, "http://whatever.com/")); System.out.println(links); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; import org.junit.Test; public class OrSelectorTest { @Test public void testSelectList() { String htmlContent = "\n" + "\n" + "\n" + " \n" + " \n" + " HTML with XPath\n" + "\n" + "\n" + "
\n" + "
Item 1
\n" + "
Item 2
\n" + "
\n" + "\n" + ""; String expectedResult = "[\n" + " \n" + " \n" + " HTML with XPath\n" + ",
\n" + " Item 1\n" + "
,
\n" + " Item 2\n" + "
]"; List selectors = new ArrayList(); selectors.add(new CssSelector("head")); selectors.add(new XpathSelector("//div[@class='item1']")); selectors.add(new XpathSelector("//div[@class='item2']")); OrSelector orSelector = new OrSelector(selectors); List result = orSelector.selectList(htmlContent); assertEquals(expectedResult, result.toString()); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import org.assertj.core.api.Assertions; import org.junit.Test; /** * @author code4crafter@gmail.com
*/ public class RegexSelectorTest { @Test(expected = IllegalArgumentException.class) public void testRegexWithSingleLeftBracket() { String regex = "\\d+("; new RegexSelector(regex); } @Test public void testRegexWithLeftBracketQuoted() { String regex = "\\(.+"; String source = "(hello world"; RegexSelector regexSelector = new RegexSelector(regex); String select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo(source); } @Test public void testRegexWithZeroWidthAssertions() { String regex = "^.*(?=\\?)(?!\\?yy)"; String source = "hello world?xx?yy"; RegexSelector regexSelector = new RegexSelector(regex); String select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo("hello world"); regex = "\\d{3}(?!\\d)"; source = "123456asdf"; regexSelector = new RegexSelector(regex); select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo("456"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java ================================================ package us.codecraft.webmagic.selector; import org.junit.Test; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com */ public class SelectorTest { private String html = "
"; @Test public void testChain() throws Exception { Html selectable = new Html(html); List linksWithoutChain = selectable.links().all(); Selectable xpath = selectable.xpath("//div"); List linksWithChainFirstCall = xpath.links().all(); List linksWithChainSecondCall = xpath.links().all(); assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall); assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall); } @Test public void testNodes() throws Exception { Html selectable = new Html(html); List links = selectable.xpath("//a").nodes(); assertThat(links.get(0).links().get()).isEqualTo("http://whatever.com/aaa"); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java ================================================ package us.codecraft.webmagic.utils; import static org.junit.jupiter.api.Assertions.assertNull; import java.io.IOException; import org.junit.jupiter.api.Test; class CharsetUtilsTest { @Test void testDetectCharset() throws IOException { assertNull(CharsetUtils.detectCharset(null, new byte[0])); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java ================================================ package us.codecraft.webmagic.utils; import org.junit.Assert; import org.junit.Test; public class NumberUtilsTest { @Test public void testCompareLong() { Assert.assertEquals(0, NumberUtils.compareLong(0L, 0L)); Assert.assertEquals(1, NumberUtils.compareLong(9L, 0L)); Assert.assertEquals(-1, NumberUtils.compareLong(0L, 9L)); Assert.assertEquals(-1, NumberUtils.compareLong(-9L, 0L)); Assert.assertEquals(1, NumberUtils.compareLong(0L, -9L)); } } ================================================ FILE: webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java ================================================ package us.codecraft.webmagic.utils; import static 
org.junit.Assert.assertNull; import org.junit.Assert; import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com
* Date: 13-4-21
 * Time: 2:22 PM
 */
public class UrlUtilsTest {

    /** Each row is {relative URL, base URL, expected absolute URL}. */
    @Test
    public void testFixRelativeUrl() {
        String[][] cases = {
                {"aa", "http://www.dianping.com/sh/ss/com", "http://www.dianping.com/sh/ss/aa"},
                {"../aa", "http://www.dianping.com/sh/ss/com", "http://www.dianping.com/sh/aa"},
                {"../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/", "http://www.court.gov.cn/zgcpwsw/mshz"},
                {"..aa", "http://www.dianping.com/sh/ss/com", "http://www.dianping.com/sh/ss/..aa"},
                {"../../aa", "http://www.dianping.com/sh/ss/com/", "http://www.dianping.com/sh/aa"},
                {"../../aa", "http://www.dianping.com/sh/ss/com", "http://www.dianping.com/aa"},
        };
        for (String[] row : cases) {
            String absoluteUrl = UrlUtils.canonicalizeUrl(row[0], row[1]);
            assertThat(absoluteUrl).isEqualTo(row[2]);
        }
    }

    /** The domain is extracted with or without a scheme and with or without a path. */
    @Test
    public void testGetDomain(){
        String[] urls = {
                "http://www.dianping.com/aa/",
                "www.dianping.com/aa/",
                "http://www.dianping.com",
        };
        for (String url : urls) {
            Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
        }
    }

    /** A null content type yields a null charset. */
    @Test
    public void testGetCharset() {
        assertNull(UrlUtils.getCharset(null));
    }
}

================================================
FILE: webmagic-core/src/test/resources/html/mock-github.html
================================================
code4craft/webmagic Skip to content
  • Unwatch
  • Fork

/webmagic

A scalable web crawler framework. http://webmagic.io/ Edit
or Cancel
Java CSS JavaScript FreeMarker HTML Ruby
Find file
New pull request
Latest commit 800f66c @code4craft Revert "remove some unkown config"
This reverts commit 0e245c9.

README.md

logo

Readme in Chinese

User Manual (Chinese)

Build Status

A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction, and persistence. It can simplify the development of a specific crawler.

Features:

  • Simple core with high flexibility.
  • Simple API for html extracting.
  • Annotation with POJO to customize a crawler, no configuration.
  • Multi-thread and Distribution support.
  • Easy to be integrated.

Install:

Add dependencies to your pom.xml:

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.5.2</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.5.2</version>
</dependency>

WebMagic uses slf4j with the slf4j-log4j12 implementation. If you customized your slf4j implementation, please exclude slf4j-log4j12.

<exclusions>
    <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
</exclusions>

Get Started:

First crawler:

Write a class that implements PageProcessor. For example, I wrote a crawler for github repository information.

public class GithubRepoPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
        page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
        if (page.getResultItems().get("name")==null){
            //skip this page
            page.setSkip(true);
        }
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
    }
}
  • page.addTargetRequests(links)

    Add urls for crawling.

You can also use annotation way:

@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl("https://github.com/\\w+")
public class GithubRepo {

    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
    private String name;

    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
    private String author;

    @ExtractBy("//div[@id='readme']/tidyText()")
    private String readme;

    public static void main(String[] args) {
        OOSpider.create(Site.me().setSleepTime(1000)
                , new ConsolePageModelPipeline(), GithubRepo.class)
                .addUrl("https://github.com/code4craft").thread(5).run();
    }
}

Docs and samples:

Documents: http://webmagic.io/docs/

The architecture of webmagic (referred to Scrapy)

image

Javadocs: http://code4craft.github.io/webmagic/docs/en/

There are some samples in webmagic-samples package.

License:

Licensed under the Apache 2.0 license

Contributors:

Thanks to these people for committing source code, reporting bugs or suggesting new features:

Thanks:

To write webmagic, I referred to the projects below:

Mail-list:

https://groups.google.com/forum/#!forum/webmagic-java

http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988

QQ Group: 373225642

Bitdeli Badge

Something went wrong with that request. Please try again.
================================================ FILE: webmagic-core/src/test/resources/log4j2-test.xml ================================================ ================================================ FILE: webmagic-coverage/pom.xml ================================================ 4.0.0 us.codecraft webmagic 1.0.4-SNAPSHOT webmagic-coverage pom webmagic-coverage Compute aggregated test code coverage true ${project.groupId} webmagic-core ${project.version} ${project.groupId} webmagic-extension ${project.version} ${project.groupId} webmagic-scripts ${project.version} ${project.groupId} webmagic-selenium ${project.version} ${project.groupId} webmagic-saxon ${project.version} ${project.groupId} webmagic-samples ${project.version} org.jacoco jacoco-maven-plugin report-aggregate ================================================ FILE: webmagic-extension/README.md ================================================ webmagic-extension ------- webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 ================================================ FILE: webmagic-extension/pom.xml ================================================ us.codecraft webmagic 1.0.4-SNAPSHOT 4.0.0 webmagic-extension org.projectlombok lombok 1.18.32 provided redis.clients jedis org.assertj assertj-core test com.google.guava guava true ${project.groupId} webmagic-core ${project.version} ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java ================================================ package us.codecraft.webmagic; import us.codecraft.webmagic.utils.Experimental; import java.util.Collection; /** * Extract an object of more than one pages, such as news and articles.
* * @author code4crafter@gmail.com
* @since 0.2.0
 */
@Experimental
public interface MultiPageModel {

    /**
     * Page key is the identifier for the object.
     *
     * @return page key
     */
    public String getPageKey();

    /**
     * page is the identifier of a page in pages for one object.
     *
     * @return page
     */
    public String getPage();

    /**
     * other pages to be extracted.
     * It is used to judge whether an object contains more than one page, and whether the pages of the object are all extracted.
     * NOTE(review): the element type of the returned collection is not visible here
     * (presumably page identifiers as returned by {@link #getPage()}) - confirm before parameterizing.
     *
     * @return other pages
     */
    public Collection getOtherPages();

    /**
     * Combine multiPageModels to a whole object.
     *
     * @param multiPageModel multiPageModel
     * @return multiPageModel combined
     */
    public MultiPageModel combine(MultiPageModel multiPageModel);

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/SimpleHttpClient.java ================================================
package us.codecraft.webmagic;

import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.model.PageMapper;
import us.codecraft.webmagic.proxy.ProxyProvider;

/**
 * Thin facade that downloads a single page with an {@link HttpClientDownloader}
 * and, optionally, maps the downloaded page onto a model class with a {@link PageMapper}.
 *
 * @author code4crafter@gmail.com
 *         Date: 2017/5/27
 * @since 0.7.0
 */
public class SimpleHttpClient {

    private final HttpClientDownloader httpClientDownloader;

    private final Site site;

    /**
     * Creates a client that uses the default {@link Site#me()} settings.
     */
    public SimpleHttpClient() {
        this(Site.me());
    }

    /**
     * Creates a client that downloads with the given site settings.
     *
     * @param site site settings handed to the downloader as a task
     */
    public SimpleHttpClient(Site site) {
        this.site = site;
        this.httpClientDownloader = new HttpClientDownloader();
    }

    /**
     * Forwards the proxy provider to the underlying downloader.
     *
     * @param proxyProvider proxyProvider
     */
    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.httpClientDownloader.setProxyProvider(proxyProvider);
    }

    /**
     * Downloads {@code url} and maps the page onto {@code clazz}.
     *
     * @param url   url to download
     * @param clazz model class
     * @param <T>   model type
     * @return the mapped model, or {@code null} if the download did not succeed
     */
    public <T> T get(String url, Class<T> clazz) {
        return get(new Request(url), clazz);
    }

    /**
     * Downloads {@code request} and maps the page onto {@code clazz}.
     *
     * @param request request to download
     * @param clazz   model class
     * @param <T>     model type
     * @return the mapped model, or {@code null} if the download did not succeed
     */
    public <T> T get(Request request, Class<T> clazz) {
        Page page = httpClientDownloader.download(request, site.toTask());
        if (!page.isDownloadSuccess()) {
            return null;
        }
        return new PageMapper<T>(clazz).get(page);
    }

    /**
     * Downloads {@code url} and returns the raw page.
     *
     * @param url url to download
     * @return the page produced by the downloader
     */
    public Page get(String url) {
        return httpClientDownloader.download(new Request(url), site.toTask());
    }

    /**
     * Downloads {@code request} and returns the raw page.
     *
     * @param request request to download
     * @return the page produced by the downloader
     */
    public Page get(Request request) {
        return httpClientDownloader.download(request, site.toTask());
    }

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessor.java ================================================
package us.codecraft.webmagic.configurable;

import us.codecraft.webmagic.Page;
import
us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.utils.Experimental; import java.util.List; /** * @author code4crafter@gmail.com
*/
@Experimental
public class ConfigurablePageProcessor implements PageProcessor {

    private Site site;

    private List<ExtractRule> extractRules;

    /**
     * @param site         site returned by {@link #getSite()}
     * @param extractRules rules applied, in order, to every processed page
     */
    public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
        this.site = site;
        this.extractRules = extractRules;
    }

    /**
     * Applies every rule to the page. A rule marked notNull that yields nothing marks the
     * page as skipped; otherwise the extracted value (a list for multi rules, a single
     * string otherwise) is stored under the rule's field name.
     *
     * @param page page
     */
    @Override
    public void process(Page page) {
        for (ExtractRule extractRule : extractRules) {
            if (extractRule.isMulti()) {
                List<String> results = page.getHtml().selectDocumentForList(extractRule.getSelector());
                if (extractRule.isNotNull() && results.size() == 0) {
                    page.setSkip(true);
                } else {
                    page.getResultItems().put(extractRule.getFieldName(), results);
                }
            } else {
                String result = page.getHtml().selectDocument(extractRule.getSelector());
                if (extractRule.isNotNull() && result == null) {
                    page.setSkip(true);
                } else {
                    page.getResultItems().put(extractRule.getFieldName(), result);
                }
            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java ================================================
package us.codecraft.webmagic.configurable;

/**
 * @author code4crafter@gmail.com
 */
public enum ExpressionType {

    XPath, Regex, Css, JsonPath;

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java ================================================
package us.codecraft.webmagic.configurable;

import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.selector.Selector;

import static us.codecraft.webmagic.selector.Selectors.*;

/**
 * @author code4crafter@gmail.com
 */
public class ExtractRule {

    private String fieldName;

    private ExpressionType expressionType;

    private String expressionValue;

    private String[] expressionParams;

    private boolean multi = false;

    private volatile Selector selector;

    private boolean notNull = false;

    public String getFieldName() {
        return fieldName;
    }

    public void setFieldName(String fieldName) {
        this.fieldName = fieldName;
    }

    public ExpressionType getExpressionType() {
        return expressionType;
    }

    public void setExpressionType(ExpressionType expressionType) {
        this.expressionType = expressionType;
    }

    public String getExpressionValue() {
        return expressionValue;
    }

    public void setExpressionValue(String expressionValue) {
        this.expressionValue = expressionValue;
    }

    public String[] getExpressionParams() {
        return expressionParams;
    }

    public void setExpressionParams(String[] expressionParams) {
        this.expressionParams = expressionParams;
    }

    public boolean isMulti() {
        return multi;
    }

    public void setMulti(boolean multi) {
        this.multi = multi;
    }

    /**
     * Returns the selector for this rule, compiling it from the expression type, value and
     * params on first use. The compiled selector is cached in a volatile field and built
     * under the instance lock, so it is compiled at most once unless it is reset through
     * {@link #setSelector(Selector)}.
     *
     * @return the explicitly set or lazily compiled selector
     */
    public Selector getSelector() {
        if (selector == null) {
            synchronized (this) {
                if (selector == null) {
                    selector = compileSelector();
                }
            }
        }
        return selector;
    }

    /**
     * Builds the selector matching {@code expressionType}. The first expression param, when
     * present, is the attribute name for Css and the group number for Regex.
     *
     * @return a new selector for the configured expression
     */
    private Selector compileSelector() {
        // expressionParams is optional and defaults to null; treat "never set" like "empty"
        // instead of failing with a NullPointerException.
        boolean hasParam = expressionParams != null && expressionParams.length >= 1;
        switch (expressionType) {
            case Css:
                if (hasParam) {
                    return $(expressionValue, expressionParams[0]);
                } else {
                    return $(expressionValue);
                }
            case XPath:
                return xpath(expressionValue);
            case Regex:
                if (hasParam) {
                    return regex(expressionValue, Integer.parseInt(expressionParams[0]));
                } else {
                    return regex(expressionValue);
                }
            case JsonPath:
                return new JsonPathSelector(expressionValue);
            default:
                return xpath(expressionValue);
        }
    }

    public void setSelector(Selector selector) {
        this.selector = selector;
    }

    public boolean isNotNull() {
        return notNull;
    }

    public void setNotNull(boolean notNull) {
        this.notNull = notNull;
    }

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java ================================================
package us.codecraft.webmagic.downloader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
import
us.codecraft.webmagic.utils.HttpConstant; import java.io.*; /** * this downloader is used to download pages which need to render the javascript * * @author dolphineor@gmail.com * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } /** * 添加新的构造函数,支持phantomjs自定义命令 *

* example: * phantomjs.exe 支持windows环境 * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

     * crawl.js start --
     *
     *   var system = require('system');
     *   var url = system.args[1];
     *
     *   var page = require('webpage').create();
     *   page.settings.loadImages = false;
     *   page.settings.resourceTimeout = 5000;
     *
     *   page.open(url, function (status) {
     *       if (status != 'success') {
     *           console.log("HTTP request failed!");
     *       } else {
     *           console.log(page.content);
     *       }
     *
     *       page.close();
     *       phantom.exit();
     *   });
     *
     * -- crawl.js end
     * 
* 具体项目时可以将以上js代码复制下来使用 *

* example: * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); * * @param phantomJsCommand phantomJsCommand * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.crawlJsPath = crawlJsPath; } private void initPhantomjsCrawlPath() { PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; } @Override public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } Page page = Page.fail(request); try { String content = getPage(request); if (!content.contains("HTTP request failed")) { page.setDownloadSuccess(true); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(HttpConstant.StatusCode.CODE_200); } onSuccess(page, task); } catch (Exception e) { onError(page, task, e); logger.warn("download page {} error", request.getUrl(), e); } return page; } @Override public void setThread(int threadNum) { // ignore } protected String getPage(Request request) throws Exception { String url = request.getUrl(); Runtime runtime = Runtime.getRuntime(); Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); InputStream is = process.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuilder builder = new StringBuilder(); String line; while ((line = br.readLine()) != null) { builder.append(line).append("\n"); } return builder.toString(); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java ================================================ package us.codecraft.webmagic.example; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import 
us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.utils.Experimental; import java.util.List; /** * @author code4crafter@gmail.com * @since 0.4.1 */ @Experimental public class AppStore { @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName") private String trackName; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description") private String description; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..userRatingCount") private int userRatingCount; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls") private List screenshotUrls; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..supportedDevices") private List supportedDevices; public static void main(String[] args) { AppStore appStore = OOSpider.create(Site.me(), AppStore.class).get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software"); System.out.println(appStore.trackName); System.out.println(appStore.description); System.out.println(appStore.userRatingCount); System.out.println(appStore.screenshotUrls); System.out.println(appStore.supportedDevices); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java ================================================ package us.codecraft.webmagic.example; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import java.util.ArrayList; import java.util.List; /** * @since 0.4.0 * @author code4crafter@gmail.com */ public class BaiduBaike{ @ExtractBy("//h1[@class=title]/div[@class=lemmaTitleH1]/text()") private String name; @ExtractBy("//div[@id='lemmaContent-0']//div[@class='para']/allText()") private String description; @Override public String toString() { return "BaiduBaike{" + "name='" + name + '\'' + ", description='" + description + '\'' + '}'; } public static void main(String[] args) { OOSpider ooSpider = 
OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class); //single download String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; BaiduBaike baike = ooSpider.get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8"); System.out.println(baike); //multidownload List list = new ArrayList(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List resultItemses = ooSpider.getAll(list); for (BaiduBaike resultItemse : resultItemses) { System.out.println(resultItemse); } ooSpider.close(); } public String getName() { return name; } public String getDescription() { return description; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java ================================================ package us.codecraft.webmagic.example; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import java.util.List; /** * @author code4crafter@gmail.com
* @since 0.3.2 */ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) public class GithubRepo implements HasKey { @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true) private String name; @ExtractByUrl("https://github\\.com/(\\w+)/.*") private String author; @ExtractBy("//div[@id='readme']/tidyText()") private String readme; @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true) private List language; @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") private int star; @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") private int fork; @ExtractByUrl private String url; public static void main(String[] args) { OOSpider.create(Site.me().setSleepTime(100) , new ConsolePageModelPipeline(), GithubRepo.class) .addUrl("https://github.com/code4craft").thread(10).run(); } @Override public String key() { return author + ":" + name; } public String getName() { return name; } public String getReadme() { return readme; } public String getAuthor() { return author; } public List getLanguage() { return language; } public String getUrl() { return url; } public int getStar() { return star; } public int getFork() { return fork; } @Override public String toString() { return "GithubRepo{" + "name='" + name + '\'' + ", author='" + author + '\'' + ", readme='" + readme + '\'' + ", language=" + language + ", star=" + star + ", fork=" + fork + ", url='" + url + '\'' + '}'; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoApi.java ================================================ package us.codecraft.webmagic.example; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.ConsolePageModelPipeline; import 
us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import java.util.List; /** * @author code4crafter@gmail.com
* @since 0.4.1 */ public class GithubRepoApi implements HasKey { @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name", source = ExtractBy.Source.RawText) private String name; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..owner.login", source = ExtractBy.Source.RawText) private String author; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.language",multi = true, source = ExtractBy.Source.RawText) private List language; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.stargazers_count", source = ExtractBy.Source.RawText) private int star; @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.forks_count", source = ExtractBy.Source.RawText) private int fork; @ExtractByUrl private String url; public static void main(String[] args) { OOSpider.create(Site.me().setSleepTime(100) , new ConsolePageModelPipeline(), GithubRepoApi.class) .addUrl("https://api.github.com/repos/code4craft/webmagic").run(); } @Override public String key() { return author + ":" + name; } public String getName() { return name; } public String getAuthor() { return author; } public List getLanguage() { return language; } public String getUrl() { return url; } public int getStar() { return star; } public int getFork() { return fork; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java ================================================ package us.codecraft.webmagic.example; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.model.PageMapper; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
* @since 0.3.2
 */
public class GithubRepoPageMapper implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(0);

    // Typed mapper, so that get(page) yields a GithubRepo without a cast.
    private PageMapper<GithubRepo> githubRepoPageMapper = new PageMapper<GithubRepo>(GithubRepo.class);

    /**
     * Queues every repository and user link found on the page, then maps the page onto a
     * {@link GithubRepo}. Pages that do not map to a repo are skipped; otherwise the repo
     * is stored under the "repo" field.
     *
     * @param page page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
        GithubRepo githubRepo = githubRepoPageMapper.get(page);
        if (githubRepo == null) {
            page.setSkip(true);
        } else {
            page.putField("repo", githubRepo);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run();
    }

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java ================================================
package us.codecraft.webmagic.example;

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
import us.codecraft.webmagic.processor.example.ZhihuPageProcessor;

/**
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public class MonitorExample {

    /**
     * Creates two spiders, registers both with the shared {@link SpiderMonitor} and then
     * starts them.
     *
     * @param args unused
     * @throws Exception if registering a spider with the monitor fails
     */
    public static void main(String[] args) throws Exception {

        Spider zhihuSpider = Spider.create(new ZhihuPageProcessor())
                .addUrl("http://my.oschina.net/flashsword/blog");
        Spider githubSpider = Spider.create(new GithubRepoPageProcessor())
                .addUrl("https://github.com/code4craft");

        SpiderMonitor.instance().register(zhihuSpider);
        SpiderMonitor.instance().register(githubSpider);
        zhihuSpider.start();
        githubSpider.start();
    }

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java ================================================
package us.codecraft.webmagic.example;

import
us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.Formatter; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import java.util.Date; import java.util.List; /** * @author code4crafter@gmail.com
* @since 0.3.2 */ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog { @ExtractBy("//title/text()") private String title; @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) private String content; @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") private Date date; public static void main(String[] args) { //results will be saved to "/data/webmagic/" in json format OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) .addUrl("http://my.oschina.net/flashsword/blog").run(); } public String getTitle() { return title; } public String getContent() { return content; } public List getTags() { return tags; } public Date getDate() { return date; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java ================================================ package us.codecraft.webmagic.example; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.*; import us.codecraft.webmagic.handler.CompositePageProcessor; import us.codecraft.webmagic.handler.CompositePipeline; import us.codecraft.webmagic.handler.PatternProcessor; import us.codecraft.webmagic.handler.RequestMatcher; /** * Created with IntelliJ IDEA. * User: Sebastian MA * Date: April 04, 2014 * Time: 21:23 */ public class PatternProcessorExample { private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class); public static void main(String... 
args) { // define a patternProcessor which handles only "http://item.jd.com/.*" PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") { @Override public RequestMatcher.MatchOther processPage(Page page) { page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); return RequestMatcher.MatchOther.YES; } @Override public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { log.info("Extracting from repo" + resultItems.getRequest()); System.out.println("Repo name: "+resultItems.get("reponame")); return RequestMatcher.MatchOther.YES; } }; PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") { @Override public RequestMatcher.MatchOther processPage(Page page) { log.info("Extracting from " + page.getUrl()); page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all()); page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all()); page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString()); return RequestMatcher.MatchOther.YES; } @Override public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { System.out.println("User name: "+resultItems.get("username")); return RequestMatcher.MatchOther.YES; } }; CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com").setRetryTimes(3)); CompositePipeline pipeline = new CompositePipeline(); pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor); pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor); Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync(); } } ================================================ FILE: 
webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java ================================================ package us.codecraft.webmagic.handler; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import java.util.ArrayList; import java.util.List; /** * @author code4crafter@gmail.com */ public class CompositePageProcessor implements PageProcessor { private Site site; private List subPageProcessors = new ArrayList(); public CompositePageProcessor(Site site) { this.site = site; } @Override public void process(Page page) { for (SubPageProcessor subPageProcessor : subPageProcessors) { if (subPageProcessor.match(page.getRequest())) { SubPageProcessor.MatchOther matchOtherProcessorProcessor = subPageProcessor.processPage(page); if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOther.YES) { return; } } } } public CompositePageProcessor setSite(Site site) { this.site = site; return this; } public CompositePageProcessor addSubPageProcessor(SubPageProcessor subPageProcessor) { this.subPageProcessors.add(subPageProcessor); return this; } public CompositePageProcessor setSubPageProcessors(SubPageProcessor... 
subPageProcessors) { this.subPageProcessors = new ArrayList(); for (SubPageProcessor subPageProcessor : subPageProcessors) { this.subPageProcessors.add(subPageProcessor); } return this; } @Override public Site getSite() { return site; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java ================================================ package us.codecraft.webmagic.handler; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import java.util.ArrayList; import java.util.List; /** * @author code4crafer@gmail.com */ public class CompositePipeline implements Pipeline { private List subPipelines = new ArrayList(); @Override public void process(ResultItems resultItems, Task task) { for (SubPipeline subPipeline : subPipelines) { if (subPipeline.match(resultItems.getRequest())) { RequestMatcher.MatchOther matchOtherProcessorProcessor = subPipeline.processResult(resultItems, task); if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != RequestMatcher.MatchOther.YES) { return; } } } } public CompositePipeline addSubPipeline(SubPipeline subPipeline) { this.subPipelines.add(subPipeline); return this; } public CompositePipeline setSubPipeline(SubPipeline... 
subPipelines) { this.subPipelines = new ArrayList(); for (SubPipeline subPipeline : subPipelines) { this.subPipelines.add(subPipeline); } return this; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java ================================================ package us.codecraft.webmagic.handler; /** * @author code4crafer@gmail.com */ public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor { /** * @param pattern url pattern to handle */ public PatternProcessor(String pattern) { super(pattern); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java ================================================ package us.codecraft.webmagic.handler; import us.codecraft.webmagic.Request; import java.util.regex.Pattern; /** * Created with IntelliJ IDEA. * User: Sebastian MA * Date: April 03, 2014 * Time: 10:00 *

* A PatternHandler is in charge of both page extraction and data processing by implementing
 * its two abstract methods.
 */
public abstract class PatternRequestMatcher implements RequestMatcher {

    /**
     * The regular expression source this matcher was created with.
     * Only requests whose url matches it should be handled.
     */
    protected String pattern;

    // Compiled form of the pattern; built once in the constructor and reused by every match() call.
    private Pattern patternCompiled;

    /**
     * Stores the pattern and compiles it eagerly via {@link Pattern#compile(String)}.
     *
     * @param pattern url pattern to handle
     */
    public PatternRequestMatcher(String pattern) {
        this.pattern = pattern;
        this.patternCompiled = Pattern.compile(pattern);
    }

    /**
     * Tests the url of the request against the compiled pattern.
     * {@code Matcher.matches()} is used, so the whole url has to match, not just a part of it.
     *
     * @param request the request whose url is tested
     * @return whether the complete url matches the pattern
     */
    @Override
    public boolean match(Request request) {
        return patternCompiled.matcher(request.getUrl()).matches();
    }
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java
================================================
package us.codecraft.webmagic.handler;

import us.codecraft.webmagic.Request;

/**
 * Decides whether a handler is responsible for a given request.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public interface RequestMatcher {

    /**
     * Check whether to process the page.
     * Please DO NOT change page status in this method.
     *
     * @param page the request to test (despite its name this parameter is a {@link Request})
     *
     * @return whether matches
     */
    public boolean match(Request page);

    /**
     * Result type of {@code SubPageProcessor.processPage} and {@code SubPipeline.processResult},
     * whose documentation describes it as "whether continue to match".
     * NOTE(review): presumably YES lets the remaining handlers run and NO stops the chain;
     * confirm against the composite processor/pipeline that consumes the value.
     */
    public enum MatchOther {
        YES, NO
    }
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
================================================
package us.codecraft.webmagic.handler;

import us.codecraft.webmagic.Page;

/**
 * A page processor that is combined with a {@link RequestMatcher}.
 *
 * @author code4crafter@gmail.com
 */
public interface SubPageProcessor extends RequestMatcher {

    /**
     * process the page, extract urls to fetch, extract the data and store
     *
     * @param page page
     *
     * @return whether continue to match
     */
    public MatchOther processPage(Page page);
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java
================================================
package us.codecraft.webmagic.handler;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

/**
 * A pipeline that is combined with a {@link RequestMatcher}.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public interface SubPipeline extends RequestMatcher {

    /**
     * process the extracted result items
     *
     * @param resultItems resultItems
     * @param task task
     * @return whether continue to match
     */
    public MatchOther processResult(ResultItems resultItems, Task task);
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java
================================================
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.Page;

/**
 * Interface to be implemented by page models that need to do something after fields are extracted.
* * @author code4crafter@gmail.com
* @since 0.2.0 */
public interface AfterExtractor {

    /**
     * Callback for page models that need to do additional work once their fields are extracted.
     *
     * @param page the page the model was extracted from
     */
    public void afterProcess(Page page);
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java
================================================
package us.codecraft.webmagic.model;

import org.apache.commons.lang3.builder.ToStringBuilder;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;

/**
 * Print page model in console.
* Usually used in test.
* @author code4crafter@gmail.com
* @since 0.2.0 */ public class ConsolePageModelPipeline implements PageModelPipeline { @Override public void process(Object o, Task task) { System.out.println(ToStringBuilder.reflectionToString(o)); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java ================================================ package us.codecraft.webmagic.model; import lombok.Getter; import lombok.Setter; import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; /** * The object contains 'ExtractBy' information. * @author code4crafter@gmail.com
* @since 0.2.0 */ public class Extractor { @Getter @Setter protected Selector selector; @Getter protected final Source source; protected final boolean notNull; protected final boolean multi; public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; this.source = source; this.notNull = notNull; this.multi = multi; } public boolean isNotNull() { return notNull; } public boolean isMulti() { return multi; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.formatter.ObjectFormatter; import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; import java.lang.reflect.Method; import lombok.Getter; import lombok.Setter; /** * Wrapper of field and extractor. * @author code4crafter@gmail.com
* @since 0.2.0 */
public class FieldExtractor extends Extractor {

    /** The model field that receives the extracted value. */
    @Getter
    private final Field field;

    /**
     * @param field    field to populate
     * @param selector selector used for extraction
     * @param source   source the selector is applied to
     * @param notNull  whether an empty result is not allowed
     * @param multi    whether more than one result is extracted
     */
    public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
        super(selector, source, notNull, multi);
        this.field = field;
    }

    /** Setter of the field, or {@code null} when none has been assigned. */
    @Getter
    @Setter
    private Method setterMethod;

    /** Converts the extracted string into the value stored in the field; may be {@code null}. */
    @Getter
    @Setter
    private ObjectFormatter objectFormatter;
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java
================================================
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.utils.Experimental;

/**
 * Interface to be implemented by page mode.
* Can be used to identify a page model, or be used as name of file storing the object.
* @author code4crafter@gmail.com
* @since 0.2.0 */
@Experimental
public interface HasKey {

    /**
     * Returns the key of this page model. As described on the interface, the key can identify
     * the model or serve as the name of the file the object is stored in.
     *
     * @return key
     */
    public String key();
}

================================================
FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java
================================================
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selector;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * The extension to PageProcessor for page model extractor.
 *
 * @author code4crafter@gmail.com
* @since 0.2.0 */ class ModelPageProcessor implements PageProcessor { private List pageModelExtractorList = new ArrayList(); private Site site; private boolean extractLinks = true; public static ModelPageProcessor create(Site site, Class... clazzs) { ModelPageProcessor modelPageProcessor = new ModelPageProcessor(site); for (Class clazz : clazzs) { modelPageProcessor.addPageModel(clazz); } return modelPageProcessor; } public ModelPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); pageModelExtractorList.add(pageModelExtractor); return this; } private ModelPageProcessor(Site site) { this.site = site; } @Override public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { if (extractLinks) { extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); } Object process = pageModelExtractor.process(page); if (process == null || (process instanceof List && ((List) process).size() == 0)) { continue; } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); } if (page.getResultItems().getAll().size() == 0) { page.getResultItems().setSkip(true); } } private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { List links; if (urlRegionSelector == null) { links = page.getHtml().links().all(); } else { links = page.getHtml().selectList(urlRegionSelector).links().all(); } for (String link : links) { for (Pattern targetUrlPattern : urlPatterns) { Matcher matcher = targetUrlPattern.matcher(link); if (matcher.find()) { page.addTargetRequest(new Request(matcher.group(0))); } } } } protected void postProcessPageModel(Class clazz, Object object) { } @Override public Site getSite() { return site; } public boolean 
isExtractLinks() { return extractLinks; } public void setExtractLinks(boolean extractLinks) { this.extractLinks = extractLinks; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.pipeline.Pipeline; import java.lang.annotation.Annotation; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; /** * The extension to Pipeline for page model extractor. * * @author code4crafter@gmail.com
* @since 0.2.0 */ class ModelPipeline implements Pipeline { private Map pageModelPipelines = new ConcurrentHashMap(); public ModelPipeline() { } public ModelPipeline put(Class clazz, PageModelPipeline pageModelPipeline) { pageModelPipelines.put(clazz, pageModelPipeline); return this; } @Override public void process(ResultItems resultItems, Task task) { for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); if (o != null) { Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class); if (annotation == null || !((ExtractBy) annotation).multi()) { classPageModelPipelineEntry.getValue().process(o, task); } else { List list = (List) o; for (Object o1 : list) { classPageModelPipelineEntry.getValue().process(o1, task); } } } } } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.ArrayList; import java.util.List; /** * The spider for page model extractor.
* In webmagic, we call a POJO containing the extraction result a "page model".
* You can customize a crawler by writing a page model with annotations.
* Such as:
 * <pre>
 * {@literal @}TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
 *  public class OschinaBlog{
 *
 *      {@literal @}ExtractBy("//title")
 *      private String title;
 *
 *      {@literal @}ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
 *      private String content;
 *
 *      {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
 *      private List&lt;String&gt; tags;
 * }
 * </pre>
* And start the spider by:
 * <pre>
 *   OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
 *        ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
 * </pre>
*
* @author code4crafter@gmail.com
* @since 0.2.0 */ public class OOSpider extends Spider { private ModelPageProcessor modelPageProcessor; private ModelPipeline modelPipeline; private PageModelPipeline pageModelPipeline; private List pageModelClasses = new ArrayList(); protected OOSpider(ModelPageProcessor modelPageProcessor) { super(modelPageProcessor); this.modelPageProcessor = modelPageProcessor; } public OOSpider(PageProcessor pageProcessor) { super(pageProcessor); } /** * create a spider * * @param site site * @param pageModelPipeline pageModelPipeline * @param pageModels pageModels */ public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); super.addPipeline(modelPipeline); for (Class pageModel : pageModels) { if (pageModelPipeline != null) { this.modelPipeline.put(pageModel, pageModelPipeline); } pageModelClasses.add(pageModel); } } @Override protected CollectorPipeline getCollectorPipeline() { return new PageModelCollectorPipeline(pageModelClasses.get(0)); } public static OOSpider create(Site site, Class... pageModels) { return new OOSpider(site, null, pageModels); } public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { return new OOSpider(site, pageModelPipeline, pageModels); } public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... 
pageModels) { for (Class pageModel : pageModels) { modelPageProcessor.addPageModel(pageModel); modelPipeline.put(pageModel, pageModelPipeline); } return this; } public OOSpider setIsExtractLinks(boolean isExtractLinks){ modelPageProcessor.setExtractLinks(isExtractLinks); return this; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Page; import java.util.List; /** * @author code4crafer@gmail.com * @since 0.5.2 */ public class PageMapper { private Class clazz; private PageModelExtractor pageModelExtractor; public PageMapper(Class clazz) { this.clazz = clazz; this.pageModelExtractor = PageModelExtractor.create(clazz); } public T get(Page page) { return (T) pageModelExtractor.process(page); } public List getAll(Page page) { return (List) pageModelExtractor.process(page); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.pipeline.CollectorPageModelPipeline; import us.codecraft.webmagic.pipeline.CollectorPipeline; import java.lang.annotation.Annotation; import java.util.List; /** * @author code4crafter@gmail.com * @since 0.4.0 */ class PageModelCollectorPipeline implements CollectorPipeline { private final CollectorPageModelPipeline classPipeline = new CollectorPageModelPipeline(); private final Class clazz; PageModelCollectorPipeline(Class clazz) { this.clazz = clazz; } @Override public List getCollected() { return classPipeline.getCollected(); } @Override public synchronized void process(ResultItems resultItems, Task task) { 
Object o = resultItems.get(clazz.getCanonicalName()); if (o != null) { Annotation annotation = clazz.getAnnotation(ExtractBy.class); if (annotation == null || !((ExtractBy) annotation).multi()) { classPipeline.process((T) o, task); } else { List list = (List) o; for (Object o1 : list) { classPipeline.process((T) o1, task); } } } } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java ================================================ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import lombok.Getter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.model.sources.SourceTextExtractor; import us.codecraft.webmagic.model.sources.Source.*; import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import static us.codecraft.webmagic.model.annotation.ExtractBy.Source.RawText; /** * The main internal logic of page model extractor. * * @author code4crafter@gmail.com
* @since 0.2.0 */ class PageModelExtractor { @Getter private List targetUrlPatterns = new ArrayList(); @Getter private Selector targetUrlRegionSelector; @Getter private List helpUrlPatterns = new ArrayList(); @Getter private Selector helpUrlRegionSelector; @Getter private Class clazz; private List fieldExtractors; private Extractor objectExtractor; private Logger logger = LoggerFactory.getLogger(getClass()); public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); return pageModelExtractor; } private void init(Class clazz) { this.clazz = clazz; initClassExtractors(); fieldExtractors = new ArrayList(); for (Field field : ClassUtils.getFieldsIncludeSuperClass(clazz)) { field.setAccessible(true); FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!"); } else if (fieldExtractor == null && fieldExtractorTmp != null) { fieldExtractor = fieldExtractorTmp; } fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!"); } else if (fieldExtractor == null && fieldExtractorTmp != null) { fieldExtractor = fieldExtractorTmp; } if (fieldExtractor != null) { fieldExtractor.setObjectFormatter(new ObjectFormatterBuilder().setField(field).build()); fieldExtractors.add(fieldExtractor); } } } private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { String regexPattern = extractByUrl.value(); if 
(regexPattern.trim().equals("")) { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(), extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } } return fieldExtractor; } private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) { FieldExtractor fieldExtractor = null; ComboExtract comboExtract = field.getAnnotation(ComboExtract.class); if (comboExtract != null) { ExtractBy[] extractBies = comboExtract.value(); Selector selector; switch (comboExtract.op()) { case And: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); break; case Or: selector = new OrSelector(ExtractorUtils.getSelectors(extractBies)); break; default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? 
new RawHtml() : new SelectedHtml(), comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } } return fieldExtractor; } private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { FieldExtractor fieldExtractor = null; ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); ExtractBy.Source extractSource = extractBy.source(); if (extractBy.type()== ExtractBy.Type.JsonPath) extractSource = RawText; Source source = null; switch (extractSource) { case RawText: source = new RawText(); break; case RawHtml: source = new RawHtml(); break; case SelectedHtml: source = new SelectedHtml(); break; default: source = new SelectedHtml(); } fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), List.class.isAssignableFrom(field.getType())); fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); } return fieldExtractor; } public static Method getSetterMethod(Class clazz, Field field) { String name = "set" + StringUtils.capitalize(field.getName()); try { Method declaredMethod = clazz.getDeclaredMethod(name, field.getType()); declaredMethod.setAccessible(true); return declaredMethod; } catch (NoSuchMethodException e) { return null; } } private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); } else { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); for (String s : value) { targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } if (!targetUrl.sourceRegion().equals("")) { targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion()); } } annotation = clazz.getAnnotation(HelpUrl.class); if (annotation 
!= null) { HelpUrl helpUrl = (HelpUrl) annotation; String[] value = helpUrl.value(); for (String s : value) { helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } if (!helpUrl.sourceRegion().equals("")) { helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion()); } } annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi()); } } public Object process(Page page) { boolean matched = false; for (Pattern targetPattern : targetUrlPatterns) { if (targetPattern.matcher(page.getUrl().toString()).matches()) { matched = true; } } if (!matched) { return null; } if (objectExtractor == null) { return processSingle(page, null, true); } else { if (objectExtractor.multi) { List os = new ArrayList(); List list = objectExtractor.getSelector().selectList(page.getRawText()); for (String s : list) { Object o = processSingle(page, s, false); if (o != null) { os.add(o); } } return os; } else { String select = objectExtractor.getSelector().select(page.getRawText()); Object o = processSingle(page, select, false); return o; } } } private Object processSingle(Page page, String html, boolean isRaw) { Object o = null; try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor); if (!field.operation(o, fieldExtractor, logger)) return null; } if (AfterExtractor.class.isAssignableFrom(clazz)) ((AfterExtractor) o).afterProcess(page); } catch (Exception e) { logger.error("extract fail", e); } return o; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java ================================================ package us.codecraft.webmagic.model.annotation; import 
java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.Target; /** * Combo 'ExtractBy' extractor with and/or operator. * * @author code4crafter@gmail.com
* @since 0.2.1 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD, ElementType.TYPE}) public @interface ComboExtract { /** * The extractors to be combined. * * @return the extractors to be combined */ ExtractBy[] value(); public static enum Op { /** * All extractors will be arranged as a pipeline.
* The next extractor uses the result of the previous as source. */ And, /** * All extractors will do extracting separately,
* and the results of extractors will combined as the final result. */ Or; } /** * Combining operation of extractors.
* * @return combining operation of extractors */ Op op() default Op.And; /** * Define whether the field can be null.
* If set to 'true' and the extractor gets no result, the entire object will be discarded.
* * @return whether the field can be null */ boolean notNull() default false; /** * types of source for extracting. */ public static enum Source { /** * extract from the content extracted by class extractor */ SelectedHtml, /** * extract from the raw html */ RawHtml } /** * The source for extracting.
* It works only if you already added 'ExtractBy' to Class.
* * @return the source for extracting */ Source source() default Source.SelectedHtml; /** * Define whether the extractor return more than one result. * When set to 'true', the extractor return a list of string (so you should define the field as List).
* * Deprecated since 0.4.2. This option is determined automatically by the class of field. * @deprecated since 0.4.2 * @return whether the extractor return more than one result */ boolean multi() default false; } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java ================================================ package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.Target; /** * Define the extractor for field or class.
* * @author code4crafter@gmail.com
* @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD, ElementType.TYPE}) public @interface ExtractBy { /** * Extractor expression, support XPath, CSS Selector and regex. * * @return extractor expression */ String value(); /** * types of extractor expressions */ public static enum Type {XPath, Regex, Css, JsonPath} /** * Extractor type, support XPath, CSS Selector and regex. * * @return extractor type */ Type type() default Type.XPath; /** * Define whether the field can be null.
* If set to 'true' and the extractor gets no result, the entire object will be discarded.
* * @return whether the field can be null */ boolean notNull() default false; /** * types of source for extracting. */ public static enum Source { /** * extract from the content extracted by class extractor */ SelectedHtml, /** * extract from the raw html */ RawHtml, RawText } /** * The source for extracting.
* It works only if you already added 'ExtractBy' to Class.
* * @return the source for extracting */ Source source() default Source.SelectedHtml; /** * Define whether the extractor return more than one result. * When set to 'true', the extractor return a list of string (so you should define the field as List).
* * Deprecated since 0.4.2. This option is determined automatically by the class of field. * @deprecated since 0.4.2 * @return whether the extractor return more than one result */ boolean multi() default false; } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java ================================================ package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.Target; /** * Define a extractor to extract data in url of current page. Only regex can be used.
* * @author code4crafter@gmail.com
* @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) public @interface ExtractByUrl { /** * Extractor expression, only regex can be used * * @return extractor expression */ String value() default ""; /** * Define whether the field can be null.
* If set to 'true' and the extractor gets no result, the entire object will be discarded.
* * @return whether the field can be null */ boolean notNull() default false; /** * Define whether the extractor return more than one result. * When set to 'true', the extractor return a list of string (so you should define the field as List).
* * Deprecated since 0.4.2. This option is determined automatically by the class of field. * @deprecated since 0.4.2 * @return whether the extractor return more than one result */ boolean multi() default false; } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java ================================================ package us.codecraft.webmagic.model.annotation; import us.codecraft.webmagic.model.formatter.ObjectFormatter; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.Target; /** * Define how the result string is convert to an object for field. * * @author code4crafter@gmail.com
* @since 0.3.2 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) public @interface Formatter { Class DEFAULT_FORMATTER = ObjectFormatter.class; /** * Set formatter params. * * @return formatter params */ String[] value() default ""; /** * Specific the class of field of class of elements in collection for field.
* It is not necessary to be set because we can detect the class by class of field, * unless you use a collection as a field.
* * @return the class of field */ Class subClazz() default Void.class; /** * If there are more than one formatter for a class, just specify the implement. * @return implement */ Class formatter() default ObjectFormatter.class; } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java ================================================ package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.Target; /** * Define the 'help' url patterns for class.
* All urls matching the pattern will be crawled, but no objects will be extracted from them.
* * @author code4crafter@gmail.com
* @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.TYPE}) public @interface HelpUrl { /** * The url patterns to crawl.
* Uses regular expressions with two changes:
* "." stands for the literal character "." instead of "any character".
* "*" stands for any sequence (length 0-n) of characters that are legal in a url ([^"'#]*) instead of "repeat the previous element".
* * @return the url patterns for class */ String[] value(); /** * Define the region for url extracting.
* Only support XPath.
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
* * @return the region for url extracting */ String sourceRegion() default ""; } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java ================================================ package us.codecraft.webmagic.model.annotation; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.Target; /** * Define the url patterns for class.
* All urls matching the pattern will be crawled and extracted for new objects.
* * @author code4crafter@gmail.com
* @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.TYPE}) public @interface TargetUrl { /** * The url patterns for class.
* Uses regular expressions with two changes:
* "." stands for the literal character "." instead of "any character".
* "*" stands for any sequence (length 0-n) of characters that are legal in a url ([^"'#]*) instead of "repeat the previous element".
* * @return the url patterns for class */ String[] value(); /** * Define the region for url extracting.
* Only support XPath.
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
* * @return the region for url extracting */ String sourceRegion() default ""; }
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html ================================================
Annotations for defining an extractor.
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java ================================================
package us.codecraft.webmagic.model.fields;

import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;

import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;

/**
 * A {@link PageField} that carries a list of extracted strings and writes
 * them (optionally converted by the field's formatter) into the target object.
 */
public class MultipleField extends PageField {

    /** The extracted raw strings; may be {@code null} or empty when nothing matched. */
    @Getter
    private List<String> fieldNames;

    public MultipleField(List<String> fieldNames) {
        this.fieldNames = fieldNames;
    }

    /**
     * Stores the extracted values into {@code o}.
     *
     * @param o              the object whose field is populated
     * @param fieldExtractor describes the target field, its formatter and its not-null flag
     * @param logger         logger used while converting values
     * @return {@code false} when the values are null/empty and the extractor is flagged
     *         not-null; {@code true} otherwise
     */
    @Override
    public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger)
            throws IllegalAccessException, InvocationTargetException {
        if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull()) {
            return false;
        }
        if (fieldExtractor.getObjectFormatter() != null) {
            List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
            setField(o, fieldExtractor, converted);
        } else {
            setField(o, fieldExtractor, this.fieldNames);
        }
        return true;
    }

    /**
     * Converts every value with the formatter, dropping values whose conversion yields null.
     *
     * @return the converted values, or {@code null} when {@code values} is null
     *         (setField then leaves the target untouched, exactly as on the unformatted path)
     */
    private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
        if (values == null) {
            // Guard: iterating a null list used to throw a NullPointerException here.
            return null;
        }
        List<Object> objects = new ArrayList<>();
        for (String value : values) {
            Object converted = this.convert(value, objectFormatter, logger);
            if (converted != null) {
                objects.add(converted);
            }
        }
        return objects;
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java ================================================
package us.codecraft.webmagic.model.fields;

import java.lang.reflect.InvocationTargetException;

import org.slf4j.Logger;

import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;

/**
 * Base class for an extracted field value that knows how to write itself
 * into a target object.
 */
public abstract class PageField {

    /**
     * Writes this value into {@code o}.
     *
     * @param o              the object whose field is populated
     * @param fieldExtractor describes the target field
     * @param logger         logger used while converting values
     * @return a flag defined by the implementation; the implementations in this
     *         package return {@code false} when a not-null field has no usable value
     */
    public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger)
            throws IllegalAccessException, InvocationTargetException;

    /**
     * Runs {@code value} through the formatter.
     *
     * @return the formatted object, or {@code null} when the formatter throws
     *         (the failure is logged at error level, not propagated)
     */
    protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
        try {
            Object format = objectFormatter.format(value);
            logger.debug("String {} is converted to {}", value, format);
            return format;
        } catch (Exception e) {
            logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
        }
        return null;
    }

    /**
     * Assigns {@code value} to the target field; a null value is ignored.
     * The setter (if any) is invoked first, then the field itself is set as well.
     */
    protected void setField(Object o, FieldExtractor fieldExtractor, Object value)
            throws IllegalAccessException, InvocationTargetException {
        if (value != null) {
            if (fieldExtractor.getSetterMethod() != null) {
                fieldExtractor.getSetterMethod().invoke(o, value);
            }
            fieldExtractor.getField().set(o, value);
        }
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java ================================================
package us.codecraft.webmagic.model.fields;

import java.lang.reflect.InvocationTargetException;

import org.slf4j.Logger;

import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;

/**
 * A {@link PageField} that carries a single extracted string.
 */
public class SingleField extends PageField {

    /** The extracted raw string; may be {@code null} when nothing matched. */
    @Getter
    private String fieldName;

    public SingleField(String fieldName) {
        this.fieldName = fieldName;
    }

    /**
     * Stores the extracted value into {@code o}.
     *
     * @param o              the object whose field is populated
     * @param fieldExtractor describes the target field, its formatter and its not-null flag
     * @param logger         logger used while converting the value
     * @return {@code false} when the extractor is flagged not-null and either the raw value
     *         is null or its conversion yields null; {@code true} otherwise
     */
    @Override
    public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger)
            throws IllegalAccessException, InvocationTargetException {
        // Reject a missing value up front so the not-null flag is honoured even when
        // no formatter is configured (mirrors the check in MultipleField).
        if (this.fieldName == null && fieldExtractor.isNotNull()) {
            return false;
        }
        if (fieldExtractor.getObjectFormatter() != null) {
            Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
            if (converted == null && fieldExtractor.isNotNull()) {
                return false;
            }
            setField(o, fieldExtractor, converted);
        } else {
            setField(o, fieldExtractor, this.fieldName);
        }
        return true;
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java ================================================
package us.codecraft.webmagic.model.formatter;

/**
 * Maps a primitive type or its wrapper to the wrapper class.
 */
public interface BasicClassDetector {

    /**
     * @param type the type to inspect
     * @return the wrapper class handled by this detector, or {@code null} when
     *         {@code type} is not handled by it
     */
    Class<?> detectBasicClass(Class<?> type);
}

/** Detects {@code int} / {@link Integer}. */
class IntegerClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
            return Integer.class;
        }
        return null;
    }
}

/** Detects {@code long} / {@link Long}. */
class LongClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Long.TYPE) || type.equals(Long.class)) {
            return Long.class;
        }
        return null;
    }
}

/** Detects {@code double} / {@link Double}. */
class DoubleClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Double.TYPE) || type.equals(Double.class)) {
            return Double.class;
        }
        return null;
    }
}

/** Detects {@code float} / {@link Float}. */
class FloatClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Float.TYPE) || type.equals(Float.class)) {
            return Float.class;
        }
        return null;
    }
}

/** Detects {@code short} / {@link Short}. */
class ShortClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Short.TYPE) || type.equals(Short.class)) {
            return Short.class;
        }
        return null;
    }
}

/** Detects {@code char} / {@link Character}. */
class CharacterClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Character.TYPE) || type.equals(Character.class)) {
            return Character.class;
        }
        return null;
    }
}

/** Detects {@code byte} / {@link Byte}. */
class ByteClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
            return Byte.class;
        }
        return null;
    }
}

/** Detects {@code boolean} / {@link Boolean}. */
class BooleanClassDetector implements BasicClassDetector {
    @Override
    public Class<?> detectBasicClass(Class<?> type) {
        if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
            return Boolean.class;
        }
        return null;
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java ================================================
package us.codecraft.webmagic.model.formatter;

import java.util.Arrays;
import java.util.List;

/**
 * Base formatter for primitive wrappers: handles null and trims the raw
 * string before delegating to {@link #formatTrimmed(String)}.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {

    /** No parameters are used by the basic formatters. */
    @Override
    public void initParam(String[] extra) {

    }

    /**
     * @param raw the extracted text, may be {@code null}
     * @return {@code null} for a null input, otherwise the parsed value of the trimmed text
     * @throws Exception whatever {@link #formatTrimmed(String)} throws for unparsable text
     */
    @Override
    public T format(String raw) throws Exception {
        if (raw == null) {
            return null;
        }
        raw = raw.trim();
        return formatTrimmed(raw);
    }

    /** Parses an already trimmed, non-null string. */
    protected abstract T formatTrimmed(String raw) throws Exception;

    /** Formatter classes for the primitive wrapper types. */
    public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(IntegerFormatter.class,
            LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
            CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);

    /** Detectors consulted, in order, by {@link #detectBasicClass(Class)}. */
    public static final List<BasicClassDetector> basicClassDetector = Arrays.<BasicClassDetector>asList(new IntegerClassDetector(),
            new LongClassDetector(),
            new FloatClassDetector(),
            new DoubleClassDetector(),
            new ShortClassDetector(),
            new ByteClassDetector(),
            new BooleanClassDetector(),
            new CharacterClassDetector());

    /**
     * @param type a field type
     * @return the wrapper class reported by the first matching detector, or
     *         {@code type} itself when no detector matches
     */
    public static Class<?> detectBasicClass(Class<?> type) {
        for (BasicClassDetector detector : basicClassDetector) {
            Class<?> detectedClass = detector.detectBasicClass(type);
            if (detectedClass != null) {
                return detectedClass;
            }
        }
        return type;
    }

    public static class IntegerFormatter extends BasicTypeFormatter<Integer> {
        @Override
        public Integer formatTrimmed(String raw) throws Exception {
            return Integer.parseInt(raw);
        }

        @Override
        public Class<Integer> clazz() {
            return Integer.class;
        }
    }

    public static class LongFormatter extends BasicTypeFormatter<Long> {
        @Override
        public Long formatTrimmed(String raw) throws Exception {
            return Long.parseLong(raw);
        }

        @Override
        public Class<Long> clazz() {
            return Long.class;
        }
    }

    public static class DoubleFormatter extends BasicTypeFormatter<Double> {
        @Override
        public Double formatTrimmed(String raw) throws Exception {
            return Double.parseDouble(raw);
        }

        @Override
        public Class<Double> clazz() {
            return Double.class;
        }
    }

    public static class FloatFormatter extends BasicTypeFormatter<Float> {
        @Override
        public Float formatTrimmed(String raw) throws Exception {
            return Float.parseFloat(raw);
        }

        @Override
        public Class<Float> clazz() {
            return Float.class;
        }
    }

    public static class ShortFormatter extends BasicTypeFormatter<Short> {
        @Override
        public Short formatTrimmed(String raw) throws Exception {
            return Short.parseShort(raw);
        }

        @Override
        public Class<Short> clazz() {
            return Short.class;
        }
    }

    public static class CharactorFormatter extends BasicTypeFormatter<Character> {
        /** Returns the first character; an empty string makes {@code charAt(0)} throw. */
        @Override
        public Character formatTrimmed(String raw) throws Exception {
            return raw.charAt(0);
        }

        @Override
        public Class<Character> clazz() {
            return Character.class;
        }
    }

    public static class ByteFormatter extends BasicTypeFormatter<Byte> {
        @Override
        public Byte formatTrimmed(String raw) throws Exception {
            return Byte.parseByte(raw, 10);
        }

        @Override
        public Class<Byte> clazz() {
            return Byte.class;
        }
    }

    public static class BooleanFormatter extends BasicTypeFormatter<Boolean> {
        @Override
        public Boolean formatTrimmed(String raw) throws Exception {
            return Boolean.parseBoolean(raw);
        }

        @Override
        public Class<Boolean> clazz() {
            return Boolean.class;
        }
    }

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java ================================================
package us.codecraft.webmagic.model.formatter;

import org.apache.commons.lang3.time.DateUtils;

import java.util.Date;

/**
 * Parses dates with one or more patterns via {@code DateUtils.parseDate}.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public class DateFormatter implements ObjectFormatter<Date> {

    /** Pattern used when {@link #initParam(String[])} supplies none. */
    public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH:mm"};
    private String[] datePatterns = DEFAULT_PATTERN;

    @Override
    public Date format(String raw) throws Exception {
        return DateUtils.parseDate(raw, datePatterns);
    }

    @Override
    public Class<Date> clazz() {
        return Date.class;
    }

    /**
     * Replaces the patterns unless {@code extra} is null or consists of a single
     * empty string.
     */
    @Override
    public void initParam(String[] extra) {
        if (extra != null && !(extra.length == 1 && extra[0].length() == 0)) {
            datePatterns = extra;
        }
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java ================================================
package us.codecraft.webmagic.model.formatter;

/**
 * Converts an extracted string into an object of type {@code T}.
 *
 * @author code4crafter@gmail.com
 */
public interface ObjectFormatter<T> {

    /** Converts the raw text; implementations may throw when it cannot be parsed. */
    T format(String raw) throws Exception;

    /** The class this formatter produces; used as the lookup key in {@link ObjectFormatters}. */
    Class<T> clazz();

    /** Receives the extra parameters passed by {@link ObjectFormatterBuilder}. */
    void initParam(String[] extra);

}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatterBuilder.java ================================================
package us.codecraft.webmagic.model.formatter;

import us.codecraft.webmagic.model.annotation.Formatter;

import java.lang.reflect.Field;
import java.util.List;

/**
 * Builds the {@link ObjectFormatter} for a model field from its type and its
 * optional {@link Formatter} annotation.
 *
 * @author code4crafter@gmail.com
 * @since 0.7.0
 * Date: 2017/6/3
 */
public class ObjectFormatterBuilder {

    private Field field;

    public ObjectFormatterBuilder setField(Field field) {
        this.field = field;
        return this;
    }

    /**
     * Looks up the registered formatter for {@code fieldClazz}.
     *
     * @return {@code null} for String and List types (no conversion needed at this level)
     * @throws IllegalStateException when no formatter is registered for the type
     */
    private ObjectFormatter initFormatterForType(Class<?> fieldClazz, String[] params) {
        if (fieldClazz.equals(String.class) || List.class.isAssignableFrom(fieldClazz)) {
            return null;
        }
        Class<? extends ObjectFormatter> formatterClass = ObjectFormatters.get(BasicTypeFormatter.detectBasicClass(fieldClazz));
        if (formatterClass == null) {
            throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz);
        }
        return initFormatter(formatterClass, params);
    }

    /** Instantiates the formatter reflectively and hands it its parameters. */
    private ObjectFormatter initFormatter(Class<? extends ObjectFormatter> formatterClazz, String[] params) {
        try {
            ObjectFormatter objectFormatter = formatterClazz.newInstance();
            objectFormatter.initParam(params);
            return objectFormatter;
        } catch (InstantiationException | IllegalAccessException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Resolution order: an explicit non-default formatter class on the annotation;
     * otherwise the formatter for the field type (when there is no annotation or its
     * subClazz is {@code Void}); otherwise the formatter for the annotation's subClazz.
     *
     * @return the formatter, or {@code null} when the resolved type needs none
     */
    public ObjectFormatter build() {
        Formatter formatter = field.getAnnotation(Formatter.class);
        if (formatter != null && !formatter.formatter().equals(Formatter.DEFAULT_FORMATTER)) {
            return initFormatter(formatter.formatter(), formatter.value());
        }
        if (formatter == null || formatter.subClazz().equals(Void.class)) {
            return initFormatterForType(field.getType(), formatter != null ? formatter.value() : null);
        } else {
            return initFormatterForType(formatter.subClazz(), formatter.value());
        }
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java ================================================
package us.codecraft.webmagic.model.formatter;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Registry mapping a target class to the formatter class that produces it.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public class ObjectFormatters {

    private static Map<Class, Class<? extends ObjectFormatter>> formatterMap = new ConcurrentHashMap<Class, Class<? extends ObjectFormatter>>();

    static {
        for (Class<? extends ObjectFormatter> basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) {
            put(basicTypeFormatter);
        }
        put(DateFormatter.class);
    }

    /**
     * Registers a formatter class under the class reported by a fresh instance's
     * {@code clazz()}.
     *
     * @throws RuntimeException when the formatter cannot be instantiated
     */
    public static void put(Class<? extends ObjectFormatter> objectFormatter) {
        try {
            formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter);
        } catch (InstantiationException | IllegalAccessException e) {
            throw new RuntimeException(e);
        }
    }

    /** @return the formatter class registered for {@code clazz}, or {@code null} if none */
    public static Class<? extends ObjectFormatter> get(Class<?> clazz) {
        return formatterMap.get(clazz);
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html ================================================
Page model and annotations used to customize a crawler.
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java ================================================ package us.codecraft.webmagic.model.sources; import java.util.List; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.FieldExtractor; public interface Source { public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); public class RawHtml implements Source { public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return page.getHtml().selectDocument(fieldExtractor.getSelector()); } public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); } } public class SelectedHtml implements Source { public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { if (isRaw) return page.getHtml().selectDocument(fieldExtractor.getSelector()); else return fieldExtractor.getSelector().select(html); } public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { if (isRaw) return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); else return fieldExtractor.getSelector().selectList(html); } } public class Url implements Source { public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return fieldExtractor.getSelector().select(page.getUrl().toString()); } public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return fieldExtractor.getSelector().selectList(page.getUrl().toString()); } } public class RawText implements Source { public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return 
fieldExtractor.getSelector().select(page.getRawText()); } public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return fieldExtractor.getSelector().selectList(page.getRawText()); } } public class DefaultSource implements Source { public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return fieldExtractor.getSelector().select(html); } public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { return fieldExtractor.getSelector().selectList(html); } } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java ================================================ package us.codecraft.webmagic.model.sources; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.FieldExtractor; import us.codecraft.webmagic.model.fields.MultipleField; import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.fields.SingleField; public class SourceTextExtractor { public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { Source source = fieldExtractor.getSource(); if (fieldExtractor.isMulti()) return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor)); else return new SingleField(source.getText(page, html, isRaw, fieldExtractor)); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java ================================================ package us.codecraft.webmagic.monitor; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import javax.management.InstanceAlreadyExistsException; import javax.management.JMException; import javax.management.MBeanRegistrationException; 
import javax.management.MBeanServer; import javax.management.MalformedObjectNameException; import javax.management.NotCompliantMBeanException; import javax.management.ObjectName; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.SpiderListener; import us.codecraft.webmagic.utils.Experimental; import us.codecraft.webmagic.utils.UrlUtils; /** * @author code4crafer@gmail.com * @since 0.5.0 */ @Experimental public class SpiderMonitor { private static final SpiderMonitor INSTANCE = new SpiderMonitor(); private MBeanServer mbeanServer; private String jmxServerName; private List spiderStatuses = new ArrayList<>(); protected SpiderMonitor() { jmxServerName = "WebMagic"; mbeanServer = ManagementFactory.getPlatformMBeanServer(); } /** * Register spider for monitor. * * @param spiders spiders * @return this * @throws JMException JMException */ public synchronized SpiderMonitor register(Spider... spiders) throws JMException { for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { List spiderListeners = new ArrayList<>(); spiderListeners.add(monitorSpiderListener); spider.setSpiderListeners(spiderListeners); } else { spider.getSpiderListeners().add(monitorSpiderListener); } SpiderStatusMXBean spiderStatusMBean = getSpiderStatusMBean(spider, monitorSpiderListener); registerMBean(spiderStatusMBean); spiderStatuses.add(spiderStatusMBean); } return this; } protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) { return new SpiderStatus(spider, monitorSpiderListener); } protected List getSpiderStatuses() { return this.spiderStatuses; } public static SpiderMonitor instance() { return INSTANCE; } public class MonitorSpiderListener implements SpiderListener { private final AtomicInteger successCount = new AtomicInteger(0); private final AtomicInteger errorCount = new AtomicInteger(0); 
private List errorUrls = Collections.synchronizedList(new ArrayList()); @Override public void onSuccess(Request request) { successCount.incrementAndGet(); } @Override public void onError(Request request, Exception e) { errorUrls.add(request.getUrl()); errorCount.incrementAndGet(); } public AtomicInteger getSuccessCount() { return successCount; } public AtomicInteger getErrorCount() { return errorCount; } public List getErrorUrls() { return errorUrls; } } protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java ================================================ package us.codecraft.webmagic.monitor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.scheduler.MonitorableScheduler; import java.util.Date; import java.util.List;

/**
 * MXBean implementation that exposes the state of one {@link Spider} together with the
 * success/error counters collected by its {@link SpiderMonitor.MonitorSpiderListener}.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public class SpiderStatus implements SpiderStatusMXBean {

    protected final Spider spider;

    protected Logger logger = LoggerFactory.getLogger(getClass());

    protected final SpiderMonitor.MonitorSpiderListener monitorSpiderListener;

    /**
     * @param spider                the spider whose state is reported
     * @param monitorSpiderListener the listener that counts successes and errors for that spider
     */
    public SpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) {
        this.spider = spider;
        this.monitorSpiderListener = monitorSpiderListener;
    }

    /**
     * @return the spider's UUID
     */
    @Override
    public String getName() {
        return spider.getUUID();
    }

    /**
     * @return the scheduler's left-requests count, or -1 when the scheduler is not a
     *         {@link MonitorableScheduler}
     */
    @Override
    public int getLeftPageCount() {
        if (spider.getScheduler() instanceof MonitorableScheduler) {
            return ((MonitorableScheduler) spider.getScheduler()).getLeftRequestsCount(spider);
        }
        logger.warn("Get leftPageCount fail, try to use a Scheduler implement MonitorableScheduler for monitor count!");
        return -1;
    }

    /**
     * @return the scheduler's total-requests count, or -1 when the scheduler is not a
     *         {@link MonitorableScheduler}
     */
    @Override
    public int getTotalPageCount() {
        if (spider.getScheduler() instanceof MonitorableScheduler) {
            return ((MonitorableScheduler) spider.getScheduler()).getTotalRequestsCount(spider);
        }
        logger.warn("Get totalPageCount fail, try to use a Scheduler implement MonitorableScheduler for monitor count!");
        return -1;
    }

    @Override
    public int getSuccessPageCount() {
        return monitorSpiderListener.getSuccessCount().get();
    }

    @Override
    public int getErrorPageCount() {
        return monitorSpiderListener.getErrorCount().get();
    }

    /**
     * @return the listener's list of URLs whose processing failed
     */
    @Override
    public List getErrorPages() {
        return monitorSpiderListener.getErrorUrls();
    }

    @Override
    public String getStatus() {
        return spider.getStatus().name();
    }

    @Override
    public int getThread() {
        return spider.getThreadAlive();
    }

    @Override
    public void start() {
        spider.start();
    }

    @Override
    public void stop() {
        spider.stop();
    }

    @Override
    public Date getStartTime() {
        return spider.getStartTime();
    }

    /**
     * @return successful pages divided by elapsed whole seconds since the start time, or -1
     *         when there is no start time yet or less than one second has elapsed
     */
    @Override
    public int getPagePerSecond() {
        Date startTime = getStartTime();
        if (startTime != null) {
            // Divide in long arithmetic and only then narrow: casting the millisecond difference
            // to int first overflows once the spider has been running for about 24.8 days.
            int runSeconds = (int) ((System.currentTimeMillis() - startTime.getTime()) / 1000);
            if (runSeconds != 0) {
                return getSuccessPageCount() / runSeconds;
            }
        }
        return -1;
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java ================================================ package us.codecraft.webmagic.monitor; import java.util.Date; import java.util.List; /** * @author code4crafer@gmail.com * @since 0.5.0 */ public interface SpiderStatusMXBean { public String getName(); public String getStatus(); public int getThread(); public int getTotalPageCount(); public int getLeftPageCount(); public int getSuccessPageCount(); public int getErrorPageCount(); public List getErrorPages(); public void start(); public void stop(); public Date getStartTime(); public int getPagePerSecond(); } ================================================ FILE: 
webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java ================================================ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Task; import java.util.ArrayList; import java.util.List; /** * @author code4crafter@gmail.com */ public class CollectorPageModelPipeline implements PageModelPipeline { private List collected = new ArrayList(); @Override public synchronized void process(T t, Task task) { collected.add(t); } public List getCollected() { return collected; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java ================================================ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.builder.ToStringBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; /** * Store results objects (page models) to files in plain format.
* Use model.key() as file name if the model implements HasKey.
* Otherwise use the MD5 hex digest of the model's reflective string form as file name. * * @author code4crafter@gmail.com
* @since 0.3.0 */ public class FilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { private Logger logger = LoggerFactory.getLogger(getClass()); /** * new JsonFilePageModelPipeline with default path "/data/webmagic/" */ public FilePageModelPipeline() { setPath("/data/webmagic/"); } public FilePageModelPipeline(String path) { setPath(path); } @Override public void process(Object o, Task task) { String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { String filename; if (o instanceof HasKey) { filename = path + ((HasKey) o).key() + ".html"; } else { filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".html"; } PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename))); printWriter.write(ToStringBuilder.reflectionToString(o)); printWriter.close(); } catch (IOException e) { logger.warn("write file error", e); } } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java ================================================ package us.codecraft.webmagic.pipeline; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.builder.ToStringBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; /** * Store results objects (page models) to files in JSON format.
* Use model.key() as file name if the model implements HasKey.
* Otherwise use the MD5 hex digest of the model's reflective string form as file name. * * @author code4crafter@gmail.com
* @since 0.2.0
 */
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * new JsonFilePageModelPipeline with default path "/data/webmagic/"
     */
    public JsonFilePageModelPipeline() {
        setPath("/data/webmagic/");
    }

    /**
     * @param path base directory under which one sub-directory per task UUID is written
     */
    public JsonFilePageModelPipeline(String path) {
        setPath(path);
    }

    /**
     * Serializes {@code o} to JSON and writes it to {@code <path>/<task uuid>/<name>.json},
     * where the name is {@code key()} for {@link HasKey} models and the MD5 hex digest of
     * the model's reflective string form otherwise. I/O failures are logged and not rethrown.
     *
     * @param o    the extracted page model
     * @param task the task whose UUID names the output directory
     */
    @Override
    public void process(Object o, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String filename;
        if (o instanceof HasKey) {
            filename = path + ((HasKey) o).key() + ".json";
        } else {
            filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
        }
        // Serialize before opening the file: a serialization failure must not leave an
        // empty file or an unclosed handle behind.
        String json = JSON.toJSONString(o);
        try (PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)))) {
            printWriter.write(json);
            // PrintWriter swallows IOExceptions raised while writing; surface them in the log.
            if (printWriter.checkError()) {
                logger.warn("write file error, file: {}", filename);
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java ================================================ package us.codecraft.webmagic.pipeline; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; /** * Store results to files in JSON format.
* * @author code4crafter@gmail.com
* @since 0.2.0
 */
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * new JsonFilePipeline with default path "/data/webmagic"
     */
    public JsonFilePipeline() {
        setPath("/data/webmagic");
    }

    /**
     * @param path base directory under which one sub-directory per task UUID is written
     */
    public JsonFilePipeline(String path) {
        setPath(path);
    }

    /**
     * Serializes all result fields to JSON and writes them to
     * {@code <path>/<task uuid>/<md5 of request url>.json}.
     * I/O failures are logged and not rethrown.
     *
     * @param resultItems the extracted results of one page
     * @param task        the task whose UUID names the output directory
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String filename = path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json";
        // Serialize before opening the file: a serialization failure must not leave an
        // empty file or an unclosed handle behind.
        String json = JSON.toJSONString(resultItems.getAll());
        try (PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)))) {
            printWriter.write(json);
            // PrintWriter swallows IOExceptions raised while writing; surface them in the log.
            if (printWriter.checkError()) {
                logger.warn("write file error, file: {}", filename);
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java ================================================ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.Experimental; import us.codecraft.webmagic.utils.DoubleKeyMap; import java.util.*; import java.util.concurrent.ConcurrentHashMap; /** * A pipeline combines the result in more than one page together.
* Used for news and articles containing more than one web page.
* MultiPagePipeline will store parts of object and output them when all parts are extracted.
* * @author code4crafter@gmail.com
* @since 0.2.0 */ @Experimental public class MultiPagePipeline implements Pipeline { private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class); private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); @Override public void process(ResultItems resultItems, Task task) { Map resultItemsAll = resultItems.getAll(); Iterator> iterator = resultItemsAll.entrySet().iterator(); while (iterator.hasNext()) { handleObject(iterator); } } private void handleObject(Iterator> iterator) { Map.Entry objectEntry = iterator.next(); Object o = objectEntry.getValue(); //需要拼凑 if (o instanceof MultiPageModel) { MultiPageModel multiPageModel = (MultiPageModel) o; //这次处理的部分,设置为完成 pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.FALSE); //每个key单独加锁 synchronized (pageMap.get(multiPageModel.getPageKey())) { pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE); //其他需要拼凑的部分 if (multiPageModel.getOtherPages() != null) { for (String otherPage : multiPageModel.getOtherPages()) { Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage); if (aBoolean == null) { pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE); } } } //check if all pages are processed Map booleanMap = pageMap.get(multiPageModel.getPageKey()); objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel); if (booleanMap == null) { return; } // /过滤,这次完成的page item中,还未拼凑完整的item,不进入下一个pipeline for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) { if (!stringBooleanEntry.getValue()) { iterator.remove(); return; } } List> entryList = new ArrayList>(); entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet()); if (entryList.size() != 0) { Collections.sort(entryList, new Comparator>() { @Override public int compare(Map.Entry o1, Map.Entry o2) { try { int i1 = Integer.parseInt(o1.getKey()); int i2 = Integer.parseInt(o2.getKey()); return i1 - i2; } catch (NumberFormatException e) { 
return o1.getKey().compareTo(o2.getKey()); } } }); // 合并 MultiPageModel value = entryList.get(0).getValue(); for (int i = 1; i < entryList.size(); i++) { value = value.combine(entryList.get(i).getValue()); } objectEntry.setValue(value); } } } } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java ================================================ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Task; /** * Implements PageModelPipeline to persistent your page model. * * @author code4crafter@gmail.com
* @since 0.2.0 */ public interface PageModelPipeline { public void process(T t, Task task); } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemover.java ================================================ package us.codecraft.webmagic.scheduler; /** * @author code4crafter@gmail.com * Date: 16/12/18 * Time: 上午10:23 */ import com.google.common.hash.BloomFilter; import com.google.common.hash.Funnels; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import java.nio.charset.Charset; import java.util.concurrent.atomic.AtomicInteger;

/**
 * BloomFilterDuplicateRemover for huge number of urls.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.1
 */
public class BloomFilterDuplicateRemover implements DuplicateRemover {

    private int expectedInsertions;

    private double fpp;

    private AtomicInteger counter;

    // Deliberately not final: resetDuplicateCheck replaces it with a freshly built filter.
    private BloomFilter<CharSequence> bloomFilter;

    /**
     * Creates a remover with a false positive probability of 0.01.
     *
     * @param expectedInsertions the number of expected insertions to the constructed bloom filter
     */
    public BloomFilterDuplicateRemover(int expectedInsertions) {
        this(expectedInsertions, 0.01);
    }

    /**
     *
     * @param expectedInsertions the number of expected insertions to the constructed bloom filter
     * @param fpp the desired false positive probability (must be positive and less than 1.0)
     */
    public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
        this.expectedInsertions = expectedInsertions;
        this.fpp = fpp;
        this.bloomFilter = rebuildBloomFilter();
    }

    /**
     * Resets the insertion counter and builds an empty filter from the configured
     * {@code expectedInsertions} and {@code fpp}.
     *
     * @return the new, empty bloom filter; the caller is responsible for installing it
     */
    protected BloomFilter<CharSequence> rebuildBloomFilter() {
        counter = new AtomicInteger(0);
        return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
    }

    /**
     * Reports whether the request's URL has probably been seen before; an unseen URL is
     * recorded and counted.
     *
     * @param request the request to check
     * @param task    unused by this implementation
     * @return true when the filter already (probably) contains the URL
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        String url = getUrl(request);
        boolean isDuplicate = bloomFilter.mightContain(url);
        if (!isDuplicate) {
            bloomFilter.put(url);
            counter.incrementAndGet();
        }
        return isDuplicate;
    }

    /**
     * @param request the request to key
     * @return the string stored in the filter for this request (its URL)
     */
    protected String getUrl(Request request) {
        return request.getUrl();
    }

    /**
     * Forgets every URL seen so far and resets the total count.
     *
     * @param task unused by this implementation
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        // The rebuilt filter must actually replace the old one; discarding the return value
        // would reset only the counter and keep every previously seen URL marked as duplicate.
        this.bloomFilter = rebuildBloomFilter();
    }

    @Override
    public int getTotalRequestsCount(Task task) {
        return counter.get();
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java ================================================ package us.codecraft.webmagic.scheduler; import org.apache.commons.lang3.math.NumberUtils; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import java.io.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; /** * Store urls and cursor in files so that a Spider can resume the status when shutdown.
* * @author code4crafter@gmail.com
* @since 0.2.0
 */
public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable {

    private String filePath = System.getProperty("java.io.tmpdir");

    private String fileUrlAllName = ".urls.txt";

    private Task task;

    private String fileCursor = ".cursor.txt";

    private PrintWriter fileUrlWriter;

    private PrintWriter fileCursorWriter;

    private AtomicInteger cursor = new AtomicInteger();

    private AtomicBoolean inited = new AtomicBoolean(false);

    private BlockingQueue<Request> queue;

    private ScheduledExecutorService flushThreadPool;

    /**
     * @param filePath directory holding the url and cursor files; a trailing separator is
     *                 appended when missing
     */
    public FileCacheQueueScheduler(String filePath) {
        if (!filePath.endsWith("/") && !filePath.endsWith("\\")) {
            filePath += "/";
        }
        this.filePath = filePath;
        initDuplicateRemover();
    }

    private void flush() {
        fileUrlWriter.flush();
        fileCursorWriter.flush();
    }

    /**
     * Runs {@link #init(Task)} exactly once. push and poll can race on the first call, and an
     * unguarded check-then-init would open the writers and replay the url file twice.
     */
    private void ensureInited(Task task) {
        if (!inited.get()) {
            synchronized (this) {
                if (!inited.get()) {
                    init(task);
                }
            }
        }
    }

    private void init(Task task) {
        this.task = task;
        File file = new File(filePath);
        if (!file.exists()) {
            file.mkdirs();
        }
        readFile();
        initWriter();
        initFlushThread();
        inited.set(true);
        logger.info("init cache scheduler success");
    }

    private void initDuplicateRemover() {
        // String.hashCode() may be negative and BloomFilter rejects a negative expected
        // insertion count, so clear the sign bit.
        // NOTE(review): sizing the filter from a path hash looks arbitrary; consider making
        // the capacity configurable.
        int expectedInsertions = this.filePath.hashCode() & Integer.MAX_VALUE;
        setDuplicateRemover(new BloomFilterDuplicateRemover(expectedInsertions));
    }

    private void initFlushThread() {
        flushThreadPool = Executors.newScheduledThreadPool(1);
        flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS);
    }

    private void initWriter() {
        try {
            fileUrlWriter = new PrintWriter(new FileWriter(getFileName(fileUrlAllName), true));
            fileCursorWriter = new PrintWriter(new FileWriter(getFileName(fileCursor), false));
        } catch (IOException e) {
            throw new RuntimeException("init cache scheduler error", e);
        }
    }

    private void readFile() {
        try {
            queue = new LinkedBlockingQueue<>();
            readCursorFile();
            readUrlFile();
        } catch (FileNotFoundException e) {
            // First run: no cache files exist yet.
            logger.info("init cache file " + getFileName(fileUrlAllName));
        } catch (IOException e) {
            logger.error("init file error", e);
        }
    }

    /**
     * Replays the url file: every line is fed to the duplicate remover, and lines beyond the
     * saved cursor are queued again for crawling.
     */
    private void readUrlFile() throws IOException {
        try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) {
            String line;
            int lineReaded = 0;
            while ((line = fileUrlReader.readLine()) != null) {
                Request request = deserializeRequest(line);
                this.getDuplicateRemover().isDuplicate(request, null);
                lineReaded++;
                if (lineReaded > cursor.get()) {
                    queue.add(request);
                }
            }
        }
    }

    /**
     * Restores the cursor from the last non-blank line of the cursor file.
     */
    private void readCursorFile() throws IOException {
        String fileName = getFileName(fileCursor);
        try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) {
            String line;
            String lastLine = null;
            while ((line = fileCursorReader.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty()) {
                    lastLine = line;
                }
            }
            if (lastLine != null) {
                cursor.set(NumberUtils.toInt(lastLine));
            }
        }
    }

    /**
     * Stops the periodic flush and closes both files. Safe to call on a scheduler that was
     * never used, in which case nothing has been opened.
     */
    @Override
    public void close() throws IOException {
        if (flushThreadPool != null) {
            flushThreadPool.shutdown();
        }
        if (fileUrlWriter != null) {
            fileUrlWriter.close();
        }
        if (fileCursorWriter != null) {
            fileCursorWriter.close();
        }
    }

    private String getFileName(String filename) {
        return filePath + task.getUUID() + filename;
    }

    @Override
    protected void pushWhenNoDuplicate(Request request, Task task) {
        ensureInited(task);
        queue.add(request);
        fileUrlWriter.println(serializeRequest(request));
    }

    @Override
    public synchronized Request poll(Task task) {
        ensureInited(task);
        Request request = queue.poll();
        // Advance the persisted cursor only when a request was really handed out. Counting
        // empty polls would push the cursor past the url file and make a resumed run skip
        // urls that were never crawled.
        if (request != null) {
            fileCursorWriter.println(cursor.incrementAndGet());
        }
        return request;
    }

    @Override
    public int getLeftRequestsCount(Task task) {
        return queue.size();
    }

    @Override
    public int getTotalRequestsCount(Task task) {
        return getDuplicateRemover().getTotalRequestsCount(task);
    }

    /**
     * @param request the request to persist
     * @return the single-line form written to the url file (the request URL)
     */
    protected String serializeRequest(Request request) {
        return request.getUrl();
    }

    /**
     * @param line one line of the url file
     * @return the request rebuilt from that line
     */
    protected Request deserializeRequest(String line) {
        return new Request(line);
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java ================================================ package 
us.codecraft.webmagic.scheduler; import java.util.Set; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import com.alibaba.fastjson.JSON; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task;

/**
 * the redis scheduler with priority
 * @author sai
 * Created by sai on 16-5-27.
 */
public class RedisPriorityScheduler extends RedisScheduler {

    private static final String ZSET_PREFIX = "zset_";

    private static final String QUEUE_PREFIX = "queue_";

    private static final String NO_PRIORITY_SUFFIX = "_zore";

    private static final String PLUS_PRIORITY_SUFFIX = "_plus";

    private static final String MINUS_PRIORITY_SUFFIX = "_minus";

    public RedisPriorityScheduler(String host) {
        super(host);
    }

    public RedisPriorityScheduler(JedisPool pool) {
        super(pool);
    }

    /**
     * Stores the URL in the positive-priority sorted set, the negative-priority sorted set
     * or the plain list, depending on the sign of the request priority, then persists the
     * full request when it carries extras.
     */
    @Override
    protected void pushWhenNoDuplicate(Request request, Task task) {
        try (Jedis jedis = pool.getResource()) {
            if (request.getPriority() > 0) {
                jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
            } else if (request.getPriority() < 0) {
                jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
            } else {
                jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
            }
            setExtrasInItem(jedis, request, task);
        }
    }

    /**
     * @return the next request, or null when all three stores are empty
     */
    @Override
    public synchronized Request poll(Task task) {
        try (Jedis jedis = pool.getResource()) {
            String url = getRequest(jedis, task);
            if (StringUtils.isBlank(url)) {
                return null;
            }
            return getExtrasInItem(jedis, url, task);
        }
    }

    /**
     * Takes the next URL: highest positive priority first, then the zero-priority list,
     * then the highest-scored negative priority.
     * NOTE(review): the read (zrevrange) and the delete (zrem) are separate commands, so two
     * crawler processes sharing one Redis could take the same URL — confirm whether that is
     * acceptable.
     */
    private String getRequest(Jedis jedis, Task task) {
        String plusKey = getZsetPlusPriorityKey(task);
        Set<String> urls = jedis.zrevrange(plusKey, 0, 0);
        if (!urls.isEmpty()) {
            String url = urls.iterator().next();
            jedis.zrem(plusKey, url);
            return url;
        }
        String url = jedis.lpop(getQueueNoPriorityKey(task));
        if (StringUtils.isBlank(url)) {
            String minusKey = getZsetMinusPriorityKey(task);
            urls = jedis.zrevrange(minusKey, 0, 0);
            if (!urls.isEmpty()) {
                url = urls.iterator().next();
                jedis.zrem(minusKey, url);
            }
        }
        return url;
    }

    @Override
    public void resetDuplicateCheck(Task task) {
        try (Jedis jedis = pool.getResource()) {
            jedis.del(getSetKey(task));
        }
    }

    private String getZsetPlusPriorityKey(Task task) {
        return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
    }

    private String getQueueNoPriorityKey(Task task) {
        return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
    }

    private String getZsetMinusPriorityKey(Task task) {
        return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
    }

    /**
     * Persists the whole request as JSON, keyed by the SHA-1 of its URL, when it has extras.
     */
    private void setExtrasInItem(Jedis jedis, Request request, Task task) {
        if (!request.getExtras().isEmpty()) {
            String field = DigestUtils.sha1Hex(request.getUrl());
            String value = JSON.toJSONString(request);
            jedis.hset(getItemKey(task), field, value);
        }
    }

    /**
     * Restores the request stored for {@code url}, or builds a plain request when nothing
     * was stored.
     */
    private Request getExtrasInItem(Jedis jedis, String url, Task task) {
        String field = DigestUtils.sha1Hex(url);
        // Read through the String API, the same one used for writing. Fetching raw bytes and
        // decoding them with the platform default charset corrupts non-ASCII JSON on
        // machines whose default charset is not the one Jedis used to encode the value.
        String json = jedis.hget(getItemKey(task), field);
        if (json != null) {
            return JSON.parseObject(json, Request.class);
        }
        return new Request(url);
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java ================================================ package us.codecraft.webmagic.scheduler; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import com.alibaba.fastjson.JSON; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; /** * Use Redis as url scheduler for distributed crawlers.
* * @author code4crafter@gmail.com
* @since 0.2.0 */ public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover { protected JedisPool pool; private static final String QUEUE_PREFIX = "queue_"; private static final String SET_PREFIX = "set_"; private static final String ITEM_PREFIX = "item_"; public RedisScheduler(String host) { this(new JedisPool(new JedisPoolConfig(), host)); } public RedisScheduler(JedisPool pool) { this.pool = pool; setDuplicateRemover(this); } @Override public void resetDuplicateCheck(Task task) { try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } } @Override public boolean isDuplicate(Request request, Task task) { try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; } } @Override protected void pushWhenNoDuplicate(Request request, Task task) { Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { jedis.close(); } } private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { return true; } if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { return true; } if (request.isBinaryContent() || request.getRequestBody() != null) { return true; } if (!request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { return true; } return false; } @Override public synchronized Request poll(Task task) { try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), 
field.getBytes()); if (bytes != null) { Request o = JSON.parseObject(new String(bytes), Request.class); return o; } Request request = new Request(url); return request; } } protected String getSetKey(Task task) { return SET_PREFIX + task.getUUID(); } protected String getQueueKey(Task task) { return QUEUE_PREFIX + task.getUUID(); } protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } @Override public int getLeftRequestsCount(Task task) { try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); } } @Override public int getTotalRequestsCount(Task task) { try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); } } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java ================================================ package us.codecraft.webmagic.utils; import java.lang.reflect.Field; import java.util.LinkedHashSet; import java.util.Set; /** * @author code4crafter@gmail.com * @since 0.5.0 */ public abstract class ClassUtils { public static Set getFieldsIncludeSuperClass(Class clazz) { Set fields = new LinkedHashSet(); Class current = clazz; while (current != null) { Field[] currentFields = current.getDeclaredFields(); for (Field currentField : currentFields) { fields.add(currentField); } current = current.getSuperclass(); } return fields; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java ================================================ package us.codecraft.webmagic.utils; import java.util.Map;

/**
 * A two-level map: the first key selects a sub-map, the second key selects a value in it.
 * Sub-maps are created on demand by {@code newMap()} of the base class.
 *
 * @author code4crafter@gmail.com
 */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {

    private Map<K1, Map<K2, V>> map;

    public DoubleKeyMap() {
        init();
    }

    /**
     * @param map the origin map to contains the DoubleKeyMap
     */
    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
        this(map, DEFAULT_CLAZZ);
    }

    /**
     * @param protoMapClass map class instantiated for the outer map and every sub-map
     */
    public DoubleKeyMap(Class protoMapClass) {
        super(protoMapClass);
        init();
    }

    private void init() {
        if (map == null) {
            map = this.<K1, Map<K2, V>>newMap();
        }
    }

    /**
     * init map with protoMapClass
     *
     * @param map the origin map to contains the DoubleKeyMap
     * @param protoMapClass protoMapClass
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class protoMapClass) {
        super(protoMapClass);
        this.map = map;
        init();
    }

    /**
     * @param key key
     * @return map
     */
    public Map<K2, V> get(K1 key) {
        return map.get(key);
    }

    /**
     * @param key1 key1
     * @param key2 key2
     * @return value
     */
    public V get(K1 key1, K2 key2) {
        Map<K2, V> submap = get(key1);
        if (submap == null) {
            return null;
        }
        return submap.get(key2);
    }

    /**
     * Installs {@code submap} as the sub-map for {@code key1}, replacing any existing one.
     *
     * @param key1 key1
     * @param submap submap
     * @return always null; the declared return type is kept for source compatibility and
     *         there is no single previous value to report
     */
    public synchronized V put(K1 key1, Map<K2, V> submap) {
        // Store into the backing map. Calling put(key1, submap) here would resolve to this
        // very method and recurse until the stack overflows.
        map.put(key1, submap);
        return null;
    }

    /**
     * @param key1 key1
     * @param key2 key2
     * @param value value
     * @return value
     */
    public synchronized V put(K1 key1, K2 key2, V value) {
        if (map.get(key1) == null) {
            // Without the lock, several threads could reach this point at the same time.
            map.put(key1, this.<K2, V>newMap());
        }
        return get(key1).put(key2, value);
    }

    /**
     * @param key1 key1
     * @param key2 key2
     * @return value
     */
    public synchronized V remove(K1 key1, K2 key2) {
        if (get(key1) == null) {
            return null;
        }
        V remove = get(key1).remove(key2);
        // Drop the sub-map once its last entry is gone.
        if (get(key1).size() == 0) {
            remove(key1);
        }
        return remove;
    }

    /**
     * @param key1 key1
     * @return map
     */
    public Map<K2, V> remove(K1 key1) {
        Map<K2, V> remove = map.remove(key1);
        return remove;
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java ================================================ package us.codecraft.webmagic.utils; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.selector.*; import java.util.ArrayList; import java.util.List; /** * Tools for annotation converting.
* * @author code4crafter@gmail.com
* @since 0.2.1 */ public class ExtractorUtils { public static Selector getSelector(ExtractBy extractBy) { String value = extractBy.value(); Selector selector; switch (extractBy.type()) { case Css: selector = new CssSelector(value); break; case Regex: selector = new RegexSelector(value); break; case XPath: selector = new XpathSelector(value); break; case JsonPath: selector = new JsonPathSelector(value); break; default: selector = new XpathSelector(value); } return selector; } public static List getSelectors(ExtractBy[] extractBies) { List selectors = new ArrayList(); if (extractBies == null) { return selectors; } for (ExtractBy extractBy : extractBies) { selectors.add(getSelector(extractBy)); } return selectors; } } ================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java ================================================ package us.codecraft.webmagic.utils; import java.net.Inet6Address; import java.net.InetAddress; import java.net.NetworkInterface; import java.net.SocketException; import java.util.Enumeration;

/**
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public abstract class IPUtils {

    /**
     * Looks through all network interfaces for an address that is not a loopback address.
     * The first such address that is not an {@link Inet6Address} wins; when only
     * {@link Inet6Address} candidates exist, the last one seen is returned.
     *
     * @return the textual host address
     * @throws SocketException if the interfaces cannot be enumerated, or no non-loopback
     *                         address exists on this host
     */
    public static String getFirstNoLoopbackIPAddresses() throws SocketException {
        Enumeration<NetworkInterface> networkInterfaces = NetworkInterface.getNetworkInterfaces();
        if (networkInterfaces == null) {
            throw new SocketException("no network interface found");
        }
        InetAddress localAddress = null;
        while (networkInterfaces.hasMoreElements()) {
            NetworkInterface networkInterface = networkInterfaces.nextElement();
            Enumeration<InetAddress> inetAddresses = networkInterface.getInetAddresses();
            while (inetAddresses.hasMoreElements()) {
                InetAddress address = inetAddresses.nextElement();
                if (address.isLoopbackAddress()) {
                    continue;
                }
                if (!(address instanceof Inet6Address)) {
                    return address.getHostAddress();
                }
                // Remember the IPv6 candidate as a fallback.
                localAddress = address;
            }
        }
        // Fail with a descriptive, already-declared exception instead of a NullPointerException.
        if (localAddress == null) {
            throw new SocketException("no non-loopback address found");
        }
        return localAddress.getHostAddress();
    }
}
================================================ FILE: 
webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java ================================================ package us.codecraft.webmagic.utils; /** * @author code4crafter@gmail.com */ import java.util.HashMap; import java.util.Map;

/**
 * multi-key map, some basic objects
 * <p>
 * Holds the map class that subclasses instantiate, via {@link #newMap()}, for every level
 * of a multi-key map.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    protected static final Class DEFAULT_CLAZZ = HashMap.class;

    @SuppressWarnings("rawtypes")
    private Class protoMapClass = DEFAULT_CLAZZ;

    public MultiKeyMapBase() {
    }

    /**
     * @param protoMapClass map class to instantiate; {@link #newMap()} creates it reflectively
     *                      through its no-argument constructor
     */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Creates a new, empty instance of the configured map class.
     *
     * @return the new map
     * @throws IllegalArgumentException if the configured class cannot be instantiated; the
     *                                  underlying reflection failure is attached as the cause
     */
    @SuppressWarnings("unchecked")
    protected Map newMap() {
        try {
            return (Map) protoMapClass.newInstance();
        } catch (InstantiationException | IllegalAccessException e) {
            // Keep the cause so the reason for the failed instantiation is not lost.
            throw new IllegalArgumentException("wrong proto type map " + protoMapClass, e);
        }
    }
}
================================================ FILE: webmagic-extension/src/main/java/us/codecraft/webmagic/utils/RequestUtils.java ================================================ package us.codecraft.webmagic.utils; import us.codecraft.webmagic.Request; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author code4crafter@gmail.com * Date: 2017/6/5 * Time: 下午4:58 */ public abstract class RequestUtils { private static Pattern p4Range = Pattern.compile("\\[(\\d+)\\-(\\d+)\\]"); public static List from(String exp){ Matcher matcher = p4Range.matcher(exp); if (!matcher.find()) { return Collections.singletonList(new Request(exp)); } int rangeFrom = Integer.parseInt(matcher.group(1)); int rangeTo = Integer.parseInt(matcher.group(2)); if (rangeFrom > rangeTo) { return Collections.emptyList(); } List requests = new ArrayList(rangeTo - rangeFrom + 1); for (int i = rangeFrom; i <= rangeTo; i++) { 
requests.add(new Request(matcher.replaceAll(String.valueOf(i))));
        }
        return requests;
    }
}

================================================
FILE: webmagic-extension/src/main/resources/crawl.js
================================================
// PhantomJS script: loads the URL given as the first argument and prints the
// rendered page content to stdout, or an error line when the load fails.
var system = require('system');
var url = system.args[1];

var page = require('webpage').create();
// Images are not needed for content extraction.
page.settings.loadImages = false;
page.settings.resourceTimeout = 5000;

page.open(url, function (status) {
    if (status != 'success') {
        console.log("HTTP request failed!");
    } else {
        console.log(page.content);
    }

    // Always release the page and terminate the PhantomJS process.
    page.close();
    phantom.exit();
});

================================================
FILE: webmagic-extension/src/main/resources/spider-config-draft.xml
================================================
utf-8
================================================
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java
================================================
package us.codecraft.webmagic;

import junit.framework.Assert;
import us.codecraft.webmagic.pipeline.PageModelPipeline;

/**
 * Test pipeline that only checks that each object it receives is non-null.
 *
 * @author code4crafter@gmail.com
 */
public class MockPageModelPipeline implements PageModelPipeline{

    /**
     * Asserts that the extracted object is not {@code null}.
     *
     * @param o    extracted model object
     * @param task task the object belongs to (unused)
     */
    @Override
    public void process(Object o, Task task) {
        Assert.assertNotNull(o);
    }
}

================================================
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java
================================================
package us.codecraft.webmagic;

import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Test pipeline that discards every result.
 *
 * @author code4crafter@gmail.com
 */
public class MockPipeline implements Pipeline{

    /** Intentionally does nothing. */
    @Override
    public void process(ResultItems resultItems, Task task) {

    }
}

================================================
FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/SimpleHttpClientTest.java
================================================
package us.codecraft.webmagic;

import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.model.AfterExtractor;
import
us.codecraft.webmagic.model.annotation.ExtractBy;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Exercises {@code SimpleHttpClient} by mapping a live weather page onto an annotated model.
 *
 * @author code4crafter@gmail.com
 * Date: 2017/6/3
 * Time: 2:54 PM
 */
public class SimpleHttpClientTest {

    /**
     * Page model whose annotated fields are filled from the weather page;
     * {@link #afterProcess(Page)} then normalises the temperature order.
     */
    public static class Weather implements AfterExtractor {

        // Not annotated, so it is only ever set through the setter.
        private String location;

        @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='tem']/i/regex('([\\-\\d]+)',1)")
        private Integer lowTemperature;

        @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='tem']/span/regex('([\\-\\d]+)',1)")
        private Integer highTemperature;

        @ExtractBy(notNull = true, value = "//div[@id='7d']//ul[@class='t']/li[2]/p[@class='wea']/text()")
        private String desc;

        /**
         * Swaps the two temperatures when the extracted low value is greater than the high value.
         *
         * @param page page the model was extracted from (unused)
         */
        @Override
        public void afterProcess(Page page) {
            // NOTE(review): both fields are unboxed here; this relies on notNull = true
            // guaranteeing they were extracted — confirm against the extractor.
            if (lowTemperature > highTemperature) {
                int temp = lowTemperature;
                lowTemperature = highTemperature;
                highTemperature = temp;
            }
        }

        public String getLocation() {
            return location;
        }

        public void setLocation(String location) {
            this.location = location;
        }

        public Integer getLowTemperature() {
            return lowTemperature;
        }

        public void setLowTemperature(Integer lowTemperature) {
            this.lowTemperature = lowTemperature;
        }

        public Integer getHighTemperature() {
            return highTemperature;
        }

        public void setHighTemperature(Integer highTemperature) {
            this.highTemperature = highTemperature;
        }

        public String getDesc() {
            return desc;
        }

        public void setDesc(String desc) {
            this.desc = desc;
        }

        @Override
        public String toString() {
            return "Weather{" +
                    "location='" + location + '\'' +
                    ", lowTemperature=" + lowTemperature +
                    ", highTemperature=" + highTemperature +
                    ", desc='" + desc + '\'' +
                    '}';
        }
    }

    /**
     * Fetches the weather page and checks that a model instance is produced.
     * Ignored by default because it requests a live external URL.
     */
    @Ignore
    @Test
    public void test() throws Exception {
        Weather weather = new SimpleHttpClient(Site.me()).get("http://www.weather.com.cn/weather/101020100.shtml", Weather.class);
        assertThat(weather).isNotNull();
    }
}

================================================
FILE:
webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java ================================================ package us.codecraft.webmagic.configurable; import org.junit.Test; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.downloader.MockGithubDownloader; import java.util.ArrayList; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com */ public class ConfigurablePageProcessorTest { @Test public void test() throws Exception { List extractRules = new ArrayList(); ExtractRule extractRule = new ExtractRule(); extractRule.setExpressionType(ExpressionType.XPath); extractRule.setExpressionValue("//title"); extractRule.setFieldName("title"); extractRules.add(extractRule); extractRule = new ExtractRule(); extractRule.setExpressionType(ExpressionType.XPath); extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()"); extractRule.setFieldName("star"); extractRules.add(extractRule); ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules)) .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic"); assertThat(resultItems.getAll()).containsEntry("title", "code4craft/webmagic · GitHub"); assertThat(resultItems.getAll()).containsEntry("star", " 86 "); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java ================================================ package us.codecraft.webmagic.downloader; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; /** * @author code4crafter@gmail.com */ public class MockGithubDownloader implements Downloader{ private String 
html = "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + " \n" + " \n" + " \n" + " code4craft/webmagic · GitHub\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + " \n" + " \n" + " \n" + "\n" + " \n" + "\n" + "\n" + " \n" + " \n" + " \n" + "\n" + " \n" + "\n" + " \n" + " \n" + " \n" + " \n" + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + " \n" + "\n" + " \n" + " \n" + "\n" + " \n" + "\n" + "\n" + " \n" + "
\n" + " \n" + " \n" + " \n" + "\n" + "\n" + " \n" + "
\n" + "
\n" + "\n" + " \n" + " \n" + " \n" + "\n" + "
\n" + " Sign up\n" + " Sign in\n" + "
\n" + "\n" + "
\n" + "\n" + " \n" + "
\n" + "\n" + "\n" + "\n" + " \n" + "\n" + "
\n" + " \n" + " This repository\n" + " \n" + "\n" + "
\n" + "
\n" + "\n" + "
\n" + " \n" + " \n" + "
This repository
\n" + "
\n" + "\n" + "
\n" + " \n" + " \n" + "
All repositories
\n" + "
\n" + "\n" + "
\n" + "
\n" + "
\n" + "\n" + " \n" + " \n" + " \n" + "\n" + "\n" + " \n" + "\n" + "
\n" + "
\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + " \n" + "\n" + "\n" + "
\n" + " \n" + "
\n" + "
\n" + " \n" + "\n" + "\n" + "\n" + "

\n" + " public\n" + " \n" + " \n" + " code4craft/webmagic\n" + "\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + "\n" + "

\n" + "
\n" + "
\n" + "\n" + "
\n" + "\n" + "
\n" + "\n" + "
\n" + " \n" + "\n" + "
\n" + "
\n" + " \n" + "
\n" + " \n" + "\n" + "\n" + "
\n" + "
\n" + "\n" + "
\n" + " \n" + "\n" + " \n" + "\n" + "
\n" + "

HTTPS clone URL

\n" + "
\n" + " \n" + "\n" + " \n" + "
\n" + "
\n" + "\n" + " \n" + "\n" + "
\n" + "

Subversion checkout URL

\n" + "
\n" + " \n" + "\n" + " \n" + "
\n" + "
\n" + "\n" + "\n" + "

You can clone with\n" + " HTTPS,\n" + " or Subversion.\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "

\n" + "\n" + " \n" + " \n" + " Clone in Desktop\n" + " \n" + "\n" + "\n" + " \n" + " \n" + " Download ZIP\n" + " \n" + "
\n" + "
\n" + "\n" + "
\n" + " \n" + "
\n" + "\n" + "
\n" + "
\n" + "

A scalable web crawler framework.

\n" + "
\n" + "\n" + "\n" + "\n" + "
\n" + "\n" + "
\n" + "\n" + "
\n" + "\n" + " \n" + "\n" + "
\n" + "
    \n" + "
  1. \n" + " \n" + " \n" + " Java\n" + " 100%\n" + " \n" + "
  2. \n" + "
\n" + "
\n" + "
\n" + "\n" + "
\n" + "\n" + " \n" + " Java\n" + " \n" + "\n" + "\n" + "\n" + "\n" + "
\n" + " \n" + " \n" + " \n" + "\n" + " \n" + "\n" + "\n" + "
\n" + " \n" + " \n" + " branch:\n" + " master\n" + " \n" + "\n" + "
\n" + "\n" + "
\n" + "
\n" + " Switch branches/tags\n" + " \n" + "
\n" + "\n" + "
\n" + "
\n" + " \n" + "
\n" + "
\n" + "
    \n" + "
  • \n" + " Branches\n" + "
  • \n" + "
  • \n" + " Tags\n" + "
  • \n" + "
\n" + "
\n" + "
\n" + "\n" + "
\n" + "\n" + "
\n" + "\n" + "\n" + "
\n" + " \n" + " en-webmagic\n" + "
\n" + "
\n" + " \n" + " gh-pages\n" + "
\n" + "
\n" + " \n" + " master\n" + "
\n" + "
\n" + " \n" + " xsoup\n" + "
\n" + "
\n" + "\n" + "
Nothing to show
\n" + "
\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + " \n" + " webmagic-parent-0.3.1\n" + "
\n" + "
\n" + " \n" + " webmagic-parent-0.2.1\n" + "
\n" + "
\n" + " \n" + " webmagic-0.3.0\n" + "
\n" + "
\n" + " \n" + " version-0.2.0\n" + "
\n" + "
\n" + " \n" + " version-0.1.0\n" + "
\n" + "
\n" + "\n" + "
Nothing to show
\n" + "
\n" + "\n" + "
\n" + "
\n" + "
\n" + "\n" + "\n" + " \n" + "
\n" + "\n" + "\n" + "\n" + "Show File Finder\n" + "
\n" + " \n" + " \n" + "\n" + "
\n" + "

\n" + " Fetching latest commit…\n" + "

\n" + "
\n" + "

\"Octocat-spinner-32-eaf2f5\"

\n" + "

Cannot retrieve the latest commit at this time

\n" + "
\n" + "
\n" + " \n" + "\n" + " \n" + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + "\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " en_docs\n" + " \n" + " update readme\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic-core\n" + " \n" + " add example\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic-extension\n" + " \n" + " add example\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic-lucene\n" + " \n" + " update pom\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic-samples\n" + " \n" + " some bugfix\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic-saxon\n" + " \n" + " xsoup test\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic-selenium\n" + " \n" + " update pom\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " zh_docs\n" + " \n" + " update version\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " .gitignore\n" + " \n" + " 增加剔除文件\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " .travis.yml\n" + " \n" + " add jdk\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " README.md\n" + " \n" + " update version\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " pom.xml\n" + " \n" + " [maven-release-plugin] prepare for next development iteration\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " release-note.md\n" + " \n" + " release note\n" + "
\n" + " \n" + " \"Octocat-spinner-32\"\n" + " \n" + " webmagic manual.md\n" + " \n" + " readme\n" + "
\n" + "
\n" + "\n" + "
\n" + " README.md

\n" + "webmagic

\n" + "\n" + "

Readme in Chinese

\n" + "\n" + "

\"Build

\n" + "\n" + "
\n" + "

A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.

\n" + "
\n" + "\n" + "

\n" + "Features:

\n" + "\n" + "
    \n" + "
  • Simple core with high flexibility.
  • \n" + "
  • Simple API for html extracting.
  • \n" + "
  • Annotation with POJO to customize a crawler, no configuration.
  • \n" + "
  • Multi-thread and Distribution support.
  • \n" + "
  • Easy to be integrated.
  • \n" + "

\n" + "Install:

\n" + "\n" + "

Add dependencies to your pom.xml:

\n" + "\n" + "
    <dependency>\n" +
            "        <groupId>us.codecraft</groupId>\n" +
            "        <artifactId>webmagic-core</artifactId>\n" +
            "        <version>0.3.1</version>\n" +
            "    </dependency>\n" +
            "    <dependency>\n" +
            "        <groupId>us.codecraft</groupId>\n" +
            "        <artifactId>webmagic-extension</artifactId>\n" +
            "        <version>0.3.1</version>\n" +
            "    </dependency>\n" +
            "
\n" + "\n" + "

\n" + "Get Started:

\n" + "\n" + "

\n" + "First crawler:

\n" + "\n" + "

Write a class implements PageProcessor:

\n" + "\n" + "
    public class OschinaBlogPageProcessor implements PageProcessor {\n" +
            "\n" +
            "        private Site site = Site.me().setDomain(\"my.oschina.net\")\n" +
            "           .addStartUrl(\"http://my.oschina.net/flashsword/blog\");\n" +
            "\n" +
            "        @Override\n" +
            "        public void process(Page page) {\n" +
            "            List<String> links = page.getHtml().links().regex(\"http://my\\\\.oschina\\\\.net/flashsword/blog/\\\\d+\").all();\n" +
            "            page.addTargetRequests(links);\n" +
            "            page.putField(\"title\", page.getHtml().xpath(\"//div[@class='BlogEntity']/div[@class='BlogTitle']/h1\").toString());\n" +
            "            page.putField(\"content\", page.getHtml().$(\"div.content\").toString());\n" +
            "            page.putField(\"tags\",page.getHtml().xpath(\"//div[@class='BlogTags']/a/text()\").all());\n" +
            "        }\n" +
            "\n" +
            "        @Override\n" +
            "        public Site getSite() {\n" +
            "            return site;\n" +
            "\n" +
            "        }\n" +
            "\n" +
            "        public static void main(String[] args) {\n" +
            "            Spider.create(new OschinaBlogPageProcessor())\n" +
            "                 .pipeline(new ConsolePipeline()).run();\n" +
            "        }\n" +
            "    }\n" +
            "
\n" + "\n" + "
    \n" + "
  • \n" + "

    page.addTargetRequests(links)

    \n" + "\n" + "

    Add urls for crawling.

    \n" + "
  • \n" + "

You can also use annotation way:

\n" + "\n" + "
    @TargetUrl(\"http://my.oschina.net/flashsword/blog/\\\\d+\")\n" +
            "    public class OschinaBlog {\n" +
            "\n" +
            "        @ExtractBy(\"//title\")\n" +
            "        private String title;\n" +
            "\n" +
            "        @ExtractBy(value = \"div.BlogContent\",type = ExtractBy.Type.Css)\n" +
            "        private String content;\n" +
            "\n" +
            "        @ExtractBy(value = \"//div[@class='BlogTags']/a/text()\", multi = true)\n" +
            "        private List<String> tags;\n" +
            "\n" +
            "        public static void main(String[] args) {\n" +
            "            OOSpider.create(\n" +
            "                Site.me().addStartUrl(\"http://my.oschina.net/flashsword/blog\"),\n" +
            "                new ConsolePageModelPipeline(), OschinaBlog.class).run();\n" +
            "        }\n" +
            "    }\n" +
            "
\n" + "\n" + "

\n" + "Docs and samples:

\n" + "\n" + "

The architecture of webmagic (refered to Scrapy)

\n" + "\n" + "

\"image\"

\n" + "\n" + "

Javadocs: http://code4craft.github.io/webmagic/docs/en/

\n" + "\n" + "

There are some samples in webmagic-samples package.

\n" + "\n" + "

\n" + "Lisence:

\n" + "\n" + "

Lisenced under Apache 2.0 lisence

\n" + "\n" + "

\n" + "Thanks:

\n" + "\n" + "

To write webmagic, I refered to the projects below :

\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + "\n" + "
\n" + "
\n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + "\n" + "
\n" + "
\n" + " \n" + "\n" + " \n" + " \n" + " \n" + "\n" + " \n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + "
\n" + "
\n" + " \n" + "
\n" + "
\n" + "
\n" + "
\n" + "
\n" + "
\n" + "
\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
\n" + "
\n" + "\n" + "\n" + "\n" + "
\n" + " \n" + " \n" + " Something went wrong with that request. Please try again.\n" + "
\n" + "\n" + " \n" + "\n" + "\n"; @Override public Page download(Request request, Task task) { Page page = new Page(); page.setRawText(html); page.setStatusCode(200); page.setDownloadSuccess(true); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; } @Override public void setThread(int threadNum) { } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java ================================================ package us.codecraft.webmagic.formatter; import org.apache.commons.lang3.time.DateFormatUtils; import org.apache.commons.lang3.time.DateUtils; import org.junit.Test; import us.codecraft.webmagic.model.formatter.DateFormatter; import java.util.Date; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com */ public class DateFormatterTest { @Test public void testDateFormatter() throws Exception { DateFormatter dateFormatter = new DateFormatter(); String pattern = "yyyy-MM-dd HH:mm"; Date date = DateUtils.parseDate("2013-09-10 22:11", new String[]{pattern}); dateFormatter.initParam(new String[]{pattern}); Date format = dateFormatter.format(DateFormatUtils.format(date, pattern)); assertThat(format).isEqualTo(date); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.annotation.ExtractBy; /** * @author code4crafter@gmail.com */ public class BaseRepo { @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") protected int star; } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java 
================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com
* @since 0.3.2 */ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) public class GithubRepo extends BaseRepo{ @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") private int fork; public static void main(String[] args) { OOSpider.create(Site.me().setSleepTime(100) , new ConsolePageModelPipeline(), GithubRepo.class) .addUrl("https://github.com/code4craft").thread(10).run(); } public int getStar() { return star; } public int getFork() { return fork; } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoApi.java ================================================ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.annotation.ExtractBy; /** * @author code4crafter@gmail.com * Date: 2017/6/3 * Time: 下午9:07 */ public class GithubRepoApi { @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name",source = ExtractBy.Source.RawText) private String name; public String getName() { return name; } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java ================================================ package us.codecraft.webmagic.model; import org.junit.Test; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.MockGithubDownloader; import us.codecraft.webmagic.example.GithubRepo; import us.codecraft.webmagic.pipeline.PageModelPipeline; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com
*/ public class GithubRepoTest { @Test public void test() { OOSpider.create(Site.me().setSleepTime(0) , new PageModelPipeline() { @Override public void process(GithubRepo o, Task task) { assertThat(o.getStar()).isEqualTo(86); assertThat(o.getFork()).isEqualTo(70); } }, GithubRepo.class).addUrl("https://github.com/code4craft/webmagic").setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java ================================================ package us.codecraft.webmagic.model; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.selector.PlainText; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com */ public class ModelPageProcessorTest { private PageMocker pageMocker = new PageMocker(); @TargetUrl("http://codecraft.us/foo") public static class ModelFoo { @ExtractBy(value = "//div/@foo", notNull = true) private String foo; } @TargetUrl("http://codecraft.us/bar") public static class ModelBar { @ExtractBy(value = "//div/@bar", notNull = true) private String bar; } @TargetUrl(value = "http://webmagic.io/foo/\\d+",sourceRegion = "//li[@class='bar']") @HelpUrl(value = "http://webmagic.io/bar/\\d+",sourceRegion = "//li[@class='foo']") public static class MockModel { } @Test public void testMultiModel_should_not_skip_when_match() throws Exception { Page page = new Page(); page.setRawText("
"); page.setRequest(new Request("http://codecraft.us/foo")); page.setUrl(PlainText.create("http://codecraft.us/foo")); ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, ModelFoo.class, ModelBar.class); modelPageProcessor.process(page); assertThat(page.getResultItems().isSkip()).isFalse(); } @Test public void testExtractLinks() throws Exception { ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class); Page page = pageMocker.getMockPage(); modelPageProcessor.process(page); assertThat(page.getTargetRequests()).containsExactly(new Request("http://webmagic.io/bar/3"), new Request("http://webmagic.io/bar/4"), new Request("http://webmagic.io/foo/3"), new Request("http://webmagic.io/foo/4")); } @Test public void testExtractNoLinks() throws Exception { ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(null, MockModel.class); Page page = pageMocker.getMockPage(); modelPageProcessor.setExtractLinks(false); modelPageProcessor.process(page); assertThat(page.getTargetRequests()).isEmpty(); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMapperTest.java ================================================ package us.codecraft.webmagic.model; import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com * Date: 2017/6/3 * Time: 下午3:23 */ public class PageMapperTest { private PageMocker pageMocker = new PageMocker(); @Test public void test_get() throws Exception { PageMapper pageMapper = new PageMapper(GithubRepoApi.class); GithubRepoApi githubRepo = pageMapper.get(pageMocker.getMockJsonPage()); assertThat(githubRepo.getName()).isEqualTo("webmagic"); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java ================================================ package us.codecraft.webmagic.model; import 
java.io.IOException; import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; /** * @author code4crafter@gmail.com * Date: 2017/6/3 * Time: 下午9:08 */ public class PageMocker { public Page getMockJsonPage() throws IOException { Page page = new Page(); page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset())); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); return page; } public Page getMockPage() throws IOException { Page page = new Page(); page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset())); page.setRequest(new Request("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0")); return page; } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java ================================================ package us.codecraft.webmagic.model; import org.apache.commons.lang3.time.DateFormatUtils; import org.apache.commons.lang3.time.DateUtils; import org.junit.Test; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.Formatter; import us.codecraft.webmagic.model.formatter.DateFormatter; import java.util.Date; import java.util.List; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com * Date: 2017/6/3 * Time: 下午9:06 */ public class PageModelExtractorTest { private PageMocker pageMocker = new PageMocker(); public static class ModelDateStr { @ExtractBy(value = "//div[@class='date']/text()", 
notNull = true) private String dateStr; } public static class ModelDate { @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class) @ExtractBy(value = "//div[@class='date']/text()", notNull = true) private Date date; } public static class ModelInt { @ExtractBy(value = "//div[@class='number']/text()", notNull = true) private int number; } public static class ModelStringList { @ExtractBy("//li[@class='list']/a/@href") private List links; } public static class ModelIntList { @Formatter(subClazz = Integer.class) @ExtractBy("//li[@class='numbers']/text()") private List numbers; } public static class ModelDateList { @Formatter(subClazz = Date.class, value = "yyyyMMdd") @ExtractBy("//li[@class='dates']/text()") private List dates; } public static class ModelCustomList { @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class) @ExtractBy("//li[@class='dates']/text()") private List dates; } public static class ModelJsonStr { @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$.name") private String name; } public static class ModelUrl { @ExtractByUrl("https://api\\.github\\.com/repos/\\w+/(\\w+)") private String name; } @Test public void testXpath() throws Exception { ModelDateStr modelDate = (ModelDateStr) PageModelExtractor.create(ModelDateStr.class).process(pageMocker.getMockPage()); assertThat(modelDate.dateStr).isEqualTo("20170603"); } @Test public void testExtractDate() throws Exception { ModelDate modelDate = (ModelDate) PageModelExtractor.create(ModelDate.class).process(pageMocker.getMockPage()); assertThat(DateFormatUtils.format(modelDate.date,"yyyyMMdd")).isEqualTo("20170603"); } @Test public void testExtractInt() throws Exception { ModelInt modelDate = (ModelInt) PageModelExtractor.create(ModelInt.class).process(pageMocker.getMockPage()); assertThat(modelDate.number).isEqualTo(12); } @Test public void testExtractList() throws Exception { ModelStringList modelDate = (ModelStringList) 
PageModelExtractor.create(ModelStringList.class).process(pageMocker.getMockPage()); assertThat(modelDate.links).containsExactly("http://webmagic.io/list/1","http://webmagic.io/list/2","http://webmagic.io/list/3","http://webmagic.io/list/4"); } @Test public void testExtractIntList() throws Exception { ModelIntList modelDate = (ModelIntList) PageModelExtractor.create(ModelIntList.class).process(pageMocker.getMockPage()); assertThat(modelDate.numbers).containsExactly(1,2,3,4); } @Test public void testExtractDateList() throws Exception { ModelDateList modelDate = (ModelDateList) PageModelExtractor.create(ModelDateList.class).process(pageMocker.getMockPage()); assertThat(modelDate.dates).containsExactly(DateUtils.parseDate("20170601", "yyyyMMdd"), DateUtils.parseDate("20170602", "yyyyMMdd"), DateUtils.parseDate("20170603", "yyyyMMdd"), DateUtils.parseDate("20170604", "yyyyMMdd")); } @Test public void testExtractCustomList() throws Exception { ModelCustomList modelDate = (ModelCustomList) PageModelExtractor.create(ModelCustomList.class).process(pageMocker.getMockPage()); assertThat(modelDate.dates).containsExactly(DateUtils.parseDate("20170601", "yyyyMMdd"), DateUtils.parseDate("20170602", "yyyyMMdd"), DateUtils.parseDate("20170603", "yyyyMMdd"), DateUtils.parseDate("20170604", "yyyyMMdd")); } @Test public void testExtractJson() throws Exception { ModelJsonStr modelDate = (ModelJsonStr) PageModelExtractor.create(ModelJsonStr.class).process(pageMocker.getMockJsonPage()); assertThat(modelDate.name).isEqualTo("webmagic"); } @Test public void testExtractByUrl() throws Exception { ModelUrl modelDate = (ModelUrl) PageModelExtractor.create(ModelUrl.class).process(pageMocker.getMockJsonPage()); assertThat(modelDate.name).isEqualTo("webmagic"); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java ================================================ package us.codecraft.webmagic.monitor; 
import us.codecraft.webmagic.Spider;

/**
 * {@link SpiderStatus} variant used by the monitor tests: on top of what it inherits,
 * it reports the class name of the spider's scheduler.
 *
 * @author code4crafer@gmail.com
 */
public class CustomSpiderStatus extends SpiderStatus implements CustomSpiderStatusMXBean {

    /**
     * Passes both arguments straight through to the {@link SpiderStatus} constructor.
     *
     * @param spider                spider forwarded to the superclass
     * @param monitorSpiderListener listener forwarded to the superclass
     */
    public CustomSpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) {
        super(spider, monitorSpiderListener);
    }

    /**
     * @return fully qualified class name of the scheduler returned by {@code spider.getScheduler()}
     */
    @Override
    public String getSchedulerName() {
        // `spider` is not declared in this class; presumably a field inherited from SpiderStatus.
        return spider.getScheduler().getClass().getName();
    }
}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java ================================================
package us.codecraft.webmagic.monitor;

/**
 * Extends {@link SpiderStatusMXBean} with one additional read-only attribute, the scheduler name.
 *
 * @author code4crafer@gmail.com
 */
public interface CustomSpiderStatusMXBean extends SpiderStatusMXBean {

    /**
     * @return a name identifying the spider's scheduler
     */
    public String getSchedulerName();

}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SeedUrlWithPortTest.java ================================================
package us.codecraft.webmagic.monitor;

import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import javax.management.JMException;

/**
 * Registers a spider whose seed URL carries an explicit port with the {@link SpiderMonitor}.
 *
 * @author jerry_shenchao@163.com
 */
public class SeedUrlWithPortTest {

    /**
     * Creates a spider for {@code http://www.hndpf.org:8889/}, registers it with the shared
     * monitor instance and runs it. There are no assertions: the test passes as long as
     * neither registration nor the run throws.
     * NOTE(review): presumably guards against the ':' of the port breaking JMX registration - confirm.
     *
     * @throws JMException if the monitor fails to register the spider
     */
    @Test
    public void testSeedUrlWithPort() throws JMException {
        Spider spider = Spider.create(new TempProcessor()).addUrl("http://www.hndpf.org:8889/");
        SpiderMonitor.instance().register(spider);
        spider.run();
    }
}

/**
 * Minimal {@link PageProcessor} for the test above: extracts nothing and uses a default site.
 */
class TempProcessor implements PageProcessor {

    /** Intentionally empty: the test does not care about page content. */
    @Override
    public void process(Page page) {

    }

    /** @return a fresh {@code Site.me()} instance on every call */
    @Override
    public Site getSite() {
        return Site.me();
    }
}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java ================================================
package us.codecraft.webmagic.monitor;

import org.junit.Test;
import us.codecraft.webmagic.Spider;
import
us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
import us.codecraft.webmagic.processor.example.ZhihuPageProcessor;

/**
 * Checks that {@link SpiderMonitor} can be subclassed to supply a custom status bean.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public class SpiderMonitorTest {

    /**
     * Registers two spiders on an anonymous {@link SpiderMonitor} subclass whose
     * {@code getSpiderStatusMBean} returns a {@link CustomSpiderStatus}. There are no
     * assertions; the test passes if {@code register} does not throw.
     */
    @Test
    public void testInherit() throws Exception {
        SpiderMonitor spiderMonitor = new SpiderMonitor(){
            @Override
            protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
                return new CustomSpiderStatus(spider, monitorSpiderListener);
            }
        };

        // The spiders are only created and registered here; neither is started in this test.
        Spider zhihuSpider = Spider.create(new ZhihuPageProcessor())
                .addUrl("http://my.oschina.net/flashsword/blog").thread(2);
        Spider githubSpider = Spider.create(new GithubRepoPageProcessor())
                .addUrl("https://github.com/code4craft");

        spiderMonitor.register(zhihuSpider, githubSpider);

    }
}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java ================================================
package us.codecraft.webmagic.processor;

import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.downloader.MockGithubDownloader;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Page processor and JUnit test in one class: {@link #process(Page)} extracts the star and
 * fork counters of a GitHub repository page, and {@link #test()} runs it against the page
 * served by {@link MockGithubDownloader}.
 *
 * @author code4crafter@gmail.com
 */
public class GithubRepoProcessor implements PageProcessor {

    /**
     * Stores two fields on the page: {@code "star"} from the second and {@code "fork"} from
     * the third {@code li} of the {@code pagehead-actions} list.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString());
        page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString());
    }

    /** @return a default {@code Site.me()} configuration */
    @Override
    public Site getSite() {
        return Site.me();
    }

    /**
     * Runs the processor on the mocked page and asserts, inside a pipeline, that the trimmed
     * counters are "78" stars and "65" forks.
     */
    @Test
    public void test() {
        OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                // The extracted text is trimmed before comparison.
                Assert.assertEquals("78",((String)resultItems.get("star")).trim());
                Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
            }
        }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
    }

}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/BloomFilterDuplicateRemoverTest.java ================================================
package us.codecraft.webmagic.scheduler;

import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Tests for {@link BloomFilterDuplicateRemover}: one functional check plus two ignored,
 * long-running measurements that only print their results.
 *
 * @author code4crafer@gmail.com
 */
public class BloomFilterDuplicateRemoverTest {

    /**
     * The first occurrence of a URL must be reported as new and the second as a duplicate;
     * checked for the two URLs "a" and "b".
     */
    @Test
    public void testRemove() throws Exception {
        // NOTE(review): 10 is presumably the expected number of insertions - confirm against the constructor.
        BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);
        boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
        assertThat(isDuplicate).isFalse();
        isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
        assertThat(isDuplicate).isTrue();
        isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
        assertThat(isDuplicate).isFalse();
        isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
        assertThat(isDuplicate).isTrue();

    }

    /**
     * Feeds 5,000,000 distinct URLs first to a bloom-filter remover and then to a
     * {@link HashSetDuplicateRemover}, printing elapsed wall-clock time and the drop in
     * {@code Runtime.freeMemory()} for each. Nothing is asserted.
     */
    @Ignore("long time")
    @Test
    public void testMemory() throws Exception {
        int times = 5000000;
        DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005);
        long freeMemory = Runtime.getRuntime().freeMemory();
        long time = System.currentTimeMillis();
        for (int i = 0; i < times; i++) {
            duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
        }
        System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time));
        // "Memory used" is the difference in free heap before and after the loop.
        System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory()));

        duplicateRemover = new HashSetDuplicateRemover();
        // A GC is requested only before the hash-set run, after the bloom filter was dropped.
        System.gc();
        freeMemory = Runtime.getRuntime().freeMemory();
        time = System.currentTimeMillis();
        for (int i = 0; i < times; i++) {
            duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
        }
        System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time));
        System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory()));
    }

    /**
     * Counts bloom-filter misjudgements over 5,000,000 distinct URLs, each checked twice,
     * and prints the three counters. Nothing is asserted.
     */
    @Ignore("long time")
    @Test
    public void testMissHit() throws Exception {
        int times = 5000000;
        DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01);
        int right = 0;      // first check correctly reported "not a duplicate"
        int wrong = 0;      // first check reported "duplicate" for a URL never seen before
        int missCheck = 0;  // second check failed to report the URL as a duplicate
        for (int i = 0; i < times; i++) {
            boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
            if (duplicate) {
                wrong++;
            } else {
                right++;
            }
            duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
            if (!duplicate) {
                missCheck++;
            }
        }

        System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
    }

}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisPrioritySchedulerTest.java ================================================
package us.codecraft.webmagic.scheduler;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;

/**
 * @author sai
 * Created by sai on 16-7-5.
*/ public class RedisPrioritySchedulerTest { private RedisPriorityScheduler scheduler; @Before public void setUp() { scheduler = new RedisPriorityScheduler("localhost"); } @Ignore("environment depended") @Test public void test() { Task task = new Task() { @Override public String getUUID() { return "TestTask"; } @Override public Site getSite() { return null; } }; scheduler.resetDuplicateCheck(task); Request request = new Request("https://www.google.com"); Request request1= new Request("https://www.facebook.com/"); Request request2= new Request("https://twitter.com"); request.setPriority(1).putExtra("name", "google"); request1.setPriority(0).putExtra("name", "facebook"); request2.setPriority(-1).putExtra("name", "twitter"); scheduler.push(request, task); scheduler.push(request1, task); scheduler.push(request2, task); Request GRequest = scheduler.poll(task); Request FBRequest = scheduler.poll(task); Request TRequest = scheduler.poll(task); Assert.assertEquals(GRequest.getUrl(), request.getUrl()); Assert.assertEquals(GRequest.getExtra("name"), request.getExtra("name")); Assert.assertEquals(FBRequest.getUrl(), request1.getUrl()); Assert.assertEquals(FBRequest.getExtra("name"), request.getExtra("name")); Assert.assertEquals(TRequest.getUrl(), request2.getUrl()); Assert.assertEquals(TRequest.getExtra("name"), request.getExtra("name")); } } ================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java ================================================ package us.codecraft.webmagic.scheduler; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import static org.assertj.core.api.Assertions.assertThat; /** * @author code4crafter@gmail.com
 */
public class RedisSchedulerTest {

    private RedisScheduler redisScheduler;

    /** Creates the scheduler under test, pointing it at {@code localhost}. */
    @Before
    public void setUp() {
        redisScheduler = new RedisScheduler("localhost");
    }

    /**
     * Pushes one request carrying an extra and expects {@code poll} to return a request
     * equal to it. Ignored by default because it depends on the environment.
     */
    @Ignore("environment depended")
    @Test
    public void test() {
        Task task = new Task() {
            @Override
            public String getUUID() {
                return "1";
            }

            @Override
            public Site getSite() {
                return null;
            }
        };
        Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
        request.putExtra("1","2");
        redisScheduler.push(request, task);
        Request poll = redisScheduler.poll(task);
        // Equality is whatever Request.equals defines - presumably including extras; verify.
        assertThat(poll).isEqualTo(request);

    }
}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java ================================================
package us.codecraft.webmagic.utils;

import org.junit.Test;

/**
 * Smoke test for {@link IPUtils}.
 *
 * @author code4crafer@gmail.com
 */
public class IPUtilsTest {

    /** Prints the value returned by the lookup; passes as long as the call does not throw. */
    @Test
    public void testGetFirstNoLoopbackIPAddresses() throws Exception {
        System.out.println(IPUtils.getFirstNoLoopbackIPAddresses());
    }
}
================================================ FILE: webmagic-extension/src/test/java/us/codecraft/webmagic/utils/RequestUtilsTest.java ================================================
package us.codecraft.webmagic.utils;

import org.junit.Test;
import us.codecraft.webmagic.Request;

import java.util.List;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Tests the {@code [from-to]} range expansion of {@link RequestUtils#from}.
 *
 * @author code4crafter@gmail.com
 *         Date: 2017/6/5
 *         Time: 5:08 PM
 */
public class RequestUtilsTest {

    /** A {@code [1-3]} placeholder must expand to exactly three requests, p=1, p=2 and p=3, in order. */
    @Test
    public void test_generate_range() throws Exception {
        List requests = RequestUtils.from("http://angularjs.cn/api/article/latest?p=[1-3]&s=20");
        assertThat(requests).containsExactly(new Request("http://angularjs.cn/api/article/latest?p=1&s=20"), new Request("http://angularjs.cn/api/article/latest?p=2&s=20"), new Request("http://angularjs.cn/api/article/latest?p=3&s=20"));
    }

    /** A descending range such as {@code [10-3]} must yield an empty list. */
    @Test
    public void test_generate_range_when_invalid_number() throws Exception {
        List requests = RequestUtils.from("http://angularjs.cn/api/article/latest?p=[10-3]&s=20");
        assertThat(requests).isEmpty();
    }
}
================================================ FILE: webmagic-extension/src/test/resources/html/mock-github.html ================================================ code4craft/webmagic Skip to content
  • Unwatch
  • Fork

/webmagic

A scalable web crawler framework. http://webmagic.io/ Edit
or Cancel
Java CSS JavaScript FreeMarker HTML Ruby
Find file
New pull request
Latest commit 800f66c @code4craft Revert "remove some unkown config"
This reverts commit 0e245c9.

README.md

logo

Readme in Chinese

User Manual (Chinese)

Build Status

A scalable crawler framework. It covers the whole lifecycle of a crawler: downloading, URL management, content extraction and persistence. It can simplify the development of a specific crawler.

Features:

  • Simple core with high flexibility.
  • Simple API for html extracting.
  • Annotation with POJO to customize a crawler, no configuration.
  • Multi-thread and Distribution support.
  • Easy to be integrated.

Install:

Add dependencies to your pom.xml:

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.5.2</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.5.2</version>
</dependency>

WebMagic uses slf4j with the slf4j-log4j12 implementation. If you use your own slf4j implementation, please exclude slf4j-log4j12.

<exclusions>
    <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
</exclusions>

Get Started:

First crawler:

Write a class that implements PageProcessor. For example, I wrote a crawler for GitHub repository information.

public class GithubRepoPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
        page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
        if (page.getResultItems().get("name")==null){
            //skip this page
            page.setSkip(true);
        }
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
    }
}
  • page.addTargetRequests(links)

    Add urls for crawling.

You can also use annotation way:

@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl("https://github.com/\\w+")
public class GithubRepo {

    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
    private String name;

    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
    private String author;

    @ExtractBy("//div[@id='readme']/tidyText()")
    private String readme;

    public static void main(String[] args) {
        OOSpider.create(Site.me().setSleepTime(1000)
                , new ConsolePageModelPipeline(), GithubRepo.class)
                .addUrl("https://github.com/code4craft").thread(5).run();
    }
}

Docs and samples:

Documents: http://webmagic.io/docs/

The architecture of webmagic (with reference to Scrapy)

image

Javadocs: http://code4craft.github.io/webmagic/docs/en/

There are some samples in webmagic-samples package.

License:

Licensed under the Apache 2.0 license

Contributors:

Thanks to these people for committing source code, reporting bugs or suggesting new features:

Thanks:

To write webmagic, I referred to the projects below:

Mail-list:

https://groups.google.com/forum/#!forum/webmagic-java

http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988

QQ Group: 373225642

Bitdeli Badge

Something went wrong with that request. Please try again.
================================================ FILE: webmagic-extension/src/test/resources/html/mock-webmagic.html ================================================
20170603
12
  • 1
  • 2
  • 3
  • 4
  • 20170601
  • 20170602
  • 20170603
  • 20170604
================================================ FILE: webmagic-extension/src/test/resources/json/mock-githubrepo.json ================================================ { "id": 9623064, "name": "webmagic", "full_name": "code4craft/webmagic", "owner": { "login": "code4craft", "id": 1351884, "avatar_url": "https://avatars0.githubusercontent.com/u/1351884?v=3", "gravatar_id": "", "url": "https://api.github.com/users/code4craft", "html_url": "https://github.com/code4craft", "followers_url": "https://api.github.com/users/code4craft/followers", "following_url": "https://api.github.com/users/code4craft/following{/other_user}", "gists_url": "https://api.github.com/users/code4craft/gists{/gist_id}", "starred_url": "https://api.github.com/users/code4craft/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/code4craft/subscriptions", "organizations_url": "https://api.github.com/users/code4craft/orgs", "repos_url": "https://api.github.com/users/code4craft/repos", "events_url": "https://api.github.com/users/code4craft/events{/privacy}", "received_events_url": "https://api.github.com/users/code4craft/received_events", "type": "User", "site_admin": false }, "private": false, "html_url": "https://github.com/code4craft/webmagic", "description": "A scalable web crawler framework for Java.", "fork": false, "url": "https://api.github.com/repos/code4craft/webmagic", "forks_url": "https://api.github.com/repos/code4craft/webmagic/forks", "keys_url": "https://api.github.com/repos/code4craft/webmagic/keys{/key_id}", "collaborators_url": "https://api.github.com/repos/code4craft/webmagic/collaborators{/collaborator}", "teams_url": "https://api.github.com/repos/code4craft/webmagic/teams", "hooks_url": "https://api.github.com/repos/code4craft/webmagic/hooks", "issue_events_url": "https://api.github.com/repos/code4craft/webmagic/issues/events{/number}", "events_url": "https://api.github.com/repos/code4craft/webmagic/events", "assignees_url": 
"https://api.github.com/repos/code4craft/webmagic/assignees{/user}", "branches_url": "https://api.github.com/repos/code4craft/webmagic/branches{/branch}", "tags_url": "https://api.github.com/repos/code4craft/webmagic/tags", "blobs_url": "https://api.github.com/repos/code4craft/webmagic/git/blobs{/sha}", "git_tags_url": "https://api.github.com/repos/code4craft/webmagic/git/tags{/sha}", "git_refs_url": "https://api.github.com/repos/code4craft/webmagic/git/refs{/sha}", "trees_url": "https://api.github.com/repos/code4craft/webmagic/git/trees{/sha}", "statuses_url": "https://api.github.com/repos/code4craft/webmagic/statuses/{sha}", "languages_url": "https://api.github.com/repos/code4craft/webmagic/languages", "stargazers_url": "https://api.github.com/repos/code4craft/webmagic/stargazers", "contributors_url": "https://api.github.com/repos/code4craft/webmagic/contributors", "subscribers_url": "https://api.github.com/repos/code4craft/webmagic/subscribers", "subscription_url": "https://api.github.com/repos/code4craft/webmagic/subscription", "commits_url": "https://api.github.com/repos/code4craft/webmagic/commits{/sha}", "git_commits_url": "https://api.github.com/repos/code4craft/webmagic/git/commits{/sha}", "comments_url": "https://api.github.com/repos/code4craft/webmagic/comments{/number}", "issue_comment_url": "https://api.github.com/repos/code4craft/webmagic/issues/comments{/number}", "contents_url": "https://api.github.com/repos/code4craft/webmagic/contents/{+path}", "compare_url": "https://api.github.com/repos/code4craft/webmagic/compare/{base}...{head}", "merges_url": "https://api.github.com/repos/code4craft/webmagic/merges", "archive_url": "https://api.github.com/repos/code4craft/webmagic/{archive_format}{/ref}", "downloads_url": "https://api.github.com/repos/code4craft/webmagic/downloads", "issues_url": "https://api.github.com/repos/code4craft/webmagic/issues{/number}", "pulls_url": "https://api.github.com/repos/code4craft/webmagic/pulls{/number}", "milestones_url": 
"https://api.github.com/repos/code4craft/webmagic/milestones{/number}", "notifications_url": "https://api.github.com/repos/code4craft/webmagic/notifications{?since,all,participating}", "labels_url": "https://api.github.com/repos/code4craft/webmagic/labels{/name}", "releases_url": "https://api.github.com/repos/code4craft/webmagic/releases{/id}", "deployments_url": "https://api.github.com/repos/code4craft/webmagic/deployments", "created_at": "2013-04-23T12:57:36Z", "updated_at": "2017-06-03T03:58:13Z", "pushed_at": "2017-06-03T07:10:15Z", "git_url": "git://github.com/code4craft/webmagic.git", "ssh_url": "git@github.com:code4craft/webmagic.git", "clone_url": "https://github.com/code4craft/webmagic.git", "svn_url": "https://github.com/code4craft/webmagic", "homepage": "http://webmagic.io/", "size": 16982, "stargazers_count": 4566, "watchers_count": 4566, "language": "Java", "has_issues": true, "has_projects": true, "has_downloads": true, "has_wiki": true, "has_pages": true, "forks_count": 2432, "mirror_url": null, "open_issues_count": 96, "forks": 2432, "open_issues": 96, "watchers": 4566, "default_branch": "master", "network_count": 2432, "subscribers_count": 618 } ================================================ FILE: webmagic-extension/src/test/resources/log4j2-test.xml ================================================ ================================================ FILE: webmagic-samples/README.md ================================================ webmagic-samples ------- webmagic的一些示例。包括抓取常见 博客、信息类网站等。 ================================================ FILE: webmagic-samples/pom.xml ================================================ us.codecraft webmagic 1.0.4-SNAPSHOT 4.0.0 webmagic-samples ${project.groupId} webmagic-core ${project.version} ${project.groupId} webmagic-extension ${project.version} org.mapdb mapdb 3.1.0 com.fasterxml.jackson.core jackson-core 2.15.2 com.fasterxml.jackson.core jackson-annotations 2.15.2 com.fasterxml.jackson.core jackson-databind 2.16.0 
================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/main/QuickStarter.java ================================================ package us.codecraft.webmagic.main; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.samples.IteyeBlog; import us.codecraft.webmagic.model.samples.News163; import us.codecraft.webmagic.model.samples.OschinaBlog; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.MultiPagePipeline; import java.util.LinkedHashMap; import java.util.Map; import java.util.Scanner; /** * @author code4crafter@gmail.com
*/ public class QuickStarter { private static Map clazzMap; private static Map urlMap; private static void init(){ clazzMap = new LinkedHashMap(); clazzMap.put("1", OschinaBlog.class); clazzMap.put("2", IteyeBlog.class); clazzMap.put("3", News163.class); urlMap = new LinkedHashMap(); urlMap.put("1", "http://my.oschina.net/flashsword/blog"); urlMap.put("2", "http://flashsword20.iteye.com/"); urlMap.put("3", "http://news.163.com/"); } public static void main(String[] args) { init(); String key = null; key = readKey(key); System.out.println("The demo started and will last 20 seconds..."); //Start spider OOSpider.create(Site.me(), clazzMap.get(key)).addUrl(urlMap.get(key)).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).runAsync(); try { Thread.sleep(20000); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("The demo stopped!"); System.out.println("To more usage, try to customize your own Spider!"); System.exit(0); } private static String readKey(String key) { Scanner stdin = new Scanner(System.in); System.out.println("Choose a Spider demo:"); for (Map.Entry classEntry : clazzMap.entrySet()) { System.out.println(classEntry.getKey()+"\t" + classEntry.getValue() + "\t" + urlMap.get(classEntry.getKey())); } while (key == null) { key = stdin.nextLine(); if (clazzMap.get(key) == null) { System.out.println("Invalid choice!"); key = null; } } return key; } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java ================================================ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; /** * @author code4crafter@gmail.com */ public class BaiduNews { @ExtractBy("//h3[@class='c-title']/a/text()") private String name; @ExtractBy("//div[@class='c-summary']/text()") private String description; 
@Override public String toString() { return "BaiduNews{" + "name='" + name + '\'' + ", description='" + description + '\'' + '}'; } public static void main(String[] args) { OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class); //single download BaiduNews baike = ooSpider.get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient"); System.out.println(baike); ooSpider.close(); } public String getName() { return name; } public String getDescription() { return description; } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java ================================================ package us.codecraft.webmagic.model.samples; /** * @author code4crafter@gmail.com
* Date: 13-8-2
* Time: 上午8:10
 */
public interface Blog {

    /** @return the blog post's title */
    public String getTitle();

    /** @return the blog post's content */
    public String getContent();

}
================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/DianpingFtlDataScanner.java ================================================
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;

import java.util.List;

/**
 * @author yihua.huang@dianping.com
* Date: 13-8-13
* Time: 上午10:13
*/ @TargetUrl("http://*.alpha.dp/*") public class DianpingFtlDataScanner implements AfterExtractor { @ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true) private List data; public static void main(String[] args) { OOSpider.create(Site.me().setSleepTime(0), DianpingFtlDataScanner.class) .thread(5).run(); } @Override public void afterProcess(Page page) { if (data.size() > 1) { System.err.println(page.getUrl()); } if (data.size() > 0 && data.get(0).length() > 100) { System.err.println(page.getUrl()); } } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java ================================================ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.util.List; /** * @author code4crafter@gmail.com
 */
@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})
public class GithubRepo implements HasKey {

    // Repository name; extraction requires it to be present (notNull = true).
    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
    private String name;

    // First path segment of the page URL.
    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
    private String author;

    @ExtractBy("//div[@id='readme']")
    private String readme;

    @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']",multi = true)
    private List language;

    @ExtractBy("//a[@class='social-count js-social-count']/text()")
    private String star;

    // NOTE(review): same expression as `star`, so both fields receive the same value -
    // likely unintended; confirm the selector for the fork counter.
    @ExtractBy("//a[@class='social-count js-social-count']/text()")
    private String fork;

    // No pattern given on the annotation - presumably the whole page URL; verify.
    @ExtractByUrl
    private String url;

    /**
     * Crawls from the GitHub explore page with 15 threads, no sleep and three retries,
     * writing models through a {@link JsonFilePageModelPipeline} and scheduling through a
     * {@link FileCacheQueueScheduler} constructed with {@code /data/webmagic/cache/}.
     */
    public static void main(String[] args) {
        OOSpider.create(Site.me().setSleepTime(0).setRetryTimes(3),
                new JsonFilePageModelPipeline(), GithubRepo.class)
                .addUrl("https://github.com/explore")
                .setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/")).thread(15).run();
    }

    /** @return {@code author + "_" + name} */
    @Override
    public String key() {
        return author+"_"+name;
    }

    public String getName() {
        return name;
    }

    public String getReadme() {
        return readme;
    }

    public String getAuthor() {
        return author;
    }

    public List getLanguage() {
        return language;
    }

    public String getUrl() {
        return url;
    }

    public String getStar() {
        return star;
    }

    public String getFork() {
        return fork;
    }
}
================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java ================================================
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;

/**
 * @author code4crafter@gmail.com
* Date: 13-8-2
* Time: 上午7:52
 */
@TargetUrl("http://*.iteye.com/blog/*")
public class IteyeBlog implements Blog{

    // Selected with the default selector type (an XPath-style expression).
    @ExtractBy("//title")
    private String title;

    // Selected with a CSS selector instead of the default type.
    @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        return "IteyeBlog{" +
                "title='" + title + '\'' +
                ", content='" + content + '\'' +
                '}';
    }

    /** Crawls the blog of {@code flashsword20.iteye.com} with a default site configuration. */
    public static void main(String[] args) {
        OOSpider.create(Site.me(), IteyeBlog.class).addUrl("http://flashsword20.iteye.com/blog").run();
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}
================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/JokejiModel.java ================================================
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.scheduler.RedisScheduler;

/**
 * Page model for jokeji.cn joke pages: a title and the joke text.
 *
 * @author code4crafter@gmail.com
 */
@TargetUrl("http://www.jokeji.cn/jokehtml/jy/\\d+.htm")
@HelpUrl("http://www.jokeji.cn/list\\w+.htm")
public class JokejiModel {

    // Regex group 1 applied to the <title> element: the leading run of non-underscore characters.
    @ExtractBy("//title/regex('([^_]+)',1)")
    private String title;

    // NOTE(review): the class value mob_txt is unquoted here, unlike the other models in this
    // package - confirm the selector engine accepts that.
    @ExtractBy("//div[@class=mob_txt]/tidyText()")
    private String content;

    /**
     * Crawls jokeji.cn with two threads, the GBK charset, 100 ms sleep and a 3000 ms timeout,
     * printing models to the console and scheduling through Redis at {@code 127.0.0.1}.
     */
    public static void main(String[] args) {
        OOSpider.create(Site.me().setDomain("www.jokeji.cn").setCharset("gbk").setSleepTime(100).setTimeOut(3000)
                .setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)")
                , new ConsolePageModelPipeline(), JokejiModel.class).addUrl("http://www.jokeji.cn/").thread(2)
                .scheduler(new RedisScheduler("127.0.0.1"))
                .run();
    }

}
================================================ FILE:
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Kr36NewsModel.java ================================================
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;

import javax.management.JMException;
import java.io.IOException;

/**
 * Page model for 36kr.com news articles: title, tidied content and the page URL.
 *
 * @author code4crafter@gmail.com <br>
 */
@TargetUrl("http://www.36kr.com/p/\\d+.html")
@HelpUrl("http://www.36kr.com/#/page/\\d+")
public class Kr36NewsModel {

    @ExtractBy("//h1[@class='entry-title sep10']")
    private String title;

    @ExtractBy("//div[@class='mainContent sep-10']/tidyText()")
    private String content;

    // No pattern given on the annotation - presumably the whole page URL; verify.
    @ExtractByUrl
    private String url;

    /**
     * Benchmark entry point: starts a 20-thread spider with no sleep time whose pipeline
     * discards every model, then registers the spider with the shared {@link SpiderMonitor}.
     *
     * @throws IOException declared by the signature; nothing visible in this method throws it
     * @throws JMException if the monitor fails to register the spider
     */
    public static void main(String[] args) throws IOException, JMException {
        //Just for benchmark
        Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() {
            @Override
            public void process(Object o, Task task) {
                // Deliberately empty: results are discarded.
            }
        }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/");
        thread.start();
        // The spider is registered after start() has been called.
        SpiderMonitor spiderMonitor = SpiderMonitor.instance();
        spiderMonitor.register(thread);
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }

    public String getUrl() {
        return url;
    }
}
================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java ================================================
package us.codecraft.webmagic.model.samples;

import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.MultiPagePipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler;

import java.util.Collection;
import java.util.List;

/**
 * Page model for a news.163.com article that may be split over several pages; the pages
 * are merged through the {@link MultiPageModel} contract.
 *
 * @author code4crafter@gmail.com <br>
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements MultiPageModel {

    // Article identifier taken from the URL: the part of the file name before any underscore.
    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    // Page number taken from the "_<n>" URL suffix; stays null when the URL has no suffix.
    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    // Page numbers of the sibling pages linked from the pager block.
    @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/regex('http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html',1)"
            , multi = true, notNull = false)
    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    @Override
    public String getPageKey() {
        return pageKey;
    }

    @Override
    public Collection<String> getOtherPages() {
        return otherPage;
    }

    /** @return the extracted page number, or {@code "1"} when the URL carried none */
    @Override
    public String getPage() {
        if (page == null) {
            return "1";
        }
        return page;
    }

    /**
     * Builds a new model holding this page's title and this page's content followed by the
     * other page's content. Only {@code title} and {@code content} are set on the result;
     * {@code pageKey}, {@code page} and {@code otherPage} are left null.
     *
     * @param multiPageModel the following page; cast to {@link News163} without a type check
     * @return the merged model
     */
    @Override
    public MultiPageModel combine(MultiPageModel multiPageModel) {
        News163 news163 = new News163();
        news163.title = this.title;
        News163 pagedModel1 = (News163) multiPageModel;
        news163.content = this.content + pagedModel1.content;
        return news163;
    }

    @Override
    public String toString() {
        return "News163{" +
                "content='" + content + '\'' +
                ", title='" + title + '\'' +
                ", otherPage=" + otherPage +
                '}';
    }

    /**
     * Crawls one multi-page article, scheduling through Redis on {@code localhost}; the
     * {@link MultiPagePipeline} is added before the console output pipeline.
     */
    public static void main(String[] args) {
        OOSpider.create(Site.me(), News163.class).addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html")
                .scheduler(new RedisScheduler("localhost")).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).run();
    }

}
================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java ================================================ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.*; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com <br> */ @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") @HelpUrl("http://www.oschina.net/question/*") @ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true) public class OschinaAnswer implements AfterExtractor{ @ExtractBy("//img/@title") private String user; @ExtractBy("//div[@class='detail']") private String content; public static void main(String[] args) { OOSpider.create(Site.me(), OschinaAnswer.class).addUrl("http://www.oschina.net/question/567527_120597").run(); } @Override public void afterProcess(Page page) { } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java ================================================ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.pipeline.PageModelPipeline; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; import java.util.List; /** * @author code4crafter@gmail.com <br> */ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") public class OschinaBlog{ @ExtractBy("//title") private String title; @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) private String content; @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List<String> 
tags; public static void main(String[] args) { OOSpider.create(Site.me() .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36") .setSleepTime(0) .setRetryTimes(3) ,new PageModelPipeline() { @Override public void process(Object o, Task task) { } }, OschinaBlog.class).thread(10).addUrl("http://my.oschina.net/flashsword/blog").run(); } public String getTitle() { return title; } public String getContent() { return content; } public List<String> getTags() { return tags; } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java ================================================ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; /** * @author code4crafter@gmail.com */ @TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") @ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) public class QQMeishi { @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()") private String shopName; @ExtractBy("//div[@class=info]/a[@class=title]/text()") private String promo; public static void main(String[] args) { OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class).addUrl("http://meishi.qq.com/beijing/c/all").thread(4).run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java ================================================ package us.codecraft.webmagic.recover; import com.google.common.base.Charsets; import com.google.common.hash.BloomFilter; import com.google.common.hash.Funnels; import org.mapdb.DB; import org.mapdb.DBMaker; import 
org.mapdb.IndexTreeList; import org.mapdb.Serializer; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import java.util.concurrent.atomic.AtomicInteger; /** * @author :linweisen */ public class DuplicateStorageRemover implements DuplicateRemover { private DB db; private static String DATABASE_NAME = "duplicate"; private IndexTreeList<String> urlDuplicateQueue; private BloomFilter<CharSequence> bloomFilter; private AtomicInteger counter; public DuplicateStorageRemover(String path) { String duplicatStoragePath = path; DB db = DBMaker.fileDB(duplicatStoragePath) .fileMmapEnableIfSupported() .fileMmapPreclearDisable() .cleanerHackEnable() .closeOnJvmShutdown() .transactionEnable() .concurrencyScale(128) .make(); this.db = db; this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen(); counter = new AtomicInteger(this.urlDuplicateQueue.size()); this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); for (String url : this.urlDuplicateQueue){ bloomFilter.put(url); } } @Override public boolean isDuplicate(Request request, Task task) { String url = request.getUrl(); boolean isDuplicate = bloomFilter.mightContain(url); if (!isDuplicate) { bloomFilter.put(url); urlDuplicateQueue.add(url); this.db.commit(); counter.incrementAndGet(); } return isDuplicate; } @Override public void resetDuplicateCheck(Task task) { this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); this.urlDuplicateQueue.clear(); } @Override public int getTotalRequestsCount(Task task) { return counter.get(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java ================================================ package us.codecraft.webmagic.recover; import com.fasterxml.jackson.databind.ObjectMapper; import 
org.apache.commons.lang3.StringUtils; import org.mapdb.DB; import org.mapdb.DBMaker; import org.mapdb.IndexTreeList; import org.mapdb.Serializer; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import java.io.IOException; /** * @author :linweisen */ public class MmapQueueScheduler extends DuplicateRemovedScheduler { private DB db; private static String DATABASE_NAME = "queue"; private IndexTreeList<String> queue; private static ObjectMapper mapper; public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) { super.setDuplicateRemover(duplicateRemover); String queuePath = path; DB db = DBMaker.fileDB(queuePath) .fileMmapEnableIfSupported() .fileMmapPreclearDisable() .cleanerHackEnable() .closeOnJvmShutdown() .transactionEnable() .concurrencyScale(128) .make(); this.db = db; this.mapper = new ObjectMapper(); this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen(); } @Override public Request poll(Task task) { if (this.queue.size() > 0){ String s = queue.remove(0); return fromJson(s, Request.class); }else{ return null; } } @Override public void pushWhenNoDuplicate(Request request, Task task) { queue.add(toJson(request)); this.db.commit(); } public String toJson(Object object) { try { return mapper.writeValueAsString(object); } catch (IOException e) { logger.warn("write to json string error:" + object, e); return null; } } public <T> T fromJson(String jsonString, Class<T> clazz) { if (StringUtils.isEmpty(jsonString)) { return null; } try { return mapper.readValue(jsonString, clazz); } catch (IOException e) { logger.warn("parse json string error:" + jsonString, e); return null; } } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java 
================================================ package us.codecraft.webmagic.recover; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.samples.SinaBlogProcessor; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; /** * @author code4crafter@gmail.com <br> */ public class RecoverSample { public static void main(String[] args) { String storage = "queue"; String duplicate = "duplicate"; Spider spider = new Spider(new SinaBlogProcessor()); DuplicateRemover remover = new DuplicateStorageRemover(duplicate); spider.setScheduler(new MmapQueueScheduler(remover, storage)); spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") .run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AlexanderMcqueenGoodsProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.PriorityScheduler; /** * @author code4crafer@gmail.com */ public class AlexanderMcqueenGoodsProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(0); public static final String URL_LIST = "http://www\\.alexandermcqueen\\.cn/.*"; public static final String URL_POST = "http://www\\.alexandermcqueen\\.cn/cn/\\w+/.*\\.html"; @Override public void process(Page page) { if (page.getUrl().regex(URL_POST).match()) { page.putField("goodsName", page.getHtml().xpath("//div[@id='description']/h1/tidyText()")); if (page.getResultItems().get("goodsName") == null) { page.setSkip(true); } page.putField("currency", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()")); page.putField("goodsPrice", 
page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()")); page.putField("description", page.getHtml() .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()")); page.putField("material", page.getHtml() .xpath("//div[@id='tabbedDescription']" + "//div[@class='tabbedDescription']" + "//ul[@id='tabs']" + "//li[@id='tab_description']" + "//div[@class='productProperty']" + "//div[@class='productPropertyRow']/span[2]/tidyText()")); page.putField("goodsCode", page.getHtml() .xpath("//div[@id='tabbedDescription']" + "//div[@class='tabbedDescription']" + "//ul[@id='tabs']" + "//li[@id='tab_description']" + "//div[@class='productProperty']" + "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()")); page.putField("goodsSize", page.getHtml() .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']")); page.putField("goodsColors", page.getHtml() .xpath("//div[@id='colors']/ul/html()")); } else { page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new AlexanderMcqueenGoodsProcessor()).setScheduler(new PriorityScheduler()) .addUrl("http://www.alexandermcqueen.cn/sitemap.asp?tskay=E2F1A848").thread(5).run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AmanzonPageProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import java.util.List; /** * 
@author code4crafer@gmail.com */ public class AmanzonPageProcessor implements PageProcessor{ public void process(Page page) { Html html = page.getHtml(); List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); if(questionList != null && questionList.size() > 1) { //i=0是列名称,所以i从1开始 for( int i = 1 ; i < questionList.size(); i++) { System.out.println(questionList.get(i)); Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>"); String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); System.out.println(comment); String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); System.out.println(answerNum); String createTime = tempHtml.xpath("//td[3]/text()").toString(); System.out.println(createTime); /* Document doc = Jsoup.parse(questionList.get(i)); Html hmt = Html.create(questionList.get(i)) ; String str = hmt.links().toString(); String content = doc.getElementsByTag("a").text(); String ss = doc.text();*/ } } } @Override public Site getSite() { return Site.me(); } public static void main(String[] args) { Spider.create(new AmanzonPageProcessor()).test("http://www.amazon.de/forum/Fx27CUFD8S7LJ5D"); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java ================================================ package us.codecraft.webmagic.samples; import java.util.List; import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; /** * @author code4crafter@gmail.com * @since 0.5.0 */ public class AngularJSProcessor implements PageProcessor { private Site site = Site.me(); private static final String ARITICALE_URL = 
"http://angularjs\\.cn/api/article/\\w+"; private static final String LIST_URL = "http://angularjs\\.cn/api/article/latest.*"; @Override public void process(Page page) { if (page.getUrl().regex(LIST_URL).match()) { List<String> ids = new JsonPathSelector("$.data[*]._id").selectList(page.getRawText()); if (CollectionUtils.isNotEmpty(ids)) { for (String id : ids) { page.addTargetRequest("http://angularjs.cn/api/article/" + id); } } } else { page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText())); page.putField("content", new JsonPathSelector("$.data.content").select(page.getRawText())); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new AngularJSProcessor()).addUrl("http://angularjs.cn/api/article/latest?p=1&s=20").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com <br> */ public class DiandianBlogProcessor implements PageProcessor { private Site site; @Override public void process(Page page) { //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 //toString()表示取单条结果,all()表示取多条 List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 page.putField("content", page.getHtml().smartContent()); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); page.putField("id", 
page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); } @Override public Site getSite() { //site定义抽取配置,以及开始url等 if (site == null) { site = Site.me().setDomain("progressdaily.diandian.com"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.PlainText; import java.util.List; /** * @author code4crafter@gmail.com <br> * Date: 13-4-21 * Time: 下午8:08 */ public class DiaoyuwengProcessor implements PageProcessor { private Site site; @Override public void process(Page page) { List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all(); page.addTargetRequests(requests); requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").all(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } } @Override public Site getSite() { if (site==null){ site= Site.me().setDomain("www.diaoyuweng.com"). 
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setCharset("GBK").setSleepTime(500); } return site; } public static void main(String[] args) { Spider.create(new DiaoyuwengProcessor()).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; /** * @author code4crafter@gmail.com <br> * Date: 13-4-21 * Time: 下午1:48 */ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex("<title>(.*)")); page.putField("body",page.getHtml().xpath("//dd")); } @Override public Site getSite() { return Site.me().setDomain("sh.58.com").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. 
} public static void main(String[] args) { Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).addUrl("http://sh1.51a8.com/").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java ================================================ package us.codecraft.webmagic.samples; /** * @author code4crafer@gmail.com */ public class GithubRepo { private String name; private String author; private String readme; public String getName() { return name; } public void setName(String name) { this.name = name; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getReadme() { return readme; } public void setReadme(String readme) { this.readme = readme; } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
* @since 0.5.1 */ public class GithubRepoPageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(0); @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); GithubRepo githubRepo = new GithubRepo(); githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString()); githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); if (githubRepo.getName() == null) { //skip this page page.setSkip(true); } else { page.putField("repo", githubRepo); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com
*/ public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { List requests = page.getHtml().links().regex(".*article.*").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()")); } @Override public Site getSite() { return Site.me().setDomain("www.huxiu.com"); } public static void main(String[] args) { Spider.create(new HuxiuProcessor()).addUrl("http://www.huxiu.com/").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java ================================================ package us.codecraft.webmagic.samples; import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com
*/ public class InfoQMiniBookProcessor implements PageProcessor { private Site site; @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all()); List all = page.getHtml().links().regex(".*\\.pdf").all(); if (CollectionUtils.isNotEmpty(all)) { page.putField("pdf", all); } else { page.getResultItems().setSkip(true); } } @Override public Site getSite() { if (site == null) { site = Site.me().setDomain("www.infoq.com").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; } public static void main(String[] args) { Spider.create(new InfoQMiniBookProcessor()) .thread(5) .addUrl("http://www.infoq.com/cn/minibooks") .run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
*/ public class IteyeBlogProcessor implements PageProcessor { private Site site; @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all()); page.putField("title",page.getHtml().xpath("//title").toString()); page.putField("content",page.getHtml().smartContent().toString()); } @Override public Site getSite() { if (site == null) { site = Site.me().setDomain("yanghaoli.iteye.com"); } return site; } public static void main(String[] args) { Spider.create(new IteyeBlogProcessor()).thread(5).addUrl("http://yanghaoli.iteye.com/").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
* Date: 13-5-20 * Time: 下午5:31 */ public class KaichibaProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; page.addTargetRequest("http://kaichiba.com/shop/" + i); page.putField("title",page.getHtml().xpath("//Title")); page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); } @Override public Site getSite() { return Site.me().setDomain("kaichiba.com").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } public static void main(String[] args) { Spider.create(new KaichibaProcessor()).addUrl("http://kaichiba.com/shop/41725781").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.samples.pipeline.OneFilePipeline; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import us.codecraft.webmagic.selector.Selectable; import java.io.FileNotFoundException; import java.io.UnsupportedEncodingException; import java.util.List; /** * @author code4crafer@gmail.com */ public class MamacnPageProcessor implements PageProcessor { private Site site = Site.me().setDomain("www.mama.cn").setSleepTime(100); @Override public void process(Page page) { List nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes(); StringBuilder accum = new StringBuilder(); for (Selectable node : nodes) { 
accum.append("img:").append(node.xpath("//a/@href").get()).append("\n"); accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n"); } page.putField("",accum.toString()); if (accum.length() == 0) { page.setSkip(true); } page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all()); } @Override public Site getSite() { return site; } public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException { Spider.create(new MamacnPageProcessor()) .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn")) .addUrl("http://www.mama.cn/photo/t1-p1.html") .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data")) .thread(5) .run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com
* Date: 13-5-20 * Time: 下午5:31 */ public class MeicanProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").all()); page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()")); page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()")); } @Override public Site getSite() { return Site.me().setDomain("meican.com").setCharset("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } public static void main(String[] args) { Spider.create(new MeicanProcessor()).addUrl("http://www.meican.com/shanghai/districts").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午8:08 */ public class NjuBBSProcessor implements PageProcessor { @Override public void process(Page page) { List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); } @Override public Site getSite() { return Site.me().setDomain("bbs.nju.edu.cn"); } public static void main(String[] args) { Spider.create(new NjuBBSProcessor()).addUrl("http://bbs.nju.edu.cn/board?board=Pictures").run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.downloader.PhantomJSDownloader; import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * Created by dolphineor on 2014-11-21. *

* 以淘宝为例, 搜索冬装的相关结果 */ public class PhantomJSPageProcessor implements PageProcessor { private Site site = Site.me() .setDomain("s.taobao.com") .setCharset("GBK") .addHeader("Referer", "http://www.taobao.com/") .setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { if (page.getRawText() != null) page.putField("html", page.getRawText()); } @Override public Site getSite() { return site; } public static void main(String[] args) throws Exception { PhantomJSDownloader phantomDownloader = new PhantomJSDownloader(); CollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline(); Spider.create(new PhantomJSPageProcessor()) .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc") //%B6%AC%D7%B0为冬装的GBK编码 .setDownloader(phantomDownloader) .addPipeline(collectorPipeline) .thread((Runtime.getRuntime().availableProcessors() - 1) << 1) .run(); List resultItemsList = collectorPipeline.getCollected(); System.out.println(resultItemsList.get(0).get("html").toString()); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com
*/ public class QzoneBlogProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); page.addTargetRequests(requests); page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); page.putField("content",page.getHtml().smartContent()); } @Override public Site getSite() { return Site.me().setDomain("www.diandian.com"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
*/ public class SinaBlogProcessor implements PageProcessor { public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html"; public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html"; private Site site = Site .me() .setDomain("blog.sina.com.cn") .setSleepTime(3000) .setUserAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); @Override public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()) { page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); //文章页 } else { page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); page.putField("date", page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") .run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; /** * @author code4crafter@gmail.com
*/ public class TianyaPageProcesser implements PageProcessor { @Override public void process(Page page) { List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all(); page.addTargetRequests(strings); page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); page.putField("body",page.getHtml().smartContent()); } @Override public Site getSite() { return Site.me().setDomain("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/ZhihuPageProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import java.util.List; /** * @author 410775541@qq.com
* @since 0.5.1 */ public class ZhihuPageProcessor implements PageProcessor { private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000) .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0") .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3") .setCharset("UTF-8"); private static final int voteNum = 1000; @Override public void process(Page page) { List relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all(); page.addTargetRequests(relativeUrl); relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all(); page.addTargetRequests(relativeUrl); List answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all(); boolean exist = false; for(String answer:answers){ String vote = new Html(answer).xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString(); if(Integer.valueOf(vote) >= voteNum){ page.putField("vote",vote); page.putField("content",new Html(answer).xpath("//div[@class='zm-editable-content']")); page.putField("userid", new Html(answer).xpath("//a[@class='author-link']/@href")); exist = true; } } if(!exist){ page.setSkip(true); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new ZhihuPageProcessor()). addUrl("http://www.zhihu.com/search?type=question&q=java"). addPipeline(new FilePipeline("D:\\webmagic\\")). thread(5). 
run(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java ================================================ package us.codecraft.webmagic.samples.formatter; import us.codecraft.webmagic.model.formatter.ObjectFormatter; /** * @author yihua.huang@dianping.com */ public class StringTemplateFormatter implements ObjectFormatter { private String template; @Override public String format(String raw) throws Exception { return String.format(template, raw); } @Override public Class clazz() { return String.class; } @Override public void initParam(String[] extra) { template = extra[0]; } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/OneFilePipeline.java ================================================ package us.codecraft.webmagic.samples.pipeline; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.*; import java.util.Map; /** * @author code4crafer@gmail.com */ public class OneFilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); private PrintWriter printWriter; public OneFilePipeline() throws FileNotFoundException, UnsupportedEncodingException { this("/data/webmagic/"); } public OneFilePipeline(String path) throws FileNotFoundException, UnsupportedEncodingException { setPath(path); printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path)), "UTF-8")); } @Override public synchronized void process(ResultItems resultItems, Task task) { printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { 
Iterable value = (Iterable) entry.getValue(); printWriter.println(entry.getKey() + ":"); for (Object o : value) { printWriter.println(o); } } else { printWriter.println(entry.getKey() + ":\t" + entry.getValue()); } } printWriter.flush(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java ================================================ package us.codecraft.webmagic.samples.pipeline; /** * @author code4crafer@gmail.com */ public class ReplacePipeline { } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/DelayQueueScheduler.java ================================================ package us.codecraft.webmagic.samples.scheduler; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.PriorityScheduler; import java.util.HashSet; import java.util.Set; import java.util.concurrent.DelayQueue; import java.util.concurrent.Delayed; import java.util.concurrent.TimeUnit; /** * @author code4crafter@gmail.com */ public class DelayQueueScheduler extends PriorityScheduler { private DelayQueue queue = new DelayQueue(); private Set urls = new HashSet(); private long time; private TimeUnit timeUnit; private class RequestWrapper implements Delayed { private long startTime = System.currentTimeMillis(); private Request request; private RequestWrapper(Request request) { this.request = request; } private long getStartTime() { return startTime; } private Request getRequest() { return request; } @Override public long getDelay(TimeUnit unit) { long convert = unit.convert(TimeUnit.MILLISECONDS.convert(time, timeUnit) - System.currentTimeMillis() + startTime, TimeUnit.MILLISECONDS); return convert; } @Override public int compareTo(Delayed o) { return new Long(getDelay(TimeUnit.MILLISECONDS)).compareTo(o.getDelay(TimeUnit.MILLISECONDS)); } } public 
DelayQueueScheduler(long time, TimeUnit timeUnit) { this.time = time; this.timeUnit = timeUnit; } @Override public synchronized void push(Request request, Task task) { if (urls.add(request.getUrl())) { queue.add(new RequestWrapper(request)); } } @Override public synchronized Request poll(Task task) { RequestWrapper take = null; while (take == null) { try { take = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } } queue.add(new RequestWrapper(take.getRequest())); return take.getRequest(); } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java ================================================ package us.codecraft.webmagic.samples.scheduler; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.PriorityScheduler; /** * @author code4crafter@gmail.com */ public class LevelLimitScheduler extends PriorityScheduler { private int levelLimit = 3; public LevelLimitScheduler(int levelLimit) { this.levelLimit = levelLimit; } @Override public synchronized void push(Request request, Task task) { if (((Integer) request.getExtra("_level")) <= levelLimit) { super.push(request, task); } } } ================================================ FILE: webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/ZipCodePageProcessor.java ================================================ package us.codecraft.webmagic.samples.scheduler; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.PriorityScheduler; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import static us.codecraft.webmagic.selector.Selectors.xpath; /** * @author 
code4crafter@gmail.com */ public class ZipCodePageProcessor implements PageProcessor { private Site site = Site.me().setCharset("gb2312") .setSleepTime(100); @Override public void process(Page page) { if (page.getUrl().toString().equals("http://www.ip138.com/post/")) { processCountry(page); } else if (page.getUrl().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").toString() != null) { processDistrict(page); } else { processProvince(page); } } private void processCountry(Page page) { List provinces = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all(); for (String province : provinces) { String link = xpath("//@href").select(province); String title = xpath("/text()").select(province); Request request = new Request(link).setPriority(0).putExtra("province", title); page.addTargetRequest(request); } } private void processProvince(Page page) { //这里仅靠xpath没法精准定位,所以使用正则作为筛选,不符合正则的会被过滤掉 List districts = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all(); Pattern pattern = Pattern.compile("([^<>]+).*?href=\"(.*?)\"",Pattern.DOTALL); for (String district : districts) { Matcher matcher = pattern.matcher(district); while (matcher.find()) { String title = matcher.group(1); String link = matcher.group(2); Request request = new Request(link).setPriority(1).putExtra("province", page.getRequest().getExtra("province")).putExtra("district", title); page.addTargetRequest(request); } } } private void processDistrict(Page page) { String province = page.getRequest().getExtra("province").toString(); String district = page.getRequest().getExtra("district").toString(); String zipCode = page.getHtml().regex("

邮编:(\\d+)

").toString(); page.putField("result", StringUtils.join(new String[]{province, district, zipCode}, "\t")); List links = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all(); for (String link : links) { page.addTargetRequest(new Request(link).setPriority(2).putExtra("province", province).putExtra("district", district)); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider spider = Spider.create(new ZipCodePageProcessor()).scheduler(new PriorityScheduler()).addUrl("http://www.ip138.com/post/"); spider.run(); } } ================================================ FILE: webmagic-samples/src/main/resources/crawl.js ================================================ var system = require('system'); var url = system.args[1]; var page = require('webpage').create(); page.settings.loadImages = false; page.settings.resourceTimeout = 5000; page.open(url, function (status) { if (status != 'success') { console.log("HTTP request failed!"); } else { console.log(page.content); } page.close(); phantom.exit(); }); ================================================ FILE: webmagic-samples/src/main/resources/log4j2.xml ================================================ ================================================ FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java ================================================ package us.codecraft.webmagic; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
* Date: 13-4-20 * Time: 下午7:46 */ public class SpiderTest { @Ignore @Test public void testSpider() throws InterruptedException { Spider me = Spider.create(new HuxiuProcessor()).addPipeline(new FilePipeline()); me.run(); } @Ignore @Test public void testGlobalSpider(){ // PageProcessor pageProcessor = new MeicanProcessor(); // Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")). // processor(pageProcessor).run(); SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "http://www.diaoyuweng.com/thread-*-1-1.html"); System.out.println(pageProcessor2.getSite().getCharset()); pageProcessor2.getSite().setSleepTime(500); Spider.create(pageProcessor2).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } @Ignore @Test public void test(){ System.out.println(System.getProperty("java.io.tmpdir")); } @Ignore @Test public void languageSchema() { /** * * _hrefs = regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") * title = r(""(.*)"") * body = x("//dd[@class='w133']") * * site.domain = "sh.58.com" * site.ua="" * site.cookie="aa:bb" * */ /** * * * if (page == r('') && refer(1) == 1) { * * type = _refer(1) * content = _text.t().c() * title = x("asd@asd").r("",1) * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) * * body=body[r(_currentUrl).g(1)] * tags[%] = (tags[%] + xpath('')) . 
r('') * * _targetUrls.add('' + x('').r('')) * _sourceUrls.add() * _header.put("",""); * _cookie.add("asdsadasdsa"); * * * } * * _cookie.add(_cookie['']) * * if (page == r('') && refer(1) == 1) * ( * _targetUrl = '' + x('') & r('') * _sourceUrl = '' * ) * */ /** * * * * * * * * * * */ /** * * if (model.url('') && model.refer(1) == 1) * ( * * model.set(type, model.refer(1)) * content = t(_html) > c() * title = x(_html, 'asd@asd') > r('',1) * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') * tags[%] = tags + xpath('') > r('') * model.setTargetUrl(); * * _targetUrl = '' + x('') & r('') * _sourceUrl = '' * ) * * _cookie.add(_cookie['']) * * if (page == r('') && refer(1) == 1) * ( * _targetUrl = '' + x('') & r('') * _sourceUrl = '' * ) * */ } } ================================================ FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java ================================================ package us.codecraft.webmagic.model; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.samples.OschinaBlog; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; /** * @author code4crafter@gmail.com */ public class ProcessorBenchmark { @Ignore @Test public void test() { ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me(), OschinaBlog.class); Page page = new Page(); page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); page.setHtml(new Html(html)); long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { modelPageProcessor.process(page); } System.out.println(System.currentTimeMillis() - time); time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { modelPageProcessor.process(page); } 
System.out.println(System.currentTimeMillis() - time); } private String html = "\n" + "\n" + "\n" + "\n" + " \n" + " \n" + " \n" + " \n" + " Jsoup代码解读之八-防御XSS攻击 - 黄亿华的个人页面 - 开源中国社区\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + "\n" + "
\n" + "\t
\n" + "\t\t
\n" + " \t开源中国社区\n" + "
\n" + "
开源项目发现、使用和交流平台
\n" + "\t\t
\n" + " \t\n" + "
\n" + "
\n" + "\t
\n" + "\t
\n" + "\t\t
\n" + "\t\t当前访客身份:\n" + "\t\t\t\t黄亿华 [ 退出 ]\n" + "\t\t\t\t\t\t\t\n" + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t你有0新留言\t\t\t\n" + "\t\t\t\t\t\t\t\t\t\t\t\n" + "\t\t
\n" + "\t\t
\n" + " \t\t
\n" + "\t\t\t\t\n" + "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\n" + " \t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t
\n" + " \t\t\t\t\n" + " \t\t\t\t\n" + " \t\t\t\t\n" + "
软件
\n" + " \n" + "
\n" + "\t\t\t\t\t\t\t\n" + " \t\t
\n" + "\t\t
\n" + "\t\t
\n" + "\t
\n" + "\t
\t\n" + "\n" + "
\n" + "
\n" + "\t\t切换风格 \"黄亿华\"\n" + " \n" + " 黄亿华\n" + "\t\t\n" + "\t\t\t\n" + " \t\t\t修改资料\n" + "\t\t\t更换头像\n" + " \t\t\n" + " \n" + "
\n" + "
\n" + " \t关注(43)\n" + " \t粉丝(98)\n" + " \t积分(173)\n" + "
\n" + "
\n" + "
\n" + "码农一枚
实用主义者
抵制重复造轮子,却造了不少轮子
http://codecraft.us
\n" + "\n" + "\n" + "\n" + "
\n" + "\t.发表博文\n" + "\t.空间管理\n" + "
\n" + " 管理» 博客分类\n" + " \n" + "
\n" + "
\n" + " 管理» 最新评论 \n" + "
    \n" + "\t\t
  • \n" + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@黄亿华:引用来自“lidongyang”的评论 引用来自“黄亿华...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@lidongyang:引用来自“黄亿华”的评论 引用来自“lidongyan...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@黄亿华:引用来自“searchjack”的评论 不是好的就会被认...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@searchjack:不是好的就会被认可, 干自己的, 到时候, 单干\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@searchjack:极好的工具,\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@黄亿华:引用来自“静风流云”的评论 貌似,OSC也是类似处...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@静风流云:貌似,OSC也是类似处理的。\n" + "\t\t查看»\n" + "\t
  • \n" + "\t\t
  • \n" + "\t\t@黄亿华:引用来自“仪山湖”的评论 最近要写个爬虫,看了...\n" + "\t\t查看»\n" + "\t
  • \n" + "\t
\n" + "
\n" + "
\n" + "访客统计\n" + "
    \n" + "\t
  • 6 (查看最新访客»)
  • \n" + "
  • 284
  • \n" + "
  • 817
  • \n" + "
  • 1888
  • \n" + "
  • 16453
  • \n" + "
\n" + "
\n" + "\n" + "
\n" + "\t
\n" + " \t\n" + "\t
\n" + "\t\n" + " \t
\t\t\n" + "
\n" + "

Jsoup代码解读之八-防御XSS攻击

\n" + "
\n" + " \t\t \t\t \t\t\n" + " \t\t\t编辑 | 删除\n" + " \t\t\n" + "\t\t\t \t\t \t\t发表于3天前(2013-08-31 08:24) , \n" + " \t\t已有1628次阅读 ,共3个评论\n" + " \t\t\t\t\t,共 79 人收藏此文 \t
\n" + "
\n" + "\t \t
\n" + "

目录:[ - ]

\n" + " \n" + " \t
\n" + " \n" + "\t \t

\n" + "\n" + "

防御XSS攻击的一般原理

\n" + "

cleaner是Jsoup的重要功能之一,我们常用它来进行富文本输入中的XSS防御。

\n" + "

我们知道,XSS攻击的一般方式是,通过在页面输入中嵌入一段恶意脚本,对输出时的DOM结构进行修改,从而达到执行这段脚本的目的。对于纯文本输入,过滤/转义HTML特殊字符<,>,",'是行之有效的办法,但是如果本身用户输入的就是一段HTML文本(例如博客文章),这种方式就不太有效了。这个时候,就是Jsoup大显身手的时候了。

\n" + "

在前面,我们已经知道了,Jsoup里怎么将HTML变成一棵DOM树,怎么对DOM树进行遍历,怎么对DOM文档进行输出,那么其实cleaner的实现方式,也能猜出大概了。使用Jsoup进行XSS防御,大致分为三个步骤:

\n" + "
    \n" + "
  1. 将HTML解析为DOM树

    这一步可以过滤掉一些企图搞破坏的非闭合标签、非正常语法等。例如一些输入,会尝试用</textarea>闭合当前Tag,然后写入攻击脚本。而根据前面对Jsoup的parser的分析,这种时候,这些非闭合标签会被当做错误并丢弃。

  2. \n" + "
  3. 过滤高风险标签/属性/属性值

    高风险标签是指<script>以及类似标签,对属性/属性值进行过滤是因为某些属性值里也可以写入javascript脚本,例如onclick='alert("xss!")'

  4. \n" + "
  5. 重新将DOM树输出为HTML文本

    DOM树的输出,在前面(Jsoup代码解读之三)已经提到过了。

  6. \n" + "
\n" + "\n" + "

Cleaner与Whitelist

\n" + "

对于上述的两个步骤,1、3都已经分别在parser和输出中完成,现在只剩下步骤 2:过滤高风险标签等。

\n" + "

Jsoup给出的答案是白名单。下面是Whitelist的部分代码。

\n" + "
public class Whitelist {\n" +
            "    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]\n" +
            "    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.\n" +
            "    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values\n" +
            "    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes\n" +
            "    private boolean preserveRelativeLinks; // option to preserve relative links\n" +
            "}
\n" + "

这里定义了标签名/属性名/属性值的白名单。

\n" + "

Cleaner是过滤的执行者。不出所料,Cleaner内部定义了CleaningVisitor来进行标签的过滤。CleaningVisitor的过滤过程并不改变原始DOM树的值,而是将符合条件的属性,加入到Element destination里去。

\n" + "
private final class CleaningVisitor implements NodeVisitor {\n" +
            "    private int numDiscarded = 0;\n" +
            "    private final Element root;\n" +
            "    private Element destination; // current element to append nodes to\n" +
            "\n" +
            "    private CleaningVisitor(Element root, Element destination) {\n" +
            "        this.root = root;\n" +
            "        this.destination = destination;\n" +
            "    }\n" +
            "\n" +
            "    public void head(Node source, int depth) {\n" +
            "        if (source instanceof Element) {\n" +
            "            Element sourceEl = (Element) source;\n" +
            "\n" +
            "            if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs\n" +
            "                ElementMeta meta = createSafeElement(sourceEl);\n" +
            "                Element destChild = meta.el;\n" +
            "                destination.appendChild(destChild);\n" +
            "\n" +
            "                numDiscarded += meta.numAttribsDiscarded;\n" +
            "                destination = destChild;\n" +
            "            } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.\n" +
            "                numDiscarded++;\n" +
            "            }\n" +
            "        } else if (source instanceof TextNode) {\n" +
            "            TextNode sourceText = (TextNode) source;\n" +
            "            TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());\n" +
            "            destination.appendChild(destText);\n" +
            "        } else { // else, we don't care about comments, xml proc instructions, etc\n" +
            "            numDiscarded++;\n" +
            "        }\n" +
            "    }\n" +
            "\n" +
            "    public void tail(Node source, int depth) {\n" +
            "        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {\n" +
            "            destination = destination.parent(); // would have descended, so pop destination stack\n" +
            "        }\n" +
            "    }\n" +
            "}
\n" + "\n" + "

结束语

\n" + "

至此,Jsoup的全部模块都已经写完了。Jsoup源码并不多,只有14000多行,但是实现非常精巧,在读代码的过程中,除了相关知识,还验证几个很重要的思想:

\n" + "
    \n" + "
  • 最好的代码抽象,是对现实概念的映射。

    这句话在看《代码大全》的时候印象很深刻。在Jsoup里,只要有相关知识,每个类的作用都能第一时间明白其作用。

  • \n" + "
  • 不要过度抽象

    在Jsoup里,只用到了两个接口,一个是NodeVisitor,一个是Connection,其他都是用抽象类或者直接用实现类代替。记得有次面试的时候被问到我们开发中每逢一个功能,都要先定义一个接口的做法是否必要?现在的答案是没有必要,过度的抽象反而会降低代码质量。

    另外,Jsoup的代码内聚性都很高,每个类的功能基本都定义在类的内部,这是一个典型的充血模型。同时有大量的facade使用,而避免了Factory、Configure等类的出现,个人感觉这点是非常好的。

  • \n" + "
\n" + "

最后继续贴上Jsoup解读系列的github地址:https://github.com/code4craft/jsoup-learning/

\n" + " \t \t \n" + " \t\n" + "\t
\n" + " \t关键字:\n" + " \t \tJsoup\n" + " \t \tXSS\n" + " \t \tOO\n" + " \t \t
\n" + "\t \t \n" + "
\t\t\n" + "\t \t\t声明:OSCHINA 博客文章版权属于作者,受法律保护。未经作者同意不得转载。\n" + "\t \t
\n" + "\n" + " \n" + "\t
\n" + "\n" + "\t\n" + "\t
\n" + "\t\n" + "\t\n" + "\t\t分享到: \n" + "\t\t\n" + "\t\t\n" + "\t\n" + " 已有 0人顶\n" + "\t\n" + "\t
\n" + "\t\t\n" + "
\n" + "
\n" + "
\n" + "

共有 3 条网友评论

\n" + "\t\t\t
    \n" + "\t\t\t\t\t\t
  • \n" + "\t\n" + "\t\n" + "\t\n" + "\t
    \n" + "\t\t\"静风流云\"\t\t\t\n" + "\t\n" + "\t\t
    \n" + "\t\t\t1楼:静风流云 发表于 2013-09-01 08:34 \t\t\t\n" + " \t \t 删除\n" + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + "\t\t\t\t\t
    \n" + "\t\t
    貌似,OSC也是类似处理的。
    \n" + "\t\t
    \n" + "
    \n" + "
  • \t\t\t\t\t
  • \n" + "\t\n" + "\t\n" + "\t\n" + "\t
    \n" + "\t\t\"黄亿华\"\t\t\t\n" + "\t\n" + "\t\t
    \n" + "\t\t\t2楼:黄亿华 发表于 2013-09-01 08:37 \t\t\t\n" + " \t \t 删除\n" + "\t\t\t\t\t\t\t\t
    \n" + "\t\t

    引用来自“静风流云”的评论

    貌似,OSC也是类似处理的。

    OSC就是使用Jsoup做解析的,见这里:http://www.oschina.net/p/jsoup
    \n" + "\t\t
    \n" + "
    \n" + "
  • \t\t\t\t\t
  • \n" + "\t\n" + "\t\n" + "\t\n" + "\t
    \n" + "\t\t\"searchjack\"\t\t\t\n" + "\t\n" + "\t\t
    \n" + "\t\t\t3楼:searchjack 发表于 2013-09-02 09:20 \t\t\t\n" + " \t \t 删除\n" + "\t\t\t\t\t\t\t\t\t 回复此评论\n" + "\t\t\t\t\t
    \n" + "\t\t
    极好的工具,
    \n" + "\t\t
    \n" + "
    \n" + "
  • \t\t\t\t
\n" + "
\n" + "\t
\n" + "\n" + "\n" + "
\n" + " \n" + "
\n" + "
\n" + "
\n" + "\t \n" + "\t \n" + "\t 文明上网,理性发言\n" + "
\n" + "\t回到页首 | 回到评论列表\n" + "
\n" + "
\n" + "\t\n" + "
\n" + "\t关闭相关文章阅读\n" + "\t\n" + "
\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
\n" + "\t
\n" + "\t
\n" + "\n" + "\n" + "\n" + "\n" + "
© 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + "\t开源中国手机客户端:\n" + "\tAndroid\n" + "\tiPhone\n" + "\tWP7\n" + "
\n" + "
\n" + "
\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + ""; } ================================================ FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java ================================================ package us.codecraft.webmagic.processor; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline; import us.codecraft.webmagic.samples.SinaBlogProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; /** * @author code4crafter@gmail.com
* Date: 13-6-9 * Time: 上午8:02 */ public class SinablogProcessorTest { @Ignore @Test public void test() throws IOException { SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor(); //pipeline是抓取结束后的处理 //默认放到/data/webmagic/ftl/[domain]目录下 JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); //Spider.me()是简化写法,其实就是new一个啦 //Spider.pipeline()设定一个pipeline,支持链式调用 //ConsolePipeline输出结果到控制台 //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 //Spider.run()执行 Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). run(); } } ================================================ FILE: webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java ================================================ package us.codecraft.webmagic.samples.scheduler; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Request; import java.util.concurrent.TimeUnit; /** * @author code4crafter@gmail.com */ public class DelayQueueSchedulerTest { @Ignore("infinite") @Test public void test() { DelayQueueScheduler delayQueueScheduler = new DelayQueueScheduler(1, TimeUnit.SECONDS); delayQueueScheduler.push(new Request("1"), null); while (true){ Request poll = delayQueueScheduler.poll(null); System.out.println(System.currentTimeMillis()+"\t"+poll); } } } ================================================ FILE: webmagic-saxon/README.md ================================================ webmagic-extension ------- webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 ================================================ FILE: webmagic-saxon/pom.xml ================================================ us.codecraft webmagic 1.0.4-SNAPSHOT 4.0.0 webmagic-saxon true ${project.groupId} webmagic-core ${project.version} net.sourceforge.htmlcleaner htmlcleaner net.sf.saxon Saxon-HE ================================================ FILE: 
webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java
================================================
package us.codecraft.webmagic.selector;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Static helpers for converting JAXP DOM nodes and node lists into Java collections and strings.
 *
 * @author hooy
 */
public final class JaxpSelectorUtils {

    private JaxpSelectorUtils() {
        throw new RuntimeException("The util class cannot be instanced");
    }

    /**
     * Copies the items of a DOM {@link NodeList} into a new list, preserving their order.
     *
     * @param nodes node list to copy
     * @return a new mutable list holding the same nodes
     */
    public static List<Node> NodeListToArrayList(NodeList nodes) {
        int length = nodes.getLength();
        List<Node> list = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            list.add(nodes.item(i));
        }
        return list;
    }

    /**
     * Converts a single node to its string form.
     *
     * @param node node to convert
     * @return the string form as produced by {@link #nodesToStrings(List)}, or {@code null}
     *         if that call yields no result
     * @throws TransformerException if the node cannot be serialized
     */
    public static String nodeToString(Node node) throws TransformerException {
        List<String> converted = nodesToStrings(Collections.singletonList(node));
        return converted.isEmpty() ? null : converted.get(0);
    }

    /**
     * Converts every node to a string: attribute and text nodes yield their text content,
     * any other node is serialized as markup without an XML declaration.
     *
     * @param nodes nodes to convert
     * @return one string per input node, in input order
     * @throws TransformerException if the transformer cannot be created or a node cannot be serialized
     */
    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
        List<String> results = new ArrayList<>(nodes.size());
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        for (Node node : nodes) {
            short type = node.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                results.add(node.getTextContent());
            } else {
                // A fresh writer per node keeps each serialization independent of the others.
                StringWriter writer = new StringWriter();
                transformer.transform(new DOMSource(node), new StreamResult(writer));
                results.add(writer.toString());
            }
        }
        return results;
    }
}

================================================
FILE: webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java
================================================ package us.codecraft.webmagic.selector; import org.w3c.dom.Node; import java.util.List; /** * Selector(extractor) for html node.
* * @author hooy
* @since 0.8.0 */ public interface NodeSelector { /** * Extract single result in text.
* If there are more than one result, only the first will be chosen. * * @param node node * @return result */ String select(Node node); /** * Extract all results in text.
* * @param node node * @return results */ List selectList(Node node); } ================================================ FILE: webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java ================================================ package us.codecraft.webmagic.selector; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; import us.codecraft.webmagic.utils.BaseSelectorUtils; import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* * @author code4crafter@gmail.com, hooy
* Date: 13-4-21 * Time: 上午9:39 */ public class Xpath2Selector implements Selector, NodeSelector { private final String xpathStr; private XPathExpression xPathExpression; private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; try { init(); } catch (XPathExpressionException e) { throw new IllegalArgumentException("XPath error!", e); } } public static Xpath2Selector newInstance(String xpathStr) { return new Xpath2Selector(xpathStr); } enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); put("xhtml", NamespaceConstant.XHTML); } @Override public String getNamespaceURI(String prefix) { return prefix2NamespaceMap.get(prefix); } @Override public String getPrefix(String namespaceURI) { List prefixes = namespace2PrefixMap.get(namespaceURI); if (prefixes == null || prefixes.size() < 1) { return null; } return prefixes.get(0); } @Override public Iterator getPrefixes(String namespaceURI) { List prefixes = namespace2PrefixMap.get(namespaceURI); if (prefixes == null || prefixes.size() < 1) { return null; } return prefixes.iterator(); } } private void init() throws XPathExpressionException { XPathEvaluator xPathEvaluator = new XPathEvaluator(); xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE); xPathExpression = xPathEvaluator.compile(xpathStr); } @Override public String select(String text) { try { Document doc = parse(text); return select(doc); } catch (Exception e) { logger.error("select text error! 
" + xpathStr, e); } return null; } @Override public String select(Node node) { try { return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } @Override public List selectList(String text) { try { Document doc = parse(text); return selectList(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } @Override public List selectList(Node node) { try { NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); List nodes = NodeListToArrayList(result); return nodesToStrings(nodes); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public Node selectNode(String text) { try { Document doc = parse(text); return selectNode(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public Node selectNode(Node node) { try { return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public List selectNodes(String text) { try { Document doc = parse(text); return selectNodes(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } return null; } public List selectNodes(Node node) { try { NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); return NodeListToArrayList(result); } catch (Exception e) { logger.error("select text error! 
" + xpathStr, e); } return null; } protected static Document parse(String text) throws ParserConfigurationException { // HtmlCleaner could not parse or tag directly text = BaseSelectorUtils.preParse(text); HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); return new DomSerializer(new CleanerProperties()).createDOM(tagNode); } } ================================================ FILE: webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java ================================================ package us.codecraft.webmagic.selector; import java.util.List; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; import org.w3c.dom.Node; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; import javax.xml.transform.TransformerException; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 */ public class XpathSelectorTest { private String html = "\n" + "\n" + "\n" + "\n" + " \n" + " \n" + " 再次吐槽easyui - 开源中国 OSChina.NET\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + "
\n" + "\t
\n" + "
\n" + " \t\n" + "
\n" + "\t\t
\n" + " \t\t \t\t黄亿华,您好 \n" + "\t\t\t\n" + "\t\t\t\t我的空间\n" + "\t\t\t\t\n" + "\t\t\t | \n" + "\t\t\t添加软件 | 投递新闻 | 退出\n" + " \t\t\t\t
\n" + "\t\t
\n" + "\t
\n" + "
\n" + "
\n" + "

讨论区

\n" + "
\n" + "\t
当前位置:
\n" + "\t
\n" + "\t\t\t\t\t \t\t讨论区 »\n" + " \t\t技术问答\t\t\t\t\t\t\t\t» EasyUI\n" + "\t\t\t\t\t\t\t\t\t\t
\n" + "
\n" + "\n" + "
\n" + "
\n" + "\t
\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
\n" + "\t\n" + "\t
\n" + "\t
\n" + "\t\t
\"午后冬日\"
\n" + "\t\t
\n" + "\t\t\t

再次吐槽easyui

\n" + "\t\t\t
\n" + "\t\t\t\t午后冬日\n" + "\t\t\t\t发表于 2013-4-21 02:28 13小时前,\n" + "\t\t\t\t3回/289阅,\n" + "\t\t\t\t最后回答: 4小时前\t\t\t\t\t\t\t\t\t\t\t
\n" + "\t\t
\n" + "\t\t\n" + "\t\t
\n" + "\t
\n" + "\t\t \t \t\t\t\t\t\n" + "\t\t

Java、PHP、Ruby、iOS、Python 等 JetBrains 开发工具低至 99 元(3折),详情»

\n" + "\t\t
\n" + "\t\t\t\t\t\t
刚用到easyui treegrid组件,发现这货第一次加载时候并没有传默认参数,展开某一列时候才传递id:xx的参数。这样和后台总是疙里疙瘩,像没事就拌嘴的两口子,查网上都遇到相同问题,最好解决方案就是通过 \n" + "onBeforeExpand事件来扩展,自行解决。看到官方例子中简洁的代码,感觉easyui耍流氓了,真搞不懂为何要这样实现
\n" + "\t\t\t\t\t\t
\n" + "\t\t\t\t标签:\t\t\t\t\n" + "\t\t\t\t\t\t\t\t\t\t\t\tEasyUI \t\t\t\t\t\t\t\t\t\t\t
\n" + "\t\t\t\t\t\t
\n" + "\t\t\t\n" + "\t\t\t\t\t\t\t\t\t我想问同样的问题\n" + "\t\t\t\t\t\t\n" + "\t\t\t共0个人想要问同样的问题\n" + "\t\t\t\t\t\t补充话题说明»\n" + "\t\t\t
\n" + "\t\t\t\t\t\t
\n" + "\t
    \n" + "
    \t\t
    \n" + "\t\t\n" + "\t\t
    \n" + " \n" + "\t\t\t\t
    \n" + "\t\t\t
    分享到
    \n" + "\t\t\t\n" + "\t\t\t
    1
    \n" + "\t\t\t\n" + "\t\t\t
    \n" + "\t\t\t\t\t\t\t\t \t\t\t\n" + "\t\t\t\t\t\t\t\t0\n" + "\t\t\t\t|\n" + "\t\t\t\t\t\t\t\t \t\t\t\n" + "\t\t\t\t\t\t\t\t0\n" + "\t\t\t
    \n" + "\t\t\t\n" + "\t\t
    \n" + "\t\t
    \n" + "\t\t\t\t\t\t
    \n" + "\t\t\t\n" + " \t

    \t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\t按评价排序 |\n" + "\t\t\t\t\t显示最新答案 | 回页面顶部\n" + "\t\t\t\t\n" + "\t\t\t\t共有3个答案 我要回答»\n" + "\t\t\t

    \n" + "\t\t\t \t
    • \n" + "\t
      \"布谷鸟\"
      \n" + "\t
      \n" + "\t\t
      布谷鸟 回答于 2013-04-21 09:28
      \t\t\n" + " \t
      \n" + "\t\t\t \t\t \t\t举报\n" + " \t
      \n" + "\t\t
      \n" + "\t\t
      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      \n" + "\t
      \n" + "\t
      \n" + "\t
      --- 共有 1 条评论 --- \n" + "
        \n" + "\t\t
      • \n" + "\t\t\"午后冬日\"\n" + "\t\t\n" + "\t\t前端水平实在有限,自己搞的总是感觉不伦不类,只能用这些框架,再集成其它插件,切换主题时风格又不一致。\n" + "\t\t(4小时前 by 午后冬日)\n" + "\t\t回复\n" + "\t\t\n" + "\t\t
        \n" + "\t
      • \n" + "\t
      \n" + "\n" + "
      \n" + "\t
      \t\t\t\t\t\t有帮助(1) |\n" + "\t\t没帮助(0) |\n" + "\t\t评论(1) |\n" + " \t引用此答案\t
      \n" + "
    • \n" + "\t
      \"静风流云\"
      \n" + "\t
      \n" + "\t\t
      静风流云 回答于 2013-04-21 11:08
      \t\t\n" + " \t
      \n" + "\t\t\t \t\t \t\t举报\n" + " \t
      \n" + "\t\t
      \n" + "\t\t

      没办法,原来项目也是因为客户特殊的需求,对layout选型的时候,犹豫了好久,最终放弃了。
      幸亏来了一个厉害的前端,解决问题,够用就好。

      \n" + "\t
      \n" + "\t
      \n" + "\t
      --- 共有 1 条评论 --- \n" + "
        \n" + "\t\t
      • \n" + "\t\t\"午后冬日\"\n" + "\t\t\n" + "\t\t我也是犹豫了好久,看过很多前端框架,总是不太满意。个人开发前台后台数据库全部要自己搞定,郁闷ing\n" + "\t\t(4小时前 by 午后冬日)\n" + "\t\t回复\n" + "\t\t\n" + "\t\t
        \n" + "\t
      • \n" + "\t
      \n" + "\n" + "
      \n" + "\t
      \t\t\t\t\t\t有帮助(0) |\n" + "\t\t没帮助(0) |\n" + "\t\t评论(1) |\n" + " \t引用此答案\t
      \n" + "
    • \n" + "\t
      \"布谷鸟\"
      \n" + "\t
      \n" + "\t\t
      布谷鸟 回答于 2013-04-21 11:29
      \t\t\n" + " \t
      \n" + "\t\t\t \t\t \t\t举报\n" + " \t
      \n" + "\t\t
      \n" + "\t\t

      引用来自“布谷鸟”的答案

      对话框、日期控件和选项卡效果还不错,树状菜单没得zTree好用,建议楼主不要全部效果都依赖于此框架,有些easyui实现不好的地方可以换其它的插件实现嘛,反正我现在再也不用诸如ext和easyui之类的东西了,感觉好肥
      前后端你一个人搞啊?那确实很麻烦。面面俱到的话,工作量很大。但是如果需要实现的功能不是很多,而时间也不紧迫的话,事情干起来也还不错。如非必须,建议逐步弃用这些前端框架,在一些比较能够提升体验的地方选用一些适当的插件即可,如此也不再需要担心风格的问题,你看osc后台截图,界面那叫一个丑,用得方便顺手就够了
      \n" + "\t
      \n" + "\t
      \n" + "\t
      \n" + "\t
      \t\t\t\t\t\t有帮助(0) |\n" + "\t\t没帮助(0) |\n" + "\t\t评论(0) |\n" + " \t引用此答案\t
      \n" + "
    \n" + "\t\t\t\t
    \n" + "\t\t
    \n" + "\t\t\t
    \"黄亿华\"
    \n" + "\t\t\t
    \n" + "\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t

    \n" + "\t\t\t\t回答案顶部 | 回页面顶部\n" + "\t\t\t
    \n" + "\t\t\t
    \n" + "\t\t\t\n" + "\t\t
    \n" + "\t
    \t\n" + "\t\n" + "\n" + "\n" + "\n" + "\t
    \n" + "\t
    \n" + " \t\n" + "\t
    \n" + "\t\t
    \n" + "\t\t\t有什么技术问题吗?\n" + "\t\t\t我要提问\n" + "\t\t\t
    \n" + "\t\t
    \n" + "\t\t\n" + "\t\t\t\t\t\t
    \n" + "\t\t\t全部(29)...午后冬日的其他问题\n" + "\t\t\t\n" + "\t\t
    \n" + "\t\t\t\t
    \n" + "\t\t\n" + "\t\t
    \n" + "\t\t\n" + "\t\t
    \n" + "\t\t\t类似的话题\n" + "\t\t\t\n" + "\t\t
    \n" + "\t
    \n" + "\t
    \n" + "
    \n" + "\n" + "\n" + "\n" + "\n" + "
    \n" + "\t
    \n" + "\n" + "\n" + "\n" + "\n" + "
    © 开源中国(OsChina.NET) | 关于我们 | 广告联系 | @新浪微博 | 开源中国手机版 | 粤ICP备12009483号-3\n" + "\t开源中国手机客户端:\n" + "\tAndroid\n" + "\tiPhone\n" + "\tWP7\n" + "
    \n" + "
    \n" + "
    \n" + "\n" + "\n" + ""; @Test public void test() { String text = "\n" + "\n" + "\n" + " \n" + " \n" + " jsoup 解析页面商品信息 - - ITeye技术网站\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n" + " \n" + " \n" + "\n" + "\n" + " \n" + " \n" + "\n" + " \n" + "\n" + " \n" + " \n" + " \n" + " \n" + "
    \n" + "
    \n" + " 首页\n" + " 资讯\n" + " 精华\n" + " 论坛\n" + " 问答\n" + " 博客\n" + " 专栏\n" + " 群组\n" + " 更多 \n" + "
    \n" + " 招聘\n" + " 搜索\n" + "
    \n" + "
    \n" + "\n" + "
    \n" + " \n" + " 欢迎flashsword20\n" + " 0\n" + " \n" + " \"Newpm\"收件箱(3)\n" + " \n" + " 我的应用\n" + "
    \n" + " 我的关注\n" + " 我的群组\n" + " 我的简历\n" + " 我的相册\n" + " 我的收藏\n" + " 我的代码\n" + " 我的微博\n" + "
    \n" + " 我的博客\n" + " 设置\n" + "
    \n" + "
    \n" + " \n" + " \n" + "
    \n" + "
    \n" + " \n" + " \n" + "
    \n" + "
    \n" + "
    \n" + "
    \n" + "
    \n" + " \n" + "
    \n" + "
    \n" + " \n" + "\n" + "\n" + "\n" + " \n" + "\n" + "\n" + "
    \n" + "
    \n" + "

    \n" + " jsoup 解析页面商品信息\n" + " \n" + "

    \n" + " \n" + "
     
    \n" + "
    \n" + "\n" + "
    \n" + "

    今天用了jsoup 解析页面商品信息,感觉比用xpath获取信息准确多了

    \n" + "

    \n" + "

    下面就记录一下:

    \n" + "

    一、首先去 http://jsoup.org/download 下载jsoup的jar包。

    \n" + "

    \n" + "

    二、下面记录下相关代码:

    \n" + "

    \n" + "

    \n" + "

    Document doc = Jsoup.connect(url).get(); //将htm转换成Document类型数据结构

    \n" + "


    doc.select(\"div:has(div) div#spec-n1:has(img) img\").first().attr(\"src\")); //查找div下含有div的标签

    \n" + "

    \n" + "

    并且 div的id='spec-n1',此div第一个img标签,img里属性是src的值。

    \n" + "

    \n" + "

    doc.select(\"div:has(div) div.crumb:has(a) a:eq(4)\").text(); //查找class='crumb'的div下第4个a标签

    \n" + "

    下的值。

    \n" + "

    \n" + "

    doc.select(\"div:has(div) div#name:has(h1)\").text(); //查找id='name'的div下的h1标签的值。

    \n" + "

    \n" + "

    doc.select(\"tbody:has(tr) td.tdTitle:contains(品牌) + td\").text(); //查找class='tdTitle'的td标签里

    \n" + "

    \n" + "

    含有‘品牌’td的下一个td标签中内容。

    \n" + "

    \n" + "

    doc.select(\"script[type=text/javascript]:not([src~=[a-zA-Z0-9./\\\\s]+)\"); //查找含有此<script

    \n" + "

    \n" + "

    type=\"text/javascript\">……</script>内容,不含有script标签中有src属性的script,如:

    \n" + "

    \n" + "

    <script src=\"url\" type=\"text/javascript\"></script>。

    \n" + "
    \n" + "\n" + " \n" + "\n" + "\n" + " \n" + " \n" + "
    \n" + " \n" + "
    分享到:\n" + " \n" + " \n" + "
    \n" + "
    \n" + "\n" + " \n" + "
    \n" + " \n" + "
    \n" + "\n" + "
    \n" + "
    评论
    \n" + " \n" + " \n" + " \n" + " \n" + "
    \n" + "\n" + "
    \n" + "
    发表评论
    \n" + "
    \n" + "\n" + "\n" + " \n" + "\n" + "\n" + "\n" + "
    \n" + "\n" + "\n" + "\n" + "\n" + "

    (快捷键 Alt+S / Ctrl+Enter)

    \n" + "
    \n" + " \n" + "
    \n" + "
    \n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
    \n" + "\n" + "
    \n" + "
    \n" + "
    \n" + "
    \"masong1987的博客\"
    \n" + "
    masong1987
    \n" + "
    \n" + "\n" + "
    \n" + "
      \n" + "
    • 浏览: 5401 次
    • \n" + "
    • 性别: \"Icon_minigender_1\"
    • \n" + "
    • 来自: 北京
    • \n" + "
    • \n" + " \n" + "
    • \n" + " 发短消息\n" + " \n" + " 更多访客>>\n" + " \n" + "
      \n" + "
      \"flashsword20的博客\"
      \n" + " \n" + "
      \n" + " \n" + "
      \n" + "
      \"dylinshi126的博客\"
      \n" + " \n" + "
      \n" + " \n" + "
      \n" + "
      \"machoo的博客\"
      \n" + " \n" + "
      \n" + " \n" + "
      \n" + "
      \"arson的博客\"
      \n" + " \n" + "
      \n" + " \n" + "
    \n" + "\n" + " \n" + "\n" + "
    \n" + "
    文章分类
    \n" + " \n" + "
    \n" + "
    \n" + "
    社区版块
    \n" + " \n" + "
    \n" + "
    \n" + "
    存档分类
    \n" + " \n" + "
    \n" + " \n" + " \n" + "\n" + "
    \n" + "
    最新评论
    \n" + " \n" + "
    \n" + "\n" + "
    \n" + " \n" + "
    \n" + "
    \n" + "\n" + "
    \n" + "
    \n" + "
    \n" + " 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。若作者同意转载,必须以超链接形式标明文章原始出处和作者。
    \n" + " © 2003-2012 ITeye.com. All rights reserved. [ 京ICP证110151号 京公网安备110105010620 ]\n" + "
    \n" + "
    \n" + " \n" + " \n" + "\n" + " \n" + " \n" + " \n" + "\n"; String text2 = "
    aaa
    "; XpathSelector xpathSelector = new XpathSelector( "//div[@id='main']/div[@class='blog_main']/div[@class='blog_title']/h3/a/text()"); String select = xpathSelector.select(text); Assert.assertEquals("jsoup 解析页面商品信息", select); } @Test public void testOschina() { Html html1 = new Html(html); Assert.assertEquals("再次吐槽easyui", html1.xpath("//*[@class='QTitle']/h1/a/text()").toString()); Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); Selectors.xpath("/abc/").select(""); } @Test public void testXPath2() { String text = "

    眉山:扎实推进农业农村工作 促农持续增收
    \n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

    "; Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()"); Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text)); } @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href"); String select = xpath2Selector.select(html); Assert.assertEquals("http://www.oschina.net/", select); List selectList = xpath2Selector.selectList(html); Assert.assertEquals(113, selectList.size()); Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } @Ignore("take long time") @Test public void performanceTest() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); XpathSelector xpathSelector = new XpathSelector("//a"); time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } System.out.println(System.currentTimeMillis() - time); time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); CssSelector cssSelector = new CssSelector("a"); time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { cssSelector.selectList(html); } System.out.println("css " + (System.currentTimeMillis() - time)); } @Ignore("take long time") @Test public void parserPerformanceTest() throws XPatherException { System.out.println(html.length()); HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(html); Document document = Jsoup.parse(html); long time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } System.out.println(System.currentTimeMillis() - time); time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } System.out.println(System.currentTimeMillis() - time); 
System.out.println("============="); time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { Jsoup.parse(html); } System.out.println(System.currentTimeMillis() - time); time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { document.select("a"); } System.out.println(System.currentTimeMillis() - time); System.out.println("============="); time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } System.out.println(System.currentTimeMillis() - time); time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } System.out.println(System.currentTimeMillis() - time); System.out.println("============="); XPathEvaluator compile = Xsoup.compile("//a"); time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { compile.evaluate(document); } System.out.println(System.currentTimeMillis() - time); } /** * New api test * * @author hooy * @since 8.0 */ private String rank = "

    点击榜

    排名分类书名/最新章节作者推荐更新时间
    1.现实
    0
    11-24 22:32
    2.架空
    1047
    03-04 14:44
    3.现实
    0
    07-20 09:06
    4.豪门
    0
    12-03 09:12
    5.现实
    0
    02-01 21:12
    6.玄奇
    3455
    02-28 12:31
    7.玄奇
    20614
    03-31 12:37
    8.复仇
    55
    06-03 11:43
    9.穿越
    0
    10-27 18:50
    10.宫斗
    320
    10-31 13:58
    11.宫斗
    6268
    07-12 20:23
    12.现实
    0
    01-18 23:00
    13.婚恋
    0
    12-14 20:50
    14.修真
    0
    02-03 23:40
    15.豪门
    0
    11-06 23:38
    16.穿越
    191
    12-02 23:37
    17.穿越
    412
    10-13 22:39
    18.豪门
    635
    07-01 13:15
    19.架空
    144
    06-18 09:35
    20.宅斗
    1032
    08-15 19:03
    21.宫斗
    0
    09-30 20:32
    22.豪门
    0
    06-05 11:31
    23.重生
    80
    11-25 19:56
    24.异世
    68
    01-12 10:06
    25.豪门
    0
    05-29 18:46
    26.婚恋
    2778
    11-04 17:48
    27.玄奇
    207
    12-06 16:57
    28.穿越
    260
    01-04 23:26
    29.豪门
    0
    12-07 21:39
    30.架空
    1127
    06-06 17:28
    31.穿越
    113
    09-13 09:06
    32.架空
    597
    02-14 18:47
    33.玄奇
    528
    06-04 22:04
    34.穿越
    328
    06-06 22:09
    35.架空
    539
    05-24 14:42
    36.架空
    0
    03-05 23:27
    37.穿越
    3215
    08-21 16:38
    38.宫斗
    905
    08-04 20:24
    39.玄奇
    1328
    07-25 10:58
    40.穿越
    203
    01-27 20:53
    41.宫斗
    407
    08-31 09:03
    42.宅斗
    16
    05-03 17:38
    43.豪门
    0
    11-10 08:00
    44.婚恋
    0
    07-12 21:37
    45.架空
    0
    06-23 21:02
    46.玄奇
    1382
    05-31 20:36
    47.重生
    334
    07-16 19:19
    48.婚恋
    505
    11-01 16:42
    49.婚恋
    0
    10-19 18:32
    50.豪门
    540
    09-19 19:18
    51.婚恋
    226
    03-18 13:09
    52.穿越
    1026
    03-08 16:28
    53.重生
    304
    02-19 10:25
    54.玄奇
    2617
    02-15 20:57
    55.穿越
    199
    09-04 19:43
    56.同人
    768
    07-19 20:00
    57.宅斗
    0
    02-13 18:13
    58.豪门
    0
    11-12 22:23
    59.架空
    0
    07-28 23:42
    60.婚恋
    0
    02-03 23:09
    61.豪门
    285
    01-07 19:21
    62.重生
    654
    10-12 18:16
    63.异能
    617
    06-18 20:23
    64.宫斗
    27
    06-02 21:05
    65.种田
    206
    08-31 19:23
    66.宅斗
    2444
    08-19 15:51
    67.宅斗
    818
    08-07 23:38
    68.现代
    0
    12-23 17:02
    69.玄奇
    0
    07-23 12:00
    70.婚恋
    0
    11-01 16:43
    71.豪门
    0
    09-12 00:01
    72.架空
    0
    04-27 22:42
    73.豪门
    0
    04-19 13:55
    74.异能
    62
    07-30 00:00
    75.穿越
    1307
    07-20 16:41
    76.玄奇
    12820
    07-15 23:46
    77.架空
    828
    06-06 17:54
    78.宅斗
    985
    05-20 23:53
    79.玄奇
    4960
    04-12 15:58
    80.玄奇
    245
    03-02 23:11
    81.宅斗
    34
    12-21 10:11
    82.宅斗
    1411
    07-21 00:00
    83.现代
    0
    07-31 10:10
    84.玄奇
    0
    06-18 13:53
    85.架空
    0
    12-03 23:41
    86.玄奇
    0
    11-28 22:13
    87.豪门
    0
    11-07 22:48
    88.婚恋
    0
    08-29 23:15
    89.种田
    1831
    08-21 16:38
    90.豪门
    0
    07-11 21:25
    91.豪门
    0
    06-13 15:37
    92.豪门
    0
    05-07 22:10
    93.豪门
    0
    02-28 00:01
    94.豪门
    304
    12-16 07:30
    95.婚恋
    669
    11-07 18:16
    96.仙侠
    54
    09-25 19:51
    97.豪门
    655
    08-31 13:02
    98.现实
    374
    06-29 09:55
    99.穿越
    373
    06-19 18:07
    100.婚恋
    159
    06-04 21:05
    "; @Test public void testStringAPI() { // testAPI: selectList(String) -> selectList(Node) List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank); Assert.assertSame(100, items.size()); // testAPI: select(String) -> select(Node) String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10)); Assert.assertEquals("深宫安容传", name); } @Test public void testNodeAPI() { // testAPI: selectNodes(String) -> selectNodes(Node) List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank); Assert.assertSame(100, items.size()); // testAPI: selectNode(Node) Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10)); String name = new Xpath2Selector("./text()").select(item); Assert.assertEquals("深宫安容传", name); } @Test public void testUtilAPI() throws TransformerException { Node item = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()").selectNode(rank); // testAPI: nodeToString(Node) -> nodesToStrings(List) String name = JaxpSelectorUtils.nodeToString(item); Assert.assertEquals("深宫安容传", name); } } ================================================ FILE: webmagic-scripts/README.md ================================================ webmagic-scripts ====== ## 目标: 使得可以用简单脚本的方式编写爬虫,从而为一些常用场景提供可流通的脚本。如果已经有人写好了脚本,那么你直接使用就可以了! 
## 实例: 例如:我需要抓github的仓库数据,可以这样写一个脚本(javascript): ```javascript var name=xpath("//h1[@class='entry-title public']/strong/a/text()") var readme=xpath("//div[@id='readme']/tidyText()") var star=xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") var fork=xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") var url=page.getUrl().toString() if (name!=null){ println(name) println(readme) println(star) println(url) } urls("(https://github\\.com/\\w+/\\w+)") urls("(https://github\\.com/\\w+)") ``` 然后使用webmagic加载并启动它,无需下载依赖、编写代码、执行的过程。目前已经有控制台版本,请下载[http://code4craft.qiniudn.com/webmagic-console.tar.gz](http://code4craft.qiniudn.com/webmagic-console.tar.gz)。 解压后,使用以下命令执行: java -jar -Dfile.encoding='utf-8' webmagic-console.jar -f 脚本文件名 [-l 语言,默认是javascript] [-t 线程数] [-s 抓取间隔,毫秒] url1 url2 … 例如,对于github这个脚本,我可以这样执行: java -jar -Dfile.encoding='utf-8' webmagic-console.jar -f github.js -t 2 -s 0 https://github.com/code4craft 目前这部分使用Java的ScriptEngine机制完成。 ## 语言: 选用javascript是因为用户面比较广。目前还支持ruby语言,选用ruby是因为ruby的语法编写DSL更简洁: ```ruby name= xpath "//h1[@class='entry-title public']/strong/a/text()" readme = xpath "//div[@id='readme']/tidyText()" star = xpath "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()" fork = xpath "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()" url=$page.getUrl().toString() puts name,readme,star,fork,url unless name==nil urls "(https://github\\.com/\\w+/\\w+)" urls "(https://github\\.com/\\w+)" ``` 多语言通过参数-l区分,例如执行这个ruby脚本需要: java -jar -Dfile.encoding='utf-8' webmagic-console.jar -f github.rb -t2 -s0 -l ruby https://github.com/code4craft 这个功能目前仍在实验阶段。欢迎大家积极参与并提出意见。 ================================================ FILE: webmagic-scripts/deploy.sh ================================================ #!/bin/sh VERSION="0.4.1-SNAPSHOT" mvn clean package cp target/webmagic-scripts-${VERSION}.jar /usr/local/webmagic/webmagic-console.jar 
rsync -avz --delete target/lib/ /usr/local/webmagic/lib/ ================================================ FILE: webmagic-scripts/pom.xml ================================================ us.codecraft webmagic 1.0.4-SNAPSHOT 4.0.0 webmagic-scripts 2.1.0 org.apache.logging.log4j log4j-core org.apache.logging.log4j log4j-slf4j2-impl org.jruby jruby org.jetbrains.kotlin kotlin-stdlib ${kotlin.version} org.python jython commons-cli commons-cli ${project.groupId} webmagic-core ${project.version} ${project.groupId} webmagic-extension ${project.version} org.projectlombok lombok 1.18.32 provided ${project.basedir}/src/main/java org.apache.maven.plugins maven-jar-plugin true ./lib/ us.codecraft.webmagic.scripts.ScriptConsole org.codehaus.mojo build-helper-maven-plugin 3.0.0 add-source generate-sources add-source ${project.basedir}/src/main/kotlin ================================================ FILE: webmagic-scripts/src/main/groovy/Github.groovy ================================================ Github { Site { sleepTime 0 timeOut 100 retryTimes 3 userAgent ['a','b','c'].random } match "https://github.com/\\w+/\\w+" { addUrl(url.regex("https://github.com/\\w+/\\w+")) return { name: html.xpath("//h1[@class='entry-title public']/strong/a/text()") author: html.xpath "https://github\\.com/(\\w+)/.*" readme: html.xpath "//div[@id='readme']/tidyText()" star : toInt(html.xpath("//div[@id='readme']/tidyText()")) } } } ================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java ================================================ package us.codecraft.webmagic.scripts; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import lombok.Getter; import lombok.Setter; import us.codecraft.webmagic.scripts.languages.JRuby; import us.codecraft.webmagic.scripts.languages.Javascript; import us.codecraft.webmagic.scripts.languages.Language; import us.codecraft.webmagic.utils.WMCollections; 
public class Params { @Getter Language language = new Javascript(); @Getter @Setter String scriptFileName; @Getter @Setter List urls; @Getter @Setter int thread = 1; @Getter @Setter int sleepTime = 1000; private static Map> alias; public Params() { alias = new HashMap>(); alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); } public void setLanguagefromArg(String arg) { for (Map.Entry> languageSetEntry : alias.entrySet()) { if (languageSetEntry.getValue().contains(arg)) { this.language = languageSetEntry.getKey(); return; } } } } ================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java ================================================ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.scripts.config.CommandLineOption; import us.codecraft.webmagic.utils.WMCollections; import java.util.List; /** * @author code4crafter@gmail.com / FrancoisGib * @since 0.4.1 */ public class ScriptConsole { public static void main(String[] args) { Params params = parseCommand(args); startSpider(params); } private static void startSpider(Params params) { ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom() .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build(); pageProcessor.getSite().setSleepTime(params.getSleepTime()); pageProcessor.getSite().setRetryTimes(3); pageProcessor.getSite().setAcceptStatCode(WMCollections.newHashSet(200, 404,403, 500,502)); Spider spider = Spider.create(pageProcessor).thread(params.getThread()); spider.clearPipeline().addPipeline(new Pipeline() { @Override public void process(ResultItems 
resultItems, Task task) { } }); if (params.getUrls() == null || params.getUrls().size() == 0) { System.err.println("Need at least one argument"); System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]"); System.exit(-1); } for (String url : params.getUrls()) { spider.addUrl(url); } spider.run(); } private static Params parseCommand(String[] args) { try { Options options = new Options(); options.addOption(new Option("l", "language", true, "language")); options.addOption(new Option("t", "thread", true, "thread")); options.addOption(new Option("f", "file", true, "script file")); options.addOption(new Option("i", "input", true, "input file")); options.addOption(new Option("s", "sleep", true, "sleep time")); options.addOption(new Option("g", "logger", true, "sleep time")); CommandLineParser commandLineParser = new PosixParser(); CommandLine commandLine = commandLineParser.parse(options, args); return readOptions(commandLine); } catch (Exception e) { e.printStackTrace(); exit(); return null; } } private static void exit() { System.err.println("Format error"); System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]"); System.exit(-1); } private static Params readOptions(CommandLine commandLine) { Params params = new Params(); List options = CommandLineOption.getAllOptions(); for (CommandLineOption option : options) option.addParamOptionIfInCommandLine(params, commandLine); return params; } } ================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java ================================================ package us.codecraft.webmagic.scripts; import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; import us.codecraft.webmagic.scripts.languages.Language; import java.util.concurrent.LinkedBlockingQueue; import 
java.util.concurrent.atomic.AtomicInteger; /** * @author code4crafter@gmail.com * @since 0.4.1 */ public class ScriptEnginePool { private final AtomicInteger availableCount; private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); public ScriptEnginePool(Language language,int size) { this.availableCount = new AtomicInteger(size); for (int i=0;i getAllOptions() { return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG()); } } /** Handler for the {@code l} option: passes the option value to {@code Params.setLanguagefromArg}. */ class OptionL extends CommandLineOption { public OptionL() { super('l'); } protected void addParamOption(Params params, CommandLine commandLine) { String language = commandLine.getOptionValue("l"); params.setLanguagefromArg(language); } } /** Handler for the {@code f} option: stores the option value as the script file name. */ class OptionF extends CommandLineOption { public OptionF() { super('f'); } protected void addParamOption(Params params, CommandLine commandLine) { String scriptFilename = commandLine.getOptionValue("f"); params.setScriptFileName(scriptFilename); } } /** Handler for the {@code s} option: parses the option value with {@code Integer.parseInt} and stores it as the sleep time. */ class OptionS extends CommandLineOption { public OptionS() { super('s'); } protected void addParamOption(Params params, CommandLine commandLine) { Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); params.setSleepTime(sleepTime); } } /** Handler for the {@code t} option: parses the option value with {@code Integer.parseInt} and stores it as the thread count. */ class OptionT extends CommandLineOption { public OptionT() { super('t'); } protected void addParamOption(Params params, CommandLine commandLine) { Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); params.setThread(thread); } } /** Handler for the {@code g} option: passes the option value to {@code ConfigLogger.configLogger}. */ class OptionG extends CommandLineOption { public OptionG() { super('g'); } protected void addParamOption(Params params, CommandLine commandLine) { ConfigLogger.configLogger(commandLine.getOptionValue("g")); } } ================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java ================================================ package us.codecraft.webmagic.scripts.config; import java.util.List; import org.apache.commons.lang3.tuple.Pair; import
org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.Logger;
import org.slf4j.LoggerFactory;

/**
 * Applies the log level requested on the command line.
 */
public class ConfigLogger {

    /**
     * Sets the root log level from a level name.
     * <p>
     * The name is matched case-insensitively against {@code debug}, {@code info},
     * {@code warn}, {@code trace}, {@code off} and {@code error}. An unrecognised or
     * {@code null} name leaves the logging configuration untouched.
     *
     * @param value the level name; may be {@code null}
     */
    public static void configLogger(String value) {
        List<Pair<String, Level>> options = List.of(
                Pair.of("debug", Level.DEBUG),
                Pair.of("info", Level.INFO),
                Pair.of("warn", Level.WARN),
                Pair.of("trace", Level.TRACE),
                Pair.of("off", Level.OFF),
                Pair.of("error", Level.ERROR));
        // Plain search over every entry. The previous index-based loop advanced its
        // counter past the list size when the last entry ("error") matched, so that
        // level was never applied.
        for (Pair<String, Level> option : options) {
            if (option.getLeft().equalsIgnoreCase(value)) {
                // Go through the Log4j configuration API instead of casting the slf4j
                // root logger to a log4j-core Logger: the slf4j binding hands back its
                // own adapter type, so that cast cannot succeed.
                org.apache.logging.log4j.core.config.Configurator.setRootLevel(option.getRight());
                return;
            }
        }
    }
}
================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java ================================================
package us.codecraft.webmagic.scripts.languages;

import java.util.Iterator;
import java.util.Map;
import javax.script.ScriptEngine;
import javax.script.ScriptException;
import org.jruby.RubyHash;
import us.codecraft.webmagic.Page;

/**
 * Ruby language binding: engine name {@code jruby}, defines file {@code ruby/defines.rb}.
 */
public class JRuby extends Language {

    public JRuby() {
        super("jruby","ruby/defines.rb","");
    }

    /**
     * Evaluates {@code defines} followed by {@code script} and copies every entry of the
     * evaluation result into the page's result items, keyed by the entry key's
     * {@code toString()}. The evaluation result is cast to {@link RubyHash}.
     */
    public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
        RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext());
        Iterator itruby = oRuby.entrySet().iterator();
        while (itruby.hasNext()) {
            Map.Entry pairs = (Map.Entry) itruby.next();
            page.getResultItems().put(pairs.getKey().toString(), pairs.getValue());
        }
    }
}
================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java ================================================
package
us.codecraft.webmagic.scripts.languages; import javax.script.ScriptEngine; import javax.script.ScriptException; import us.codecraft.webmagic.Page; public class Javascript extends Language { public Javascript() { super("javascript","js/defines.js",""); } public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { engine.eval(defines + "\n" + script, engine.getContext()); } } ================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java ================================================ package us.codecraft.webmagic.scripts.languages; import java.util.Iterator; import java.util.Map; import javax.script.ScriptEngine; import javax.script.ScriptException; import org.python.core.PyDictionary; import us.codecraft.webmagic.Page; public class Jython extends Language { public Jython() { super("jython","python/defines.py",""); } public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { engine.eval(defines + "\n" + script, engine.getContext()); PyDictionary oJython = (PyDictionary) engine.get("result"); Iterator it = oJython.entrySet().iterator(); while (it.hasNext()) { Map.Entry pairs = (Map.Entry) it.next(); page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); } } } ================================================ FILE: webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java ================================================ package us.codecraft.webmagic.scripts.languages; import javax.script.ScriptEngine; import javax.script.ScriptException; import us.codecraft.webmagic.Page; /** * @author FrancoisGib */ public abstract class Language { public Language(String engineName, String defineFile, String gatherFile) { this.engineName = engineName; this.defineFile = defineFile; this.gatherFile = gatherFile; } private String engineName; private String defineFile; 
/**
 * Kotlin version of the GitHub repository crawler sample.
 *
 * Follows links to owner and repository pages, and records the author, the
 * repository name and the readme text of each repository page. Pages without
 * a repository name are skipped.
 *
 * @author code4crafter@gmail.com
 * Date: 2017/5/31
 * Time: 11:33 PM
 */
class GithubRepoPageProcessor : PageProcessor {

    private val site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000)

    override fun process(page: Page) {
        // Repository pages: https://github.com/<owner>/<repo>
        page.addTargetRequests(page.html.links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all())
        // Owner pages: https://github.com/<owner>. The character class needs the
        // "+" quantifier; without it the capture stops after a single character
        // and truncated URLs such as https://github.com/c are queued.
        page.addTargetRequests(page.html.links().regex("(https://github\\.com/[\\w\\-]+)").all())

        page.putField("author", page.url.regex("https://github\\.com/(\\w+)/.*").toString())
        page.putField("name", page.html.xpath("//h1[@class='public']/strong/a/text()").toString())
        if (page.resultItems.get<Any>("name") == null) {
            // Not a repository page: keep its links but emit no result.
            page.setSkip(true)
        }
        page.putField("readme", page.html.xpath("//div[@id='readme']/tidyText()"))
    }

    override fun getSite(): Site {
        return site
    }

    companion object {
        @JvmStatic
        fun main(args: Array<String>) {
            Spider.create(GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run()
        }
    }
}
# Sample Jython script for oschina blog pages.
# The helpers used here (css, urls) come from python/defines.py, which the
# Jython language binding evaluates in front of this script.

# Queue every blog-post link on the page. This must be a call: assigning the
# pattern to a variable named `urls` only shadows the helper, and no link is
# ever added to the crawl.
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")

# Same selectors as the Ruby and JavaScript versions of this sample.
title = css("div.BlogTitle h1")
content = css("div.BlogContent")

# The Jython binding reads the global `result` dictionary and copies each
# entry into the page's result items.
result = {"title": title, "content": content}
$page.getHtml().css(str).toString() end def urls str links = $page.getHtml().links().regex(str).all(); $page.addTargetRequests(links); end ================================================ FILE: webmagic-scripts/src/main/resources/ruby/github.rb ================================================ name= xpath "//h1[@class='entry-title public']/strong/a/text()" readme = xpath "//div[@id='readme']/tidyText()" star = xpath "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()" fork = xpath "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()" url=$page.getUrl().toString() puts name,readme,star,fork,url unless name==nil urls "(https://github\\.com/\\w+/\\w+)" urls "(https://github\\.com/\\w+)" ================================================ FILE: webmagic-scripts/src/main/resources/ruby/oschina.rb ================================================ urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" title = css "div.BlogTitle h1" content = css "div.BlogContent" return {"title"=>title,"content"=>content} ================================================ FILE: webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java ================================================ package us.codecraft.webmagic.scripts; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.scripts.languages.JRuby; import us.codecraft.webmagic.scripts.languages.Javascript; import us.codecraft.webmagic.scripts.languages.Jython; /** * @author code4crafter@gmail.com * @since 0.4.1 */ @Ignore public class ScriptProcessorTest { @Test public void testJavaScriptProcessor() { ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void 
testRubyProcessor() { ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void testPythonProcessor() { ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } } ================================================ FILE: webmagic-scripts/src/test/resources/log4j2-test.xml ================================================ ================================================ FILE: webmagic-selenium/README.md ================================================ webmagic-extension ------- webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。 ================================================ FILE: webmagic-selenium/config.ini ================================================ # What WebDriver to use for the tests driver=phantomjs #driver=firefox #driver=chrome #driver=http://localhost:8910 #driver=http://localhost:4444/wd/hub # PhantomJS specific config (change according to your installation) #phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5 phantomjs_exec_path=/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs #phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js phantomjs_driver_loglevel=DEBUG ================================================ FILE: webmagic-selenium/pom.xml ================================================ us.codecraft webmagic 1.0.4-SNAPSHOT 4.0.0 webmagic-selenium org.seleniumhq.selenium selenium-java ${project.groupId} webmagic-core ${project.version} com.github.detro phantomjsdriver org.apache.maven.plugins maven-deploy-plugin 3.0.0-M1 true 
================================================ FILE: webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java ================================================ package us.codecraft.webmagic.downloader.selenium; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.HttpConstant; import java.io.Closeable; import java.io.IOException; import java.net.http.HttpRequest; import java.util.Map; /** * 使用Selenium调用浏览器进行渲染。目前仅支持chrome。
    * 需要下载Selenium driver支持。
    * * @author code4crafter@gmail.com
    * Date: 13-7-26
    * Time: 下午1:37
    */ public class SeleniumDownloader extends AbstractDownloader implements Closeable { private volatile WebDriverPool webDriverPool; private Logger logger = LoggerFactory.getLogger(getClass()); private int sleepTime = 0; private int poolSize = 1; private static final String DRIVER_PHANTOMJS = "phantomjs"; /** * 新建 * * @param chromeDriverPath chromeDriverPath */ public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); } /** * Constructor without any filed. Construct PhantomJS browser * * @author bob.li.0718@gmail.com */ public SeleniumDownloader() { // System.setProperty("phantomjs.binary.path", // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); } /** * set sleep time to wait until load success * * @param sleepTime sleepTime * @return this */ public SeleniumDownloader setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } @Override public Page download(Request request, Task task) { checkInit(); WebDriver webDriver = null; Page page = Page.fail(request); try { webDriver = webDriverPool.get(); logger.info("downloading page " + request.getUrl()); webDriver.get(request.getUrl()); try { if (sleepTime > 0) { Thread.sleep(sleepTime); } } catch (InterruptedException e) { e.printStackTrace(); } WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { for (Map.Entry cookieEntry : site.getCookies() .entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } /* * TODO You can add mouse event or other processes * * @author: bob.li.0718@gmail.com */ WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); page.setDownloadSuccess(true); page.setRawText(content); page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); 
page.setStatusCode(HttpConstant.StatusCode.CODE_200); onSuccess(page, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e); onError(page, task, e); } finally { if (webDriver != null) { webDriverPool.returnToPool(webDriver); } } return page; } private void checkInit() { if (webDriverPool == null) { synchronized (this) { webDriverPool = new WebDriverPool(poolSize); } } } @Override public void setThread(int thread) { this.poolSize = thread; } @Override public void close() throws IOException { webDriverPool.closeAll(); } } ================================================ FILE: webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java ================================================ package us.codecraft.webmagic.downloader.selenium; import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Properties; import java.util.concurrent.BlockingDeque; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.firefox.FirefoxOptions; import org.openqa.selenium.phantomjs.PhantomJSDriver; import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author code4crafter@gmail.com
    * Date: 13-7-26
    * Time: 下午1:41
    */ class WebDriverPool { private Logger logger = LoggerFactory.getLogger(getClass()); private final static int DEFAULT_CAPACITY = 5; private final int capacity; private final static int STAT_RUNNING = 1; private final static int STAT_CLODED = 2; private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); /* * new fields for configuring phantomJS */ private WebDriver mDriver = null; private boolean mAutoQuitDriver = true; private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini"; private static final String DRIVER_FIREFOX = "firefox"; private static final String DRIVER_CHROME = "chrome"; private static final String DRIVER_PHANTOMJS = "phantomjs"; protected static Properties sConfig; protected static DesiredCapabilities sCaps; /** * Configure the GhostDriver, and initialize a WebDriver instance. This part * of code comes from GhostDriver. * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver * * @author bob.li.0718@gmail.com * @throws IOException */ public void configure() throws IOException { // Read config file sConfig = new Properties(); String configFile = DEFAULT_CONFIG_FILE; if (System.getProperty("selenuim_config")!=null){ configFile = System.getProperty("selenuim_config"); } sConfig.load(new FileReader(configFile)); // Prepare capabilities sCaps = new DesiredCapabilities(); sCaps.setCapability("takesScreenshot", false); String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); // Fetch PhantomJS-specific configuration parameters if (driver.equals(DRIVER_PHANTOMJS)) { // "phantomjs_exec_path" if (sConfig.getProperty("phantomjs_exec_path") != null) { sCaps.setCapability( PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, sConfig.getProperty("phantomjs_exec_path")); } else { throw new IOException( String.format( "Property '%s' not set!", PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); } // "phantomjs_driver_path" if 
(sConfig.getProperty("phantomjs_driver_path") != null) { System.out.println("Test will use an external GhostDriver"); sCaps.setCapability( PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, sConfig.getProperty("phantomjs_driver_path")); } else { System.out .println("Test will use PhantomJS internal GhostDriver"); } } // Disable "web-security", enable all possible "ssl-protocols" and // "ignore-ssl-errors" for PhantomJSDriver // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new // String[] { // "--web-security=false", // "--ssl-protocol=any", // "--ignore-ssl-errors=true" // }); ArrayList cliArgsCap = new ArrayList(); cliArgsCap.add("--web-security=false"); cliArgsCap.add("--ssl-protocol=any"); cliArgsCap.add("--ignore-ssl-errors=true"); sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgsCap); // Control LogLevel for GhostDriver, via CLI arguments sCaps.setCapability( PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS, new String[] { "--logLevel=" + (sConfig.getProperty("phantomjs_driver_loglevel") != null ? sConfig .getProperty("phantomjs_driver_loglevel") : "INFO") }); // String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); // Start appropriate Driver if (isUrl(driver)) { sCaps.setBrowserName("phantomjs"); mDriver = new RemoteWebDriver(new URL(driver), sCaps); } else if (driver.equals(DRIVER_FIREFOX)) { mDriver = new FirefoxDriver(new FirefoxOptions(sCaps)); } else if (driver.equals(DRIVER_CHROME)) { mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps)); } else if (driver.equals(DRIVER_PHANTOMJS)) { mDriver = new PhantomJSDriver(sCaps); } } /** * check whether input is a valid URL * * @author bob.li.0718@gmail.com * @param urlString urlString * @return true means yes, otherwise no. 
*/ private boolean isUrl(String urlString) { try { new URL(urlString); return true; } catch (MalformedURLException mue) { return false; } } /** * store webDrivers created */ private List webDriverList = Collections .synchronizedList(new ArrayList()); /** * store webDrivers available */ private BlockingDeque innerQueue = new LinkedBlockingDeque(); public WebDriverPool(int capacity) { this.capacity = capacity; } public WebDriverPool() { this(DEFAULT_CAPACITY); } /** * * @return * @throws InterruptedException */ public WebDriver get() throws InterruptedException { checkRunning(); WebDriver poll = innerQueue.poll(); if (poll != null) { return poll; } if (webDriverList.size() < capacity) { synchronized (webDriverList) { if (webDriverList.size() < capacity) { // add new WebDriver instance into pool try { configure(); innerQueue.add(mDriver); webDriverList.add(mDriver); } catch (IOException e) { e.printStackTrace(); } // ChromeDriver e = new ChromeDriver(); // WebDriver e = getWebDriver(); // innerQueue.add(e); // webDriverList.add(e); } } } return innerQueue.take(); } public void returnToPool(WebDriver webDriver) { checkRunning(); innerQueue.add(webDriver); } protected void checkRunning() { if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { throw new IllegalStateException("Already closed!"); } } public void closeAll() { boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); if (!b) { throw new IllegalStateException("Already closed!"); } for (WebDriver webDriver : webDriverList) { logger.info("Quit webDriver" + webDriver); webDriver.quit(); webDriver = null; } } } ================================================ FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java ================================================ package us.codecraft.webmagic.downloader; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.By; import 
org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.DesiredCapabilities; /** * @author code4crafter@gmail.com
    * Date: 13-7-26
    * Time: 下午12:27
    */ public class SeleniumTest { @Ignore("need chrome driver") @Test public void testSelenium() { System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); Map contentSettings = new HashMap(); contentSettings.put("images", 2); Map preferences = new HashMap(); preferences.put("profile.default_content_settings", contentSettings); DesiredCapabilities caps = new DesiredCapabilities(); caps.setCapability("chrome.prefs", preferences); caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps)); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML")); webDriver.close(); } } ================================================ FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java ================================================ package us.codecraft.webmagic.downloader.selenium; import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** * @author code4crafter@gmail.com
    * Date: 13-7-26
    * Time: 下午2:46
    */ public class SeleniumDownloaderTest { private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; @Ignore("need chrome driver") @Test public void test() { SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); long time1 = System.currentTimeMillis(); for (int i = 0; i < 100; i++) { Page page = seleniumDownloader.download(new Request("http://huaban.com/"), new Task() { @Override public String getUUID() { return "huaban.com"; } @Override public Site getSite() { return Site.me(); } }); System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); } System.out.println(System.currentTimeMillis() - time1); } @Ignore @Test public void testBaiduWenku() { SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); seleniumDownloader.setSleepTime(10000); long time1 = System.currentTimeMillis(); Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { @Override public String getUUID() { return "huaban.com"; } @Override public Site getSite() { return Site.me(); } }); System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); } } ================================================ FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java ================================================ package us.codecraft.webmagic.downloader.selenium; import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.WebDriver; /** * @author code4crafter@gmail.com
    * Date: 13-7-26
    * Time: 下午2:12
    */ public class WebDriverPoolTest { private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; @Ignore("need chrome driver") @Test public void test() { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); WebDriverPool webDriverPool = new WebDriverPool(5); for (int i = 0; i < 5; i++) { try { WebDriver webDriver = webDriverPool.get(); System.out.println(i); } catch (InterruptedException e) { e.printStackTrace(); } } webDriverPool.closeAll(); } } ================================================ FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/GooglePlayProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** * * Using Selenium with PhantomJS to fetch web-page with JS
    * * @author bob.li.0718@gmail.com
    * Date: 15-7-11
    */ public class GooglePlayProcessor implements PageProcessor { private Site site; @Override public void process(Page page) { page.putField("whole-html", page.getHtml().toString()); } @Override public Site getSite() { if (null == site) { site = Site.me().setDomain("play.google.com").setSleepTime(300); } return site; } public static void main(String[] args) { Spider.create(new GooglePlayProcessor()) .thread(5) .addPipeline( new FilePipeline( "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/")) .setDownloader(new SeleniumDownloader()) .addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm") .runAsync(); } } ================================================ FILE: webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java ================================================ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; /** * 花瓣网抽取器。
    * 使用Selenium做页面动态渲染。
    * @author code4crafter@gmail.com
    * Date: 13-7-26
    * Time: 下午4:08
    */ public class HuabanProcessor implements PageProcessor { private Site site; @Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all()); if (page.getUrl().toString().contains("pins")) { page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString()); } else { page.getResultItems().setSkip(true); } } @Override public Site getSite() { if (null == site) { site = Site.me().setDomain("huaban.com").setSleepTime(0); } return site; } public static void main(String[] args) { Spider.create(new HuabanProcessor()).thread(5) .addPipeline(new FilePipeline("/data/webmagic/test/")) .setDownloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver")) .addUrl("http://huaban.com/") .runAsync(); } } ================================================ FILE: webmagic-selenium/src/test/resources/config.ini ================================================ #driver=phantomjs #driver=firefox driver=chrome #driver=http://localhost:8910 driver=http://localhost:4444/wd/hub # PhantomJS specific config (change according to your installation) #phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5 phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream phantomjs_driver_path=../../src/main.js phantomjs_driver_loglevel=DEBUG