Repository: biezhi/elves Branch: master Commit: 0b13f4b812e8 Files: 27 Total size: 46.3 KB Directory structure: gitextract_lkwbnl7j/ ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── pom.xml └── src/ ├── main/ │ └── java/ │ └── io/ │ └── github/ │ └── biezhi/ │ └── elves/ │ ├── Elves.java │ ├── ElvesEngine.java │ ├── config/ │ │ ├── Config.java │ │ └── UserAgent.java │ ├── download/ │ │ └── Downloader.java │ ├── event/ │ │ ├── ElvesEvent.java │ │ └── EventManager.java │ ├── pipeline/ │ │ └── Pipeline.java │ ├── request/ │ │ ├── Parser.java │ │ └── Request.java │ ├── response/ │ │ ├── Body.java │ │ ├── Response.java │ │ └── Result.java │ ├── scheduler/ │ │ └── Scheduler.java │ ├── spider/ │ │ └── Spider.java │ └── utils/ │ ├── ElvesUtils.java │ └── NamedThreadFactory.java └── test/ └── java/ └── io/ └── github/ └── biezhi/ └── elves/ ├── event/ │ └── ElvesEventTest.java └── examples/ ├── DoubanExample.java ├── MeiziExample.java ├── News163Example.java └── QiubaiExample.java ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) ### Eclipse template .metadata bin/ tmp/ *.tmp *.bak *.swp *~.nib local.properties .settings/ .loadpath .recommenders # External tool builders .externalToolBuilders/ # Locally stored "Eclipse launch configurations" *.launch # PyDev specific (Python IDE for Eclipse) *.pydevproject # CDT-specific (C/C++ Development Tooling) .cproject # Java annotation processor (APT) .factorypath # PDT-specific (PHP Development Tools) .buildpath # sbteclipse plugin .target # Tern plugin .tern-project # TeXlipse plugin .texlipse # STS (Spring Tool Suite) .springBeans # Code Recommenders .recommenders/ # Scala IDE specific (Scala & Java development for Eclipse) .cache-main .scala_dependencies .worksheet ### Maven template target/ 
pom.xml.tag pom.xml.releaseBackup pom.xml.versionsBackup pom.xml.next release.properties dependency-reduced-pom.xml buildNumber.properties .mvn/timing.properties # Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) !/.mvn/wrapper/maven-wrapper.jar ### Java template # Compiled class file *.class # Log file *.log # BlueJ files *.ctxt # Mobile Tools for Java (J2ME) .mtj.tmp/ # Package Files # *.jar *.war *.ear *.zip *.tar.gz *.rar # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* ### JetBrains template # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff: .idea/**/workspace.xml .idea/**/tasks.xml .idea/dictionaries # Sensitive or high-churn files: .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.xml .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml # Gradle: .idea/**/gradle.xml .idea/**/libraries # CMake cmake-build-debug/ # Mongo Explorer plugin: .idea/**/mongoSettings.xml ## File-based project format: *.iws ## Plugin-specific files: # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties ================================================ FILE: .travis.yml ================================================ language: java jdk: - oraclejdk8 notifications: email: false sudo: false before_install: - export TZ='Asia/Shanghai' ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 王爵nice (biezhi) Permission is hereby granted, free of charge, to any 
person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Elves 一个轻量级的爬虫框架设计与实现,[博文分析](https://blog.biezhi.me/2018/01/design-and-implement-a-crawler-framework.html)。 [![](https://img.shields.io/travis/biezhi/elves.svg)](https://travis-ci.org/biezhi/elves) [![](https://img.shields.io/maven-central/v/io.github.biezhi/elves.svg)](https://mvnrepository.com/artifact/io.github.biezhi/elves) [![@biezhi on zhihu](https://img.shields.io/badge/zhihu-%40biezhi-red.svg)](https://www.zhihu.com/people/biezhi) [![](https://img.shields.io/badge/license-MIT-FF0080.svg)](https://github.com/biezhi/elves/blob/master/LICENSE) [![](https://img.shields.io/github/followers/biezhi.svg?style=social&label=Follow%20Me)](https://github.com/biezhi) ## 特性 - 事件驱动 - 易于定制 - 多线程执行 - `CSS` 选择器和 `XPath` 支持 **Maven** 坐标 ```xml <dependency> <groupId>io.github.biezhi</groupId> <artifactId>elves</artifactId> <version>0.0.2</version> </dependency> ``` 如果你想在本地运行这个项目源码,请确保你是 `Java8` 环境并且安装了 [lombok](https://projectlombok.org/) 插件。 ## 架构图 ## 调用流程图 ## 快速上手 搭建一个爬虫程序需要进行这么几步操作 1. 
编写一个爬虫类继承自 `Spider` 2. 设置要抓取的 URL 列表 3. 实现 `Spider` 的 `parse` 方法 4. 添加 `Pipeline` 处理 `parse` 过滤后的数据 举个栗子: ```java public class DoubanSpider extends Spider { public DoubanSpider(String name) { super(name); this.startUrls( "https://movie.douban.com/tag/爱情", "https://movie.douban.com/tag/喜剧", "https://movie.douban.com/tag/动画", "https://movie.douban.com/tag/动作", "https://movie.douban.com/tag/史诗", "https://movie.douban.com/tag/犯罪"); } @Override public void onStart(Config config) { this.addPipeline((Pipeline>) (item, request) -> log.info("保存到文件: {}", item)); } public Result parse(Response response) { Result> result = new Result<>(); Elements elements = response.body().css("#content table .pl2 a"); List titles = elements.stream().map(Element::text).collect(Collectors.toList()); result.setItem(titles); // 获取下一页 URL Elements nextEl = response.body().css("#content > div > div.article > div.paginator > span.next > a"); if (null != nextEl && nextEl.size() > 0) { String nextPageUrl = nextEl.get(0).attr("href"); Request nextReq = this.makeRequest(nextPageUrl, this::parse); result.addRequest(nextReq); } return result; } } public static void main(String[] args) { DoubanSpider doubanSpider = new DoubanSpider("豆瓣电影"); Elves.me(doubanSpider, Config.me()).start(); } ``` ## 爬虫例子 - [豆瓣电影](https://github.com/biezhi/elves/blob/master/src/test/java/io/github/biezhi/elves/examples/DoubanExample.java) - [网易新闻](https://github.com/biezhi/elves/blob/master/src/test/java/io/github/biezhi/elves/examples/News163Example.java) - [糗事百科](https://github.com/biezhi/elves/blob/master/src/test/java/io/github/biezhi/elves/examples/QiubaiExample.java) - [妹。。。妹子图](https://github.com/biezhi/elves/blob/master/src/test/java/io/github/biezhi/elves/examples/MeiziExample.java) ## 开源协议 [MIT](https://github.com/biezhi/elves/blob/master/LICENSE) ================================================ FILE: pom.xml ================================================ 4.0.0 io.github.biezhi elves 0.0.2 elves 
https://biezhi.github.io/elves crawler framework The Apache Software License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt biezhi biezhi.me@gmail.com scm:git@github.com:biezhi/elves.git scm:git@github.com:biezhi/elves.git git@github.com:biezhi/elves.git org.slf4j slf4j-api 1.7.25 org.jsoup jsoup 1.10.3 us.codecraft xsoup 0.3.1 io.github.biezhi oh-my-request 0.0.1 org.projectlombok lombok 1.16.18 provided ch.qos.logback logback-classic 1.2.3 test org.apache.maven.plugins maven-compiler-plugin 1.8 1.8 release oss https://oss.sonatype.org/content/repositories/snapshots/ oss https://oss.sonatype.org/service/local/staging/deploy/maven2/ org.apache.maven.plugins maven-source-plugin 2.4 package jar-no-fork org.apache.maven.plugins maven-javadoc-plugin 2.10.2 UTF-8 UTF-8 package jar -Xdoclint:none org.apache.maven.plugins maven-gpg-plugin 1.6 sign-artifacts verify sign snapshots oss https://oss.sonatype.org/content/repositories/snapshots/ oss https://oss.sonatype.org/service/local/staging/deploy/maven2/ org.apache.maven.plugins maven-source-plugin 2.4 package jar-no-fork org.apache.maven.plugins maven-surefire-plugin 2.17 true org.apache.maven.plugins maven-gpg-plugin 1.6 sign-artifacts verify sign ================================================ FILE: src/main/java/io/github/biezhi/elves/Elves.java ================================================ package io.github.biezhi.elves; import io.github.biezhi.elves.config.Config; import io.github.biezhi.elves.event.ElvesEvent; import io.github.biezhi.elves.event.EventManager; import io.github.biezhi.elves.spider.Spider; import lombok.NoArgsConstructor; import lombok.extern.slf4j.Slf4j; import java.util.ArrayList; import java.util.List; import java.util.function.Consumer; /** * Elves * * @author biezhi * @date 2018/1/11 */ @Slf4j @NoArgsConstructor public class Elves { List spiders = new ArrayList<>(); Config config; public static Elves me(Spider spider) { return me(spider, Config.me()); } public static Elves 
me(Spider spider, Config config) { Elves elves = new Elves(); elves.spiders.add(spider); elves.config = config; return elves; } public void start() { new ElvesEngine(this).start(); } public Elves onStart(Consumer consumer) { EventManager.registerEvent(ElvesEvent.GLOBAL_STARTED, consumer); return this; } } ================================================ FILE: src/main/java/io/github/biezhi/elves/ElvesEngine.java ================================================ package io.github.biezhi.elves; import io.github.biezhi.elves.config.Config; import io.github.biezhi.elves.download.Downloader; import io.github.biezhi.elves.event.ElvesEvent; import io.github.biezhi.elves.event.EventManager; import io.github.biezhi.elves.pipeline.Pipeline; import io.github.biezhi.elves.request.Parser; import io.github.biezhi.elves.request.Request; import io.github.biezhi.elves.response.Response; import io.github.biezhi.elves.response.Result; import io.github.biezhi.elves.scheduler.Scheduler; import io.github.biezhi.elves.spider.Spider; import io.github.biezhi.elves.utils.ElvesUtils; import io.github.biezhi.elves.utils.NamedThreadFactory; import lombok.extern.slf4j.Slf4j; import java.util.List; import java.util.concurrent.*; import java.util.stream.Collectors; /** * Elves Engine * * @author biezhi * @date 2018/1/12 */ @Slf4j public class ElvesEngine { private List spiders; private Config config; private boolean isRunning; private Scheduler scheduler; private ExecutorService executorService; ElvesEngine(Elves elves) { this.scheduler = new Scheduler(); this.spiders = elves.spiders; this.config = elves.config; this.executorService = new ThreadPoolExecutor(config.parallelThreads(), config.parallelThreads(), 0, TimeUnit.MILLISECONDS, config.queueSize() == 0 ? new SynchronousQueue<>() : (config.queueSize() < 0 ? 
new LinkedBlockingQueue<>() : new LinkedBlockingQueue<>(config.queueSize())), new NamedThreadFactory("task")); } public void start() { if (isRunning) { throw new RuntimeException("Elves 已经启动"); } isRunning = true; // 全局启动事件 EventManager.fireEvent(ElvesEvent.GLOBAL_STARTED, config); spiders.forEach(spider -> { Config conf = config.clone(); log.info("Spider [{}] 启动...", spider.getName()); log.info("Spider [{}] 配置 [{}]", spider.getName(), conf); spider.setConfig(conf); List requests = spider.getStartUrls().stream() .map(spider::makeRequest).collect(Collectors.toList()); spider.getRequests().addAll(requests); scheduler.addRequests(requests); EventManager.fireEvent(ElvesEvent.SPIDER_STARTED, conf); }); // 后台生产 Thread downloadTread = new Thread(() -> { while (isRunning) { if (!scheduler.hasRequest()) { ElvesUtils.sleep(100); continue; } Request request = scheduler.nextRequest(); executorService.submit(new Downloader(scheduler, request)); ElvesUtils.sleep(request.getSpider().getConfig().delay()); } }); downloadTread.setDaemon(true); downloadTread.setName("download-thread"); downloadTread.start(); // 消费 this.complete(); } private void complete() { while (isRunning) { if (!scheduler.hasResponse()) { ElvesUtils.sleep(100); continue; } Response response = scheduler.nextResponse(); Parser parser = response.getRequest().getParser(); if (null != parser) { Result result = parser.parse(response); List requests = result.getRequests(); if (!ElvesUtils.isEmpty(requests)) { requests.forEach(scheduler::addRequest); } if (null != result.getItem()) { List pipelines = response.getRequest().getSpider().getPipelines(); pipelines.forEach(pipeline -> pipeline.process(result.getItem(), response.getRequest())); } } } } public void stop(){ isRunning = false; scheduler.clear(); log.info("爬虫已经停止."); } } ================================================ FILE: src/main/java/io/github/biezhi/elves/config/Config.java ================================================ package 
io.github.biezhi.elves.config;

import lombok.ToString;

/**
 * Crawler configuration.
 * <p>
 * Fluent holder: every setter returns {@code this}, and the same-named no-arg
 * method reads the value back. The engine gives each spider its own
 * {@link #clone() copy}.
 *
 * @author biezhi
 * @date 2018/1/11
 */
@ToString
public class Config implements Cloneable {

    /**
     * Timeout applied by the downloader to both connecting and reading
     * (presumably milliseconds; confirm against the HTTP client).
     */
    private int timeout = 10_000;

    /**
     * Pause between two scheduled downloads, in milliseconds.
     */
    private int delay = 1000;

    /**
     * Number of download threads; used as both core and maximum pool size.
     */
    private int parallelThreads = Runtime.getRuntime().availableProcessors() * 2;

    /**
     * Value of the {@code User-Agent} header put on every new request.
     */
    private String userAgent = UserAgent.CHROME_FOR_MAC;

    /**
     * Work-queue size of the download pool: {@code 0} (default) selects a
     * synchronous hand-off, a negative value an unbounded queue, a positive
     * value a bounded queue of that capacity.
     */
    private int queueSize;

    /**
     * @return a new configuration holding the defaults
     */
    public static Config me() {
        return new Config();
    }

    public Config timeout(int timeout) {
        this.timeout = timeout;
        return this;
    }

    public int timeout() {
        return this.timeout;
    }

    public Config delay(int delay) {
        this.delay = delay;
        return this;
    }

    public long delay() {
        return this.delay;
    }

    public Config parallelThreads(int parallelThreads) {
        this.parallelThreads = parallelThreads;
        return this;
    }

    public int parallelThreads() {
        return this.parallelThreads;
    }

    public String userAgent() {
        return userAgent;
    }

    public Config userAgent(String userAgent) {
        this.userAgent = userAgent;
        return this;
    }

    public int queueSize() {
        return queueSize;
    }

    public Config queueSize(int queueSize) {
        this.queueSize = queueSize;
        return this;
    }

    /**
     * Returns a field-by-field copy of this configuration.
     *
     * @return the copy, never {@code null}
     */
    @Override
    public Config clone() {
        try {
            return (Config) super.clone();
        } catch (CloneNotSupportedException e) {
            // Cannot happen because this class implements Cloneable. Fail loudly
            // instead of handing a null configuration to every spider.
            throw new AssertionError("Config implements Cloneable", e);
        }
    }

}
================================================ FILE: src/main/java/io/github/biezhi/elves/config/UserAgent.java ================================================ package io.github.biezhi.elves.config; /** * 浏览器UA常量 * * @author biezhi * @date 2018/1/11 */ public interface UserAgent { String SAFARI_FOR_MAC = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"; String IE_9_FOR_WIN = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"; String IE_8_FOR_WIN = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"; String IE_7_FOR_WIN = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"; String 
FIREFOX_FOR_MAC = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"; String OPERA_FOR_MAC = "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"; String CHROME_FOR_MAC = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"; String TENCENT_TT = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)"; String THE_WORLD = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)"; String SOUGOU = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"; String QIHU_360 = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"; /** * 移动端UA */ String SAFARI_FOR_IPHONE = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"; } ================================================ FILE: src/main/java/io/github/biezhi/elves/download/Downloader.java ================================================ package io.github.biezhi.elves.download; import io.github.biezhi.elves.request.Request; import io.github.biezhi.elves.response.Response; import io.github.biezhi.elves.scheduler.Scheduler; import lombok.extern.slf4j.Slf4j; import java.io.InputStream; /** * 下载器线程 * * @author biezhi * @date 2018/1/11 */ @Slf4j public class Downloader implements Runnable { private final Scheduler scheduler; private final Request request; public Downloader(Scheduler scheduler, Request request) { this.scheduler = scheduler; this.request = request; } @Override public void run() { log.debug("[{}] 开始请求", request.getUrl()); io.github.biezhi.request.Request httpReq = null; if ("get".equalsIgnoreCase(request.method())) { httpReq = io.github.biezhi.request.Request.get(request.getUrl()); } if ("post".equalsIgnoreCase(request.method())) { httpReq = 
io.github.biezhi.request.Request.post(request.getUrl()); } InputStream result = httpReq.contentType(request.contentType()).headers(request.getHeaders()) .connectTimeout(request.getSpider().getConfig().timeout()) .readTimeout(request.getSpider().getConfig().timeout()) .stream(); log.debug("[{}] 下载完毕", request.getUrl()); Response response = new Response(request, result); scheduler.addResponse(response); } } ================================================ FILE: src/main/java/io/github/biezhi/elves/event/ElvesEvent.java ================================================ package io.github.biezhi.elves.event; /** * 事件枚举 * * @author biezhi * @date 2018/1/11 */ public enum ElvesEvent { GLOBAL_STARTED, SPIDER_STARTED } ================================================ FILE: src/main/java/io/github/biezhi/elves/event/EventManager.java ================================================ package io.github.biezhi.elves.event; import io.github.biezhi.elves.config.Config; import java.util.*; import java.util.function.Consumer; /** * 事件管理器 * * @author biezhi * @date 2018/1/11 */ public class EventManager { private static final Map>> elvesEventConsumerMap = new HashMap<>(); public static void registerEvent(ElvesEvent elvesEvent, Consumer consumer) { List> consumers = elvesEventConsumerMap.get(elvesEvent); if (null == consumers) { consumers = new ArrayList<>(); } consumers.add(consumer); elvesEventConsumerMap.put(elvesEvent, consumers); } public static void fireEvent(ElvesEvent elvesEvent, Config config) { Optional.ofNullable(elvesEventConsumerMap.get(elvesEvent)).ifPresent(consumers -> consumers.forEach(consumer -> consumer.accept(config))); } } ================================================ FILE: src/main/java/io/github/biezhi/elves/pipeline/Pipeline.java ================================================ package io.github.biezhi.elves.pipeline; import io.github.biezhi.elves.request.Request; import io.github.biezhi.elves.spider.Spider; /** * 数据处理接口 * * @author biezhi * @date 2018/1/12 
*/
public interface Pipeline<T> {

    /**
     * Handles one parsed item.
     *
     * @param item    the item a parser put into its result
     * @param request the request whose response produced the item
     */
    void process(T item, Request request);

}
================================================ FILE: src/main/java/io/github/biezhi/elves/request/Parser.java ================================================
package io.github.biezhi.elves.request;

import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.response.Response;

/**
 * Parser: turns a downloaded {@link Response} into a {@link Result}.
 *
 * @param <T> type of the item carried by the result
 * @author biezhi
 * @date 2018/1/12
 */
public interface Parser<T> {

    /**
     * Parses one response.
     *
     * @param response the downloaded response
     * @return the extracted item and any follow-up requests
     */
    Result<T> parse(Response response);

}
================================================ FILE: src/main/java/io/github/biezhi/elves/request/Request.java ================================================ package io.github.biezhi.elves.request; import io.github.biezhi.elves.spider.Spider; import lombok.Getter; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; /** * Request 请求 * * @author biezhi * @date 2018/1/11 */ @Getter public class Request { private Spider spider; private String url; private String method = "GET"; private Map headers = new HashMap<>(); private Map cookies = new HashMap<>(); private String contentType = "text/html; charset=UTF-8"; private String charset = "UTF-8"; private Parser parser; public Request(Spider spider, String url, Parser parser) { this.spider = spider; this.url = url; this.parser = parser; this.header("User-Agent", spider.getConfig().userAgent()); } public Request header(String key, String value) { this.headers.put(key, value); return this; } public Request cookie(String key, String value) { this.cookies.put(key, value); return this; } public String header(String key) { return this.headers.get(key); } public String cookie(String key) { return this.cookies.get(key); } public void setParser(Parser parser) { this.parser = parser; } public String contentType() { return contentType; } public Request contentType(String contentType) { this.contentType = contentType; return this; } public String charset() { return charset; } public 
Request charset(String charset) { this.charset = charset; return this; } public Request method(String method) { this.method = method; return this; } public String method() { return this.method; } } ================================================ FILE: src/main/java/io/github/biezhi/elves/response/Body.java ================================================ package io.github.biezhi.elves.response; import org.jsoup.Jsoup; import org.jsoup.select.Elements; import us.codecraft.xsoup.XElements; import us.codecraft.xsoup.Xsoup; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; /** * 响应Body * * @author biezhi * @date 2018/1/12 */ public class Body { private final InputStream inputStream; private final String charset; private String bodyString; public Body(InputStream inputStream, String charset) { this.inputStream = inputStream; this.charset = charset; } @Override public String toString() { if (null == this.bodyString) { StringBuilder html = new StringBuilder(100); try { BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, charset)); String temp; while ((temp = br.readLine()) != null) { html.append(temp).append("\n"); } } catch (Exception e) { e.printStackTrace(); } this.bodyString = html.toString(); } return this.bodyString; } public InputStream getInputStream() { return inputStream; } public Elements css(String css) { return Jsoup.parse(this.toString()).select(css); } public XElements xpath(String xpath) { return Xsoup.compile(xpath).evaluate(Jsoup.parse(this.toString())); } } ================================================ FILE: src/main/java/io/github/biezhi/elves/response/Response.java ================================================ package io.github.biezhi.elves.response; import io.github.biezhi.elves.request.Request; import lombok.Getter; import java.io.InputStream; /** * 响应对象 * * @author biezhi * @date 2018/1/11 */ public class Response { @Getter private Request request; private Body body; public 
Response(Request request, InputStream inputStream) { this.request = request; this.body = new Body(inputStream, request.charset()); } public Body body() { return body; } } ================================================ FILE: src/main/java/io/github/biezhi/elves/response/Result.java ================================================ package io.github.biezhi.elves.response; import io.github.biezhi.elves.request.Request; import io.github.biezhi.elves.utils.ElvesUtils; import lombok.Data; import lombok.NoArgsConstructor; import java.util.ArrayList; import java.util.List; /** * 响应结果封装 *

* 存储 Item 数据和新添加的 Request 列表 * * @author biezhi * @date 2018/1/12 */
@Data
@NoArgsConstructor
public class Result<T> {

    // Follow-up requests discovered while parsing
    private List<Request> requests = new ArrayList<>();
    // Parsed payload; the engine runs the pipelines only when it is non-null
    private T             item;

    public Result(T item) {
        this.item = item;
    }

    /**
     * Appends one follow-up request.
     *
     * @return this result, for chaining
     */
    public Result<T> addRequest(Request request) {
        this.requests.add(request);
        return this;
    }

    /**
     * Appends several follow-up requests; a null or empty list is ignored.
     *
     * @return this result, for chaining
     */
    public Result<T> addRequests(List<Request> requests) {
        if (!ElvesUtils.isEmpty(requests)) {
            this.requests.addAll(requests);
        }
        return this;
    }

}
================================================ FILE: src/main/java/io/github/biezhi/elves/scheduler/Scheduler.java ================================================
package io.github.biezhi.elves.scheduler;

import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import lombok.extern.slf4j.Slf4j;

import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Crawler scheduler: two unbounded blocking queues, one with the requests
 * waiting to be downloaded and one with the responses waiting to be parsed.
 * <p>
 * When a blocking call is interrupted the error is logged and the thread's
 * interrupt flag is restored, so the caller can still see the interruption.
 *
 * @author biezhi
 * @date 2018/1/12
 */
@Slf4j
public class Scheduler {

    private BlockingQueue<Request>  pending = new LinkedBlockingQueue<>();
    private BlockingQueue<Response> result  = new LinkedBlockingQueue<>();

    /**
     * Queues a request for download.
     */
    public void addRequest(Request request) {
        try {
            this.pending.put(request);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("向调度器添加 Request 出错", e);
        }
    }

    /**
     * Queues a downloaded response for parsing.
     */
    public void addResponse(Response response) {
        try {
            this.result.put(response);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("向调度器添加 Response 出错", e);
        }
    }

    /**
     * @return {@code true} when at least one request is waiting
     */
    public boolean hasRequest() {
        return !pending.isEmpty();
    }

    /**
     * Takes the next request, waiting until one is available.
     *
     * @return the request, or {@code null} if the wait was interrupted
     */
    public Request nextRequest() {
        try {
            return pending.take();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("从调度器获取 Request 出错", e);
            return null;
        }
    }

    /**
     * @return {@code true} when at least one response is waiting
     */
    public boolean hasResponse() {
        return !result.isEmpty();
    }

    /**
     * Takes the next response, waiting until one is available.
     *
     * @return the response, or {@code null} if the wait was interrupted
     */
    public Response nextResponse() {
        try {
            return result.take();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("从调度器获取 Response 出错", e);
            return null;
        }
    }

    /**
     * Queues every request of the list, in order.
     */
    public void addRequests(List<Request> requests) {
        requests.forEach(this::addRequest);
    }

    /**
     * Drops all pending requests; responses already downloaded are kept.
     */
    public void clear() {
        pending.clear();
    }

}
================================================ FILE: 
src/main/java/io/github/biezhi/elves/spider/Spider.java ================================================ package io.github.biezhi.elves.spider; import io.github.biezhi.elves.config.Config; import io.github.biezhi.elves.event.ElvesEvent; import io.github.biezhi.elves.event.EventManager; import io.github.biezhi.elves.pipeline.Pipeline; import io.github.biezhi.elves.request.Parser; import io.github.biezhi.elves.request.Request; import io.github.biezhi.elves.response.Response; import io.github.biezhi.elves.response.Result; import lombok.Data; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.function.Consumer; /** * 爬虫基类 * * @author biezhi * @date 2018/1/11 */ @Data public abstract class Spider { protected String name; protected Config config; protected List startUrls = new ArrayList<>(); protected List pipelines = new ArrayList<>(); protected List requests = new ArrayList<>(); public Spider(String name) { this.name = name; EventManager.registerEvent(ElvesEvent.SPIDER_STARTED, this::onStart); } public Spider startUrls(String... 
urls) { this.startUrls.addAll(Arrays.asList(urls)); return this; } /** * 爬虫启动前执行 */ public void onStart(Config config) { } /** * 添加 Pipeline 处理 */ protected Spider addPipeline(Pipeline pipeline) { this.pipelines.add(pipeline); return this; } /** * 构建一个Request */ public Request makeRequest(String url) { return makeRequest(url, this::parse); } public Request makeRequest(String url, Parser parser) { return new Request(this, url, parser); } /** * 解析 DOM */ protected abstract Result parse(Response response); protected void resetRequest(Consumer requestConsumer) { this.resetRequest(this.requests, requestConsumer); } protected void resetRequest(List requests, Consumer requestConsumer) { requests.forEach(requestConsumer::accept); } } ================================================ FILE: src/main/java/io/github/biezhi/elves/utils/ElvesUtils.java ================================================
package io.github.biezhi.elves.utils;

import java.util.Collection;
import java.util.concurrent.TimeUnit;

/**
 * Elves Utils: small static helpers shared by the engine.
 *
 * @author biezhi
 * @date 2018/1/12
 */
public class ElvesUtils {

    /**
     * Sleeps for the given number of milliseconds.
     * <p>
     * If the thread is interrupted the method returns early and restores the
     * interrupt flag, so a caller that loops around this method can detect the
     * interruption instead of having it silently discarded.
     *
     * @param time duration in milliseconds
     */
    public static void sleep(long time) {
        try {
            TimeUnit.MILLISECONDS.sleep(time);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * @param collection collection to test, may be {@code null}
     * @return {@code true} if the collection is {@code null} or has no elements
     */
    public static boolean isEmpty(Collection<?> collection) {
        return null == collection || collection.isEmpty();
    }

}
================================================ FILE: src/main/java/io/github/biezhi/elves/utils/NamedThreadFactory.java ================================================ package io.github.biezhi.elves.utils; import java.util.concurrent.ThreadFactory; import java.util.concurrent.atomic.LongAdder; public class NamedThreadFactory implements ThreadFactory { private final String prefix; private final LongAdder threadNumber = new LongAdder(); public NamedThreadFactory(String prefix) { this.prefix = prefix; } @Override public Thread newThread(Runnable runnable) { threadNumber.add(1); return new Thread(runnable, prefix + "@thread-" + threadNumber.intValue()); } 
}

================================================
FILE: src/test/java/io/github/biezhi/elves/event/ElvesEventTest.java
================================================
package io.github.biezhi.elves.event;

import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;

/**
 * Manual check that a start callback registered through {@code onStart} can be
 * attached to an engine built from an anonymous spider.
 *
 * @author biezhi
 * @date 2018/1/12
 */
public class ElvesEventTest {

    /**
     * Builds an anonymous spider whose parser wraps the response body's string
     * form in a {@link Result}, registers a start callback that prints a marker
     * line, and starts the engine.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Elves.me(new Spider("测试爬虫") {
            @Override
            public Result parse(Response response) {
                return new Result<>(response.body().toString());
            }
        }, Config.me()).onStart(config -> System.out.println("asasas")).start();
    }

}

================================================
FILE: src/test/java/io/github/biezhi/elves/examples/DoubanExample.java
================================================
package io.github.biezhi.elves.examples;

import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.List;
import java.util.stream.Collectors;

/**
 * Douban movie example.
 *
 * @author biezhi
 * @date 2018/1/11
 */
public class DoubanExample {

    /**
     * Spider that starts from six Douban movie tag pages, collects the link
     * texts matched on each page and follows the "next page" link.
     */
    @Slf4j
    static class DoubanSpider extends Spider {

        public DoubanSpider(String name) {
            super(name);
            this.startUrls(
                    "https://movie.douban.com/tag/爱情",
                    "https://movie.douban.com/tag/喜剧",
                    "https://movie.douban.com/tag/动画",
                    "https://movie.douban.com/tag/动作",
                    "https://movie.douban.com/tag/史诗",
                    "https://movie.douban.com/tag/犯罪");
        }

        /**
         * Registers a pipeline that only logs each parsed item.
         *
         * @param config the engine configuration (unused here)
         */
        @Override
        public void onStart(Config config) {
            this.addPipeline((Pipeline<List<String>>) (item, request) -> log.info("保存到文件: {}", item));
        }

        /**
         * Extracts the text of every element matched by
         * {@code #content table .pl2 a} and, when a next-page link is present,
         * schedules a request for it that is parsed by this same method.
         *
         * @param response the downloaded page
         * @return a result carrying the list of texts and at most one follow-up request
         */
        @Override
        public Result parse(Response response) {
            Result<List<String>> result = new Result<>();

            Elements elements = response.body().css("#content table .pl2 a");
            List<String> titles = elements.stream()
                    .map(Element::text)
                    .collect(Collectors.toList());
            result.setItem(titles);

            // Fetch the next-page URL.
            Elements nextEl = response.body().css("#content > div > div.article > div.paginator > span.next > a");
            if (null != nextEl && !nextEl.isEmpty()) {
                String nextPageUrl = nextEl.get(0).attr("href");
                Request nextReq = this.makeRequest(nextPageUrl, this::parse);
                result.addRequest(nextReq);
            }
            return result;
        }
    }

    public static void main(String[] args) {
        DoubanSpider doubanSpider = new DoubanSpider("豆瓣电影");
        Elves.me(doubanSpider, Config.me()).start();
    }

}

================================================
FILE: src/test/java/io/github/biezhi/elves/examples/MeiziExample.java
================================================
package io.github.biezhi.elves.examples;

import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.config.UserAgent;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Parser;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * Meizitu image-download example.
 *
 * @author biezhi
 * @date 2018/1/12
 */
public class MeiziExample {

    /**
     * Spider that walks the listing pages, requests each linked detail page
     * with {@link PictureParser}, and downloads the image URLs those pages yield.
     */
    @Slf4j
    static class MeiziSpider extends Spider {

        // NOTE(review): machine-specific path; adjust before running this example.
        private final String storageDir = "/Users/biezhi/Desktop/meizi";

        public MeiziSpider(String name) {
            super(name);
            this.startUrls(
                    "http://www.meizitu.com/a/pure.html",
                    "http://www.meizitu.com/a/cute.html",
                    "http://www.meizitu.com/a/sexy.html",
                    "http://www.meizitu.com/a/fuli.html",
                    "http://www.meizitu.com/a/legs.html");
        }

        /**
         * Registers the download pipeline and applies the gb2312 settings to
         * the requests already held by this spider.
         *
         * @param config the engine configuration (unused here)
         */
        @Override
        public void onStart(Config config) {
            this.addPipeline((Pipeline<List<String>>) (item, request) -> {
                item.forEach(imgUrl -> {
                    log.info("开始下载: {}", imgUrl);
                    // The page URL is sent as Referer with every image request.
                    // NOTE(review): the file name is the current millisecond, so two
                    // downloads finishing in the same millisecond would share a name.
                    io.github.biezhi.request.Request.get(imgUrl)
                            .header("Referer", request.getUrl())
                            .header("User-Agent", UserAgent.CHROME_FOR_MAC)
                            .connectTimeout(20_000)
                            .readTimeout(20_000)
                            .receive(new File(storageDir, System.currentTimeMillis() + ".jpg"));
                });
                log.info("[{}] 图片下载 OJ8K.", request.getUrl());
            });

            this.requests.forEach(this::resetRequest);
        }

        /**
         * Sets the content type and charset of a request to gb2312.
         *
         * @param request the request to adjust
         * @return the same request instance
         */
        private Request resetRequest(Request request) {
            request.contentType("text/html; charset=gb2312");
            request.charset("gb2312");
            return request;
        }

        /**
         * Parses a listing page: schedules one {@link PictureParser} request per
         * matched detail link and, when a link labelled "下一页" exists, one more
         * request for the next listing page.
         *
         * @param response the downloaded listing page
         * @return a result that carries only follow-up requests
         */
        @Override
        protected Result parse(Response response) {
            // NOTE(review): the type argument of this local is not visible in this view.
            Result result = new Result<>();

            Elements elements = response.body().css("#maincontent > div.inWrap > ul > li:nth-child(1) > div > div > a");
            log.info("elements size: {}", elements.size());

            List<Request> requests = elements.stream()
                    .map(element -> element.attr("href"))
                    .map(href -> MeiziSpider.this.makeRequest(href, new MeiziSpider.PictureParser()))
                    .map(this::resetRequest)
                    .collect(Collectors.toList());
            result.addRequests(requests);

            // Fetch the next-page URL.
            Optional<Element> nextEl = response.body().css("#wp_page_numbers > ul > li > a").stream()
                    .filter(element -> "下一页".equals(element.text()))
                    .findFirst();

            nextEl.ifPresent(nextLink -> {
                String nextPageUrl = "http://www.meizitu.com/a/" + nextLink.attr("href");
                Request nextReq = MeiziSpider.this.makeRequest(nextPageUrl, this::parse);
                result.addRequest(this.resetRequest(nextReq));
            });
            return result;
        }

        /**
         * Parser for a detail page: collects the {@code src} attribute of every
         * element matched by {@code #picture > p > img}.
         */
        static class PictureParser implements Parser<List<String>> {
            @Override
            public Result<List<String>> parse(Response response) {
                Elements elements = response.body().css("#picture > p > img");
                List<String> src = elements.stream()
                        .map(element -> element.attr("src"))
                        .collect(Collectors.toList());
                return new Result<>(src);
            }
        }

    }

    public static void main(String[] args) {
        MeiziSpider meiziSpider = new MeiziSpider("妹子图");
        Elves.me(meiziSpider, Config.me().delay(3000)).start();
    }

}

================================================
FILE: 
src/test/java/io/github/biezhi/elves/examples/News163Example.java
================================================
package io.github.biezhi.elves.examples;

import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;

import java.util.List;
import java.util.stream.Collectors;

/**
 * NetEase (163) news example.
 *
 * @author biezhi
 * @date 2018/1/15
 */
public class News163Example {

    /**
     * Spider that reads six ranking pages and prints the link texts found on
     * each of them.
     */
    @Slf4j
    static class News163Spider extends Spider {

        public News163Spider(String name) {
            super(name);
            this.startUrls(
                    "http://news.163.com/special/0001386F/rank_news.html",
                    "http://news.163.com/special/0001386F/rank_ent.html",    // entertainment
                    "http://news.163.com/special/0001386F/rank_sports.html", // sports
                    "http://news.163.com/special/0001386F/rank_tech.html",   // technology
                    "http://news.163.com/special/0001386F/game_rank.html",   // games
                    "http://news.163.com/special/0001386F/rank_book.html");  // books
        }

        /**
         * Registers a pipeline that prints every item to standard output and
         * switches the requests held by this spider to gb2312.
         *
         * @param config the engine configuration (unused here)
         */
        @Override
        public void onStart(Config config) {
            this.addPipeline((Pipeline<List<String>>) (item, request) -> item.forEach(System.out::println));
            this.requests.forEach(request -> {
                request.contentType("text/html; charset=gb2312");
                request.charset("gb2312");
            });
        }

        /**
         * Collects the text of every element matched by the ranking-table selector.
         *
         * @param response the downloaded ranking page
         * @return a result whose item is the list of collected texts
         */
        @Override
        protected Result parse(Response response) {
            List<String> titles = response.body().css("div.areabg1 .area-half.left div.tabContents td a").stream()
                    .map(Element::text)
                    .collect(Collectors.toList());

            return new Result<>(titles);
        }
    }

    public static void main(String[] args) {
        Elves.me(new News163Spider("网易新闻")).start();
    }

}

================================================
FILE: src/test/java/io/github/biezhi/elves/examples/QiubaiExample.java
================================================
package io.github.biezhi.elves.examples;

import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;

import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * Qiushibaike example.
 *
 * @author biezhi
 * @date 2018/1/15
 */
public class QiubaiExample {

    private static final String BASE_URL = "https://www.qiushibaike.com";

    /**
     * Spider that starts at {@link #BASE_URL}, prints the text of each matched
     * content element and follows the link labelled "下一页".
     */
    @Slf4j
    static class QiubaiSpider extends Spider {

        public QiubaiSpider(String name) {
            super(name);
            this.startUrls(BASE_URL);
        }

        /**
         * Registers a pipeline that logs a banner and prints every item
         * followed by an end marker.
         *
         * @param config the engine configuration (unused here)
         */
        @Override
        public void onStart(Config config) {
            this.addPipeline((Pipeline<List<String>>) (items, request) -> {
                log.info("=== 段子来了 ===");
                items.forEach(item -> System.out.println("\r\n" + item + "\r\n============END==========\r\n"));
            });
        }

        /**
         * Collects the text of every matched content element and, when a link
         * labelled "下一页" exists, schedules a request for the next page.
         *
         * @param response the downloaded page
         * @return a result carrying the texts and at most one follow-up request
         */
        @Override
        protected Result parse(Response response) {
            Result<List<String>> result = new Result<>();

            // NOTE(review): the search string was lost where this file was flattened;
            // "<br/>" is the presumed original - confirm against the repository.
            List<String> items = response.body().css("#content-left div.article div.content span").stream()
                    .map(element -> element.text().replace("<br/>", "\r\n"))
                    .collect(Collectors.toList());
            result.setItem(items);

            // Next page: the label sits in a span, the href on its parent element.
            Optional<Element> nextEl = response.body().css("ul.pagination a span").stream()
                    .filter(element -> "下一页".equals(element.text()))
                    .map(Element::parent)
                    .findFirst();

            nextEl.ifPresent(nextLink -> {
                String nextPageUrl = BASE_URL + nextLink.attr("href");
                Request nextReq = QiubaiSpider.this.makeRequest(nextPageUrl, this::parse);
                result.addRequest(nextReq);
            });
            return result;
        }

    }

    public static void main(String[] args) {
        QiubaiSpider qiubaiSpider = new QiubaiSpider("糗事百科");
        Elves.me(qiubaiSpider, Config.me().delay(2000)).start();
    }

}