parser) {
this.parser = parser;
}
/**
 * Returns the content type currently stored on this request.
 *
 * @return the value of the {@code contentType} field
 */
public String contentType() {
    return this.contentType;
}
/**
 * Stores the given content type on this request.
 *
 * @param contentType value assigned to the {@code contentType} field
 * @return this request, so that calls can be chained
 */
public Request contentType(String contentType) {
    this.contentType = contentType;
    return this;
}
/**
 * Returns the charset name currently stored on this request.
 *
 * @return the value of the {@code charset} field
 */
public String charset() {
    return this.charset;
}
/**
 * Stores the given charset name on this request.
 *
 * @param charset value assigned to the {@code charset} field
 * @return this request, so that calls can be chained
 */
public Request charset(String charset) {
    this.charset = charset;
    return this;
}
/**
 * Stores the given method name on this request.
 *
 * @param method value assigned to the {@code method} field
 * @return this request, so that calls can be chained
 */
public Request method(String method) {
    this.method = method;
    return this;
}
/**
 * Returns the method name currently stored on this request.
 *
 * @return the value of the {@code method} field
 */
public String method() {
    return method;
}
}
================================================
FILE: src/main/java/io/github/biezhi/elves/response/Body.java
================================================
package io.github.biezhi.elves.response;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import us.codecraft.xsoup.XElements;
import us.codecraft.xsoup.Xsoup;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Response body.
 *
 * <p>Wraps the raw response stream together with the charset name used to
 * decode it, and offers CSS and XPath selection over the decoded text.
 *
 * @author biezhi
 * @date 2018/1/12
 */
public class Body {

    private final InputStream inputStream;
    private final String charset;

    /** Decoded text; filled in by the first call to {@link #toString()}. */
    private String bodyString;

    /**
     * @param inputStream stream the body text is read from
     * @param charset     charset name used to decode the stream
     */
    public Body(InputStream inputStream, String charset) {
        this.inputStream = inputStream;
        this.charset = charset;
    }

    /**
     * Reads the whole stream line by line (each line terminated with
     * {@code "\n"}), caches the result and returns it. Later calls return the
     * cached text without touching the stream again.
     *
     * <p>The stream is closed once it has been read. Reading is best effort:
     * on any failure the stack trace is printed and whatever was read so far
     * (possibly the empty string) becomes the cached body.
     *
     * @return the decoded body text, never {@code null}
     */
    @Override
    public String toString() {
        if (null == this.bodyString) {
            StringBuilder html = new StringBuilder(100);
            // InputStreamReader rejects a null charset name; decode as UTF-8 instead
            // of failing and returning an empty body.
            String charsetName = null == charset ? "UTF-8" : charset;
            // try-with-resources closes the reader (and the wrapped stream) even
            // when reading fails, so the underlying connection is not leaked.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, charsetName))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line).append("\n");
                }
            } catch (Exception e) {
                // Deliberately best effort: keep the partial content.
                e.printStackTrace();
            }
            this.bodyString = html.toString();
        }
        return this.bodyString;
    }

    /**
     * @return the stream passed to the constructor; it is consumed and closed
     *         once {@link #toString()} has been called
     */
    public InputStream getInputStream() {
        return inputStream;
    }

    /**
     * Parses the body text with Jsoup and applies a CSS selector.
     * The text is re-parsed on every call.
     *
     * @param css CSS selector
     * @return the elements matched by the selector
     */
    public Elements css(String css) {
        return Jsoup.parse(this.toString()).select(css);
    }

    /**
     * Parses the body text with Jsoup and evaluates an XPath expression via Xsoup.
     * The text is re-parsed on every call.
     *
     * @param xpath XPath expression
     * @return the evaluation result
     */
    public XElements xpath(String xpath) {
        return Xsoup.compile(xpath).evaluate(Jsoup.parse(this.toString()));
    }
}
================================================
FILE: src/main/java/io/github/biezhi/elves/response/Response.java
================================================
package io.github.biezhi.elves.response;
import io.github.biezhi.elves.request.Request;
import lombok.Getter;
import java.io.InputStream;
/**
 * Response object.
 *
 * <p>Pairs the request that produced the response with a {@link Body} built
 * from the response stream.
 *
 * @author biezhi
 * @date 2018/1/11
 */
public class Response {

    /** The request this response belongs to. */
    @Getter
    private Request request;

    /** Body wrapper around the response stream. */
    private Body body;

    /**
     * @param request     originating request; its charset is used to decode the body
     * @param inputStream raw response stream
     */
    public Response(Request request, InputStream inputStream) {
        String bodyCharset = request.charset();
        this.body = new Body(inputStream, bodyCharset);
        this.request = request;
    }

    /**
     * @return the body of this response
     */
    public Body body() {
        return this.body;
    }
}
================================================
FILE: src/main/java/io/github/biezhi/elves/response/Result.java
================================================
package io.github.biezhi.elves.response;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.utils.ElvesUtils;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
/**
 * Wrapper for a parse result.
 *
 * <p>Holds the extracted item data and the list of newly added requests.
 *
 * @param <T> type of the extracted item
 * @author biezhi
 * @date 2018/1/12
 */
@Data
@NoArgsConstructor
public class Result<T> {

    /** Follow-up requests collected while parsing. */
    private List<Request> requests = new ArrayList<>();

    /** Extracted item; stays {@code null} when created via the no-arg constructor. */
    private T item;

    /**
     * @param item the extracted item
     */
    public Result(T item) {
        this.item = item;
    }

    /**
     * Appends one follow-up request.
     *
     * @param request request to add
     * @return this result, for chaining
     */
    public Result<T> addRequest(Request request) {
        this.requests.add(request);
        return this;
    }

    /**
     * Appends all given follow-up requests; a {@code null} or empty list is ignored.
     *
     * @param requests requests to add
     * @return this result, for chaining
     */
    public Result<T> addRequests(List<Request> requests) {
        if (!ElvesUtils.isEmpty(requests)) {
            this.requests.addAll(requests);
        }
        return this;
    }
}
================================================
FILE: src/main/java/io/github/biezhi/elves/scheduler/Scheduler.java
================================================
package io.github.biezhi.elves.scheduler;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import lombok.extern.slf4j.Slf4j;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
/**
 * Crawler scheduler.
 *
 * <p>Keeps two blocking queues: requests waiting to be processed and
 * responses waiting to be consumed.
 *
 * @author biezhi
 * @date 2018/1/12
 */
@Slf4j
public class Scheduler {

    private final BlockingQueue<Request> pending = new LinkedBlockingQueue<>();
    private final BlockingQueue<Response> result = new LinkedBlockingQueue<>();

    /**
     * Enqueues a request. If the thread is interrupted, the error is logged
     * and the interrupt flag is restored so callers can still observe it.
     */
    public void addRequest(Request request) {
        try {
            this.pending.put(request);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("向调度器添加 Request 出错", e);
        }
    }

    /**
     * Enqueues a response. If the thread is interrupted, the error is logged
     * and the interrupt flag is restored.
     */
    public void addResponse(Response response) {
        try {
            this.result.put(response);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("向调度器添加 Response 出错", e);
        }
    }

    /**
     * @return {@code true} when at least one request is queued
     */
    public boolean hasRequest() {
        return !pending.isEmpty();
    }

    /**
     * Takes the next queued request, waiting until one is available.
     *
     * @return the next request, or {@code null} if the wait was interrupted
     *         (the interrupt flag is restored in that case)
     */
    public Request nextRequest() {
        try {
            return pending.take();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("从调度器获取 Request 出错", e);
            return null;
        }
    }

    /**
     * @return {@code true} when at least one response is queued
     */
    public boolean hasResponse() {
        return !result.isEmpty();
    }

    /**
     * Takes the next queued response, waiting until one is available.
     *
     * @return the next response, or {@code null} if the wait was interrupted
     *         (the interrupt flag is restored in that case)
     */
    public Response nextResponse() {
        try {
            return result.take();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("从调度器获取 Response 出错", e);
            return null;
        }
    }

    /**
     * Enqueues every request in the list, in order; a {@code null} list is ignored.
     */
    public void addRequests(List<Request> requests) {
        if (null == requests) {
            return;
        }
        requests.forEach(this::addRequest);
    }

    /**
     * Removes all queued requests. The response queue is left untouched.
     */
    public void clear() {
        pending.clear();
    }
}
================================================
FILE: src/main/java/io/github/biezhi/elves/spider/Spider.java
================================================
package io.github.biezhi.elves.spider;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.event.ElvesEvent;
import io.github.biezhi.elves.event.EventManager;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Parser;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import lombok.Data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;
/**
 * Base class for spiders.
 *
 * <p>Holds the spider name, its start URLs, the pipelines that receive parsed
 * items and the requests created for it. Subclasses implement
 * {@link #parse(Response)}.
 *
 * @author biezhi
 * @date 2018/1/11
 */
@Data
public abstract class Spider {

    protected String name;
    protected Config config;
    protected List<String> startUrls = new ArrayList<>();
    protected List<Pipeline> pipelines = new ArrayList<>();
    protected List<Request> requests = new ArrayList<>();

    /**
     * Creates the spider and registers {@link #onStart(Config)} as a handler
     * for the {@code SPIDER_STARTED} event.
     *
     * @param name spider name
     */
    public Spider(String name) {
        this.name = name;
        EventManager.registerEvent(ElvesEvent.SPIDER_STARTED, this::onStart);
    }

    /**
     * Appends the given URLs to the start URL list.
     *
     * @param urls URLs to add
     * @return this spider, for chaining
     */
    public Spider startUrls(String... urls) {
        this.startUrls.addAll(Arrays.asList(urls));
        return this;
    }

    /**
     * Executed before the spider starts. The default implementation does nothing.
     */
    public void onStart(Config config) {
    }

    /**
     * Adds a pipeline.
     *
     * @return this spider, for chaining
     */
    protected Spider addPipeline(Pipeline pipeline) {
        this.pipelines.add(pipeline);
        return this;
    }

    /**
     * Builds a request for the URL that is parsed by this spider's
     * {@link #parse(Response)} method.
     */
    public Request makeRequest(String url) {
        return makeRequest(url, this::parse);
    }

    /**
     * Builds a request for the URL that is parsed by the given parser.
     */
    public Request makeRequest(String url, Parser parser) {
        return new Request(this, url, parser);
    }

    /**
     * Parses the DOM of a response.
     */
    protected abstract Result parse(Response response);

    /**
     * Applies the consumer to every request held by this spider.
     */
    protected void resetRequest(Consumer<Request> requestConsumer) {
        this.resetRequest(this.requests, requestConsumer);
    }

    /**
     * Applies the consumer to every request in the given list.
     */
    protected void resetRequest(List<Request> requests, Consumer<Request> requestConsumer) {
        requests.forEach(requestConsumer::accept);
    }
}
================================================
FILE: src/main/java/io/github/biezhi/elves/utils/ElvesUtils.java
================================================
package io.github.biezhi.elves.utils;
import java.util.Collection;
import java.util.concurrent.TimeUnit;
/**
 * Small static helpers used across Elves.
 *
 * @author biezhi
 * @date 2018/1/12
 */
public class ElvesUtils {

    /**
     * Sleeps for the given number of milliseconds.
     *
     * <p>If the thread is interrupted while sleeping, the method returns early
     * and restores the interrupt flag so the caller can still react to it.
     *
     * @param time duration in milliseconds
     */
    public static void sleep(long time) {
        try {
            TimeUnit.MILLISECONDS.sleep(time);
        } catch (InterruptedException e) {
            // sleep() clears the flag when it throws; set it again rather than lose it.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * @param collection collection to test, may be {@code null}
     * @return {@code true} if the collection is {@code null} or has no elements
     */
    public static boolean isEmpty(Collection<?> collection) {
        return null == collection || collection.isEmpty();
    }
}
================================================
FILE: src/main/java/io/github/biezhi/elves/utils/NamedThreadFactory.java
================================================
package io.github.biezhi.elves.utils;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
/**
 * Thread factory that names threads {@code <prefix>@thread-<n>}, where
 * {@code n} starts at 1 and increases by one per created thread.
 */
public class NamedThreadFactory implements ThreadFactory {

    private final String prefix;

    // incrementAndGet() hands every caller a distinct number, even when
    // newThread() is invoked from several threads at once.
    private final AtomicLong threadNumber = new AtomicLong();

    /**
     * @param prefix text placed in front of {@code "@thread-<n>"} in each thread name
     */
    public NamedThreadFactory(String prefix) {
        this.prefix = prefix;
    }

    /**
     * Creates a new, not yet started thread for the runnable with the next
     * sequential name.
     */
    @Override
    public Thread newThread(Runnable runnable) {
        return new Thread(runnable, prefix + "@thread-" + threadNumber.incrementAndGet());
    }
}
================================================
FILE: src/test/java/io/github/biezhi/elves/event/ElvesEventTest.java
================================================
package io.github.biezhi.elves.event;
import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
/**
 * Manual check for the start event: runs a spider with an extra start
 * callback that prints a marker line.
 *
 * @author biezhi
 * @date 2018/1/12
 */
public class ElvesEventTest {

    public static void main(String[] args) {
        // Spider whose parse result is simply the response body text.
        Spider spider = new Spider("测试爬虫") {
            @Override
            public Result parse(Response response) {
                String text = response.body().toString();
                return new Result<>(text);
            }
        };

        Elves.me(spider, Config.me())
                .onStart(config -> System.out.println("asasas"))
                .start();
    }
}
================================================
FILE: src/test/java/io/github/biezhi/elves/examples/DoubanExample.java
================================================
package io.github.biezhi.elves.examples;
import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.List;
import java.util.stream.Collectors;
/**
* 豆瓣电影示例
*
* @author biezhi
* @date 2018/1/11
*/
public class DoubanExample {
@Slf4j
static class DoubanSpider extends Spider {
public DoubanSpider(String name) {
super(name);
this.startUrls(
"https://movie.douban.com/tag/爱情",
"https://movie.douban.com/tag/喜剧",
"https://movie.douban.com/tag/动画",
"https://movie.douban.com/tag/动作",
"https://movie.douban.com/tag/史诗",
"https://movie.douban.com/tag/犯罪");
}
@Override
public void onStart(Config config) {
this.addPipeline((Pipeline>) (item, request) -> log.info("保存到文件: {}", item));
}
public Result parse(Response response) {
Result> result = new Result<>();
Elements elements = response.body().css("#content table .pl2 a");
List titles = elements.stream().map(Element::text).collect(Collectors.toList());
result.setItem(titles);
// 获取下一页 URL
Elements nextEl = response.body().css("#content > div > div.article > div.paginator > span.next > a");
if (null != nextEl && nextEl.size() > 0) {
String nextPageUrl = nextEl.get(0).attr("href");
Request nextReq = this.makeRequest(nextPageUrl, this::parse);
result.addRequest(nextReq);
}
return result;
}
}
public static void main(String[] args) {
DoubanSpider doubanSpider = new DoubanSpider("豆瓣电影");
Elves.me(doubanSpider, Config.me()).start();
}
}
================================================
FILE: src/test/java/io/github/biezhi/elves/examples/MeiziExample.java
================================================
package io.github.biezhi.elves.examples;
import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.config.UserAgent;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Parser;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* 妹子图示例
*
* @author biezhi
* @date 2018/1/12
*/
public class MeiziExample {
@Slf4j
static class MeiziSpider extends Spider {
private String storageDir = "/Users/biezhi/Desktop/meizi";
public MeiziSpider(String name) {
super(name);
this.startUrls(
"http://www.meizitu.com/a/pure.html",
"http://www.meizitu.com/a/cute.html",
"http://www.meizitu.com/a/sexy.html",
"http://www.meizitu.com/a/fuli.html",
"http://www.meizitu.com/a/legs.html");
}
@Override
public void onStart(Config config) {
this.addPipeline((Pipeline>) (item, request) -> {
item.forEach(imgUrl -> {
log.info("开始下载: {}", imgUrl);
io.github.biezhi.request.Request.get(imgUrl)
.header("Referer", request.getUrl())
.header("User-Agent", UserAgent.CHROME_FOR_MAC)
.connectTimeout(20_000)
.readTimeout(20_000)
.receive(new File(storageDir, System.currentTimeMillis() + ".jpg"));
});
log.info("[{}] 图片下载 OJ8K.", request.getUrl());
});
this.requests.forEach(this::resetRequest);
}
private Request resetRequest(Request request) {
request.contentType("text/html; charset=gb2312");
request.charset("gb2312");
return request;
}
@Override
protected Result parse(Response response) {
Result result = new Result<>();
Elements elements = response.body().css("#maincontent > div.inWrap > ul > li:nth-child(1) > div > div > a");
log.info("elements size: {}", elements.size());
List requests = elements.stream()
.map(element -> element.attr("href"))
.map(href -> MeiziSpider.this.makeRequest(href, new MeiziSpider.PictureParser()))
.map(this::resetRequest)
.collect(Collectors.toList());
result.addRequests(requests);
// 获取下一页 URL
Optional nextEl = response.body().css("#wp_page_numbers > ul > li > a").stream().filter(element -> "下一页".equals(element.text())).findFirst();
if (nextEl.isPresent()) {
String nextPageUrl = "http://www.meizitu.com/a/" + nextEl.get().attr("href");
Request nextReq = MeiziSpider.this.makeRequest(nextPageUrl, this::parse);
result.addRequest(this.resetRequest(nextReq));
}
return result;
}
static class PictureParser implements Parser> {
@Override
public Result> parse(Response response) {
Elements elements = response.body().css("#picture > p > img");
List src = elements.stream().map(element -> element.attr("src")).collect(Collectors.toList());
return new Result<>(src);
}
}
}
public static void main(String[] args) {
MeiziSpider meiziSpider = new MeiziSpider("妹子图");
Elves.me(meiziSpider, Config.me().delay(3000)).start();
}
}
================================================
FILE: src/test/java/io/github/biezhi/elves/examples/News163Example.java
================================================
package io.github.biezhi.elves.examples;
import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import java.util.List;
import java.util.stream.Collectors;
/**
* 网易新闻示例
*
* @author biezhi
* @date 2018/1/15
*/
public class News163Example {
@Slf4j
static class News163Spider extends Spider {
public News163Spider(String name) {
super(name);
this.startUrls(
"http://news.163.com/special/0001386F/rank_news.html",
"http://news.163.com/special/0001386F/rank_ent.html", // 娱乐
"http://news.163.com/special/0001386F/rank_sports.html", // 体育
"http://news.163.com/special/0001386F/rank_tech.html", // 科技
"http://news.163.com/special/0001386F/game_rank.html", //游戏
"http://news.163.com/special/0001386F/rank_book.html"); // 读书
}
@Override
public void onStart(Config config) {
this.addPipeline((Pipeline>) (item, request) -> item.forEach(System.out::println));
this.requests.forEach(request -> {
request.contentType("text/html; charset=gb2312");
request.charset("gb2312");
});
}
@Override
protected Result parse(Response response) {
List titles = response.body().css("div.areabg1 .area-half.left div.tabContents td a").stream()
.map(Element::text)
.collect(Collectors.toList());
return new Result(titles);
}
}
public static void main(String[] args) {
Elves.me(new News163Spider("网易新闻")).start();
}
}
================================================
FILE: src/test/java/io/github/biezhi/elves/examples/QiubaiExample.java
================================================
package io.github.biezhi.elves.examples;
import io.github.biezhi.elves.Elves;
import io.github.biezhi.elves.config.Config;
import io.github.biezhi.elves.pipeline.Pipeline;
import io.github.biezhi.elves.request.Request;
import io.github.biezhi.elves.response.Response;
import io.github.biezhi.elves.response.Result;
import io.github.biezhi.elves.spider.Spider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Element;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* 糗事百科示例
*
* @author biezhi
* @date 2018/1/15
*/
public class QiubaiExample {
private static final String BASE_URL = "https://www.qiushibaike.com";
@Slf4j
static class QiubaiSpider extends Spider {
public QiubaiSpider(String name) {
super(name);
this.startUrls(BASE_URL);
}
@Override
public void onStart(Config config) {
this.addPipeline((Pipeline>) (items, request) -> {
log.info("=== 段子来了 ===");
items.forEach(item -> System.out.println("\r\n" + item + "\r\n============END==========\r\n"));
});
}
@Override
protected Result parse(Response response) {
Result result = new Result();
List items = response.body().css("#content-left div.article div.content span").stream()
.map(element -> element.text().replace("
", "\r\n"))
.collect(Collectors.toList());
result.setItem(items);
// 下一页
Optional nextEl = response.body().css("ul.pagination a span").stream()
.filter(element -> "下一页".equals(element.text()))
.map(Element::parent)
.findFirst();
if (nextEl.isPresent()) {
String nextPageUrl = BASE_URL + nextEl.get().attr("href");
Request nextReq = QiubaiSpider.this.makeRequest(nextPageUrl, this::parse);
result.addRequest(nextReq);
}
return result;
}
}
public static void main(String[] args) {
QiubaiSpider qiubaiSpider = new QiubaiSpider("糗事百科");
Elves.me(qiubaiSpider, Config.me().delay(2000)).start();
}
}