Repository: ERYhua/nyaHentaiCrawler
Branch: local
Commit: 9e62548ae87a
Files: 11
Total size: 34.7 KB
Directory structure:
gitextract_upojyjqn/
├── .idea/
│ ├── compiler.xml
│ ├── misc.xml
│ ├── vcs.xml
│ └── workspace.xml
├── README.md
├── nyaCrawler.iml
├── pom.xml
├── src/
│ └── main/
│ └── java/
│ └── com/
│ └── cn/
│ ├── main/
│ │ └── nyaPictureMain.java
│ └── util/
│ └── HttpClientUtil.java
├── target/
│ └── classes/
│ └── META-INF/
│ └── nyaCrawler.kotlin_module
└── web/
└── WEB-INF/
└── web.xml
================================================
FILE CONTENTS
================================================
================================================
FILE: .idea/compiler.xml
================================================
================================================
FILE: .idea/misc.xml
================================================
================================================
FILE: .idea/vcs.xml
================================================
================================================
FILE: .idea/workspace.xml
================================================
main
1564232410913
1564232410913
1576766569859
1576766569859
1576766587111
1576766587111
No facets are configured
Web|nyaCrawler
================================================
FILE: README.md
================================================
# nyaHentaiCrawler
喵绅士本子爬取(自用)
用于抓取喵绅士的本子,启用main输入本子对应网址即可使用,个人使用版
//注意!!!
//因源站更新过几版,且添加了反爬延迟等措施,因此已不再适用,仅做参考
2021/1/7更新
现在添加代理,可以本地配置代理爬取,源站依然可用
2021/3/15
原网址git挂了,现有其他地址访问
2021/12/3
原网址git又可以使用了,地址直接谷歌搜索喵绅士就行
原本准备用线程爬取,无需等待,想了下没必要能用就行,现在能看本子的地方太多了
2022/1/25
喵绅士阵亡了,可惜
================================================
FILE: nyaCrawler.iml
================================================
================================================
FILE: pom.xml
================================================
4.0.0
com.cn
nyaCrawler
1.0-SNAPSHOT
org.apache.httpcomponents
httpclient
4.5.2
commons-io
commons-io
2.5
org.jsoup
jsoup
1.11.3
================================================
FILE: src/main/java/com/cn/main/nyaPictureMain.java
================================================
package com.cn.main;
import com.cn.util.HttpClientUtil;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Command-line crawler: for each configured gallery page it resolves the gallery id,
 * title and page count, downloads every page image into a per-gallery directory under
 * {@link #fileSource}, and finally opens that directory.
 */
public class nyaPictureMain {
    // Root output directory; one sub-directory per gallery is created beneath it.
    private static String fileSource = "E://nyaManhua//new//";

    /**
     * Crawls the gallery pages listed in the method body and then opens the output
     * directory with {@code cmd /c start}.
     *
     * @param args unused
     * @throws Exception propagated from {@link #crawlerNyaUrl(List)} or from launching the command
     */
    public static void main(String[] args) throws Exception {
        List<String> urlList = new ArrayList<String>();
        // Gallery page addresses: add one urlList.add("...") line per gallery.
        // Blank entries are allowed and are skipped by crawlerNyaUrl.
        urlList.add("https://zha.doghentai.com/g/338012/");
        nyaPictureMain.crawlerNyaUrl(urlList);
        String exSite = "cmd /c start " + fileSource;
        Runtime.getRuntime().exec(exSite);
    }

    /**
     * Downloads images {@code 1..picSum} from {@code fileUrl + i + suffix} and stores each
     * one as {@code intputFile + i + suffix}. Requests go through the local proxy
     * 127.0.0.1:7890.
     *
     * <p>The first exception stops the remaining downloads and is printed to standard
     * output; it is not rethrown. Responses whose status is not 200 are skipped so that
     * an error page is never saved as an image.
     *
     * @param picSum     number of images to fetch
     * @param fileUrl    URL prefix in front of the image index
     * @param intputFile local path prefix in front of the image index
     * @param suffix     file extension including the leading dot, e.g. {@code ".jpg"}
     */
    public static void crawlerNyaPic(int picSum, String fileUrl, String intputFile, String suffix) {
        HttpHost proxy = new HttpHost("127.0.0.1", 7890);
        // Timeouts are in milliseconds.
        RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setConnectTimeout(1000).setSocketTimeout(30000)
                .setProxy(proxy).build();
        // One client serves the whole gallery and is always closed, even on failure.
        try (CloseableHttpClient httpClient =
                     HttpClients.custom().setDefaultRequestConfig(defaultRequestConfig).build()) {
            for (int i = 1; i <= picSum; i++) {
                HttpGet httpGet = new HttpGet(fileUrl + i + suffix);
                // Browser-like request headers.
                httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36");
                httpGet.setHeader("accept", "image/avif,image/webp,image/apng,image/*,*/*;q=0.8");
                httpGet.setHeader("accept-encoding", "gzip, deflate, br");
                httpGet.setHeader("referer", "https://zha.doghentai.com/");
                httpGet.setHeader("sec-fetch-dest", "image");
                httpGet.setHeader("accept-language", "zh-CN,zh;q=0.9,en;q=0.8");
                try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                    int status = response.getStatusLine().getStatusCode();
                    if (status != 200) {
                        System.out.println("skipped " + i + suffix + " (HTTP " + status + ")");
                        continue;
                    }
                    HttpEntity entity = response.getEntity();
                    if (null != entity) {
                        InputStream inputStream = entity.getContent();
                        // Writes the image; missing parent directories are created by FileUtils.
                        FileUtils.copyInputStreamToFile(inputStream, new File(intputFile + i + suffix));
                        System.out.println(i + suffix);
                    }
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Processes every non-empty address in {@code urlList}: fetches the gallery page, reads
     * the cover image's {@code data-src} (gallery id and file extension) and {@code alt}
     * (title), counts the {@code div.thumb-container} elements, then downloads the images
     * through {@link #crawlerNyaPic(int, String, String, String)} and pauses three seconds.
     *
     * <p>Galleries whose page cannot be fetched or parsed are reported on standard output
     * and skipped instead of aborting the whole run.
     *
     * @param urlList gallery page addresses; {@code null} or empty entries are ignored
     * @throws Exception if the pause between galleries is interrupted, or if an underlying
     *                   call throws an unchecked exception
     */
    public static void crawlerNyaUrl(List<String> urlList) throws Exception {
        for (String url : urlList) {
            if (url == null || url.length() == 0) {
                continue;
            }
            String html = HttpClientUtil.getSource(url);
            Document document = Jsoup.parse(html);
            // selectFirst returns null when nothing matches (e.g. the fetch returned no HTML).
            Element container = document.selectFirst("div.container");
            Element element = container == null ? null : container.selectFirst("a");
            if (element == null) {
                System.out.println("skipped " + url + " (cover link not found)");
                continue;
            }
            Elements coverImg = element.select("img");
            String coverImgUrl = coverImg.attr("data-src");
            // Path segments of the cover URL; the second-to-last one is the gallery id.
            String[] ourStr = coverImgUrl.split("/");
            // Dot-separated pieces of the cover URL; the last one is the file extension.
            String[] oursuffix = coverImgUrl.split("\\.");
            if (ourStr.length < 2) {
                System.out.println("skipped " + url + " (cover image address not found)");
                continue;
            }
            int benziN;
            try {
                benziN = Integer.parseInt(ourStr[ourStr.length - 2]);
            } catch (NumberFormatException e) {
                System.out.println("skipped " + url + " (gallery id is not numeric)");
                continue;
            }
            // One thumbnail container per page gives the page count.
            Elements picSum = document.select("div.thumb-container");
            int count = picSum.size();
            // Gallery title, stripped of spaces and of characters that are illegal in Windows paths.
            String benziName = coverImg.attr("alt");
            benziName = benziName.replaceAll("[\\\\/:*?\"<>| ]", "");
            String suffix = "." + oursuffix[oursuffix.length - 1];
            String fileUrl = "https://i0.nyacdn.com/galleries/" + benziN + "/";
            String intputFile = fileSource + benziName + "//";
            nyaPictureMain.crawlerNyaPic(count, fileUrl, intputFile, suffix);
            // Pause a few seconds after each gallery.
            Thread.sleep(3000);
        }
        System.out.println("喵变态图片缓存成功!!!!");
    }
}
================================================
FILE: src/main/java/com/cn/util/HttpClientUtil.java
================================================
package com.cn.util;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HTTP helper for fetching a page body as text.
 */
public class HttpClientUtil {
    /**
     * Performs a GET request for {@code url} with a browser-like User-Agent and returns the
     * response body decoded as UTF-8.
     *
     * @param url address to fetch
     * @return the body when the server answers 200 with an entity; otherwise an empty string
     *         (also when an {@link IOException} occurs, which is printed and not rethrown)
     */
    public static String getSource(String url) {
        String html = "";
        HttpGet httpget = new HttpGet(url);
        // Mimic a browser so the server is less likely to answer 403 Forbidden.
        httpget.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");
        // Both the client and the response are closed on every path (response first).
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK
                    && response.getEntity() != null) {
                html = EntityUtils.toString(response.getEntity(), "utf-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }
}
================================================
FILE: web/WEB-INF/web.xml
================================================