Repository: ERYhua/nyaHentaiCrawler Branch: local Commit: 9e62548ae87a Files: 11 Total size: 34.7 KB Directory structure: gitextract_upojyjqn/ ├── .idea/ │ ├── compiler.xml │ ├── misc.xml │ ├── vcs.xml │ └── workspace.xml ├── README.md ├── nyaCrawler.iml ├── pom.xml ├── src/ │ └── main/ │ └── java/ │ └── com/ │ └── cn/ │ ├── main/ │ │ └── nyaPictureMain.java │ └── util/ │ └── HttpClientUtil.java ├── target/ │ └── classes/ │ └── META-INF/ │ └── nyaCrawler.kotlin_module └── web/ └── WEB-INF/ └── web.xml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .idea/compiler.xml ================================================ ================================================ FILE: .idea/misc.xml ================================================ ================================================ FILE: .idea/vcs.xml ================================================ ================================================ FILE: .idea/workspace.xml ================================================ main 1564232410913 1576766569859 1576766587111 No facets are configured Web|nyaCrawler ================================================ FILE: README.md ================================================ # nyaHentaiCrawler 喵绅士本子爬取(自用) 用于抓取喵绅士的本子,启用main输入本子对应网址即可使用,个人使用版 //注意!!! 
//因源站更新过几版,且添加了反扒延迟等措施,因此已不在适用,仅做参考 2021/1/7更新 现在添加代理,可以本地配置代理爬取,源站依然可用 2021/3/15 原网址git挂了,现有其他地址访问 2021/12/3 原网址git又可以使用了,地址直接谷歌搜索喵绅士就行 原本准备用线程爬取,无需等待,想了下没必要能用就行,现在能看本子的地方太多了 2022/1/25 喵绅士阵亡了,可惜
================================================ FILE: nyaCrawler.iml ================================================
================================================ FILE: pom.xml ================================================
4.0.0 com.cn nyaCrawler 1.0-SNAPSHOT org.apache.httpcomponents httpclient 4.5.2 commons-io commons-io 2.5 org.jsoup jsoup 1.11.3
================================================ FILE: src/main/java/com/cn/main/nyaPictureMain.java ================================================
package com.cn.main;

import com.cn.util.HttpClientUtil;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Command-line entry point: for every configured gallery page URL, reads the
 * gallery page, derives the image host path from the cover thumbnail and
 * downloads the numbered page images into a per-gallery directory.
 */
public class nyaPictureMain {

    // Root directory that receives one sub-directory per gallery.
    private static final String fileSource = "E://nyaManhua//new//";

    /**
     * Downloads all configured galleries, then asks the Windows shell to open
     * the download root.
     *
     * @param args unused
     * @throws Exception if the crawl is interrupted or the shell command cannot be started
     */
    public static void main(String[] args) throws Exception {
        List<String> urlList = new ArrayList<>();
        // Gallery page URLs: add one urlList.add(...) per gallery.
        // Empty strings are tolerated and skipped by crawlerNyaUrl.
        urlList.add("https://zha.doghentai.com/g/338012/");

        nyaPictureMain.crawlerNyaUrl(urlList);

        // Windows only: "start <dir>" opens the download directory.
        String exSite = "cmd /c start " + fileSource;
        Runtime.getRuntime().exec(exSite);
    }

    /**
     * Downloads the images {@code fileUrl + i + suffix} for {@code i = 1..picSum}
     * and stores each one as {@code intputFile + i + suffix}.
     *
     * <p>Requests go through the local HTTP proxy at 127.0.0.1:7890. A page that
     * fails (non-200 status, missing body, I/O error) is reported on stdout and
     * skipped; the remaining pages are still attempted. This method never throws.
     *
     * @param picSum     number of pages to fetch (pages are numbered from 1)
     * @param fileUrl    URL prefix of the gallery on the image host, ending with '/'
     * @param intputFile target directory prefix, ending with a path separator
     * @param suffix     file extension including the leading dot, e.g. ".jpg"
     */
    public static void crawlerNyaPic(int picSum, String fileUrl, String intputFile, String suffix) {
        HttpHost proxy = new HttpHost("127.0.0.1", 7890);
        // Timeouts are in milliseconds.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(1000)
                .setSocketTimeout(30000)
                .setProxy(proxy)
                .build();

        // One client for the whole gallery; try-with-resources guarantees it is
        // closed even when a download fails.
        try (CloseableHttpClient httpClient =
                     HttpClients.custom().setDefaultRequestConfig(requestConfig).build()) {
            for (int i = 1; i <= picSum; i++) {
                String fileName = i + suffix;
                try {
                    downloadImage(httpClient, fileUrl + fileName, new File(intputFile + fileName));
                    System.out.println(fileName);
                } catch (IOException | RuntimeException e) {
                    // Report and move on so one bad page does not lose the rest.
                    System.out.println(fileName + " failed: " + e);
                }
            }
        } catch (IOException e) {
            // Only reachable when closing the client fails.
            System.out.println(e);
        }
    }

    /**
     * Fetches a single image with browser-like headers and writes it to {@code target}.
     *
     * @throws IOException on transport errors, a non-200 status or an empty response
     */
    private static void downloadImage(CloseableHttpClient httpClient, String imageUrl, File target)
            throws IOException {
        HttpGet httpGet = new HttpGet(imageUrl);
        // Browser-like request headers.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36");
        httpGet.setHeader("accept", "image/avif,image/webp,image/apng,image/*,*/*;q=0.8");
        // NOTE(review): "br" is advertised here; confirm the image host never
        // answers with Brotli, which HttpClient 4.5 presumably cannot decode.
        httpGet.setHeader("accept-encoding", "gzip, deflate, br");
        httpGet.setHeader("referer", "https://zha.doghentai.com/");
        httpGet.setHeader("sec-fetch-dest", "image");
        httpGet.setHeader("accept-language", "zh-CN,zh;q=0.9,en;q=0.8");

        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int status = response.getStatusLine().getStatusCode();
            if (status != HttpStatus.SC_OK) {
                // Do not save an error page under an image file name.
                throw new IOException("HTTP " + status + " for " + imageUrl);
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("Empty response for " + imageUrl);
            }
            InputStream inputStream = entity.getContent();
            // commons-io creates missing parent directories and closes the stream.
            FileUtils.copyInputStreamToFile(inputStream, target);
        }
    }

    /**
     * Processes every non-empty gallery URL: loads the page, reads the cover
     * thumbnail to find the gallery id and image extension, counts the page
     * thumbnails and downloads all pages via {@link #crawlerNyaPic}.
     *
     * <p>A URL whose page cannot be loaded or does not have the expected layout
     * is reported on stdout and skipped. After each downloaded gallery the
     * method pauses for three seconds.
     *
     * @param urlList gallery page URLs; {@code null} and empty entries are ignored
     * @throws Exception if the pause between galleries is interrupted
     */
    public static void crawlerNyaUrl(List<String> urlList) throws Exception {
        for (String url : urlList) {
            if (url == null || url.length() == 0) {
                continue;
            }

            // getSource returns "" on failure, which parses to a document
            // without the expected elements and is handled below.
            Document document = Jsoup.parse(HttpClientUtil.getSource(url));
            Element container = document.selectFirst("div.container");
            Element coverLink = container == null ? null : container.selectFirst("a");
            if (coverLink == null) {
                System.out.println("Skipping " + url + ": cover link not found");
                continue;
            }

            Elements coverImg = coverLink.select("img");
            String coverImgUrl = coverImg.attr("data-src");
            // The second-to-last path segment is the gallery id on the image host.
            String[] pathParts = coverImgUrl.split("/");
            // The text after the last dot is the image extension.
            String[] dotParts = coverImgUrl.split("\\.");
            if (pathParts.length < 2 || dotParts.length < 2) {
                System.out.println("Skipping " + url + ": unexpected cover image URL '" + coverImgUrl + "'");
                continue;
            }

            int benziN;
            try {
                benziN = Integer.parseInt(pathParts[pathParts.length - 2]);
            } catch (NumberFormatException e) {
                System.out.println("Skipping " + url + ": no numeric gallery id in '" + coverImgUrl + "'");
                continue;
            }

            // Number of pages = number of thumbnails on the gallery page.
            int count = document.select("div.thumb-container").size();

            // The gallery title becomes the directory name; strip spaces and
            // every character that is illegal in a Windows file name.
            String benziName = coverImg.attr("alt").replaceAll("[\\\\/:*?\"<>| ]", "");
            if (benziName.isEmpty()) {
                // Fall back to the id so files never land in the download root.
                benziName = String.valueOf(benziN);
            }

            String suffix = "." + dotParts[dotParts.length - 1];
            String fileUrl = "https://i0.nyacdn.com/galleries/" + benziN + "/";
            String intputFile = fileSource + benziName + "//";
            nyaPictureMain.crawlerNyaPic(count, fileUrl, intputFile, suffix);

            // Pause a few seconds between galleries.
            Thread.sleep(3000);
        }
        System.out.println("喵变态图片缓存成功!!!!");
    }
}
================================================ FILE: src/main/java/com/cn/util/HttpClientUtil.java ================================================
package com.cn.util;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Minimal helper for fetching the text of a web page.
 */
public class HttpClientUtil {

    /**
     * Issues a GET request for {@code url} and returns the response body
     * decoded as UTF-8.
     *
     * @param url absolute URL of the page to fetch
     * @return the body when the server answers 200 with a body; an empty string
     *         for any other status, a missing body, or an I/O failure (the
     *         failure's stack trace is printed)
     */
    public static String getSource(String url) {
        String html = "";
        HttpGet httpget = new HttpGet(url);
        // Send a browser User-Agent so the server does not reject the request with 403.
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");

        // Both the client and the response are closed on every path.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget)) {
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    html = EntityUtils.toString(entity, "utf-8");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }
}
================================================ FILE: web/WEB-INF/web.xml ================================================