Repository: RxGirlz/OpenYspider Branch: 4.x Commit: bdb54be48d74 Files: 29 Total size: 123.3 KB Directory structure: gitextract_fx82gl62/ ├── .gitignore ├── README-JAV.md ├── README.md ├── oys-boot-starter/ │ ├── pom.xml │ └── src/ │ └── main/ │ ├── java/ │ │ └── com/ │ │ └── devyy/ │ │ └── oys/ │ │ └── srarter/ │ │ ├── OpenYspiderApplication.java │ │ ├── config/ │ │ │ ├── MybatisPlusConfig.java │ │ │ └── Swagger3Config.java │ │ └── main/ │ │ └── JavMain.java │ └── resources/ │ ├── META-INF/ │ │ └── additional-spring-configuration-metadata.json │ ├── application-mysql.properties │ └── application.properties ├── oys-codeforces/ │ ├── pom.xml │ └── src/ │ └── main/ │ └── java/ │ └── com/ │ └── devyy/ │ └── oys/ │ └── codeforces/ │ ├── CfController.java │ ├── CfDO.java │ └── dao/ │ └── CfMapper.java ├── oys-core/ │ ├── pom.xml │ └── src/ │ └── main/ │ └── java/ │ └── com/ │ └── devyy/ │ └── oys/ │ └── srarter/ │ └── core/ │ ├── enums/ │ │ └── StateTypeEnum.java │ └── util/ │ └── SpiderUtil.java ├── oys-tujidao/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── com/ │ │ └── devyy/ │ │ └── oys/ │ │ └── tujidao/ │ │ ├── TuJiDaoAlbumDO.java │ │ ├── TuJiDaoController.java │ │ ├── TuJiDaoService.java │ │ ├── TuJiDaoServiceImpl.java │ │ └── dao/ │ │ └── TuJiDaoAlbumMapper.java │ └── test/ │ ├── java/ │ │ └── com/ │ │ └── devyy/ │ │ └── oys/ │ │ └── tujidao/ │ │ └── TuJiDaoServiceImplTest.java │ └── resources/ │ ├── gengxin.html │ └── gengxin2.html ├── pom.xml └── sql_scripts/ └── oys3_all_scripts_mysql.sql ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ HELP.md target/ !.mvn/wrapper/maven-wrapper.jar !**/src/main/** !**/src/test/** ### STS ### .apt_generated .classpath .factorypath .project .settings .springBeans .sts4-cache ### IntelliJ IDEA ### .idea *.iws *.iml *.ipr ### NetBeans ### 
/nbproject/private/ /nbbuild/ /dist/ /nbdist/ /.nb-gradle/ build/ ### VS Code ### .vscode/ ================================================ FILE: README-JAV.md ================================================ - IPX=265 5T2 - SSNI=256 停更 5T2 - SSIS=162 5T4 - MIAA=135 5T5 - STARS=132 5T3 - MIDE=128 5T3 - FSDSS=128 5T4 - JUL=103 5T4 - JUFE=85 5T5 - HND=85 5T1 - / - ABP=84 5T1 - CAWD=71 5T4 - FCH=66 停更 5T5 - DASD=58 5T3 - ABW=54 5T4 - CJOD=53 5T5 - PPPD=52 5T3 - RKI=46 5T5 - PRED=45 5T1 - MGMQ=45 5T5 - / - DPMI=45 5T3 - DNJR=44 5T5 - MSFH=42 5T5 停更 - MKMP=41 5T3 - MOPG=39 停更 5T5 - DMOW=37 停更 5T5 - GVH=37 5T3 - OTIM=34 停更 5T3 - GENM=33 5T5 - MEYD=32 5T1 - / - OKP=32 5T2 - MVSD=31 5T3 - WANZ=30 停更 5T1 - MGMJ=30 停更 5T5 - BLK=30 5T3 - APNS=29 5T3 - MGMP=27 5T5 - EKDV=27 5T1 - EBOD=26 5T2 - OFJE=26 5T2 - / - ARM=25 5T5 - MDTM=23 5T1 - AARM=22 5T5 - IPVR=21 5T4 - CLOT=18 5T5 - TPPN=18 5T3 - ECB=18 5T1 - MMUS=16 - DVAJ=16 - SHKD=16 --- OVG-149 PFES-003 PFES-012 RCTD-108 SNIS-792 SW-690 XMOM-016 --- IPX-992 IPX-838 IPX-819 IPX-916 IPX-851 IPX-832 IPX-812 JUFE-400 JUFE-385 JUFE-375 CJOD-340 RKI-623 MGMQ MOPT-019 MOPT-021 AARM-109 AARM-110 松本いちか VRKM-497 EBVR-035 ================================================ FILE: README.md ================================================ # OpenYspider 4.x 千万级图片、视频爬虫 [开源版本] ![](swagger3.png) ## 简介 OpenYspider 是一个使用 Java 编写的简单爬虫。主要用到的技术栈有: 1. spring-boot-starter-web 2. spring-boot-starter-test 3. mybatis-plus-boot-starter 4. springfox-boot-starter 5. lombok 6. jsoup 7. mockito + jacoco 当前 LTS 的网站有: 1. `tujidao.com` Deprecated 的网站(请于历史提交中查看): 1. `tangyun365.com` 2. `yalayi.com` 3. `rosmm88.com` 4. `mzsock.com` 5. `meinvla.net` 6. 
`leetcode-cn.com` ## 开发环境 `Windows 11` + `JDK 17` + `Mysql 8.x` ```sh $ java --version openjdk 17.0.1 2021-10-19 OpenJDK Runtime Environment (build 17.0.1+12-39) OpenJDK 64-Bit Server VM (build 17.0.1+12-39, mixed mode, sharing) ``` 运行启动类 `OpenYspiderApplication` 后,浏览器访问 [http://localhost:23333/swagger-ui/index.html#/](http://localhost:23333/swagger-ui/index.html#/) 数据库脚本: [sql_scripts](./sql_scripts/oys3_all_scripts_mysql.sql) ## 爬取网站 数据统计截止 2022-02-12 ### 1 图集岛(原美图日) [ 2,647,717P / 905G ] - 目标网站:[https://www.tujidao.com/](https://www.tujidao.com/) - 特点:图片路径可遍历 ```sql select count(*) from oys_tujidao_album_t where album_id > 0 and album_id <= 10000; -- 9995 ok select count(*) from oys_tujidao_album_t where album_id > 10000 and album_id <= 20000; -- 10000 select count(*) from oys_tujidao_album_t where album_id > 20000 and album_id <= 30000; -- 9999 [23001] select count(*) from oys_tujidao_album_t where album_id > 30000 and album_id <= 40000; -- 10000 select count(*) from oys_tujidao_album_t where album_id > 40000 and album_id <= 50000; -- 8925 [46018] ``` ## 部分成果展示 ![](result1.png) ![](result2.png) ================================================ FILE: oys-boot-starter/pom.xml ================================================ openyspider com.devyy ${revision} 4.0.0 oys-boot-starter com.devyy oys-tujidao ${revision} com.devyy oys-codeforces ${revision} org.springframework.boot spring-boot-maven-plugin ${org.springframework.boot.spring-boot-dependencies.version} org.jacoco jacoco-maven-plugin ${jacoco-maven-plugin.version} report verify report-aggregate ================================================ FILE: oys-boot-starter/src/main/java/com/devyy/oys/srarter/OpenYspiderApplication.java ================================================ package com.devyy.oys.srarter; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.context.annotation.ComponentScan; /** * 启动类 * * @since 
2019-12-01 */ @SpringBootApplication @ComponentScan("com.devyy.oys") public class OpenYspiderApplication { public static void main(String[] args) { SpringApplication.run(OpenYspiderApplication.class, args); } } ================================================ FILE: oys-boot-starter/src/main/java/com/devyy/oys/srarter/config/MybatisPlusConfig.java ================================================ package com.devyy.oys.srarter.config; import org.mybatis.spring.annotation.MapperScan; import org.springframework.context.annotation.Configuration; /** * Mybatis-Plus 配置类 * * @since 2020-03-22 */ @Configuration @MapperScan("com.devyy.oys.**.dao") public class MybatisPlusConfig { } ================================================ FILE: oys-boot-starter/src/main/java/com/devyy/oys/srarter/config/Swagger3Config.java ================================================ package com.devyy.oys.srarter.config; import io.swagger.annotations.ApiOperation; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import springfox.documentation.builders.ApiInfoBuilder; import springfox.documentation.builders.PathSelectors; import springfox.documentation.builders.RequestHandlerSelectors; import springfox.documentation.oas.annotations.EnableOpenApi; import springfox.documentation.service.ApiInfo; import springfox.documentation.spi.DocumentationType; import springfox.documentation.spring.web.plugins.Docket; /** * Swagger3 配置类 * * http://localhost:23333/swagger-ui/index.html */ @EnableOpenApi @Configuration public class Swagger3Config { @Bean public Docket createRestApi() { return new Docket(DocumentationType.OAS_30) .apiInfo(apiInfo()) .select() .apis(RequestHandlerSelectors.withMethodAnnotation(ApiOperation.class)) .paths(PathSelectors.any()) .build(); } private ApiInfo apiInfo() { return new ApiInfoBuilder() .title("OpenYspider 千万级图片爬虫、视频爬虫 [开源版本]") .description("Github: https://github.com/RxGirlz/OpenYspider") .version("v4.x") .build(); 
} } ================================================ FILE: oys-boot-starter/src/main/java/com/devyy/oys/srarter/main/JavMain.java ================================================
package com.devyy.oys.srarter.main;

import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.CircleBackground;
import com.kennycason.kumo.font.scale.LinearFontScalar;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.nlp.normalize.UpperCaseNormalizer;
import com.kennycason.kumo.palette.ColorPalette;
import org.apache.commons.io.FileUtils;

import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Stand-alone helper for a local directory index: turns a directory listing
 * into a shell script, ranks file-name prefixes, finds duplicate names and
 * renders a word cloud of the prefixes.
 *
 * @since 2021-05-01
 */
public class JavMain {
    /**
     * Base directory that holds the index files.
     */
    private static final String JAV_BASE_DIR = "D:\\GITHUB\\Jav";

    /**
     * Base name of the listing file. The listing is produced on Windows with:
     * dir /b /s > D:\GITHUB\Jav\jav20220824.txt
     */
    private static final String JAV_FILE_NAME = "jav20220824";

    private static final String JAV_INPUT_FILE = String.format(Locale.ENGLISH, "%s/%s.txt", JAV_BASE_DIR, JAV_FILE_NAME);

    private static final String JAV_OUTPUT_FILE = String.format(Locale.ENGLISH, "%s/%s.sh", JAV_BASE_DIR, JAV_FILE_NAME);

    public static void main(String[] args) {
        // Generate the shell script
        doPrintCommand();

        // Prefix statistics
        // doAnalyse();

        // Duplicate detection
        // doFindRepeat();

        // Word cloud
        // doWordCloud();
    }

    /**
     * Reads the directory listing and writes a shell script containing one
     * {@code echo > <name>.txt} command per listed file.
     */
    private static void doPrintCommand() {
        try {
            File inputFile = new File(JAV_INPUT_FILE);
            List<String> fanHao = FileUtils.readLines(inputFile, StandardCharsets.UTF_8.name());
            List<String> cmdList = fanHao.stream()
                    // Skip the listing files themselves (jav202xxxxx.txt) and entries without an extension
                    .filter(name -> !name.endsWith(".txt") && name.contains("."))
                    // Keep only the part after the last Windows path separator
                    .map(name -> name.substring(name.lastIndexOf("\\") + 1))
                    .map(name -> String.format(Locale.ENGLISH, "echo > %s.txt", name))
                    .collect(Collectors.toList());
            File outputFile = new File(JAV_OUTPUT_FILE);
            FileUtils.writeLines(outputFile, cmdList);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Recursively lists every {@code .txt} file below the base directory.
     *
     * @return stream of bare file names (no directory part)
     */
    private static Stream<String> getJavStream() {
        File directory = new File(JAV_BASE_DIR);
        Collection<File> avFiles = FileUtils.listFiles(directory, new String[]{"txt"}, true);
        return avFiles.stream().map(File::getName);
    }

    /**
     * Prints the total number of index files followed by the prefixes
     * (the part of the name before the first {@code -}) ranked by frequency,
     * most frequent first.
     */
    private static void doAnalyse() {
        // Walk the directory once and reuse the result for both the total and the ranking.
        List<String> prefixes = getJavStream()
                .map(name -> name.split("-")[0])
                .collect(Collectors.toList());
        System.out.printf(Locale.ENGLISH, "Jav totals: %s, Ranks:%n", prefixes.size());
        prefixes.stream()
                .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
                .entrySet().stream()
                .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                .forEach(System.out::println);
    }

    /**
     * Prints every name (the part before the first {@code .}) that occurs
     * more than once, together with its count.
     */
    private static void doFindRepeat() {
        getJavStream()
                .map(name -> name.split("\\.")[0])
                .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
                .entrySet().stream()
                .filter(entry -> entry.getValue() > 1)
                .forEach(System.out::println);
    }

    /**
     * Renders a circular word cloud of the name prefixes and writes it to
     * {@code jav-wordCloud.png} in the base directory.
     */
    private static void doWordCloud() {
        List<String> avPrefix = getJavStream()
                .map(name -> name.split("-")[0])
                .collect(Collectors.toList());
        final FrequencyAnalyzer frequencyAnalyzer = new FrequencyAnalyzer();
        frequencyAnalyzer.setNormalizer(new UpperCaseNormalizer());
        final List<WordFrequency> wordFrequencies = frequencyAnalyzer.load(avPrefix);
        final Dimension dimension = new Dimension(600, 600);
        final WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        wordCloud.setPadding(0);
        wordCloud.setBackground(new CircleBackground(300));
        wordCloud.setColorPalette(new ColorPalette(new Color(0x4055F1), new Color(0x408DF1), new Color(0x40AAF1), new Color(0x40C5F1), new Color(0x40D3F1), new Color(0xFFFFFF)));
        wordCloud.setFontScalar(new LinearFontScalar(20, 160));
        wordCloud.build(wordFrequencies);
        wordCloud.writeToFile(JAV_BASE_DIR + "/jav-wordCloud.png");
    }
}
================================================ FILE: oys-boot-starter/src/main/resources/META-INF/additional-spring-configuration-metadata.json ================================================ { "properties": [ { "name": "oys.config.webdriver.chrome.driver.path", "type": "java.lang.String", "description": "webdriver.chrome.driver path(根据情况自定义)." }, { "name": "oys.tujidao.url.prefix", "type": "java.lang.String", "description": "图集岛-相册信息目录路径前缀." }, { "name": "oys.tujidao.img.url.prefix", "type": "java.lang.String", "description": "图集岛-图片资源目录路径前缀." }, { "name": "oys.tujidao.local.folder.prefix", "type": "java.lang.String", "description": "图集岛-本地存储路径前缀(根据情况自定义)." }, { "name": "oys.tujidao.local.preview.prefix", "type": "java.lang.String", "description": "图集岛-本地预览存储路径前缀(根据情况自定义)." }, { "name": "oys.tujidao.local.cover.prefix", "type": "java.lang.String", "description": "图集岛-本地封面存储路径前缀(根据情况自定义)." }, { "name": "oys.tujidao.local.cover.num.start", "type": "java.lang.Integer", "description": "图集岛-本地封面开始下标(根据情况自定义)." }, { "name": "oys.tujidao.local.cover.num.end", "type": "java.lang.Integer", "description": "图集岛-本地封面结束下标(根据情况自定义)." }, { "name": "oys.tujidao.local.preview.num.start", "type": "java.lang.Integer", "description": "图集岛-预下载开始下标(根据情况自定义)." }, { "name": "oys.tujidao.local.preview.num.end", "type": "java.lang.Integer", "description": "图集岛-预下载结束下标(根据情况自定义)."
} ] } ================================================ FILE: oys-boot-starter/src/main/resources/application-mysql.properties ================================================ datasource.url.1=jdbc:mysql://127.0.0.1:3306/dev?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC datasource.user.1=root datasource.password.1=123456 datasource.jdbcDriverClass.1=com.mysql.cj.jdbc.Driver ================================================ FILE: oys-boot-starter/src/main/resources/application.properties ================================================ spring.profiles.active=mysql # springboot 2.6.x spring.mvc.pathmatch.matching-strategy=ant_path_matcher # port server.port=23333 # spring datasource spring.datasource.url=${datasource.url.1} spring.datasource.username=${datasource.user.1} spring.datasource.password=${datasource.password.1} spring.datasource.driver-class-name=${datasource.jdbcDriverClass.1} spring.datasource.type=com.zaxxer.hikari.HikariDataSource # mybatis-plus mybatis-plus.type-aliases-package=com.devyy.* # self config server.servlet.encoding.charset=utf-8 server.servlet.encoding.force=true server.servlet.encoding.enabled=true # tujidao oys.tujidao.url.prefix=https://www.tujidao03.com/u/?action=gengxin&page= oys.tujidao.img.url.prefix=https://tjg.gzhuibei.com/a/1/ oys.tujidao.local.folder.prefix=D:/\u56FE\u96C6\u5C9B\u722C\u866B\uFF0850001-60000\uFF09/ oys.tujidao.local.preview.prefix=D:/\u56FE\u96C6\u5C9B\u722C\u866BPreview/ oys.tujidao.local.cover.prefix=D:/\u56FE\u96C6\u5C9B\u722C\u866B\u5C01\u9762/ oys.tujidao.local.cover.num.start=58540 oys.tujidao.local.cover.num.end=59448 oys.tujidao.local.preview.num.start=58540 oys.tujidao.local.preview.num.end=59448 # 56425-57050 625 # 57051-57791 740 # 57792-58539 747 # 58540-59448 908 # kafka spring.kafka.bootstrap-servers=localhost.ubuntu0.com:9092,localhost.ubuntu1.com:9092,localhost.ubuntu2.com:9092 spring.kafka.consumer.auto-offset-reset=none spring.kafka.consumer.enable-auto-commit=false # redis 
spring.redis.cluster.nodes=localhost.ubuntu0.com:6379,localhost.ubuntu0.com:6380,localhost.ubuntu1.com:6379,localhost.ubuntu1.com:6380,localhost.ubuntu2.com:6379,localhost.ubuntu2.com:6380 spring.redis.password= #spring.redis.host=local.ubuntu.com # mongodb spring.data.mongodb.uri= ================================================ FILE: oys-codeforces/pom.xml ================================================ openyspider com.devyy ${revision} 4.0.0 oys-codeforces 17 17 ================================================ FILE: oys-codeforces/src/main/java/com/devyy/oys/codeforces/CfController.java ================================================
package com.devyy.oys.codeforces;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.devyy.oys.codeforces.dao.CfMapper;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.time.Duration;
import java.util.List;

/**
 * Controller + ServiceImpl for the Codeforces scraper, plus two connectivity
 * smoke tests (Kafka, Redis).
 *
 * @since 2021-01-24
 */
@Slf4j
@RestController
@RequestMapping("/codeforces")
@Api(tags = "Codeforces 爬虫")
public class CfController {
    /**
     * Location of the chromedriver binary handed to Selenium.
     */
    private static final String CHROME_DRIVER_PATH = "D:\\GITHUB\\LTS\\codeforces-spider\\chromedriver.exe";

    /**
     * URL prefix of a submission page; the submission id is appended.
     */
    private static final String SUBMISSION_URL_PREFIX = "https://codeforces.com/contest/1593/submission/";

    /**
     * Upper bound on the number of characters stored per fragment.
     */
    private static final int MAX_FRAGMENT_LENGTH = 500;

    @Value("${oys.codeforces.handleOrEmail:123}")
    private String handleOrEmail;

    @Value("${oys.codeforces.password:123}")
    private String password;

    @Autowired
    private CfMapper cfMapper;

    /**
     * Logs in to Codeforces with a Chrome session, then for every row whose
     * FRAGMENT column is still NULL opens the submission page, expands the
     * test view and stores the beginning of the last "output" element.
     * A failure on one submission is logged and does not stop the others.
     *
     * @return "success"
     * @throws InterruptedException if the thread is interrupted while waiting for a page
     */
    @ApiOperation(value = "获取片段")
    @PostMapping("/step1")
    public String doGetFragment() throws InterruptedException {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        WebDriver webDriver = new ChromeDriver();
        try {
            webDriver.manage().timeouts().implicitlyWait(Duration.ofSeconds(30));

            // login
            webDriver.get("https://codeforces.com/enter?back=%2F");
            webDriver.findElement(By.id("handleOrEmail")).sendKeys(handleOrEmail);
            webDriver.findElement(By.id("password")).sendKeys(password);
            webDriver.findElement(By.id("remember")).click();
            webDriver.findElement(By.className("submit")).click();
            Thread.sleep(5000);

            List<CfDO> cfDOList = cfMapper.selectList(new QueryWrapper<CfDO>().select().isNull("FRAGMENT"));
            log.info("==>cfDOList size={}", cfDOList.size());
            for (CfDO cfDO : cfDOList) {
                String submissionId = cfDO.getSubmissionId();
                String url = SUBMISSION_URL_PREFIX + submissionId;
                log.info("==>url={}", url);
                try {
                    webDriver.get(url);
                    webDriver.findElement(By.className("click-to-view-tests")).click();
                    Thread.sleep(3000);
                    List<WebElement> outputs = webDriver.findElements(By.className("output"));
                    if (outputs.size() < 7) {
                        // Fewer outputs than expected so far: wait once more and re-read.
                        Thread.sleep(3000);
                        outputs = webDriver.findElements(By.className("output"));
                    }
                    if (outputs.isEmpty()) {
                        log.warn("<==failed submissionId={} no output element found", submissionId);
                        continue;
                    }
                    String text = outputs.get(outputs.size() - 1).getText();
                    // Clamp instead of substring(0, 500), which throws on shorter output.
                    String fragment = text.substring(0, Math.min(text.length(), MAX_FRAGMENT_LENGTH));
                    cfDO.setFragment(fragment);
                    log.info(fragment);
                    cfMapper.updateById(cfDO);
                    log.info("<==success submissionId={}", submissionId);
                } catch (InterruptedException e) {
                    // Do not let the per-submission catch-all swallow an interrupt.
                    throw e;
                } catch (Exception e) {
                    log.warn("<==failed submissionId={}", submissionId, e);
                }
            }
        } finally {
            // quit() ends the whole browser session even when login or scraping threw.
            webDriver.quit();
        }
        return "success";
    }

    /**
     * Concatenates all stored fragments ordered by the NO column and logs the
     * result. Rows without a fragment are skipped.
     *
     * @return "success"
     */
    @ApiOperation(value = "合并片段")
    @PostMapping("/step2")
    public String doMergeFragment() {
        List<CfDO> cfDOList = cfMapper.selectList(new QueryWrapper<CfDO>().select().orderByAsc("NO"));
        log.info("==>cfDOList size={}", cfDOList.size());
        StringBuilder stringBuilder = new StringBuilder();
        for (CfDO cfDO : cfDOList) {
            String fragment = cfDO.getFragment();
            // StringBuilder.append(null) would add the literal text "null".
            if (fragment != null) {
                stringBuilder.append(fragment);
            }
        }
        log.info("==>doMergeFragment={}", stringBuilder);
        return "success";
    }

    @Autowired
    private KafkaTemplate kafkaTemplate;

    /**
     * Sends a single test message to the T_ubuntu topic.
     *
     * @return "success"
     */
    @ApiOperation(value = "Kafka")
    @PostMapping("/step3")
    public String testKafka() {
        kafkaTemplate.send("T_ubuntu", "test");
        log.info("==>kafkaTemplate.send success");
        return "success";
    }

    @Autowired
    private StringRedisTemplate stringRedisTemplate;

    /**
     * Writes a test key to Redis and reads it back.
     *
     * @return "success"
     */
    @ApiOperation(value = "redis")
    @PostMapping("/step4")
    public String testRedis() {
        stringRedisTemplate.boundValueOps("ubuntu:redis:test").set("hello");
        log.info("==>redis set success");
        String value = stringRedisTemplate.boundValueOps("ubuntu:redis:test").get();
        log.info("==>redis get value={}", value);
        return "success";
    }
}
================================================ FILE: oys-codeforces/src/main/java/com/devyy/oys/codeforces/CfDO.java ================================================
package com.devyy.oys.codeforces;

import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;

/**
 * Entity mapped to the oys_codeforces table.
 *
 * @since 2021-01-24
 */
@Data
@TableName("oys_codeforces")
public class CfDO {
    @TableId("SUBMISSION_ID")
    private String submissionId;

    @TableField("NO")
    private Long no;

    @TableField("FRAGMENT")
    private String fragment;
}
================================================ FILE: oys-codeforces/src/main/java/com/devyy/oys/codeforces/dao/CfMapper.java ================================================
package com.devyy.oys.codeforces.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.devyy.oys.codeforces.CfDO;
import org.springframework.stereotype.Repository;

/**
 * Mybatis-Plus mapper for {@link CfDO}.
 *
 * @since 2021-01-24
 */
@Repository
public interface CfMapper extends BaseMapper<CfDO> {
}
================================================ FILE: oys-core/pom.xml ================================================ openyspider com.devyy ${revision} 4.0.0 oys-core ================================================
FILE: oys-core/src/main/java/com/devyy/oys/srarter/core/enums/StateTypeEnum.java ================================================
package com.devyy.oys.srarter.core.enums;

import lombok.Getter;

/**
 * Resource state.
 *
 * @since 2019-12-01
 */
@Getter
public enum StateTypeEnum {
    /**
     * Blacklisted
     */
    BLACKLIST(-1, "黑名单"),
    /**
     * Whitelisted
     */
    WHITELIST(0, "白名单"),
    //
    /**
     * Download finished
     */
    DONE(102, "下载完成"),
    /**
     * Download in progress
     */
    DOWNLOADING(101, "下载中"),
    /**
     * Not started / waiting for retry
     */
    STARTED(100, "未开始/待重试"),
    //
    /**
     * Parsed -- used for albums only
     */
    ANALYSIS(200, "已解析"),
    /**
     * Result did not meet expectations
     */
    EXCEPTION(500, "未达预期"),
    /**
     * Resource not found
     */
    NOTFOUND(404, "找不到资源"),
    ;

    private final int seq;
    private final String desc;

    StateTypeEnum(int seq, String desc) {
        this.seq = seq;
        this.desc = desc;
    }
}
================================================ FILE: oys-core/src/main/java/com/devyy/oys/srarter/core/util/SpiderUtil.java ================================================
package com.devyy.oys.srarter.core.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * Download and file helpers shared by the spiders.
 *
 * @since 2019-12-01
 */
@Slf4j
public class SpiderUtil {
    /**
     * Performs one download attempt.
     *
     * @param onlineUrl remote URL
     * @param localUrl  local destination path
     * @param timeout   connect and read timeout in milliseconds
     * @return true when the file was fully written, false on a failure that may be retried
     * @throws FileNotFoundException when the resource is reported missing; retrying is pointless
     */
    private static boolean ioDownload2(String onlineUrl, String localUrl, int timeout) throws FileNotFoundException {
        File destination = new File(localUrl);
        // Becomes true once the destination may have been touched by this attempt.
        boolean writing = false;
        try {
            URL source = new URL(onlineUrl);
            final URLConnection connection = source.openConnection();
            connection.setRequestProperty("Referer", "https://www.tujidao.com/");
            connection.setConnectTimeout(timeout);
            connection.setReadTimeout(timeout);
            try (final InputStream stream = connection.getInputStream()) {
                writing = true;
                FileUtils.copyInputStreamToFile(stream, destination);
            }
            log.info("==>io下载成功 localUrl={}", localUrl);
            return true;
        } catch (FileNotFoundException e) {
            // Expected end-of-album signal, so it is not logged.
            if (writing) {
                FileUtils.deleteQuietly(destination);
            }
            throw e;
        } catch (Exception e) {
            // Remove a truncated file so callers that check File#exists() do not
            // mistake it for a finished download.
            if (writing) {
                FileUtils.deleteQuietly(destination);
            }
            log.warn("FileUtils.copyURLToFile failed={} e.message={}", onlineUrl, e.getMessage());
            return false;
        }
    }

    /**
     * Downloads a file, retrying on failure with a growing timeout
     * (10 s while two or more retries remain, then 20 s, then 30 s for the last attempt).
     * A missing resource (FileNotFoundException) ends the attempts immediately.
     *
     * @param onlineUrl remote URL
     * @param localUrl  local destination path
     * @param times     number of retries after the first attempt; a negative value performs no attempt
     * @return true when the file was downloaded, false otherwise
     */
    public static boolean ioDownload2Times(String onlineUrl, String localUrl, int times) {
        for (int remaining = times; remaining >= 0; remaining--) {
            int timeout = 10000;
            if (remaining < 1) {
                timeout = 30000;
            } else if (remaining < 2) {
                timeout = 20000;
            }
            try {
                if (ioDownload2(onlineUrl, localUrl, timeout)) {
                    return true;
                }
            } catch (FileNotFoundException e) {
                return false;
            }
        }
        return false;
    }

    /**
     * Moves (renames) a file. A failed rename is logged, not thrown.
     *
     * @param oldPath source path
     * @param newPath destination path
     */
    public static void fileMove(String oldPath, String newPath) {
        File oldName = new File(oldPath);
        File newName = new File(newPath);
        if (!oldName.renameTo(newName)) {
            log.warn("==>fileMove failed oldPath={} newPath={}", oldPath, newPath);
        }
    }

    /**
     * Copies a file. A failed copy is logged, not thrown.
     *
     * @param oldPath source path
     * @param newPath destination path
     */
    public static void fileCopy(String oldPath, String newPath) {
        File oldName = new File(oldPath);
        File newName = new File(newPath);
        try {
            FileUtils.copyFile(oldName, newName);
            log.info("==>fileCopy success oldPath={} newPath={}", oldPath, newPath);
        } catch (IOException e) {
            log.warn("==>fileCopy failed oldPath={} newPath={} e.message={}", oldPath, newPath, e.getMessage());
        }
    }
}
================================================ FILE: oys-tujidao/pom.xml ================================================ openyspider com.devyy ${revision} 4.0.0 oys-tujidao com.devyy oys-core ${revision} ================================================ FILE: oys-tujidao/src/main/java/com/devyy/oys/tujidao/TuJiDaoAlbumDO.java ================================================ package com.devyy.oys.tujidao; import com.baomidou.mybatisplus.annotation.IdType; import com.baomidou.mybatisplus.annotation.TableField; import com.baomidou.mybatisplus.annotation.TableId; import com.baomidou.mybatisplus.annotation.TableName; import lombok.Data; /** * 实体类 * * @since 2019-12-01 */ @Data @TableName("oys_tujidao_album_t") public class TuJiDaoAlbumDO { /** * 自增 ID */ @TableId(type = IdType.AUTO) private Long id; /** * 状态 * * @see
com.devyy.oys.srarter.core.enums.StateTypeEnum */ private Integer state; /** * 总数 */ private Integer total; /** * 相册名 */ @TableField("album_name") private String albumName; /** * 相册 ID */ @TableField("album_id") private Integer albumId; } ================================================ FILE: oys-tujidao/src/main/java/com/devyy/oys/tujidao/TuJiDaoController.java ================================================ package com.devyy.oys.tujidao; import io.swagger.annotations.Api; import io.swagger.annotations.ApiOperation; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; /** * REST 接口层 * * @since 2019-12-01 */ @Slf4j @RestController @RequestMapping("/tujidao") @Api(tags = "图集岛爬虫") public class TuJiDaoController { @Autowired private TuJiDaoService tujidaoService; /** * 预下载 * * @return "success" */ @ApiOperation(value = "预下载") @PostMapping("/step1") public String step1() { return tujidaoService.doPreDownload(); } /** * 生成封面 * * @return "success" */ @ApiOperation(value = "生成封面") @PostMapping("/step2") public String step2() { return tujidaoService.doGenerateCover(); } /** * 同步更新记录 * * @return "success" */ @ApiOperation(value = "同步更新记录") @PostMapping("/step3") public String step3() { return tujidaoService.doSyncRecords(); } /** * 本地迁移 * * @return "success" */ @ApiOperation(value = "本地迁移") @PostMapping("/step4") public String step4() { return tujidaoService.doLocalMigration(); } } ================================================ FILE: oys-tujidao/src/main/java/com/devyy/oys/tujidao/TuJiDaoService.java ================================================ package com.devyy.oys.tujidao; /** * Service 层 * * @since 2019-12-01 */ public interface TuJiDaoService { /** * 预下载 * * @return success * @since 2019-12-08 */ String doPreDownload(); /** * 生成封面 * * 
@return success * @since 2019-12-08 */ String doGenerateCover(); /** * 同步更新记录 * * @return success */ String doSyncRecords(); /** * 本地迁移 * * @return success * @since 2019-12-08 */ String doLocalMigration(); } ================================================ FILE: oys-tujidao/src/main/java/com/devyy/oys/tujidao/TuJiDaoServiceImpl.java ================================================ package com.devyy.oys.tujidao; import com.devyy.oys.srarter.core.enums.StateTypeEnum; import com.devyy.oys.srarter.core.util.SpiderUtil; import com.devyy.oys.tujidao.dao.TuJiDaoAlbumMapper; import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.DirectoryFileFilter; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.util.CollectionUtils; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.text.MessageFormat; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.regex.Pattern; /** * Service 层实现 * * @since 2019-12-01 */ @Slf4j @Service public class TuJiDaoServiceImpl implements TuJiDaoService { /** * 源站网页 url */ @Value("${oys.tujidao.url.prefix}") private String tjdWebUrlPrefix; /** * 源站图片 cdn */ @Value("${oys.tujidao.img.url.prefix}") private String tjdImgUrlPrefix; /** * 本地归档目录 */ @Value("${oys.tujidao.local.folder.prefix}") private String tjdLocalFolderPrefix; /** * 本地 Preview 目录 */ @Value("${oys.tujidao.local.preview.prefix}") private String tjdLocalPreviewPrefix; 
/** * 本地封面目录 */ @Value("${oys.tujidao.local.cover.prefix}") private String tjdLocalCoverPrefix; /** * 封面范围 [start, end] */ @Value("${oys.tujidao.local.cover.num.start}") private Integer tjdCoverStart; @Value("${oys.tujidao.local.cover.num.end}") private Integer tjdLocalCoverEnd; /** * Preview 范围 [start, end] */ @Value("${oys.tujidao.local.preview.num.start}") private Integer tjdPreviewStart; @Value("${oys.tujidao.local.preview.num.end}") private Integer tjdPreviewEnd; @Autowired private TuJiDaoAlbumMapper tuJiDaoAlbumMapper; @Override public String doPreDownload() { ExecutorService executors = Executors.newFixedThreadPool(8); for (int i = tjdPreviewStart; i <= tjdPreviewEnd; i++) { final int finalI = i; executors.submit(() -> downloadByAlbumId(String.valueOf(finalI))); } return "success"; } static Map map = new HashMap<>(); static { map.put(119, 50); map.put(1562, 86); map.put(293, 129); map.put(313, 100); map.put(331, 116); map.put(338, 80); map.put(37, 80); map.put(3849, 100); map.put(543, 16); map.put(714, 61); map.put(716, 68); map.put(727, 69); map.put(16471, 24); map.put(16681, 46); map.put(16968, 75); map.put(17058, 100); map.put(32793, 158); map.put(34456, 44); map.put(35381, 33); map.put(35527, 150); map.put(35534, 58); map.put(35576, 21); map.put(35697, 39); map.put(35854, 25); map.put(36655, 60); map.put(36656, 52); map.put(36658, 68); map.put(36659, 73); map.put(36697, 76); map.put(36698, 52); map.put(36700, 80); map.put(36701, 92); map.put(37180, 45); map.put(38013, 41); map.put(38313, 42); map.put(38432, 35); map.put(39740, 48); map.put(44264, 17); map.put(45681, 42); map.put(45697, 19); map.put(46207, 16); map.put(48080, 62); } private void downloadByAlbumId(String albumId) { String localFolder = tjdLocalPreviewPrefix + albumId + "/"; // 若文件夹路径不存在,则新建 File file = new File(localFolder); if (!file.exists()) { if (!file.mkdirs()) { log.error("==>localFolder={} 创建文件路径失败", localFolder); return; } } // 顺序扫描直到 404 for (int i = 0; ; i++) { String onlinePath 
= String.format(Locale.CHINESE, "%s%s/%d.jpg", tjdImgUrlPrefix, albumId, i); String localPath = String.format(Locale.CHINESE, "%s%d.jpg", localFolder, i); // 幂等,若当前文件未下载,则进行下载 File file2 = new File(localPath); if (!file2.exists()) { if (!SpiderUtil.ioDownload2Times(onlinePath, localPath, 3)) { break; } } } } @Override public String doGenerateCover() { // 若文件夹路径不存在,则新建 File file = new File(tjdLocalCoverPrefix); if (!file.exists()) { if (!file.mkdirs()) { log.error("==>localFolder={} 创建文件路径失败", tjdLocalCoverPrefix); return "error"; } } // 选取 1.jpg 作为封面 for (int i = tjdCoverStart; i <= tjdLocalCoverEnd; i++) { String onlinePath = String.format(Locale.CHINESE, "%s%d/1.jpg", tjdLocalPreviewPrefix, i); String localPath = String.format(Locale.CHINESE, "%s%d-1.jpg", tjdLocalCoverPrefix, i); // 幂等,若当前文件未下载,则进行下载 File file2 = new File(localPath); if (!file2.exists()) { SpiderUtil.fileCopy(onlinePath, localPath); } } return "success"; } @Override public String doSyncRecords() { // 解决 cookie 和 https 问题 Map cookiesMap = buildCookies(); try { trustAllHttpsCertificates(); } catch (Exception e) { log.error("==>TujidaoService#doScanAlbums failed e=", e); } HttpsURLConnection.setDefaultHostnameVerifier(hv); Document document = null; // 55625-56283 659 // 56284-56939 656 // 56940-57639 700 // 57640-58330 690 for (int i = 1; i <= 50; i++) { try { document = Jsoup.connect(tjdWebUrlPrefix + i).cookies(cookiesMap).get(); } catch (IOException e) { log.error("==>url={} e={}", tjdWebUrlPrefix + i, e.getMessage()); } if (Objects.isNull(document)) { continue; } Element heziElement = document.getElementsByClass("hezi").first(); if (Objects.isNull(heziElement)) { continue; } heziElement.getElementsByTag("li").forEach(liElement -> { Element biaotiElement = liElement.getElementsByClass("biaoti").first(); Element shuliangElement = liElement.getElementsByClass("shuliang").first(); if (biaotiElement == null || shuliangElement == null) { return; } int albumId = Integer.parseInt(liElement.attr("id")); 
int total = Integer.parseInt(shuliangElement.text().replace("P", "")); String albumName = MessageFormat.format("{0}-[{1}P] {2}", String.valueOf(albumId), total, rmIllegalName(biaotiElement.text())); TuJiDaoAlbumDO tjdDO = new TuJiDaoAlbumDO(); tjdDO.setAlbumId(albumId); tjdDO.setAlbumName(albumName); tjdDO.setTotal(total); tjdDO.setState(StateTypeEnum.STARTED.getSeq()); // 幂等,保证记录数唯一 Map queryMap = new HashMap<>(1); queryMap.put("album_id", tjdDO.getAlbumId()); if (CollectionUtils.isEmpty(tuJiDaoAlbumMapper.selectByMap(queryMap))) { tuJiDaoAlbumMapper.insert(tjdDO); log.info("album_id={} 同步成功,total={},title={}", tjdDO.getAlbumId(), tjdDO.getTotal(), tjdDO.getAlbumName()); } else { log.warn("album_id={} 已存在", tjdDO.getAlbumId()); } }); } return "success"; } @Override public String doLocalMigration() { Map queryMap = new HashMap<>(1); queryMap.put("state", StateTypeEnum.STARTED.getSeq()); tuJiDaoAlbumMapper.selectByMap(queryMap).forEach(albumDO -> { int total = albumDO.getTotal(); int albumId = albumDO.getAlbumId(); String albumName = albumDO.getAlbumName(); String localFolder = tjdLocalFolderPrefix + albumName; // 若文件夹路径不存在,则新建 File file = new File(localFolder); if (!file.exists()) { if (!file.mkdirs()) { log.error("==>localFolder={} 创建文件路径失败", localFolder); return; } } // 2022-01-16 针对源站已经隐去 图片数 信息 fix if (total == -1) { String previewDir = tjdLocalPreviewPrefix + albumId; total = FileUtils.listFiles(new File(previewDir), new String[]{"jpg"}, false).size() - 1; } for (int i = 0; i <= total; i++) { String onlinePath = String.format(Locale.CHINESE, "%s%s/%d.jpg", tjdLocalPreviewPrefix, albumId, i); String localPath = String.format(Locale.CHINESE, "%s/%d.jpg", localFolder, i); // 幂等,若当前文件未下载,则进行下载 File file2 = new File(localPath); if (!file2.exists()) { SpiderUtil.fileMove(onlinePath, localPath); } } albumDO.setState(StateTypeEnum.ANALYSIS.getSeq()); tuJiDaoAlbumMapper.updateById(albumDO); log.info("==>albumId={} 相册已本地迁移完成", albumId); }); return "success"; } private 
Map<String, String> buildCookies() {
        // Hard-coded cookie set that is sent with every page request.
        // NOTE(review): these look like captured session credentials - confirm whether they
        // should live in configuration instead of source control.
        Map<String, String> cookiesMap = new HashMap<>();
        cookiesMap.put("TujidaoService%5Fid", "411999177-1588406642-%7C1600614708");
        cookiesMap.put("PHPSESSID", "uulfluqmbunauqqjnj4mf8i8dr");
        cookiesMap.put("UM_distinctid", "171d493d83c1c-074736cab8c8d9-670103b-1fa400-171d493d83d151");
        cookiesMap.put("atpsida", "7a72b6965366155670da7487_1600614730_4");
        cookiesMap.put("cna", "MkUPF0HprU0CAXWIT4ixum+C");
        cookiesMap.put("leixing", "0");
        cookiesMap.put("name", "rxgirlz");
        cookiesMap.put("sca", "5cafd77a");
        cookiesMap.put("uid", "157108");
        return cookiesMap;
    }

    /**
     * Hostname verifier that accepts every host name.
     * NOTE(review): together with {@link #trustAllHttpsCertificates()} this disables TLS peer
     * verification for every HttpsURLConnection in the JVM.
     */
    private final HostnameVerifier hv = (s, sslSession) -> true;

    /**
     * Installs a default SSL socket factory whose only trust manager is {@link miTM}, i.e. one
     * that accepts any certificate chain.
     *
     * @throws Exception when the SSL context cannot be created or initialised
     */
    private void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
        javax.net.ssl.TrustManager tm = new miTM();
        trustAllCerts[0] = tm;
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /**
     * Trust manager that performs no checks at all: both check methods return normally for any
     * certificate chain.
     */
    private static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract requires a non-null (possibly empty) array.
            return new java.security.cert.X509Certificate[0];
        }

        public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // Intentionally empty: every server chain is accepted.
        }

        public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // Intentionally empty: every client chain is accepted.
        }
    }

    /**
     * Matches the characters that are removed from album titles before they are used as
     * folder names: \ / : * ? " < > |
     */
    private static final Pattern FILE_PATTERN = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Removes every character matched by {@link #FILE_PATTERN} from {@code s}.
     */
    private String rmIllegalName(String s) {
        return FILE_PATTERN.matcher(s).replaceAll("");
    }

    // Local preview files consumed by reDownload().
    static String[] jpgs = {
            "D:/图集岛爬虫Preview/49322/20.jpg",
            "D:/图集岛爬虫Preview/49323/13.jpg",
"D:/图集岛爬虫Preview/49324/34.jpg", "D:/图集岛爬虫Preview/49331/49.jpg", "D:/图集岛爬虫Preview/49332/22.jpg", "D:/图集岛爬虫Preview/49340/19.jpg", "D:/图集岛爬虫Preview/49340/83.jpg", "D:/图集岛爬虫Preview/49349/38.jpg", "D:/图集岛爬虫Preview/49355/56.jpg", "D:/图集岛爬虫Preview/49357/83.jpg", "D:/图集岛爬虫Preview/49360/42.jpg", "D:/图集岛爬虫Preview/49366/1.jpg", "D:/图集岛爬虫Preview/49367/13.jpg", "D:/图集岛爬虫Preview/49369/20.jpg", "D:/图集岛爬虫Preview/49372/1.jpg", "D:/图集岛爬虫Preview/49374/61.jpg", "D:/图集岛爬虫Preview/49380/27.jpg", "D:/图集岛爬虫Preview/49382/23.jpg", "D:/图集岛爬虫Preview/49386/28.jpg", "D:/图集岛爬虫Preview/49387/20.jpg", "D:/图集岛爬虫Preview/49387/21.jpg", "D:/图集岛爬虫Preview/49388/88.jpg", "D:/图集岛爬虫Preview/49390/8.jpg", "D:/图集岛爬虫Preview/49392/23.jpg", "D:/图集岛爬虫Preview/49393/26.jpg", "D:/图集岛爬虫Preview/49397/82.jpg", "D:/图集岛爬虫Preview/49408/8.jpg", "D:/图集岛爬虫Preview/49410/91.jpg", "D:/图集岛爬虫Preview/49412/30.jpg", "D:/图集岛爬虫Preview/49414/21.jpg", "D:/图集岛爬虫Preview/49418/5.jpg", "D:/图集岛爬虫Preview/49420/8.jpg", "D:/图集岛爬虫Preview/49421/22.jpg", "D:/图集岛爬虫Preview/49421/42.jpg", "D:/图集岛爬虫Preview/49424/9.jpg", "D:/图集岛爬虫Preview/49425/19.jpg", "D:/图集岛爬虫Preview/49426/68.jpg", "D:/图集岛爬虫Preview/49426/82.jpg", "D:/图集岛爬虫Preview/49432/28.jpg", "D:/图集岛爬虫Preview/49501/34.jpg", "D:/图集岛爬虫Preview/49501/52.jpg", "D:/图集岛爬虫Preview/49503/19.jpg", "D:/图集岛爬虫Preview/49503/40.jpg", "D:/图集岛爬虫Preview/49508/14.jpg", "D:/图集岛爬虫Preview/49512/33.jpg", "D:/图集岛爬虫Preview/49513/1.jpg", "D:/图集岛爬虫Preview/49521/34.jpg", "D:/图集岛爬虫Preview/49522/31.jpg", "D:/图集岛爬虫Preview/49537/26.jpg", "D:/图集岛爬虫Preview/49544/1.jpg", "D:/图集岛爬虫Preview/49544/11.jpg", "D:/图集岛爬虫Preview/49544/4.jpg", "D:/图集岛爬虫Preview/49547/43.jpg", "D:/图集岛爬虫Preview/49550/5.jpg", "D:/图集岛爬虫Preview/49551/16.jpg", "D:/图集岛爬虫Preview/49551/65.jpg", "D:/图集岛爬虫Preview/49559/6.jpg", "D:/图集岛爬虫Preview/49562/75.jpg", "D:/图集岛爬虫Preview/49563/42.jpg", "D:/图集岛爬虫Preview/49563/55.jpg", "D:/图集岛爬虫Preview/49563/9.jpg", "D:/图集岛爬虫Preview/49565/17.jpg", "D:/图集岛爬虫Preview/49574/1.jpg", "D:/图集岛爬虫Preview/49574/4.jpg", 
"D:/图集岛爬虫Preview/49578/12.jpg", "D:/图集岛爬虫Preview/49578/15.jpg", "D:/图集岛爬虫Preview/49578/27.jpg", "D:/图集岛爬虫Preview/49583/28.jpg", "D:/图集岛爬虫Preview/49591/64.jpg", "D:/图集岛爬虫Preview/49598/10.jpg", "D:/图集岛爬虫Preview/49599/38.jpg", "D:/图集岛爬虫Preview/49601/42.jpg", "D:/图集岛爬虫Preview/49602/64.jpg", "D:/图集岛爬虫Preview/49602/75.jpg", "D:/图集岛爬虫Preview/49605/31.jpg", "D:/图集岛爬虫Preview/49606/40.jpg", "D:/图集岛爬虫Preview/49608/15.jpg", "D:/图集岛爬虫Preview/49611/27.jpg", "D:/图集岛爬虫Preview/49611/45.jpg", "D:/图集岛爬虫Preview/49614/36.jpg", "D:/图集岛爬虫Preview/49627/28.jpg", "D:/图集岛爬虫Preview/49627/7.jpg", "D:/图集岛爬虫Preview/49628/51.jpg", "D:/图集岛爬虫Preview/49628/76.jpg", "D:/图集岛爬虫Preview/49629/47.jpg", "D:/图集岛爬虫Preview/49633/11.jpg", "D:/图集岛爬虫Preview/49637/5.jpg", "D:/图集岛爬虫Preview/49639/24.jpg", "D:/图集岛爬虫Preview/49640/12.jpg", "D:/图集岛爬虫Preview/49645/21.jpg", "D:/图集岛爬虫Preview/49646/41.jpg", "D:/图集岛爬虫Preview/49653/12.jpg", "D:/图集岛爬虫Preview/49658/11.jpg", "D:/图集岛爬虫Preview/49665/2.jpg", "D:/图集岛爬虫Preview/49666/72.jpg", "D:/图集岛爬虫Preview/49671/4.jpg", "D:/图集岛爬虫Preview/49672/22.jpg", "D:/图集岛爬虫Preview/49677/25.jpg", "D:/图集岛爬虫Preview/49677/27.jpg", "D:/图集岛爬虫Preview/49691/23.jpg", "D:/图集岛爬虫Preview/49702/1.jpg", "D:/图集岛爬虫Preview/49704/9.jpg", "D:/图集岛爬虫Preview/49706/48.jpg", "D:/图集岛爬虫Preview/49713/5.jpg", "D:/图集岛爬虫Preview/49716/10.jpg", "D:/图集岛爬虫Preview/49721/18.jpg", "D:/图集岛爬虫Preview/49722/0.jpg", "D:/图集岛爬虫Preview/49722/28.jpg", "D:/图集岛爬虫Preview/49722/3.jpg", "D:/图集岛爬虫Preview/49722/31.jpg", "D:/图集岛爬虫Preview/49724/8.jpg", "D:/图集岛爬虫Preview/49725/2.jpg", "D:/图集岛爬虫Preview/49742/27.jpg", "D:/图集岛爬虫Preview/49750/5.jpg", "D:/图集岛爬虫Preview/49755/1.jpg", "D:/图集岛爬虫Preview/49763/52.jpg", "D:/图集岛爬虫Preview/49763/63.jpg", "D:/图集岛爬虫Preview/49770/38.jpg", "D:/图集岛爬虫Preview/49771/49.jpg", "D:/图集岛爬虫Preview/49774/38.jpg", "D:/图集岛爬虫Preview/49778/41.jpg", "D:/图集岛爬虫Preview/49779/11.jpg", "D:/图集岛爬虫Preview/49779/3.jpg", "D:/图集岛爬虫Preview/49780/21.jpg", "D:/图集岛爬虫Preview/49784/46.jpg", "D:/图集岛爬虫Preview/49789/25.jpg", 
"D:/图集岛爬虫Preview/49789/36.jpg", "D:/图集岛爬虫Preview/49791/47.jpg", "D:/图集岛爬虫Preview/49792/13.jpg", "D:/图集岛爬虫Preview/49804/15.jpg", "D:/图集岛爬虫Preview/49805/14.jpg", "D:/图集岛爬虫Preview/49805/24.jpg", "D:/图集岛爬虫Preview/49805/4.jpg", "D:/图集岛爬虫Preview/49805/7.jpg", "D:/图集岛爬虫Preview/49815/49.jpg", "D:/图集岛爬虫Preview/49816/75.jpg", "D:/图集岛爬虫Preview/49818/6.jpg", "D:/图集岛爬虫Preview/49819/26.jpg", "D:/图集岛爬虫Preview/49822/13.jpg", "D:/图集岛爬虫Preview/49822/18.jpg", "D:/图集岛爬虫Preview/49823/11.jpg", "D:/图集岛爬虫Preview/49826/15.jpg", "D:/图集岛爬虫Preview/49830/62.jpg", "D:/图集岛爬虫Preview/49836/2.jpg", "D:/图集岛爬虫Preview/49842/20.jpg", "D:/图集岛爬虫Preview/49842/35.jpg", "D:/图集岛爬虫Preview/49858/45.jpg", "D:/图集岛爬虫Preview/49859/1.jpg", "D:/图集岛爬虫Preview/49863/46.jpg", "D:/图集岛爬虫Preview/49863/5.jpg", "D:/图集岛爬虫Preview/49870/17.jpg", "D:/图集岛爬虫Preview/49874/6.jpg", "D:/图集岛爬虫Preview/49878/46.jpg", "D:/图集岛爬虫Preview/49879/15.jpg", "D:/图集岛爬虫Preview/49885/20.jpg", "D:/图集岛爬虫Preview/49891/13.jpg", "D:/图集岛爬虫Preview/49891/68.jpg", "D:/图集岛爬虫Preview/49895/69.jpg", "D:/图集岛爬虫Preview/49900/18.jpg", "D:/图集岛爬虫Preview/49900/33.jpg", "D:/图集岛爬虫Preview/49901/14.jpg", "D:/图集岛爬虫Preview/49905/71.jpg", "D:/图集岛爬虫Preview/49909/5.jpg", "D:/图集岛爬虫Preview/49910/32.jpg", "D:/图集岛爬虫Preview/49916/39.jpg", "D:/图集岛爬虫Preview/49933/29.jpg", "D:/图集岛爬虫Preview/49933/41.jpg", "D:/图集岛爬虫Preview/49936/32.jpg", "D:/图集岛爬虫Preview/49942/46.jpg", "D:/图集岛爬虫Preview/49954/79.jpg", "D:/图集岛爬虫Preview/49960/54.jpg", "D:/图集岛爬虫Preview/49962/21.jpg", "D:/图集岛爬虫Preview/49962/34.jpg", "D:/图集岛爬虫Preview/49964/7.jpg", "D:/图集岛爬虫Preview/49967/9.jpg", "D:/图集岛爬虫Preview/49969/1.jpg", "D:/图集岛爬虫Preview/49969/55.jpg", "D:/图集岛爬虫Preview/49969/85.jpg", "D:/图集岛爬虫Preview/49975/40.jpg", "D:/图集岛爬虫Preview/49976/3.jpg", "D:/图集岛爬虫Preview/49977/19.jpg", "D:/图集岛爬虫Preview/49978/25.jpg", "D:/图集岛爬虫Preview/49978/38.jpg", "D:/图集岛爬虫Preview/49978/51.jpg", "D:/图集岛爬虫Preview/49979/18.jpg", "D:/图集岛爬虫Preview/49980/4.jpg", "D:/图集岛爬虫Preview/49984/16.jpg", "D:/图集岛爬虫Preview/49984/7.jpg", 
"D:/图集岛爬虫Preview/49986/48.jpg", "D:/图集岛爬虫Preview/49988/24.jpg", "D:/图集岛爬虫Preview/49999/48.jpg", "D:/图集岛爬虫Preview/49999/53.jpg", "D:/图集岛爬虫Preview/49999/57.jpg", "D:/图集岛爬虫Preview/49999/64.jpg", "D:/图集岛爬虫Preview/49999/8.jpg", "D:/图集岛爬虫Preview/50000/16.jpg", "D:/图集岛爬虫Preview/50000/17.jpg", "D:/图集岛爬虫Preview/50002/38.jpg", "D:/图集岛爬虫Preview/50003/20.jpg", "D:/图集岛爬虫Preview/50004/20.jpg", "D:/图集岛爬虫Preview/50004/7.jpg", "D:/图集岛爬虫Preview/50006/42.jpg", "D:/图集岛爬虫Preview/50007/9.jpg", "D:/图集岛爬虫Preview/50012/75.jpg", "D:/图集岛爬虫Preview/50021/25.jpg", "D:/图集岛爬虫Preview/50022/46.jpg", "D:/图集岛爬虫Preview/50026/4.jpg", "D:/图集岛爬虫Preview/50028/1.jpg", "D:/图集岛爬虫Preview/50028/4.jpg", "D:/图集岛爬虫Preview/50030/19.jpg", "D:/图集岛爬虫Preview/50031/6.jpg", "D:/图集岛爬虫Preview/50032/1.jpg", "D:/图集岛爬虫Preview/50035/38.jpg", "D:/图集岛爬虫Preview/50038/18.jpg", "D:/图集岛爬虫Preview/50038/2.jpg", "D:/图集岛爬虫Preview/50038/34.jpg", "D:/图集岛爬虫Preview/50042/28.jpg", "D:/图集岛爬虫Preview/50042/38.jpg", "D:/图集岛爬虫Preview/50043/6.jpg", "D:/图集岛爬虫Preview/50048/10.jpg", "D:/图集岛爬虫Preview/50048/13.jpg", "D:/图集岛爬虫Preview/50048/18.jpg", "D:/图集岛爬虫Preview/50048/3.jpg", "D:/图集岛爬虫Preview/50048/35.jpg", "D:/图集岛爬虫Preview/50048/4.jpg", "D:/图集岛爬虫Preview/50053/10.jpg", "D:/图集岛爬虫Preview/50053/19.jpg", "D:/图集岛爬虫Preview/50053/39.jpg", "D:/图集岛爬虫Preview/50053/8.jpg", "D:/图集岛爬虫Preview/50053/9.jpg", "D:/图集岛爬虫Preview/50066/39.jpg", "D:/图集岛爬虫Preview/50068/2.jpg", "D:/图集岛爬虫Preview/50069/63.jpg", "D:/图集岛爬虫Preview/50069/64.jpg", "D:/图集岛爬虫Preview/50071/43.jpg", "D:/图集岛爬虫Preview/50077/41.jpg", "D:/图集岛爬虫Preview/50083/3.jpg", "D:/图集岛爬虫Preview/50088/19.jpg", "D:/图集岛爬虫Preview/50089/1.jpg", "D:/图集岛爬虫Preview/50091/15.jpg", "D:/图集岛爬虫Preview/50091/19.jpg", "D:/图集岛爬虫Preview/50100/36.jpg", "D:/图集岛爬虫Preview/50101/18.jpg", "D:/图集岛爬虫Preview/50101/2.jpg", "D:/图集岛爬虫Preview/50102/8.jpg", "D:/图集岛爬虫Preview/50103/24.jpg", "D:/图集岛爬虫Preview/50106/61.jpg", "D:/图集岛爬虫Preview/50109/22.jpg", "D:/图集岛爬虫Preview/50110/11.jpg", "D:/图集岛爬虫Preview/50113/53.jpg", "D:/图集岛爬虫Preview/50114/23.jpg", 
"D:/图集岛爬虫Preview/50114/4.jpg",
            "D:/图集岛爬虫Preview/50117/1.jpg",
            "D:/图集岛爬虫Preview/50120/13.jpg",
            "D:/图集岛爬虫Preview/50121/4.jpg",
            "D:/图集岛爬虫Preview/50121/6.jpg",
            "D:/图集岛爬虫Preview/50125/0.jpg",
            "D:/图集岛爬虫Preview/50126/39.jpg",
            "D:/图集岛爬虫Preview/50131/30.jpg",
            "D:/图集岛爬虫Preview/50140/12.jpg",
            "D:/图集岛爬虫Preview/50142/9.jpg",
            "D:/图集岛爬虫Preview/50149/12.jpg",
            "D:/图集岛爬虫Preview/50151/57.jpg",
            "D:/图集岛爬虫Preview/50154/54.jpg",
            "D:/图集岛爬虫Preview/50160/19.jpg",
            "D:/图集岛爬虫Preview/50160/21.jpg",
            "D:/图集岛爬虫Preview/50167/33.jpg",
            "D:/图集岛爬虫Preview/50170/16.jpg",
            "D:/图集岛爬虫Preview/50170/2.jpg",
            "D:/图集岛爬虫Preview/50174/22.jpg",
            "D:/图集岛爬虫Preview/50176/4.jpg",
            "D:/图集岛爬虫Preview/50177/19.jpg",
            "D:/图集岛爬虫Preview/50179/7.jpg",
            "D:/图集岛爬虫Preview/50181/1.jpg",
            "D:/图集岛爬虫Preview/50181/8.jpg",
            "D:/图集岛爬虫Preview/50183/4.jpg",
            "D:/图集岛爬虫Preview/50189/16.jpg",
            "D:/图集岛爬虫Preview/50192/17.jpg",
            "D:/图集岛爬虫Preview/50192/24.jpg",
            "D:/图集岛爬虫Preview/50192/36.jpg",
            "D:/图集岛爬虫Preview/50193/26.jpg",
            "D:/图集岛爬虫Preview/50194/8.jpg",
            "D:/图集岛爬虫Preview/50319/22.jpg"
    };

    /**
     * Downloads every file listed in {@code jpgs} again, writing it to the listed local path.
     *
     * <p>The album id and file name are the last two segments of the local path
     * ({@code .../<albumId>/<fileName>}). Reading them from the end of the path keeps the remote
     * URL correct whatever drive or folder prefix the entry carries.
     */
    private static void reDownload() {
        for (String jpg : jpgs) {
            String[] segments = jpg.split("/");
            if (segments.length < 2) {
                // Not of the form <albumId>/<fileName>; no URL can be derived.
                continue;
            }
            String fileName = segments[segments.length - 1];
            String albumId = segments[segments.length - 2];
            String onlinePath = "https://tjg.gzhuibei.com/a/1/" + albumId + "/" + fileName;
            SpiderUtil.ioDownload2Times(onlinePath, jpg, 3);
        }
    }

    /**
     * Prints the path of every zero-length jpg found directly inside the sub-folders of the
     * preview directory. Nothing is deleted; the delete call is commented out.
     */
    private static void delSize0() {
        String part = "图集岛爬虫Preview";
        String directoryStr = "F:/" + part + "/";
        File directory = new File(directoryStr);
        File[] folders = directory.listFiles((FilenameFilter) DirectoryFileFilter.DIRECTORY);
        if (folders == null) {
            return;
        }
        for (File folder : folders) {
            String folderName = folder.getName();
            String subDirectoryStr = directoryStr + folderName;
            File subDirectory = new File(subDirectoryStr);
            Collection<File> jpgFiles = FileUtils.listFiles(subDirectory, new String[]{"jpg"}, false);
            for (File jpg : jpgFiles) {
                if (jpg.length() == 0) {
                    // jpg.delete();
                    System.out.println("size0 jpg=" + subDirectoryStr + "/" + jpg.getName());
                }
            }
        }
    }

    /**
     * Ad-hoc maintenance entry point: prints the names of album folders that have no zip file of
     * the same name in the zip directory.
     */
    public static void main(String[] args) {
        // reDownload();
        // delSize0();
        Set<String> zipFileNames = new HashSet<>();
        Set<String> fileNames = new HashSet<>();

        File zipDirectory = new File("F:/zip图集岛爬虫(40001-50000)/");
        File directory = new File("F:/图集岛爬虫(40001-50000)/");

        Collection<File> zipFiles = FileUtils.listFiles(zipDirectory, new String[]{"zip"}, false);
        for (File folder : zipFiles) {
            String folderName = folder.getName();
            zipFileNames.add(folderName.replace(".zip", ""));
        }

        File[] folders = directory.listFiles((FilenameFilter) DirectoryFileFilter.DIRECTORY);
        if (folders == null) {
            return;
        }
        for (File folder : folders) {
            String folderName = folder.getName();
            fileNames.add(folderName);
        }

        // What remains are folders without a matching zip.
        fileNames.removeAll(zipFileNames);
        System.out.println(fileNames);
    }
}

================================================
FILE: oys-tujidao/src/main/java/com/devyy/oys/tujidao/dao/TuJiDaoAlbumMapper.java
================================================
package com.devyy.oys.tujidao.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.devyy.oys.tujidao.TuJiDaoAlbumDO;
import org.springframework.stereotype.Repository;

/**
 * Mybatis-Plus Mapper
 *
 * @since 2019-12-01
 */
@Repository
public interface TuJiDaoAlbumMapper extends BaseMapper<TuJiDaoAlbumDO> {
}

================================================
FILE: oys-tujidao/src/test/java/com/devyy/oys/tujidao/TuJiDaoServiceImplTest.java
================================================
package com.devyy.oys.tujidao;

import com.devyy.oys.tujidao.dao.TuJiDaoAlbumMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.MockedStatic;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import
java.nio.charset.StandardCharsets;
import java.util.ArrayList;

/**
 * TuJiDaoServiceImpl UT
 *
 * @since 2022-01-18
 */
@ExtendWith(MockitoExtension.class)
public class TuJiDaoServiceImplTest {
    @InjectMocks
    private final TuJiDaoServiceImpl tuJiDaoService = new TuJiDaoServiceImpl();

    @Mock
    private TuJiDaoAlbumMapper tuJiDaoAlbumMapper;

    /**
     * Feeds the stored page {@code /gengxin2.html} to {@code doSyncRecords()} through a static
     * Jsoup mock and checks that at least one album is inserted.
     */
    @Test
    public void doSyncRecords() throws IOException, java.net.URISyntaxException {
        // Load the fixture page. toURI() decodes percent-encoded characters (spaces, non-ASCII
        // folder names) that getPath() would leave in the file name.
        URL url = getClass().getResource("/gengxin2.html");
        Assertions.assertNotNull(url);
        File file = new File(url.toURI());
        Document document = Jsoup.parse(file, StandardCharsets.UTF_8.name());

        // The static mock is thread-bound until closed; try-with-resources releases it even
        // when an assertion fails, so it cannot leak into other tests.
        try (MockedStatic<Jsoup> jsoupMockedStatic = Mockito.mockStatic(Jsoup.class, Mockito.RETURNS_DEEP_STUBS)) {
            // mock Jsoup.connect().cookies().get()
            jsoupMockedStatic.when(() -> Jsoup.connect(Mockito.anyString()).cookies(Mockito.anyMap()).get())
                    .thenReturn(document);

            // mock tuJiDaoAlbumMapper.selectByMap(): no album exists yet
            Mockito.when(tuJiDaoAlbumMapper.selectByMap(Mockito.anyMap())).thenReturn(new ArrayList<>());

            // mock tuJiDaoAlbumMapper.insert()
            Mockito.when(tuJiDaoAlbumMapper.insert(Mockito.any())).thenReturn(1);

            // doTest
            tuJiDaoService.doSyncRecords();

            // verify
            jsoupMockedStatic.verify(() -> Jsoup.connect(Mockito.anyString()), Mockito.atLeastOnce());
            Mockito.verify(tuJiDaoAlbumMapper, Mockito.atLeastOnce()).insert(Mockito.any());
        }
    }
}

================================================
FILE: oys-tujidao/src/test/resources/gengxin.html
================================================
更新记录_图集岛
  • 标签类型
  • 机构厂商
  • 地区
极品 美少女 御姐 女神 日本少女 嫩模 妹子 尤物 正妹 熟女 乙女 女星 阳光 网红 少FU 日本少FU 泳装少女 软妹 日本妹子 萌女 车模 女优 少女 日本嫩模 清纯少女 性感少女 混血 黑长直 明星 杂志 欧美熟女 金发 黑人 女郎 轻熟女 巨RU女优 女友 白领 萝莉
风格
可爱 甜美 清纯 性感 气质 妩媚 清新 唯美 美胸 诱惑 私房 翘T 大胆 福利 大胸 高贵 粉嫩 惊艳 白嫩 养眼 巨RU 情趣 短发 丰T 丰满 巨RU萝莉 勾魂 半裸 欧美大胸 蕾丝诱惑 制服诱惑 丰满少F 风骚少F 风骚 顶级少F 美T少F 肥TUN 欧美巨RU 学生 肉感 双马尾 大尺度 骨感 长发 冷艳 包T 美T 爆RU 魅惑 优雅 美RU
丝 · 腿 · 足
丝袜女郎 黑丝萝莉 白丝萝莉 长筒袜 丝袜 黑丝 丝袜美腿 灰丝 美腿 高跟 腿模 丝袜诱惑 网袜 丝足 白丝 肉丝 红丝 吊带丝袜 美脚 玉足 情趣丝袜 美足 网衣 长腿 凉高跟 高跟美腿 欧美制服 欧美丝袜 街拍美T 丝袜美T 美丝 丝足高跟 丝足诱惑 街拍丝袜 丝袜人体 丝袜短裙 街拍短裙 丝袜制服 街拍美腿 街拍高跟 丝袜肥T 街拍黑丝 黑丝制服 黑丝美腿 丝袜熟女 丝袜少F 肉丝美腿 黑丝诱惑 情趣制服
情趣
薄纱 剧情 肚兜 旗袍 空姐 赛车女郎 护士 运动 女仆 兔女郎 制服 COS 猫女 美束 古典 SM 圣诞 湿身 人体 厨娘 JK 女警 老师 医生 新年 捆绑 绳艺 眼镜 不知火舞 COSER 清纯护士 纹身 OL 秘书 调教 镣铐 姐妹花 透视
场景
桌球 浴室 篮球 拳击 足球 居家 办公室 街拍 外拍 海边 展台 钢管 床上 酒店 健身 户外 香车美人 沙滩
服装
连衣裙 泳装 校服 长裙 热裤 睡衣 蕾丝 超短裙 和服 牛仔 水手服 比基尼 丁字裤 内衣 高叉 职业装 死库水 婚纱 豹纹 衬衫 紧身裤 体操服 皮衣 西装 学生制服 日本制服 迷你裙 毛衣 长靴 吊带 抹胸 背心

公告:春节放假停更,2021.02.10 - 02.19 节后恢复正常更新,祝所有用户新年快乐,感谢你们的支持 。

更新记录
================================================ FILE: oys-tujidao/src/test/resources/gengxin2.html ================================================ 更新记录_图集岛
极品 美少女 御姐 女神 日本少女 嫩模 妹子 尤物 正妹 熟女 乙女 女星 阳光 网红 少FU 日本少FU 泳装少女 软妹 日本妹子 萌女 车模 女优 少女 日本嫩模 清纯少女 性感少女 混血 黑长直 明星 杂志 欧美熟女 金发 黑人 女郎 轻熟女 巨RU女优 女友 白领 萝莉
风格
可爱 甜美 清纯 性感 气质 妩媚 清新 唯美 美胸 诱惑 私房 翘T 大胆 福利 大胸 高贵 粉嫩 惊艳 白嫩 养眼 巨RU 情趣 短发 丰T 丰满 巨RU萝莉 勾魂 半裸 欧美大胸 蕾丝诱惑 制服诱惑 丰满少F 风骚少F 风骚 顶级少F 美T少F 肥TUN 欧美巨RU 学生 肉感 双马尾 大尺度 骨感 长发 冷艳 包T 美T 爆RU 魅惑 优雅 美RU
丝 · 腿 · 足
丝袜女郎 黑丝萝莉 白丝萝莉 长筒袜 丝袜 黑丝 丝袜美腿 灰丝 美腿 高跟 腿模 丝袜诱惑 网袜 丝足 白丝 肉丝 红丝 吊带丝袜 美脚 玉足 情趣丝袜 美足 网衣 长腿 凉高跟 高跟美腿 欧美制服 欧美丝袜 街拍美T 丝袜美T 美丝 丝足高跟 丝足诱惑 街拍丝袜 丝袜人体 丝袜短裙 街拍短裙 丝袜制服 街拍美腿 街拍高跟 丝袜肥T 街拍黑丝 黑丝制服 黑丝美腿 丝袜熟女 丝袜少F 肉丝美腿 黑丝诱惑 情趣制服
情趣
薄纱 剧情 肚兜 旗袍 空姐 赛车女郎 护士 运动 女仆 兔女郎 制服 COS 猫女 美束 古典 SM 圣诞 湿身 人体 厨娘 JK 女警 老师 医生 新年 捆绑 绳艺 眼镜 不知火舞 COSER 清纯护士 纹身 OL 秘书 调教 镣铐 姐妹花 透视
场景
桌球 浴室 篮球 拳击 足球 居家 办公室 街拍 外拍 海边 展台 钢管 床上 酒店 健身 户外 香车美人 沙滩
服装
连衣裙 泳装 校服 长裙 热裤 睡衣 蕾丝 超短裙 和服 牛仔 水手服 比基尼 丁字裤 内衣 高叉 职业装 死库水 婚纱 豹纹 衬衫 紧身裤 体操服 皮衣 西装 学生制服 日本制服 迷你裙 毛衣 长靴 吊带 抹胸 背心
全部图集 / 更新记录
跳转至
================================================ FILE: pom.xml ================================================ 4.0.0 com.devyy openyspider ${revision} pom oys-core oys-tujidao oys-boot-starter oys-codeforces 2.7.5 1.18.24 3.5.2 3.0.0 2.11.0 1.15.3 4.1.4 1.3.0 2.22.2 2.22.2 0.8.8 4.0-SNAPSHOT 17 17 17 UTF-8 org.springframework.boot spring-boot-dependencies ${org.springframework.boot.spring-boot-dependencies.version} pom import org.seleniumhq.selenium selenium-api ${org.seleniumhq.selenium.selenium-chrome-driver.version} org.seleniumhq.selenium selenium-remote-driver ${org.seleniumhq.selenium.selenium-chrome-driver.version} org.springframework.boot spring-boot-starter-web org.springframework.boot spring-boot-starter-test org.mockito mockito-inline org.jsoup jsoup ${org.jsoup.jsoup.version} org.projectlombok lombok ${org.projectlombok.lombok.version} mysql mysql-connector-java com.baomidou mybatis-plus-boot-starter ${com.baomidou.mybatis-plus-boot-starter.version} io.springfox springfox-boot-starter ${io.springfox.springfox-boot-starter.version} commons-io commons-io ${commons-io.commons-io.version} org.seleniumhq.selenium selenium-chrome-driver ${org.seleniumhq.selenium.selenium-chrome-driver.version} org.springframework.kafka spring-kafka org.springframework.boot spring-boot-starter-data-redis com.kennycason kumo-core 1.28 com.kennycason kumo-api 1.28 org.codehaus.mojo flatten-maven-plugin ${flatten-maven-plugin.version} true resolveCiFriendliesOnly expand flatten process-resources flatten flatten.clean clean clean org.apache.maven.plugins maven-surefire-plugin ${maven-surefire-plugin.version} org.apache.maven.plugins maven-failsafe-plugin ${maven-failsafe-plugin.version} org.jacoco jacoco-maven-plugin ${jacoco-maven-plugin.version} prepare-agent ================================================ FILE: sql_scripts/oys3_all_scripts_mysql.sql ================================================ /*==============================================================*/ /* Table: 
OYS_TUJIDAO_ALBUM_T                                    */
/*==============================================================*/
-- One row per album found on the list pages.
CREATE TABLE OYS_TUJIDAO_ALBUM_T
(
    ID                BIGINT NOT NULL AUTO_INCREMENT COMMENT '主键 ID',
    -- Holds the StateTypeEnum sequence number written by the service.
    STATE             INT COMMENT '状态',
    -- Image count; the migration step treats -1 as "unknown" and counts the files on disk.
    TOTAL             INT COMMENT '图片总数',
    ALBUM_NAME        VARCHAR(255) COMMENT '相册名',
    ALBUM_ID          INT COMMENT '相册id',
    CREATION_DATE     DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    -- ON UPDATE keeps the column current when a row is updated without setting it explicitly.
    LAST_UPDATED_DATE DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后修改时间',
    PRIMARY KEY (ID),
    -- The sync step checks for an existing row and then inserts; a UNIQUE key makes the
    -- database enforce one row per album, as the index name already promises.
    UNIQUE KEY ALBUM_ID_UNIQUE (ALBUM_ID)
);

/*==============================================================*/
/* Table: OYS_CODEFORCES                                        */
/*==============================================================*/
CREATE TABLE OYS_CODEFORCES
(
    SUBMISSION_ID VARCHAR(32) NOT NULL COMMENT '提交 ID',
    NO            NUMERIC(11,0) COMMENT '序号',
    FRAGMENT      VARCHAR(1024) COMMENT '片段值',
    PRIMARY KEY (SUBMISSION_ID)
);