Repository: ssssssss-team/spider-flow
Branch: master
Commit: c799cca99c7d
Files: 213
Total size: 4.0 MB
Directory structure:
gitextract_2yt2vx0u/
├── .gitattributes
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── db/
│ └── spiderflow.sql
├── pom.xml
├── spider-flow-api/
│ ├── pom.xml
│ └── src/
│ └── main/
│ └── java/
│ └── org/
│ └── spiderflow/
│ ├── ExpressionEngine.java
│ ├── Grammerable.java
│ ├── annotation/
│ │ ├── Comment.java
│ │ ├── Example.java
│ │ └── Return.java
│ ├── common/
│ │ └── CURDController.java
│ ├── concurrent/
│ │ ├── ChildPriorThreadSubmitStrategy.java
│ │ ├── LinkedThreadSubmitStrategy.java
│ │ ├── ParentPriorThreadSubmitStrategy.java
│ │ ├── RandomThreadSubmitStrategy.java
│ │ ├── SpiderFlowThreadPoolExecutor.java
│ │ ├── SpiderFutureTask.java
│ │ └── ThreadSubmitStrategy.java
│ ├── context/
│ │ ├── CookieContext.java
│ │ ├── SpiderContext.java
│ │ └── SpiderContextHolder.java
│ ├── enums/
│ │ ├── FlowNoticeType.java
│ │ └── FlowNoticeWay.java
│ ├── executor/
│ │ ├── FunctionExecutor.java
│ │ ├── FunctionExtension.java
│ │ ├── PluginConfig.java
│ │ └── ShapeExecutor.java
│ ├── expression/
│ │ └── DynamicMethod.java
│ ├── io/
│ │ ├── Line.java
│ │ ├── RandomAccessFileReader.java
│ │ └── SpiderResponse.java
│ ├── listener/
│ │ └── SpiderListener.java
│ ├── model/
│ │ ├── Grammer.java
│ │ ├── JsonBean.java
│ │ ├── Plugin.java
│ │ ├── Shape.java
│ │ ├── SpiderLog.java
│ │ ├── SpiderNode.java
│ │ └── SpiderOutput.java
│ └── utils/
│ └── Maps.java
├── spider-flow-core/
│ ├── pom.xml
│ └── src/
│ └── main/
│ └── java/
│ └── org/
│ └── spiderflow/
│ └── core/
│ ├── Spider.java
│ ├── executor/
│ │ ├── function/
│ │ │ ├── Base64FunctionExecutor.java
│ │ │ ├── DateFunctionExecutor.java
│ │ │ ├── ExtractFunctionExecutor.java
│ │ │ ├── FileFunctionExecutor.java
│ │ │ ├── JsonFunctionExecutor.java
│ │ │ ├── ListFunctionExecutor.java
│ │ │ ├── MD5FunctionExecutor.java
│ │ │ ├── RandomFunctionExecutor.java
│ │ │ ├── StringFunctionExecutor.java
│ │ │ ├── ThreadFunctionExecutor.java
│ │ │ ├── UrlFunctionExecutor.java
│ │ │ └── extension/
│ │ │ ├── ArrayFunctionExtension.java
│ │ │ ├── DateFunctionExtension.java
│ │ │ ├── ElementFunctionExtension.java
│ │ │ ├── ElementsFunctionExtension.java
│ │ │ ├── ListFunctionExtension.java
│ │ │ ├── MapFunctionExtension.java
│ │ │ ├── ObjectFunctionExtension.java
│ │ │ ├── ResponseFunctionExtension.java
│ │ │ ├── SqlRowSetExtension.java
│ │ │ └── StringFunctionExtension.java
│ │ └── shape/
│ │ ├── CommentExecutor.java
│ │ ├── ExecuteSQLExecutor.java
│ │ ├── ForkJoinExecutor.java
│ │ ├── FunctionExecutor.java
│ │ ├── LoopExecutor.java
│ │ ├── OutputExecutor.java
│ │ ├── ProcessExecutor.java
│ │ ├── RequestExecutor.java
│ │ ├── StartExecutor.java
│ │ └── VariableExecutor.java
│ ├── expression/
│ │ ├── DefaultExpressionEngine.java
│ │ ├── ExpressionError.java
│ │ ├── ExpressionGlobalVariables.java
│ │ ├── ExpressionTemplate.java
│ │ ├── ExpressionTemplateContext.java
│ │ ├── interpreter/
│ │ │ ├── AstInterpreter.java
│ │ │ ├── JavaReflection.java
│ │ │ └── Reflection.java
│ │ └── parsing/
│ │ ├── Ast.java
│ │ ├── CharacterStream.java
│ │ ├── Parser.java
│ │ ├── Span.java
│ │ ├── Token.java
│ │ ├── TokenStream.java
│ │ ├── TokenType.java
│ │ └── Tokenizer.java
│ ├── io/
│ │ ├── HttpRequest.java
│ │ └── HttpResponse.java
│ ├── job/
│ │ ├── SpiderJob.java
│ │ ├── SpiderJobContext.java
│ │ └── SpiderJobManager.java
│ ├── mapper/
│ │ ├── DataSourceMapper.java
│ │ ├── FlowNoticeMapper.java
│ │ ├── FunctionMapper.java
│ │ ├── SpiderFlowMapper.java
│ │ ├── TaskMapper.java
│ │ └── VariableMapper.java
│ ├── model/
│ │ ├── DataSource.java
│ │ ├── FlowNotice.java
│ │ ├── Function.java
│ │ ├── SpiderFlow.java
│ │ ├── Task.java
│ │ └── Variable.java
│ ├── script/
│ │ └── ScriptManager.java
│ ├── serializer/
│ │ └── FastJsonSerializer.java
│ ├── service/
│ │ ├── DataSourceService.java
│ │ ├── FlowNoticeService.java
│ │ ├── FunctionService.java
│ │ ├── SpiderFlowService.java
│ │ ├── TaskService.java
│ │ └── VariableService.java
│ └── utils/
│ ├── DataSourceUtils.java
│ ├── EmailUtils.java
│ ├── ExecutorsUtils.java
│ ├── ExpressionUtils.java
│ ├── ExtractUtils.java
│ ├── FileUtils.java
│ └── SpiderFlowUtils.java
└── spider-flow-web/
├── pom.xml
└── src/
└── main/
├── java/
│ └── org/
│ └── spiderflow/
│ ├── SpiderApplication.java
│ ├── configuration/
│ │ ├── ResourcesConfiguration.java
│ │ └── WebSocketConfiguration.java
│ ├── controller/
│ │ ├── DataSourceController.java
│ │ ├── FlowNoticeController.java
│ │ ├── FunctionController.java
│ │ ├── SpiderFlowController.java
│ │ ├── SpiderRestController.java
│ │ ├── TaskController.java
│ │ └── VariableController.java
│ ├── logback/
│ │ ├── SpiderFlowFileAppender.java
│ │ └── SpiderFlowWebSocketAppender.java
│ ├── model/
│ │ ├── SpiderWebSocketContext.java
│ │ └── WebSocketEvent.java
│ └── websocket/
│ └── WebSocketEditorServer.java
└── resources/
├── application.properties
├── logback-spring.xml
└── static/
├── css/
│ ├── easyui.css
│ ├── editor.css
│ ├── index.css
│ ├── layui-black-gray.css
│ └── layui-blue.css
├── datasource-edit.html
├── datasources.html
├── editCron.html
├── editor.html
├── function-edit.html
├── functions.html
├── index.html
├── js/
│ ├── canvas-viewer.js
│ ├── codemirror/
│ │ ├── codemirror.css
│ │ ├── codemirror.js
│ │ ├── dracula.css
│ │ ├── idea.css
│ │ ├── javascript.js
│ │ ├── placeholder.js
│ │ ├── show-hint.css
│ │ ├── show-hint.js
│ │ ├── spiderflow-hint.js
│ │ ├── spiderflow.js
│ │ └── sql.js
│ ├── common.js
│ ├── cron/
│ │ └── cron.js
│ ├── editor.js
│ ├── index.js
│ ├── jsontree/
│ │ ├── jsontree.css
│ │ └── jsontree.js
│ ├── layui/
│ │ ├── css/
│ │ │ ├── layui.css
│ │ │ ├── layui.mobile.css
│ │ │ └── modules/
│ │ │ ├── code.css
│ │ │ ├── laydate/
│ │ │ │ └── default/
│ │ │ │ └── laydate.css
│ │ │ └── layer/
│ │ │ └── default/
│ │ │ └── layer.css
│ │ ├── ext/
│ │ │ ├── eleTree/
│ │ │ │ ├── eleTree.css
│ │ │ │ └── eleTree.js
│ │ │ └── treeselect/
│ │ │ └── treeselect.js
│ │ ├── extends/
│ │ │ ├── formSelects-v4.css
│ │ │ ├── formSelects-v4.js
│ │ │ └── treeGrid.js
│ │ └── layui.all.js
│ ├── log-viewer.js
│ ├── mxgraph/
│ │ ├── css/
│ │ │ ├── common.css
│ │ │ └── explorer.css
│ │ ├── mxgraph.js
│ │ └── resources/
│ │ ├── editor.txt
│ │ ├── editor_de.txt
│ │ ├── editor_zh.txt
│ │ ├── graph.txt
│ │ ├── graph_de.txt
│ │ └── graph_zh.txt
│ └── spider-editor.js
├── log.html
├── resources/
│ └── templates/
│ ├── comment.html
│ ├── edge.html
│ ├── executeSql.html
│ ├── forkJoin.html
│ ├── function.html
│ ├── loop.html
│ ├── output.html
│ ├── process.html
│ ├── request.html
│ ├── root.html
│ ├── start.html
│ └── variable.html
├── spiderList-notice.html
├── spiderList.html
├── task.html
├── variable-edit.html
└── variables.html
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
*.js linguist-language=java
*.css linguist-language=java
*.html linguist-language=java
================================================
FILE: .gitignore
================================================
target
*.iml
out/
.idea
.classpath
.project
.settings
bin/
.myeclipse
================================================
FILE: Dockerfile
================================================
# Runtime image for the spider-flow web application (packaged as spider-flow.jar).
# NOTE(review): the `java` image on Docker Hub is deprecated; it is kept here so the
# runtime does not change silently — confirm a maintained Java 8 base image and switch.
FROM java:8
# LABEL replaces the deprecated MAINTAINER instruction.
LABEL maintainer="octopus"
RUN mkdir -p /spider-flow
WORKDIR /spider-flow
# Port published by the container.
EXPOSE 8088
# COPY is preferred over ADD for a plain local file: no archive extraction or URL fetch is needed.
COPY ./spider-flow-web/target/spider-flow.jar ./
# Delay start-up by 30 seconds, then launch the application.
# NOTE(review): the delay presumably gives a dependent service (e.g. the database) time to start — confirm.
CMD sleep 30;java -Djava.security.egd=file:/dev/./urandom -jar spider-flow.jar
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2019 小东
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
[介绍](#介绍) | [特性](#特性) | [插件](#插件) | DEMO站点 | 文档 | 更新日志 | [截图](#项目部分截图) | [其它开源](#其它开源项目) | [免责声明](#免责声明)
## 介绍
平台以流程图的方式定义爬虫,是一个高度灵活可配置的爬虫平台
## 特性
- [x] 支持Xpath/JsonPath/css选择器/正则提取/混搭提取
- [x] 支持JSON/XML/二进制格式
- [x] 支持多数据源、SQL select/selectInt/selectOne/insert/update/delete
- [x] 支持爬取JS动态渲染(或ajax)的页面
- [x] 支持代理
- [x] 支持自动保存至数据库/文件
- [x] 常用字符串、日期、文件、加解密等函数
- [x] 支持插件扩展(自定义执行器,自定义方法)
- [x] 任务监控,任务日志
- [x] 支持HTTP接口
- [x] 支持Cookie自动管理
- [x] 支持自定义函数
## 插件
- [x] [Selenium插件](https://gitee.com/ssssssss-team/spider-flow-selenium)
- [x] [Redis插件](https://gitee.com/ssssssss-team/spider-flow-redis)
- [x] [OSS插件](https://gitee.com/ssssssss-team/spider-flow-oss)
- [x] [Mongodb插件](https://gitee.com/ssssssss-team/spider-flow-mongodb)
- [x] [IP代理池插件](https://gitee.com/ssssssss-team/spider-flow-proxypool)
- [x] [OCR识别插件](https://gitee.com/ssssssss-team/spider-flow-ocr)
- [x] [电子邮箱插件](https://gitee.com/ssssssss-team/spider-flow-mailbox)
## 项目部分截图
### 爬虫列表

### 爬虫测试

### Debug

### 日志

## 其它开源项目
- [spider-flow-vue,spider-flow的前端](https://gitee.com/ssssssss-team/spider-flow-vue)
- [magic-api,一个以XML为基础自动映射为HTTP接口的框架](https://gitee.com/ssssssss-team/magic-api)
- [magic-api-spring-boot-starter](https://gitee.com/ssssssss-team/magic-api-spring-boot-starter)
## 免责声明
请勿将`spider-flow`应用到任何可能会违反法律规定和道德约束的工作中,请友善使用`spider-flow`,遵守蜘蛛协议,不要将`spider-flow`用于任何非法用途。如您选择使用`spider-flow`即代表您遵守此协议,作者不承担任何由于您违反此协议带来任何的法律风险和损失,一切后果由您承担。
================================================
FILE: db/spiderflow.sql
================================================
-- Disable foreign-key checks for this session while the schema is (re)created.
-- NOTE(review): no statement in this script switches the checks back on.
SET FOREIGN_KEY_CHECKS=0;
-- Create the application database and select it for all following statements.
-- NOTE(review): there is no IF NOT EXISTS guard, so re-running the script against an
-- existing `spiderflow` database fails at this statement.
CREATE DATABASE spiderflow;
USE spiderflow;
-- Spider flow definitions: one row per flow, holding its XML definition, its cron
-- schedule and the scheduling bookkeeping columns (last/next execution, run count).
-- NOTE(review): the column comment on `cron` reads 'corn表达式'; it looks like a typo
-- for "cron". It is left untouched here because it is part of the stored schema text.
DROP TABLE IF EXISTS `sp_flow`;
CREATE TABLE `sp_flow` (
`id` varchar(32) NOT NULL,
`name` varchar(64) DEFAULT NULL COMMENT '任务名字',
`xml` longtext DEFAULT NULL COMMENT 'xml表达式',
`cron` varchar(255) DEFAULT NULL COMMENT 'corn表达式',
`enabled` char(1) DEFAULT '0' COMMENT '任务是否启动,默认未启动',
`create_date` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`last_execute_time` datetime DEFAULT NULL COMMENT '上一次执行时间',
`next_execute_time` datetime DEFAULT NULL COMMENT '下一次执行时间',
`execute_count` int(8) DEFAULT NULL COMMENT '定时执行的已执行次数',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '爬虫任务表';
INSERT INTO `sp_flow` VALUES ('b45fb98d2a564c23ba623a377d5e12e9', '爬取码云GVP', '\n \n \n \n {"spiderName":"爬取码云GVP","threadCount":""}\n \n \n \n \n \n \n {"shape":"start"}\n \n \n \n \n \n {"value":"抓取首页","loopVariableName":"","sleep":"","timeout":"","response-charset":"","method":"GET","body-type":"none","body-content-type":"text/plain","loopCount":"","url":"https://gitee.com/gvp/all","proxy":"","request-body":[""],"follow-redirect":"1","shape":"request"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"提取项目名、地址","loopVariableName":"","variable-name":["projectUrls","projectNames"],"loopCount":"","variable-value":["${extract.selectors(resp.html,'.categorical-project-card a','attr','href')}","${extract.selectors(resp.html,'.project-name')}"],"shape":"variable"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"抓取详情页","loopVariableName":"projectIndex","sleep":"","timeout":"","response-charset":"","method":"GET","body-type":"none","body-content-type":"text/plain","loopCount":"10","url":"https://gitee.com/${projectUrls[projectIndex]}","proxy":"","request-body":[""],"follow-redirect":"1","shape":"request"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"提取项目描述","loopVariableName":"","variable-name":["projectDesc"],"loopCount":"","variable-value":["${extract.selector(resp.html,'.git-project-desc-text')}"],"shape":"variable"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"输出","output-name":["项目名","项目地址","项目描述"],"output-value":["${projectNames[projectIndex]}","https://gitee.com${projectUrls[projectIndex]}","${projectDesc}"],"shape":"output"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n\n', null, '0', '2019-08-22 13:46:54', null, null, null);
INSERT INTO `sp_flow` VALUES ('f0a67f17ee1a498a9b2f4ca30556f3c3', '抓取每日菜价', '\n \n \n \n {"spiderName":"抓取每日菜价","threadCount":""}\n \n \n \n \n \n \n {"shape":"start"}\n \n \n \n \n \n {"value":"开始抓取","loopVariableName":"","sleep":"","timeout":"","response-charset":"","method":"GET","body-type":"none","body-content-type":"text/plain","loopCount":"","url":"http://www.beijingprice.cn:8086/price/priceToday/PageLoad/LoadPrice?jsoncallback=1","proxy":"","request-body":[""],"follow-redirect":"1","shape":"request"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"解析JSON","loopVariableName":"","variable-name":["jsonstr","jsondata","data"],"loopCount":"","variable-value":["${string.substring(resp.html,2,resp.html.length()-1)}","${json.parse(jsonstr)}","${extract.jsonpath(jsondata[0],'data')}"],"shape":"variable"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"输出","loopVariableName":"i","output-name":["菜名","菜价","单位"],"loopCount":"${list.length(data)}","output-value":["${data[i].ItemName}","${data[i].Price04}","${data[i].ItemUnit}"],"shape":"output"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n\n', null, '0', '2019-08-22 13:48:22', null, null, null);
INSERT INTO `sp_flow` VALUES ('b4430885ba8349588d1220d37eac831d', '爬取开源中国动弹', '\n \n \n \n {"spiderName":"爬取开源中国动弹","threadCount":""}\n \n \n \n \n \n \n {"shape":"start"}\n \n \n \n \n \n {"value":"爬取动弹","loopVariableName":"","sleep":"","timeout":"","response-charset":"","method":"GET","parameter-name":["type","lastLogId"],"body-type":"none","body-content-type":"text/plain","loopCount":"","url":"https://www.oschina.net/tweets/widgets/_tweet_index_list ","proxy":"","parameter-value":["ajax","${lastLogId}"],"request-body":"","follow-redirect":"1","tls-validate":"1","shape":"request"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"提取lastLogId以及tweets","loopVariableName":"","variable-name":["lastLogId","tweets","fetchCount"],"loopCount":"","variable-value":["${resp.selector('.tweet-item:last-child').attr('data-tweet-id')}","${resp.selectors('.tweet-item[data-tweet-id]')}","${fetchCount == null ? 0 : fetchCount + 1}"],"shape":"variable"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"循环","loopVariableName":"index","loopCount":"${list.length(tweets)}","shape":"loop"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"提取详细信息","loopVariableName":"","variable-name":["content","author","like","reply","publishTime"],"loopCount":"","variable-value":["${tweets[index].selector('.text').text()}","${tweets[index].selector('.user').text()}","${tweets[index].selector('.like span').text()}","${tweets[index].selector('.reply span').text()}","${tweets[index].selector('.date').regx('(.*?) 
')}"],"shape":"variable"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"输出","loopVariableName":"","output-name":["作者","内容","点赞数","评论数","发布时间"],"loopCount":"","output-value":["${author}","${content}","${like}","${reply}","${publishTime}"],"shape":"output"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n \n \n \n \n \n \n {"value":"爬取5页","condition":"${fetchCount < 3}"}\n \n \n \n\n', '', '0', '2019-11-03 17:02:49', '2019-11-04 10:11:31', '2019-11-03 17:30:56', '3');
INSERT INTO `sp_flow` VALUES ('663aaa5e36a84c9594ef3cfd6738e9a7', '百度热点', '\n \n \n \n {"spiderName":"百度热点","threadCount":""}\n \n \n \n \n \n \n {"shape":"start"}\n \n \n \n \n \n {"value":"开始抓取","loopVariableName":"","sleep":"","timeout":"","response-charset":"gbk","method":"GET","body-type":"none","body-content-type":"text/plain","loopCount":"","url":"https://top.baidu.com/buzz?b=1&fr=topindex","proxy":"","request-body":"","follow-redirect":"1","tls-validate":"1","shape":"request"}\n \n \n \n \n \n {"value":"定义变量","loopVariableName":"","variable-name":["elementbd"],"loopCount":"","variable-value":["${resp.xpaths('//*[@id=\\"main\\"]/div[2]/div/table/tbody/tr')}"],"shape":"variable"}\n \n \n \n \n \n {"value":"输出","loopVariableName":"i","output-name":["名称","地址","百度指数","2"],"loopCount":"${elementbd.size()-1}","output-value":["${elementbd[i+1].xpath('//td[2]/a[1]/text()')}","${elementbd[i+1].xpath('//td[2]/a[1]/@href')}","${elementbd[i+1].xpath('//td[4]/span/text()')}","${elementbd[i+1].xpath('//td[3]/a[2]/text()')}"],"shape":"output"}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n \n \n {"value":"","condition":""}\n \n \n \n\n', '0 0/30 * * * ? *', '1', '2019-10-20 17:24:21', '2019-11-04 08:52:05', '2019-10-30 14:52:39', '45');
-- Data source definitions: JDBC driver class, JDBC URL and credentials, one row per source.
-- NOTE(review): `password` is a plain varchar(32) column; confirm how the application
-- protects the stored credential.
DROP TABLE IF EXISTS `sp_datasource`;
CREATE TABLE `sp_datasource` (
`id` varchar(32) NOT NULL,
`name` varchar(255) DEFAULT NULL,
`driver_class_name` varchar(255) DEFAULT NULL,
`jdbc_url` varchar(255) DEFAULT NULL,
`username` varchar(64) DEFAULT NULL,
`password` varchar(32) DEFAULT NULL,
`create_date` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Named variables: name, value and description, keyed by an auto-increment id.
-- The AUTO_INCREMENT counter is seeded at 4.
DROP TABLE IF EXISTS `sp_variable`;
CREATE TABLE `sp_variable` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(32) DEFAULT NULL COMMENT '变量名',
`value` varchar(512) DEFAULT NULL COMMENT '变量值',
`description` varchar(255) DEFAULT NULL COMMENT '变量描述',
`create_date` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4;
/* Added in v0.3.0 */
-- Task records: one row per task with the id of the flow it belongs to (`flow_id`)
-- and its begin/end timestamps. The AUTO_INCREMENT counter is seeded at 7.
DROP TABLE IF EXISTS `sp_task`;
CREATE TABLE `sp_task` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`flow_id` varchar(32) NOT NULL,
`begin_time` datetime DEFAULT NULL,
`end_time` datetime DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8mb4;
/* Added in v0.4.0 */
-- User-defined functions: function name, parameter list and the script body
-- (the column comment describes it as a JS script).
DROP TABLE IF EXISTS `sp_function`;
CREATE TABLE `sp_function` (
`id` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
`name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '函数名',
`parameter` varchar(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '参数',
`script` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT 'js脚本',
`create_date` datetime(0) NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
/* Added in v0.5.0 */
-- Notification settings: recipients, notification channel and three on/off switches
-- ('1' = enabled, '0' = disabled) for the start, exception and end notifications.
-- NOTE(review): `id` presumably equals the id of the flow the settings belong to — confirm.
DROP TABLE IF EXISTS `sp_flow_notice`;
CREATE TABLE `sp_flow_notice` (
`id` varchar(32) NOT NULL,
`recipients` varchar(200) DEFAULT NULL COMMENT '收件人',
`notice_way` char(10) DEFAULT NULL COMMENT '通知方式',
`start_notice` char(1) DEFAULT '0' COMMENT '流程开始通知:1:开启通知,0:关闭通知',
`exception_notice` char(1) DEFAULT '0' COMMENT '流程异常通知:1:开启通知,0:关闭通知',
`end_notice` char(1) DEFAULT '0' COMMENT '流程结束通知:1:开启通知,0:关闭通知',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '爬虫任务通知表';
================================================
FILE: pom.xml
================================================
4.0.0
org.spiderflow
spider-flow
0.5.0
pom
spider-flow
https://gitee.com/jmxd/spider-flow
org.springframework.boot
spring-boot-starter-parent
2.0.7.RELEASE
UTF-8
${project.version}
1.2.83
1.1.16
2.11.5
3.1.0
1.6
1.8
2.7
28.2-jre
1.11.3
0.3.1
org.springframework.boot
spring-boot-starter-web
org.springframework.boot
spring-boot-starter-quartz
org.springframework.boot
spring-boot-starter-mail
org.springframework
spring-jdbc
org.springframework.boot
spring-boot-starter-websocket
com.baomidou
mybatis-plus-boot-starter
${mybatis.plus.version}
mysql
mysql-connector-java
com.alibaba
fastjson
${alibaba.fastjson.version}
com.alibaba
druid-spring-boot-starter
${alibaba.druid.version}
com.alibaba
transmittable-thread-local
${alibaba.transmittable.version}
org.apache.commons
commons-text
${apache.commons.text.verion}
org.apache.commons
commons-csv
${apache.commons.csv.verion}
commons-io
commons-io
${commons.io.version}
commons-codec
commons-codec
com.google.guava
guava
${guava.version}
org.jsoup
jsoup
${jsoup.version}
us.codecraft
xsoup
${xsoup.version}
org.spiderflow
spider-flow-api
${spider-flow.version}
org.spiderflow
spider-flow-core
${spider-flow.version}
org.spiderflow
spider-flow-selenium
${spider-flow.version}
org.spiderflow
spider-flow-proxypool
${spider-flow.version}
org.spiderflow
spider-flow-mongodb
${spider-flow.version}
org.spiderflow
spider-flow-redis
${spider-flow.version}
org.spiderflow
spider-flow-ocr
${spider-flow.version}
org.spiderflow
spider-flow-oss
${spider-flow.version}
org.spiderflow
spider-flow-mailbox
${spider-flow.version}
spider-flow-api
spider-flow-core
spider-flow-web
================================================
FILE: spider-flow-api/pom.xml
================================================
4.0.0
org.spiderflow
spider-flow
0.5.0
spider-flow-api
spider-flow-api
https://gitee.com/jmxd/spider-flow/tree/master/spider-flow-api
UTF-8
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/ExpressionEngine.java
================================================
package org.spiderflow;
import java.util.Map;
/**
 * Expression engine: evaluates an expression against a set of variables.
 */
public interface ExpressionEngine {
/**
 * Executes an expression.
 * @param expression the expression to evaluate
 * @param variables the variables made available to the expression
 * @return the value produced by evaluating the expression
 */
Object execute(String expression, Map variables);
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/Grammerable.java
================================================
package org.spiderflow;
import java.util.List;
import org.spiderflow.model.Grammer;
/**
 * Implemented by components that can describe themselves as a list of grammar entries.
 * NOTE(review): the entries are presumably {@code Grammer} objects consumed by the
 * editor's code hints — confirm against the implementers.
 */
public interface Grammerable {
/**
 * @return the grammar entries provided by this component
 */
List grammers();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/annotation/Comment.java
================================================
package org.spiderflow.annotation;
import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Attaches a descriptive comment to a custom method (or type); the text is used
 * for code hints shown in the page.
 * Retained at runtime and applicable to methods and types.
 */
@Documented
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD,ElementType.TYPE})
public @interface Comment {
/** The comment text. */
String value();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/annotation/Example.java
================================================
package org.spiderflow.annotation;
import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Attaches a usage example to a custom method; the text is used as the code
 * example shown in the page.
 * Retained at runtime and applicable to methods only.
 */
@Documented
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.METHOD)
public @interface Example {
/** The example text. */
String value();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/annotation/Return.java
================================================
package org.spiderflow.annotation;
import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Declares the return type(s) of a custom method; the information is used to
 * hint the return type in the page.
 * Retained at runtime and applicable to methods only.
 */
@Documented
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.METHOD)
public @interface Return {
/** The possible return types of the annotated method. */
Class>[] value();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/common/CURDController.java
================================================
package org.spiderflow.common;
import org.spiderflow.model.JsonBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
/**
 * Base controller exposing list/get/delete/save endpoints on top of an injected
 * MyBatis-Plus service. Subclasses supply the concrete service, mapper and entity types.
 */
public abstract class CURDController,M extends BaseMapper, T> {
// Service that performs all persistence operations for the endpoints below.
@Autowired
private S service;
/**
 * Returns one page of records ordered by {@code create_date} descending.
 * @param page page number, defaults to 1
 * @param size page size (request parameter {@code limit}), defaults to 1
 * @return the requested page
 */
@RequestMapping("/list")
public IPage list(@RequestParam(name = "page",defaultValue = "1")Integer page, @RequestParam(name = "limit",defaultValue = "1")Integer size){
return service.page(new Page(page, size), new QueryWrapper().orderByDesc("create_date"));
}
/**
 * Looks up a single record by id.
 * @param id the record id
 * @return the result of {@code getById} wrapped in a {@code JsonBean}
 */
@RequestMapping("get")
public JsonBean get(String id) {
return new JsonBean(service.getById(id));
}
/**
 * Deletes a record by id.
 * @param id the record id
 * @return the result of {@code removeById} wrapped in a {@code JsonBean}
 */
@RequestMapping("delete")
public JsonBean delete(String id){
return new JsonBean(service.removeById(id));
}
/**
 * Inserts or updates a record.
 * @param t the entity bound from the request
 * @return the result of {@code saveOrUpdate} wrapped in a {@code JsonBean}
 */
@RequestMapping("save")
public JsonBean save(T t){
return new JsonBean(service.saveOrUpdate(t));
}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/ChildPriorThreadSubmitStrategy.java
================================================
package org.spiderflow.concurrent;
import org.spiderflow.model.SpiderNode;
import java.util.Comparator;
import java.util.PriorityQueue;
/**
 * Submit strategy that keeps pending tasks in a priority queue ordered by the
 * node comparator below. All queue access is synchronized on this instance.
 */
public class ChildPriorThreadSubmitStrategy implements ThreadSubmitStrategy{
// Monitor guarding every access to priorityQueue; it is this instance itself.
private Object mutex = this;
// Sorts o1 before o2 when o1.hasLeftNode(o2.getNodeId()) is true, and after it otherwise.
// NOTE(review): presumably hasLeftNode means "o2 is upstream of o1", i.e. children come first — confirm.
// NOTE(review): the comparator never returns 0 and is not antisymmetric for unrelated nodes,
// so it does not satisfy the Comparator contract; the resulting order among such nodes is unspecified.
private Comparator comparator = (o1, o2) -> {
if(o1.hasLeftNode(o2.getNodeId())){
return -1;
}
return 1;
};
// Pending tasks, ordered by comparing the nodes the tasks belong to.
private PriorityQueue> priorityQueue = new PriorityQueue<>((o1, o2) -> comparator.compare(o1.getNode(),o2.getNode()));
/** Returns the node comparator used to order the queue. */
@Override
public Comparator comparator() {
return comparator;
}
/** Adds a task to the queue. */
@Override
public void add(SpiderFutureTask> task) {
synchronized (mutex){
priorityQueue.add(task);
}
}
/** Reports whether no task is pending. */
@Override
public boolean isEmpty() {
synchronized (mutex){
return priorityQueue.isEmpty();
}
}
/** Removes and returns the head of the queue, or null when the queue is empty. */
@Override
public SpiderFutureTask> get() {
synchronized (mutex){
return priorityQueue.poll();
}
}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/LinkedThreadSubmitStrategy.java
================================================
package org.spiderflow.concurrent;
import org.spiderflow.model.SpiderNode;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Submit strategy that hands tasks out in insertion order: tasks are appended to a
 * list and taken from its front.
 */
public class LinkedThreadSubmitStrategy implements ThreadSubmitStrategy{
// Pending tasks in insertion order.
private List> taskList = new CopyOnWriteArrayList<>();
/**
 * Returns a comparator that always answers -1, whatever its arguments.
 * NOTE(review): such a comparator does not satisfy the Comparator contract; confirm
 * that callers only use it where that is harmless.
 */
@Override
public Comparator comparator() {
return (o1, o2) -> -1;
}
/** Appends a task to the end of the list. */
@Override
public void add(SpiderFutureTask> task) {
taskList.add(task);
}
/** Reports whether no task is pending. */
@Override
public boolean isEmpty() {
return taskList.isEmpty();
}
/**
 * Removes and returns the oldest pending task.
 * Throws IndexOutOfBoundsException when the list is empty, so callers must check
 * {@link #isEmpty()} first.
 */
@Override
public SpiderFutureTask> get() {
return taskList.remove(0);
}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/ParentPriorThreadSubmitStrategy.java
================================================
package org.spiderflow.concurrent;
import org.spiderflow.model.SpiderNode;
import java.util.Comparator;
import java.util.PriorityQueue;
/**
 * Submit strategy that keeps pending tasks in a priority queue ordered by the
 * node comparator below. All queue access is synchronized on this instance.
 */
public class ParentPriorThreadSubmitStrategy implements ThreadSubmitStrategy {
// Monitor guarding every access to priorityQueue; it is this instance itself.
private Object mutex = this;
// Sorts o1 after o2 when o1.hasLeftNode(o2.getNodeId()) is true, and before it otherwise.
// NOTE(review): presumably hasLeftNode means "o2 is upstream of o1", i.e. parents come first — confirm.
// NOTE(review): the comparator never returns 0 and is not antisymmetric for unrelated nodes,
// so it does not satisfy the Comparator contract; the resulting order among such nodes is unspecified.
private Comparator comparator = (o1, o2) -> {
if (o1.hasLeftNode(o2.getNodeId())) {
return 1;
}
return -1;
};
// Pending tasks, ordered by comparing the nodes the tasks belong to.
private PriorityQueue> priorityQueue = new PriorityQueue<>((o1, o2) -> comparator.compare(o1.getNode(), o2.getNode()));
/** Returns the node comparator used to order the queue. */
@Override
public Comparator comparator() {
return comparator;
}
/** Adds a task to the queue. */
@Override
public void add(SpiderFutureTask> task) {
synchronized (mutex) {
priorityQueue.add(task);
}
}
/** Reports whether no task is pending. */
@Override
public boolean isEmpty() {
synchronized (mutex) {
return priorityQueue.isEmpty();
}
}
/** Removes and returns the head of the queue, or null when the queue is empty. */
@Override
public SpiderFutureTask> get() {
synchronized (mutex) {
return priorityQueue.poll();
}
}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/RandomThreadSubmitStrategy.java
================================================
package org.spiderflow.concurrent;
import org.apache.commons.lang3.RandomUtils;
import org.spiderflow.model.SpiderNode;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Submit strategy that hands out pending tasks in random order.
 */
public class RandomThreadSubmitStrategy implements ThreadSubmitStrategy{
// Pending tasks; get() removes one at a random position.
private List> taskList = new CopyOnWriteArrayList<>();
/**
 * Returns a comparator that ignores its arguments and answers a random value
 * out of -1, 0 and 1 on every call.
 * NOTE(review): a random comparator does not satisfy the Comparator contract; confirm
 * that callers only use it where that is harmless.
 */
@Override
public Comparator comparator() {
return (o1, o2) -> RandomUtils.nextInt(0,3) - 1;
}
/** Appends a task to the list. */
@Override
public void add(SpiderFutureTask> task) {
taskList.add(task);
}
/** Reports whether no task is pending. */
@Override
public boolean isEmpty() {
return taskList.isEmpty();
}
/**
 * Removes and returns a randomly chosen pending task.
 * NOTE(review): size() and remove() are two separate calls, so the list can change in
 * between, and an empty list makes remove() throw; callers must check
 * {@link #isEmpty()} first and should be the only consumer.
 */
@Override
public SpiderFutureTask> get() {
return taskList.remove(RandomUtils.nextInt(0, taskList.size()));
}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/SpiderFlowThreadPoolExecutor.java
================================================
package org.spiderflow.concurrent;
import org.spiderflow.model.SpiderNode;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
public class SpiderFlowThreadPoolExecutor {
/**
* 最大线程数
*/
private int maxThreads;
/**
* 真正线程池
*/
private ThreadPoolExecutor executor;
/**
* 线程number计数器
*/
private final AtomicInteger poolNumber = new AtomicInteger(1);
/**
* ThreadGroup
*/
private static final ThreadGroup SPIDER_FLOW_THREAD_GROUP = new ThreadGroup("spider-flow-group");
/**
* 线程名称前缀
*/
private static final String THREAD_POOL_NAME_PREFIX = "spider-flow-";
public SpiderFlowThreadPoolExecutor(int maxThreads) {
super();
this.maxThreads = maxThreads;
//创建线程池实例
this.executor = new ThreadPoolExecutor(maxThreads, maxThreads, 10, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>(), runnable -> {
//重写线程名称
return new Thread(SPIDER_FLOW_THREAD_GROUP, runnable, THREAD_POOL_NAME_PREFIX + poolNumber.getAndIncrement());
});
}
public Future> submit(Runnable runnable){
return this.executor.submit(runnable);
}
/**
* 创建子线程池
* @param threads 线程池大小
* @return
*/
public SubThreadPoolExecutor createSubThreadPoolExecutor(int threads,ThreadSubmitStrategy submitStrategy){
return new SubThreadPoolExecutor(Math.min(maxThreads, threads),submitStrategy);
}
/**
* 子线程池
*/
public class SubThreadPoolExecutor{
/**
* 线程池大小
*/
private int threads;
/**
* 正在执行中的任务
*/
private Future>[] futures;
/**
* 执行中的数量
*/
private AtomicInteger executing = new AtomicInteger(0);
/**
* 是否运行中
*/
private volatile boolean running = true;
/**
* 是否提交任务中
*/
private volatile boolean submitting = false;
private ThreadSubmitStrategy submitStrategy;
public SubThreadPoolExecutor(int threads,ThreadSubmitStrategy submitStrategy) {
super();
this.threads = threads;
this.futures = new Future[threads];
this.submitStrategy = submitStrategy;
}
/**
* 等待所有线程执行完毕
*/
public void awaitTermination(){
while(executing.get() > 0){
removeDoneFuture();
}
running = false;
//当停止时,唤醒提交任务线程使其结束
synchronized (submitStrategy){
submitStrategy.notifyAll();
}
}
private int index(){
for (int i = 0; i < threads; i++) {
if(futures[i] == null || futures[i].isDone()){
return i;
}
}
return -1;
}
/**
* 清除已完成的任务
*/
private void removeDoneFuture(){
for (int i = 0; i < threads; i++) {
try {
if(futures[i] != null && futures[i].get(10,TimeUnit.MILLISECONDS) == null){
futures[i] = null;
}
} catch (Throwable t) {
//忽略异常
}
}
}
/**
* 等待有空闲线程
*/
private void await(){
while(index() == -1){
removeDoneFuture();
}
}
/**
* 异步提交任务
*/
public Future submitAsync(Runnable runnable, T value, SpiderNode node){
SpiderFutureTask future = new SpiderFutureTask<>(()-> {
try {
//执行任务
runnable.run();
} finally {
//正在执行的线程数-1
executing.decrementAndGet();
}
}, value,node,this);
submitStrategy.add(future);
//如果是第一次调用submitSync方法,则启动提交任务线程
if(!submitting){
submitting = true;
CompletableFuture.runAsync(this::submit);
}
synchronized (submitStrategy){
//通知继续从集合中取任务提交到线程池中
submitStrategy.notifyAll();
}
return future;
}
private void submit(){
while(running){
try {
synchronized (submitStrategy){
//如果集合是空的,则等待提交
if(submitStrategy.isEmpty()){
submitStrategy.wait(); //等待唤醒
}
}
//当该线程被唤醒时,把集合中所有任务都提交到线程池中
while(!submitStrategy.isEmpty()){
//从提交策略中获取任务提交到线程池中
SpiderFutureTask> futureTask = submitStrategy.get();
//如果没有空闲线程且在线程池中提交,则直接运行
if(index() == -1 && Thread.currentThread().getThreadGroup() == SPIDER_FLOW_THREAD_GROUP){
futureTask.run();
}else{
//等待有空闲线程时在提交
await();
//提交任务至线程池中
futures[index()] = executor.submit(futureTask);
}
}
} catch (InterruptedException ignored) {
}
}
}
}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/SpiderFutureTask.java
================================================
package org.spiderflow.concurrent;
import java.util.concurrent.FutureTask;
import org.spiderflow.concurrent.SpiderFlowThreadPoolExecutor.SubThreadPoolExecutor;
import org.spiderflow.model.SpiderNode;
/**
 * A {@link FutureTask} that additionally carries the sub thread pool and the flow node that
 * were passed to it at construction.
 *
 * @param <V> type of the result returned by {@code get()} once the runnable has completed
 */
public class SpiderFutureTask<V> extends FutureTask<V> {

    private final SubThreadPoolExecutor executor;

    private final SpiderNode node;

    /**
     * @param runnable the work to run
     * @param result   the value {@code get()} returns after the work has completed
     * @param node     the flow node associated with this task
     * @param executor the sub thread pool associated with this task
     */
    public SpiderFutureTask(Runnable runnable, V result, SpiderNode node, SubThreadPoolExecutor executor) {
        super(runnable, result);
        this.executor = executor;
        this.node = node;
    }

    /**
     * @return the sub thread pool passed at construction
     */
    public SubThreadPoolExecutor getExecutor() {
        return executor;
    }

    /**
     * @return the flow node passed at construction
     */
    public SpiderNode getNode() {
        return node;
    }
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/concurrent/ThreadSubmitStrategy.java
================================================
package org.spiderflow.concurrent;
import org.spiderflow.model.SpiderNode;
import java.util.Comparator;
public interface ThreadSubmitStrategy {
Comparator comparator();
void add(SpiderFutureTask> task);
boolean isEmpty();
SpiderFutureTask> get();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/context/CookieContext.java
================================================
package org.spiderflow.context;
import java.util.HashMap;
/**
 * Cookie context: a {@link HashMap} subclass that adds no members of its own.
 *
 * NOTE(review): this extends the raw HashMap type, so key and value types are unchecked
 * here. Presumably it maps cookie names to cookie values; confirm the intended type
 * arguments.
 */
public class CookieContext extends HashMap {
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/context/SpiderContext.java
================================================
package org.spiderflow.context;
import org.spiderflow.concurrent.SpiderFlowThreadPoolExecutor.SubThreadPoolExecutor;
import org.spiderflow.model.SpiderNode;
import org.spiderflow.model.SpiderOutput;
import java.util.*;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.locks.ReentrantLock;
/**
* 爬虫上下文
* @author jmxd
*
*/
public class SpiderContext extends HashMap{
private String id = UUID.randomUUID().toString().replace("-", "");
/**
* 流程ID
*/
private String flowId;
private static final long serialVersionUID = 8379177178417619790L;
/**
* 流程执行线程
*/
private SubThreadPoolExecutor threadPool;
/**
* 根节点
*/
private SpiderNode rootNode;
/**
* 爬虫是否运行中
*/
private volatile boolean running = true;
/**
* Future队列
*/
private LinkedBlockingQueue> futureQueue = new LinkedBlockingQueue<>();
/**
* Cookie上下文
*/
private CookieContext cookieContext = new CookieContext();
public List getOutputs() {
return Collections.emptyList();
}
public T get(String key){
return (T) super.get(key);
}
public T get(String key,T defaultValue){
T value = this.get(key);
return value == null ? defaultValue : value;
}
public String getFlowId() {
return flowId;
}
public void setFlowId(String flowId) {
this.flowId = flowId;
}
public LinkedBlockingQueue> getFutureQueue() {
return futureQueue;
}
public boolean isRunning() {
return running;
}
public void setRunning(boolean running) {
this.running = running;
}
public void addOutput(SpiderOutput output){
}
public SubThreadPoolExecutor getThreadPool() {
return threadPool;
}
public void setThreadPool(SubThreadPoolExecutor threadPool) {
this.threadPool = threadPool;
}
public SpiderNode getRootNode() {
return rootNode;
}
public void setRootNode(SpiderNode rootNode) {
this.rootNode = rootNode;
}
public String getId() {
return id;
}
public CookieContext getCookieContext() {
return cookieContext;
}
public void pause(String nodeId,String event,String key,Object value){}
public void resume(){}
public void stop(){}
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/context/SpiderContextHolder.java
================================================
package org.spiderflow.context;
import com.alibaba.ttl.TransmittableThreadLocal;
/**
 * Static holder for the {@link SpiderContext} bound to the current thread, backed by a
 * {@code TransmittableThreadLocal}.
 */
public class SpiderContextHolder {

    private static final ThreadLocal<SpiderContext> THREAD_LOCAL = new TransmittableThreadLocal<>();

    /**
     * @return the context bound to the current thread, or {@code null} when none is set
     */
    public static SpiderContext get() {
        return THREAD_LOCAL.get();
    }

    /**
     * Binds a context to the current thread.
     *
     * @param context the context to bind
     */
    public static void set(SpiderContext context) {
        THREAD_LOCAL.set(context);
    }

    /**
     * Removes the context bound to the current thread.
     */
    public static void remove() {
        THREAD_LOCAL.remove();
    }
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/enums/FlowNoticeType.java
================================================
package org.spiderflow.enums;
/**
 * Flow notification type.
 *
 * @author BillDowney
 * @date 2020-04-04 01:32:53
 */
public enum FlowNoticeType {
/**
 * Notification sent when a flow starts.
 */
startNotice,
/**
 * Notification sent when a flow raises an exception.
 */
exceptionNotice,
/**
 * Notification sent when a flow ends.
 */
endNotice
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/enums/FlowNoticeWay.java
================================================
package org.spiderflow.enums;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Flow notification channel.
 *
 * @author BillDowney
 * @date 2020-04-03 15:26:18
 */
public enum FlowNoticeWay {

    /**
     * Notification by e-mail.
     */
    email("邮件通知");

    /**
     * Display title of the channel.
     */
    private final String title;

    FlowNoticeWay(String title) {
        this.title = title;
    }

    /**
     * @return the constant name and the title joined by a colon
     */
    @Override
    public String toString() {
        return this.name() + ":" + this.title;
    }

    /**
     * Builds a map of all channels in declaration order.
     *
     * @return a new map from each constant's name to its {@link #toString()} value
     */
    public static Map<String, String> getMap() {
        Map<String, String> map = new LinkedHashMap<>();
        for (FlowNoticeWay type : FlowNoticeWay.values()) {
            map.put(type.name(), type.toString());
        }
        return map;
    }
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/executor/FunctionExecutor.java
================================================
package org.spiderflow.executor;
/**
 * Contract for a function executor.
 */
public interface FunctionExecutor {
/**
 * @return the function prefix of this executor; presumably the name under which its
 *         functions are referenced in expressions (TODO confirm against implementations)
 */
String getFunctionPrefix();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/executor/FunctionExtension.java
================================================
package org.spiderflow.executor;
/**
 * Contract for a function extension.
 */
public interface FunctionExtension {

    /**
     * @return the class this extension supports; presumably the type whose values the
     *         extension's functions can be applied to (TODO confirm against implementations)
     */
    Class<?> support();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/executor/PluginConfig.java
================================================
package org.spiderflow.executor;
import org.spiderflow.model.Plugin;
/**
 * Contract for a plugin configuration.
 */
public interface PluginConfig {
/**
 * @return the {@link Plugin} described by this configuration
 */
Plugin plugin();
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/executor/ShapeExecutor.java
================================================
package org.spiderflow.executor;
import java.util.Map;
import org.spiderflow.context.SpiderContext;
import org.spiderflow.model.Shape;
import org.spiderflow.model.SpiderNode;
/**
 * Executor interface.
 *
 * @author jmxd
 */
public interface ShapeExecutor {

    // NOTE(review): presumably the property names under which a node stores its loop variable
    // name, loop count and thread count; confirm against the callers.
    String LOOP_VARIABLE_NAME = "loopVariableName";

    String LOOP_COUNT = "loopCount";

    String THREAD_COUNT = "threadCount";

    /**
     * @return the shape definition of this executor; {@code null} by default
     */
    default Shape shape() {
        return null;
    }

    /**
     * Node shape.
     *
     * @return the name of the node shape this executor supports
     */
    String supportShape();

    /**
     * The concrete work performed by this executor.
     *
     * @param node      the spider node currently being executed
     * @param context   the spider context
     * @param variables all variables of the node's flow
     */
    void execute(SpiderNode node, SpiderContext context, Map<String, Object> variables);

    /**
     * Decides whether the following nodes may be executed.
     *
     * @param node      the spider node that was executed
     * @param context   the spider context
     * @param variables all variables of the node's flow
     * @return {@code true} by default
     */
    default boolean allowExecuteNext(SpiderNode node, SpiderContext context, Map<String, Object> variables) {
        return true;
    }

    /**
     * @return {@code true} by default; presumably whether the node is executed through the
     *         thread pool (TODO confirm against the caller)
     */
    default boolean isThread() {
        return true;
    }
}
================================================
FILE: spider-flow-api/src/main/java/org/spiderflow/expression/DynamicMethod.java
================================================
package org.spiderflow.expression;
import java.util.List;
public interface DynamicMethod {
Object execute(String methodName, List