Repository: 2young2simple/yispider Branch: master Commit: 57d48bdb0f8e Files: 97 Total size: 17.8 MB Directory structure: gitextract_m02sfw7x/ ├── .gitignore ├── README.md ├── example-spider/ │ ├── bilibili/ │ │ ├── conf.json │ │ └── main.go │ ├── dilidili/ │ │ ├── conf.json │ │ └── main.go │ ├── douban-movie/ │ │ ├── conf.json │ │ └── main.go │ ├── douban-movie-code/ │ │ ├── conf.json │ │ └── main.go │ ├── empty/ │ │ ├── conf.json │ │ └── main.go │ ├── haoqi/ │ │ ├── conf.json │ │ └── main.go │ ├── jingdong/ │ │ ├── conf.json │ │ └── main.go │ ├── qiongyou/ │ │ ├── conf.json │ │ └── main.go │ ├── qiubai/ │ │ ├── conf.json │ │ ├── main.go │ │ └── sem_test.go │ ├── ttkb/ │ │ ├── conf.json │ │ └── main.go │ ├── ttkb-author/ │ │ ├── conf.json │ │ └── main.go │ ├── tuiku/ │ │ ├── conf.json │ │ └── main.go │ ├── wangyi-music/ │ │ ├── conf.json │ │ ├── main.go │ │ └── music/ │ │ ├── conf.json │ │ └── wangyi-music │ └── woshipm/ │ ├── conf.json │ └── main.go ├── manage/ │ ├── conf.json │ ├── config/ │ │ └── config.go │ ├── discover/ │ │ ├── discover.go │ │ ├── etcd/ │ │ │ └── etcd.go │ │ ├── file/ │ │ │ └── file.go │ │ └── zookeeper/ │ │ └── zookeeper.go │ ├── http/ │ │ ├── controller.go │ │ ├── request.go │ │ ├── server.go │ │ └── service.go │ ├── logger/ │ │ └── logger.go │ ├── main.go │ ├── model/ │ │ ├── node_info.go │ │ └── task.go │ ├── schedule/ │ │ ├── request.go │ │ └── schedule.go │ ├── strategy/ │ │ └── rand_strategy.go │ └── task/ │ └── task.go ├── spider/ │ ├── boot.go │ ├── common/ │ │ ├── encode.go │ │ ├── prase_req.go │ │ └── prase_req_test.go │ ├── conf.json │ ├── config/ │ │ └── config.go │ ├── core/ │ │ ├── engine.go │ │ └── runtime.go │ ├── downloader/ │ │ ├── request.go │ │ └── request_test.go │ ├── http/ │ │ └── server.go │ ├── logger/ │ │ └── logger.go │ ├── model/ │ │ ├── context.go │ │ ├── page.go │ │ └── task.go │ ├── pipline/ │ │ ├── console/ │ │ │ └── console.go │ │ ├── file/ │ │ │ └── file.go │ │ ├── mysql/ │ │ │ ├── dbModel.go │ │ │ ├── 
mysql.go │ │ │ └── mysqlPipline.go │ │ ├── nsq/ │ │ │ └── nsq.go │ │ └── pipline.go │ ├── process/ │ │ ├── filter/ │ │ │ ├── repoat_filter.go │ │ │ ├── repoat_filter_test.go │ │ │ ├── url_filter.go │ │ │ └── url_filter_test.go │ │ ├── json-process/ │ │ │ ├── json_process.go │ │ │ └── json_rule.go │ │ ├── process.go │ │ └── template-process/ │ │ ├── template_process.go │ │ ├── template_rule.go │ │ └── template_rule_test.go │ ├── register/ │ │ └── etcd/ │ │ └── etcd.go │ ├── schedule/ │ │ ├── schedule.go │ │ ├── schedule_chan.go │ │ ├── schedule_chan_test.go │ │ ├── schedule_redis.go │ │ └── schedule_redis_test.go │ └── spider/ │ └── spider.go └── storage/ ├── conf.json ├── config/ │ └── config.go ├── db/ │ ├── elasticsearch/ │ │ └── elasticsearch.go │ ├── hbase/ │ │ └── hbase.go │ └── mysql/ │ └── mysql.go ├── logger/ │ └── logger.go └── main.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea/ .idea *_result.txt .DS_Store *.txt ================================================ FILE: README.md ================================================ # YiSpider A distributed spider platform ## 介绍 一款分布式爬虫平台,帮助你更好地管理和开发爬虫。 内置一套爬虫定义规则(模版),可使用模版快速定义爬虫,也可当作框架手动开发爬虫 ## 计划 - [x] 增加了更多例子。 - [x] 内置实现了基于redis的调度器。 - [ ] 正在准备管理网页端部分的制作,敬请期待。 ## 架构 目前框架分为2个部分: #### 1.爬虫部分(spider节点): 内部结构参考python scrapy框架,主要由 core,schedule,page process,pipline 4个部分组成,单个爬虫单独调度器,单独上下文管理,目前内置2种pipline的方式,控制台和文件,节点信息注册在etcd上用于manage节点发现。 - `core`:负责爬虫生命周期、上下文的管理,负责爬虫的运行。 - `schedule`:负责爬虫请求的调度。(基于 channel 或 redis 的调度器) - `process`:负责请求结果的处理。 - `pipline`: 负责将结果输出到不同渠道,如控制台,文件,消息队列,数据库等等 - `register`:负责服务的注册(目前只支持etcd) - `http`: 提供一些http接口 #### 2.管理部分(manage节点): 负责spider节点的管理,用etcd进行spider节点的发现。通过http与spider节点通讯。 ## 开始使用 ### 例子 example-spider包内有大量实例 - 哔哩哔哩 - 嘀哩嘀哩 - 豆瓣电影 - 好奇心日报 - 京东 - 穷游 - 糗百 - 推库 - 网易云音乐 ### 请求介绍 
初始请求(Request)Url有2种语法糖方式,用于简便易用: #### 1. http://xxx/xxx/{begin-end,offset} ``` start = 0 20 40 ... 10000 url = https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10000,20} ``` #### 2. http://xxx/xxx/{aa|bb|cc} ``` start = 0 20 40 60 url = https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0|20|40|60} ``` #### 3.http://www.dilidili.wang{$href} (AddQueue特有) ``` 如果 href = "/abc" (href是process解析出的参数) url = http://www.dilidili.wang{$href} url = http://www.dilidili.wang/abc url = https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-$count,20} 等等 ``` ### 实例 #### 1. Json模版 ``` http接口调用 curl -d '{"id":"douban-movie","Name":"douban-movie","request":[{"url":"https://movie.douban.com/j/new_search_subjects?sort=T\u0026range=0,10\u0026tags=\u0026start={0-100,20}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"movie"}],"process":[{"name":"movie","reg_url":null,"type":"json","template_rule":{"Rule":null},"json_rule":{"Rule":{"casts":"casts","cover":"cover","id":"id","node":"array|data","rate":"rate","star":"star","title":"title","url":"url"}},"add_queue":null}],"pipline":"file","depth":0,"end_count":0}' "http://127.0.0.1:7774/task/addAndRun" ``` 豆瓣电影模版 ``` { "id": "douban-movie", "Name": "douban-movie", "request": [ { "url": "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10,20}", "method": "get", "process_name": "movie" } ], "process": [ { "name": "movie", "type": "json", "json_rule": { "Rule": { "casts": "casts", "cover": "cover", "id": "id", "node": "array|data", "rate": "rate", "star": "star", "title": "title", "url": "url" } }, "add_queue": null } ], "pipline": "file", "depth": 0, "end_count": 0 } ``` dilidili模版 ``` { "id": "dilidili", "Name": "dilidili", "request": [ { "url": 
"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/", "method": "get", "process_name": "animelist" } ], "process": [ { "name": "animelist", "type": "template", "template_rule": { "Rule": { "content": "text|dd div", "desc": "text|dd p", "href": "attr.href|dt a", "img": "attr.src|dt a img", "node": "array|.anime_list dl", "title": "text|dd h3 a" } }, "add_queue": [ { "url": "http://www.dilidili.wang{href}", "method": "get", "process_name": "animeinfo" } ] }, { "name": "animeinfo", "type": "template", "template_rule": { "Rule": { "episode": "texts|.time_con .swiper-slide .clear li a em", "episode-link": "attrs.href|.time_con .swiper-slide .clear li a", "title": "text|.detail dl dd h1" } }, "add_queue": [ { "url": "{episode-link}", "method": "get", "process_name": "episodeinfo" } ] }, { "name": "episodeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "player": "attr.src|.player_main iframe", "title": "text|#intro2 h1", "url": "attr.href|link[rel=\"canonical\"]" } }, "add_queue": null } ], "pipline": "file", "depth": 0, "end_count": 0 } ``` #### 2. 
代码模版 编写 豆瓣电影 ``` package main import ( "YiSpider/spider/model" "YiSpider/spider" spider2 "YiSpider/spider/spider" ) func main(){ task := &model.Task{ Id:"douban-movie", Name:"douban-movie", Request:[]*model.Request{ { Method:"get", Url:"https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10000,20}", ProcessName:"movie", }, }, Process: []model.Process{ { Name:"movie", Type:"json", JsonRule:model.JsonRule{ Rule:map[string]string{ "node":"array|data", "rate":"rate", "star":"star", "id":"id", "url":"url", "title":"title", "cover":"cover", "casts":"casts", }, }, }, }, Pipline:"file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ``` dilidili番剧 ``` package main import ( "YiSpider/spider/model" "YiSpider/spider" spider2 "YiSpider/spider/spider" ) func main(){ task := &model.Task{ Id:"dilidili", Name:"dilidili", Request:[]*model.Request{ { Method:"get", Url:"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/", ProcessName:"animelist", }, }, Process: []model.Process{ { Name:"animelist", Type:"template", TemplateRule:model.TemplateRule{ Rule:map[string]string{ "node":"array|.anime_list dl", "img":"attr.src|dt a img", "title":"text|dd h3 a", "href":"attr.href|dt a", "content":"text|dd div", "desc":"text|dd p", }, }, AddQueue:[]*model.Request{ { Method: "get", Url: "http://www.dilidili.wang{$href}", ProcessName: "animeinfo", }, }, }, { Name:"animeinfo", Type:"template", TemplateRule:model.TemplateRule{ Rule:map[string]string{ "episode":"texts|.time_con .swiper-slide .clear li a em", "title":"text|.detail dl dd h1", "episode-link":"attrs.href|.time_con .swiper-slide .clear li a", }, }, AddQueue:[]*model.Request{ { Method: "get", Url: "{$episode-link}", ProcessName: "episodeinfo", }, }, }, { Name:"episodeinfo", Type:"template", 
TemplateRule:model.TemplateRule{ Rule:map[string]string{ "url":"attr.href|link[rel=\"canonical\"]", "title":"text|#intro2 h1", "player":"attr.src|.player_main iframe", }, }, }, }, Pipline:"file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ``` #### 3. 纯代码编写 ``` type Movies struct { Datas []Movie `json:"data"` } type Movie struct { Rate string `json:"rate"` Start string `json:"start"` Id string `json:"id"` Url string `json:"url"` Title string `json:"title"` Cover string `json:"cover"` Casts []string `json:"casts"` } type PageProcess struct{} func (p *PageProcess) Process(context model.Context) (*model.Page, error) { movies := Movies{} if err := json.Unmarshal(context.Body, &movies); err != nil { return nil, err } page := &model.Page{} for _, movie := range movies.Datas { page.AddResult(movie) } return page, nil } func main() { sp := &spider2.Spider{} sp.Name = "douban-movie-code" sp.Id = "douban-movie-code" sp.Requests = []*model.Request{ { Method: "get", Url: "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10000,20}", ProcessName: "movie", }, } sp.AddProcess("movie", &PageProcess{}) sp.Pipline = file.NewFilePipline("./") app := spider.New() app.AddSpider(sp) app.Run() } ``` ================================================ FILE: example-spider/bilibili/conf.json ================================================ { "name":"bilibili_spider", "version":"0.01", "work_num": 10, "max_wait_num":12000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"], "schedule":"redis", "redis_addr":"127.0.0.1:6379", "mysql":"root:123456@tcp(127.0.0.1:3306)/auto_db?charset=utf8" } ================================================ FILE: example-spider/bilibili/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "bilibili", Name: "bilibili", Request: []*model.Request{ { Method: 
"get", Url: "http://bangumi.bilibili.com/web_api/season/index_global?page={1-147,1}&page_size=20&version=0&is_finish=0&start_year=0&tag_id=&index_type=1&index_sort=0&quarter=0", ProcessName: "animelist", }, }, Process: []model.Process{ { Name: "animelist", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "node": "array|result.list", "img": "cover", "favorites": "favorites", "title": "title", "total_count": "total_count", "update_time": "update_time", "url": "url", }, }, AddQueue: nil, }, }, Pipline: "mysql", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } /* dilidili json { "id": "dilidili", "Name": "dilidili", "request": [ { "url": "http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "animelist" } ], "process": [ { "name": "animelist", "reg_url": null, "type": "template", "template_rule": { "Rule": { "content": "text|dd div", "desc": "text|dd p", "href": "attr.href|dt a", "img": "attr.src|dt a img", "node": "array|.anime_list dl", "title": "text|dd h3 a" } }, "json_rule": { "Rule": null }, "add_queue": [ { "url": "http://www.dilidili.wang{href}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "animeinfo" } ] }, { "name": "animeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "episode": "texts|.time_con .swiper-slide .clear li a em", "episode-link": "attrs.href|.time_con .swiper-slide .clear li a", "title": "text|.detail dl dd h1" } }, "json_rule": { "Rule": null }, "add_queue": [ { "url": "{episode-link}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "episodeinfo" } ] }, { "name": 
"episodeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "player": "attr.src|.player_main iframe", "title": "text|#intro2 h1", "url": "attr.href|link[rel=\"canonical\"]" } }, "json_rule": { "Rule": null }, "add_queue": null } ], "pipline": "file", "depth": 0, "end_count": 0 } {"id":"dilidili","Name":"dilidili","request":[{"url":"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"animelist"}],"process":[{"name":"animelist","reg_url":null,"type":"template","template_rule":{"Rule":{"content":"text|dd div","desc":"text|dd p","href":"attr.href|dt a","img":"attr.src|dt a img","node":"array|.anime_list dl","title":"text|dd h3 a"}},"json_rule":{"Rule":null},"add_queue":[{"url":"http://www.dilidili.wang{href}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"animeinfo"}]},{"name":"animeinfo","reg_url":null,"type":"template","template_rule":{"Rule":{"episode":"texts|.time_con .swiper-slide .clear li a em","episode-link":"attrs.href|.time_con .swiper-slide .clear li a","title":"text|.detail dl dd h1"}},"json_rule":{"Rule":null},"add_queue":[{"url":"{episode-link}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"episodeinfo"}]},{"name":"episodeinfo","reg_url":null,"type":"template","template_rule":{"Rule":{"player":"attr.src|.player_main iframe","title":"text|#intro2 h1","url":"attr.href|link[rel=\"canonical\"]"}},"json_rule":{"Rule":null},"add_queue":null}],"pipline":"file","depth":0,"end_count":0} */ ================================================ FILE: example-spider/dilidili/conf.json ================================================ { "name":"dilidili_spider", "version":"0.01", 
"work_num": 100, "max_wait_num":1000000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"], "schedule":"redis", "redis_addr":"127.0.0.1:6379" } ================================================ FILE: example-spider/dilidili/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "dilidili", Name: "dilidili", Request: []*model.Request{ { Method: "get", Url: "http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/", ProcessName: "animelist", }, }, Process: []model.Process{ { Name: "animelist", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.anime_list dl", "img": "attr.src|dt a img", "title": "text|dd h3 a", "href": "attr.href|dt a", "content": "text|dd div", "desc": "text|dd p", }, }, AddQueue: []*model.Request{ { Method: "get", Url: "http://www.dilidili.wang{$href}", ProcessName: "animeinfo", }, }, }, { Name: "animeinfo", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "episode": "texts|.time_con .swiper-slide .clear li a em", "title": "text|.detail dl dd h1", "episode-link": "attrs.href|.time_con .swiper-slide .clear li a", }, }, AddQueue: []*model.Request{ { Method: "get", Url: "{$episode-link}", ProcessName: "episodeinfo", }, }, }, { Name: "episodeinfo", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "url": "attr.href|link[rel=\"canonical\"]", "title": "text|#intro2 h1", "player": "attr.src|.player_main iframe", }, }, }, }, Pipline: "file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } /* dilidili json { "id": "dilidili", "Name": "dilidili", "request": [ { "url": 
"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "animelist" } ], "process": [ { "name": "animelist", "reg_url": null, "type": "template", "template_rule": { "Rule": { "content": "text|dd div", "desc": "text|dd p", "href": "attr.href|dt a", "img": "attr.src|dt a img", "node": "array|.anime_list dl", "title": "text|dd h3 a" } }, "json_rule": { "Rule": null }, "add_queue": [ { "url": "http://www.dilidili.wang{href}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "animeinfo" } ] }, { "name": "animeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "episode": "texts|.time_con .swiper-slide .clear li a em", "episode-link": "attrs.href|.time_con .swiper-slide .clear li a", "title": "text|.detail dl dd h1" } }, "json_rule": { "Rule": null }, "add_queue": [ { "url": "{episode-link}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "episodeinfo" } ] }, { "name": "episodeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "player": "attr.src|.player_main iframe", "title": "text|#intro2 h1", "url": "attr.href|link[rel=\"canonical\"]" } }, "json_rule": { "Rule": null }, "add_queue": null } ], "pipline": "file", "depth": 0, "end_count": 0 } 
{"id":"dilidili","Name":"dilidili","request":[{"url":"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"animelist"}],"process":[{"name":"animelist","reg_url":null,"type":"template","template_rule":{"Rule":{"content":"text|dd div","desc":"text|dd p","href":"attr.href|dt a","img":"attr.src|dt a img","node":"array|.anime_list dl","title":"text|dd h3 a"}},"json_rule":{"Rule":null},"add_queue":[{"url":"http://www.dilidili.wang{href}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"animeinfo"}]},{"name":"animeinfo","reg_url":null,"type":"template","template_rule":{"Rule":{"episode":"texts|.time_con .swiper-slide .clear li a em","episode-link":"attrs.href|.time_con .swiper-slide .clear li a","title":"text|.detail dl dd h1"}},"json_rule":{"Rule":null},"add_queue":[{"url":"{episode-link}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"episodeinfo"}]},{"name":"episodeinfo","reg_url":null,"type":"template","template_rule":{"Rule":{"player":"attr.src|.player_main iframe","title":"text|#intro2 h1","url":"attr.href|link[rel=\"canonical\"]"}},"json_rule":{"Rule":null},"add_queue":null}],"pipline":"file","depth":0,"end_count":0} */ ================================================ FILE: example-spider/douban-movie/conf.json ================================================ { "name":"sohu_spider", "version":"0.01", "work_num": 50, "max_wait_num":4096, "http_addr":"127.0.0.1:7774", "mysql":"root:123456@tcp(127.0.0.1:3306)/auto_db?charset=utf8" } ================================================ FILE: example-spider/douban-movie/main.go ================================================ package main import ( "YiSpider/spider" 
"YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "douban-movie", Name: "douban-movie", Request: []*model.Request{ { Method: "get", Url: "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10000,20}", ProcessName: "movie", }, }, Process: []model.Process{ { Name: "movie", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "node": "array|data", "rate": "rate", "star": "star", "id": "id", "url": "url", "title": "title", "cover": "cover", "casts": "casts", }, }, }, }, Pipline:"mysql", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } /* douban-movie json { "id": "douban-movie", "Name": "douban-movie", "request": [ { "url": "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10,20}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "movie" } ], "process": [ { "name": "movie", "reg_url": null, "type": "json", "template_rule": { "Rule": null }, "json_rule": { "Rule": { "casts": "casts", "cover": "cover", "id": "id", "node": "array|data", "rate": "rate", "star": "star", "title": "title", "url": "url" } }, "add_queue": null } ], "pipline": "file", "depth": 0, "end_count": 0 } curl -d '{"id":"douban-movie","Name":"douban-movie","request":[{"url":"https://movie.douban.com/j/new_search_subjects?sort=T\u0026range=0,10\u0026tags=\u0026start={0-100,20}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"movie"}],"process":[{"name":"movie","reg_url":null,"type":"json","template_rule":{"Rule":null},"json_rule":{"Rule":{"casts":"casts","cover":"cover","id":"id","node":"array|data","rate":"rate","star":"star","title":"title","url":"url"}},"add_queue":null}],"pipline":"file","depth":0,"end_count":0}' "http://127.0.0.1:7774/task/addAndRun" */ ================================================ FILE: 
example-spider/douban-movie-code/conf.json ================================================ { "name":"douban_spider", "version":"0.01", "work_num": 10, "max_wait_num":4096, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"], "schedule":"redis", "redis_addr":"127.0.0.1:6379" } ================================================ FILE: example-spider/douban-movie-code/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" "YiSpider/spider/pipline/file" spider2 "YiSpider/spider/spider" "encoding/json" ) /* Movies is the decoded response envelope; the movie entries are read from its "data" array. */ type Movies struct { Datas []Movie `json:"data"` } /* Movie is one entry of the "data" array. Start is decoded from the "star" key, which is the key the rule-based douban-movie examples read from this same endpoint; the tag previously said "start", which none of those examples use. The Go field name is left unchanged so existing references keep compiling. */ type Movie struct { Rate string `json:"rate"` Start string `json:"star"` Id string `json:"id"` Url string `json:"url"` Title string `json:"title"` Cover string `json:"cover"` Casts []string `json:"casts"` } /* PageProcess is the hand-written page processor that main registers under the name "movie". */ type PageProcess struct{} /* Process decodes context.Body as Movies and adds every decoded Movie to a new model.Page as a result. It returns a nil page together with the decode error when json.Unmarshal fails. */ func (p *PageProcess) Process(context model.Context) (*model.Page, error) { movies := Movies{} if err := json.Unmarshal(context.Body, &movies); err != nil { return nil, err } page := &model.Page{} for _, movie := range movies.Datas { page.AddResult(movie) } return page, nil } /* main builds the "douban-movie-code" spider by hand instead of from a task template: it sets one GET seed request that names the "movie" process, registers PageProcess under that name, sets the pipline returned by file.NewFilePipline("./"), adds the spider to a new app, and runs the app. */ func main() { sp := &spider2.Spider{} sp.Name = "douban-movie-code" sp.Id = "douban-movie-code" sp.Requests = []*model.Request{ { Method: "get", Url: "https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={0-10000,20}", ProcessName: "movie", }, } sp.AddProcess("movie", &PageProcess{}) sp.Pipline = file.NewFilePipline("./") app := spider.New() app.AddSpider(sp) app.Run() } ================================================ FILE: example-spider/empty/conf.json ================================================ { "name":"sohu_spider", "version":"0.01", "work_num": 200, "max_wait_num":12000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: example-spider/empty/main.go ================================================ package main import ( "YiSpider/spider" ) 
/* main creates a spider app with no spiders added and runs it. */ func main() { app := spider.New() app.Run() } ================================================ FILE: example-spider/haoqi/conf.json ================================================ { "name":"haoqi_spider", "version":"0.01", "work_num": 100, "max_wait_num":12000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: example-spider/haoqi/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) /* main declares the "haoqi" task and runs it. The GET seed request names the "articles" process. Two Process entries are declared, both of Type "json": the first reads datatype, image, post and type with node "array|data.feeds" and queues nothing; the second reads last_key with node "nil|data" and queues a follow-up request whose URL embeds {$last_key} and which again names "articles". The task's Pipline is set to "file". NOTE(review): both Process entries are named "articles"; how duplicate names are resolved is not visible here, so confirm that both entries actually run. */ func main() { task := &model.Task{ Id: "haoqi", Name: "haoqi", Request: []*model.Request{ { Method: "get", Url: "http://www.qdaily.com/categories/categorymore/{1-54,1}/1509942163.json", ProcessName: "articles", }, }, Process: []model.Process{ { Name: "articles", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "node": "array|data.feeds", "datatype": "datatype", "image": "image", "post": "post", "type": "type", }, }, AddQueue: nil, }, { Name: "articles", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "node": "nil|data", "last_key": "last_key", }, }, AddQueue: []*model.Request{ { Method: "get", Url: "http://www.qdaily.com/categories/categorymore/18/{$last_key}.json", ProcessName: "articles", }, }, }, }, Pipline: "file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } /* dilidili json { "id": "dilidili", "Name": "dilidili", "request": [ { "url": "http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "animelist" } ], "process": [ { "name": "animelist", "reg_url": null, "type": "template", "template_rule": { "Rule": { "content": "text|dd div", "desc": "text|dd p", "href": 
"attr.href|dt a", "img": "attr.src|dt a img", "node": "array|.anime_list dl", "title": "text|dd h3 a" } }, "json_rule": { "Rule": null }, "add_queue": [ { "url": "http://www.dilidili.wang{href}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "animeinfo" } ] }, { "name": "animeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "episode": "texts|.time_con .swiper-slide .clear li a em", "episode-link": "attrs.href|.time_con .swiper-slide .clear li a", "title": "text|.detail dl dd h1" } }, "json_rule": { "Rule": null }, "add_queue": [ { "url": "{episode-link}", "method": "get", "type": "", "data": null, "header": null, "cookies": { "url": "", "data": "" }, "process_name": "episodeinfo" } ] }, { "name": "episodeinfo", "reg_url": null, "type": "template", "template_rule": { "Rule": { "player": "attr.src|.player_main iframe", "title": "text|#intro2 h1", "url": "attr.href|link[rel=\"canonical\"]" } }, "json_rule": { "Rule": null }, "add_queue": null } ], "pipline": "file", "depth": 0, "end_count": 0 } {"id":"dilidili","Name":"dilidili","request":[{"url":"http://www.dilidili.wang/{gaoxiao|kehuan|yundong|danmei|zhiyuxi|luoli|zhenren|zhuangbi|youxi|tuili|qingchun|kongbu|jizhan|rexue|qingxiaoshuo|maoxian|hougong|qihuan|tongnian|lianai|meishaonv|lizhi|baihe|paomianfan|yinv}/","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"animelist"}],"process":[{"name":"animelist","reg_url":null,"type":"template","template_rule":{"Rule":{"content":"text|dd div","desc":"text|dd p","href":"attr.href|dt a","img":"attr.src|dt a img","node":"array|.anime_list dl","title":"text|dd h3 
a"}},"json_rule":{"Rule":null},"add_queue":[{"url":"http://www.dilidili.wang{href}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"animeinfo"}]},{"name":"animeinfo","reg_url":null,"type":"template","template_rule":{"Rule":{"episode":"texts|.time_con .swiper-slide .clear li a em","episode-link":"attrs.href|.time_con .swiper-slide .clear li a","title":"text|.detail dl dd h1"}},"json_rule":{"Rule":null},"add_queue":[{"url":"{episode-link}","method":"get","type":"","data":null,"header":null,"cookies":{"url":"","data":""},"process_name":"episodeinfo"}]},{"name":"episodeinfo","reg_url":null,"type":"template","template_rule":{"Rule":{"player":"attr.src|.player_main iframe","title":"text|#intro2 h1","url":"attr.href|link[rel=\"canonical\"]"}},"json_rule":{"Rule":null},"add_queue":null}],"pipline":"file","depth":0,"end_count":0} */ ================================================ FILE: example-spider/jingdong/conf.json ================================================ { "name":"qiongyou_spider", "version":"0.01", "work_num": 128, "max_wait_num":100000000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: example-spider/jingdong/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" "fmt" ) func main() { goodsType := `电子书刊|电子书|网络原创|数字杂志|多媒体图书|音像|音乐|影视|教育音像|英文原版|少儿|商务投资|英语学习与考试|文学|传记|励志|文艺|小说|文学|青春文学|传记|艺术|少儿|少儿|0-2岁|3-6岁|7-10岁|11-14岁|人文社科|历史|哲学|国学|政治/军事|法律|人文社科|心理学|文化|社会科学|经管励志|经济|金融与投资|管理|励志与成功|生活|生活|健身与保健|家庭与育儿|旅游|烹饪美食|科技|工业技术|科普读物|建筑|医学|科学与自然|计算机与互联网|电子通信|教育|中小学教辅|教育与考试|外语学习|大中专教材|字典词典|港台图书|艺术/设计/收藏|经济管理|文化/学术|少儿|其他|工具书|杂志/期刊|套装书|手机通讯|手机|对讲机|运营商|合约机|选号中心|装宽带|办套餐|手机配件|移动电源|电池/移动电源|蓝牙耳机|充电器/数据线|苹果周边|手机耳机|手机贴膜|手机存储卡|充电器|数据线|手机保护套|车载配件|iPhone 配件|手机电池|创意配件|便携/无线音响|手机饰品|拍照配件|手机支架|大 家 
电|平板电视|空调|冰箱|洗衣机|家庭影院|DVD/电视盒子|迷你音响|冷柜/冰吧|家电配件|功放|回音壁/Soundbar|Hi-Fi专区|电视盒子|酒柜|厨卫大电|燃气灶|油烟机|热水器|消毒柜|洗碗机|厨房小电|料理机|榨汁机|电饭煲|电压力锅|豆浆机|咖啡机|微波炉|电烤箱|电磁炉|面包机|煮蛋器|酸奶机|电炖锅|电水壶/热水瓶|电饼铛|多用途锅|电烧烤炉|果蔬解毒机|其它厨房电器|养生壶/煎药壶|电热饭盒|生活电器|取暖电器|净化器|加湿器|扫地机器人|吸尘器|挂烫机/熨斗|插座|电话机|清洁机|除湿机|干衣机|收录/音机|电风扇|冷风扇|其它生活电器|生活电器配件|净水器|饮水机|个护健康|剃须刀|剃/脱毛器|口腔护理|电吹风|美容器|理发器|卷/直发器|按摩椅|按摩器|足浴盆|血压计|电子秤/厨房秤|血糖仪|体温计|其它健康电器|计步器/脂肪检测仪|五金家装|电动工具|手动工具|仪器仪表|浴霸/排气扇|灯具|LED灯|洁身器|水槽|龙头|淋浴花洒|厨卫五金|家具五金|门铃|电气开关|插座|电工电料|监控安防|电线/线缆|摄影摄像|数码相机|单电/微单相机|单反相机|摄像机|拍立得|运动相机|镜头|户外器材|影棚器材|冲印服务|数码相框|数码配件|存储卡|读卡器|滤镜|闪光灯/手柄|相机包|三脚架/云台|相机清洁/贴膜|机身附件|镜头附件|电池/充电器|移动电源|数码支架|智能设备|智能手环|智能手表|智能眼镜|运动跟踪器|健康监测|智能配饰|智能家居|体感车|其他配件|智能机器人|无人机|影音娱乐|MP3/MP4|智能设备|耳机/耳麦|便携/无线音箱|音箱/音响|高清播放器|收音机|MP3/MP4配件|麦克风|专业音频|苹果配件|电子教育|学生平板|点读机/笔|早教益智|录音笔|电纸书|电子词典|复读机|虚拟商品|延保服务|杀毒软件|积分商品|家纺|桌布/罩件|地毯地垫|沙发垫套/椅垫|床品套件|被子|枕芯|床单被罩|毯子|床垫/床褥|蚊帐|抱枕靠垫|毛巾浴巾|电热毯|窗帘/窗纱|布艺软饰|凉席|灯具|台灯|节能灯|装饰灯|落地灯|应急灯/手电|LED灯|吸顶灯|五金电器|筒灯射灯|吊灯|氛围照明|生活日用|保暖防护|收纳用品|雨伞雨具|浴室用品|缝纫/针织用品|洗晒/熨烫|净化除味|家装软饰|相框/照片墙|装饰字画|节庆饰品|手工/十字绣|装饰摆件|帘艺隔断|墙贴/装饰贴|钟饰|花瓶花艺|香薰蜡烛|创意家居|宠物生活|宠物主粮|宠物零食|医疗保健|家居日用|宠物玩具|出行装备|洗护美容|电脑整机|笔记本|超极本|游戏本|平板电脑|平板电脑配件|台式机|服务器/工作站|笔记本配件|一体机|电脑配件|CPU|主板|显卡|硬盘|SSD固态硬盘|内存|机箱|电源|显示器|刻录机/光驱|散热器|声卡/扩展卡|装机配件|组装电脑|外设产品|移动硬盘|U盘|鼠标|键盘|鼠标垫|摄像头|手写板|硬盘盒|插座|线缆|UPS电源|电脑工具|游戏设备|电玩|电脑清洁|网络仪表仪器|游戏设备|游戏机|游戏耳机|手柄/方向盘|游戏软件|游戏周边|网络产品|路由器|网卡|交换机|网络存储|4G/3G上网|网络盒子|网络配件|办公设备|投影机|投影配件|多功能一体机|打印机|传真设备|验钞/点钞机|扫描设备|复合机|碎纸机|考勤机|收款/POS机|会议音频视频|保险柜|装订/封装机|安防监控|办公家具|白板|文具/耗材|硒鼓/墨粉|墨盒|色带|纸类|办公文具|学生文具|财会用品|文件管理|本册/便签|计算器|笔类|画具画材|刻录碟片/附件|服务产品|上门安装|延保服务|维修保养|电脑软件|京东服务|烹饪锅具|炒锅|煎锅|压力锅|蒸锅|汤锅|奶锅|锅具套装|煲类|水壶|火锅|刀剪菜板|菜刀|剪刀|刀具套装|砧板|瓜果刀/刨|多功能刀|厨房配件|保鲜盒|烘焙/烧烤|饭盒/提锅|储物/置物架|厨房DIY/小工具|水具酒具|塑料杯|运动水壶|玻璃杯|陶瓷/马克杯|保温杯|保温壶|酒杯/酒具|杯具套装|餐具|餐具套装|碗/碟/盘|筷勺/刀叉|一次性用品|果盘/果篮|酒店用品|自助餐炉|酒店餐具|酒店水具|茶具/咖啡具|整套茶具|茶杯|茶壶|茶盘茶托|茶叶罐|茶具配件|茶宠摆件|咖啡具|其他|清洁用品|纸品湿巾|衣物清洁|清洁工具|驱虫用品|家庭清洁|皮具护理|一次性用品|面部护肤|洁面|乳液面霜|面膜|剃须|套装|精华|眼霜|卸妆|防晒|防晒隔离|T区护理|眼部护理|精华露|爽肤水|身体护理|沐浴|润肤|颈部|手足|纤体塑形|美胸|套装|精油|洗发护发|染发/造型|香薰精油|磨砂/浴盐|手工/香皂|洗发|护发|染发|磨砂膏|香皂|口腔护理|牙膏/牙粉|牙刷/牙线|漱口水|套装|女性护理|卫生巾|卫生护垫|私密护理|脱毛膏|其他|洗发护发|洗发|护发|
染发|造型|假发|套装|美发工具|脸部护理|香水彩妆|香水|底妆|腮红|眼影|唇部|美甲|眼线|美妆工具|套装|防晒隔离|卸妆|眉笔|睫毛膏|女装|T恤|衬衫|针织衫|雪纺衫|卫衣|马甲|连衣裙|半身裙|牛仔裤|休闲裤|打底裤|正装裤|小西装|短外套|风衣|毛呢大衣|真皮皮衣|棉服|羽绒服|大码女装|中老年女装|婚纱|打底衫|旗袍/唐装|加绒裤|吊带/背心|羊绒衫|短裤|皮草|礼服|仿皮皮衣|羊毛衫|设计师/潮牌|男装|衬衫|T恤|POLO衫|针织衫|羊绒衫|卫衣|马甲/背心|夹克|风衣|毛呢大衣|仿皮皮衣|西服|棉服|羽绒服|牛仔裤|休闲裤|西裤|西服套装|大码男装|中老年男装|唐装/中山装|工装|真皮皮衣|加绒裤|卫裤/运动裤|短裤|设计师/潮牌|羊毛衫|内衣|文胸|女式内裤|男式内裤|睡衣/家居服|塑身美体|泳衣|吊带/背心|抹胸|连裤袜/丝袜|美腿袜|商务男袜|保暖内衣|情侣睡衣|文胸套装|少女文胸|休闲棉袜 |大码内衣|内衣配件|打底裤袜|打底衫|秋衣秋裤|情趣内衣|洗衣服务|服装洗护|服饰配件|太阳镜|光学镜架/镜片|围巾/手套/帽子套装|袖扣|棒球帽|毛线帽|遮阳帽|老花镜|装饰眼镜|防辐射眼镜|游泳镜|女士丝巾/围巾/披肩|男士丝巾/围巾|鸭舌帽|贝雷帽|礼帽|真皮手套|毛线手套|防晒手套|男士腰带/礼盒|女士腰带/礼盒|钥匙扣|遮阳伞/雨伞|口罩|耳罩/耳包|假领|毛线/布面料|领带/领结/领带夹|钟表|男表|瑞表|女表|国表|日韩表|欧美表|德表|儿童手表|智能手表|闹钟|座钟挂钟|钟表配件|流行男鞋|商务休闲鞋|正装鞋|休闲鞋|凉鞋/沙滩鞋|男靴|功能鞋|拖鞋/人字拖|雨鞋/雨靴|传统布鞋|鞋配件|帆布鞋|增高鞋|工装鞋|定制鞋|时尚女鞋|高跟鞋|单鞋|休闲鞋|凉鞋|女靴|雪地靴|拖鞋/人字拖|踝靴|筒靴|帆布鞋|雨鞋/雨靴|妈妈鞋|鞋配件|特色鞋|鱼嘴鞋|布鞋/绣花鞋|马丁靴|坡跟鞋|松糕鞋|内增高|防水台|奶粉|婴幼奶粉|孕妈奶粉|营养辅食|益生菌/初乳|米粉/菜粉|果泥/果汁|DHA|宝宝零食|钙铁锌/维生素|清火/开胃|面条/粥|尿裤湿巾|婴儿尿裤|拉拉裤|婴儿湿巾|成人尿裤|喂养用品|奶瓶奶嘴|吸奶器|暖奶消毒|儿童餐具|水壶/水杯|牙胶安抚|围兜/防溅衣|辅食料理机|食物存储|洗护用品|宝宝护肤|洗发沐浴|奶瓶清洗|驱蚊防晒|理发器|洗澡用具|婴儿口腔清洁|洗衣液/皂|日常护理|座便器|童车童床|婴儿推车|餐椅摇椅|婴儿床|学步车|三轮车|自行车|电动车|扭扭车|滑板车|婴儿床垫|寝居服饰|婴儿外出服|婴儿内衣|婴儿礼盒|婴儿鞋帽袜|安全防护|家居床品|睡袋/抱被|爬行垫|妈妈专区|妈咪包/背婴带|产后塑身|文胸/内裤|防辐射服|孕妈装|孕期营养|孕妇护肤|待产护理|月子装|防溢乳垫|童装童鞋|套装|上衣|裤子|裙子|内衣/家居服|羽绒服/棉服|亲子装|儿童配饰|礼服/演出服|运动鞋|皮鞋/帆布鞋|靴子|凉鞋|功能鞋|户外/运动服|安全座椅|提篮式|安全座椅|增高垫|潮流女包|钱包|手拿包|单肩包|双肩包|手提包|斜挎包|钥匙包|卡包/零钱包|精品男包|男士钱包|男士手包|卡包名片夹|商务公文包|双肩包|单肩/斜挎包|钥匙包|功能箱包|电脑包|拉杆箱|旅行包|旅行配件|休闲运动包|拉杆包|登山包|妈咪包|书包|相机包|腰包/胸包|礼品|火机烟具|礼品文具|军刀军具|收藏品|工艺礼品|创意礼品|礼盒礼券|鲜花绿植|婚庆节庆|京东卡|美妆礼品|礼品定制|京东福卡|古董文玩|奢侈品|箱包|钱包|服饰|腰带|太阳镜/眼镜框|配件|鞋靴|饰品|名品腕表|高档化妆品|婚庆|婚嫁首饰|婚纱摄影|婚纱礼服|婚庆服务|婚庆礼品/用品|婚宴|进口食品|饼干蛋糕|糖果/巧克力|休闲零食|冲调饮品|粮油调味|牛奶|地方特产|其他特产|新疆|北京|山西|内蒙古|福建|湖南|四川|云南|东北|休闲食品|休闲零食|坚果炒货|肉干肉脯|蜜饯果干|糖果/巧克力|饼干蛋糕|无糖食品|粮油调味|米面杂粮|食用油|调味品|南北干货|方便食品|有机食品|饮料冲调|饮用水|饮料|牛奶乳品|咖啡/奶茶|冲饮谷物|蜂蜜/柚子茶|成人奶粉|食品礼券|月饼|大闸蟹|粽子|卡券|茗茶|铁观音|普洱|龙井|绿茶|红茶|乌龙茶|花草茶|花果茶|养生茶|黑茶|白茶|其它茶|时尚饰品|项链|手链/脚链|戒指|耳饰|毛衣链|发饰/发卡|胸针|饰品配件|婚庆饰品|黄金|黄金吊坠|黄金项链|黄金转运珠|黄金手镯/手链/脚链|黄金耳饰|黄金戒指|K金饰品|K金吊坠|K金项链|K金手镯/手链/脚链|K金戒指|K金耳饰|金银投资|投资金|投资银|投资收藏|银饰|银吊坠/项链|银手镯/手链/脚链|银戒指|银耳饰|足银手镯|宝宝银饰|钻石|裸钻|钻戒|钻石项链/吊坠|钻
石耳饰|钻石手镯/手链|翡翠玉石|项链/吊坠|手镯/手串|戒指|耳饰|挂件/摆件/把件|玉石孤品|水晶玛瑙|项链/吊坠|耳饰|手镯/手链/脚链|戒指|头饰/胸针|摆件/挂件|彩宝|琥珀/蜜蜡|碧玺|红宝石/蓝宝石|坦桑石|珊瑚|祖母绿|葡萄石|其他天然宝石|项链/吊坠|耳饰|手镯/手链|戒指|铂金|铂金项链/吊坠|铂金手镯/手链/脚链|铂金戒指|铂金耳饰|木手串/把件|小叶紫檀|黄花梨|沉香木|金丝楠|菩提|其他|橄榄核/核桃|檀香|珍珠|珍珠项链|珍珠吊坠|珍珠耳饰|珍珠手链|珍珠戒指|珍珠胸针|维修保养|机油|正时皮带|添加剂|汽车喇叭|防冻液|汽车玻璃|滤清器|火花塞|减震器|柴机油/辅助油|雨刷|车灯|后视镜|轮胎|轮毂|刹车片/盘|维修配件|蓄电池|底盘装甲/护板|贴膜|汽修工具|改装配件|车载电器|导航仪|安全预警仪|行车记录仪|倒车雷达|蓝牙设备|车载影音|净化器|电源|智能驾驶|车载电台|车载电器配件|吸尘器|智能车机|冰箱|汽车音响|车载生活电器|美容清洗|车蜡|补漆笔|玻璃水|清洁剂|洗车工具|镀晶镀膜|打蜡机|洗车配件|洗车机|洗车水枪|毛巾掸子|汽车装饰|脚垫|座垫|座套|后备箱垫|头枕腰靠|方向盘套|香水|空气净化|挂件摆件|功能小件|车身装饰件|车衣|安全自驾|安全座椅|胎压监测|防盗设备|应急救援|保温箱|地锁|摩托车|充气泵|储物箱|自驾野营|摩托车装备|汽车服务|清洗美容|功能升级|保养维修|油卡充值|车险|加油卡|ETC|驾驶培训|赛事改装|赛事服装|赛事用品|制动系统|悬挂系统|进气系统|排气系统|电子管理|车身强化|赛事座椅|运动鞋包|跑步鞋|休闲鞋|篮球鞋|板鞋|帆布鞋|足球鞋|乒羽网鞋|专项运动鞋|训练鞋|拖鞋|运动包|运动服饰|羽绒服|棉服|运动裤|夹克/风衣|卫衣/套头衫|T恤|套装|乒羽网服|健身服|运动背心|毛衫/线衫|运动配饰|骑行运动|折叠车|山地车/公路车|电动车|其他整车|骑行服|骑行装备|平衡车|垂钓用品|鱼竿鱼线|浮漂鱼饵|钓鱼桌椅|钓鱼配件|钓箱鱼包|其它|游泳用品|泳镜|泳帽|游泳包防水包|女士泳衣|男士泳衣|比基尼|其它|户外鞋服|冲锋衣裤|速干衣裤|滑雪服|羽绒服/棉服|休闲衣裤|抓绒衣裤|软壳衣裤|T恤|户外风衣|功能内衣|军迷服饰|登山鞋|雪地靴|徒步鞋|越野跑鞋|休闲鞋|工装鞋|溯溪鞋|沙滩/凉拖|户外袜|户外装备|帐篷/垫子|睡袋/吊床|登山攀岩|户外配饰|背包|户外照明|户外仪表|户外工具|望远镜|旅游用品|便携桌椅床|野餐烧烤|军迷用品|救援装备|滑雪装备|极限户外|冲浪潜水|健身训练|综合训练器|其他大型器械|哑铃|仰卧板/收腹机|其他中小型器材|瑜伽舞蹈|甩脂机|踏步机|武术搏击|健身车/动感单车|跑步机|运动护具|体育用品|羽毛球|乒乓球|篮球|足球|网球|排球|高尔夫|台球|棋牌麻将|轮滑滑板|其他|适用年龄|0-6个月|6-12个月|1-3岁|3-6岁|6-14岁|14岁以上|遥控/电动|遥控车|遥控飞机|遥控船|机器人|轨道/助力|毛绒布艺|毛绒/布艺|靠垫/抱枕|娃娃玩具|芭比娃娃|卡通娃娃|智能娃娃|模型玩具|仿真模型|拼插模型|收藏爱好|健身玩具|炫舞毯|爬行垫/毯|户外玩具|戏水玩具|动漫玩具|电影周边|卡通周边|网游周边|益智玩具|摇铃/床铃|健身架|早教启智|拖拉玩具|积木拼插|积木|拼图|磁力棒|立体拼插|DIY玩具|手工彩泥|绘画工具|情景玩具|创意减压|减压玩具|创意玩具|乐器|钢琴|电子琴/电钢琴|吉他/尤克里里|打击乐器|西洋管弦|民族管弦乐器|乐器配件|电脑音乐|工艺礼品乐器|口琴/口风琴/竖笛|手风琴||机票|国内机票|酒店|国内酒店|酒店团购|旅行|度假|景点|租车|火车票|旅游团购|充值|手机充值|游戏|游戏点卡|QQ充值|票务|电影票|演唱会|话剧歌剧|音乐会|体育赛事|舞蹈芭蕾|戏曲综艺|产地直供|水果|苹果|橙子|奇异果/猕猴桃|车厘子/樱桃|芒果|蓝莓|火龙果|葡萄/提子|柚子|香蕉|牛油果|梨|菠萝/凤梨|桔/橘|柠檬|草莓|桃/李/杏|更多水果|水果礼盒/券|猪牛羊肉|牛肉|羊肉|猪肉|内脏类|海鲜水产|鱼类|虾类|蟹类|贝类|海参|海产干货|其他水产|海产礼盒|禽肉蛋品|鸡肉|鸭肉|蛋类|其他禽类|冷冻食品|水饺/馄饨|汤圆/元宵|面点|火锅丸串|速冻半成品|奶酪黄油|熟食腊味|熟食|腊肠/腊肉|火腿|糕点|礼品卡券|饮品甜品|冷藏果蔬汁|冰激凌|其他` task := &model.Task{ Id: "jingdong", Name: "jingdong", Request: []*model.Request{ { Method: "get", Url: 
fmt.Sprintf("https://search.jd.com/Search?keyword={%s}&enc=utf-8&page={1-5,1}", goodsType), ProcessName: "jingdong-list", }, }, Process: []model.Process{ { Name: "jingdong-list", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.gl-item", "img": "attr.src|.err-product", "price": "text|.p-price strong i", "goods_name": "text|.p-name a em", "desc": "text|.p-name a i", "comment_num": "text|.p-commit strong a", "shop_addr": "attr.href|.curr-shop", "shop_name": "attr.title|.curr-shop", "goods_id": "attr.data-sku|.J_focus", }, }, AddQueue: []*model.Request{ { Method: "get", Url: "https://sclub.jd.com/comment/productPageComments.action?productId={$goods_id}&score=0&sortType=5&page=0&pageSize=10", ProcessName: "jingdong-comment-first", }, }, }, { // Name: "jingdong-comment-first", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "max_page": "maxPage", "id": "productCommentSummary.productId", }, }, AddQueue: []*model.Request{ { Method: "get", Url: "https://sclub.jd.com/comment/productPageComments.action?productId={$id}&score=0&sortType=5&page={0-$max_page,1}&pageSize=10", ProcessName: "jingdong-comments", }, }, }, { Name: "jingdong-comments", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "node": "array|comments", "comment_id": "id", "content": "content", "create_time": "creationTime", "image_count": "imageCount", "isMobile": "isMobile", "productColor": "productColor", "productSize": "productSize", "productId": "referenceId", "score": "score", "replyCount": "replyCount", "usefulVoteCount": "usefulVoteCount", "userClient": "userClient", "userClientShow": "userClientShow", "userLevelId": "userLevelId", "userLevelName": "userLevelName", "userProvince": "userProvince", "nickname": "nickname", }, }, }, { Name: "jingdong-type", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "type": "texts|.items a", }, }, }, }, Pipline: "file", } app := spider.New() 
app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/qiongyou/conf.json ================================================ { "name":"qiongyou_spider", "version":"0.01", "work_num": 5, "max_wait_num":12000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"], "schedule":"redis", "redis_addr":"127.0.0.1:6379" } ================================================ FILE: example-spider/qiongyou/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "qiongyou", Name: "qiongyou", Request: []*model.Request{ { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_1_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_2_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_3_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_4_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_5_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_6_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_7_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_8_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_9_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_10_{1-134,1}/", ProcessName: "qiongyou-list", }, { Method: "get", Url: "http://plan.qyer.com/search_0_0_0_0_0_11_{1-134,1}/", ProcessName: "qiongyou-list", }, }, Process: []model.Process{ { Name: "qiongyou-list", Type: "template", 
TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.items", "img": "attr.src|.plan-cover", "time": "text|.fontYaHei dt", "title": "text|.fontYaHei dd", "day": "text|.day strong", "tag": "text|.tag strong", "plan": "text|.plan p", "author": "text|.name", "read_num": "text|.number .icon1", "xx_num": "text|.number .icon2", }, }, }, }, Pipline: "file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/qiubai/conf.json ================================================ { "name":"sohu_spider", "version":"0.01", "work_num": 1, "max_wait_num":4096, "http_addr":"127.0.0.1:7773", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: example-spider/qiubai/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "qiiubai", Name: "qiubai", Request: []*model.Request{ { Method: "get", Url: "https://www.qiushibaike.com", }, }, Process: []model.Process{ { RegUrl: []string{ "/.*?/page/[0-9]+", "/hot/|/imgrank/|/text/|/history/|/pic/|/textnew/", }, Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.article", "url": "attr.href|.contentHerf", "author": "attr.alt|.author a img", "content": "text|.content span", "like_num": "text|.stats-vote i", "comment_num": "text|.stats-comments a i", }, }, }, }, Pipline: "file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/qiubai/sem_test.go ================================================ package main import ( "fmt" "os" "strings" "time" "github.com/tebeka/selenium" ) func Example() { // running). 
const ( seleniumPath = "vendor/selenium-server-standalone-3.4.jar" geckoDriverPath = "vendor/geckodriver-v0.18.0-linux64" port = 8080 ) opts := []selenium.ServiceOption{ selenium.StartFrameBuffer(), // Start an X frame buffer for the browser to run in. selenium.GeckoDriver(geckoDriverPath), // Specify the path to GeckoDriver in order to use Firefox. selenium.Output(os.Stderr), // Output debug information to STDERR. } selenium.SetDebug(true) service, err := selenium.NewSeleniumService(seleniumPath, port, opts...) if err != nil { panic(err) // panic is used only as an example and is not otherwise recommended. } defer service.Stop() // Connect to the WebDriver instance running locally. caps := selenium.Capabilities{"browserName": "firefox"} wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", port)) if err != nil { panic(err) } defer wd.Quit() // Navigate to the simple playground interface. if err := wd.Get("http://play.golang.org/?simple=1"); err != nil { panic(err) } // Get a reference to the text box containing code. elem, err := wd.FindElement(selenium.ByCSSSelector, "#code") if err != nil { panic(err) } // Remove the boilerplate code already in the text box. if err := elem.Clear(); err != nil { panic(err) } // Enter some new code in text box. err = elem.SendKeys(` package main import "fmt" func main() { fmt.Println("Hello WebDriver!\n") } `) if err != nil { panic(err) } // Click the run button. btn, err := wd.FindElement(selenium.ByCSSSelector, "#run") if err != nil { panic(err) } if err := btn.Click(); err != nil { panic(err) } // Wait for the program to finish running and get the output. outputDiv, err := wd.FindElement(selenium.ByCSSSelector, "#output") if err != nil { panic(err) } var output string for { output, err = outputDiv.Text() if err != nil { panic(err) } if output != "Waiting for remote server..." 
{ break } time.Sleep(time.Millisecond * 100) } fmt.Printf("%s", strings.Replace(output, "\n\n", "\n", -1)) } ================================================ FILE: example-spider/ttkb/conf.json ================================================ { "name":"ttkb-author", "version":"0.01", "work_num": 50, "max_wait_num":200000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"], "mysql":"root:123456@tcp(127.0.0.1:3306)/auto_db?charset=utf8" } ================================================ FILE: example-spider/ttkb/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" "fmt" ) func main() { types := "daily_timeline|kb_video_news|kb_news_bagua|kb_news_qipa|kb_photo_news|kb_news_tech|kb_news_finance|location|kb_news_world|kb_news_movie|kb_news_gaojidi|kb_news_wealth|kb_photo_gif|kb_news_sports|kb_news_mil|kb_news_history|kb_news_nba|kb_news_car|kb_news_chaobao|kb_news_laugh|kb_news_pet|kb_news_science|kb_news_baby|kb_news_astro|kb_news_sex|kb_news_beauty|kb_news_house|kb_news_share|kb_news_rock|kb_news_tfboys|kb_news_augury|kb_news_photography|kb_news_lottery|kb_news_cate|kb_news_julebu|kb_news_travel|kb_news_idea|kb_news_lol|kb_news_erciyuan|kb_news_space|kb_news_game|kb_news_iphone|kb_news_esport|kb_news_health|kb_news_outfit|kb_news_furnishing|kb_news_workout|kb_news_soup|kb_news_run|kb_news_fishing|kb_news_buddism|kb_news_diet|kb_news_football|kb_news_tennis|kb_news_tea|kb_news_yoga|kb_news_plaything|kb_news_watch" //types := 
"daily_timeline|kb_video_news|kb_news_bagua|kb_news_qipa|kb_photo_news|kb_news_tech|kb_news_finance|location|kb_news_world|kb_news_movie|kb_news_gaojidi|kb_news_wealth|kb_photo_gif|kb_news_sports|kb_news_mil|kb_news_history|kb_news_nba|kb_news_car|kb_news_chaobao|kb_news_laugh|kb_news_pet|kb_news_science|kb_news_baby|kb_news_astro|kb_news_sex|kb_news_beauty|kb_news_house|kb_news_share|kb_news_rock|kb_news_tfboys|kb_news_augury|kb_news_photography|kb_news_lottery|kb_news_cate|kb_news_julebu|kb_news_travel|kb_news_idea|kb_news_lol|kb_news_erciyuan|kb_news_space|kb_news_game|kb_news_iphone|kb_news_esport|kb_news_health|kb_news_outfit|kb_news_furnishing|kb_news_workout|kb_news_soup|kb_news_run|kb_news_fishing|kb_news_buddism|kb_news_diet|kb_news_football|kb_news_tennis|kb_news_tea|kb_news_yoga|kb_news_plaything|kb_news_watch" task := &model.Task{ Id: "ttkb-author", Name: "ttkb-author", Request: []*model.Request{ { Method: "get", Url: fmt.Sprintf(`http://r.cnews.qq.com/getSubNewsChlidInterest?patchver=4511&mid=fd248c13ee1ce793495484e4cf3250f8ebbb475a&devid=860046037899335&store=60009&screen_height=1920&apptype=android&origin_imei=860046037899335&hw=OnePlus_ONEPLUSA3000&appver=25_areading_4.5.11&appversion=4.5.11&uid=bfa0a264a6547298&screen_width=1080&sceneid=&android_id=bfa0a264a6547298&last_id=20171207A03G7J00&ssid=GeeyueTech_5G&forward=0&IronThroneBuildTime=1512716487405&omgid=e0f7a4180378ba4e5ee80b0820ef5a1744ca0010211815&IronThroneRelBuildTime=415047497&refreshType=normal&qqnetwork=wifi&last_time=&bottom_id=20171207A0BFU500&top_time=1512631500¤tTab=kuaibao&top_id=20171207C0HX4500&is_wap=0&omgbizid=b03081d3f5806f45b65904d08cfad6bc77130080211815&page={1-1000,1}&imsi=460019017167485&lastRefreshTime=&IronThroneRelExecTime=415047499&muid=49887860909485482&activefrom=icon&cachedCount=20&direction=0&sessionid=&chRefreshTimes=0&chlid={%s}&bottom_time=1512603257&IronThroneExecTime=1512716487407&qn-sig=284d6905ece4010e0ebd89dce072b5ee&qn-rid=6e63ca4d-1285-47ee-b95d-0bb49da3ce0
3`,types), ProcessName: "ttkblist", }, }, Process: []model.Process{ { Name: "ttkblist", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "node": "array|newslist", "chlid": "chlid", }, }, AddQueue:[]*model.Request{ { Method: "get", Url : "http://r.cnews.qq.com/getSubItem?chlid={$chlid}", ProcessName: "author", }, }, }, { Name: "author", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ //"node": "", "chlid": "channelInfo.chlid", "chlname":"channelInfo.chlname", "desc":"channelInfo.desc", "subCount":"channelInfo.subCount", "uin":"channelInfo.uin", "intro":"channelInfo.intro", "recommend":"channelInfo.recommend", "followCount":"channelInfo.followCount", "readCount":"channelInfo.readCount", "shareCount":"channelInfo.shareCount", "colCount":"channelInfo.colCount", }, }, }, }, Pipline: "mysql", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/ttkb-author/conf.json ================================================ { "name":"ttkb-author", "version":"0.01", "work_num": 50, "max_wait_num":210000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"], "mysql":"root:123456@tcp(127.0.0.1:3306)/auto_db?charset=utf8" } ================================================ FILE: example-spider/ttkb-author/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "ttkb-author", Name: "ttkb-author", Request: []*model.Request{ { Method: "get", Url:`http://r.cnews.qq.com/getSubItem?chlid={6000000-6200000,1}`, ProcessName: "ttkb-author", }, }, Process: []model.Process{ { Name: "ttkb-author", Type: "json", JsonRule: model.JsonRule{ Rule: map[string]string{ "chlid": "channelInfo.chlid", "chlname":"channelInfo.chlname", "desc":"channelInfo.desc", "subCount":"channelInfo.subCount", "uin":"channelInfo.uin", 
"intro":"channelInfo.intro", "recommend":"channelInfo.recommend", "followCount":"channelInfo.followCount", "readCount":"channelInfo.readCount", "shareCount":"channelInfo.shareCount", "colCount":"channelInfo.colCount", }, }, }, }, Pipline: "mysql", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/tuiku/conf.json ================================================ { "name":"tuiku_spider", "version":"0.01", "work_num": 2, "max_wait_num":12000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: example-spider/tuiku/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "tuiku", Name: "tuiku", Request: []*model.Request{ { Method: "get", Url: "http://www.tuicool.com/ah/0/{1-100,1}?lang=1", ProcessName: "tuikulist", }, { Method: "get", Url: "http://www.tuicool.com/ah/101000000/{1-100,1}?lang=1", ProcessName: "tuikulist", }, { Method: "get", Url: "http://www.tuicool.com/ah/101040000/{1-100,1}?lang=1", ProcessName: "tuikulist", }, { Method: "get", Url: "http://www.tuicool.com/ah/101050000/{1-100,1}?lang=1", ProcessName: "tuikulist", }, { Method: "get", Url: "http://www.tuicool.com/ah/20/{1-100,1}?lang=1", ProcessName: "tuikulist", }, { Method: "get", Url: "http://www.tuicool.com/ah/108000000/{1-100,1}?lang=1", ProcessName: "tuikulist", }, { Method: "get", Url: "http://www.tuicool.com/ah/114000000/{1-100,1}?lang=1", ProcessName: "tuikulist", }, }, Process: []model.Process{ { Name: "tuikulist", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.list_article_item", "img": "attr.src|.article_thumb_image img", "title": "text|.title a", "author": "text|.tip span:nth-child(1)", "time": "text|.tip span:nth-child(3)", }, }, }, }, Pipline: 
"file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/wangyi-music/conf.json ================================================ { "name":"music_spider", "version":"0.01", "work_num": 100, "max_wait_num":100000, "http_addr":"127.0.0.1:7775", "mysql":"root:123456@tcp(127.0.0.1:3306)/auto_db?charset=utf8" } ================================================ FILE: example-spider/wangyi-music/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" "fmt" "strings" ) func main() { musicType := `华语| 欧美| 日语| 韩语| 粤语| 小语种| 流行| 摇滚| 民谣| 电子| 舞曲| 说唱| 轻音乐| 爵士| 乡村| R&B/Soul| 古典| 民族| 英伦| 金属| 朋克| 蓝调| 雷鬼| 世界音乐| 拉丁| 另类/独立| New Age| 古风| 后摇| Bossa Nova| 清晨| 夜晚| 学习| 工作| 午休| 下午茶| 地铁| 驾车| 运动| 旅行| 散步| 酒吧| 怀旧| 清新| 浪漫| 性感| 伤感| 治愈| 放松| 孤独| 感动| 兴奋| 快乐| 安静| 思念| 影视原声| ACG| 校园| 游戏| 70后| 80后| 90后| 网络歌曲| KTV| 经典| 翻唱| 吉他| 钢琴| 器乐| 儿童| 榜单| 00后|` musicType = strings.Replace(musicType, " ", "", -1) musicTypes := strings.Split(musicType, "|") reqs := []*model.Request{} for _, ty := range musicTypes { reqs = append(reqs, &model.Request{ Method: "get", Url: fmt.Sprintf("http://music.163.com/discover/playlist/?order=hot&cat=%s&limit=35&offset={0-1440,35}", ty), ProcessName: "music-list", }) } task := &model.Task{ Id: "music-list", Name: "music-list", Request: reqs, Process: []model.Process{ { Name: "music-list", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.m-cvrlst li", "img": "attr.src|.u-cover img", "music_addr": "attr.href|.u-cover a", "title": "attr.title|.u-cover a", "play_num": "text|.nb", "author": "text|.nm", }, }, AddQueue: []*model.Request{ { Method: "get", Url: "http://music.163.com{$music_addr}", ProcessName: "music-detail", }, }, }, { Name: "music-detail", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "img": "attr.src|.u-cover 
img", "title": "text|.f-ff2", "play_num": "text|#play-count", "author": "text|.s-fc7", "like_num": "text|.u-btni-fav i", "share_num": "text|.u-btni-share i", "comment_num": "text|#cnt_comment_count", "desc": "text|#album-desc-dot", "time": "text|.time", "music_count": "#playlist-track-count", "id": "attr.data-rid|#content-operation", }, }, }, }, } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: example-spider/wangyi-music/music/conf.json ================================================ { "name":"music_spider", "version":"0.01", "work_num": 100, "max_wait_num":100000, "http_addr":"127.0.0.1:7775", "etcd":["http://127.0.0.1:2379"], "mysql":"root:123456@tcp(127.0.0.1:3306)/auto_db?charset=utf8" } ================================================ FILE: example-spider/wangyi-music/music/wangyi-music ================================================ [File too large to display: 17.7 MB] ================================================ FILE: example-spider/woshipm/conf.json ================================================ { "name":"woshipm_spider", "version":"0.01", "work_num": 100, "max_wait_num":12000, "http_addr":"127.0.0.1:7774", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: example-spider/woshipm/main.go ================================================ package main import ( "YiSpider/spider" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" ) func main() { task := &model.Task{ Id: "woshipm", Name: "woshipm", Request: []*model.Request{ { Method: "get", Url: "http://www.woshipm.com/category/pd/page/{1-588,1}", ProcessName: "woshipm-list", }, }, Process: []model.Process{ { Name: "woshipm-list", Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.postlist-item", "img": "attr.src|.post-img a img", "time": "text|.stream-list-meta time", "title": "text|.post-title a", "author": "text|.author a", "des": 
"text|.des", "read_num": "text|.post-meta-items span:nth-child(1)", "collect_num": "text|.post-meta-items span:nth-child(2)", "like_num": "text|.post-meta-items span:nth-child(3)", }, }, }, }, Pipline: "file", } app := spider.New() app.AddSpider(spider2.InitWithTask(task)) app.Run() } ================================================ FILE: manage/conf.json ================================================ { "name":"yi_spider_manage", "version":"0.01", "discover":"etcd", "http_addr":"127.0.0.1:7778", "etcd":["http://127.0.0.1:2379"] } ================================================ FILE: manage/config/config.go ================================================ package config import ( "YiSpider/manage/logger" "encoding/json" "io/ioutil" "os" ) var ConfigI *Config type Config struct { Name string `json:"name"` Version string `json:"version"` Discover string `json:"discover"` HttpAddr string `json:"http_addr"` Etcd []string `json:"etcd"` } func InitConfig() error { var file *os.File var bytes []byte var err error if file, err = os.OpenFile("./conf.json", os.O_RDONLY, 0666); err != nil { return err } if bytes, err = ioutil.ReadAll(file); err != nil { return err } ConfigI = &Config{} if err = json.Unmarshal(bytes, ConfigI); err != nil { return err } logger.Info("init success ", *ConfigI) return nil } ================================================ FILE: manage/discover/discover.go ================================================ package discover import ( "YiSpider/manage/config" "YiSpider/manage/discover/etcd" "YiSpider/manage/model" ) type Discover interface { GetNodes() map[string]*model.Node Start() error } var DiscoverI Discover func InitDiscover() error { var err error switch config.ConfigI.Discover { case "etcd": DiscoverI, err = etcd.NewCluster(config.ConfigI.Etcd) if err != nil { return err } DiscoverI.Start() } return nil } func GetNodes() map[string]*model.Node { if DiscoverI != nil { return DiscoverI.GetNodes() } return nil } 
================================================ FILE: manage/discover/etcd/etcd.go ================================================ package etcd import ( "encoding/json" "time" "YiSpider/manage/logger" "YiSpider/manage/model" "fmt" "github.com/coreos/etcd/client" "golang.org/x/net/context" "log" ) type Cluster struct { nodes map[string]*model.Node KeysAPI client.KeysAPI } func NewCluster(endpoints []string) (*Cluster, error) { cfg := client.Config{ Endpoints: endpoints, Transport: client.DefaultTransport, HeaderTimeoutPerRequest: time.Second, } etcdClient, err := client.New(cfg) if err != nil { logger.Error("Error: cannot connec to etcd:", err) return nil, err } master := &Cluster{ nodes: make(map[string]*model.Node), KeysAPI: client.NewKeysAPI(etcdClient), } return master, nil } func (c *Cluster) Start() error { go c.WatchWorkers() fmt.Println("Master Start ...") return nil } func (c *Cluster) GetNodes() map[string]*model.Node { fmt.Println("c.nodes", c.nodes) return c.nodes } func (c *Cluster) addWorker(info *model.WorkerInfo) { node := &model.Node{ IsHealth: true, IP: info.IP, Name: info.Name, CPU: info.CPU, MetaData: info.MetaData, SpiderData: info.SpiderData, } c.nodes[node.Name] = node } func (c *Cluster) updateWorker(info *model.WorkerInfo) { c.addWorker(info) } func unmarshal(node *client.Node) *model.WorkerInfo { logger.Info(node.Value) info := &model.WorkerInfo{} err := json.Unmarshal([]byte(node.Value), info) if err != nil { logger.Error(err) } return info } func (c *Cluster) WatchWorkers() { api := c.KeysAPI watcher := api.Watcher("spiders/", &client.WatcherOptions{ Recursive: true, }) for { res, err := watcher.Next(context.Background()) if err != nil { logger.Error("Error watch workers:", err) break } if res.Action == "expire" { info := unmarshal(res.PrevNode) logger.Info("Expire worker ", info.Name) member, ok := c.nodes[info.Name] if ok { member.IsHealth = false } } else if res.Action == "set" { info := unmarshal(res.Node) if _, ok := 
c.nodes[info.Name]; ok { logger.Info("Update worker ", info.Name) c.updateWorker(info) } else { logger.Info("Add worker ", info.Name) c.addWorker(info) } } else if res.Action == "delete" { info := unmarshal(res.Node) log.Println("Delete worker ", info.Name) delete(c.nodes, info.Name) } } } ================================================ FILE: manage/discover/file/file.go ================================================ package file func init() { } ================================================ FILE: manage/discover/zookeeper/zookeeper.go ================================================ package zookeeper func init() { } ================================================ FILE: manage/http/controller.go ================================================ package http import ( "YiSpider/manage/model" "encoding/json" "io/ioutil" "net/http" "net/url" ) var errorMethod = []byte("{\"code\":\"400\",\"msg\":\"not support method\"}") var errorQuery = []byte("{\"code\":\"400\",\"msg\":\"error url parmas\"}") var errorBody = []byte("{\"code\":\"400\",\"msg\":\"error get body\"}") var errorJson = []byte("{\"code\":\"400\",\"msg\":\"error get Json\"}") var commonSuccess = []byte("{\"code\":\"200\",\"msg\":\"success\"}") func AddTask(w http.ResponseWriter, req *http.Request) { if req.Method != "POST" { w.Write(errorMethod) return } body, err := ioutil.ReadAll(req.Body) if err != nil { w.Write([]byte(err.Error())) return } task := &model.Task{} err = json.Unmarshal(body, task) if err != nil { w.Write([]byte(err.Error())) return } data, err := AddTaskS(task) if err != nil { w.Write([]byte(err.Error())) return } w.Write(data) } func StopTask(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") data, err := StopTaskS(name) if err != nil { w.Write([]byte(err.Error())) return } w.Write(data) } func RunTask(w 
http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") data, err := RunTaskS(name) if err != nil { w.Write([]byte(err.Error())) return } w.Write(data) } func EndTask(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") data, err := EndTaskS(name) if err != nil { w.Write([]byte(err.Error())) return } w.Write(data) } func ListTask(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") data, err := ListTaskS(name) if err != nil { w.Write([]byte(err.Error())) return } w.Write(data) } func ListNode(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } data, err := ListNodesS() if err != nil { w.Write([]byte(err.Error())) return } w.Write(data) } ================================================ FILE: manage/http/request.go ================================================ package http import ( "bytes" "encoding/json" "fmt" "golang.org/x/net/publicsuffix" "io/ioutil" "net/http" "net/http/cookiejar" "time" ) var ClientI *http.Client func init() { ClientI = MakeClient(nil) } func makeCookiejar() http.CookieJar { cookiejarOptions := cookiejar.Options{ PublicSuffixList: publicsuffix.List, } jar, _ := cookiejar.New(&cookiejarOptions) return jar } func MakeClient(transport http.RoundTripper) *http.Client { return &http.Client{Jar: makeCookiejar(), Transport: transport, Timeout: 5 * time.Second} } func Get(url string) ([]byte, error) { res, err := DoRequest("GET", url, nil) if err != nil { return []byte{}, err } var body []byte if body, err 
= ioutil.ReadAll(res.Body); err != nil { return []byte{}, err } fmt.Println("GET", url, " =>", string(body)) return body, nil } func Post(url string, data interface{}) ([]byte, error) { dataJ, err := json.Marshal(data) if err != nil { return []byte{}, err } fmt.Println("Request:", string(dataJ)) res, err := DoRequest("POST", url, dataJ) if err != nil { return []byte{}, err } var body []byte if body, err = ioutil.ReadAll(res.Body); err != nil { return []byte{}, err } return body, nil } func DoRequest(method string, url string, data []byte) (resp *http.Response, err error) { req, err := http.NewRequest(method, url, bytes.NewBuffer(data)) if err != nil { return nil, err } req.Header.Set("Content-Type", "application/json") return ClientI.Do(req) } ================================================ FILE: manage/http/server.go ================================================ package http import ( "YiSpider/manage/config" "YiSpider/manage/logger" "net/http" ) func InitHttpServer() { http.HandleFunc("/task/add", AddTask) http.HandleFunc("/task/run", RunTask) http.HandleFunc("/task/stop", StopTask) http.HandleFunc("/task/end", EndTask) http.HandleFunc("/tasks", ListTask) http.HandleFunc("/nodes", ListNode) err := http.ListenAndServe(config.ConfigI.HttpAddr, nil) if err != nil { logger.Error("ListenAndServe fail:", err) } } ================================================ FILE: manage/http/service.go ================================================ package http import ( "YiSpider/manage/discover" "YiSpider/manage/model" "YiSpider/manage/strategy" "encoding/json" "fmt" ) func AddTaskS(task *model.Task) ([]byte, error) { node := strategy.GetNode() return Post(getUrl(node.IP, "/task/add"), task) } func RunTaskS(name string) ([]byte, error) { node := strategy.GetNode() return Get(getUrl(node.IP, "/task/run?name="+name)) } func StopTaskS(name string) ([]byte, error) { node := strategy.GetNode() return Get(getUrl(node.IP, "/task/stop?name="+name)) } func EndTaskS(name string) 
([]byte, error) { node := strategy.GetNode() return Get(getUrl(node.IP, "/task/end?name="+name)) } func ListTaskS(name string) ([]byte, error) { fmt.Println("name", name, "nodes", discover.GetNodes()) node := discover.GetNodes()[name] return Get(getUrl(node.IP, "/tasks")) } func ListNodesS() ([]byte, error) { nodes := discover.GetNodes() return json.Marshal(nodes) } func getUrl(ip string, path string) string { url := fmt.Sprintf("http://%s%s", ip, path) return url } ================================================ FILE: manage/logger/logger.go ================================================ package logger import "fmt" func Info(v ...interface{}) { fmt.Println(v) } func Debug(v ...interface{}) { fmt.Println(v) } func Warn(v ...interface{}) { fmt.Println(v) } func Error(v ...interface{}) { fmt.Println(v) } ================================================ FILE: manage/main.go ================================================ package main import ( "YiSpider/manage/config" "YiSpider/manage/discover" "YiSpider/manage/http" "YiSpider/manage/logger" ) func main() { var err error if err = config.InitConfig(); err != nil { logger.Info(err.Error()) return } discover.InitDiscover() http.InitHttpServer() } ================================================ FILE: manage/model/node_info.go ================================================ package model type Node struct { IsHealth bool `json:"is_health"` IP string `json:"ip"` Name string `json:"name"` CPU int `json:"cpu"` MetaData map[string]string `json:"metadata"` SpiderData map[string]*SpiderData `json:"spider_data"` } type WorkerInfo struct { Name string `json:"name"` IP string `json:"ip"` CPU int `json:"cpu"` MetaData map[string]string `json:"metadata"` SpiderData map[string]*SpiderData `json:"spider_data"` } type SpiderData struct { DownloadFailCount int32 `json:"download_fail_count"` DownloadCount int32 `json:"download_count"` UrlNum int32 `json:"url_num"` WaitUrlNum int `json:"wait_url_num"` CrawlerResultNum int32 
// Process is the type of Task.Process (JSON key "process").
type Process struct {
	// Url and RegUrl carry no json tag, so encoding/json uses the Go field
	// names ("Url", "RegUrl") as their keys.
	// NOTE(review): every other field in this file is tagged in snake_case;
	// confirm the untagged names are what the spider side expects.
	Url    string
	RegUrl []string
	// Type: the original comment lists the values template, json, self_process.
	Type string `json:"type"`
	// TemplateRule and JsonRule each wrap a Rule map; presumably the one
	// matching Type is consulted — TODO confirm against the spider node.
	TemplateRule TemplateRule `json:"template_rule"`
	JsonRule     JsonRule     `json:"json_rule"`
}
string(body)) return body, nil } func Post(url string, data interface{}) ([]byte, error) { dataJ, err := json.Marshal(data) if err != nil { return []byte{}, err } fmt.Println("Request:", string(dataJ)) res, err := DoRequest("POST", url, dataJ) if err != nil { return []byte{}, err } var body []byte if body, err = ioutil.ReadAll(res.Body); err != nil { return []byte{}, err } return body, nil } func DoRequest(method string, url string, data []byte) (resp *http.Response, err error) { req, err := http.NewRequest(method, url, bytes.NewBuffer(data)) if err != nil { return nil, err } req.Header.Set("Content-Type", "application/json") return ClientI.Do(req) } ================================================ FILE: manage/schedule/schedule.go ================================================ package schedule import ( "YiSpider/manage/discover" "YiSpider/manage/model" "YiSpider/manage/strategy" "fmt" ) func AddTask(task *model.Task) ([]byte, error) { node := strategy.GetNode() return Post(getUrl(node.IP, "/task/add"), task) } func RunTask(name string) ([]byte, error) { node := strategy.GetNode() return Get(getUrl(node.IP, "/task/run?name="+name)) } func StopTask(name string) ([]byte, error) { node := strategy.GetNode() return Get(getUrl(node.IP, "/task/stop?name="+name)) } func EndTask(name string) ([]byte, error) { node := strategy.GetNode() return Get(getUrl(node.IP, "/task/end?name="+name)) } func ListTask(name string) ([]byte, error) { node := discover.GetNodes()[name] return Get(getUrl(node.IP, "/task/list")) } func getUrl(ip string, path string) string { url := fmt.Sprintf("http://%s:7777%s", ip, path) return url } ================================================ FILE: manage/strategy/rand_strategy.go ================================================ package strategy import ( "YiSpider/manage/discover" "YiSpider/manage/model" ) func GetNode() *model.Node { nodes := discover.GetNodes() for _, node := range nodes { return node } return nil } 
================================================ FILE: manage/task/task.go ================================================ package task func init() { } ================================================ FILE: spider/boot.go ================================================ package spider import ( "YiSpider/spider/config" "YiSpider/spider/core" "YiSpider/spider/http" "YiSpider/spider/register/etcd" "YiSpider/spider/spider" ) type Boot struct { engine *core.Engine } func init() { var err error if err = config.InitConfig(); err != nil { panic(err) } } func New() *Boot { s := &Boot{} s.engine = core.New() return s } func (s *Boot) AddSpider(spider *spider.Spider) *core.Engine { return s.engine.AddSpider(spider) } func (s *Boot) Run() { s.engine.Run() if len(config.ConfigI.Etcd) > 0{ worker := etcd.NewWorker(config.ConfigI.Name, config.ConfigI.HttpAddr, config.ConfigI.Etcd) go worker.HeartBeat() } http.InitHttpServer() } ================================================ FILE: spider/common/encode.go ================================================ package common import ( "fmt" "strings" "github.com/saintfish/chardet" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/japanese" "golang.org/x/text/encoding/korean" "golang.org/x/text/encoding/simplifiedchinese" "golang.org/x/text/encoding/traditionalchinese" "golang.org/x/text/encoding/unicode" "golang.org/x/text/transform" ) var ( charsetDetector = chardet.NewTextDetector() charsetDetectors = map[string]encoding.Encoding{ "Big5": traditionalchinese.Big5, "EUC-JP": japanese.EUCJP, "EUC-KR": korean.EUCKR, "GB-18030": simplifiedchinese.GB18030, "ISO-2022-JP": japanese.ISO2022JP, "ISO-8859-5": charmap.ISO8859_5, "ISO-8859-6": charmap.ISO8859_6, "ISO-8859-7": charmap.ISO8859_7, "ISO-8859-8": charmap.ISO8859_8, "ISO-8859-8-I": charmap.ISO8859_8I, "KOI8-R": charmap.KOI8R, "Shift_JIS": japanese.ShiftJIS, "UTF-16BE": unicode.UTF16(unicode.BigEndian, unicode.UseBOM), "UTF-16LE": 
// ruleRegexp matches one "{...}" placeholder and captures the text between
// the braces. It is compiled once instead of on every FindRule call.
var ruleRegexp = regexp.MustCompile(`{([^}]+)}`)

// FindRule returns every "{...}" placeholder found in text. Each match is a
// two-element slice: the placeholder including its braces, followed by the
// text between the braces. It returns nil when text holds no placeholder.
func FindRule(text string) [][]string {
	return ruleRegexp.FindAllStringSubmatch(text, -1)
}
} if isMatch { return finalReqs, true } return finalReqs, isMatch } // http://xxxxxxxx.com/abc/{begin-end,offset}/ example:{1-400,10} func PraseOffset(req *model.Request) []*model.Request { reqs := []*model.Request{} outrReqs := []*model.Request{} rules := FindRule(req.Url) if len(rules) <= 0 { return []*model.Request{req} } var begin, end, offset int var rule string for _,rulee :=range rules{ rule = rulee[1] sp := strings.Split(rule, ",") if len(sp) != 2 { continue } rs := strings.Split(sp[0], "-") var err error begin, err = strconv.Atoi(rs[0]) end, err = strconv.Atoi(rs[1]) offset, err = strconv.Atoi(sp[1]) if err != nil { continue } if offset == 0 { continue } break } if begin == 0 && end == 0 && offset == 0{ return []*model.Request{req} } for i := begin; i <= end; i = i + offset { url := strings.Replace(req.Url, "{"+rule+"}", strconv.Itoa(i), 1) req := &model.Request{Url: url, Method: req.Method, ContentType: req.ContentType, Data: req.Data, Header: req.Header, Cookies: req.Cookies, ProcessName: req.ProcessName} reqs = append(reqs, req) } for _, r := range reqs { outrReqs = append(outrReqs, PraseOffset(r)...) } return outrReqs } // http://xxxxxxxx.com/abc/{id1|id2|id3}/ func PraseOr(req *model.Request) []*model.Request { reqs := []*model.Request{} outrReqs := []*model.Request{} rules := FindRule(req.Url) if len(rules) <= 0 { return []*model.Request{req} } ruleArray := rules[0] rule := ruleArray[1] sp := strings.Split(rule, "|") if len(sp) < 2 { return []*model.Request{req} } for _, word := range sp { url := strings.Replace(req.Url, "{"+rule+"}", word, 1) r := &model.Request{Url: url, Method: req.Method, ContentType: req.ContentType, Data: req.Data, Header: req.Header, Cookies: req.Cookies, ProcessName: req.ProcessName} reqs = append(reqs, r) } for _, r := range reqs { outrReqs = append(outrReqs, PraseOr(r)...) 
// PraseParamCtx substitutes "$name" / "{$name}" placeholders in req.Url with
// values taken from ctx, e.g. http://xxxxxxxx.com/abc/{$name}/{$id}/.
//
// If req.Url contains no "$" it returns the original request and false.
// Otherwise it walks ctx and, for values of type string, json.Number and int,
// replaces "{$key}" and then "$key" with the value. A copied request is
// appended to the result only at the moment a substitution brings the number
// of "$" characters in the URL down to zero. In that case the second result
// is always true, even when the returned slice is empty because some "$"
// could not be resolved.
//
// The rules parameter is not used by the body.
// NOTE(review): Go map iteration order is unspecified; if one key is a prefix
// of another ("$id" vs "$id2") the outcome may depend on that order — confirm.
func PraseParamCtx(req *model.Request, rules [][]string, ctx map[string]interface{}) ([]*model.Request, bool) {
	reqs := []*model.Request{}
	reqUrl := req.Url
	// count tracks how many "$" are still left in the working URL.
	count := strings.Count(reqUrl, "$")
	if count <= 0 {
		return []*model.Request{req}, false
	}

	for ctxName, ruleUrl := range ctx {
		urlArray, ok := ruleUrl.([]string)
		if ok {
			for _, url := range urlArray {
				// NOTE(review): unlike the branches below, this one builds the
				// placeholder from the value (url) rather than from ctxName, and
				// the second Replace starts again from reqUrl, discarding the
				// result of the first. Looks unintended — confirm the expected
				// behaviour for []string values before changing it.
				u := strings.Replace(reqUrl, "{$"+url+"}", string(url), -1)
				u = strings.Replace(reqUrl, "$"+url, string(url), -1)
				r := &model.Request{Url: u, Method: req.Method, ContentType: req.ContentType, Data: req.Data, Header: req.Header, Cookies: req.Cookies, ProcessName: req.ProcessName}

				if newCount := strings.Count(u, "$"); newCount != count {
					reqUrl = u
					count = newCount
					if count == 0 {
						reqs = append(reqs, r)
					}
				}
			}
		}

		urlStr, ok := ruleUrl.(string)
		if ok {
			// Braced form first, then the bare "$name" form (used inside other
			// placeholders such as {0-$max_page,1}).
			url := strings.Replace(reqUrl, "{$"+ctxName+"}", string(urlStr), -1)
			url = strings.Replace(url, "$"+ctxName, string(urlStr), -1)
			r := &model.Request{Url: url, Method: req.Method, ContentType: req.ContentType, Data: req.Data, Header: req.Header, Cookies: req.Cookies, ProcessName: req.ProcessName}
			// Keep the substituted URL only if it actually consumed a "$".
			if newCount := strings.Count(url, "$"); newCount != count {
				reqUrl = url
				count = newCount
				if count == 0 {
					reqs = append(reqs, r)
				}
			}
		}

		urlNumber, ok := ruleUrl.(json.Number)
		if ok {
			url := strings.Replace(reqUrl, "{$"+ctxName+"}", string(urlNumber), -1)
			url = strings.Replace(url, "$"+ctxName, string(urlNumber), -1)
			r := &model.Request{Url: url, Method: req.Method, ContentType: req.ContentType, Data: req.Data, Header: req.Header, Cookies: req.Cookies, ProcessName: req.ProcessName}
			if newCount := strings.Count(url, "$"); newCount != count {
				reqUrl = url
				count = newCount
				if count == 0 {
					reqs = append(reqs, r)
				}
			}
		}

		urlInt, ok := ruleUrl.(int)
		if ok {
			url := strings.Replace(reqUrl, "{$"+ctxName+"}", strconv.Itoa(urlInt), -1)
			url = strings.Replace(url, "$"+ctxName, strconv.Itoa(urlInt), -1)
			r := &model.Request{Url: url, Method: req.Method, ContentType: req.ContentType, Data: req.Data, Header: req.Header, Cookies: req.Cookies, ProcessName: req.ProcessName}
			if newCount := strings.Count(url, "$"); newCount != count {
				reqUrl = url
				count = newCount
				if count == 0 {
					reqs = append(reqs, r)
				}
			}
		}
	}
	return reqs, true
}
`电子书刊|电子书|网络原创|数字杂志|多媒体图书|音像|音乐|影视|教育音像|英文原版|少儿|商务投资|英语学习与考试|文学|传记|励志|文艺|小说|文学|青春文学|传记|艺术|少儿|少儿|0-2岁|3-6岁|7-10岁|11-14岁|人文社科|历史|哲学|国学|政治/军事|法律|人文社科|心理学|文化|社会科学|经管励志|经济|金融与投资|管理|励志与成功|生活|生活|健身与保健|家庭与育儿|旅游|烹饪美食|科技|工业技术|科普读物|建筑|医学|科学与自然|计算机与互联网|电子通信|教育|中小学教辅|教育与考试|外语学习|大中专教材|字典词典|港台图书|艺术/设计/收藏|经济管理|文化/学术|少儿|其他|工具书|杂志/期刊|套装书|手机通讯|手机|对讲机|运营商|合约机|选号中心|装宽带|办套餐|手机配件|移动电源|电池/移动电源|蓝牙耳机|充电器/数据线|苹果周边|手机耳机|手机贴膜|手机存储卡|充电器|数据线|手机保护套|车载配件|iPhone 配件|手机电池|创意配件|便携/无线音响|手机饰品|拍照配件|手机支架|大 家 电|平板电视|空调|冰箱|洗衣机|家庭影院|DVD/电视盒子|迷你音响|冷柜/冰吧|家电配件|功放|回音壁/Soundbar|Hi-Fi专区|电视盒子|酒柜|厨卫大电|燃气灶|油烟机|热水器|消毒柜|洗碗机|厨房小电|料理机|榨汁机|电饭煲|电压力锅|豆浆机|咖啡机|微波炉|电烤箱|电磁炉|面包机|煮蛋器|酸奶机|电炖锅|电水壶/热水瓶|电饼铛|多用途锅|电烧烤炉|果蔬解毒机|其它厨房电器|养生壶/煎药壶|电热饭盒|生活电器|取暖电器|净化器|加湿器|扫地机器人|吸尘器|挂烫机/熨斗|插座|电话机|清洁机|除湿机|干衣机|收录/音机|电风扇|冷风扇|其它生活电器|生活电器配件|净水器|饮水机|个护健康|剃须刀|剃/脱毛器|口腔护理|电吹风|美容器|理发器|卷/直发器|按摩椅|按摩器|足浴盆|血压计|电子秤/厨房秤|血糖仪|体温计|其它健康电器|计步器/脂肪检测仪|五金家装|电动工具|手动工具|仪器仪表|浴霸/排气扇|灯具|LED灯|洁身器|水槽|龙头|淋浴花洒|厨卫五金|家具五金|门铃|电气开关|插座|电工电料|监控安防|电线/线缆|摄影摄像|数码相机|单电/微单相机|单反相机|摄像机|拍立得|运动相机|镜头|户外器材|影棚器材|冲印服务|数码相框|数码配件|存储卡|读卡器|滤镜|闪光灯/手柄|相机包|三脚架/云台|相机清洁/贴膜|机身附件|镜头附件|电池/充电器|移动电源|数码支架|智能设备|智能手环|智能手表|智能眼镜|运动跟踪器|健康监测|智能配饰|智能家居|体感车|其他配件|智能机器人|无人机|影音娱乐|MP3/MP4|智能设备|耳机/耳麦|便携/无线音箱|音箱/音响|高清播放器|收音机|MP3/MP4配件|麦克风|专业音频|苹果配件|电子教育|学生平板|点读机/笔|早教益智|录音笔|电纸书|电子词典|复读机|虚拟商品|延保服务|杀毒软件|积分商品|家纺|桌布/罩件|地毯地垫|沙发垫套/椅垫|床品套件|被子|枕芯|床单被罩|毯子|床垫/床褥|蚊帐|抱枕靠垫|毛巾浴巾|电热毯|窗帘/窗纱|布艺软饰|凉席|灯具|台灯|节能灯|装饰灯|落地灯|应急灯/手电|LED灯|吸顶灯|五金电器|筒灯射灯|吊灯|氛围照明|生活日用|保暖防护|收纳用品|雨伞雨具|浴室用品|缝纫/针织用品|洗晒/熨烫|净化除味|家装软饰|相框/照片墙|装饰字画|节庆饰品|手工/十字绣|装饰摆件|帘艺隔断|墙贴/装饰贴|钟饰|花瓶花艺|香薰蜡烛|创意家居|宠物生活|宠物主粮|宠物零食|医疗保健|家居日用|宠物玩具|出行装备|洗护美容|电脑整机|笔记本|超极本|游戏本|平板电脑|平板电脑配件|台式机|服务器/工作站|笔记本配件|一体机|电脑配件|CPU|主板|显卡|硬盘|SSD固态硬盘|内存|机箱|电源|显示器|刻录机/光驱|散热器|声卡/扩展卡|装机配件|组装电脑|外设产品|移动硬盘|U盘|鼠标|键盘|鼠标垫|摄像头|手写板|硬盘盒|插座|线缆|UPS电源|电脑工具|游戏设备|电玩|电脑清洁|网络仪表仪器|游戏设备|游戏机|游戏耳机|手柄/方向盘|游戏软件|游戏周边|网络产品|路由器|网卡|交换机|网络存储|4G/3G上网|网络盒子|网络配件|办公设备|投影机|投影配件|多功能一体机|打印机|传真设备|验钞/点钞机|扫描设备|复合机|碎纸机|考勤机|收款/POS机|会议音频视频|保险柜|装订/封装机|安防监控|办公家具|白板|文具/耗材|硒鼓/墨粉|墨盒|色带|纸类|办公文具|学生文具|财会用品|文件管理|本册/便签|计算器|笔类|画具画材|刻录碟片/附件|服务产品|上门安装|延保服务|维修保养|电脑软件|京东服务|烹饪锅具|炒锅
|煎锅|压力锅|蒸锅|汤锅|奶锅|锅具套装|煲类|水壶|火锅|刀剪菜板|菜刀|剪刀|刀具套装|砧板|瓜果刀/刨|多功能刀|厨房配件|保鲜盒|烘焙/烧烤|饭盒/提锅|储物/置物架|厨房DIY/小工具|水具酒具|塑料杯|运动水壶|玻璃杯|陶瓷/马克杯|保温杯|保温壶|酒杯/酒具|杯具套装|餐具|餐具套装|碗/碟/盘|筷勺/刀叉|一次性用品|果盘/果篮|酒店用品|自助餐炉|酒店餐具|酒店水具|茶具/咖啡具|整套茶具|茶杯|茶壶|茶盘茶托|茶叶罐|茶具配件|茶宠摆件|咖啡具|其他|清洁用品|纸品湿巾|衣物清洁|清洁工具|驱虫用品|家庭清洁|皮具护理|一次性用品|面部护肤|洁面|乳液面霜|面膜|剃须|套装|精华|眼霜|卸妆|防晒|防晒隔离|T区护理|眼部护理|精华露|爽肤水|身体护理|沐浴|润肤|颈部|手足|纤体塑形|美胸|套装|精油|洗发护发|染发/造型|香薰精油|磨砂/浴盐|手工/香皂|洗发|护发|染发|磨砂膏|香皂|口腔护理|牙膏/牙粉|牙刷/牙线|漱口水|套装|女性护理|卫生巾|卫生护垫|私密护理|脱毛膏|其他|洗发护发|洗发|护发|染发|造型|假发|套装|美发工具|脸部护理|香水彩妆|香水|底妆|腮红|眼影|唇部|美甲|眼线|美妆工具|套装|防晒隔离|卸妆|眉笔|睫毛膏|女装|T恤|衬衫|针织衫|雪纺衫|卫衣|马甲|连衣裙|半身裙|牛仔裤|休闲裤|打底裤|正装裤|小西装|短外套|风衣|毛呢大衣|真皮皮衣|棉服|羽绒服|大码女装|中老年女装|婚纱|打底衫|旗袍/唐装|加绒裤|吊带/背心|羊绒衫|短裤|皮草|礼服|仿皮皮衣|羊毛衫|设计师/潮牌|男装|衬衫|T恤|POLO衫|针织衫|羊绒衫|卫衣|马甲/背心|夹克|风衣|毛呢大衣|仿皮皮衣|西服|棉服|羽绒服|牛仔裤|休闲裤|西裤|西服套装|大码男装|中老年男装|唐装/中山装|工装|真皮皮衣|加绒裤|卫裤/运动裤|短裤|设计师/潮牌|羊毛衫|内衣|文胸|女式内裤|男式内裤|睡衣/家居服|塑身美体|泳衣|吊带/背心|抹胸|连裤袜/丝袜|美腿袜|商务男袜|保暖内衣|情侣睡衣|文胸套装|少女文胸|休闲棉袜 |大码内衣|内衣配件|打底裤袜|打底衫|秋衣秋裤|情趣内衣|洗衣服务|服装洗护|服饰配件|太阳镜|光学镜架/镜片|围巾/手套/帽子套装|袖扣|棒球帽|毛线帽|遮阳帽|老花镜|装饰眼镜|防辐射眼镜|游泳镜|女士丝巾/围巾/披肩|男士丝巾/围巾|鸭舌帽|贝雷帽|礼帽|真皮手套|毛线手套|防晒手套|男士腰带/礼盒|女士腰带/礼盒|钥匙扣|遮阳伞/雨伞|口罩|耳罩/耳包|假领|毛线/布面料|领带/领结/领带夹|钟表|男表|瑞表|女表|国表|日韩表|欧美表|德表|儿童手表|智能手表|闹钟|座钟挂钟|钟表配件|流行男鞋|商务休闲鞋|正装鞋|休闲鞋|凉鞋/沙滩鞋|男靴|功能鞋|拖鞋/人字拖|雨鞋/雨靴|传统布鞋|鞋配件|帆布鞋|增高鞋|工装鞋|定制鞋|时尚女鞋|高跟鞋|单鞋|休闲鞋|凉鞋|女靴|雪地靴|拖鞋/人字拖|踝靴|筒靴|帆布鞋|雨鞋/雨靴|妈妈鞋|鞋配件|特色鞋|鱼嘴鞋|布鞋/绣花鞋|马丁靴|坡跟鞋|松糕鞋|内增高|防水台|奶粉|婴幼奶粉|孕妈奶粉|营养辅食|益生菌/初乳|米粉/菜粉|果泥/果汁|DHA|宝宝零食|钙铁锌/维生素|清火/开胃|面条/粥|尿裤湿巾|婴儿尿裤|拉拉裤|婴儿湿巾|成人尿裤|喂养用品|奶瓶奶嘴|吸奶器|暖奶消毒|儿童餐具|水壶/水杯|牙胶安抚|围兜/防溅衣|辅食料理机|食物存储|洗护用品|宝宝护肤|洗发沐浴|奶瓶清洗|驱蚊防晒|理发器|洗澡用具|婴儿口腔清洁|洗衣液/皂|日常护理|座便器|童车童床|婴儿推车|餐椅摇椅|婴儿床|学步车|三轮车|自行车|电动车|扭扭车|滑板车|婴儿床垫|寝居服饰|婴儿外出服|婴儿内衣|婴儿礼盒|婴儿鞋帽袜|安全防护|家居床品|睡袋/抱被|爬行垫|妈妈专区|妈咪包/背婴带|产后塑身|文胸/内裤|防辐射服|孕妈装|孕期营养|孕妇护肤|待产护理|月子装|防溢乳垫|童装童鞋|套装|上衣|裤子|裙子|内衣/家居服|羽绒服/棉服|亲子装|儿童配饰|礼服/演出服|运动鞋|皮鞋/帆布鞋|靴子|凉鞋|功能鞋|户外/运动服|安全座椅|提篮式|安全座椅|增高垫|潮流女包|钱包|手拿包|单肩包|双肩包|手提包|斜挎包|钥匙包|卡包/零钱包|精品男包|男士钱包|男士手包|卡包名片夹|商务公文包|双肩包|单肩/斜挎包|钥匙包|功能箱包|电脑包|拉杆箱|旅行包|旅行配件|休闲运动包|拉杆包|登山包|妈咪包|书包|相机包|腰包/胸包|礼品|火机烟具|礼品文具|军刀军具|收藏品|工艺礼品|创意礼品|礼盒礼券|鲜花绿植|婚庆节庆|京东卡|美妆礼品|礼品定制|京东福卡|古董文玩|奢侈品|箱包|钱包|服饰|腰带|太阳镜/眼镜框|配件|鞋靴|饰品|名品腕表|高档化妆品|婚庆|婚嫁
首饰|婚纱摄影|婚纱礼服|婚庆服务|婚庆礼品/用品|婚宴|进口食品|饼干蛋糕|糖果/巧克力|休闲零食|冲调饮品|粮油调味|牛奶|地方特产|其他特产|新疆|北京|山西|内蒙古|福建|湖南|四川|云南|东北|休闲食品|休闲零食|坚果炒货|肉干肉脯|蜜饯果干|糖果/巧克力|饼干蛋糕|无糖食品|粮油调味|米面杂粮|食用油|调味品|南北干货|方便食品|有机食品|饮料冲调|饮用水|饮料|牛奶乳品|咖啡/奶茶|冲饮谷物|蜂蜜/柚子茶|成人奶粉|食品礼券|月饼|大闸蟹|粽子|卡券|茗茶|铁观音|普洱|龙井|绿茶|红茶|乌龙茶|花草茶|花果茶|养生茶|黑茶|白茶|其它茶|时尚饰品|项链|手链/脚链|戒指|耳饰|毛衣链|发饰/发卡|胸针|饰品配件|婚庆饰品|黄金|黄金吊坠|黄金项链|黄金转运珠|黄金手镯/手链/脚链|黄金耳饰|黄金戒指|K金饰品|K金吊坠|K金项链|K金手镯/手链/脚链|K金戒指|K金耳饰|金银投资|投资金|投资银|投资收藏|银饰|银吊坠/项链|银手镯/手链/脚链|银戒指|银耳饰|足银手镯|宝宝银饰|钻石|裸钻|钻戒|钻石项链/吊坠|钻石耳饰|钻石手镯/手链|翡翠玉石|项链/吊坠|手镯/手串|戒指|耳饰|挂件/摆件/把件|玉石孤品|水晶玛瑙|项链/吊坠|耳饰|手镯/手链/脚链|戒指|头饰/胸针|摆件/挂件|彩宝|琥珀/蜜蜡|碧玺|红宝石/蓝宝石|坦桑石|珊瑚|祖母绿|葡萄石|其他天然宝石|项链/吊坠|耳饰|手镯/手链|戒指|铂金|铂金项链/吊坠|铂金手镯/手链/脚链|铂金戒指|铂金耳饰|木手串/把件|小叶紫檀|黄花梨|沉香木|金丝楠|菩提|其他|橄榄核/核桃|檀香|珍珠|珍珠项链|珍珠吊坠|珍珠耳饰|珍珠手链|珍珠戒指|珍珠胸针|维修保养|机油|正时皮带|添加剂|汽车喇叭|防冻液|汽车玻璃|滤清器|火花塞|减震器|柴机油/辅助油|雨刷|车灯|后视镜|轮胎|轮毂|刹车片/盘|维修配件|蓄电池|底盘装甲/护板|贴膜|汽修工具|改装配件|车载电器|导航仪|安全预警仪|行车记录仪|倒车雷达|蓝牙设备|车载影音|净化器|电源|智能驾驶|车载电台|车载电器配件|吸尘器|智能车机|冰箱|汽车音响|车载生活电器|美容清洗|车蜡|补漆笔|玻璃水|清洁剂|洗车工具|镀晶镀膜|打蜡机|洗车配件|洗车机|洗车水枪|毛巾掸子|汽车装饰|脚垫|座垫|座套|后备箱垫|头枕腰靠|方向盘套|香水|空气净化|挂件摆件|功能小件|车身装饰件|车衣|安全自驾|安全座椅|胎压监测|防盗设备|应急救援|保温箱|地锁|摩托车|充气泵|储物箱|自驾野营|摩托车装备|汽车服务|清洗美容|功能升级|保养维修|油卡充值|车险|加油卡|ETC|驾驶培训|赛事改装|赛事服装|赛事用品|制动系统|悬挂系统|进气系统|排气系统|电子管理|车身强化|赛事座椅|运动鞋包|跑步鞋|休闲鞋|篮球鞋|板鞋|帆布鞋|足球鞋|乒羽网鞋|专项运动鞋|训练鞋|拖鞋|运动包|运动服饰|羽绒服|棉服|运动裤|夹克/风衣|卫衣/套头衫|T恤|套装|乒羽网服|健身服|运动背心|毛衫/线衫|运动配饰|骑行运动|折叠车|山地车/公路车|电动车|其他整车|骑行服|骑行装备|平衡车|垂钓用品|鱼竿鱼线|浮漂鱼饵|钓鱼桌椅|钓鱼配件|钓箱鱼包|其它|游泳用品|泳镜|泳帽|游泳包防水包|女士泳衣|男士泳衣|比基尼|其它|户外鞋服|冲锋衣裤|速干衣裤|滑雪服|羽绒服/棉服|休闲衣裤|抓绒衣裤|软壳衣裤|T恤|户外风衣|功能内衣|军迷服饰|登山鞋|雪地靴|徒步鞋|越野跑鞋|休闲鞋|工装鞋|溯溪鞋|沙滩/凉拖|户外袜|户外装备|帐篷/垫子|睡袋/吊床|登山攀岩|户外配饰|背包|户外照明|户外仪表|户外工具|望远镜|旅游用品|便携桌椅床|野餐烧烤|军迷用品|救援装备|滑雪装备|极限户外|冲浪潜水|健身训练|综合训练器|其他大型器械|哑铃|仰卧板/收腹机|其他中小型器材|瑜伽舞蹈|甩脂机|踏步机|武术搏击|健身车/动感单车|跑步机|运动护具|体育用品|羽毛球|乒乓球|篮球|足球|网球|排球|高尔夫|台球|棋牌麻将|轮滑滑板|其他|适用年龄|0-6个月|6-12个月|1-3岁|3-6岁|6-14岁|14岁以上|遥控/电动|遥控车|遥控飞机|遥控船|机器人|轨道/助力|毛绒布艺|毛绒/布艺|靠垫/抱枕|娃娃玩具|芭比娃娃|卡通娃娃|智能娃娃|模型玩具|仿真模型|拼插模型|收藏爱好|健身玩具|炫舞毯|爬行垫/毯|户外玩具|戏水玩具|动漫玩具|电影周边|卡通周边|网游周边|益智玩具|摇铃/床铃|健身架|早教启智|拖拉玩具|积木拼插|积木|拼图|磁力棒|立体拼插|DIY玩具|手工彩泥|绘画工具|情景玩具|创意减压|减压玩具|创意玩具|乐器|钢琴|电子琴/电钢琴|吉他/尤克里里|打击乐器|西洋管弦|民族管弦乐器|乐器配件|电脑音乐|工艺礼品乐器|口琴
/口风琴/竖笛|手风琴||机票|国内机票|酒店|国内酒店|酒店团购|旅行|度假|景点|租车|火车票|旅游团购|充值|手机充值|游戏|游戏点卡|QQ充值|票务|电影票|演唱会|话剧歌剧|音乐会|体育赛事|舞蹈芭蕾|戏曲综艺|产地直供|水果|苹果|橙子|奇异果/猕猴桃|车厘子/樱桃|芒果|蓝莓|火龙果|葡萄/提子|柚子|香蕉|牛油果|梨|菠萝/凤梨|桔/橘|柠檬|草莓|桃/李/杏|更多水果|水果礼盒/券|猪牛羊肉|牛肉|羊肉|猪肉|内脏类|海鲜水产|鱼类|虾类|蟹类|贝类|海参|海产干货|其他水产|海产礼盒|禽肉蛋品|鸡肉|鸭肉|蛋类|其他禽类|冷冻食品|水饺/馄饨|汤圆/元宵|面点|火锅丸串|速冻半成品|奶酪黄油|熟食腊味|熟食|腊肠/腊肉|火腿|糕点|礼品卡券|饮品甜品|冷藏果蔬汁|冰激凌|其他` reqs := []*model.Request{ { Method: "get", Url: fmt.Sprintf("https://search.jd.com/Search?keyword={%s}&enc=utf-8&page={1-5,1}", goodsType), ProcessName: "jingdong-list", }, } results := PraseReq(reqs, nil) fmt.Println(len(results)) for _, result := range results { fmt.Println(result) } //daily_timeline|kb_video_news|kb_news_bagua|kb_news_qipa|kb_photo_news|kb_news_tech|kb_news_finance|location|kb_news_world|kb_news_movie|kb_news_gaojidi|kb_news_wealth|kb_photo_gif|kb_news_sports|kb_news_mil|kb_news_history|kb_news_nba|kb_news_car|kb_news_chaobao|kb_news_laugh|kb_news_pet|kb_news_science|kb_news_baby|kb_news_astro|kb_news_sex|kb_news_beauty|kb_news_house|kb_news_share|kb_news_rock|kb_news_tfboys|kb_news_augury|kb_news_photography|kb_news_lottery|kb_news_cate|kb_news_julebu|kb_news_travel|kb_news_idea|kb_news_lol|kb_news_erciyuan|kb_news_space|kb_news_game|kb_news_iphone|kb_news_esport|kb_news_health|kb_news_outfit|kb_news_furnishing|kb_news_workout|kb_news_soup|kb_news_run|kb_news_fishing|kb_news_buddism|kb_news_diet|kb_news_football|kb_news_tennis|kb_news_tea|kb_news_yoga|kb_news_plaything|kb_news_watch } func TestOrAndOffset_2(t *testing.T) { types := 
"daily_timeline|kb_video_news|kb_news_bagua|kb_news_qipa|kb_photo_news|kb_news_tech|kb_news_finance|location|kb_news_world|kb_news_movie|kb_news_gaojidi|kb_news_wealth|kb_photo_gif|kb_news_sports|kb_news_mil|kb_news_history|kb_news_nba|kb_news_car|kb_news_chaobao|kb_news_laugh|kb_news_pet|kb_news_science|kb_news_baby|kb_news_astro|kb_news_sex|kb_news_beauty|kb_news_house|kb_news_share|kb_news_rock|kb_news_tfboys|kb_news_augury|kb_news_photography|kb_news_lottery|kb_news_cate|kb_news_julebu|kb_news_travel|kb_news_idea|kb_news_lol|kb_news_erciyuan|kb_news_space|kb_news_game|kb_news_iphone|kb_news_esport|kb_news_health|kb_news_outfit|kb_news_furnishing|kb_news_workout|kb_news_soup|kb_news_run|kb_news_fishing|kb_news_buddism|kb_news_diet|kb_news_football|kb_news_tennis|kb_news_tea|kb_news_yoga|kb_news_plaything|kb_news_watch" reqs:= []*model.Request{ { Method: "get", Url: fmt.Sprintf(`http://r.cnews.qq.com/getSubNewsChlidInterest?patchver=4511&mid=fd248c13ee1ce793495484e4cf3250f8ebbb475a&devid=860046037899335&store=60009&screen_height=1920&apptype=android&origin_imei=860046037899335&hw=OnePlus_ONEPLUSA3000&appver=25_areading_4.5.11&appversion=4.5.11&uid=bfa0a264a6547298&screen_width=1080&sceneid=&android_id=bfa0a264a6547298&last_id=20171207A03G7J00&ssid=GeeyueTech_5G&forward=0&IronThroneBuildTime=1512716487405&omgid=e0f7a4180378ba4e5ee80b0820ef5a1744ca0010211815&IronThroneRelBuildTime=415047497&refreshType=normal&qqnetwork=wifi&last_time=&bottom_id=20171207A0BFU500&top_time=1512631500¤tTab=kuaibao&top_id=20171207C0HX4500&is_wap=0&omgbizid=b03081d3f5806f45b65904d08cfad6bc77130080211815&page={1-100,1}&imsi=460019017167485&lastRefreshTime=&IronThroneRelExecTime=415047499&muid=49887860909485482&activefrom=icon&cachedCount=20&direction=0&sessionid=&chRefreshTimes=0&chlid={%s}&bottom_time=1512603257&IronThroneExecTime=1512716487407&qn-sig=284d6905ece4010e0ebd89dce072b5ee&qn-rid=6e63ca4d-1285-47ee-b95d-0bb49da3ce03`,types), ProcessName: "ttkblist", }, } results := 
PraseReq(reqs, nil) fmt.Println(len(results)) //for _, result := range results { // fmt.Println(result) //} } ================================================ FILE: spider/conf.json ================================================ { "name":"qiubai_spider", "version":"0.01", "work_num": 1, "max_wait_num":4096, "http_addr":"127.0.0.1:7775", "etcd":["http://127.0.0.1:2379"], "schedule":"redis", "redis_addr":"127.0.0.1:6379" } ================================================ FILE: spider/config/config.go ================================================ package config import ( "YiSpider/spider/logger" "encoding/json" "io/ioutil" "os" ) var ConfigI *Config type Config struct { Name string `json:"name"` Version string `json:"version"` WorkNum int `json:"work_num"` MaxWaitNum int `json:"max_wait_num"` HttpAddr string `json:"http_addr"` RedisAddr string `json:"redis_addr"` ScheduleMode string `json:"schedule"` Etcd []string `json:"etcd"` Mysql string `json:"mysql"` } func InitConfig() error { var file *os.File var bytes []byte var err error if file, err = os.OpenFile("conf.json", os.O_RDONLY, 0666); err != nil { return err } if bytes, err = ioutil.ReadAll(file); err != nil { return err } ConfigI = &Config{} if err = json.Unmarshal(bytes, ConfigI); err != nil { return err } logger.Info("init success ", *ConfigI) return nil } ================================================ FILE: spider/core/engine.go ================================================ package core import ( "YiSpider/spider/spider" "fmt" "github.com/kataras/go-errors" "sync" ) var engineI *Engine var once sync.Once func New() *Engine { once.Do(func() { engineI = &Engine{spiders: make(map[string]*SpiderRuntime)} }) return engineI } func GetEnine() *Engine { return engineI } type Engine struct { spiders map[string]*SpiderRuntime } func (m *Engine) AddSpider(spider *spider.Spider) *Engine { spiderRuntime := NewSpiderRuntime() spiderRuntime.SetSpider(spider) m.spiders[spider.Name] = spiderRuntime return m } func (m 
// TaskMeta holds the counters a SpiderRuntime keeps for its spider. It is the
// value returned per spider by Engine.GetTaskMetas.
//
// DownloadCount used to carry the same JSON tag as DownloadFailCount
// ("download_fail_count"). encoding/json silently drops all fields that share
// a name at the same level, so neither counter appeared in the encoded
// output. The tag is now "download_count", the key used by the manage side's
// SpiderData.
type TaskMeta struct {
	// DownloadFailCount is incremented by the worker when a download returns an error.
	DownloadFailCount int32 `json:"download_fail_count"`
	// DownloadCount is incremented by the worker before every download attempt.
	DownloadCount int32 `json:"download_count"`
	// UrlNum accumulates the number of URLs returned by processed pages.
	UrlNum int32 `json:"url_num"`
	// WaitUrlNum is the last value read from the schedule's Count.
	WaitUrlNum int `json:"wait_url_num"`
	// CrawlerResultNum accumulates the ResultCount of processed pages.
	CrawlerResultNum int32 `json:"crawler_result_num"`
}
meta := &TaskMeta{} meta.WaitUrlNum = 0 meta.UrlNum = int32(0) meta.DownloadCount = int32(0) meta.DownloadFailCount = int32(0) meta.CrawlerResultNum = int32(0) s.TaskMeta = meta if len(config.ConfigI.Mysql) > 0{ mysql.InitMysql(config.ConfigI.Mysql) } return s } func (s *SpiderRuntime) SetSpider(spider *spider.Spider) { s.spider = spider } func (s *SpiderRuntime) GetSpider() *spider.Spider { return s.spider } func (s *SpiderRuntime) Run() { if s.stopSign { s.recoverChan <- 1 return } s.schedule.PushMuti(s.spider.GetRequests()) for i := 0; i < s.workNum; i++ { go s.worker() } } func (s *SpiderRuntime) Stop() { s.stopSign = true } func (s *SpiderRuntime) worker() { context := model.Context{} for { if s.stopSign { _, ok := <-s.recoverChan s.stopSign = false if !ok { goto exit } } req, ok := s.schedule.Pop() if !ok { goto exit } if req == nil { logger.Info("schedule is emply") continue } atomic.AddInt32(&s.TaskMeta.DownloadCount, 1) response, err := s.download(req) if err != nil { logger.Error(err.Error()) atomic.AddInt32(&s.TaskMeta.DownloadFailCount, 1) continue } body, err := ioutil.ReadAll(response.Body) if err != nil { logger.Error(err.Error()) continue } context.Clear() context.Body, err = common.ToUtf8(body) if err != nil { context.Body = body } context.Request = response.Request context.Header = response.Header ps, ok := s.spider.Process[req.ProcessName] if !ok { response.Body.Close() logger.Info("process is not find ! 
please call SetProcess|SetTask") break } for _, p := range ps { page, err := processWrapper(p, context) if err != nil { logger.Info("Process fail|", err.Error()) continue } if page == nil { logger.Info("Process page is nil") continue } s.TaskMeta.WaitUrlNum = s.schedule.Count() if page.Urls != nil && len(page.Urls) > 0 { atomic.AddInt32(&s.TaskMeta.UrlNum, int32(len(page.Urls))) go func() { s.schedule.PushMuti(page.Urls) }() } if page.ResultCount > 0 { atomic.AddInt32(&s.TaskMeta.CrawlerResultNum, int32(page.ResultCount)) s.spider.Pipline.ProcessData(page.Result, s.spider.Name, req.ProcessName) } } response.Body.Close() } exit: logger.Info(s.spider.Name, "worker close") } func processWrapper(p process.Process, context model.Context) (*model.Page, error) { defer func() { if err := recover(); err != nil { logger.Error(err) } }() page, err := p.Process(context) return page, err } func (s *SpiderRuntime) download(req *model.Request) (*http.Response, error) { //time.Sleep(1*time.Second) switch req.Method { case "get": return downloader.Get(req.ProcessName, req.Url) case "post": return downloader.PostJson(req.ProcessName, req.Url, req.Data) } return nil, nil } func (s *SpiderRuntime) Exit() { s.schedule.Close() close(s.recoverChan) } ================================================ FILE: spider/downloader/request.go ================================================ package downloader import ( "YiSpider/spider/logger" "bytes" "encoding/json" "errors" "fmt" "golang.org/x/net/publicsuffix" "net/http" "net/http/cookiejar" "sync" "time" ) var Clients map[string]*http.Client var lock sync.RWMutex func init() { Clients = make(map[string]*http.Client) } func makeCookiejar() http.CookieJar { cookiejarOptions := cookiejar.Options{ PublicSuffixList: publicsuffix.List, } jar, _ := cookiejar.New(&cookiejarOptions) return jar } func makeClient(transport http.RoundTripper, jar http.CookieJar) *http.Client { return &http.Client{Jar: jar, Transport: transport, Timeout: 60 * time.Second} } 
func Get(taskId string, url string) (*http.Response, error) { res, err := doRequest(taskId, "GET", url, nil) if err != nil { logger.Info("Download fail doRequest,url:", url, "err:", err) return nil, err } logger.Info("GET", url, " =>", res.StatusCode) if res.StatusCode >= 400 { return nil, errors.New(fmt.Sprintf("download fail,url %s, StatusCode %d", url, res.StatusCode)) } return res, nil } func PostJson(taskId string, url string, data interface{}) (*http.Response, error) { dataJ, err := json.Marshal(data) if err != nil { return nil, err } res, err := doRequest(taskId, "POST", url, dataJ) if err != nil { return nil, err } logger.Info("POST", url, "=>", res.StatusCode) if res.StatusCode >= 400 { return nil, errors.New(fmt.Sprintf("download fail, StatusCode %d", res.StatusCode)) } return res, nil } func doRequest(id string, method string, url string, data []byte) (resp *http.Response, err error) { req, err := http.NewRequest(method, url, bytes.NewBuffer(data)) if err != nil { return nil, err } //req.Header.Set("Content-Type", "application/json") client := getClient(id) if client == nil { client = makeClient(nil, makeCookiejar()) setClient(id, client) } return client.Do(req) } func setClient(id string, client *http.Client) { lock.Lock() defer lock.Unlock() Clients[id] = client } func getClient(id string) *http.Client { lock.RLock() defer lock.RUnlock() client := Clients[id] return client } ================================================ FILE: spider/downloader/request_test.go ================================================ package downloader import "testing" func TestGet(t *testing.T) { if _, err := Get("baidu", "http://www.hao123.com"); err != nil { t.Fatal(err) } } func TestPostJson(t *testing.T) { } ================================================ FILE: spider/http/server.go ================================================ package http import ( "YiSpider/spider/config" "YiSpider/spider/core" "YiSpider/spider/model" spider2 "YiSpider/spider/spider" 
"encoding/json" "io/ioutil" "log" "net/http" "net/url" ) var errorMethod = []byte("{\"code\":\"400\",\"msg\":\"not support method\"}") var errorQuery = []byte("{\"code\":\"400\",\"msg\":\"error url parmas\"}") var errorJson = []byte("{\"code\":\"400\",\"msg\":\"error prase json \"}") var errorReadBody = []byte("{\"code\":\"400\",\"msg\":\"error read body\"}") var commonSuccess = []byte("{\"code\":\"200\",\"msg\":\"success\"}") func AddTask(w http.ResponseWriter, req *http.Request) { if req.Method != "POST" { w.Write(errorMethod) return } body, err := ioutil.ReadAll(req.Body) if err != nil { w.Write(errorReadBody) return } spider := &model.Task{} err = json.Unmarshal(body, spider) if err != nil { w.Write(errorJson) return } err = core.GetEnine().AddSpider(spider2.InitWithTask(spider)).RunTask(spider.Name) if err != nil { w.Write([]byte(err.Error())) return } w.Write(commonSuccess) return } func StopTask(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") if err := core.GetEnine().StopTask(name); err != nil { w.Write([]byte(err.Error())) return } w.Write(commonSuccess) return } func RunTask(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") if err := core.GetEnine().RunTask(name); err != nil { w.Write([]byte(err.Error())) return } w.Write(commonSuccess) return } func EndTask(w http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } queryMap, err := url.ParseQuery(req.URL.RawQuery) if err != nil { w.Write(errorQuery) return } name := queryMap.Get("name") if err := core.GetEnine().EndTask(name); err != nil { w.Write([]byte(err.Error())) return } w.Write(commonSuccess) return } func ListTask(w 
http.ResponseWriter, req *http.Request) { if req.Method != "GET" { w.Write(errorMethod) return } tasks := core.GetEnine().ListTask() datas, err := json.Marshal(tasks) if err != nil { w.Write([]byte(err.Error())) return } w.Write(datas) return } func InitHttpServer() { http.HandleFunc("/task/addAndRun", AddTask) http.HandleFunc("/task/run", RunTask) http.HandleFunc("/task/stop", StopTask) http.HandleFunc("/task/end", EndTask) http.HandleFunc("/tasks", ListTask) err := http.ListenAndServe(config.ConfigI.HttpAddr, nil) if err != nil { log.Fatal("ListenAndServe fail:", err) } } ================================================ FILE: spider/logger/logger.go ================================================ package logger import "fmt" func Info(v ...interface{}) { fmt.Println(v) } func Debug(v ...interface{}) { fmt.Println(v) } func Warn(v ...interface{}) { fmt.Println(v) } func Error(v ...interface{}) { fmt.Println(v) } ================================================ FILE: spider/model/context.go ================================================ package model import "net/http" type Context struct { Body []byte Request *http.Request Header http.Header } func (c *Context) Clear() { c.Body = nil c.Request = nil c.Header = nil } ================================================ FILE: spider/model/page.go ================================================ package model type Page struct { Result []map[string]interface{} ResultCount int Urls []*Request } func (p *Page) AddUrl(req *Request) { if p.Urls == nil { p.Urls = []*Request{} } p.Urls = append(p.Urls, req) } func (p *Page) AddUrls(req []*Request) { if p.Urls == nil { p.Urls = []*Request{} } p.Urls = append(p.Urls, req...) 
} func (p *Page) AddResult(value map[string]interface{}) { if p.Result == nil { p.Result = []map[string]interface{}{} } p.Result = append(p.Result, value) p.ResultCount++ } ================================================ FILE: spider/model/task.go ================================================ package model import ( "encoding/json" ) type Task struct { Id string `json:"id"` Name string `jsonTask:"name"` Request []*Request `json:"request"` Process []Process `json:"process"` Pipline string `json:"pipline"` Depth int `json:"depth"` EndCount int `json:"end_count"` } type Request struct { Url string `json:"url"` Method string `json:"method"` ContentType string `json:"type"` // json urlencode form Data map[string]string `json:"data"` Header map[string]string `json:"header"` Cookies Cookies `json:"cookies"` ProcessName string `json:"process_name"` } func (r *Request) Write() ([]byte, error) { return json.Marshal(r) } func (r *Request) Read(b []byte) error { return json.Unmarshal(b, r) } type Cookies struct { Url string `json:"url"` Data string `json:"data"` } type Process struct { Name string `json:"name"` RegUrl []string `json:"reg_url"` Type string `json:"type"` // template json self_process TemplateRule TemplateRule `json:"template_rule"` JsonRule JsonRule `json:"json_rule"` AddQueue []*Request `json:"add_queue"` // http://www.baidu.com/{name}/{ctx} } type TemplateRule struct { Rule map[string]string } type JsonRule struct { Rule map[string]string } ================================================ FILE: spider/pipline/console/console.go ================================================ package console import ( "encoding/json" "fmt" ) type ConsolePipline struct { } func NewConsolePipline() *ConsolePipline { return &ConsolePipline{} } func (c *ConsolePipline) ProcessData(v []map[string]interface{}, taskName string, processName string) { bytes, _ := json.Marshal(v) fmt.Println("Pipline :", string(bytes)) } ================================================ FILE: 
spider/pipline/file/file.go ================================================ package file import ( "YiSpider/spider/logger" "encoding/json" "fmt" "os" "time" ) type FilePipline struct { root string files map[string]*os.File } func NewFilePipline(root string) *FilePipline { return &FilePipline{root: root, files: make(map[string]*os.File)} } func (c *FilePipline) ProcessData(v []map[string]interface{}, taskName string, processName string) { file, ok := c.files[processName] if !ok { var f *os.File var err error path := fmt.Sprintf("%s%s-%s.txt", c.root, taskName, processName) if f, err = os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0666); err != nil { logger.Error("FilePipline Open File fail, path =", path, err) return } f.WriteString(fmt.Sprintf("========= Task : %s =============\n", taskName)) f.WriteString(fmt.Sprintf("======= Task Begin : %s =============\n", time.Now())) c.files[processName] = f file = f } for _, value := range v { data, err := json.Marshal(value) if err != nil { logger.Error("FilePipline json.Marshal fail, v = ", v) return } file.WriteString(string(data) + "\n") } logger.Info("File Pipline write. 
Count:", len(v)) return } func (c *FilePipline) Close() { for _, f := range c.files { f.Close() } } ================================================ FILE: spider/pipline/mysql/dbModel.go ================================================ package mysql import ( "fmt" "strings" "reflect" "encoding/json" ) type Field struct { Name string Pk bool Value interface{} } func (f *Field) Sql() string{ var sql string switch f.Value.(type) { case string: sql = fmt.Sprintf("\n `%s` varchar(255) NULL DEFAULT '' ",f.Name) case int: sql = fmt.Sprintf("\n `%s` integer NULL DEFAULT 0 ",f.Name) case int32: sql = fmt.Sprintf("\n `%s` integer NULL DEFAULT 0 ",f.Name) case int64: sql = fmt.Sprintf("\n `%s` integer NULL DEFAULT 0 ",f.Name) case float64: sql = fmt.Sprintf("\n `%s` float NULL DEFAULT 0.0 ",f.Name) case float32: sql = fmt.Sprintf("\n `%s` float NULL DEFAULT 0.0 ",f.Name) default: sql = fmt.Sprintf("\n `%s` varchar(255) NULL DEFAULT '' ",f.Name) } if f.Pk{ sql = fmt.Sprintf("\n `%s` integer AUTO_INCREMENT PRIMARY KEY",f.Name) } sql += "," return sql } type DBModel struct { Name string Fields []Field } func (d *DBModel) TableSql() string{ sql := fmt.Sprintf("CREATE TABLE IF NOT EXISTS `%s` (",d.Name) for _,field := range d.Fields{ sql += field.Sql() } sql = sql[:len(sql)-1] sql += "\n ) ENGINE=InnoDB DEFAULT CHARSET=utf8;" return sql } func (d *DBModel) InsertSql() string{ sql := fmt.Sprintf("INSERT `%s` SET ",d.Name) for i:= 1;i< len(d.Fields);i++{ sql += fmt.Sprintf("`%s`=?,",d.Fields[i].Name) } sql = sql[:len(sql)-1] return sql } func (d *DBModel) InsertArgs() []interface{}{ args := []interface{}{} for i:= 1;i< len(d.Fields);i++{ rv := reflect.ValueOf(d.Fields[i].Value) switch rv.Kind(){ case reflect.Array: case reflect.Slice: bytes,_ := json.Marshal(d.Fields[i].Value) args = append(args,string(bytes)) default: args = append(args,rv.String()) } } return args } func NewDBModel(name string,m map[string]interface{}) *DBModel{ dbModel := &DBModel{Name:name,Fields:[]Field{}} 
dbModel.Fields = append(dbModel.Fields,Field{Name:strings.ToLower("FffId"),Pk:true,Value:1}) for k,v := range m{ dbModel.Fields = append(dbModel.Fields,Field{Name:strings.ToLower(k),Pk:false,Value:v}) } return dbModel } ================================================ FILE: spider/pipline/mysql/mysql.go ================================================ package mysql import "database/sql" import ( _ "github.com/go-sql-driver/mysql" "github.com/astaxie/beego" "github.com/astaxie/beego/orm" ) var DB *sql.DB func InitMysql(mysql string) { orm.RegisterDriver("mysql", orm.DRMySQL) orm.RegisterDataBase("default", "mysql", mysql) orm.RegisterModel(&C{}) orm.RunSyncdb("default", false, true) } type C struct { Id int } func CreateTable(m *DBModel) error{ o := orm.NewOrm() _,err := o.Raw(m.TableSql()).Exec() if err != nil{ return err } beego.Info("创建表 ",m.Name," 成功 【完成】") return nil } func Add(m *DBModel) error{ o := orm.NewOrm() _,err := o.Raw(m.InsertSql(),m.InsertArgs()...).Exec() if err != nil{ return err } beego.Info("插入数据成功") return nil } ================================================ FILE: spider/pipline/mysql/mysqlPipline.go ================================================ package mysql import ( "sync" ) type MysqlPipline struct { sync.Once } func NewMysqlPipline() *MysqlPipline { return &MysqlPipline{} } func (c *MysqlPipline) ProcessData(v []map[string]interface{}, taskName string, processName string) { for _,m :=range v{ dbModel := NewDBModel(processName,m) CreateTable(dbModel) Add(dbModel) } } ================================================ FILE: spider/pipline/nsq/nsq.go ================================================ package nsq ================================================ FILE: spider/pipline/pipline.go ================================================ package pipline type Pipline interface { ProcessData(v []map[string]interface{}, taskName string, processName string) } ================================================ FILE: 
spider/process/filter/repoat_filter.go ================================================ package filter import ( "YiSpider/spider/model" "sync" ) var CuckooFilter map[string]int var lock sync.RWMutex func init() { CuckooFilter = make(map[string]int) } func RepeatFilter(url string, process *model.Process) bool { sign := url if ok := get(sign); ok { return false } put(sign) return true } func get(name string) bool { lock.RLock() defer lock.RUnlock() _, ok := CuckooFilter[name] return ok } func put(name string) { lock.Lock() defer lock.Unlock() CuckooFilter[name] = 1 } ================================================ FILE: spider/process/filter/repoat_filter_test.go ================================================ package filter import ( "YiSpider/spider/model" "fmt" "testing" ) func TestRepeatFilter(t *testing.T) { task := &model.Task{ Id: "qiiubai", Name: "qiubai", Method: "get", Host: "https://www.qiushibaike.com", Url: "https://www.qiushibaike.com", Process: model.Process{ Url: "https://www.qiushibaike.com", RegUrl: []string{ "/.*?/page/[0-9]+", }, Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.article", "url": "attr.href|.contentHerf", "author": "attr.alt|.author a img", "content": "text|.content span", "like_num": "text|.stats-vote i", "comment_num": "text|.stats-comments a i", }, }, }, Pipline: "file", } fmt.Println(RepeatFilter("/8hr/", task)) fmt.Println(RepeatFilter("/8hr/page/3/", task)) fmt.Println(RepeatFilter("/8hr/page/4/", task)) fmt.Println(RepeatFilter("/8hr/page/5/", task)) fmt.Println(RepeatFilter("/8hr/page/13/", task)) fmt.Println(RepeatFilter("/8hr/page/3/", task)) } ================================================ FILE: spider/process/filter/url_filter.go ================================================ package filter import ( "YiSpider/spider/model" "regexp" ) func Filter(url string, process *model.Process) bool { if len(url) == 0 { return false } check := false for _, regUrl := range process.RegUrl 
{ reg := regexp.MustCompile(regUrl) match := reg.MatchString(url) if match { check = true break } } if check == false { return false } return RepeatFilter(url, process) } ================================================ FILE: spider/process/filter/url_filter_test.go ================================================ package filter import ( "YiSpider/spider/model" "fmt" "testing" ) func TestFilter(t *testing.T) { task := &model.Task{ Id: "qiiubai", Name: "qiubai", Method: "get", Host: "https://www.qiushibaike.com", Url: "https://www.qiushibaike.com", Process: model.Process{ Url: "https://www.qiushibaike.com", RegUrl: []string{ "/.*?/page/[0-9]+", }, Type: "template", TemplateRule: model.TemplateRule{ Rule: map[string]string{ "node": "array|.article", "url": "attr.href|.contentHerf", "author": "attr.alt|.author a img", "content": "text|.content span", "like_num": "text|.stats-vote i", "comment_num": "text|.stats-comments a i", }, }, }, Pipline: "file", } fmt.Println(Filter("/8hr/", task)) fmt.Println(Filter("/8hr/page/3/", task)) fmt.Println(Filter("/8hr/page/4/", task)) fmt.Println(Filter("/8hr/page/5/", task)) fmt.Println(Filter("/8hr/page/13/", task)) fmt.Println(Filter("/8hr/page/3/", task)) } ================================================ FILE: spider/process/json-process/json_process.go ================================================ package json_process import ( "YiSpider/spider/model" ) type JsonProcess struct { jsonProcess *model.Process } func NewJsonProcess(jsonProcess *model.Process) *JsonProcess { return &JsonProcess{jsonProcess: jsonProcess} } func (j *JsonProcess) Process(context model.Context) (*model.Page, error) { return JsonRuleProcess(j.jsonProcess, context) } ================================================ FILE: spider/process/json-process/json_rule.go ================================================ package json_process import ( "YiSpider/spider/common" "YiSpider/spider/logger" "YiSpider/spider/model" simplejson 
"github.com/bitly/go-simplejson" "strings" ) func JsonRuleProcess(process *model.Process, context model.Context) (*model.Page, error) { return Process(process, context) } func Process(process *model.Process, context model.Context) (*model.Page, error) { jsonRule := process.JsonRule.Rule page := &model.Page{} sJson, err := simplejson.NewJson(context.Body) if err != nil { logger.Error("NewDocumentFromReader fail,", err) return nil, err } resultType := "map" rootSel := []string{} v, ok := jsonRule["node"] if ok { contentInfo := strings.Split(v, "|") resultType = contentInfo[0] selStr := contentInfo[1] rootSel = strings.Split(selStr, ".") } if resultType == "array" { for _, name := range rootSel { sJson = sJson.Get(name) } rootNode, err := sJson.Array() if err != nil { logger.Error("Json fail,", err) return nil, err } if len(rootNode) >= 0 { for _, node := range rootNode { nodeMap, ok := node.(map[string]interface{}) if !ok { continue } data := map[string]interface{}{} for key, value := range jsonRule { if key == "node" { continue } data[key] = nodeMap[value] } if len(process.AddQueue) > 0 { page.AddUrls(common.PraseReq(process.AddQueue, data)) } page.AddResult(data) } } } if resultType == "map" { result := map[string]interface{}{} for _, name := range rootSel { sJson = sJson.Get(name) } if err != nil { logger.Error("Json fail,", err) return nil, err } for key, value := range jsonRule { valueSel := []string{} valueSel = strings.Split(value, ".") valueNode := *sJson for _, name := range valueSel { valueNode = *valueNode.Get(name) } result[key] = valueNode.Interface() } if len(process.AddQueue) > 0 { page.AddUrls(common.PraseReq(process.AddQueue, result)) } page.AddResult(result) } if resultType == "nil" { result := map[string]interface{}{} for _, name := range rootSel { sJson = sJson.Get(name) } rootNode, err := sJson.Map() if err != nil { logger.Error("Json fail,", err) return nil, err } for key, value := range jsonRule { result[key] = rootNode[value] } page.Urls = 
[]*model.Request{} if len(process.AddQueue) > 0 { page.AddUrls(common.PraseReq(process.AddQueue, result)) } } return page, nil } ================================================ FILE: spider/process/process.go ================================================ package process import ( "YiSpider/spider/model" ) type Process interface { Process(context model.Context) (*model.Page, error) } ================================================ FILE: spider/process/template-process/template_process.go ================================================ package template_process import ( "YiSpider/spider/model" ) type TemplateProcess struct { tempProcess *model.Process } func NewTemplateProcess(tempProcess *model.Process) *TemplateProcess { return &TemplateProcess{tempProcess: tempProcess} } func (t *TemplateProcess) Process(context model.Context) (*model.Page, error) { return TemplateRuleProcess(t.tempProcess, context) } ================================================ FILE: spider/process/template-process/template_rule.go ================================================ package template_process import ( "YiSpider/spider/common" "YiSpider/spider/logger" "YiSpider/spider/model" "YiSpider/spider/process/filter" "bytes" "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" url2 "net/url" "strings" ) func TemplateRuleProcess(process *model.Process, context model.Context) (*model.Page, error) { page := &model.Page{} rule := process.TemplateRule.Rule doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(context.Body)) if err != nil { logger.Error("NewDocumentFromReader fail,", err) return nil, err } if len(process.RegUrl) > 0 { doc.Find("a").Each(func(i int, sel *goquery.Selection) { href, _ := sel.Attr("href") href = getComplateUrl(context.Request.URL, href) if filter.Filter(href, process) { page.AddUrl(&model.Request{Url: href, Method: "get"}) } }) } resultType := "map" rootSel := "" v, ok := rule["node"] if ok { contentInfo := strings.Split(v, "|") resultType = contentInfo[0] 
rootSel = contentInfo[1] } if resultType == "array" { doc.Find(rootSel).Each(func(i int, s *goquery.Selection) { data := getMapFromDom(rule, s) if data == nil { return } if len(process.AddQueue) > 0 { page.AddUrls(common.PraseReq(process.AddQueue, data)) } page.AddResult(data) }) } if resultType == "map" { data := getMapFromDom(rule, doc.Selection) if len(process.AddQueue) > 0 { page.AddUrls(common.PraseReq(process.AddQueue, data)) } page.AddResult(data) } return page, nil } func getMapFromDom(rule map[string]string, node *goquery.Selection) map[string]interface{} { result := make(map[string]interface{}) isNull := true for key, value := range rule { if key == "node" { continue } rules := strings.Split(value, "|") ValueType := strings.Split(rules[0], ".") if len(rules) < 2 { continue } s := node.Find(rules[1]) switch ValueType[0] { case "text": result[key] = s.Text() case "html": result[key], _ = s.Html() case "attr": if len(ValueType) < 2 { continue } result[key], _ = s.Attr(ValueType[1]) case "texts": arr := []string{} s.Each(func(i int, sel *goquery.Selection) { text := sel.Text() arr = append(arr, text) }) j, _ := json.Marshal(arr) result[key] = string(j) case "htmls": arr := []string{} s.Each(func(i int, sel *goquery.Selection) { html, _ := s.Html() arr = append(arr, html) }) j, _ := json.Marshal(arr) result[key] = string(j) case "attrs": arr := []string{} attr := "" s.Each(func(i int, sel *goquery.Selection) { if len(ValueType) >= 2 { attr, _ = sel.Attr(ValueType[1]) arr = append(arr, attr) } }) result[key] = arr default: result[key] = "" } res, ok := result[key].(string) if ok || len(res) != 0 { isNull = false } } if isNull == true { return nil } return result } func getComplateUrl(url *url2.URL, href string) string { if strings.HasPrefix(href, "/") { newHref := fmt.Sprintf("%s://%s%s", url.Scheme, url.Host, href) return newHref } newHref := fmt.Sprintf("%s://%s/%s", url.Scheme, url.Host, href) return newHref } ================================================ 
FILE: spider/process/template-process/template_rule_test.go ================================================ package template_process import ( "encoding/json" "fmt" "github.com/PuerkitoBio/goquery" "testing" ) func TestTemplateProcess(t *testing.T) { //doc,err := goquery.NewDocument("https://www.qiushibaike.com/") //if err != nil{ // t.Fatal("open url fail ",err) //} //html,err := doc.Html() //if err != nil{ // t.Fatal("get html fail ",err) //} // // //rule := map[string]string{ // "node":"array|.article", // "url":"attr.href|.contentHerf", // "author":"attr.alt|.author a img", // "content":"text|.content span", // "like_num":"text|.stats-vote i", // "comment_num":"text|.stats-comments a i", //} // //result,_ := TemplateRuleProcess(rule,[]byte(html)) //data,_ := json.Marshal(result) //fmt.Println("Result :",string(data)) } ================================================ FILE: spider/register/etcd/etcd.go ================================================ package etcd import ( "encoding/json" "log" "runtime" "time" "YiSpider/spider/core" "github.com/coreos/etcd/client" "golang.org/x/net/context" ) type Worker struct { Name string IP string KeysAPI client.KeysAPI } type WorkerInfo struct { Name string `json:"name"` IP string `json:"ip"` CPU int `json:"cpu"` MetaData map[string]string `json:"metadata"` SpiderData map[string]*SpiderData `json:"spider_data"` } type SpiderData struct { DownloadFailCount int32 `json:"download_fail_count"` DownloadCount int32 `json:"download_count"` UrlNum int32 `json:"url_num"` WaitUrlNum int `json:"wait_url_num"` CrawlerResultNum int32 `json:"crawler_result_num"` } func NewWorker(name, IP string, endpoints []string) *Worker { cfg := client.Config{ Endpoints: endpoints, Transport: client.DefaultTransport, HeaderTimeoutPerRequest: time.Second, } etcdClient, err := client.New(cfg) if err != nil { log.Fatal("Error: cannot connec to etcd:", err) } w := &Worker{ Name: name, IP: IP, KeysAPI: client.NewKeysAPI(etcdClient), } return w } func (w 
*Worker) HeartBeat() { api := w.KeysAPI for { info := &WorkerInfo{ Name: w.Name, IP: w.IP, CPU: runtime.NumCPU(), SpiderData: getSpiderData(), } key := "spiders/" + w.Name value, _ := json.Marshal(info) _, err := api.Set(context.Background(), key, string(value), &client.SetOptions{ TTL: time.Second * 15, }) if err != nil { log.Println("Error update workerInfo:", err) } time.Sleep(time.Second * 5) } } func getSpiderData() map[string]*SpiderData { datas := make(map[string]*SpiderData) metas := core.GetEnine().GetTaskMetas() for name, meta := range metas { data := &SpiderData{} data.CrawlerResultNum = meta.CrawlerResultNum data.DownloadFailCount = meta.DownloadFailCount data.DownloadCount = meta.DownloadCount data.WaitUrlNum = meta.WaitUrlNum data.UrlNum = meta.UrlNum datas[name] = data } return datas } ================================================ FILE: spider/schedule/schedule.go ================================================ package schedule import ( "YiSpider/spider/config" "YiSpider/spider/model" ) type Schedule interface { Push(req *model.Request) PushMuti(reqs []*model.Request) Pop() (*model.Request, bool) Count() int Close() } var ( scheduleMap = make(map[string]func(*config.Config) Schedule) ) func RegisterSchedule(name string, builder func(*config.Config) Schedule) { scheduleMap[name] = builder } func GetSchedule(c *config.Config) Schedule { schedule := scheduleMap[c.ScheduleMode] if schedule == nil { return scheduleMap["chan"](c) } return schedule(c) } ================================================ FILE: spider/schedule/schedule_chan.go ================================================ package schedule import ( "YiSpider/manage/logger" "YiSpider/spider/common" "YiSpider/spider/config" "YiSpider/spider/model" ) type ChanSchedule struct { waitQueue chan *model.Request } func NewChanSchedule(config *config.Config) Schedule { schedule := &ChanSchedule{} schedule.waitQueue = make(chan *model.Request, config.MaxWaitNum) return schedule } func (d 
*ChanSchedule) Push(req *model.Request) { praseReqs := common.PraseReq([]*model.Request{req}, nil) for _, req := range praseReqs { logger.Info("Push Url:", req.Url, req.ProcessName, len(d.waitQueue)) d.waitQueue <- req } } func (d *ChanSchedule) PushMuti(reqs []*model.Request) { praseReqs := common.PraseReq(reqs, nil) for _, req := range praseReqs { logger.Info("Push Url:", req.Url, req.ProcessName, len(d.waitQueue)) d.waitQueue <- req } } func (d *ChanSchedule) Pop() (*model.Request, bool) { req, ok := <-d.waitQueue logger.Info("Pop Url:", req.Url, req.ProcessName, len(d.waitQueue)) return req, ok } func (d *ChanSchedule) Count() int { return len(d.waitQueue) } func (d *ChanSchedule) Close() { close(d.waitQueue) } func init() { RegisterSchedule("chan", NewChanSchedule) } ================================================ FILE: spider/schedule/schedule_chan_test.go ================================================ package schedule import ( "testing" ) func TestInitDownloader(t *testing.T) { //s := NewSchedule(4) //s.Push(&model.Task{Id:"hao123",Url:"http://www.hao123.com",Method:"get"}) //task,ok := s.Pop() //if !ok{ // t.Fatal() //} //fmt.Println(task) } ================================================ FILE: spider/schedule/schedule_redis.go ================================================ package schedule import ( "YiSpider/spider/common" "YiSpider/spider/config" "YiSpider/spider/logger" "YiSpider/spider/model" "github.com/garyburd/redigo/redis" "time" ) type RedisSchedule struct { name string address string pool *redis.Pool } func NewRedisSchedule(config *config.Config) Schedule { schedule := &RedisSchedule{} schedule.address = config.RedisAddr schedule.name = config.Name schedule.connect() return schedule } func (r *RedisSchedule) connect() { r.pool = &redis.Pool{ MaxIdle: 10, IdleTimeout: 240 * time.Second, Dial: func() (redis.Conn, error) { return redis.Dial("tcp", r.address) }, } go r.CronCount(1) } func (r *RedisSchedule) Push(req *model.Request) { conn := 
r.pool.Get() defer conn.Close() praseReqs := common.PraseReq([]*model.Request{req}, nil) for _, req := range praseReqs { logger.Info("Push Url:", req.Url, req.ProcessName) body, err := req.Write() if err != nil { logger.Info("Push Url:", err.Error()) continue } _, err = conn.Do("LPUSH", r.name, body) if err != nil { logger.Info("Push Url:", err.Error()) continue } } } func (r *RedisSchedule) PushMuti(reqs []*model.Request) { conn := r.pool.Get() defer conn.Close() praseReqs := common.PraseReq(reqs, nil) for _, req := range praseReqs { logger.Info("Push Url:", req.Url, req.ProcessName) body, err := req.Write() if err != nil { logger.Info("Push Url:", err.Error()) continue } _, err = conn.Do("LPUSH", r.name, body) if err != nil { logger.Info("Push Url:", err.Error()) continue } } } func (r *RedisSchedule) Pop() (*model.Request, bool) { conn := r.pool.Get() defer conn.Close() value, err := redis.ByteSlices(conn.Do("BRPOP", r.name, 5)) if err != nil { logger.Info("Pop Url: ", err.Error()) return nil, true } req := &model.Request{} if err := req.Read(value[1]); err != nil { logger.Info("Pop Url: ", err.Error()) return nil, true } logger.Info("Pop Url:", req.Url, req.ProcessName) return req, true } func (r *RedisSchedule) Count() int { conn := r.pool.Get() defer conn.Close() value, err := redis.Int(conn.Do("LLEN", r.name)) if err != nil { logger.Info("Count ", err.Error()) return -1 } return value } func (r *RedisSchedule) Close() { r.pool.Close() } func (r *RedisSchedule) CronCount(flushTime int) { ticker := time.NewTicker(time.Second * time.Duration(flushTime)) go func() { for range ticker.C { logger.Info("RedisSchedule Count:", r.Count()) } }() } func init() { RegisterSchedule("redis", NewRedisSchedule) } ================================================ FILE: spider/schedule/schedule_redis_test.go ================================================ package schedule import ( "testing" "YiSpider/spider/config" "YiSpider/spider/model" ) func TestRedisSchedule_Push(t 
*testing.T) { s := NewRedisSchedule(&config.Config{RedisAddr: "127.0.0.1:6379"}) s.Push(&model.Request{Url: "www.bai123.com", Method: "get", Header: map[string]string{"a": "b"}}) } func TestRedisSchedule_Pop(t *testing.T) { s := NewRedisSchedule(&config.Config{Name: "qiongyou_spider", RedisAddr: "127.0.0.1:6379"}) for i := 0; i < 100; i++ { go s.Pop() } } ================================================ FILE: spider/spider/spider.go ================================================ package spider import ( "YiSpider/spider/model" "YiSpider/spider/pipline" "YiSpider/spider/pipline/console" "YiSpider/spider/pipline/file" "YiSpider/spider/process" "YiSpider/spider/process/json-process" "YiSpider/spider/process/template-process" "YiSpider/spider/pipline/mysql" "YiSpider/spider/config" ) type Spider struct { Id string Name string Depth int EndCount int Requests []*model.Request Process map[string][]process.Process Pipline pipline.Pipline } func (s *Spider) GetPipline() pipline.Pipline { return s.Pipline } func (s *Spider) GetProcess(name string) []process.Process { return s.Process[name] } func (s *Spider) GetRequests() []*model.Request { return s.Requests } func (s *Spider) AddProcess(name string, p process.Process) { if s.Process == nil { s.Process = make(map[string][]process.Process) } processs, ok := s.Process[name] if !ok { ps := []process.Process{} s.Process[name] = append(ps, p) } else { processs = append(processs, p) } } func InitWithTask(task *model.Task) *Spider { s := &Spider{} s.Id = task.Id s.Name = task.Name s.Depth = task.Depth s.EndCount = task.EndCount s.Requests = task.Request s.Process = make(map[string][]process.Process) for i, p := range task.Process { switch p.Type { case "template": processs, ok := s.Process[p.Name] if !ok { processs = []process.Process{} s.Process[p.Name] = processs } s.Process[p.Name] = append(processs, template_process.NewTemplateProcess(&task.Process[i])) case "json": processs, ok := s.Process[p.Name] if !ok { processs = 
[]process.Process{} s.Process[p.Name] = processs } s.Process[p.Name] = append(processs, json_process.NewJsonProcess(&task.Process[i])) } } switch task.Pipline { case "console": s.Pipline = console.NewConsolePipline() case "file": s.Pipline = file.NewFilePipline("./") case "mysql": s.Pipline = mysql.NewMysqlPipline() default: if len(config.ConfigI.Mysql) > 0{ s.Pipline = mysql.NewMysqlPipline() }else{ s.Pipline = file.NewFilePipline("./") } } return s } ================================================ FILE: storage/conf.json ================================================ { "name":"yi_spider_storage", "version":"0.01" } ================================================ FILE: storage/config/config.go ================================================ package config import ( "YiSpider/storage/logger" "encoding/json" "io/ioutil" "os" ) var ConfigI *Config type Config struct { Name string `json:"name"` Version string `json:"version"` } func InitConfig() error { var file *os.File var bytes []byte var err error if file, err = os.OpenFile("./storage/conf.json", os.O_RDONLY, 0666); err != nil { return err } if bytes, err = ioutil.ReadAll(file); err != nil { return err } ConfigI = &Config{} if err = json.Unmarshal(bytes, ConfigI); err != nil { return err } logger.Info("init success ", *ConfigI) return nil } ================================================ FILE: storage/db/elasticsearch/elasticsearch.go ================================================ package elasticsearch func init() { } ================================================ FILE: storage/db/hbase/hbase.go ================================================ package hbase ================================================ FILE: storage/db/mysql/mysql.go ================================================ package mysql ================================================ FILE: storage/logger/logger.go ================================================ package logger import "fmt" func Info(v ...interface{}) { fmt.Println(v) } func 
Debug(v ...interface{}) { fmt.Println(v) } func Warn(v ...interface{}) { fmt.Println(v) } func Error(v ...interface{}) { fmt.Println(v) } ================================================ FILE: storage/main.go ================================================ package main import ( "YiSpider/storage/config" "YiSpider/storage/logger" ) func main() { var err error if err = config.InitConfig(); err != nil { logger.Info(err.Error()) return } }