Repository: zjfGit/python3-scrapy-spider-phantomjs-selenium Branch: master Commit: 60ae18057f53 Files: 33 Total size: 63.5 KB

Directory structure:
gitextract_dzmg5ry4/
├── README.md
├── SpiderKeeper.py
├── commands/
│   └── crawlall.py
├── commonUtils.py
├── ghostdriver.log
├── items.py
├── middlewares/
│   └── middleware.py
├── middlewares.py
├── mysqlUtils.py
├── notusedspiders/
│   ├── ContentSpider.py
│   ├── ContentSpider_real.py
│   ├── DgContentSpider_PhantomJS.py
│   ├── DgUrlSpider_PhantomJS.py
│   ├── PostHandle.py
│   ├── UrlSpider.py
│   ├── check_post.py
│   ├── contentSettings.py
│   ├── params.js
│   ├── uploadUtils.py
│   └── utils.py
├── pipelines.py
├── settings.py
├── setup.py
├── spiders/
│   ├── UrlSpider_JFSH.py
│   ├── UrlSpider_MSZT.py
│   ├── UrlSpider_SYDW.py
│   ├── UrlSpider_YLBG.py
│   ├── UrlSpider_YMYE.py
│   └── __init__.py
├── test.py
├── urlSettings.py
└── webBrowserPools/
    ├── ghostdriver.log
    └── pool.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================

# Setting up the crawler environment on Windows

## Required packages

- Python 3.4.3 > https://pan.baidu.com/s/1pK8KDcv
- pip 9.0.1 > https://pan.baidu.com/s/1mhNdRN6
- PyCharm editor > https://pan.baidu.com/s/1i4Nkdk5
- pywin32 > http://pan.baidu.com/s/1pKZiZWZ
- pyOpenSSL > http://pan.baidu.com/s/1hsgOQJq
- windows_sdk > http://pan.baidu.com/s/1hrM6iRa
- phantomjs > http://pan.baidu.com/s/1nvHm5AD

## Installation

### Base environment

1. Run the Python installer and click Next all the way through.
2. Add the Python installation directory to the Path environment variable.
3. Press Win + R, type Cmd to open a command prompt, then run `python` to confirm the installation.

### Installing pip

> pip plays the same role as yum on Linux: once it is installed, dependency packages can be installed from the command line.

1. Unpack the pip archive to a directory (preferably next to the Python installation directory).
2. Open a cmd window in the unpacked pip directory.
3. Run `python setup.py install`; the installation is placed in the Scripts directory under the Python installation.
4. Add pip's install directory, C:\Python34\Scripts, to the Path environment variable.
5. Run `pip list` or `pip --version` on the command line to verify.

### Installing Scrapy

> Scrapy is a mature crawling framework that makes page scraping straightforward, but it is not Windows-friendly, so a few extra libraries are needed to support it.

1. Install pywin32: just click Next through the installer.
2. Install wheel: installing Scrapy requires some .whl files, which in turn require wheel. Install it with pip: `pip install wheel`.
3. Install pyOpenSSL: after downloading it, change into the download directory and run `pip install pyOpenSSL` (**note: let the Tab key complete the wheel file name instead of typing it by hand**).
4. Install lxml directly with pip: `pip install lxml`.

> ***During installation on Windows you will almost certainly hit "error: Microsoft Visual C++ 10.0 is required (Unable to find vcvarsall.bat)", i.e. the matching compiler cannot be found. The usual fix is to install Visual Studio to get a compiler, but we take a different route.***
> Download windows_sdk and run its installer. If it installs successfully, the problem is solved. If it fails, first uninstall the two runtime packages left behind by the failed attempt: Microsoft Visual C++ 2010 x86 Redistributable and Microsoft Visual C++ 2010 x64 Redistributable (360 or Tencent PC Manager can remove them).
> After uninstalling them, run the SDK installer again with "Visual C++ compiler" left unchecked so that this first pass succeeds. Then run the SDK installer once more, this time with "Visual C++ compiler" checked, and install again. After these steps the "Microsoft Visual C++ 10.0 is required" error no longer appears.
> If you hit "failed building wheel for xxx" during installation, download the wheel file manually; every required package can be found at [http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/). Download the one you need and run `pip install xxxx`.

5. Install Scrapy: `pip install Scrapy`. When it finishes, run `scrapy` in a command window to verify.
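Beyond running `scrapy` on the command line, the supporting libraries can be checked from Python directly. A minimal sketch (the import names below are the standard ones for the packages listed above; selenium is included because the PhantomJS middleware later in this repository imports it):

```python
# verify_env.py -- quick check that the crawler dependencies import cleanly
# (a sketch; adjust the module list to whatever you actually installed)
import importlib

modules = ["scrapy", "lxml", "OpenSSL", "win32api", "selenium"]

for name in modules:
    try:
        mod = importlib.import_module(name)
        print("OK  ", name, getattr(mod, "__version__", ""))
    except ImportError as exc:
        print("MISS", name, exc)
```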
# Crawler architecture

For better extensibility and easier monitoring of the crawl, the project is split into three sub-projects: URL extraction, content crawling, and content updating (pushing content live plus scheduled review).

The code is built on Scrapy, a Python framework that is currently very popular for crawling. Scrapy splits the crawl into independent modules and provides base classes that are easy to extend, which keeps the crawler code simple and well structured. Its built-in concurrency, exception handling, and flexible Settings also make the whole scraping process efficient and stable.

scrapy-redis: a third-party, Redis-based distributed crawling layer that works together with Scrapy and gives it distributed fetching. GitHub: https://github.com/darkrho/scrapy-redis

MongoDB, MySQL, or other databases: pick the store that fits each kind of data. Structured data can go into MySQL to save space; unstructured data and free text can go into MongoDB or another NoSQL store for faster access. There are plenty of SQL-vs-NoSQL comparisons online if you need help choosing.

Extending an existing Scrapy project into a distributed one is fairly easy. In short:

* Pick a well-provisioned server to host the Redis queue and the data store.
* Extend the Scrapy project so it reads its start_urls from that server's Redis, and rewrite the storage part of the pipeline so data is written to the server.
* Write scripts on the server that generate URLs, and run them on a schedule.

# 1 URL extraction

## 1.1 How distributed crawling works

Distribution is implemented with scrapy-redis, and the idea is simple. For convenience we call the central server the master and the machines that run the spiders the slaves.

With Scrapy, a crawl starts from a set of start_urls: the spider visits them first and then, following our own logic, scrapes elements on those pages or follows second- and third-level pages. To make this distributed, the only thing we need to manipulate is that start_urls list.

We run a Redis database on the master (this database only stores URLs, not the scraped data; do not confuse it with the MongoDB/MySQL stores mentioned later) and create a separate list key for every site category we want to crawl. Each slave's scrapy-redis is configured to fetch its URLs from the master's address. The result is that no matter how many slaves there are, they all pull URLs from a single place: the Redis database on the master.

Because of scrapy-redis's queue mechanism, the URLs handed to different slaves never conflict. After each slave finishes its crawl, it writes its results back to the server (at that point the store is no longer Redis but MongoDB, MySQL, or whatever holds the actual content).

Another benefit of this approach is portability: as long as paths are handled properly, moving the slave program to another machine is essentially copy and paste.

## 1.2 Generating URLs

First, to be clear: URLs are generated on the master, not on the slaves.

For each category of URLs (each category maps to one Redis key holding a list of URLs) we write a separate generator script. The script's job is simple: build URLs in the required format and push them into Redis (a minimal master-side sketch is shown after the list in §4.1).

On the slave side, Scrapy can be configured via Settings not to shut down when the crawl finishes but to keep polling the queue and continue crawling whenever new URLs appear. With that behaviour, controlling URL generation is all it takes to control what the slave spiders crawl.

## 1.3 URL handling

1. Check the domain the URL points to; if it points to an external site, drop it.
2. De-duplicate URLs, then store them in Redis and in the database (covered by the same sketch after §4.1).

# 2 Content crawling

## 2.1 Scheduled crawling

With the setup above, scheduled crawling becomes simple: we only need to run the URL-generation scripts on a schedule. The crontab utility on Linux is recommended; it makes defining timed jobs very easy, see its documentation for details.

## 2.2

# 3 Content updating

## 3.1 Table design

Post crawling table:

- id: auto-increment primary key
- md5_url: MD5 hash of the URL
- url: target URL to crawl
- title: scraped article title
- content: scraped article body (post-processed)
- user_id: ID of the randomly chosen posting user
- spider_name: spider name
- site: crawled domain
- gid: ID of the group the post is pushed into
- module:
- status: state (1: crawled; 0: not crawled)
- use_time: crawl time
- create_time: creation time

CREATE TABLE `NewTable` (
`id` bigint(20) NOT NULL AUTO_INCREMENT ,
`md5_url` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`url` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`content` mediumtext CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`user_id` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`spider_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`site` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`gid` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`module` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL ,
`status` tinyint(4) NOT NULL DEFAULT 0 ,
`use_time` datetime NOT NULL ,
`create_time` datetime NOT NULL ,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci AUTO_INCREMENT=4120 ROW_FORMAT=COMPACT;

# 4 System tuning

## 4.1 Avoiding anti-scraping measures

* Set download_delay. This is close to a universal fix: in theory, with a long enough delay the target server cannot tell a crawler from a normal visitor. The obvious side effect is a large drop in crawl speed, so several test runs are usually needed to find a suitable value; download_delay can also be set to a random value within a range.
* Randomize the User-agent. Changing the User-agent avoids many 403/400 errors and is something nearly every crawler does. We can override Scrapy's middleware so every request picks a random User-agent, which makes the crawler less conspicuous (see the middleware sketch after this list). An example implementation: http://www.sharejs.com/codes/python/8310
* Use a proxy IP pool. There are many free and paid proxy pools online that can act as intermediaries. One problem is that speed is not guaranteed; another is that many of the proxies may simply not work. If you go this route, the more reliable approach is to first filter out the usable proxies programmatically, then pick from them randomly or in order.
* Set the domain and Host headers properly. Some sites, Xueqiu for example, use these two fields to judge where a request comes from, so they deserve attention as well.
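As described in §1.2 and §1.3, URL generation, domain filtering, and de-duplication all happen on the master. A minimal sketch of such a generator script, not part of this repository: it assumes the scrapy-redis convention of a `<spider name>:start_urls` list key, reuses the MD5 scheme from commonUtils.py, and the Redis host, the `seen_md5_urls` set, and `list_new_article_urls()` are illustrative placeholders.

```python
# generate_urls.py -- master-side URL generator (a sketch, not repository code)
# Pushes new article URLs into the Redis list that slaves read start_urls from.
import hashlib
from urllib.parse import urlparse

import redis

ALLOWED_DOMAIN = "toutiao.com"            # same domain as urlSettings.DOMAIN
REDIS_HOST = "192.168.1.235"              # assumption: the master host
SPIDER_KEY = "UrlSpider_JFSS:start_urls"  # scrapy-redis "<spider>:start_urls" convention


def md5_url(url):
    # same scheme as commonUtils.get_linkmd5id()
    return hashlib.md5(url.encode("utf8")).hexdigest()


def list_new_article_urls():
    # placeholder: build URLs in whatever format the category requires
    return ["http://www.toutiao.com/ch/news_regimen/"]


def main():
    r = redis.Redis(host=REDIS_HOST, port=6379, db=0)
    for url in list_new_article_urls():
        # 1) drop URLs that point at external sites
        if ALLOWED_DOMAIN not in urlparse(url).netloc:
            continue
        # 2) de-duplicate on the MD5 of the URL, then queue it for the slaves
        if r.sadd("seen_md5_urls", md5_url(url)):
            r.lpush(SPIDER_KEY, url)


if __name__ == "__main__":
    main()
```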
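For the random User-agent point above, a minimal downloader-middleware sketch: it reads the USER_AGENTS list already defined in settings.py, but the class itself and where it is registered are illustrative rather than code from this repository.

```python
# randomua.py -- downloader middleware that picks a random User-Agent per request
# (a sketch; register it in DOWNLOADER_MIDDLEWARES and keep the built-in
#  UserAgentMiddleware disabled, as settings.py already does)
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENTS is the list defined in settings.py
        return cls(crawler.settings.getlist("USER_AGENTS"))

    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(self.user_agents)
```

For the download_delay bullet, note that Scrapy already multiplies DOWNLOAD_DELAY by a random factor between 0.5 and 1.5 as long as RANDOMIZE_DOWNLOAD_DELAY is left at its default of True.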
## 4.2 Programmatic and web-based management

The approach above covers the whole pipeline, but operating it by hand is still fairly cumbersome. If possible, a web server can be added so that URLs can be submitted and spider status monitored from a web UI, which saves a great deal of work. There is far too much to cover here, so it is only mentioned in passing.

# 5 Scrapy deployment

## 5.1 Installing Python 3.6

1. Download the source: wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tgz
2. Unpack it: cp Python-3.6.1.tgz /usr/local/goldmine/ && tar -xvf Python-3.6.1.tgz
3. Configure: ./configure --prefix=/usr/local
4. Install: make && make altinstall

Note: make altinstall is used here. Using make install instead would leave two Python versions in /usr/bin/, which can cause problems.

4.1 Error: zipimport.ZipImportError: can't decompress data; zlib not available
# http://www.zlib.net/zlib-1.2.11.tar
=============================================
As root:
wget http://www.zlib.net/zlib-1.2.11.tar
tar -xvf zlib-1.2.11.tar.gz
cd zlib-1.2.11
./configure
make
sudo make install
=============================================
After installing zlib, rerun make && make altinstall in the Python-3.6.1 directory and the installation succeeds.

## 5.2 Setting up a virtual environment on the server (as root)

Installing virtualenv lets us build isolated, independent Python environments, so each project is kept separate from the others, the environment stays clean, and package conflicts are avoided.

### 5.2.1 Installing virtualenv

/usr/local/bin/pip3.6 install virtualenv

This failed with:
===============
pip is configured with locations that require TLS/SSL, however the ssl module in Python is not available.
Collecting virtualenv
Could not fetch URL https://pypi.python.org/simple/virtualenv/: There was a problem confirming the ssl certificate: Can't connect to HTTPS URL because the SSL module is not available. - skipping
===============

rpm -aq | grep openssl showed that openssl-devel was missing. [route add default gw 192.168.1.219]

yum install openssl-devel -y

Then recompile Python as described in 5.1.

### 5.2.2 Creating a new virtual environment

virtualenv -p /usr/local/bin/python3.6 python3.6-env

### 5.2.3 Activating the virtual environment

source python3.6-env/bin/activate

5.2.3.1 Installing Python packages inside the virtual environment

### 5.2.4 Leaving the virtual environment

deactivate

## 5.3 Installing Scrapy

## 5.4 Installing and configuring Redis

yum install redis

## 5.5

# 6 Redis installation & configuration

## 6.1 Installation

mac: sudo brew install redis

/usr/local/bin/redis-server /usr/local/etc/redis.conf

# References

* 1. [A distributed crawler framework based on Python, Scrapy and Redis](http://ju.outofmemory.cn/entry/206756)
* 2. [Scrapy for beginners, part 3 (Scrapy-Redis based distribution and a cookie pool)](http://ju.outofmemory.cn/entry/299500)
* 3. [Building a Python 3 environment on CentOS with virtualenv](http://www.jb51.net/article/67393.htm)
* 4. [Setting up an isolated Python environment on CentOS with virtualenv](http://www.51ou.com/browse/linuxwt/60216.html)
* 5. [Installing and configuring Python virtual environments](http://blog.csdn.net/pipisorry/article/details/39998317)

================================================
FILE: SpiderKeeper.py
================================================
# -*- coding: utf-8 -*- import time import threading from scrapy import cmdline # def ylbg(): # print(">> thread.starting ylbg ...") # cmdline.execute("scrapy crawl UrlSpider_YLBG".split()) # print(">> thread.ending ylbg ...") # # def sydw(): # print(">> thread.starting sydw ...") # cmdline.execute("scrapy crawl UrlSpider_SYDW".split()) # print(">> thread.ending sydw ...") # # threading._start_new_thread(ylbg()) # threading._start_new_thread(sydw()) # Configure the custom commands module and run every spider returned by `scrapy list` cmdline.execute("scrapy crawlall".split())

================================================
FILE: commands/crawlall.py
================================================
from scrapy.commands import ScrapyCommand from scrapy.crawler import CrawlerRunner from scrapy.utils.conf import arglist_to_dict class Command(ScrapyCommand): requires_project = True def syntax(self): return '[options]' def short_desc(self): return 'Runs all of the spiders' def add_options(self, parser): ScrapyCommand.add_options(self, parser) parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", help="set spider argument 
(may be repeated)") parser.add_option("-o", "--output", metavar="FILE", help="dump scraped items into FILE (use - for stdout)") parser.add_option("-t", "--output-format", metavar="FORMAT", help="format to use for dumping items with -o") def process_options(self, args, opts): ScrapyCommand.process_options(self, args, opts) # try: opts.spargs = arglist_to_dict(opts.spargs) # except ValueError: # raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) def run(self, args, opts): # settings = get_project_settings() spider_loader = self.crawler_process.spider_loader for spidername in args or spider_loader.list(): print("*********cralall spidername************" + spidername) self.crawler_process.crawl(spidername, **opts.spargs) self.crawler_process.start() ================================================ FILE: commonUtils.py ================================================ import random import time import datetime from hashlib import md5 # 获取随机发帖ID def get_random_user(user_str): user_list = [] for user_id in str(user_str).split(','): user_list.append(user_id) userid_idx = random.randint(1, len(user_list)) user_chooesd = user_list[userid_idx-1] return user_chooesd # 获取MD5加密URL def get_linkmd5id(url): # url进行md5处理,为避免重复采集设计 md5_url = md5(url.encode("utf8")).hexdigest() return md5_url # get unix time stamp def get_time_stamp(): create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') time_array = time.strptime(create_time, "%Y-%m-%d %H:%M:%S") time_stamp = int(time.mktime(time_array)) return time_stamp ================================================ FILE: ghostdriver.log ================================================ [INFO - 2017-06-28T00:22:35.372Z] GhostDriver - Main - running on port 9643 [INFO - 2017-06-28T00:22:38.400Z] Session [e424dd60-5b97-11e7-a0fa-fbfe1e4d560f] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":false,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","webSecurityEnabled":true} [INFO - 2017-06-28T00:22:38.400Z] Session [e424dd60-5b97-11e7-a0fa-fbfe1e4d560f] - page.customHeaders: - {} [INFO - 2017-06-28T00:22:38.400Z] Session [e424dd60-5b97-11e7-a0fa-fbfe1e4d560f] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","phantomjs.page.settings.loadImages":false} [INFO - 2017-06-28T00:22:38.400Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: e424dd60-5b97-11e7-a0fa-fbfe1e4d560f [ERROR - 2017-06-28T00:22:38.410Z] RouterReqHand - _handle.error - {"name":"Missing Command 
Parameter","message":"{\"headers\":{\"Accept\":\"application/json\",\"Accept-Encoding\":\"identity\",\"Connection\":\"close\",\"Content-Length\":\"73\",\"Content-Type\":\"application/json;charset=UTF-8\",\"Host\":\"127.0.0.1:9643\",\"User-Agent\":\"Python http auth\"},\"httpVersion\":\"1.1\",\"method\":\"POST\",\"post\":\"{\\\"sessionId\\\": \\\"e424dd60-5b97-11e7-a0fa-fbfe1e4d560f\\\", \\\"pageLoad\\\": 180000}\",\"url\":\"/timeouts\",\"urlParsed\":{\"anchor\":\"\",\"query\":\"\",\"file\":\"timeouts\",\"directory\":\"/\",\"path\":\"/timeouts\",\"relative\":\"/timeouts\",\"port\":\"\",\"host\":\"\",\"password\":\"\",\"user\":\"\",\"userInfo\":\"\",\"authority\":\"\",\"protocol\":\"\",\"source\":\"/timeouts\",\"queryKey\":{},\"chunks\":[\"timeouts\"]},\"urlOriginal\":\"/session/e424dd60-5b97-11e7-a0fa-fbfe1e4d560f/timeouts\"}","line":546,"sourceURL":"phantomjs://code/session_request_handler.js","stack":"_postTimeout@phantomjs://code/session_request_handler.js:546:73\n_handle@phantomjs://code/session_request_handler.js:148:25\n_reroute@phantomjs://code/request_handler.js:61:20\n_handle@phantomjs://code/router_request_handler.js:78:46"} phantomjs://platform/console++.js:263 in error [INFO - 2017-06-28T00:27:35.412Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T00:32:35.411Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T00:37:35.416Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T00:42:35.418Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T00:47:35.418Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T00:52:35.423Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T00:57:35.423Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:02:35.427Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:07:35.431Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:12:35.470Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:17:35.469Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:22:35.469Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:27:35.477Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW essSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:29:06.882Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 2017-06-28T01:18:20.002Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:23:20.005Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:28:20.013Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous 
Sessions clean-up phase starting NOW 2017-06-28T01:18:06.690Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:23:06.726Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW [INFO - 2017-06-28T01:28:06.738Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW ================================================ FILE: items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html import scrapy class DgspiderUrlItem(scrapy.Item): url = scrapy.Field() class DgspiderPostItem(scrapy.Item): url = scrapy.Field() title = scrapy.Field() text = scrapy.Field() ================================================ FILE: middlewares/middleware.py ================================================ # douguo request middleware # for the page which loaded by js/ajax # ang changes should be recored here: # # @author zhangjianfei # @date 2017/05/04 from selenium import webdriver from scrapy.http import HtmlResponse from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from DgSpiderPhantomJS import urlSettings import time import datetime import random import os import execjs import DgSpiderPhantomJS.settings as settings class JavaScriptMiddleware(object): def process_request(self, request, spider): print("LOGS: Spider name in middleware - " + spider.name) # 开启虚拟浏览器参数 dcap = dict(DesiredCapabilities.PHANTOMJS) # 设置agents dcap["phantomjs.page.settings.userAgent"] = (random.choice(settings.USER_AGENTS)) # 禁止加载图片 dcap["phantomjs.page.settings.loadImages"] = False driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe", desired_capabilities=dcap) # 由于phantomjs路径已经增添在path中,path可以不写 # driver = webdriver.PhantomJS() # 利用firfox # driver = webdriver.Firefox(executable_path=r"D:\FireFoxBrowser\firefox.exe") # 利用chrome # chromedriver = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" # os.environ["webdriver.chrome.driver"] = chromedriver # driver = webdriver.Chrome(chromedriver) # 模拟登陆 # driver.find_element_by_class_name("input_id").send_keys("34563453") # driver.find_element_by_class_name("input_pwd").send_keys("zjf%#¥&") # driver.find_element_by_class_name("btn btn_lightgreen btn_login").click() # driver.implicitly_wait(15) # time.sleep(10) # 模拟用户下拉 # js1 = 'return document.body.scrollHeight' # js2 = 'window.scrollTo(0, document.body.scrollHeight)' # js3 = "document.body.scrollTop=1000" # old_scroll_height = 0 # while driver.execute_script(js1) > old_scroll_height: # old_scroll_height = driver.execute_script(js1) # driver.execute_script(js2) # time.sleep(3) # 设置20秒页面超时返回 driver.set_page_load_timeout(180) # 设置20秒脚本超时时间 driver.set_script_timeout(180) # get time stamp # get page screenshot # driver.save_screenshot("D:\p.jpg") # 模拟用户在同一个浏览器对象下刷新页面 # the whole page source body = '' for i in range(50): print("SPider name: " + spider.name) # sleep in a random time for the ajax asynchronous request # time.sleep(random.randint(5, 6)) time.sleep(random.randint(300, 600)) print("LOGS: freshing page " + str(i) + "...") # get page request driver.get(request.url) # waiting for response driver.implicitly_wait(30) # get page resource body = body + driver.page_source return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request) 
================================================ FILE: middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class DgspiderphantomjsSpiderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Response, dict # or Item objects. pass def process_start_requests(start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: mysqlUtils.py ================================================ import pymysql import pymysql.cursors import os def dbhandle_online(): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=False ) return conn def dbhandle_local(): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=True # use_unicode=False ) return conn def dbhandle_geturl(gid): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=False ) cursor = conn.cursor() sql = 'select url,spider_name,site,gid,module from dg_spider.dg_spider_post where status=0 and gid=%s limit 1' % gid try: cursor.execute(sql) result = cursor.fetchone() conn.commit() except Exception as e: print("***** exception") print(e) conn.rollback() if result is None: os._exit(0) else: url = result[0] spider_name = result[1] site = result[2] gid = result[3] module = result[4] return url.decode(), spider_name.decode(), site.decode(), gid.decode(), module.decode() def dbhandle_insert_content(url, title, content, user_id, has_img): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=False ) cur = conn.cursor() # 如果标题或者内容为空,那么程序将退出,篇文章将会作废并将status设置为1,爬虫继续向下运行获得新的URl if content.strip() == '' or title.strip() == '': sql_fail = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' % ('1', url) try: cur.execute(sql_fail) result = cur.fetchone() 
conn.commit() except Exception as e: print(e) conn.rollback() os._exit(0) sql = 'update dg_spider.dg_spider_post set title="%s",content="%s",user_id="%s",has_img="%s" where url="%s" ' \ % (title, content, user_id, has_img, url) try: cur.execute(sql) result = cur.fetchone() conn.commit() except Exception as e: print(e) conn.rollback() return result def dbhandle_update_status(url, status): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=False ) cur = conn.cursor() sql = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' \ % (status, url) try: cur.execute(sql) result = cur.fetchone() conn.commit() except Exception as e: print(e) conn.rollback() return result def dbhandle_get_content(url): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=False ) cursor = conn.cursor() sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=1 and url="%s" limit 1' % url try: cursor.execute(sql) result = cursor.fetchone() conn.commit() except Exception as e: print("***** exception") print(e) conn.rollback() if result is None: os._exit(1) title = result[0] content = result[1] user_id = result[2] gid = result[3] return title.decode(), content.decode(), user_id.decode(), gid.decode() # 获取爬虫初始化参数 def dbhandle_get_spider_param(url): host = '192.168.1.235' user = 'root' passwd = 'douguo2015' charset = 'utf8' conn = pymysql.connect( host=host, user=user, passwd=passwd, charset=charset, use_unicode=False ) cursor = conn.cursor() sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=0 and url="%s" limit 1' % url result = '' try: cursor.execute(sql) result = cursor.fetchone() conn.commit() except Exception as e: print("***** exception") print(e) conn.rollback() title = result[0] content = result[1] user_id = result[2] gid = result[3] return title.decode(), content.decode(), user_id.decode(), gid.decode() ================================================ FILE: notusedspiders/ContentSpider.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderPostItem from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status from DgSpiderPhantomJS.notusedspiders import contentSettings class DgContentSpider(scrapy.Spider): print('>>> Spider DgContentPhantomJSSpider Staring ...') # get url from db result = dbhandle_geturl(urlSettings.GROUP_ID) url = result[0] spider_name = result[1] site = result[2] gid = result[3] module = result[4] # set spider name name = contentSettings.SPIDER_NAME # name = 'DgUrlSpiderPhantomJS' # set domains allowed_domains = [contentSettings.DOMAIN] # set scrapy url start_urls = [url] # change status """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" dbhandle_update_status(url, 1) # scrapy crawl def parse(self, response): # init the item item = DgspiderPostItem() # get the page source sel = Selector(response) print(sel) # get post title title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) item['title'] = title_date.xpath('string(.)').extract() # get post page source item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() # get url item['url'] = DgContentSpider.url yield item 
================================================ FILE: notusedspiders/ContentSpider_real.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderPostItem from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status from DgSpiderPhantomJS.notusedspiders import contentSettings class DgContentSpider(scrapy.Spider): print('LOGS: Spider DgContentPhantomSpider Staring ...') # get url from db result = dbhandle_geturl(urlSettings.GROUP_ID) url = result[0] spider_name = result[1] site = result[2] gid = result[3] module = result[4] # set spider name name = contentSettings.SPIDER_NAME # name = 'DgUrlSpiderPhantomJS' # set domains allowed_domains = [contentSettings.DOMAIN] # set scrapy url start_urls = [url] # change status """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" dbhandle_update_status(url, 1) # scrapy crawl def parse(self, response): # init the item item = DgspiderPostItem() # get the page source sel = Selector(response) print(sel) # get post title title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) item['title'] = title_date.xpath('string(.)').extract() # get post page source item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() # get url item['url'] = DgContentSpider.url yield item ================================================ FILE: notusedspiders/DgContentSpider_PhantomJS.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderPostItem from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status from DgSpiderPhantomJS.notusedspiders import contentSettings class DgcontentspiderPhantomjsSpider(scrapy.Spider): print('>>> Spider DgContentPhantomJSSpider Staring ...') # get url from db result = dbhandle_geturl(urlSettings.GROUP_ID) url = result[0] spider_name = result[1] site = result[2] gid = result[3] module = result[4] # set spider name name = contentSettings.SPIDER_NAME # name = 'DgUrlSpiderPhantomJS' # set domains allowed_domains = [contentSettings.DOMAIN] # set scrapy url start_urls = [url] # change status """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" dbhandle_update_status(url, 1) # scrapy crawl def parse(self, response): # init the item item = DgspiderPostItem() # get the page source sel = Selector(response) print(sel) # get post title title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) item['title'] = title_date.xpath('string(.)').extract() # get post page source item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() # get url item['url'] = self.url yield item ================================================ FILE: notusedspiders/DgUrlSpider_PhantomJS.py ================================================ # -*- coding: utf-8 -*- import scrapy from DgSpiderPhantomJS.items import DgspiderUrlItem from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings class DgurlspiderPhantomjsSpider(scrapy.Spider): print('>>> Spider DgUrlPhantomJSSpider Staring ...') # set your spider name # name = urlSettings.SPIDER_NAME name = urlSettings.SPIDER_NAME # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = [urlSettings.URL_START] # scrapy crawl def parse(self, response): # init the 
item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item yield item ================================================ FILE: notusedspiders/PostHandle.py ================================================ # -*- coding: utf-8 -*- import json from DgSpiderPhantomJS.mysqlUtils import dbhandle_get_content from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status from DgSpiderPhantomJS.notusedspiders.uploadUtils import upload_post def post_handel(url): result = dbhandle_get_content(url) title = result[0] content = result[1] user_id = result[2] gid = result[3] cs = [] text_list = content.split('[dgimg]') for text_single in text_list: text_single_c = text_single.split('[/dgimg]') if len(text_single_c) == 1: cs_json = {"c": text_single_c[0], "i": '', "w": '', "h": ''} cs.append(cs_json) else: # tmp_img_upload_json = upload_img_result.pop() pic_flag = text_single_c[1] img_params = text_single_c[0].split(';') i = img_params[0] w = img_params[1] h = img_params[2] cs_json = {"c": pic_flag, "i": i, "w": w, "h": h} cs.append(cs_json) strcs = json.dumps(cs) json_data = {"apisign": "99ea3eda4b45549162c4a741d58baa60", "user_id": user_id, "gid": gid, "t": title, "cs": strcs} # 上传帖子 result_uploadpost = upload_post(json_data) # 更新状态2,成功上传帖子 result_updateresult = dbhandle_update_status(url, 2) # # if __name__ == '__main__': # post_handel('http://www.mama.cn/baby/art/20140523/773474.html') ================================================ FILE: notusedspiders/UrlSpider.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderUrlItem from DgSpiderPhantomJS.notusedspiders import contentSettings class DgUrlSpider(scrapy.Spider): print('LOGS: Spider DgUrlPhantomSpider Staring ...') # set your spider name name = contentSettings.SPIDER_NAME # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = [urlSettings.URL_START_JFSS] # scrapy crawl def parse(self, response): # init the item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item # transer item to pipeline yield item # for i in range(5): # yield Request(self.start_urls[0], callback=self.parse) ================================================ FILE: notusedspiders/check_post.py ================================================ import requests, re import http import urllib # 圈圈:孕妈育儿 4 # 圈圈:减肥瘦身 33 # 圈圈:情感生活 30 def checkPost(): # CREATE_POST_URL = "http://api.qa.douguo.net/robot/handlePost" CREATE_POST_URL = "http://api.douguo.net/robot/handlePost" fields={'group_id': '35', 'type': 1, 'apisign':'99ea3eda4b45549162c4a741d58baa60'} r = requests.post(CREATE_POST_URL, data=fields) 
print(r.json()) if __name__ == '__main__': #for i in range(1,50): #checkPost() checkPost() # print(i), #print(testText('aaaa\001')) ================================================ FILE: notusedspiders/contentSettings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for DgSpider project # 图片储存 IMAGES_STORE = 'D:\\pics\\jfss\\' # 爬取域名 DOMAIN = 'toutiao.com' # 图片域名前缀 DOMAIN_HTTP = "http:" # 随机发帖用户 CREATE_POST_USER = '37619,18441390,18441391,18441392,18441393,18441394,18441395,18441396,18441397,18441398,18441399,'\ '18441400,18441401,18441402,18441403,18441404, 18441405,18441406,18441407,18441408,18441409,' \ '18441410,18441411,18441412,18441413,18441414,18441415,18441416,18441417,18441418,18441419,' \ '18441420,18441421,18441422,18441423,18441424,18441425,18441426,18441427,18441428,18441429,' \ '18441430,18441431,18441432,18441433,18441434,18441435,18441436,18441437,18441438,18441439,' \ '18441440,18441441,18441442,18441443,18441444,18441445,18441446,18441447,18441448,18441449,' \ '18441450,18441451,18441452,18441453,18441454,18441455,18441456,18441457,18441458,18441460,' \ '18441461,18441462,18441463,18441464,18441465,18441466,18441467,18441468,18441469,18441470,' \ '18441471,18441472,18441473,18441474,18441475,18441476,18441477,18441478,18441479,18441481,' \ '18441482,18441483,18441484,18441485,18441486,18441487,18441488,18441489,18441490' # 爬虫名 SPIDER_NAME = 'DgContentSpider_PhantomJS' # 文章URL爬取规则XPATH POST_TITLE_XPATH = '//h1[@class="article-title"]' POST_CONTENT_XPATH = '//div[@class="article-content"]' ================================================ FILE: notusedspiders/params.js ================================================ function getParam(){ var asas; var cpcp; var t = Math.floor((new Date).getTime() / 1e3) , e = t.toString(16).toUpperCase() , i = md5(t).toString().toUpperCase(); if (8 != e.length){ asas = "479BB4B7254C150"; cpcp = "7E0AC8874BB0985"; }else{ for (var n = i.slice(0, 5), o = i.slice(-5), a = "", s = 0; 5 > s; s++){ a += n[s] + e[s]; } for (var r = "", c = 0; 5 > c; c++){ r += e[c + 3] + o[c]; } asas = "A1" + a + e.slice(-3); cpcp= e.slice(0, 3) + r + "E1"; } return '{"as":"'+asas+'","cp":"'+cpcp+'"}'; } !function(e) { "use strict"; function t(e, t) { var n = (65535 & e) + (65535 & t) , r = (e >> 16) + (t >> 16) + (n >> 16); return r << 16 | 65535 & n } function n(e, t) { return e << t | e >>> 32 - t } function r(e, r, o, i, a, u) { return t(n(t(t(r, e), t(i, u)), a), o) } function o(e, t, n, o, i, a, u) { return r(t & n | ~t & o, e, t, i, a, u) } function i(e, t, n, o, i, a, u) { return r(t & o | n & ~o, e, t, i, a, u) } function a(e, t, n, o, i, a, u) { return r(t ^ n ^ o, e, t, i, a, u) } function u(e, t, n, o, i, a, u) { return r(n ^ (t | ~o), e, t, i, a, u) } function s(e, n) { e[n >> 5] |= 128 << n % 32, e[(n + 64 >>> 9 << 4) + 14] = n; var r, s, c, l, f, p = 1732584193, d = -271733879, h = -1732584194, m = 271733878; for (r = 0; r < e.length; r += 16) s = p, c = d, l = h, f = m, p = o(p, d, h, m, e[r], 7, -680876936), m = o(m, p, d, h, e[r + 1], 12, -389564586), h = o(h, m, p, d, e[r + 2], 17, 606105819), d = o(d, h, m, p, e[r + 3], 22, -1044525330), p = o(p, d, h, m, e[r + 4], 7, -176418897), m = o(m, p, d, h, e[r + 5], 12, 1200080426), h = o(h, m, p, d, e[r + 6], 17, -1473231341), d = o(d, h, m, p, e[r + 7], 22, -45705983), p = o(p, d, h, m, e[r + 8], 7, 1770035416), m = o(m, p, d, h, e[r + 9], 12, -1958414417), h = o(h, m, p, d, e[r + 10], 17, -42063), d = o(d, h, m, p, e[r + 11], 22, -1990404162), p = o(p, d, 
h, m, e[r + 12], 7, 1804603682), m = o(m, p, d, h, e[r + 13], 12, -40341101), h = o(h, m, p, d, e[r + 14], 17, -1502002290), d = o(d, h, m, p, e[r + 15], 22, 1236535329), p = i(p, d, h, m, e[r + 1], 5, -165796510), m = i(m, p, d, h, e[r + 6], 9, -1069501632), h = i(h, m, p, d, e[r + 11], 14, 643717713), d = i(d, h, m, p, e[r], 20, -373897302), p = i(p, d, h, m, e[r + 5], 5, -701558691), m = i(m, p, d, h, e[r + 10], 9, 38016083), h = i(h, m, p, d, e[r + 15], 14, -660478335), d = i(d, h, m, p, e[r + 4], 20, -405537848), p = i(p, d, h, m, e[r + 9], 5, 568446438), m = i(m, p, d, h, e[r + 14], 9, -1019803690), h = i(h, m, p, d, e[r + 3], 14, -187363961), d = i(d, h, m, p, e[r + 8], 20, 1163531501), p = i(p, d, h, m, e[r + 13], 5, -1444681467), m = i(m, p, d, h, e[r + 2], 9, -51403784), h = i(h, m, p, d, e[r + 7], 14, 1735328473), d = i(d, h, m, p, e[r + 12], 20, -1926607734), p = a(p, d, h, m, e[r + 5], 4, -378558), m = a(m, p, d, h, e[r + 8], 11, -2022574463), h = a(h, m, p, d, e[r + 11], 16, 1839030562), d = a(d, h, m, p, e[r + 14], 23, -35309556), p = a(p, d, h, m, e[r + 1], 4, -1530992060), m = a(m, p, d, h, e[r + 4], 11, 1272893353), h = a(h, m, p, d, e[r + 7], 16, -155497632), d = a(d, h, m, p, e[r + 10], 23, -1094730640), p = a(p, d, h, m, e[r + 13], 4, 681279174), m = a(m, p, d, h, e[r], 11, -358537222), h = a(h, m, p, d, e[r + 3], 16, -722521979), d = a(d, h, m, p, e[r + 6], 23, 76029189), p = a(p, d, h, m, e[r + 9], 4, -640364487), m = a(m, p, d, h, e[r + 12], 11, -421815835), h = a(h, m, p, d, e[r + 15], 16, 530742520), d = a(d, h, m, p, e[r + 2], 23, -995338651), p = u(p, d, h, m, e[r], 6, -198630844), m = u(m, p, d, h, e[r + 7], 10, 1126891415), h = u(h, m, p, d, e[r + 14], 15, -1416354905), d = u(d, h, m, p, e[r + 5], 21, -57434055), p = u(p, d, h, m, e[r + 12], 6, 1700485571), m = u(m, p, d, h, e[r + 3], 10, -1894986606), h = u(h, m, p, d, e[r + 10], 15, -1051523), d = u(d, h, m, p, e[r + 1], 21, -2054922799), p = u(p, d, h, m, e[r + 8], 6, 1873313359), m = u(m, p, d, h, e[r + 15], 10, -30611744), h = u(h, m, p, d, e[r + 6], 15, -1560198380), d = u(d, h, m, p, e[r + 13], 21, 1309151649), p = u(p, d, h, m, e[r + 4], 6, -145523070), m = u(m, p, d, h, e[r + 11], 10, -1120210379), h = u(h, m, p, d, e[r + 2], 15, 718787259), d = u(d, h, m, p, e[r + 9], 21, -343485551), p = t(p, s), d = t(d, c), h = t(h, l), m = t(m, f); return [p, d, h, m] } function c(e) { var t, n = ""; for (t = 0; t < 32 * e.length; t += 8) n += String.fromCharCode(e[t >> 5] >>> t % 32 & 255); return n } function l(e) { var t, n = []; for (n[(e.length >> 2) - 1] = void 0, t = 0; t < n.length; t += 1) n[t] = 0; for (t = 0; t < 8 * e.length; t += 8) n[t >> 5] |= (255 & e.charCodeAt(t / 8)) << t % 32; return n } function f(e) { return c(s(l(e), 8 * e.length)) } function p(e, t) { var n, r, o = l(e), i = [], a = []; for (i[15] = a[15] = void 0, o.length > 16 && (o = s(o, 8 * e.length)), n = 0; 16 > n; n += 1) i[n] = 909522486 ^ o[n], a[n] = 1549556828 ^ o[n]; return r = s(i.concat(l(t)), 512 + 8 * t.length), c(s(a.concat(r), 640)) } function d(e) { var t, n, r = "0123456789abcdef", o = ""; for (n = 0; n < e.length; n += 1) t = e.charCodeAt(n), o += r.charAt(t >>> 4 & 15) + r.charAt(15 & t); return o } function h(e) { return unescape(encodeURIComponent(e)) } function m(e) { return f(h(e)) } function g(e) { return d(m(e)) } function v(e, t) { return p(h(e), h(t)) } function y(e, t) { return d(v(e, t)) } function b(e, t, n) { return t ? n ? v(t, e) : y(t, e) : n ? m(e) : g(e) } "function" == typeof define && define.amd ? 
define("static/js/lib/md5", ["require"], function() { return b }) : "object" == typeof module && module.exports ? module.exports = b : e.md5 = b }(this) ================================================ FILE: notusedspiders/uploadUtils.py ================================================ import requests from requests_toolbelt.multipart.encoder import MultipartEncoder def upload_post(json_data): # 上传帖子 ,参考:http://192.168.2.25:3000/api/interface/2016 # create_post_url = "http://api.qa.douguo.net/robot/uploadimagespost" create_post_url = "http://api.douguo.net/robot/uploadimagespost" # 传帖子 # dataJson = json.dumps({"user_id":"19013245","gid":30,"t":"2017-03-23","cs":[{"c":"啦啦啦","i":"","w":0,"h":0}, # {"c":"啦啦啦2222","i":"http://wwww.douguo.com/abc.jpg","w":0,"h":0}],"time":1235235234}) # jsonData = {"user_id":"19013245","gid":5,"t":"TEST","cs":'[{"c":"啊啊啊","i":"qqq","w":12,"h":10}, # {"c":"这个内容真不错","i":"http://wwww.baidu.com","w":10,"h":10}]',"time":61411313} # print(jsonData) req_post = requests.post(create_post_url, data=json_data) print(req_post.json()) # print(reqPost.text) def uploadImage(img_path, content_type, user_id): # 上传单个图片 , 参考:http://192.168.2.25:3000/api/interface/2015 # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage" UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage" # 传图片 m = MultipartEncoder( # fields={'user_id': '192323', # 'images': ('filename', open(imgPath, 'rb'), 'image/JPEG')} fields={'user_id': user_id, 'apisign': '99ea3eda4b45549162c4a741d58baa60', 'image': ('filename', open(img_path, 'rb'), 'image/jpeg')} ) r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type}) print(r.json()) # print(r.text) return r.json() # return r.text ================================================ FILE: notusedspiders/utils.py ================================================ import time import datetime ================================================ FILE: pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html import datetime from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.mysqlUtils import dbhandle_online from DgSpiderPhantomJS.commonUtils import get_linkmd5id class DgspiderphantomjsPipeline(object): def __init__(self): pass # process the data def process_item(self, item, spider): # get mysql connettion db_object = dbhandle_online() cursor = db_object.cursor() print(">>>>> Spider name :") print(spider.name) for url in item['url']: linkmd5id = get_linkmd5id(url) if spider.name == urlSettings.SPIDER_JFSS: spider_name = urlSettings.SPIDER_JFSS gid = urlSettings.GROUP_ID_JFSS elif spider.name == urlSettings.SPIDER_MSZT: spider_name = urlSettings.SPIDER_MSZT gid = urlSettings.GROUP_ID_MSZT elif spider.name == urlSettings.SPIDER_SYDW: spider_name = urlSettings.SPIDER_SYDW gid = urlSettings.GROUP_ID_SYDW elif spider.name == urlSettings.SPIDER_YLBG: spider_name = urlSettings.SPIDER_YLBG gid = urlSettings.GROUP_ID_YLBG elif spider.name == urlSettings.SPIDER_YMYE: spider_name = urlSettings.SPIDER_YMYE gid = urlSettings.GROUP_ID_YMYE module = urlSettings.MODULE site = urlSettings.DOMAIN create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') status = '0' sql_search = 'select md5_url from dg_spider.dg_spider_post where md5_url="%s"' % linkmd5id sql = 'insert into dg_spider.dg_spider_post(md5_url, url, spider_name, site, gid, 
module, status, ' \ 'create_time) ' \ 'values("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")' \ % (linkmd5id, url, spider_name, site, gid, module, status, create_time) try: # if url is not existed, then insert cursor.execute(sql_search) result_search = cursor.fetchone() if result_search is None or result_search[0].strip() == '': cursor.execute(sql) result = cursor.fetchone() db_object.commit() except Exception as e: print("Waring!: catch exception !") print(e) db_object.rollback() return item # spider开启时被调用 def open_spider(self, spider): pass # sipder 关闭时被调用 def close_spider(self, spider): pass ================================================ FILE: settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for dg-spider-phantomJS project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # http://doc.scrapy.org/en/latest/topics/settings.html # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html BOT_NAME = 'dg-spider-phantomJS' SPIDER_MODULES = ['dg-spider-phantomJS.spiders'] NEWSPIDER_MODULE = 'dg-spider-phantomJS.spiders' # 注册PIPELINES ITEM_PIPELINES = { 'dg-spider-phantomJS.pipelines.DgspiderphantomjsPipeline': 544 } DOWNLOADER_MIDDLEWARES = { 'dg-spider-phantomJS.middlewares.middleware.JavaScriptMiddleware': 543, # 键为中间件类的路径,值为中间件的顺序 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, # 禁止内置的中间件 } USER_AGENTS = [ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" ] COMMANDS_MODULE = 'dg-spider-phantomJS.commands' # # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'DgSpiderPhantomJS (+http://www.yourdomain.com)' # Obey robots.txt rules # ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # 设置下载延迟 # DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) COOKIES_ENABLED = True # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 
'Accept-Language': 'en', #} # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'dg-spider-phantomJS.middlewares.DgspiderphantomjsSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'dg-spider-phantomJS.middlewares.MyCustomDownloaderMiddleware': 543, #} # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'dg-spider-phantomJS.pipelines.DgspiderphantomjsPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: setup.py ================================================ from setuptools import setup, find_packages setup(name='scrapy-mymodule', entry_points={ 'scrapy.commands': [ 'crawlall=cnblogs.commands:crawlall', ], }, ) ================================================ FILE: spiders/UrlSpider_JFSH.py ================================================ # -*- coding: utf-8 -*- import scrapy from DgSpiderPhantomJS.items import DgspiderUrlItem from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings class UrlspiderJfshSpider(scrapy.Spider): name = "UrlSpider_JFSS" # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = [urlSettings.URL_START_JFSS] # scrapy crawl def parse(self, response): print("LOGS: Starting spider JFSS ...") # init the item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item # transer item to pipeline yield item ================================================ FILE: spiders/UrlSpider_MSZT.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderUrlItem class UrlspiderMsztSpider(scrapy.Spider): name = "UrlSpider_MSZT" # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = 
[urlSettings.URL_START_MSZT] # scrapy crawl def parse(self, response): print("LOGS: Starting spider MSZT ...") # init the item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item # transer item to pipeline yield item ================================================ FILE: spiders/UrlSpider_SYDW.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderUrlItem class UrlspiderSydwSpider(scrapy.Spider): name = "UrlSpider_SYDW" # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = [urlSettings.URL_START_SYDW] # scrapy crawl def parse(self, response): print("LOGS: Starting spider SYDW ...") # init the item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item # transer item to pipeline yield item ================================================ FILE: spiders/UrlSpider_YLBG.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderUrlItem class UrlspiderYlbgSpider(scrapy.Spider): name = "UrlSpider_YLBG" # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = [urlSettings.URL_START_YLBG] # scrapy crawl def parse(self, response): print("LOGS: Starting spider YLBG ...") # init the item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item # transer item to pipeline yield item ================================================ FILE: spiders/UrlSpider_YMYE.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.selector import Selector from DgSpiderPhantomJS import urlSettings from DgSpiderPhantomJS.items import DgspiderUrlItem class UrlspiderYmyeSpider(scrapy.Spider): name = "UrlSpider_YMYE" # set your allowed domain allowed_domains = [urlSettings.DOMAIN] # set spider start url start_urls = [urlSettings.URL_START_YMYE] # scrapy crawl def parse(self, response): print("LOGS: Starting spider YMYE ...") # init the item item = DgspiderUrlItem() # get the page source sel = Selector(response) # page_source = self.page url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() # if the url you 
got had some prefix, it will works, such as 'http://' url_item = [] for url in url_list: url = url.replace(urlSettings.URL_PREFIX, '') url_item.append(urlSettings.URL_PREFIX + url) # use set to del repeated urls url_item = list(set(url_item)) item['url'] = url_item # transer item to pipeline yield item # for i in range(5): # yield Request(self.start_urls[0], callback=self.parse) ================================================ FILE: spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: test.py ================================================ import datetime import sys, shelve, time, execjs # import PyV8 # create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # print(create_time) def initDriverPool(): create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') time_array = time.strptime(create_time, "%Y-%m-%d %H:%M:%S") time_stamp = int(time.mktime(time_array)) print(time_stamp) def execjs(): js_str = open('D:\Scrapy\DgSpiderPhantomJS\DgSpiderPhantomJS\params.js').read() a = execjs.compile(js_str).call('getParam') # a = execjs.eval(js_str3) print(a) # def js(self): # ctxt = PyV8.JSContext() # ctxt.enter() # func = ctxt.eval('''(function(){return '###'})''') # print(func) if __name__=='__main__': execjs() ================================================ FILE: urlSettings.py ================================================ # -*- coding: utf-8 -*- """爬取域名""" DOMAIN = 'toutiao.com' """圈子列表""" # 减肥瘦身 GROUP_ID_JFSS = '33' # 情感生活 GROUP_ID_QQSH = '30' # 营养专家 GROUP_ID_YYZJ = '35' # 孕妈育儿 GROUP_ID_YMYE = '4' # 深夜豆文 GROUP_ID_SYDW = '37' # 美食杂谈 GROUP_ID_MSZT = '24' # 娱乐八卦 GROUP_ID_YLBG = '38' """爬虫列表""" SPIDER_JFSS = 'UrlSpider_JFSS' SPIDER_QQSH = 'UrlSpider_QQSH' SPIDER_YYZJ = 'UrlSpider_YYZJ' SPIDER_YMYE = 'UrlSpider_YMYE' SPIDER_SYDW = 'UrlSpider_SYDW' SPIDER_MSZT = 'UrlSpider_MSZT' SPIDER_YLBG = 'UrlSpider_YLBG' MODULE = '999' # url 前缀 URL_PREFIX = 'http://www.toutiao.com' # 爬取起始页 URL_START_JFSS = 'http://www.toutiao.com/ch/news_regimen/' URL_START_YMYE = 'http://www.toutiao.com/ch/news_baby/' URL_START_SYDW = 'http://www.toutiao.com/ch/news_essay/' URL_START_MSZT = 'http://www.toutiao.com/ch/news_food/' URL_START_YLBG = 'http://www.toutiao.com/ch/news_entertainment/' """静态页爬取规则""" # # 文章列表页起始爬取URL # START_LIST_URL = 'http://www.eastlady.cn/emotion/pxgx/1.html' # # # 文章列表循环规则 # LIST_URL_RULER_PREFIX = 'http://www.eastlady.cn/emotion/pxgx/' # LIST_URL_RULER_SUFFIX = '.html' # LIST_URL_RULER_LOOP = 30 # # # 文章URL爬取规则XPATH # POST_URL_XPATH = '//div[@class="article_list"]/ul/li/span[1]/a[last()]/@href' """今日头条-动态JS/Ajax爬取规则""" POST_URL_PHANTOMJS_XPATH = '//div[@class="title-box"]/a/@href' ================================================ FILE: webBrowserPools/ghostdriver.log ================================================ [INFO - 2017-05-08T02:11:33.071Z] GhostDriver - Main - running on port 13763 [INFO - 2017-05-08T02:11:36.561Z] Session [aa201d90-3393-11e7-8f82-03c3e0612c46] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":false,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","webSecurityEnabled":true} [INFO - 
2017-05-08T02:11:36.561Z] Session [aa201d90-3393-11e7-8f82-03c3e0612c46] - page.customHeaders: - {} [INFO - 2017-05-08T02:11:36.562Z] Session [aa201d90-3393-11e7-8f82-03c3e0612c46] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","phantomjs.page.settings.loadImages":false} [INFO - 2017-05-08T02:11:36.562Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: aa201d90-3393-11e7-8f82-03c3e0612c46 ================================================ FILE: webBrowserPools/pool.py ================================================ # douguo object pool # for the page which loaded by js/ajax # ang changes should be recored here: # # @author zhangjianfei # @date 2017/05/08 from selenium import webdriver from scrapy.http import HtmlResponse from selenium.webdriver.common.desired_capabilities import DesiredCapabilities import time import random import os import DgSpiderPhantomJS.settings as settings import pickle def save_driver(): dcap = dict(DesiredCapabilities.PHANTOMJS) dcap["phantomjs.page.settings.userAgent"] = (random.choice(settings.USER_AGENTS)) dcap["phantomjs.page.settings.loadImages"] = False driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe", desired_capabilities=dcap) fn = open('D:\driver.pkl', 'w') # with open(fn, 'w') as f: pickle.dump(driver, fn, 0) fn.close() def get_driver(): fn = 'D:\driver.pkl' with open(fn, 'r') as f: driver = pickle.load(f) return driver if __name__ == '__main__': save_driver()
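pool.py serialises a live PhantomJS WebDriver with pickle, but a WebDriver wraps an open connection to a running browser process, so an unpickled copy does not give back a usable session (and the pickle file is opened in text rather than binary mode). A sketch of an in-process alternative that simply keeps drivers alive in a queue; the executable path and settings module are the ones used elsewhere in this repository, while the class and pool size are illustrative:

```python
# driverpool.py -- keep a small pool of live PhantomJS drivers inside one process
# (a sketch, not repository code)
import random
from queue import Queue

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import DgSpiderPhantomJS.settings as settings


class DriverPool(object):
    def __init__(self, size=3):
        self._pool = Queue()
        for _ in range(size):
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = random.choice(settings.USER_AGENTS)
            dcap["phantomjs.page.settings.loadImages"] = False
            driver = webdriver.PhantomJS(
                executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe",
                desired_capabilities=dcap)
            self._pool.put(driver)

    def acquire(self):
        # blocks until a driver is free
        return self._pool.get()

    def release(self, driver):
        self._pool.put(driver)

    def close(self):
        while not self._pool.empty():
            self._pool.get().quit()
```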