Repository: AlexTan-b-z/ZhihuSpider
Branch: V2.0
Commit: 7f35d157fa7f
Files: 28
Total size: 82.3 KB

Directory structure:
gitextract_sqz78djh/
├── .gitignore
├── LICENSE
├── README.md
└── zhihu/
    ├── scrapy.cfg
    └── zhihu/
        ├── __init__.py
        ├── cookie.py
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── proxy.py
        ├── scrapy_redis/
        │   ├── BloomfilterOnRedis.py
        │   ├── __init__.py
        │   ├── connection.py
        │   ├── defaults.py
        │   ├── dupefilter.py
        │   ├── picklecompat.py
        │   ├── pipelines.py
        │   ├── queue.py
        │   ├── scheduler.py
        │   ├── spiders.py
        │   ├── tests.py
        │   └── utils.py
        ├── settings.py
        ├── spiders/
        │   ├── __init__.py
        │   ├── zhihuspider.py
        │   └── zhihuspider0.py
        ├── user_agents_pc.py
        └── yumdama.py

================================================ FILE CONTENTS ================================================

================================================ FILE: .gitignore ================================================

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

================================================ FILE: LICENSE ================================================

MIT License

Copyright (c) 2017 AlexTan-b-z

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================ FILE: README.md ================================================

# Zhihu Spider (with Scrapy's default settings, a single machine can crawl more than 600,000 records per hour)

***

*Version*: 2.0

*Author*: AlexTan

*CSDN*: [AlexTan_](http://blog.csdn.net/alextan_)

*E-Mail*:

***

## Original blog post: [ZhihuSpider](http://blog.csdn.net/AlexTan_/article/details/77057068)

## Changelog:

* 2017.12.18: v2.0. Reworked the spider to fix the following problem: when the crawler had run for a long time and, for various special reasons, drained the pending-requests queue in Redis, restarting it left every request produced by start_requests filtered out by the dupefilter.
* 2017.11.21: v2.0. Optimized proxy.py so that an IP's weight can never exceed 10, avoiding the case where a weight grows without bound and an IP that has gone bad takes a very long time to be removed.
* 2017.10.08: v2.0. Optimized the IP proxy pool middleware (the Zhihu spider itself does not need it; the middleware can be ported to other crawlers, so ignore this if you only care about Zhihu). The previously used proxy service expired, so this version uses Xdaili (讯代理), which works noticeably better, with roughly a 95% success rate. The downside is that the premium plan only returns 20 IPs per request and at most 1,000 per day. The old IP-rotation code mistakenly deleted many IPs that were still valid, so each IP now carries a weight (status): the default weight is 10, a failed request subtracts 1, a successful request adds 1, and an IP is deleted once its weight drops below 1.
* 2017.08.22: Updated both the pipeline and the spider files in all three versions. Previously, when a RelationItem was inserted into MongoDB, the "next"-page data could be appended to either the followers or the followees list at random, corrupting the data; this is now fixed. Also, some users reported that fetching proxy IPs blocked the crawler when the proxy was enabled, so IP fetching now runs in a separate thread, which removes the blocking.
* 2017.08.17: v2.0. Optimized scrapy_redis by changing its deduplication mechanism (a Bloom filter was added). Reason for the update: v1.0 filled the memory of a 16 GB server after two to three days of running. After the update, v2.0 uses only about 2-3 GB of memory after three days, and the usage barely grows.

## About Redis:

For long-running crawls, it is recommended to adjust redis.conf, which on Ubuntu lives at `/etc/redis/redis.conf` by default:

1. Set maxmemory to 3/4 of your RAM.
2. Set maxmemory-policy to allkeys-lru.

### Finally, it is recommended to run with several accounts; roughly 7 or 8 should be enough.

## Original blog post: [ZhihuSpider](http://blog.csdn.net/AlexTan_/article/details/77057068)

***

Finally, questions and feedback are welcome. Let's learn together!

================================================ FILE: zhihu/scrapy.cfg ================================================

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = zhihu.settings

[deploy]
#url = http://localhost:6800/
project = zhihu

================================================ FILE: zhihu/zhihu/__init__.py ================================================

================================================ FILE: zhihu/zhihu/cookie.py ================================================

#encoding=utf8 import pdb import os import time import json from selenium import webdriver from selenium.webdriver.common.desired_capabilities import DesiredCapabilities import logging from .yumdama import identify # ------------------------------------------ # Version: 1.0 # Date: 2017-8-06 # Author: AlexTan # # # ------------------------------------------ dcap = dict(DesiredCapabilities.PHANTOMJS) dcap["phantomjs.page.settings.userAgent"] = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36" ) logger = logging.getLogger(__name__) logging.getLogger("selenium").setLevel(logging.WARNING) # set selenium's log level to WARNING; it is far too noisy otherwise METHOD = 0 # 0 = type captchas in manually, 1 = use the YunDaMa captcha-solving service myZhiHu = [ ('account','password',0), # 0 = the account is a phone number, 1 = the account is an email address ] def getCookie(account,password,way): if way == 0: loginURL = "https://www.zhihu.com/login/phone_num" username = 'phone_num' else: loginURL = "https://www.zhihu.com/login/email" username = 'email' try: browser = webdriver.PhantomJS(desired_capabilities=dcap) #browser = webdriver.Firefox() browser.set_window_size(1920, 1080) browser.get("https://www.zhihu.com/explore") time.sleep(1) #pdb.set_trace() browser.find_element_by_class_name('switch-to-login').click() loginDIV = browser.find_element_by_id('SidebarSignFlow').find_element_by_class_name('LoginForm') loginDIV.find_element_by_name('account').send_keys(account) loginDIV.find_element_by_name('password').send_keys(password) time.sleep(1) while True: browser.save_screenshot("zhihu.png") if
loginDIV.find_element_by_class_name('captcha-module').get_attribute('style') != '': if METHOD == 0: code_txt = input("请查看路径下新生成的zhihu.png,然后输入验证码:") else: img = loginDIV.find_element_by_class_name('captcha') x = img.location["x"] y = img.location["y"] from PIL import Image im = Image.open("zhihu.png") im.crop((x, y, 85 + x, y + 30)).save("captcha.png") #pdb.set_trace() code_txt = identify() loginDIV.find_element_by_name('captcha').send_keys(code_txt) loginDIV.find_element_by_class_name('zg-btn-blue').click() time.sleep(3) try: loginDIV.find_element_by_class_name('error') logger.warning("验证码或账号密码错误 %s!" % account) except: break try: #pdb.set_trace() browser.find_element_by_class_name('top-nav-profile') cookie = {} for elem in browser.get_cookies(): cookie[elem["name"]] = elem["value"] logger.warning("Get Cookie Success!( Account:%s )" % account) #pdb.set_trace() return json.dumps(cookie) except Exception: logger.warning("Failed %s!" % account) return "" except Exception: logger.warning("Failed %s!" % account) return "" finally: try: browser.quit() except Exception: pass def UpdateCookie(account,cookie): browser = webdriver.PhantomJS(desired_capabilities=dcap) #browser = webdriver.Firefox() browser.set_window_size(1920, 1080) browser.get('https://www.zhihu.com') browser.delete_all_cookies() send_cookie = [] for key,value in cookie.items(): one = {} one = {'domain':'.zhihu.com','name':key,'value':value,'path':'/','expiry':None} #pdb.set_trace() browser.add_cookie({k: one[k] for k in ('name', 'value', 'domain', 'path', 'expiry')}) #one = {'domain':'.zhihu.com','name':key,'value':value} #send_cookie.append(one) #browser.add_cookie(send_cookie) browser.get('https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84') time.sleep(1) browser.save_screenshot("update.png") if METHOD == 0: code_txt = input("请查看路径下新生成的update.png,然后输入验证码:") else: img = browser.find_element_by_class_name('Unhuman-captcha') x = img.location["x"] y = img.location["y"] from PIL import Image im = Image.open("zhihu.png") im.crop((x, y, 85 + x, y + 30)).save("captcha.png") #pdb.set_trace() code_txt = identify() browser.find_element_by_class_name('Input').send_keys(code_txt) browser.find_element_by_class_name('Button--blue').click() time.sleep(3) try: browser.find_element_by_class_name('AppHeader-profile') cookie = {} for elem in browser.get_cookies(): cookie[elem["name"]] = elem["value"] logger.warning("Update Cookie Success!( Account:%s )" % account) #pdb.set_trace() return json.dumps(cookie) except Exception: logger.warning("Update Failed %s!" 
% account) return "" finally: try: browser.quit() except Exception: pass def initCookie(rconn, spiderName): """ 获取所有账号的Cookies,存入Redis。如果Redis已有该账号的Cookie,则不再获取。 """ for zhihu in myZhiHu: if rconn.get("%s:Cookies:%s--%s" % (spiderName, zhihu[0], zhihu[1])) is None: # 'zhihuspider:Cookies:账号--密码',为None即不存在。 cookie = getCookie(zhihu[0], zhihu[1],zhihu[2]) if len(cookie) > 0: rconn.set("%s:Cookies:%s--%s" % (spiderName, zhihu[0], zhihu[1]), cookie) cookieNum = str(rconn.keys()).count("zhihuspider:Cookies") logger.warning("The num of the cookies is %s" % cookieNum) if cookieNum == 0: logger.warning('Stopping...') os.system("pause") def updateCookie(accountText, rconn, spiderName, cookie): """ 更新一个账号的Cookie """ account = accountText.split("--")[0] #pdb.set_trace() new_cookie = UpdateCookie(account, cookie) if len(new_cookie) > 0: logger.warning("The cookie of %s has been updated successfully!" % account) rconn.set("%s:Cookies:%s" % (spiderName, accountText), new_cookie) else: logger.warning("The cookie of %s updated failed! Remove it!" % accountText) removeCookie(accountText, rconn, spiderName) def removeCookie(accountText, rconn, spiderName): """ 删除某个账号的Cookie """ rconn.delete("%s:Cookies:%s" % (spiderName, accountText)) cookieNum = str(rconn.keys()).count("zhihuspider:Cookies") logger.warning("The num of the cookies left is %s" % cookieNum) if cookieNum == 0: logger.warning("Stopping...") os.system("pause") if __name__ == '__main__': getCookie(myZhiHu[0][0],myZhiHu[0][1],myZhiHu[0][2]) ================================================ FILE: zhihu/zhihu/items.py ================================================ # -*- coding: utf-8 -*- # ------------------------------------------ # 版本:1.0 # 日期:2017-8-06 # 作者:AlexTan # # # ------------------------------------------ import scrapy class ZhihuItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() user_id = scrapy.Field() user_image_url = scrapy.Field() name = scrapy.Field() locations = scrapy.Field() business = scrapy.Field() #所在行业 employments = scrapy.Field() #职业经历 gender = scrapy.Field() education = scrapy.Field() followees_num = scrapy.Field() #我关注的人数 followers_num = scrapy.Field() #关注我的人数 class RelationItem(scrapy.Item): user_id = scrapy.Field() relation_type = scrapy.Field() #关系类型 relations_id = scrapy.Field() class AnswerItem(scrapy.Item): answer_user_id = scrapy.Field() answer_id = scrapy.Field() question_id = scrapy.Field() cretated_time = scrapy.Field() updated_time = scrapy.Field() voteup_count = scrapy.Field() comment_count = scrapy.Field() content = scrapy.Field() class QuestionItem(scrapy.Item): ask_user_id = scrapy.Field() question_id = scrapy.Field() ask_time = scrapy.Field() answer_count = scrapy.Field() followees_count = scrapy.Field() title = scrapy.Field() class ArticleItem(scrapy.Item): author_id = scrapy.Field() title = scrapy.Field() article_id = scrapy.Field() content = scrapy.Field() cretated_time = scrapy.Field() updated_time = scrapy.Field() voteup_count = scrapy.Field() comment_count = scrapy.Field() ================================================ FILE: zhihu/zhihu/middlewares.py ================================================ # -*- coding: utf-8 -*- import logging import telnetlib import random import redis import json import os import threading import pdb from scrapy import signals from .user_agents_pc import agents from .proxy import initIPPOOLS, updateIPPOOLS from .cookie import initCookie, updateCookie, removeCookie from scrapy.utils.response import response_status_message from 
scrapy.downloadermiddlewares.retry import RetryMiddleware from scrapy.exceptions import IgnoreRequest # ------------------------------------------ # 版本:1.0 # 日期:2017-8-06 # 作者:AlexTan # # # ------------------------------------------ logger = logging.getLogger(__name__) class UserAgentMiddleware(object): """ 换User-Agent """ def process_request(self, request, spider): agent = random.choice(agents) request.headers["User-Agent"] = agent class ProxyMiddleware(RetryMiddleware): '''IP代理''' def __init__(self, settings, crawler): #自己获取的ip self.TIMES = 10 RetryMiddleware.__init__(self, settings) self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379))) #initIPPOOLS(self.rconn) @classmethod def from_crawler(cls, crawler): return cls(crawler.settings, crawler) def process_request(self,request,spider): #pdb.set_trace() ipNum=len(self.rconn.keys('IP*')) #pdb.set_trace() if ipNum<50: proxy_thread = threading.Thread(target= initIPPOOLS,args = (self.rconn,)) proxy_thread.setDaemon(True) proxy_thread.start() #initIPPOOLS(self.rconn) if self.TIMES == 3: baseIP=random.choice(self.rconn.keys('IP:*')) ip=str(baseIP,'utf-8').replace('IP:','') try: IP,PORT,status=ip.split(':') request.meta['status'] = status telnetlib.Telnet(IP,port=PORT,timeout=2) #测试ip是否有效 except: logger.warning("The ip is not available !( IP:%s )" % ip) updateIPPOOLS(self.rconn,IP+':'+PORT,status) else: #pdb.set_trace() self.IP = "http://" + IP + ':' + PORT logger.warning("The current IP is %s!" % self.IP) self.TIMES = 0 updateIPPOOLS(self.rconn,IP+':'+PORT,status,1) #pdb.set_trace() else: self.TIMES += 1 #pdb.set_trace() if self.IP is not "": request.meta["proxy"] = self.IP def process_response(self,request,response,spider): if response.status in [400,403,404,429,500,502,503,504]: self.TIMES = 3 logger.error("%s! error..." 
% response.status) #pdb.set_trace() try: updateIPPOOLS(self.rconn,request.meta['proxy'].replace('http://',''),request.meta['status'],-1) except: pass reason = response_status_message(response.status) return self._retry(request, reason, spider) or response # 重试 else: return response def process_exception(self, request, exception, spider): #pdb.set_trace() self.TIMES = 3 try: updateIPPOOLS(self.rconn,request.meta['proxy'].replace('http://',''),request.meta['status'],-1) except: pass return request class CookiesMiddleware(RetryMiddleware): """ 维护Cookie """ def __init__(self, settings, crawler): RetryMiddleware.__init__(self, settings) self.rconn = settings.get("RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379))) initCookie(self.rconn, crawler.spider.name) @classmethod def from_crawler(cls, crawler): return cls(crawler.settings, crawler) def process_request(self, request, spider): redisKeys = self.rconn.keys() while len(redisKeys) > 0: elem = random.choice(redisKeys) #pdb.set_trace() if b'zhihuspider:Cookies' in elem: #pdb.set_trace() elem = str(elem,'utf-8') cookie = json.loads(str(self.rconn.get(elem),'utf-8')) request.cookies = cookie request.meta["accountText"] = elem.split("Cookies:")[-1] break else: #pdb.set_trace() redisKeys.remove(elem) def process_response(self, request, response, spider): #pdb.set_trace() reason = response_status_message(response.status) if response.status in [300, 301, 302, 303]: pdb.set_trace() if reason == '301 Moved Permanently': return self._retry(request, reason, spider) or response # 重试 else: raise IgnoreRequest elif response.status in [403, 414]: logger.error("%s! Stopping..." % response.status) os.system("pause") updateCookie(request.meta['accountText'], self.rconn, spider.name, request.cookies) return self._retry(request, reason, spider) or response # 重试 else: return response ================================================ FILE: zhihu/zhihu/pipelines.py ================================================ # -*- coding: utf-8 -*- import pymongo import pdb from .items import ZhihuItem,RelationItem,AnswerItem,QuestionItem,ArticleItem # ------------------------------------------ # 版本:1.0 # 日期:2017-8-06 # 作者:AlexTan # # # ------------------------------------------ class ZhihuPipeline(object): def __init__(self, mongo_uri, mongo_db): self.mongo_uri = mongo_uri self.mongo_db = mongo_db @classmethod def from_crawler(cls,crawler): return cls( mongo_uri = crawler.settings.get('MONGO_URI'), mongo_db = crawler.settings.get('MONGO_DATABASE','zhihu') ) def open_spider(self,spider): self.client = pymongo.MongoClient(self.mongo_uri) self.db = self.client[self.mongo_db] def close_spider(self,spider): self.client.close() def process_item(self, item, spider): if isinstance(item, ZhihuItem): self._process_user_item(item) elif isinstance(item, AnswerItem): self._process_answer_item(item) elif isinstance(item, QuestionItem): self._process_question_item(item) elif isinstance(item, ArticleItem): self._process_article_item(item) else: #pdb.set_trace() self._process_relation_item(item) return item def _process_user_item(self,item): self.db.UserInfo.insert(dict(item)) def _process_relation_item(self,item): try: isnext,relation_type = item['relation_type'].split(':') if isnext == 'next': for one in item['relations_id']: #pdb.set_trace() self.db.Relation.update({'user_id':item['user_id'],'relation_type':relation_type},{"$push":{'relations_id':one}}) except: self.db.Relation.insert(dict(item)) def _process_answer_item(self,item): 
self.db.AnswerInfo.insert(dict(item)) def _process_question_item(self,item): self.db.QuestionInfo.insert(dict(item)) def _process_article_item(self,item): self.db.ArticleInfo.insert(dict(item)) ================================================ FILE: zhihu/zhihu/proxy.py ================================================ # encoding=utf-8 import telnetlib import urllib import logging # ------------------------------------------ # 版本:1.0 # 日期:2017-8-06 # 作者:AlexTan # # # ------------------------------------------ logger = logging.getLogger(__name__) IPPOOLNUM=20 #一次性从网页获取的IP数量 def GetIPPOOLS(num): #大象代理买的ip,5元20000个,每十个差不多有一个能用 IPPOOL=urllib.request.urlopen("http://tpv.daxiangdaili.com/ip/?tid=559480480576119&num="+str(num)+"&operator=1&filter=on&protocol=http&category=2&delay=1").read().decode("utf-8","ignore").split('\r\n') ''' #自己获取的ip IPPOOLS1=urllib.request.urlopen("http://127.0.0.1:8000/?types=0&count=20&country=%E5%9B%BD%E5%86%85").read().decode("utf-8",'ignore') IPPOOLS2=re.findall('\"(\d+\.\d+\.\d+\.\d+\"\,\s*\d+)',IPPOOLS1) IPPOOL=[i.replace('", ',':') for i in IPPOOLS2] ''' return IPPOOL def initIPPOOLS(rconn): """把有效的IP存入 REDIS数据库""" ipNum=len(rconn.keys('IP*')) if ipNum # # ------------------------------------------ class SimpleHash(object): def __init__(self, cap, seed): self.cap = cap self.seed = seed def hash(self, value): ret = 0 for i in range(len(value)): ret += self.seed * ret + ord(value[i]) return (self.cap - 1) & ret class BloomFilter(object): def __init__(self, server, key, blockNum=1): self.bit_size = 1 << 31 # Redis的String类型最大容量为512M,现使用256M self.seeds = [5, 7, 11, 13, 31] # self.seeds = [5, 7, 11, 13, 31, 37, 61] self.server = server self.key = key self.blockNum = blockNum self.hashfunc = [] for seed in self.seeds: self.hashfunc.append(SimpleHash(self.bit_size, seed)) def isContains(self, str_input): if not str_input: return False ret = True name = self.key + str(int(str_input[0:2], 16) % self.blockNum) for f in self.hashfunc: loc = f.hash(str_input) ret = ret & self.server.getbit(name, loc) return ret def insert(self, str_input): name = self.key + str(int(str_input[0:2], 16) % self.blockNum) for f in self.hashfunc: loc = f.hash(str_input) self.server.setbit(name, loc, 1) ================================================ FILE: zhihu/zhihu/scrapy_redis/__init__.py ================================================ ================================================ FILE: zhihu/zhihu/scrapy_redis/connection.py ================================================ import six from scrapy.utils.misc import load_object from . import defaults # Shortcut maps 'setting name' -> 'parmater name'. SETTINGS_PARAMS_MAP = { 'REDIS_URL': 'url', 'REDIS_HOST': 'host', 'REDIS_PORT': 'port', 'REDIS_ENCODING': 'encoding', } def get_redis_from_settings(settings): """Returns a redis client instance from given Scrapy settings object. This function uses ``get_client`` to instantiate the client and uses ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You can override them using the ``REDIS_PARAMS`` setting. Parameters ---------- settings : Settings A scrapy settings object. See the supported settings below. Returns ------- server Redis client instance. Other Parameters ---------------- REDIS_URL : str, optional Server connection URL. REDIS_HOST : str, optional Server host. REDIS_PORT : str, optional Server port. REDIS_ENCODING : str, optional Data encoding. REDIS_PARAMS : dict, optional Additional client parameters. 
""" params = defaults.REDIS_PARAMS.copy() params.update(settings.getdict('REDIS_PARAMS')) # XXX: Deprecate REDIS_* settings. for source, dest in SETTINGS_PARAMS_MAP.items(): val = settings.get(source) if val: params[dest] = val # Allow ``redis_cls`` to be a path to a class. if isinstance(params.get('redis_cls'), six.string_types): params['redis_cls'] = load_object(params['redis_cls']) return get_redis(**params) # Backwards compatible alias. from_settings = get_redis_from_settings def get_redis(**kwargs): """Returns a redis client instance. Parameters ---------- redis_cls : class, optional Defaults to ``redis.StrictRedis``. url : str, optional If given, ``redis_cls.from_url`` is used to instantiate the class. **kwargs Extra parameters to be passed to the ``redis_cls`` class. Returns ------- server Redis client instance. """ redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) url = kwargs.pop('url', None) if url: return redis_cls.from_url(url, **kwargs) else: return redis_cls(**kwargs) ================================================ FILE: zhihu/zhihu/scrapy_redis/defaults.py ================================================ import redis # For standalone use. DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' PIPELINE_KEY = '%(spider)s:items' REDIS_CLS = redis.StrictRedis REDIS_ENCODING = 'utf-8' # Sane connection defaults. REDIS_PARAMS = { 'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING, } SCHEDULER_QUEUE_KEY = '%(spider)s:requests' SCHEDULER_QUEUE_CLASS = 'zhihu.scrapy_redis.queue.PriorityQueue' SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' SCHEDULER_DUPEFILTER_CLASS = 'zhihu.scrapy_redis.dupefilter.RFPDupeFilter' START_URLS_KEY = '%(name)s:start_urls' START_URLS_AS_SET = False ================================================ FILE: zhihu/zhihu/scrapy_redis/dupefilter.py ================================================ import logging import time import pdb from .BloomfilterOnRedis import BloomFilter from scrapy.dupefilters import BaseDupeFilter from scrapy.utils.request import request_fingerprint from . import defaults from .connection import get_redis_from_settings logger = logging.getLogger(__name__) # TODO: Rename class to RedisDupeFilter. class RFPDupeFilter(BaseDupeFilter): """Redis-based request duplicates filter. This class can also be used with default Scrapy's scheduler. """ logger = logger def __init__(self, server, key, debug=False): """Initialize the duplicates filter. Parameters ---------- server : redis.StrictRedis The redis server instance. key : str Redis key Where to store fingerprints. debug : bool, optional Whether to log filtered requests. """ self.server = server self.key = key self.debug = debug self.bf = BloomFilter(server, key, blockNum=1) # you can increase blockNum if your are filtering too many urls self.logdupes = True @classmethod def from_settings(cls, settings): """Returns an instance from given settings. This uses by default the key ``dupefilter:``. When using the ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as it needs to pass the spider name in the key. Parameters ---------- settings : scrapy.settings.Settings Returns ------- RFPDupeFilter A RFPDupeFilter instance. """ server = get_redis_from_settings(settings) # XXX: This creates one-time key. needed to support to use this # class as standalone dupefilter with scrapy's default scheduler # if scrapy passes spider on open() method this wouldn't be needed # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. 
key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} debug = settings.getbool('DUPEFILTER_DEBUG') return cls(server, key=key, debug=debug) @classmethod def from_crawler(cls, crawler): """Returns instance from crawler. Parameters ---------- crawler : scrapy.crawler.Crawler Returns ------- RFPDupeFilter Instance of RFPDupeFilter. """ return cls.from_settings(crawler.settings) def request_seen(self, request): """Returns True if request was already seen. Parameters ---------- request : scrapy.http.Request Returns ------- bool """ fp = request_fingerprint(request) if self.bf.isContains(fp): return True else: self.bf.insert(fp) return False def request_fingerprint(self, request): """Returns a fingerprint for a given request. Parameters ---------- request : scrapy.http.Request Returns ------- str """ return request_fingerprint(request) def close(self, reason=''): """Delete data on close. Called by Scrapy's scheduler. Parameters ---------- reason : str, optional """ self.clear() def clear(self): """Clears fingerprints data.""" self.server.delete(self.key) def log(self, request, spider): """Logs given request. Parameters ---------- request : scrapy.http.Request spider : scrapy.spiders.Spider """ if self.debug: msg = "Filtered duplicate request: %(request)s" self.logger.debug(msg, {'request': request}, extra={'spider': spider}) elif self.logdupes: msg = ("Filtered duplicate request %(request)s" " - no more duplicates will be shown" " (see DUPEFILTER_DEBUG to show all duplicates)") self.logger.debug(msg, {'request': request}, extra={'spider': spider}) self.logdupes = False ================================================ FILE: zhihu/zhihu/scrapy_redis/picklecompat.py ================================================ """A pickle wrapper module with protocol=-1 by default.""" try: import cPickle as pickle # PY2 except ImportError: import pickle def loads(s): return pickle.loads(s) def dumps(obj): return pickle.dumps(obj, protocol=-1) ================================================ FILE: zhihu/zhihu/scrapy_redis/pipelines.py ================================================ from scrapy.utils.misc import load_object from scrapy.utils.serialize import ScrapyJSONEncoder from twisted.internet.threads import deferToThread from . import connection, defaults default_serialize = ScrapyJSONEncoder().encode class RedisPipeline(object): """Pushes serialized item into a redis list/queue Settings -------- REDIS_ITEMS_KEY : str Redis key where to store items. REDIS_ITEMS_SERIALIZER : str Object path to serializer function. """ def __init__(self, server, key=defaults.PIPELINE_KEY, serialize_func=default_serialize): """Initialize pipeline. Parameters ---------- server : StrictRedis Redis client instance. key : str Redis key where to store items. serialize_func : callable Items serializer function. 
""" self.server = server self.key = key self.serialize = serialize_func @classmethod def from_settings(cls, settings): params = { 'server': connection.from_settings(settings), } if settings.get('REDIS_ITEMS_KEY'): params['key'] = settings['REDIS_ITEMS_KEY'] if settings.get('REDIS_ITEMS_SERIALIZER'): params['serialize_func'] = load_object( settings['REDIS_ITEMS_SERIALIZER'] ) return cls(**params) @classmethod def from_crawler(cls, crawler): return cls.from_settings(crawler.settings) def process_item(self, item, spider): return deferToThread(self._process_item, item, spider) def _process_item(self, item, spider): key = self.item_key(item, spider) data = self.serialize(item) self.server.rpush(key, data) return item def item_key(self, item, spider): """Returns redis key based on given spider. Override this function to use a different key depending on the item and/or spider. """ return self.key % {'spider': spider.name} ================================================ FILE: zhihu/zhihu/scrapy_redis/queue.py ================================================ from scrapy.utils.reqser import request_to_dict, request_from_dict from . import picklecompat class Base(object): """Per-spider base queue class""" def __init__(self, server, spider, key, serializer=None): """Initialize per-spider redis queue. Parameters ---------- server : StrictRedis Redis client instance. spider : Spider Scrapy spider instance. key: str Redis key where to put and get messages. serializer : object Serializer object with ``loads`` and ``dumps`` methods. """ if serializer is None: # Backward compatibility. # TODO: deprecate pickle. serializer = picklecompat if not hasattr(serializer, 'loads'): raise TypeError("serializer does not implement 'loads' function: %r" % serializer) if not hasattr(serializer, 'dumps'): raise TypeError("serializer '%s' does not implement 'dumps' function: %r" % serializer) self.server = server self.spider = spider self.key = key % {'spider': spider.name} self.serializer = serializer def _encode_request(self, request): """Encode a request object""" obj = request_to_dict(request, self.spider) return self.serializer.dumps(obj) def _decode_request(self, encoded_request): """Decode an request previously encoded""" obj = self.serializer.loads(encoded_request) return request_from_dict(obj, self.spider) def __len__(self): """Return the length of the queue""" raise NotImplementedError def push(self, request): """Push a request""" raise NotImplementedError def pop(self, timeout=0): """Pop a request""" raise NotImplementedError def clear(self): """Clear queue/stack""" self.server.delete(self.key) class FifoQueue(Base): """Per-spider FIFO queue""" def __len__(self): """Return the length of the queue""" return self.server.llen(self.key) def push(self, request): """Push a request""" self.server.lpush(self.key, self._encode_request(request)) def pop(self, timeout=0): """Pop a request""" if timeout > 0: data = self.server.brpop(self.key, timeout) if isinstance(data, tuple): data = data[1] else: data = self.server.rpop(self.key) if data: return self._decode_request(data) class PriorityQueue(Base): """Per-spider priority queue abstraction using redis' sorted set""" def __len__(self): """Return the length of the queue""" return self.server.zcard(self.key) def push(self, request): """Push a request""" data = self._encode_request(request) score = -request.priority # We don't use zadd method as the order of arguments change depending on # whether the class is Redis or StrictRedis, and the option of using # kwargs only 
accepts strings, not bytes. self.server.execute_command('ZADD', self.key, score, data) def pop(self, timeout=0): """ Pop a request timeout not support in this queue class """ # use atomic range/remove using multi/exec pipe = self.server.pipeline() pipe.multi() pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) results, count = pipe.execute() if results: return self._decode_request(results[0]) class LifoQueue(Base): """Per-spider LIFO queue.""" def __len__(self): """Return the length of the stack""" return self.server.llen(self.key) def push(self, request): """Push a request""" self.server.lpush(self.key, self._encode_request(request)) def pop(self, timeout=0): """Pop a request""" if timeout > 0: data = self.server.blpop(self.key, timeout) if isinstance(data, tuple): data = data[1] else: data = self.server.lpop(self.key) if data: return self._decode_request(data) # TODO: Deprecate the use of these names. SpiderQueue = FifoQueue SpiderStack = LifoQueue SpiderPriorityQueue = PriorityQueue ================================================ FILE: zhihu/zhihu/scrapy_redis/scheduler.py ================================================ import importlib import six from scrapy.utils.misc import load_object from . import connection, defaults # TODO: add SCRAPY_JOB support. class Scheduler(object): """Redis-based scheduler Settings -------- SCHEDULER_PERSIST : bool (default: False) Whether to persist or clear redis queue. SCHEDULER_FLUSH_ON_START : bool (default: False) Whether to flush redis queue on start. SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0) How many seconds to wait before closing if no message is received. SCHEDULER_QUEUE_KEY : str Scheduler redis key. SCHEDULER_QUEUE_CLASS : str Scheduler queue class. SCHEDULER_DUPEFILTER_KEY : str Scheduler dupefilter redis key. SCHEDULER_DUPEFILTER_CLASS : str Scheduler dupefilter class. SCHEDULER_SERIALIZER : str Scheduler serializer. """ def __init__(self, server, persist=False, flush_on_start=False, queue_key=defaults.SCHEDULER_QUEUE_KEY, queue_cls=defaults.SCHEDULER_QUEUE_CLASS, dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, idle_before_close=0, serializer=None): """Initialize scheduler. Parameters ---------- server : Redis The redis server instance. persist : bool Whether to flush requests when closing. Default is False. flush_on_start : bool Whether to flush requests on start. Default is False. queue_key : str Requests queue key. queue_cls : str Importable path to the queue class. dupefilter_key : str Duplicates filter key. dupefilter_cls : str Importable path to the dupefilter class. idle_before_close : int Timeout before giving up. """ if idle_before_close < 0: raise TypeError("idle_before_close cannot be negative") self.server = server self.persist = persist self.flush_on_start = flush_on_start self.queue_key = queue_key self.queue_cls = queue_cls self.dupefilter_cls = dupefilter_cls self.dupefilter_key = dupefilter_key self.idle_before_close = idle_before_close self.serializer = serializer self.stats = None def __len__(self): return len(self.queue) @classmethod def from_settings(cls, settings): kwargs = { 'persist': settings.getbool('SCHEDULER_PERSIST'), 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), } # If these values are missing, it means we want to use the defaults. optional = { # TODO: Use custom prefixes for this settings to note that are # specific to scrapy-redis. 
'queue_key': 'SCHEDULER_QUEUE_KEY', 'queue_cls': 'SCHEDULER_QUEUE_CLASS', 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', # We use the default setting name to keep compatibility. 'dupefilter_cls': 'DUPEFILTER_CLASS', 'serializer': 'SCHEDULER_SERIALIZER', } for name, setting_name in optional.items(): val = settings.get(setting_name) if val: kwargs[name] = val # Support serializer as a path to a module. if isinstance(kwargs.get('serializer'), six.string_types): kwargs['serializer'] = importlib.import_module(kwargs['serializer']) server = connection.from_settings(settings) # Ensure the connection is working. server.ping() return cls(server=server, **kwargs) @classmethod def from_crawler(cls, crawler): instance = cls.from_settings(crawler.settings) # FIXME: for now, stats are only supported from this constructor instance.stats = crawler.stats return instance def open(self, spider): self.spider = spider try: self.queue = load_object(self.queue_cls)( server=self.server, spider=spider, key=self.queue_key % {'spider': spider.name}, serializer=self.serializer, ) except TypeError as e: raise ValueError("Failed to instantiate queue class '%s': %s", self.queue_cls, e) try: self.df = load_object(self.dupefilter_cls)( server=self.server, key=self.dupefilter_key % {'spider': spider.name}, debug=spider.settings.getbool('DUPEFILTER_DEBUG'), ) except TypeError as e: raise ValueError("Failed to instantiate dupefilter class '%s': %s", self.dupefilter_cls, e) if self.flush_on_start: self.flush() # notice if there are requests already in the queue to resume the crawl if len(self.queue): spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) def close(self, reason): if not self.persist: self.flush() def flush(self): self.df.clear() self.queue.clear() def enqueue_request(self, request): if not request.dont_filter and self.df.request_seen(request): self.df.log(request, self.spider) return False if self.stats: self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) self.queue.push(request) return True def next_request(self): block_pop_timeout = self.idle_before_close request = self.queue.pop(block_pop_timeout) if request and self.stats: self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) return request def has_pending_requests(self): return len(self) > 0 ================================================ FILE: zhihu/zhihu/scrapy_redis/spiders.py ================================================ from scrapy import signals from scrapy.exceptions import DontCloseSpider from scrapy.spiders import Spider, CrawlSpider from . import connection, defaults from .utils import bytes_to_str class RedisMixin(object): """Mixin class to implement reading urls from a redis queue.""" redis_key = None redis_batch_size = None redis_encoding = None # Redis client placeholder. server = None def start_requests(self): """Returns a batch of start requests from redis.""" return self.next_requests() def setup_redis(self, crawler=None): """Setup redis connection and idle signal. This should be called after the spider has set its crawler object. """ if self.server is not None: return if crawler is None: # We allow optional crawler argument to keep backwards # compatibility. # XXX: Raise a deprecation warning. 
crawler = getattr(self, 'crawler', None) if crawler is None: raise ValueError("crawler is required") settings = crawler.settings if self.redis_key is None: self.redis_key = settings.get( 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY, ) self.redis_key = self.redis_key % {'name': self.name} if not self.redis_key.strip(): raise ValueError("redis_key must not be empty") if self.redis_batch_size is None: # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE). self.redis_batch_size = settings.getint( 'REDIS_START_URLS_BATCH_SIZE', settings.getint('CONCURRENT_REQUESTS'), ) try: self.redis_batch_size = int(self.redis_batch_size) except (TypeError, ValueError): raise ValueError("redis_batch_size must be an integer") if self.redis_encoding is None: self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING) self.logger.info("Reading start URLs from redis key '%(redis_key)s' " "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s", self.__dict__) self.server = connection.from_settings(crawler.settings) # The idle signal is called when the spider has no requests left, # that's when we will schedule new requests from redis queue crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) def next_requests(self): """Returns a request to be scheduled or none.""" use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) fetch_one = self.server.spop if use_set else self.server.lpop # XXX: Do we need to use a timeout here? found = 0 # TODO: Use redis pipeline execution. while found < self.redis_batch_size: data = fetch_one(self.redis_key) if not data: # Queue empty. break req = self.make_request_from_data(data) if req: yield req found += 1 else: self.logger.debug("Request not made from data: %r", data) if found: self.logger.debug("Read %s requests from '%s'", found, self.redis_key) def make_request_from_data(self, data): """Returns a Request instance from data coming from Redis. By default, ``data`` is an encoded URL. You can override this method to provide your own message decoding. Parameters ---------- data : bytes Message from redis. """ url = bytes_to_str(data, self.redis_encoding) return self.make_requests_from_url(url) def schedule_next_requests(self): """Schedules a request if available""" # TODO: While there is capacity, schedule a batch of redis requests. for req in self.next_requests(): self.crawler.engine.crawl(req, spider=self) def spider_idle(self): """Schedules a request if available, otherwise waits.""" # XXX: Handle a sentinel to close the spider. self.schedule_next_requests() raise DontCloseSpider class RedisSpider(RedisMixin, Spider): """Spider that reads urls from redis queue when idle. Attributes ---------- redis_key : str (default: REDIS_START_URLS_KEY) Redis key where to fetch start URLs from.. redis_batch_size : int (default: CONCURRENT_REQUESTS) Number of messages to fetch from redis on each attempt. redis_encoding : str (default: REDIS_ENCODING) Encoding to use when decoding messages from redis queue. Settings -------- REDIS_START_URLS_KEY : str (default: ":start_urls") Default Redis key where to fetch start URLs from.. REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) Default number of messages to fetch from redis on each attempt. REDIS_START_URLS_AS_SET : bool (default: False) Use SET operations to retrieve messages from the redis queue. If False, the messages are retrieve using the LPOP command. 
REDIS_ENCODING : str (default: "utf-8") Default encoding to use when decoding messages from redis queue. """ @classmethod def from_crawler(self, crawler, *args, **kwargs): obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs) obj.setup_redis(crawler) return obj class RedisCrawlSpider(RedisMixin, CrawlSpider): """Spider that reads urls from redis queue when idle. Attributes ---------- redis_key : str (default: REDIS_START_URLS_KEY) Redis key where to fetch start URLs from.. redis_batch_size : int (default: CONCURRENT_REQUESTS) Number of messages to fetch from redis on each attempt. redis_encoding : str (default: REDIS_ENCODING) Encoding to use when decoding messages from redis queue. Settings -------- REDIS_START_URLS_KEY : str (default: ":start_urls") Default Redis key where to fetch start URLs from.. REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS) Default number of messages to fetch from redis on each attempt. REDIS_START_URLS_AS_SET : bool (default: True) Use SET operations to retrieve messages from the redis queue. REDIS_ENCODING : str (default: "utf-8") Default encoding to use when decoding messages from redis queue. """ @classmethod def from_crawler(self, crawler, *args, **kwargs): obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs) obj.setup_redis(crawler) return obj ================================================ FILE: zhihu/zhihu/scrapy_redis/tests.py ================================================ import os import mock import redis from scrapy import Request, Spider from unittest import TestCase from . import connection from .dupefilter import RFPDupeFilter from .queue import SpiderQueue, SpiderPriorityQueue, SpiderStack from .scheduler import Scheduler # allow test settings from environment REDIS_HOST = os.environ.get('REDIST_HOST', 'localhost') REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379)) class RedisTestMixin(object): @property def server(self): if not hasattr(self, '_redis'): self._redis = redis.Redis(REDIS_HOST, REDIS_PORT) return self._redis def clear_keys(self, prefix): keys = self.server.keys(prefix + '*') if keys: self.server.delete(*keys) class DupeFilterTest(RedisTestMixin, TestCase): def setUp(self): self.key = 'scrapy_redis:tests:dupefilter:' self.df = RFPDupeFilter(self.server, self.key) def tearDown(self): self.clear_keys(self.key) def test_dupe_filter(self): req = Request('http://example.com') self.assertFalse(self.df.request_seen(req)) self.assertTrue(self.df.request_seen(req)) self.df.close('nothing') class QueueTestMixin(RedisTestMixin): queue_cls = None def setUp(self): self.spider = Spider('myspider') self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name self.q = self.queue_cls(self.server, Spider('myspider'), self.key) def tearDown(self): self.clear_keys(self.key) def test_clear(self): self.assertEqual(len(self.q), 0) for i in range(10): # XXX: can't use same url for all requests as SpiderPriorityQueue # uses redis' set implemention and we will end with only one # request in the set and thus failing the test. It should be noted # that when using SpiderPriorityQueue it acts as a request # duplication filter whenever the serielized requests are the same. # This might be unwanted on repetitive requests to the same page # even with dont_filter=True flag. 
req = Request('http://example.com/?page=%s' % i) self.q.push(req) self.assertEqual(len(self.q), 10) self.q.clear() self.assertEqual(len(self.q), 0) class SpiderQueueTest(QueueTestMixin, TestCase): queue_cls = SpiderQueue def test_queue(self): req1 = Request('http://example.com/page1') req2 = Request('http://example.com/page2') self.q.push(req1) self.q.push(req2) out1 = self.q.pop() out2 = self.q.pop() self.assertEqual(out1.url, req1.url) self.assertEqual(out2.url, req2.url) class SpiderPriorityQueueTest(QueueTestMixin, TestCase): queue_cls = SpiderPriorityQueue def test_queue(self): req1 = Request('http://example.com/page1', priority=100) req2 = Request('http://example.com/page2', priority=50) req3 = Request('http://example.com/page2', priority=200) self.q.push(req1) self.q.push(req2) self.q.push(req3) out1 = self.q.pop() out2 = self.q.pop() out3 = self.q.pop() self.assertEqual(out1.url, req3.url) self.assertEqual(out2.url, req1.url) self.assertEqual(out3.url, req2.url) class SpiderStackTest(QueueTestMixin, TestCase): queue_cls = SpiderStack def test_queue(self): req1 = Request('http://example.com/page1') req2 = Request('http://example.com/page2') self.q.push(req1) self.q.push(req2) out1 = self.q.pop() out2 = self.q.pop() self.assertEqual(out1.url, req2.url) self.assertEqual(out2.url, req1.url) class SchedulerTest(RedisTestMixin, TestCase): def setUp(self): self.persist = False self.key_prefix = 'scrapy_redis:tests:' self.queue_key = self.key_prefix + '%(spider)s:requests' self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter' self.idle_before_close = 0 self.scheduler = Scheduler(self.server, self.persist, self.queue_key, SpiderQueue, self.dupefilter_key, self.idle_before_close) self.spider = Spider('myspider') def tearDown(self): self.clear_keys(self.key_prefix) def test_scheduler(self): # default no persist self.assertFalse(self.scheduler.persist) self.scheduler.open(self.spider) self.assertEqual(len(self.scheduler), 0) req = Request('http://example.com') self.scheduler.enqueue_request(req) self.assertTrue(self.scheduler.has_pending_requests()) self.assertEqual(len(self.scheduler), 1) # dupefilter in action self.scheduler.enqueue_request(req) self.assertEqual(len(self.scheduler), 1) out = self.scheduler.next_request() self.assertEqual(out.url, req.url) self.assertFalse(self.scheduler.has_pending_requests()) self.assertEqual(len(self.scheduler), 0) self.scheduler.close('finish') def test_scheduler_persistent(self): # TODO: Improve this test to avoid the need to check for log messages. self.spider.log = mock.Mock(spec=self.spider.log) self.scheduler.persist = True self.scheduler.open(self.spider) self.assertEqual(self.spider.log.call_count, 0) self.scheduler.enqueue_request(Request('http://example.com/page1')) self.scheduler.enqueue_request(Request('http://example.com/page2')) self.assertTrue(self.scheduler.has_pending_requests()) self.scheduler.close('finish') self.scheduler.open(self.spider) self.spider.log.assert_has_calls([ mock.call("Resuming crawl (2 requests scheduled)"), ]) self.assertEqual(len(self.scheduler), 2) self.scheduler.persist = False self.scheduler.close('finish') self.assertEqual(len(self.scheduler), 0) class ConnectionTest(TestCase): # We can get a connection from just REDIS_URL. 
def test_redis_url(self): settings = dict( REDIS_URL = 'redis://foo:bar@localhost:9001/42' ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs self.assertEqual(connect_args['host'], 'localhost') self.assertEqual(connect_args['port'], 9001) self.assertEqual(connect_args['password'], 'bar') self.assertEqual(connect_args['db'], 42) # We can get a connection from REDIS_HOST/REDIS_PORT. def test_redis_host_port(self): settings = dict( REDIS_HOST = 'localhost', REDIS_PORT = 9001 ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs self.assertEqual(connect_args['host'], 'localhost') self.assertEqual(connect_args['port'], 9001) # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. def test_redis_url_precedence(self): settings = dict( REDIS_HOST = 'baz', REDIS_PORT = 1337, REDIS_URL = 'redis://foo:bar@localhost:9001/42' ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs self.assertEqual(connect_args['host'], 'localhost') self.assertEqual(connect_args['port'], 9001) self.assertEqual(connect_args['password'], 'bar') self.assertEqual(connect_args['db'], 42) # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. def test_redis_host_port_fallback(self): settings = dict( REDIS_HOST = 'baz', REDIS_PORT = 1337, REDIS_URL = None ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs self.assertEqual(connect_args['host'], 'baz') self.assertEqual(connect_args['port'], 1337) # We use default values for REDIS_HOST/REDIS_PORT. def test_redis_default(self): settings = dict() server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs self.assertEqual(connect_args['host'], 'localhost') self.assertEqual(connect_args['port'], 6379) ================================================ FILE: zhihu/zhihu/scrapy_redis/utils.py ================================================ import six def bytes_to_str(s, encoding='utf-8'): """Returns a str if a bytes object is given.""" if six.PY3 and isinstance(s, bytes): return s.decode(encoding) return s ================================================ FILE: zhihu/zhihu/settings.py ================================================ # -*- coding: utf-8 -*- # ------------------------------------------ # 版本:1.0 # 日期:2017-8-06 # 作者:AlexTan # # # ------------------------------------------ BOT_NAME = 'zhihu' SPIDER_MODULES = ['zhihu.spiders'] NEWSPIDER_MODULE = 'zhihu.spiders' REDIRECT_ENABLED = False RETRY_TIMES = 1 DOWNLOAD_TIMEOUT = 10 #下载超时时间 # Crawl responsibly by identifying yourself (and your website) on the user-agent USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36' #分布式配置 SCHEDULER = "zhihu.scrapy_redis.scheduler.Scheduler" SCHEDULER_PERSIST = True DUPEFILTER_CLASS = "zhihu.scrapy_redis.dupefilter.RFPDupeFilter" # 种子队列的信息 REDIS_URL = None REDIS_HOST = '127.0.0.1' REDIS_PORT = 6379#6379 FILTER_URL = None FILTER_HOST = '127.0.0.1' FILTER_PORT = 6379#6379 FILTER_DB = 0 MONGO_URI = 'mongodb://127.0.0.1:27017/' MONGO_DATABASE = 'zhihu3' DOWNLOADER_MIDDLEWARES = { 'zhihu.middlewares.UserAgentMiddleware': 543, 'zhihu.middlewares.CookiesMiddleware': 544, #'zhihu.middlewares.ProxyMiddleware':125, #"scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 545, } ITEM_PIPELINES = { 'zhihu.pipelines.ZhihuPipeline': 301, } ''' DOWNLOAD_DELAY = 3 AUTOTHROTTLE_ENABLED = True 
AUTOTHROTTLE_START_DELAY = 3 AUTOTHROTTLE_MAX_DELAY = 60 ''' # Obey robots.txt rules #ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 1 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'zhihu.middlewares.ZhihuSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: zhihu/zhihu/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. 
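The settings file above wires the project to the bundled scrapy_redis components: the custom Scheduler, the Bloom-filter-backed RFPDupeFilter, the Redis instance given by REDIS_HOST/REDIS_PORT, and the MongoDB pipeline. Because both spiders below inherit from RedisSpider and share redis_key = "zhihuspider:start_urls", extra seed profiles can also be pushed into that Redis list while a crawl is running. A minimal sketch, assuming a local Redis on the defaults above and the redis-py client; the helper script itself is illustrative and not part of the repository:

# seed_start_urls.py (hypothetical helper, not in the repo)
import redis

# Connect to the same Redis instance the scheduler uses (REDIS_HOST/REDIS_PORT in settings.py).
r = redis.StrictRedis(host='127.0.0.1', port=6379)

# RedisMixin.next_requests() pops entries with LPOP (REDIS_START_URLS_AS_SET is False
# in scrapy_redis/defaults.py), so a plain list push is enough. The popped URL is handed
# to the spider's parse(), so it must match what the target spider expects: a profile
# page like this one for zhihuspider1, or the api/v4/members URL built in
# start_requests() for the API-based zhihuspider.
r.lpush('zhihuspider:start_urls',
        'https://www.zhihu.com/people/yun-he-shu-ju-8/answers')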
================================================ FILE: zhihu/zhihu/spiders/zhihuspider.py ================================================ # -*- coding: utf-8 -*- import scrapy import re import pdb import json from selenium import webdriver from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from ..items import ZhihuItem,RelationItem from scrapy.http import Request,FormRequest from scrapy_redis.spiders import RedisSpider # ------------------------------------------ # 版本:1.0 # 日期:2017-8-06 # 作者:AlexTan # # # ------------------------------------------ #zhihuspider1是模拟浏览器爬(速度慢,不建议,仅供学习) zhihuspider0抓包爬(速度快) class ZhihuspiderSpider(RedisSpider): #class ZhihuspiderSpider(scrapy.Spider): name = "zhihuspider1" #allowed_domains = ["zhihu.com"] host = 'https://www.zhihu.com' redis_key = "zhihuspider:start_urls" #start_urls = ['https://www.zhihu.com/people/yun-he-shu-ju-8/answers'] strat_user_id = ['yun-he-shu-ju-8'] #pdb.set_trace() dcap = dict(DesiredCapabilities.PHANTOMJS) dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0") dcap["phantomjs.page.settings.loadImages"] = False obj = webdriver.PhantomJS(desired_capabilities=dcap) def start_requests(self): for one in self.strat_user_id: yield Request('https://www.zhihu.com/people/'+one+'/answers',callback=self.parse,dont_filter=True) #return [Request('https://www.zhihu.com/#signin',callback=self.start_login,meta={'cookiejar':1})] #这个登录已不可用,仅供学习 def start_login(self,response): xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first() return [FormRequest('https://www.zhihu.com/login/phone_num',method='POST',meta={'cookiejar':response.meta['cookiejar']},formdata={ #'_xsrf':xsrf, 'password':'88888888', 'remember_me':"true", 'phone_num':'666666'}, callback=self.after_login )] def after_login(self,response): pdb.set_trace() if json.loads(response.body)['msg'].encode('utf8') == "登录成功": self.logger.info("登录成功!%s" % str(response.meta['cookiejar'])) print("登录成功!") self.obj.add_cookie(response.meta['cookiejar']) for one in self.strat_user_id: yield Request('https://www.zhihu.com/people/'+one+'/answers',meta={'cookiejar':response.meta['cookiejar']},callback=self.parse) else: self.logger.error('登录失败') def __del__(self): self.obj.quit() def parse(self, response): item = ZhihuItem() name = response.xpath('//span[@class="ProfileHeader-name"]/text()').extract()[0] #pdb.set_trace() user_image_url = response.xpath('//img[@class="Avatar Avatar--large UserAvatar-inner"]/@srcset').extract()[0].replace(' 2x','') user_id = re.findall('people\/(.*?)\/',response.url)[0] gender_icon = response.xpath('.//svg[@class="Icon Icon--male" or @class="Icon Icon--female"]/@class').extract() #pdb.set_trace() gender = "" if gender_icon: if gender_icon[0] == "Icon Icon--female": gender = "女" elif gender_icon[0] == "Icon Icon--male": gender = "男" item['name'] = name item['user_id'] = user_id item['user_image_url'] = user_image_url item['gender'] = gender try: num = response.xpath('//div[@class="NumberBoard-value"]/text()').extract() item['followees_num'] = num[0] item['followers_num'] = num[1] followees_url = response.url.replace('answers','following') followers_url = response.url.replace('answers','followers') relation_item = RelationItem() relation_item['relations_id'] = [] relation_item['user_id'] = user_id relation_item['relation_type'] = 'followees' yield Request(followees_url,callback=self.relations,meta={'page':1,'item':relation_item}) relation_item['relation_type'] = 
            relation_item['relation_type'] = 'followers'
            yield Request(followers_url, callback=self.relations, meta={'page': 1, 'item': relation_item})
        except:
            print("需要登录!")
        self.obj.get(response.url)
        try:
            self.obj.find_element_by_class_name('ProfileHeader-expandButton').click()
            first = self.obj.find_elements_by_xpath('//div[@class="ProfileHeader-detailItem"]')
            for one in first:
                label = one.find_element_by_class_name('ProfileHeader-detailLabel').text
                if label == "居住地":
                    location = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n', ',')
                    item['location'] = location
                elif label in ("所在行业", "行业"):
                    business = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n', ',')
                    item['business'] = business
                elif label == "职业经历":
                    professional = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n', ',')
                    item['professional'] = professional
                elif label == "教育经历":
                    education = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\n', ',')
                    item['education'] = education
                else:
                    pass
        except:
            pass
        yield item

    def relations(self, response):
        self.obj.get(response.url)
        followees_a = self.obj.find_elements_by_xpath('//a[@class="UserLink-link"]')
        #pdb.set_trace()
        #followees_a = response.xpath('//a[@class="UserLink-link"]/@href').extract()
        followees = []
        for one in followees_a:
            try:
                one = one.get_attribute('href')
                followees.append(one.replace('https://www.zhihu.com/people/', ''))
            except:
                pass
        followees = list(set(followees))
        #pdb.set_trace()
        response.meta['item']['relations_id'] += followees
        nextpage_button = response.xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]').extract()
        if nextpage_button:
            #pdb.set_trace()
            nextpage_url = response.url.replace('?page=' + str(response.meta['page']), '') + "?page=" + str(response.meta['page'] + 1)
            yield Request(nextpage_url, callback=self.relations, meta={'page': response.meta['page'] + 1, 'item': response.meta['item']})
        else:
            yield response.meta['item']
        for user in followees:
            yield Request('https://www.zhihu.com/people/' + user + '/answers', callback=self.parse)


================================================
FILE: zhihu/zhihu/spiders/zhihuspider0.py
================================================
# -*- coding: utf-8 -*-
import scrapy
import re
import pdb
import json
from scrapy.http import Request
from ..items import ZhihuItem, RelationItem, AnswerItem, QuestionItem, ArticleItem
from ..scrapy_redis.spiders import RedisSpider

# ------------------------------------------
# Version: 1.0
# Date: 2017-8-06
# Author: AlexTan
# ------------------------------------------

class Zhihuspider0Spider(RedisSpider):
    name = 'zhihuspider'
    redis_key = "zhihuspider:start_urls"
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']
    strat_user_id = ['yun-he-shu-ju-8']

    def start_requests(self):
        for one in self.strat_user_id:
            yield Request('https://www.zhihu.com/api/v4/members/' + one + '?include=locations,employments,industry_category,gender,educations,business,follower_count,following_count,description,badge[?(type=best_answerer)].topics',
                          meta={'user_id': one}, callback=self.parse)

    def parse(self, response):
        # The API returns JSON; 'false'/'true' are rewritten to 0/1 so the body
        # can be evaluated as a Python literal.
        json_result = str(response.body, encoding="utf8").replace('false', '0').replace('true', '1')
        dict_result = eval(json_result)
        item = ZhihuItem()
        if dict_result['gender'] == 1:
            item['gender'] = '男'
        elif dict_result['gender'] == 0:
            item['gender'] = '女'
        else:
            item['gender'] = '未知'
        item['user_id'] = dict_result['url_token']
        item['user_image_url'] = dict_result['avatar_url'][:-6] + 'xl.jpg'
        item['name'] = dict_result['name']
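        # locations / educations / employments come back from the API as nested
        # objects; each is flattened here into a list of plain strings.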
        item['locations'] = []
        for one in dict_result['locations']:
            item['locations'].append(one['name'])
        try:
            item['business'] = dict_result['business']['name']
        except:
            try:
                item['business'] = dict_result['industry_category']
            except:
                pass
        item['education'] = []
        for one in dict_result['educations']:
            try:
                education = one['school']['name'] + ":" + one['major']['name']
            except:
                try:
                    education = one['school']['name']
                except:
                    pass
            item['education'].append(education)
        #pdb.set_trace()
        item['followees_num'] = dict_result['following_count']
        item['followers_num'] = dict_result['follower_count']
        item['employments'] = []
        for one in dict_result['employments']:
            try:
                employment = one['company']['name'] + ":" + one['job']['name']
            except:
                try:
                    employment = one['company']['name']
                except:
                    pass
            item['employments'].append(employment)
        #pdb.set_trace()
        yield item
        item = RelationItem()
        one = response.meta['user_id']
        item['relations_id'] = []
        item['user_id'] = one
        item['relation_type'] = ''
        yield Request('https://www.zhihu.com/api/v4/members/' + one + '/followers?include=data[*].answer_count,badge[?(type=best_answerer)].topics&limit=20&offset=0',
                      callback=self.parse_relation, meta={'item': item, 'offset': 0, 'relation_type': 'followers'})
        yield Request('https://www.zhihu.com/api/v4/members/' + one + '/followees?include=data[*].answer_count,badge[?(type=best_answerer)].topics&limit=20&offset=0',
                      callback=self.parse_relation, meta={'item': item, 'offset': 0, 'relation_type': 'followees'})
        yield Request('https://www.zhihu.com/api/v4/members/' + one + '/answers?include=data[*].comment_count,content,voteup_count,created_time,updated_time;data[*].author.badge[?(type=best_answerer)].topics&limit=20&offset=0',
                      callback=self.parse_answer, meta={'answer_user_id': one, 'offset': 0})
        yield Request('https://www.zhihu.com/people/' + one + '/asks?page=1',
                      callback=self.parse_question, meta={'ask_user_id': one, 'page': 1})
        yield Request('https://www.zhihu.com/api/v4/members/' + one + '/articles?include=data[*].comment_count,content,voteup_count,created,updated;data[*].author.badge[?(type=best_answerer)].topics&limit=20&offset=0',
                      callback=self.parse_article, meta={'author_id': one, 'offset': 0})

    def parse_relation(self, response):
        json_result = str(response.body, encoding="utf8").replace('false', '0').replace('true', '1')
        dict_result = eval(json_result)
        relations_id = []
        for one in dict_result['data']:
            relations_id.append(one['url_token'])
        response.meta['item']['relations_id'] = relations_id
        if response.meta['offset'] == 0:
            response.meta['item']['relation_type'] = response.meta['relation_type']
        else:
            response.meta['item']['relation_type'] = 'next:' + response.meta['relation_type']
        #pdb.set_trace()
        yield response.meta['item']
        for one in response.meta['item']['relations_id']:
            yield Request('https://www.zhihu.com/api/v4/members/' + one + '?include=locations,employments,industry_category,gender,educations,business,follower_count,following_count,description,badge[?(type=best_answerer)].topics',
                          meta={'user_id': one}, callback=self.parse)
        #pdb.set_trace()
        if dict_result['paging']['is_end'] == 0:
            #pdb.set_trace()
            offset = response.meta['offset'] + 20
            next_page = re.findall(r'(.*offset=)\d+', response.url)[0]
            #pdb.set_trace()
            yield Request(next_page + str(offset), callback=self.parse_relation,
                          meta={'item': response.meta['item'], 'offset': offset, 'relation_type': response.meta['relation_type']})
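
    # The answers endpoint is paged 20 records at a time; parse_answer keeps
    # requesting the next offset until the API reports paging.is_end.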
    def parse_answer(self, response):
        json_result = str(response.body, encoding="utf8").replace('false', '0').replace('true', '1')
        dict_result = eval(json_result)
        for one in dict_result['data']:
            item = AnswerItem()
            item['answer_user_id'] = response.meta['answer_user_id']
            item['answer_id'] = one['id']
            item['question_id'] = one['question']['id']
            #pdb.set_trace()
            item['cretated_time'] = one['created_time']
            item['updated_time'] = one['updated_time']
            item['voteup_count'] = one['voteup_count']
            item['comment_count'] = one['comment_count']
            item['content'] = one['content']
            yield item
        if dict_result['paging']['is_end'] == 0:
            offset = response.meta['offset'] + 20
            next_page = re.findall(r'(.*offset=)\d+', response.url)[0]
            yield Request(next_page + str(offset), callback=self.parse_answer, meta={'answer_user_id': response.meta['answer_user_id'], 'offset': offset})

    def parse_question(self, response):
        list_item = response.xpath('//div[@class="List-item"]')
        for one in list_item:
            item = QuestionItem()
            item['ask_user_id'] = response.meta['ask_user_id']
            title = one.xpath('.//div[@class="QuestionItem-title"]')
            item['title'] = title.xpath('./a/text()').extract()[0]
            item['question_id'] = title.xpath('./a/@href').extract()[0].replace('/question/', '')
            content_item = one.xpath('.//div[@class="ContentItem-status"]//span/text()').extract()
            item['ask_time'] = content_item[0]
            item['answer_count'] = content_item[1]
            item['followees_count'] = content_item[2]
            yield item
        next_page = response.xpath('//button[@class="Button PaginationButton PaginationButton-next Button--plain"]/text()').extract()
        if next_page:
            response.meta['page'] += 1
            next_url = re.findall(r'(.*page=)\d+', response.url)[0] + str(response.meta['page'])
            yield Request(next_url, callback=self.parse_question, meta={'ask_user_id': response.meta['ask_user_id'], 'page': response.meta['page']})

    def parse_article(self, response):
        json_result = str(response.body, encoding="utf8").replace('false', '0').replace('true', '1')
        dict_result = eval(json_result)
        for one in dict_result['data']:
            item = ArticleItem()
            item['author_id'] = response.meta['author_id']
            item['title'] = one['title']
            item['article_id'] = one['id']
            item['content'] = one['content']
            #pdb.set_trace()
            item['cretated_time'] = one['created']
            item['updated_time'] = one['updated']
            item['voteup_count'] = one['voteup_count']
            item['comment_count'] = one['comment_count']
            yield item
        if dict_result['paging']['is_end'] == 0:
            offset = response.meta['offset'] + 20
            next_page = re.findall(r'(.*offset=)\d+', response.url)[0]
            yield Request(next_page + str(offset), callback=self.parse_article, meta={'author_id': response.meta['author_id'], 'offset': offset})


================================================
FILE: zhihu/zhihu/user_agents_pc.py
================================================
#encoding=utf8

# ------------------------------------------
# Version: 1.0
# Date: 2017-8-06
# Author: AlexTan
# ------------------------------------------

agents = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
]


================================================
FILE: zhihu/zhihu/yumdama.py
================================================
# encoding=utf-8

import http.client, mimetypes, urllib, json, time, requests
import pdb
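
# Yundama (云打码) captcha-recognition client: uploads a captcha image and polls
# for the recognised text. Fill in the account settings below before use.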
######################################################################
# For error codes see       http://www.yundama.com/apidoc/YDM_ErrorCode.html
# For all API functions see http://www.yundama.com/apidoc
# 1. Register a developer account: http://www.yundama.com/index/reg/developer
# 2. Add a new application:        http://www.yundama.com/developer/myapp
# 3. Develop with the application ID and key to receive the revenue share

# Username
username = ''

# Password
password = ''

# Application ID (required for the developer revenue share); obtained from
# "My Software" in the developer console.
appid = 0000

# Application key (required for the developer revenue share); obtained from
# "My Software" in the developer console.
appkey = ''

# Captcha image file
filename = 'captcha.png'

# Captcha type, e.g. 1004 means 4 alphanumeric characters. Different types are
# billed differently, so fill this in accurately or recognition accuracy will
# suffer. All types are listed at http://www.yundama.com/price.html
codetype = 1004

# Timeout in seconds
timeout = 60

######################################################################


class YDMHttp():
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            # Poll once per second for the recognition result until the timeout.
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text

######################################################################


def identify():
    if (username == '' or username == 'username'):
        print('请设置好相关参数再测试')
    else:
        #pdb.set_trace()
        # Initialise the client
        yundama = YDMHttp(username, password, appid, appkey)

        # Log in to Yundama
        uid = yundama.login()
        # print 'uid: %s' % uid

        # Check the account balance
        balance = yundama.balance()
        # print 'balance: %s' % balance

        # Start recognition: image path, captcha type ID, timeout (seconds)
        cid, result = yundama.decode(filename, codetype, timeout)
        # print 'cid: %s, result: %s' % (cid, result)
        return result
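
# Usage sketch: with the account settings above filled in and a captcha
# screenshot saved to `filename`, the recognised text can be obtained with:
#
#     text = identify()
#
# (cookie.py imports identify() for this purpose during login.)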