[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# dotenv\n.env\n\n# virtualenv\n.venv\nvenv/\nENV/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 AlexTan-b-z\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# 知乎爬虫(scrapy默认配置下单机1小时可爬取60多万条数据)  \n***\n*版本*：2.0  \n*作者*: AlexTan  \n\n*CSDN*: [AlexTan_](http://blog.csdn.net/alextan_)  \n*E-Mail* : <alextanbz@gmail.com> \n***\n\n## 原文博客：[ZhihuSpider](http://blog.csdn.net/AlexTan_/article/details/77057068)\n\n\n\n## 更新日志：\n* 2017.12.18：v2.0版本，修改spider，解决了 爬虫运行过久由于一些特殊原因把redis里的待爬取requests队列里的Request都耗尽，从而导致重新运行爬虫时start_requests里的request都被dupefilter过滤掉 的问题。\n\n* 2017.11.21:v2.0版本 对proxy.py进行了优化，使每个ip的权值都不会超过10，避免出现有的ip权值无限增长，失效后要等很久才能删掉失效ip的问题。\n\n* 2017.10.08: v2.0版本 对ip代理池（中间件）进行了优化(知乎爬虫用不上，这个中间件可以移植到其他爬虫去，只对知乎爬虫有需求的可以无视)，由于上次那个代理ip过期了，这次用的讯代理，感觉比上次那个代理好用多了，有效率在95%左右。但是缺点就是优质版每次只能提取20个，每天最多提取1000个。以前那个换ip的代码会误删很多并没有失效的ip，所以这次代码就对ip进行了加权(status)处理。默认权值为10，一次访问失败会减一，访问成功会加一，当权值小于1的时候，删除该ip。\n\n* 2017.08.22: 对三个版本的 pipline 和 spider 两个文件都修改了一下。因为以前RelationItem插入mongo时，next的数据会随机插入到粉丝或者关注里，导致数据会发生错误。 现已修正。同时，有人说到如果启用代理ip，获取ip那儿会造成堵塞，这次在获取代理ip那儿加了个多线程，解决了堵塞问题。\n\n* 2017.08.17: v2.0版本 对scrapy_redis进行优化，修改了scrapy-redis的去重机制（加了布隆过滤器）。更新原因： v1.0版本运行两到三天就会把内存（16G的服务器）占满。 更新后，V2.0版本，运行3天，只会占大概2到3G内存（几乎不会增长）。\n\n\n## 关于redis:\n如果要持久运行，建议修改一下redis.conf文件，ubuntu默认在 `/etc/redis/redis.conf` 下:\n1. 把 maxmemory 设置成你内存的 3/4\n2. 把 maxmemory-policy 设置成 allkeys-lru\n\n### 最后建议多弄几个账号运行，目测78个就足够了。\n\n\n## 原文博客：[ZhihuSpider](http://blog.csdn.net/AlexTan_/article/details/77057068)\n\n\n***\n\n最后，欢迎大家提出问题，共同学习！！！\n"
  },
  {
    "path": "zhihu/scrapy.cfg",
    "content": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrapyd.readthedocs.org/en/latest/deploy.html\n\n[settings]\ndefault = zhihu.settings\n\n[deploy]\n#url = http://localhost:6800/\nproject = zhihu\n"
  },
  {
    "path": "zhihu/zhihu/__init__.py",
    "content": ""
  },
  {
    "path": "zhihu/zhihu/cookie.py",
    "content": "#encoding=utf8\nimport pdb\nimport os\nimport time\nimport json\nfrom selenium import webdriver\nfrom selenium.webdriver.common.desired_capabilities import DesiredCapabilities\nimport logging\nfrom .yumdama import identify\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\ndcap = dict(DesiredCapabilities.PHANTOMJS)\ndcap[\"phantomjs.page.settings.userAgent\"] = (\n    \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36\"\n)\nlogger = logging.getLogger(__name__)\nlogging.getLogger(\"selenium\").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING，太烦人\n\nMETHOD = 0 #0代表手动输入验证码，1代表云打码\n\nmyZhiHu = [\n    ('account','password',0),  #0代表账号为手机，1代表账号为邮箱\n]\n\ndef getCookie(account,password,way):\n    if way == 0:\n        loginURL = \"https://www.zhihu.com/login/phone_num\"\n        username = 'phone_num'\n    else:\n        loginURL = \"https://www.zhihu.com/login/email\"\n        username = 'email'\n    try:\n        browser = webdriver.PhantomJS(desired_capabilities=dcap)\n        #browser = webdriver.Firefox()\n        browser.set_window_size(1920, 1080)\n        browser.get(\"https://www.zhihu.com/explore\")\n        time.sleep(1)\n        #pdb.set_trace()\n        browser.find_element_by_class_name('switch-to-login').click()\n        loginDIV = browser.find_element_by_id('SidebarSignFlow').find_element_by_class_name('LoginForm')\n        loginDIV.find_element_by_name('account').send_keys(account)\n        loginDIV.find_element_by_name('password').send_keys(password)\n        time.sleep(1)\n        while True:\n            browser.save_screenshot(\"zhihu.png\")\n            if loginDIV.find_element_by_class_name('captcha-module').get_attribute('style') != '':\n                if METHOD == 0:\n                    code_txt = 
input(\"请查看路径下新生成的zhihu.png，然后输入验证码:\")\n                else:\n                    img = loginDIV.find_element_by_class_name('captcha')\n                    x = img.location[\"x\"]\n                    y = img.location[\"y\"]\n                    from PIL import Image\n                    im = Image.open(\"zhihu.png\")\n                    im.crop((x, y, 85 + x, y + 30)).save(\"captcha.png\")\n                    #pdb.set_trace()\n                    code_txt = identify()\n                loginDIV.find_element_by_name('captcha').send_keys(code_txt)\n            loginDIV.find_element_by_class_name('zg-btn-blue').click()\n            time.sleep(3)\n            try:\n                loginDIV.find_element_by_class_name('error')\n                logger.warning(\"验证码或账号密码错误 %s!\" % account)\n            except:\n                break\n        try:\n            #pdb.set_trace()\n            browser.find_element_by_class_name('top-nav-profile')\n            cookie = {}\n            for elem in browser.get_cookies():\n                cookie[elem[\"name\"]] = elem[\"value\"]\n            logger.warning(\"Get Cookie Success!( Account:%s )\" % account)\n            #pdb.set_trace()\n            return json.dumps(cookie)\n        except Exception:\n            logger.warning(\"Failed %s!\" % account)\n            return \"\"\n    except Exception:\n        logger.warning(\"Failed %s!\" % account)\n        return \"\"\n    finally:\n        try:\n            browser.quit()\n        except Exception:\n            pass\n\ndef UpdateCookie(account,cookie):\n    browser = webdriver.PhantomJS(desired_capabilities=dcap)\n    #browser = webdriver.Firefox()\n    browser.set_window_size(1920, 1080)\n    browser.get('https://www.zhihu.com')\n    browser.delete_all_cookies()\n    send_cookie = []\n    for key,value in cookie.items():\n        one = {}\n        one = {'domain':'.zhihu.com','name':key,'value':value,'path':'/','expiry':None}\n        #pdb.set_trace()\n        
browser.add_cookie({k: one[k] for k in ('name', 'value', 'domain', 'path', 'expiry')})\n        #one = {'domain':'.zhihu.com','name':key,'value':value}\n        #send_cookie.append(one)\n    #browser.add_cookie(send_cookie)\n    browser.get('https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E6%A3%80%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E5%B8%90%E5%8F%B7%E6%88%96IP%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%E6%B5%81%E9%87%8F%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E4%BB%A5%E4%B8%8B%E5%AD%97%E7%AC%A6%E7%94%A8%E4%BA%8E%E7%A1%AE%E8%AE%A4%E8%BF%99%E4%BA%9B%E8%AF%B7%E6%B1%82%E4%B8%8D%E6%98%AF%E8%87%AA%E5%8A%A8%E7%A8%8B%E5%BA%8F%E5%8F%91%E5%87%BA%E7%9A%84')\n    time.sleep(1)\n    browser.save_screenshot(\"update.png\")\n    if METHOD == 0:\n        code_txt = input(\"请查看路径下新生成的update.png，然后输入验证码:\")\n    else:\n        img = browser.find_element_by_class_name('Unhuman-captcha')\n        x = img.location[\"x\"]\n        y = img.location[\"y\"]\n        from PIL import Image\n        im = Image.open(\"update.png\")\n        im.crop((x, y, 85 + x, y + 30)).save(\"captcha.png\")\n        #pdb.set_trace()\n        code_txt = identify()\n    browser.find_element_by_class_name('Input').send_keys(code_txt)\n    browser.find_element_by_class_name('Button--blue').click()\n    time.sleep(3)\n    try:\n        browser.find_element_by_class_name('AppHeader-profile')\n        cookie = {}\n        for elem in browser.get_cookies():\n            cookie[elem[\"name\"]] = elem[\"value\"]\n        logger.warning(\"Update Cookie Success!( Account:%s )\" % account)\n        #pdb.set_trace()\n        return json.dumps(cookie)\n    except Exception:\n        logger.warning(\"Update Failed %s!\" % account)\n        return \"\"\n    finally:\n        try:\n            browser.quit()\n        except Exception:\n            pass\n\n\n\ndef initCookie(rconn, spiderName):\n    \"\"\" 获取所有账号的Cookies，存入Redis。如果Redis已有该账号的Cookie，则不再获取。 \"\"\"\n    for zhihu in myZhiHu:\n        if 
rconn.get(\"%s:Cookies:%s--%s\" % (spiderName, zhihu[0], zhihu[1])) is None:  # 'zhihuspider:Cookies:账号--密码'，为None即不存在。\n            cookie = getCookie(zhihu[0], zhihu[1],zhihu[2])\n            if len(cookie) > 0:\n                rconn.set(\"%s:Cookies:%s--%s\" % (spiderName, zhihu[0], zhihu[1]), cookie)\n    cookieNum = str(rconn.keys()).count(\"%s:Cookies\" % spiderName)\n    logger.warning(\"The num of the cookies is %s\" % cookieNum)\n    if cookieNum == 0:\n        logger.warning('Stopping...')\n        os.system(\"pause\")\n\ndef updateCookie(accountText, rconn, spiderName, cookie):\n    \"\"\" 更新一个账号的Cookie \"\"\"\n    account = accountText.split(\"--\")[0]\n    #pdb.set_trace()\n    new_cookie = UpdateCookie(account, cookie)\n    if len(new_cookie) > 0:\n        logger.warning(\"The cookie of %s has been updated successfully!\" % account)\n        rconn.set(\"%s:Cookies:%s\" % (spiderName, accountText), new_cookie)\n    else:\n        logger.warning(\"The cookie of %s updated failed! Remove it!\" % accountText)\n        removeCookie(accountText, rconn, spiderName)\n\ndef removeCookie(accountText, rconn, spiderName):\n    \"\"\" 删除某个账号的Cookie \"\"\"\n    rconn.delete(\"%s:Cookies:%s\" % (spiderName, accountText))\n    cookieNum = str(rconn.keys()).count(\"%s:Cookies\" % spiderName)\n    logger.warning(\"The num of the cookies left is %s\" % cookieNum)\n    if cookieNum == 0:\n        logger.warning(\"Stopping...\")\n        os.system(\"pause\")\n\n\nif __name__ == '__main__':\n    getCookie(myZhiHu[0][0],myZhiHu[0][1],myZhiHu[0][2])\n"
  },
  {
    "path": "zhihu/zhihu/items.py",
    "content": "# -*- coding: utf-8 -*-\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\nimport scrapy\n\n\nclass ZhihuItem(scrapy.Item):\n    # define the fields for your item here like:\n    # name = scrapy.Field()\n    user_id = scrapy.Field()\n    user_image_url = scrapy.Field()\n    name = scrapy.Field()\n    locations = scrapy.Field()\n    business = scrapy.Field() #所在行业\n    employments = scrapy.Field() #职业经历\n    gender = scrapy.Field()\n    education = scrapy.Field()\n    followees_num = scrapy.Field() #我关注的人数\n    followers_num = scrapy.Field() #关注我的人数\n\nclass RelationItem(scrapy.Item):\n    user_id = scrapy.Field()\n    relation_type = scrapy.Field() #关系类型\n    relations_id = scrapy.Field()\n\nclass AnswerItem(scrapy.Item):\n    answer_user_id = scrapy.Field()\n    answer_id = scrapy.Field()\n    question_id = scrapy.Field()\n    cretated_time = scrapy.Field()\n    updated_time = scrapy.Field()\n    voteup_count = scrapy.Field()\n    comment_count = scrapy.Field()\n    content = scrapy.Field()\n\nclass QuestionItem(scrapy.Item):\n    ask_user_id = scrapy.Field()\n    question_id = scrapy.Field()\n    ask_time = scrapy.Field()\n    answer_count = scrapy.Field()\n    followees_count = scrapy.Field()\n    title = scrapy.Field()\n\nclass ArticleItem(scrapy.Item):\n    author_id = scrapy.Field()\n    title = scrapy.Field()\n    article_id = scrapy.Field()\n    content = scrapy.Field()\n    cretated_time = scrapy.Field()\n    updated_time = scrapy.Field()\n    voteup_count = scrapy.Field()\n    comment_count = scrapy.Field()"
  },
  {
    "path": "zhihu/zhihu/middlewares.py",
    "content": "# -*- coding: utf-8 -*-\n\nimport logging\nimport telnetlib\nimport random\nimport redis\nimport json\nimport os\nimport threading\nimport pdb\nfrom scrapy import signals\nfrom .user_agents_pc import agents\nfrom .proxy import initIPPOOLS, updateIPPOOLS\nfrom .cookie import initCookie, updateCookie, removeCookie\nfrom scrapy.utils.response import response_status_message\nfrom scrapy.downloadermiddlewares.retry import RetryMiddleware\nfrom scrapy.exceptions import IgnoreRequest\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\nlogger = logging.getLogger(__name__)\n\nclass UserAgentMiddleware(object):\n    \"\"\" 换User-Agent \"\"\"\n\n    def process_request(self, request, spider):\n        agent = random.choice(agents)\n        request.headers[\"User-Agent\"] = agent\n\nclass ProxyMiddleware(RetryMiddleware):\n    '''IP代理'''\n    def __init__(self, settings, crawler):\n        #自己获取的ip\n        self.TIMES = 10\n        self.IP = \"\"  # start with no proxy so process_request can test it safely\n        RetryMiddleware.__init__(self, settings)\n        self.rconn = settings.get(\"RCONN\", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'), crawler.settings.get('REDIS_PORT', 6379)))\n        #initIPPOOLS(self.rconn)\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        return cls(crawler.settings, crawler)\n\n    def process_request(self,request,spider):\n        #pdb.set_trace()\n        ipNum=len(self.rconn.keys('IP*'))\n        #pdb.set_trace()\n        if ipNum<50:\n            proxy_thread = threading.Thread(target= initIPPOOLS,args = (self.rconn,))\n            proxy_thread.setDaemon(True)\n            proxy_thread.start()\n            #initIPPOOLS(self.rconn)\n        if self.TIMES == 3:\n            baseIP=random.choice(self.rconn.keys('IP:*'))\n            ip=str(baseIP,'utf-8').replace('IP:','')\n            try:\n                
IP,PORT,status=ip.split(':')\n                request.meta['status'] = status\n                telnetlib.Telnet(IP,port=PORT,timeout=2) #测试ip是否有效\n            except:\n                logger.warning(\"The ip is not available !( IP:%s )\" % ip)\n                updateIPPOOLS(self.rconn,IP+':'+PORT,status)\n            else:\n                #pdb.set_trace()\n                self.IP = \"http://\" + IP + ':' + PORT\n                logger.warning(\"The current IP is %s!\" % self.IP)\n                self.TIMES = 0\n                updateIPPOOLS(self.rconn,IP+':'+PORT,status,1)\n                #pdb.set_trace()\n        else:\n            self.TIMES += 1\n        #pdb.set_trace()\n        if self.IP != \"\":\n            request.meta[\"proxy\"] = self.IP\n\n    def process_response(self,request,response,spider):\n        if response.status in [400,403,404,429,500,502,503,504]:\n            self.TIMES = 3\n            logger.error(\"%s! error...\" % response.status)\n            #pdb.set_trace()\n            try:\n                updateIPPOOLS(self.rconn,request.meta['proxy'].replace('http://',''),request.meta['status'],-1)\n            except:\n                pass\n            reason = response_status_message(response.status)\n            return self._retry(request, reason, spider) or response  # 重试\n        else:\n            return response\n\n    def process_exception(self, request, exception, spider):\n        #pdb.set_trace()\n        self.TIMES = 3\n        try:\n            updateIPPOOLS(self.rconn,request.meta['proxy'].replace('http://',''),request.meta['status'],-1)\n        except:\n            pass\n        return request\n\nclass CookiesMiddleware(RetryMiddleware):\n    \"\"\" 维护Cookie \"\"\"\n\n    def __init__(self, settings, crawler):\n        RetryMiddleware.__init__(self, settings)\n        self.rconn = settings.get(\"RCONN\", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'), crawler.settings.get('REDIS_PORT', 6379)))\n        
initCookie(self.rconn, crawler.spider.name)\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        return cls(crawler.settings, crawler)\n\n    def process_request(self, request, spider):\n        redisKeys = self.rconn.keys()\n        while len(redisKeys) > 0:\n            elem = random.choice(redisKeys)\n            #pdb.set_trace()\n            if b'zhihuspider:Cookies' in elem:\n                #pdb.set_trace()\n                elem = str(elem,'utf-8')\n                cookie = json.loads(str(self.rconn.get(elem),'utf-8'))\n                request.cookies = cookie\n                request.meta[\"accountText\"] = elem.split(\"Cookies:\")[-1]\n                break\n            else:\n                #pdb.set_trace()\n                redisKeys.remove(elem)\n\n    def process_response(self, request, response, spider):\n        #pdb.set_trace()\n        reason = response_status_message(response.status)\n        if response.status in [300, 301, 302, 303]:\n            pdb.set_trace()\n            if reason == '301 Moved Permanently':\n                return self._retry(request, reason, spider) or response  # 重试\n            else:\n                raise IgnoreRequest\n        elif response.status in [403, 414]:\n            logger.error(\"%s! Stopping...\" % response.status)\n            os.system(\"pause\")\n            updateCookie(request.meta['accountText'], self.rconn, spider.name, request.cookies)\n            return self._retry(request, reason, spider) or response  # 重试\n        else:\n            return response\n"
  },
  {
    "path": "zhihu/zhihu/pipelines.py",
    "content": "# -*- coding: utf-8 -*-\nimport pymongo\nimport pdb\nfrom .items import ZhihuItem,RelationItem,AnswerItem,QuestionItem,ArticleItem\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\n\nclass ZhihuPipeline(object):\n    def __init__(self, mongo_uri, mongo_db):\n        self.mongo_uri = mongo_uri\n        self.mongo_db = mongo_db\n\n    @classmethod\n    def from_crawler(cls,crawler):\n        return cls(\n                mongo_uri = crawler.settings.get('MONGO_URI'),\n                mongo_db = crawler.settings.get('MONGO_DATABASE','zhihu')\n            )\n\n    def open_spider(self,spider):\n        self.client = pymongo.MongoClient(self.mongo_uri)\n        self.db = self.client[self.mongo_db]\n\n    def close_spider(self,spider):\n        self.client.close()\n\n    def process_item(self, item, spider):\n        if isinstance(item, ZhihuItem):\n            self._process_user_item(item)\n        elif isinstance(item, AnswerItem):\n            self._process_answer_item(item)\n        elif isinstance(item, QuestionItem):\n            self._process_question_item(item)\n        elif isinstance(item, ArticleItem):\n            self._process_article_item(item)\n        else:\n            #pdb.set_trace()\n            self._process_relation_item(item)\n        return item\n\n    def _process_user_item(self,item):\n        self.db.UserInfo.insert(dict(item))\n\n    def _process_relation_item(self,item):\n        try:\n            isnext,relation_type = item['relation_type'].split(':')\n            if isnext == 'next':\n                for one in item['relations_id']:\n                    #pdb.set_trace()\n                    self.db.Relation.update({'user_id':item['user_id'],'relation_type':relation_type},{\"$push\":{'relations_id':one}})\n        except:\n            
self.db.Relation.insert(dict(item))\n\n    def _process_answer_item(self,item):\n        self.db.AnswerInfo.insert(dict(item))\n\n    def _process_question_item(self,item):\n        self.db.QuestionInfo.insert(dict(item))\n\n    def _process_article_item(self,item):\n        self.db.ArticleInfo.insert(dict(item))\n"
  },
  {
    "path": "zhihu/zhihu/proxy.py",
    "content": "# encoding=utf-8\nimport telnetlib\nimport urllib\nimport logging\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\nlogger = logging.getLogger(__name__)\nIPPOOLNUM=20 #一次性从网页获取的IP数量\n\ndef GetIPPOOLS(num):\n    #大象代理买的ip,5元20000个，每十个差不多有一个能用\n    IPPOOL=urllib.request.urlopen(\"http://tpv.daxiangdaili.com/ip/?tid=559480480576119&num=\"+str(num)+\"&operator=1&filter=on&protocol=http&category=2&delay=1\").read().decode(\"utf-8\",\"ignore\").split('\\r\\n')\n    '''\n    #自己获取的ip\n    IPPOOLS1=urllib.request.urlopen(\"http://127.0.0.1:8000/?types=0&count=20&country=%E5%9B%BD%E5%86%85\").read().decode(\"utf-8\",'ignore')\n    IPPOOLS2=re.findall('\\\"(\\d+\\.\\d+\\.\\d+\\.\\d+\\\"\\,\\s*\\d+)',IPPOOLS1)\n    IPPOOL=[i.replace('\", ',':') for i in IPPOOLS2]\n    '''\n    return IPPOOL\n\ndef initIPPOOLS(rconn):\n    \"\"\"把有效的IP存入\tREDIS数据库\"\"\"\n\n    ipNum=len(rconn.keys('IP*'))\n    if ipNum<IPPOOLNUM:\n        IPPOOLS=GetIPPOOLS(IPPOOLNUM)\n        for ipall in IPPOOLS:\n            try:\n                ip=ipall.split(':')[0]\n                port=ipall.split(':')[1]\n                telnetlib.Telnet(ip,port=port,timeout=2) #检验代理ip是否有效\n            except:\n                logger.warning(\"The ip is not available !( IP:%s )\" % ipall)\n            else:\n                logger.warning(\"Get ip Success!( IP:%s )\" % ipall)\n                rconn.set(\"IP:%s:10\"%(ipall),ipall)     #10 is status\n    else:\n        logger.warning(\"The number of  the IP is %s!\" % str(ipNum))\n\ndef updateIPPOOLS(rconn,ip,status,flag=0): # 0代表对status减一，-1代表减2，1代表加1\n    if int(status) < 1:\n        removeIPPOOLS(rconn,ip,status)\n        return\n    '''update status'''\n    if flag == 1: #+status\n        if int(status) < 10:\n            rconn.delete('IP:'+ ip + ':' + status)\n            
status = int(status) + 1\n            rconn.set(\"IP:%s:%s\"%(ip,str(status)),ip)\n    elif flag == -1:\n        rconn.delete('IP:'+ ip + ':' + status)\n        status = int(status) - 2\n        rconn.set(\"IP:%s:%s\"%(ip,str(status)),ip)\n    else:\n        rconn.delete('IP:'+ ip + ':' + status)\n        status = int(status) - 1\n        rconn.set(\"IP:%s:%s\"%(ip,str(status)),ip)\n\ndef removeIPPOOLS(rconn,ip,status):\n    logger.error(\"IP:%s not available ! System is deleting\" % ip)\n    try:\n        rconn.delete('IP:' + ip + ':' + status)\n    except:\n        pass\n    ipNum=len(rconn.keys('IP*'))\n    logger.warning(\"The number of  the IP is %s!\" % str(ipNum))\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/BloomfilterOnRedis.py",
    "content": "# -*- coding: utf-8 -*-\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\n\nclass SimpleHash(object):\n    def __init__(self, cap, seed):\n        self.cap = cap\n        self.seed = seed\n\n    def hash(self, value):\n        ret = 0\n        for i in range(len(value)):\n            ret += self.seed * ret + ord(value[i])\n        return (self.cap - 1) & ret\n\n\nclass BloomFilter(object):\n    def __init__(self, server, key, blockNum=1):\n        self.bit_size = 1 << 31  # Redis的String类型最大容量为512M，现使用256M\n        self.seeds = [5, 7, 11, 13, 31]\n        # self.seeds = [5, 7, 11, 13, 31, 37, 61]\n        self.server = server\n        self.key = key\n        self.blockNum = blockNum\n        self.hashfunc = []\n        for seed in self.seeds:\n            self.hashfunc.append(SimpleHash(self.bit_size, seed))\n\n    def isContains(self, str_input):\n        if not str_input:\n            return False\n        ret = True\n\n        name = self.key + str(int(str_input[0:2], 16) % self.blockNum)\n        for f in self.hashfunc:\n            loc = f.hash(str_input)\n            ret = ret & self.server.getbit(name, loc)\n        return ret\n\n    def insert(self, str_input):\n        name = self.key + str(int(str_input[0:2], 16) % self.blockNum)\n        for f in self.hashfunc:\n            loc = f.hash(str_input)\n            self.server.setbit(name, loc, 1)"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/__init__.py",
    "content": ""
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/connection.py",
    "content": "import six\n\nfrom scrapy.utils.misc import load_object\n\nfrom . import defaults\n\n\n# Shortcut maps 'setting name' -> 'parmater name'.\nSETTINGS_PARAMS_MAP = {\n    'REDIS_URL': 'url',\n    'REDIS_HOST': 'host',\n    'REDIS_PORT': 'port',\n    'REDIS_ENCODING': 'encoding',\n}\n\n\ndef get_redis_from_settings(settings):\n    \"\"\"Returns a redis client instance from given Scrapy settings object.\n\n    This function uses ``get_client`` to instantiate the client and uses\n    ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You\n    can override them using the ``REDIS_PARAMS`` setting.\n\n    Parameters\n    ----------\n    settings : Settings\n        A scrapy settings object. See the supported settings below.\n\n    Returns\n    -------\n    server\n        Redis client instance.\n\n    Other Parameters\n    ----------------\n    REDIS_URL : str, optional\n        Server connection URL.\n    REDIS_HOST : str, optional\n        Server host.\n    REDIS_PORT : str, optional\n        Server port.\n    REDIS_ENCODING : str, optional\n        Data encoding.\n    REDIS_PARAMS : dict, optional\n        Additional client parameters.\n\n    \"\"\"\n    params = defaults.REDIS_PARAMS.copy()\n    params.update(settings.getdict('REDIS_PARAMS'))\n    # XXX: Deprecate REDIS_* settings.\n    for source, dest in SETTINGS_PARAMS_MAP.items():\n        val = settings.get(source)\n        if val:\n            params[dest] = val\n\n    # Allow ``redis_cls`` to be a path to a class.\n    if isinstance(params.get('redis_cls'), six.string_types):\n        params['redis_cls'] = load_object(params['redis_cls'])\n\n    return get_redis(**params)\n\n\n# Backwards compatible alias.\nfrom_settings = get_redis_from_settings\n\n\ndef get_redis(**kwargs):\n    \"\"\"Returns a redis client instance.\n\n    Parameters\n    ----------\n    redis_cls : class, optional\n        Defaults to ``redis.StrictRedis``.\n    url : str, optional\n        If given, 
``redis_cls.from_url`` is used to instantiate the class.\n    **kwargs\n        Extra parameters to be passed to the ``redis_cls`` class.\n\n    Returns\n    -------\n    server\n        Redis client instance.\n\n    \"\"\"\n    redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)\n    url = kwargs.pop('url', None)\n    if url:\n        return redis_cls.from_url(url, **kwargs)\n    else:\n        return redis_cls(**kwargs)\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/defaults.py",
    "content": "import redis\n\n\n# For standalone use.\nDUPEFILTER_KEY = 'dupefilter:%(timestamp)s'\n\nPIPELINE_KEY = '%(spider)s:items'\n\nREDIS_CLS = redis.StrictRedis\nREDIS_ENCODING = 'utf-8'\n# Sane connection defaults.\nREDIS_PARAMS = {\n    'socket_timeout': 30,\n    'socket_connect_timeout': 30,\n    'retry_on_timeout': True,\n    'encoding': REDIS_ENCODING,\n}\n\nSCHEDULER_QUEUE_KEY = '%(spider)s:requests'\nSCHEDULER_QUEUE_CLASS = 'zhihu.scrapy_redis.queue.PriorityQueue'\nSCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'\nSCHEDULER_DUPEFILTER_CLASS = 'zhihu.scrapy_redis.dupefilter.RFPDupeFilter'\n\nSTART_URLS_KEY = '%(name)s:start_urls'\nSTART_URLS_AS_SET = False\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/dupefilter.py",
    "content": "import logging\nimport time\nimport pdb\nfrom .BloomfilterOnRedis import BloomFilter\nfrom scrapy.dupefilters import BaseDupeFilter\nfrom scrapy.utils.request import request_fingerprint\n\nfrom . import defaults\nfrom .connection import get_redis_from_settings\n\n\nlogger = logging.getLogger(__name__)\n\n\n# TODO: Rename class to RedisDupeFilter.\nclass RFPDupeFilter(BaseDupeFilter):\n    \"\"\"Redis-based request duplicates filter.\n\n    This class can also be used with default Scrapy's scheduler.\n\n    \"\"\"\n\n    logger = logger\n\n    def __init__(self, server, key, debug=False):\n        \"\"\"Initialize the duplicates filter.\n\n        Parameters\n        ----------\n        server : redis.StrictRedis\n            The redis server instance.\n        key : str\n            Redis key Where to store fingerprints.\n        debug : bool, optional\n            Whether to log filtered requests.\n\n        \"\"\"\n        self.server = server\n        self.key = key\n        self.debug = debug\n        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if your are filtering too many urls\n        self.logdupes = True\n\n    @classmethod\n    def from_settings(cls, settings):\n        \"\"\"Returns an instance from given settings.\n\n        This uses by default the key ``dupefilter:<timestamp>``. When using the\n        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as\n        it needs to pass the spider name in the key.\n\n        Parameters\n        ----------\n        settings : scrapy.settings.Settings\n\n        Returns\n        -------\n        RFPDupeFilter\n            A RFPDupeFilter instance.\n\n\n        \"\"\"\n        server = get_redis_from_settings(settings)\n        # XXX: This creates one-time key. needed to support to use this\n        # class as standalone dupefilter with scrapy's default scheduler\n        # if scrapy passes spider on open() method this wouldn't be needed\n        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.\n        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}\n        debug = settings.getbool('DUPEFILTER_DEBUG')\n        return cls(server, key=key, debug=debug)\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        \"\"\"Returns instance from crawler.\n\n        Parameters\n        ----------\n        crawler : scrapy.crawler.Crawler\n\n        Returns\n        -------\n        RFPDupeFilter\n            Instance of RFPDupeFilter.\n\n        \"\"\"\n        return cls.from_settings(crawler.settings)\n\n    def request_seen(self, request):\n        \"\"\"Returns True if request was already seen.\n\n        Parameters\n        ----------\n        request : scrapy.http.Request\n\n        Returns\n        -------\n        bool\n\n        \"\"\"\n        fp = request_fingerprint(request)\n        if self.bf.isContains(fp):\n            return True\n        else:\n            self.bf.insert(fp)\n            return False\n\n    def request_fingerprint(self, request):\n        \"\"\"Returns a fingerprint for a given request.\n\n        Parameters\n        ----------\n        request : scrapy.http.Request\n\n        Returns\n        -------\n        str\n\n        \"\"\"\n        return request_fingerprint(request)\n\n    def close(self, reason=''):\n        \"\"\"Delete data on close. Called by Scrapy's scheduler.\n\n        Parameters\n        ----------\n        reason : str, optional\n\n        \"\"\"\n        self.clear()\n\n    def clear(self):\n        \"\"\"Clears fingerprints data.\"\"\"\n        self.server.delete(self.key)\n\n    def log(self, request, spider):\n        \"\"\"Logs given request.\n\n        Parameters\n        ----------\n        request : scrapy.http.Request\n        spider : scrapy.spiders.Spider\n\n        \"\"\"\n        if self.debug:\n            msg = \"Filtered duplicate request: %(request)s\"\n            self.logger.debug(msg, {'request': request}, extra={'spider': spider})\n        elif self.logdupes:\n            msg = (\"Filtered duplicate request %(request)s\"\n                   \" - no more duplicates will be shown\"\n                   \" (see DUPEFILTER_DEBUG to show all duplicates)\")\n            self.logger.debug(msg, {'request': request}, extra={'spider': spider})\n            self.logdupes = False\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/picklecompat.py",
    "content": "\"\"\"A pickle wrapper module with protocol=-1 by default.\"\"\"\n\ntry:\n    import cPickle as pickle  # PY2\nexcept ImportError:\n    import pickle\n\n\ndef loads(s):\n    return pickle.loads(s)\n\n\ndef dumps(obj):\n    return pickle.dumps(obj, protocol=-1)"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/pipelines.py",
    "content": "from scrapy.utils.misc import load_object\nfrom scrapy.utils.serialize import ScrapyJSONEncoder\nfrom twisted.internet.threads import deferToThread\n\nfrom . import connection, defaults\n\n\ndefault_serialize = ScrapyJSONEncoder().encode\n\n\nclass RedisPipeline(object):\n    \"\"\"Pushes serialized item into a redis list/queue\n\n    Settings\n    --------\n    REDIS_ITEMS_KEY : str\n        Redis key where to store items.\n    REDIS_ITEMS_SERIALIZER : str\n        Object path to serializer function.\n\n    \"\"\"\n\n    def __init__(self, server,\n                 key=defaults.PIPELINE_KEY,\n                 serialize_func=default_serialize):\n        \"\"\"Initialize pipeline.\n\n        Parameters\n        ----------\n        server : StrictRedis\n            Redis client instance.\n        key : str\n            Redis key where to store items.\n        serialize_func : callable\n            Items serializer function.\n\n        \"\"\"\n        self.server = server\n        self.key = key\n        self.serialize = serialize_func\n\n    @classmethod\n    def from_settings(cls, settings):\n        params = {\n            'server': connection.from_settings(settings),\n        }\n        if settings.get('REDIS_ITEMS_KEY'):\n            params['key'] = settings['REDIS_ITEMS_KEY']\n        if settings.get('REDIS_ITEMS_SERIALIZER'):\n            params['serialize_func'] = load_object(\n                settings['REDIS_ITEMS_SERIALIZER']\n            )\n\n        return cls(**params)\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        return cls.from_settings(crawler.settings)\n\n    def process_item(self, item, spider):\n        return deferToThread(self._process_item, item, spider)\n\n    def _process_item(self, item, spider):\n        key = self.item_key(item, spider)\n        data = self.serialize(item)\n        self.server.rpush(key, data)\n        return item\n\n    def item_key(self, item, spider):\n        \"\"\"Returns redis key based on given spider.\n\n        Override this function to use a different key depending on the item\n        and/or spider.\n\n        \"\"\"\n        return self.key % {'spider': spider.name}\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/queue.py",
    "content": "from scrapy.utils.reqser import request_to_dict, request_from_dict\n\nfrom . import picklecompat\n\n\nclass Base(object):\n    \"\"\"Per-spider base queue class\"\"\"\n\n    def __init__(self, server, spider, key, serializer=None):\n        \"\"\"Initialize per-spider redis queue.\n\n        Parameters\n        ----------\n        server : StrictRedis\n            Redis client instance.\n        spider : Spider\n            Scrapy spider instance.\n        key: str\n            Redis key where to put and get messages.\n        serializer : object\n            Serializer object with ``loads`` and ``dumps`` methods.\n\n        \"\"\"\n        if serializer is None:\n            # Backward compatibility.\n            # TODO: deprecate pickle.\n            serializer = picklecompat\n        if not hasattr(serializer, 'loads'):\n            raise TypeError(\"serializer does not implement 'loads' function: %r\"\n                            % serializer)\n        if not hasattr(serializer, 'dumps'):\n            raise TypeError(\"serializer '%s' does not implement 'dumps' function: %r\"\n                            % serializer)\n\n        self.server = server\n        self.spider = spider\n        self.key = key % {'spider': spider.name}\n        self.serializer = serializer\n\n    def _encode_request(self, request):\n        \"\"\"Encode a request object\"\"\"\n        obj = request_to_dict(request, self.spider)\n        return self.serializer.dumps(obj)\n\n    def _decode_request(self, encoded_request):\n        \"\"\"Decode an request previously encoded\"\"\"\n        obj = self.serializer.loads(encoded_request)\n        return request_from_dict(obj, self.spider)\n\n    def __len__(self):\n        \"\"\"Return the length of the queue\"\"\"\n        raise NotImplementedError\n\n    def push(self, request):\n        \"\"\"Push a request\"\"\"\n        raise NotImplementedError\n\n    def pop(self, timeout=0):\n        \"\"\"Pop a request\"\"\"\n        raise NotImplementedError\n\n    def clear(self):\n        \"\"\"Clear queue/stack\"\"\"\n        self.server.delete(self.key)\n\n\nclass FifoQueue(Base):\n    \"\"\"Per-spider FIFO queue\"\"\"\n\n    def __len__(self):\n        \"\"\"Return the length of the queue\"\"\"\n        return self.server.llen(self.key)\n\n    def push(self, request):\n        \"\"\"Push a request\"\"\"\n        self.server.lpush(self.key, self._encode_request(request))\n\n    def pop(self, timeout=0):\n        \"\"\"Pop a request\"\"\"\n        if timeout > 0:\n            data = self.server.brpop(self.key, timeout)\n            if isinstance(data, tuple):\n                data = data[1]\n        else:\n            data = self.server.rpop(self.key)\n        if data:\n            return self._decode_request(data)\n\n\nclass PriorityQueue(Base):\n    \"\"\"Per-spider priority queue abstraction using redis' sorted set\"\"\"\n\n    def __len__(self):\n        \"\"\"Return the length of the queue\"\"\"\n        return self.server.zcard(self.key)\n\n    def push(self, request):\n        \"\"\"Push a request\"\"\"\n        data = self._encode_request(request)\n        score = -request.priority\n        # We don't use zadd method as the order of arguments change depending on\n        # whether the class is Redis or StrictRedis, and the option of using\n        # kwargs only accepts strings, not bytes.\n        self.server.execute_command('ZADD', self.key, score, data)\n\n    def pop(self, timeout=0):\n        \"\"\"\n        Pop a request\n        timeout not support in this queue class\n        \"\"\"\n        # use atomic range/remove using multi/exec\n        pipe = self.server.pipeline()\n        pipe.multi()\n        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)\n        results, count = pipe.execute()\n        if results:\n            return self._decode_request(results[0])\n\n\nclass LifoQueue(Base):\n    \"\"\"Per-spider LIFO queue.\"\"\"\n\n    def __len__(self):\n        \"\"\"Return the length of the stack\"\"\"\n        return self.server.llen(self.key)\n\n    def push(self, request):\n        \"\"\"Push a request\"\"\"\n        self.server.lpush(self.key, self._encode_request(request))\n\n    def pop(self, timeout=0):\n        \"\"\"Pop a request\"\"\"\n        if timeout > 0:\n            data = self.server.blpop(self.key, timeout)\n            if isinstance(data, tuple):\n                data = data[1]\n        else:\n            data = self.server.lpop(self.key)\n\n        if data:\n            return self._decode_request(data)\n\n# TODO: Deprecate the use of these names.\nSpiderQueue = FifoQueue\nSpiderStack = LifoQueue\nSpiderPriorityQueue = PriorityQueue\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/scheduler.py",
    "content": "import importlib\nimport six\n\nfrom scrapy.utils.misc import load_object\n\nfrom . import connection, defaults\n\n\n# TODO: add SCRAPY_JOB support.\nclass Scheduler(object):\n    \"\"\"Redis-based scheduler\n\n    Settings\n    --------\n    SCHEDULER_PERSIST : bool (default: False)\n        Whether to persist or clear redis queue.\n    SCHEDULER_FLUSH_ON_START : bool (default: False)\n        Whether to flush redis queue on start.\n    SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0)\n        How many seconds to wait before closing if no message is received.\n    SCHEDULER_QUEUE_KEY : str\n        Scheduler redis key.\n    SCHEDULER_QUEUE_CLASS : str\n        Scheduler queue class.\n    SCHEDULER_DUPEFILTER_KEY : str\n        Scheduler dupefilter redis key.\n    SCHEDULER_DUPEFILTER_CLASS : str\n        Scheduler dupefilter class.\n    SCHEDULER_SERIALIZER : str\n        Scheduler serializer.\n\n    \"\"\"\n\n    def __init__(self, server,\n                 persist=False,\n                 flush_on_start=False,\n                 queue_key=defaults.SCHEDULER_QUEUE_KEY,\n                 queue_cls=defaults.SCHEDULER_QUEUE_CLASS,\n                 dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,\n                 dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,\n                 idle_before_close=0,\n                 serializer=None):\n        \"\"\"Initialize scheduler.\n\n        Parameters\n        ----------\n        server : Redis\n            The redis server instance.\n        persist : bool\n            Whether to flush requests when closing. Default is False.\n        flush_on_start : bool\n            Whether to flush requests on start. Default is False.\n        queue_key : str\n            Requests queue key.\n        queue_cls : str\n            Importable path to the queue class.\n        dupefilter_key : str\n            Duplicates filter key.\n        dupefilter_cls : str\n            Importable path to the dupefilter class.\n        idle_before_close : int\n            Timeout before giving up.\n\n        \"\"\"\n        if idle_before_close < 0:\n            raise TypeError(\"idle_before_close cannot be negative\")\n\n        self.server = server\n        self.persist = persist\n        self.flush_on_start = flush_on_start\n        self.queue_key = queue_key\n        self.queue_cls = queue_cls\n        self.dupefilter_cls = dupefilter_cls\n        self.dupefilter_key = dupefilter_key\n        self.idle_before_close = idle_before_close\n        self.serializer = serializer\n        self.stats = None\n\n    def __len__(self):\n        return len(self.queue)\n\n    @classmethod\n    def from_settings(cls, settings):\n        kwargs = {\n            'persist': settings.getbool('SCHEDULER_PERSIST'),\n            'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),\n            'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),\n        }\n\n        # If these values are missing, it means we want to use the defaults.\n        optional = {\n            # TODO: Use custom prefixes for this settings to note that are\n            # specific to scrapy-redis.\n            'queue_key': 'SCHEDULER_QUEUE_KEY',\n            'queue_cls': 'SCHEDULER_QUEUE_CLASS',\n            'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',\n            # We use the default setting name to keep compatibility.\n            'dupefilter_cls': 'DUPEFILTER_CLASS',\n            'serializer': 'SCHEDULER_SERIALIZER',\n        }\n        for name, setting_name in optional.items():\n            val = settings.get(setting_name)\n            if val:\n                kwargs[name] = val\n\n        # Support serializer as a path to a module.\n        if isinstance(kwargs.get('serializer'), six.string_types):\n            kwargs['serializer'] = importlib.import_module(kwargs['serializer'])\n\n        server = connection.from_settings(settings)\n        # Ensure the connection is working.\n        server.ping()\n\n        return cls(server=server, **kwargs)\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        instance = cls.from_settings(crawler.settings)\n        # FIXME: for now, stats are only supported from this constructor\n        instance.stats = crawler.stats\n        return instance\n\n    def open(self, spider):\n        self.spider = spider\n\n        try:\n            self.queue = load_object(self.queue_cls)(\n                server=self.server,\n                spider=spider,\n                key=self.queue_key % {'spider': spider.name},\n                serializer=self.serializer,\n            )\n        except TypeError as e:\n            raise ValueError(\"Failed to instantiate queue class '%s': %s\",\n                             self.queue_cls, e)\n\n        try:\n            self.df = load_object(self.dupefilter_cls)(\n                server=self.server,\n                key=self.dupefilter_key % {'spider': spider.name},\n                debug=spider.settings.getbool('DUPEFILTER_DEBUG'),\n            )\n        except TypeError as e:\n            raise ValueError(\"Failed to instantiate dupefilter class '%s': %s\",\n                             self.dupefilter_cls, e)\n\n        if self.flush_on_start:\n            self.flush()\n        # notice if there are requests already in the queue to resume the crawl\n        if len(self.queue):\n            spider.log(\"Resuming crawl (%d requests scheduled)\" % len(self.queue))\n\n    def close(self, reason):\n        if not self.persist:\n            self.flush()\n\n    def flush(self):\n        self.df.clear()\n        self.queue.clear()\n\n    def enqueue_request(self, request):\n        if not request.dont_filter and self.df.request_seen(request):\n            self.df.log(request, self.spider)\n            return False\n        if self.stats:\n            self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)\n        self.queue.push(request)\n        return True\n\n    def next_request(self):\n        block_pop_timeout = self.idle_before_close\n        request = self.queue.pop(block_pop_timeout)\n        if request and self.stats:\n            self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)\n        return request\n\n    def has_pending_requests(self):\n        return len(self) > 0"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/spiders.py",
    "content": "from scrapy import signals\nfrom scrapy.exceptions import DontCloseSpider\nfrom scrapy.spiders import Spider, CrawlSpider\n\nfrom . import connection, defaults\nfrom .utils import bytes_to_str\n\n\nclass RedisMixin(object):\n    \"\"\"Mixin class to implement reading urls from a redis queue.\"\"\"\n    redis_key = None\n    redis_batch_size = None\n    redis_encoding = None\n\n    # Redis client placeholder.\n    server = None\n\n    def start_requests(self):\n        \"\"\"Returns a batch of start requests from redis.\"\"\"\n        return self.next_requests()\n\n    def setup_redis(self, crawler=None):\n        \"\"\"Setup redis connection and idle signal.\n\n        This should be called after the spider has set its crawler object.\n        \"\"\"\n        if self.server is not None:\n            return\n\n        if crawler is None:\n            # We allow optional crawler argument to keep backwards\n            # compatibility.\n            # XXX: Raise a deprecation warning.\n            crawler = getattr(self, 'crawler', None)\n\n        if crawler is None:\n            raise ValueError(\"crawler is required\")\n\n        settings = crawler.settings\n\n        if self.redis_key is None:\n            self.redis_key = settings.get(\n                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,\n            )\n\n        self.redis_key = self.redis_key % {'name': self.name}\n\n        if not self.redis_key.strip():\n            raise ValueError(\"redis_key must not be empty\")\n\n        if self.redis_batch_size is None:\n            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).\n            self.redis_batch_size = settings.getint(\n                'REDIS_START_URLS_BATCH_SIZE',\n                settings.getint('CONCURRENT_REQUESTS'),\n            )\n\n        try:\n            self.redis_batch_size = int(self.redis_batch_size)\n        except (TypeError, ValueError):\n            raise ValueError(\"redis_batch_size must be an integer\")\n\n        if self.redis_encoding is None:\n            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)\n\n        self.logger.info(\"Reading start URLs from redis key '%(redis_key)s' \"\n                         \"(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s\",\n                         self.__dict__)\n\n        self.server = connection.from_settings(crawler.settings)\n        # The idle signal is called when the spider has no requests left,\n        # that's when we will schedule new requests from redis queue\n        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)\n\n    def next_requests(self):\n        \"\"\"Returns a request to be scheduled or none.\"\"\"\n        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)\n        fetch_one = self.server.spop if use_set else self.server.lpop\n        # XXX: Do we need to use a timeout here?\n        found = 0\n        # TODO: Use redis pipeline execution.\n        while found < self.redis_batch_size:\n            data = fetch_one(self.redis_key)\n            if not data:\n                # Queue empty.\n                break\n            req = self.make_request_from_data(data)\n            if req:\n                yield req\n                found += 1\n            else:\n                self.logger.debug(\"Request not made from data: %r\", data)\n\n        if found:\n            self.logger.debug(\"Read %s requests from '%s'\", found, self.redis_key)\n\n    def make_request_from_data(self, data):\n        \"\"\"Returns a Request instance from data coming from Redis.\n\n        By default, ``data`` is an encoded URL. You can override this method to\n        provide your own message decoding.\n\n        Parameters\n        ----------\n        data : bytes\n            Message from redis.\n\n        \"\"\"\n        url = bytes_to_str(data, self.redis_encoding)\n        return self.make_requests_from_url(url)\n\n    def schedule_next_requests(self):\n        \"\"\"Schedules a request if available\"\"\"\n        # TODO: While there is capacity, schedule a batch of redis requests.\n        for req in self.next_requests():\n            self.crawler.engine.crawl(req, spider=self)\n\n    def spider_idle(self):\n        \"\"\"Schedules a request if available, otherwise waits.\"\"\"\n        # XXX: Handle a sentinel to close the spider.\n        self.schedule_next_requests()\n        raise DontCloseSpider\n\n\nclass RedisSpider(RedisMixin, Spider):\n    \"\"\"Spider that reads urls from redis queue when idle.\n\n    Attributes\n    ----------\n    redis_key : str (default: REDIS_START_URLS_KEY)\n        Redis key where to fetch start URLs from..\n    redis_batch_size : int (default: CONCURRENT_REQUESTS)\n        Number of messages to fetch from redis on each attempt.\n    redis_encoding : str (default: REDIS_ENCODING)\n        Encoding to use when decoding messages from redis queue.\n\n    Settings\n    --------\n    REDIS_START_URLS_KEY : str (default: \"<spider.name>:start_urls\")\n        Default Redis key where to fetch start URLs from..\n    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)\n        Default number of messages to fetch from redis on each attempt.\n    REDIS_START_URLS_AS_SET : bool (default: False)\n        Use SET operations to retrieve messages from the redis queue. If False,\n        the messages are retrieve using the LPOP command.\n    REDIS_ENCODING : str (default: \"utf-8\")\n        Default encoding to use when decoding messages from redis queue.\n\n    \"\"\"\n\n    @classmethod\n    def from_crawler(self, crawler, *args, **kwargs):\n        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)\n        obj.setup_redis(crawler)\n        return obj\n\n\nclass RedisCrawlSpider(RedisMixin, CrawlSpider):\n    \"\"\"Spider that reads urls from redis queue when idle.\n\n    Attributes\n    ----------\n    redis_key : str (default: REDIS_START_URLS_KEY)\n        Redis key where to fetch start URLs from..\n    redis_batch_size : int (default: CONCURRENT_REQUESTS)\n        Number of messages to fetch from redis on each attempt.\n    redis_encoding : str (default: REDIS_ENCODING)\n        Encoding to use when decoding messages from redis queue.\n\n    Settings\n    --------\n    REDIS_START_URLS_KEY : str (default: \"<spider.name>:start_urls\")\n        Default Redis key where to fetch start URLs from..\n    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)\n        Default number of messages to fetch from redis on each attempt.\n    REDIS_START_URLS_AS_SET : bool (default: True)\n        Use SET operations to retrieve messages from the redis queue.\n    REDIS_ENCODING : str (default: \"utf-8\")\n        Default encoding to use when decoding messages from redis queue.\n\n    \"\"\"\n\n    @classmethod\n    def from_crawler(self, crawler, *args, **kwargs):\n        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)\n        obj.setup_redis(crawler)\n        return obj\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/tests.py",
    "content": "import os\n\nimport mock\nimport redis\n\nfrom scrapy import Request, Spider\nfrom unittest import TestCase\n\nfrom . import connection\nfrom .dupefilter import RFPDupeFilter\nfrom .queue import SpiderQueue, SpiderPriorityQueue, SpiderStack\nfrom .scheduler import Scheduler\n\n\n# allow test settings from environment\nREDIS_HOST = os.environ.get('REDIST_HOST', 'localhost')\nREDIS_PORT = int(os.environ.get('REDIS_PORT', 6379))\n\n\nclass RedisTestMixin(object):\n\n    @property\n    def server(self):\n        if not hasattr(self, '_redis'):\n            self._redis = redis.Redis(REDIS_HOST, REDIS_PORT)\n        return self._redis\n\n    def clear_keys(self, prefix):\n        keys = self.server.keys(prefix + '*')\n        if keys:\n            self.server.delete(*keys)\n\n\nclass DupeFilterTest(RedisTestMixin, TestCase):\n\n    def setUp(self):\n        self.key = 'scrapy_redis:tests:dupefilter:'\n        self.df = RFPDupeFilter(self.server, self.key)\n\n    def tearDown(self):\n        self.clear_keys(self.key)\n\n    def test_dupe_filter(self):\n        req = Request('http://example.com')\n\n        self.assertFalse(self.df.request_seen(req))\n        self.assertTrue(self.df.request_seen(req))\n\n        self.df.close('nothing')\n\n\nclass QueueTestMixin(RedisTestMixin):\n\n    queue_cls = None\n\n    def setUp(self):\n        self.spider = Spider('myspider')\n        self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name\n        self.q = self.queue_cls(self.server, Spider('myspider'), self.key)\n\n    def tearDown(self):\n        self.clear_keys(self.key)\n\n    def test_clear(self):\n        self.assertEqual(len(self.q), 0)\n\n        for i in range(10):\n            # XXX: can't use same url for all requests as SpiderPriorityQueue\n            # uses redis' set implemention and we will end with only one\n            # request in the set and thus failing the test. It should be noted\n            # that when using SpiderPriorityQueue it acts as a request\n            # duplication filter whenever the serielized requests are the same.\n            # This might be unwanted on repetitive requests to the same page\n            # even with dont_filter=True flag.\n            req = Request('http://example.com/?page=%s' % i)\n            self.q.push(req)\n        self.assertEqual(len(self.q), 10)\n\n        self.q.clear()\n        self.assertEqual(len(self.q), 0)\n\n\nclass SpiderQueueTest(QueueTestMixin, TestCase):\n\n    queue_cls = SpiderQueue\n\n    def test_queue(self):\n        req1 = Request('http://example.com/page1')\n        req2 = Request('http://example.com/page2')\n\n        self.q.push(req1)\n        self.q.push(req2)\n\n        out1 = self.q.pop()\n        out2 = self.q.pop()\n\n        self.assertEqual(out1.url, req1.url)\n        self.assertEqual(out2.url, req2.url)\n\n\nclass SpiderPriorityQueueTest(QueueTestMixin, TestCase):\n\n    queue_cls = SpiderPriorityQueue\n\n    def test_queue(self):\n        req1 = Request('http://example.com/page1', priority=100)\n        req2 = Request('http://example.com/page2', priority=50)\n        req3 = Request('http://example.com/page2', priority=200)\n\n        self.q.push(req1)\n        self.q.push(req2)\n        self.q.push(req3)\n\n        out1 = self.q.pop()\n        out2 = self.q.pop()\n        out3 = self.q.pop()\n\n        self.assertEqual(out1.url, req3.url)\n        self.assertEqual(out2.url, req1.url)\n        self.assertEqual(out3.url, req2.url)\n\n\nclass SpiderStackTest(QueueTestMixin, TestCase):\n\n    queue_cls = SpiderStack\n\n    def test_queue(self):\n        req1 = Request('http://example.com/page1')\n        req2 = Request('http://example.com/page2')\n\n        self.q.push(req1)\n        self.q.push(req2)\n\n        out1 = self.q.pop()\n        out2 = self.q.pop()\n\n        self.assertEqual(out1.url, req2.url)\n        self.assertEqual(out2.url, req1.url)\n\n\nclass SchedulerTest(RedisTestMixin, TestCase):\n\n    def setUp(self):\n        self.persist = False\n        self.key_prefix = 'scrapy_redis:tests:'\n        self.queue_key = self.key_prefix + '%(spider)s:requests'\n        self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'\n        self.idle_before_close = 0\n        self.scheduler = Scheduler(self.server, self.persist, self.queue_key,\n                                   SpiderQueue, self.dupefilter_key,\n                                   self.idle_before_close)\n        self.spider = Spider('myspider')\n\n    def tearDown(self):\n        self.clear_keys(self.key_prefix)\n\n    def test_scheduler(self):\n        # default no persist\n        self.assertFalse(self.scheduler.persist)\n\n        self.scheduler.open(self.spider)\n        self.assertEqual(len(self.scheduler), 0)\n\n        req = Request('http://example.com')\n        self.scheduler.enqueue_request(req)\n        self.assertTrue(self.scheduler.has_pending_requests())\n        self.assertEqual(len(self.scheduler), 1)\n\n        # dupefilter in action\n        self.scheduler.enqueue_request(req)\n        self.assertEqual(len(self.scheduler), 1)\n\n        out = self.scheduler.next_request()\n        self.assertEqual(out.url, req.url)\n\n        self.assertFalse(self.scheduler.has_pending_requests())\n        self.assertEqual(len(self.scheduler), 0)\n\n        self.scheduler.close('finish')\n\n    def test_scheduler_persistent(self):\n        # TODO: Improve this test to avoid the need to check for log messages.\n        self.spider.log = mock.Mock(spec=self.spider.log)\n\n        self.scheduler.persist = True\n        self.scheduler.open(self.spider)\n\n        self.assertEqual(self.spider.log.call_count, 0)\n\n        self.scheduler.enqueue_request(Request('http://example.com/page1'))\n        self.scheduler.enqueue_request(Request('http://example.com/page2'))\n\n        self.assertTrue(self.scheduler.has_pending_requests())\n        self.scheduler.close('finish')\n\n        self.scheduler.open(self.spider)\n        self.spider.log.assert_has_calls([\n            mock.call(\"Resuming crawl (2 requests scheduled)\"),\n        ])\n        self.assertEqual(len(self.scheduler), 2)\n\n        self.scheduler.persist = False\n        self.scheduler.close('finish')\n\n        self.assertEqual(len(self.scheduler), 0)\n\n\nclass ConnectionTest(TestCase):\n\n    # We can get a connection from just REDIS_URL.\n    def test_redis_url(self):\n        settings = dict(\n            REDIS_URL = 'redis://foo:bar@localhost:9001/42'\n        )\n\n        server = connection.from_settings(settings)\n        connect_args = server.connection_pool.connection_kwargs\n\n        self.assertEqual(connect_args['host'], 'localhost')\n        self.assertEqual(connect_args['port'], 9001)\n        self.assertEqual(connect_args['password'], 'bar')\n        self.assertEqual(connect_args['db'], 42)\n\n    # We can get a connection from REDIS_HOST/REDIS_PORT.\n    def test_redis_host_port(self):\n        settings = dict(\n            REDIS_HOST = 'localhost',\n            REDIS_PORT = 9001\n        )\n\n        server = connection.from_settings(settings)\n        connect_args = server.connection_pool.connection_kwargs\n\n        self.assertEqual(connect_args['host'], 'localhost')\n        self.assertEqual(connect_args['port'], 9001)\n\n    # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT.\n    def test_redis_url_precedence(self):\n        settings = dict(\n            REDIS_HOST = 'baz',\n            REDIS_PORT = 1337,\n            REDIS_URL = 'redis://foo:bar@localhost:9001/42'\n        )\n\n        server = connection.from_settings(settings)\n        connect_args = server.connection_pool.connection_kwargs\n\n        self.assertEqual(connect_args['host'], 'localhost')\n        self.assertEqual(connect_args['port'], 9001)\n        self.assertEqual(connect_args['password'], 'bar')\n        self.assertEqual(connect_args['db'], 42)\n\n    # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None.\n    def test_redis_host_port_fallback(self):\n        settings = dict(\n            REDIS_HOST = 'baz',\n            REDIS_PORT = 1337,\n            REDIS_URL = None\n        )\n\n        server = connection.from_settings(settings)\n        connect_args = server.connection_pool.connection_kwargs\n\n        self.assertEqual(connect_args['host'], 'baz')\n        self.assertEqual(connect_args['port'], 1337)\n\n    # We use default values for REDIS_HOST/REDIS_PORT.\n    def test_redis_default(self):\n        settings = dict()\n\n        server = connection.from_settings(settings)\n        connect_args = server.connection_pool.connection_kwargs\n\n        self.assertEqual(connect_args['host'], 'localhost')\n        self.assertEqual(connect_args['port'], 6379)\n"
  },
  {
    "path": "zhihu/zhihu/scrapy_redis/utils.py",
    "content": "import six\n\n\ndef bytes_to_str(s, encoding='utf-8'):\n    \"\"\"Returns a str if a bytes object is given.\"\"\"\n    if six.PY3 and isinstance(s, bytes):\n        return s.decode(encoding)\n    return s"
  },
  {
    "path": "zhihu/zhihu/settings.py",
    "content": "# -*- coding: utf-8 -*-\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\nBOT_NAME = 'zhihu'\n\nSPIDER_MODULES = ['zhihu.spiders']\nNEWSPIDER_MODULE = 'zhihu.spiders'\n\n\n\n\nREDIRECT_ENABLED = False\nRETRY_TIMES = 1\nDOWNLOAD_TIMEOUT = 10 #下载超时时间\n\n\n# Crawl responsibly by identifying yourself (and your website) on the user-agent\nUSER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'\n\n\n#分布式配置\nSCHEDULER = \"zhihu.scrapy_redis.scheduler.Scheduler\"\nSCHEDULER_PERSIST = True\nDUPEFILTER_CLASS = \"zhihu.scrapy_redis.dupefilter.RFPDupeFilter\"\n\n\n# 种子队列的信息\nREDIS_URL = None\nREDIS_HOST = '127.0.0.1'\nREDIS_PORT = 6379#6379\nFILTER_URL = None\nFILTER_HOST = '127.0.0.1'\nFILTER_PORT = 6379#6379\nFILTER_DB = 0\n\n\n\nMONGO_URI = 'mongodb://127.0.0.1:27017/'\nMONGO_DATABASE = 'zhihu3'\n\n\nDOWNLOADER_MIDDLEWARES = {\n    'zhihu.middlewares.UserAgentMiddleware': 543,\n    'zhihu.middlewares.CookiesMiddleware': 544,\n    #'zhihu.middlewares.ProxyMiddleware':125,\n    #\"scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware\": 545,\n}\n\n\nITEM_PIPELINES = {\n    'zhihu.pipelines.ZhihuPipeline': 301,\n}\n\n'''\nDOWNLOAD_DELAY = 3\nAUTOTHROTTLE_ENABLED = True\nAUTOTHROTTLE_START_DELAY = 3\nAUTOTHROTTLE_MAX_DELAY = 60\n'''\n\n# Obey robots.txt rules\n#ROBOTSTXT_OBEY = True\n\n# Configure maximum concurrent requests performed by Scrapy (default: 16)\n#CONCURRENT_REQUESTS = 1\n\n# Configure a delay for requests for the same website (default: 0)\n# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay\n# See also autothrottle settings and docs\n#DOWNLOAD_DELAY = 3\n# The download delay setting will honor only one of:\n#CONCURRENT_REQUESTS_PER_DOMAIN = 16\n#CONCURRENT_REQUESTS_PER_IP = 16\n\n# Disable cookies (enabled by default)\n#COOKIES_ENABLED = False\n\n# Disable Telnet Console (enabled by default)\n#TELNETCONSOLE_ENABLED = False\n\n# Override the default request headers:\n#DEFAULT_REQUEST_HEADERS = {\n#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n#   'Accept-Language': 'en',\n#}\n\n# Enable or disable spider middlewares\n# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html\n#SPIDER_MIDDLEWARES = {\n#    'zhihu.middlewares.ZhihuSpiderMiddleware': 543,\n#}\n\n# Enable or disable downloader middlewares\n# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html\n\n\n# Enable or disable extensions\n# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html\n#EXTENSIONS = {\n#    'scrapy.extensions.telnet.TelnetConsole': None,\n#}\n\n# Configure item pipelines\n# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html\n\n\n# Enable and configure the AutoThrottle extension (disabled by default)\n# See http://doc.scrapy.org/en/latest/topics/autothrottle.html\n#AUTOTHROTTLE_ENABLED = True\n# The initial download delay\n#AUTOTHROTTLE_START_DELAY = 5\n# The maximum download delay to be set in case of high latencies\n#AUTOTHROTTLE_MAX_DELAY = 60\n# The average number of requests Scrapy should be sending in parallel to\n# each remote server\n#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0\n# Enable showing throttling stats for every response received:\n#AUTOTHROTTLE_DEBUG = False\n\n# Enable and configure HTTP caching (disabled by default)\n# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings\n#HTTPCACHE_ENABLED = True\n#HTTPCACHE_EXPIRATION_SECS = 0\n#HTTPCACHE_DIR = 'httpcache'\n#HTTPCACHE_IGNORE_HTTP_CODES = []\n#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'\n"
  },
  {
    "path": "zhihu/zhihu/spiders/__init__.py",
    "content": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on how to create and manage\n# your spiders.\n"
  },
  {
    "path": "zhihu/zhihu/spiders/zhihuspider.py",
    "content": "# -*- coding: utf-8 -*-\nimport scrapy\nimport re\nimport pdb\nimport json\nfrom selenium import webdriver\nfrom selenium.webdriver.common.desired_capabilities import DesiredCapabilities\nfrom ..items import ZhihuItem,RelationItem\nfrom scrapy.http import Request,FormRequest\nfrom scrapy_redis.spiders import RedisSpider\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\n\n#zhihuspider1是模拟浏览器爬（速度慢,不建议，仅供学习） zhihuspider0抓包爬（速度快）\nclass ZhihuspiderSpider(RedisSpider):\n#class ZhihuspiderSpider(scrapy.Spider):\n    name = \"zhihuspider1\"\n    #allowed_domains = [\"zhihu.com\"]\n    host = 'https://www.zhihu.com'\n    redis_key = \"zhihuspider:start_urls\"\n    #start_urls = ['https://www.zhihu.com/people/yun-he-shu-ju-8/answers']\n    strat_user_id = ['yun-he-shu-ju-8']\n    #pdb.set_trace()\n    dcap = dict(DesiredCapabilities.PHANTOMJS)\n    dcap[\"phantomjs.page.settings.userAgent\"] = (\"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0\")\n    dcap[\"phantomjs.page.settings.loadImages\"] = False\n    obj = webdriver.PhantomJS(desired_capabilities=dcap)\n\n\n    def start_requests(self):\n        for one in self.strat_user_id:\n            yield Request('https://www.zhihu.com/people/'+one+'/answers',callback=self.parse,dont_filter=True)\n        #return [Request('https://www.zhihu.com/#signin',callback=self.start_login,meta={'cookiejar':1})] #这个登录已不可用，仅供学习\n\n    def start_login(self,response):\n        xsrf = response.xpath('//input[@name=\"_xsrf\"]/@value').extract_first()\n        return [FormRequest('https://www.zhihu.com/login/phone_num',method='POST',meta={'cookiejar':response.meta['cookiejar']},formdata={\n                #'_xsrf':xsrf,\n                'password':'88888888',\n                'remember_me':\"true\",\n                
'phone_num':'666666'},\n                callback=self.after_login\n                )]\n\n    def after_login(self,response):\n        pdb.set_trace()\n        if json.loads(response.body)['msg'].encode('utf8') == \"登录成功\":\n            self.logger.info(\"登录成功！%s\" % str(response.meta['cookiejar']))\n            print(\"登录成功！\")\n            self.obj.add_cookie(response.meta['cookiejar'])\n            for one in self.strat_user_id:\n                yield Request('https://www.zhihu.com/people/'+one+'/answers',meta={'cookiejar':response.meta['cookiejar']},callback=self.parse)\n        else:\n            self.logger.error('登录失败')\n\n    def __del__(self):\n        self.obj.quit()\n\n    def parse(self, response):\n        item = ZhihuItem()\n        name = response.xpath('//span[@class=\"ProfileHeader-name\"]/text()').extract()[0]\n        #pdb.set_trace()\n        user_image_url = response.xpath('//img[@class=\"Avatar Avatar--large UserAvatar-inner\"]/@srcset').extract()[0].replace(' 2x','')\n        user_id = re.findall('people\\/(.*?)\\/',response.url)[0]\n        gender_icon = response.xpath('.//svg[@class=\"Icon Icon--male\" or @class=\"Icon Icon--female\"]/@class').extract()\n        #pdb.set_trace()\n        gender = \"\"\n        if gender_icon:\n            if gender_icon[0] == \"Icon Icon--female\":\n                gender = \"女\"\n            elif gender_icon[0] == \"Icon Icon--male\":\n                gender = \"男\"\n        item['name'] = name\n        item['user_id'] = user_id\n        item['user_image_url'] = user_image_url\n        item['gender'] = gender\n        try:\n            num = response.xpath('//div[@class=\"NumberBoard-value\"]/text()').extract()\n            item['followees_num'] = num[0]\n            item['followers_num'] = num[1]\n            followees_url = response.url.replace('answers','following')\n            followers_url = response.url.replace('answers','followers')\n            relation_item = RelationItem()\n            
relation_item['relations_id'] = []\n            relation_item['user_id'] = user_id\n            relation_item['relation_type'] = 'followees'\n            yield Request(followees_url,callback=self.relations,meta={'page':1,'item':relation_item})\n            relation_item['relation_type'] = 'followers'\n            yield Request(followers_url,callback=self.relations,meta={'page':1,'item':relation_item})\n        except:\n            print(\"需要登录！\")\n\n        self.obj.get(response.url)\n        try:\n            self.obj.find_element_by_class_name('ProfileHeader-expandButton').click()\n            first = self.obj.find_elements_by_xpath('//div[@class=\"ProfileHeader-detailItem\"]')\n            for one in first:\n                label = one.find_element_by_class_name('ProfileHeader-detailLabel').text\n                if label == \"居住地\":\n                    location = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\\n',',')\n                    item['location'] = location\n                elif label == \"所在行业\" or \"行业\":\n                    business = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\\n',',')\n                    item['business'] = business\n                elif label == \"职业经历\":\n                    professional = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\\n',',')\n                    item['professional'] = professional\n                elif label == \"教育经历\":\n                    education = one.find_element_by_class_name('ProfileHeader-detailValue').text.replace('\\n',',')\n                    item['education'] = education\n                else:\n                    pass\n        except:\n            pass\n        yield item\n\n    def relations(self,response):\n        self.obj.get(response.url)\n        followees_a = self.obj.find_elements_by_xpath('//a[@class=\"UserLink-link\"]')\n        #pdb.set_trace()\n        #followees_a = 
response.xpath('//a[@class=\"UserLink-link\"]/@href').extract()\n        followees = []\n        for one in followees_a:\n            try:\n                one = one.get_attribute('href')\n                followees.append(one.replace('https://www.zhihu.com/people/',''))\n            except:\n                pass\n        followees = list(set(followees))\n        #pdb.set_trace()\n        response.meta['item']['relations_id']+=followees\n        nextpage_button = response.xpath('//button[@class=\"Button PaginationButton PaginationButton-next Button--plain\"]').extract()\n        if nextpage_button:\n            #pdb.set_trace()\n            nextpage_url = response.url.replace('?page='+str(response.meta['page']),'') + \"?page=\" + str(response.meta['page']+1)\n            yield Request(nextpage_url,callback=self.relations,meta={'page':response.meta['page']+1,'item':response.meta['item']})\n        else:\n            yield response.meta['item']\n            for user in followees:\n                yield Request('https://www.zhihu.com/people/'+user+'/answers',callback=self.parse)\n"
  },
  {
    "path": "zhihu/zhihu/spiders/zhihuspider0.py",
    "content": "# -*- coding: utf-8 -*-\nimport scrapy\nimport re\nimport pdb\nimport json\nfrom scrapy.http import Request\nfrom ..items import ZhihuItem,RelationItem,AnswerItem,QuestionItem,ArticleItem\nfrom ..scrapy_redis.spiders import RedisSpider\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\nclass Zhihuspider0Spider(RedisSpider):\n    name = 'zhihuspider'\n    redis_key = \"zhihuspider:start_urls\"\n    allowed_domains = ['zhihu.com']\n    start_urls = ['http://zhihu.com/']\n    strat_user_id = ['yun-he-shu-ju-8']\n\n    def start_requests(self):\n        for one in self.strat_user_id:\n            yield Request('https://www.zhihu.com/api/v4/members/'+one+'?include=locations,employments,industry_category,gender,educations,business,follower_count,following_count,description,badge[?(type=best_answerer)].topics',meta={'user_id':one},callback=self.parse)\n            \n\n    def parse(self, response):\n        json_result = str(response.body,encoding=\"utf8\").replace('false','0').replace('true','1')\n        dict_result = eval(json_result)\n        item = ZhihuItem()\n        if dict_result['gender'] == 1:\n            item['gender'] = '男'\n        elif dict_result['gender'] == 0:\n            item['gender'] = '女'\n        else:\n            item['gender'] = '未知'\n        item['user_id'] = dict_result['url_token']\n        item['user_image_url'] = dict_result['avatar_url'][:-6] + 'xl.jpg'\n        item['name'] = dict_result['name']\n        item['locations'] = []\n        for one in dict_result['locations']:\n            item['locations'].append(one['name'])\n        try:\n            item['business'] = dict_result['business']['name']\n        except:\n            try:\n                item['business'] = dict_result['industry_category']\n            except:\n                pass\n\n      
  item['education'] = []\n        for one in dict_result['educations']:\n            try:\n                education = one['school']['name'] + \":\" + one['major']['name']\n            except:\n                try:\n                    education = one['school']['name']\n                except:\n                    pass\n            item['education'].append(education)\n        #pdb.set_trace()\n        item['followees_num'] = dict_result['following_count']\n        item['followers_num'] = dict_result['follower_count']\n        item['employments'] = []\n        for one in dict_result['employments']:\n            try:\n                employment = one['company']['name'] + \":\" + one['job']['name']\n            except:\n                try:\n                    employment = one['company']['name']\n                except:\n                    pass\n            item['employments'].append(employment)\n        #pdb.set_trace()\n        yield item\n        item = RelationItem()\n        one = response.meta['user_id']\n        item['relations_id'] = []\n        item['user_id'] = one\n        item['relation_type'] = ''\n        yield Request('https://www.zhihu.com/api/v4/members/'+one+'/followers?include=data[*].answer_count,badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_relation,meta={'item':item,'offset':0,'relation_type':'followers'})\n        yield Request('https://www.zhihu.com/api/v4/members/'+one+'/followees?include=data[*].answer_count,badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_relation,meta={'item':item,'offset':0,'relation_type':'followees'})\n        yield Request('https://www.zhihu.com/api/v4/members/'+one+'/answers?include=data[*].comment_count,content,voteup_count,created_time,updated_time;data[*].author.badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_answer,meta={'answer_user_id':one,'offset':0})\n        yield 
Request('https://www.zhihu.com/people/'+one+'/asks?page=1',callback=self.parse_question,meta={'ask_user_id':one,'page':1})\n        yield Request('https://www.zhihu.com/api/v4/members/'+one+'/articles?include=data[*].comment_count,content,voteup_count,created,updated;data[*].author.badge[?(type=best_answerer)].topics&limit=20&offset=0',callback=self.parse_article,meta={'author_id':one,'offset':0})\n\n    def parse_relation(self,response):\n        json_result = str(response.body,encoding=\"utf8\").replace('false','0').replace('true','1')\n        dict_result = eval(json_result)\n        relations_id = []\n        for one in dict_result['data']:\n            relations_id.append(one['url_token'])\n        response.meta['item']['relations_id'] = relations_id\n        if response.meta['offset'] == 0:\n            response.meta['item']['relation_type'] = response.meta['relation_type']\n        else:\n            response.meta['item']['relation_type'] = 'next:' + response.meta['relation_type']\n        #pdb.set_trace()\n        yield response.meta['item']\n        for one in response.meta['item']['relations_id']:\n                yield Request('https://www.zhihu.com/api/v4/members/'+one+'?include=locations,employments,industry_category,gender,educations,business,follower_count,following_count,description,badge[?(type=best_answerer)].topics',meta={'user_id':one},callback=self.parse)\n        #pdb.set_trace()\n        if dict_result['paging']['is_end'] == 0:\n            #pdb.set_trace()\n            offset = response.meta['offset'] + 20\n            next_page = re.findall('(.*offset=)\\d+',response.url)[0]\n            #pdb.set_trace()\n            yield Request(next_page + str(offset),callback=self.parse_relation,meta={'item':response.meta['item'],'offset':offset,'relation_type':response.meta['relation_type']})\n\n    def parse_answer(self,response):\n        json_result = str(response.body,encoding=\"utf8\").replace('false','0').replace('true','1')\n        dict_result 
= eval(json_result)\n        for one in dict_result['data']:\n            item = AnswerItem()\n            item['answer_user_id'] = response.meta['answer_user_id']\n            item['answer_id'] = one['id']\n            item['question_id'] = one['question']['id']\n            #pdb.set_trace()\n            item['cretated_time'] = one['created_time']\n            item['updated_time'] = one['updated_time']\n            item['voteup_count'] = one['voteup_count']\n            item['comment_count'] = one['comment_count']\n            item['content'] = one['content']\n            yield item\n        if dict_result['paging']['is_end'] == 0:\n            offset = response.meta['offset'] + 20\n            next_page = re.findall('(.*offset=)\\d+',response.url)[0]\n            yield Request(next_page + str(offset),callback=self.parse_answer,meta={'answer_user_id':response.meta['answer_user_id'],'offset':offset})\n\n    def parse_question(self,response):\n        list_item = response.xpath('//div[@class=\"List-item\"]')\n        for one in list_item:\n            item = QuestionItem()\n            item['ask_user_id'] = response.meta['ask_user_id']\n            title = one.xpath('.//div[@class=\"QuestionItem-title\"]')\n            item['title'] = title.xpath('./a/text()').extract()[0]\n            item['question_id'] = title.xpath('./a/@href').extract()[0].replace('/question/','')\n            content_item = one.xpath('.//div[@class=\"ContentItem-status\"]//span/text()').extract()\n            item['ask_time'] = content_item[0]\n            item['answer_count'] = content_item[1]\n            item['followees_count'] = content_item[2]\n            yield item\n        next_page = response.xpath('//button[@class=\"Button PaginationButton PaginationButton-next Button--plain\"]/text()').extract()\n        if next_page:\n            response.meta['page'] += 1\n            next_url = re.findall('(.*page=)\\d+',response.url)[0] + str(response.meta['page'])\n            yield 
Request(next_url,callback=self.parse_question,meta={'ask_user_id':response.meta['ask_user_id'],'page':response.meta['page']})\n\n    def parse_article(self,response):\n        json_result = str(response.body,encoding=\"utf8\").replace('false','0').replace('true','1')\n        dict_result = eval(json_result)\n        for one in dict_result['data']:\n            item = ArticleItem()\n            item['author_id'] = response.meta['author_id']\n            item['title'] = one['title']\n            item['article_id'] = one['id']\n            item['content'] = one['content']\n            #pdb.set_trace()\n            item['cretated_time'] = one['created']\n            item['updated_time'] = one['updated']\n            item['voteup_count'] = one['voteup_count']\n            item['comment_count'] = one['comment_count']\n            yield item\n        if dict_result['paging']['is_end'] == 0:\n            offset = response.meta['offset'] + 20\n            next_page = re.findall('(.*offset=)\\d+',response.url)[0]\n            yield Request(next_page + str(offset),callback=self.parse_article,meta={'author_id':response.meta['author_id'],'offset':offset})\n"
  },
  {
    "path": "zhihu/zhihu/user_agents_pc.py",
    "content": "#encoding=utf8\n\n# ------------------------------------------\n#   版本：1.0\n#   日期：2017-8-06\n#   作者：AlexTan\n#   <CSDN:   http://blog.csdn.net/alextan_>  \n#   <e-mail: alextanbz@gmail.com>\n# ------------------------------------------\n\nagents = [\n    \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0\",\n    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0\",\n    \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36\",\n    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36\",\n    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36\",\n    \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393\",\n]"
  },
  {
    "path": "zhihu/zhihu/yumdama.py",
    "content": "# encoding=utf-8\nimport http.client, mimetypes, urllib, json, time, requests\nimport pdb\n\n######################################################################\n\n# 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html\n# 所有函数请查询 http://www.yundama.com/apidoc\n\n# 1. http://www.yundama.com/index/reg/developer 注册开发者账号\n# 2. http://www.yundama.com/developer/myapp 添加新软件\n# 3. 使用添加的软件ID和密钥进行开发，享受丰厚分成\n\n# 用户名\nusername = ''\n\n# 密码\npassword = ''\n\n# 软件ＩＤ，开发者分成必要参数。登录开发者后台【我的软件】获得！\nappid = 0000\n\n# 软件密钥，开发者分成必要参数。登录开发者后台【我的软件】获得！\nappkey = ''\n\n# 图片文件\nfilename = 'captcha.png'\n\n# 验证码类型，# 例：1004表示4位字母数字，不同类型收费不同。请准确填写，否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html\ncodetype = 1004\n\n# 超时时间，秒\ntimeout = 60\n\n\n######################################################################\n\nclass YDMHttp():\n\n    apiurl = 'http://api.yundama.com/api.php'\n    username = ''\n    password = ''\n    appid = ''\n    appkey = ''\n\n    def __init__(self, username, password, appid, appkey):\n        self.username = username  \n        self.password = password\n        self.appid = str(appid)\n        self.appkey = appkey\n\n    def request(self, fields, files=[]):\n        response = self.post_url(self.apiurl, fields, files)\n        response = json.loads(response)\n        return response\n    \n    def balance(self):\n        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}\n        response = self.request(data)\n        if (response):\n            if (response['ret'] and response['ret'] < 0):\n                return response['ret']\n            else:\n                return response['balance']\n        else:\n            return -9001\n    \n    def login(self):\n        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}\n        response = self.request(data)\n        if (response):\n            if 
(response['ret'] and response['ret'] < 0):\n                return response['ret']\n            else:\n                return response['uid']\n        else:\n            return -9001\n\n    def upload(self, filename, codetype, timeout):\n        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}\n        file = {'file': filename}\n        response = self.request(data, file)\n        if (response):\n            if (response['ret'] and response['ret'] < 0):\n                return response['ret']\n            else:\n                return response['cid']\n        else:\n            return -9001\n\n    def result(self, cid):\n        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}\n        response = self.request(data)\n        return response and response['text'] or ''\n\n    def decode(self, filename, codetype, timeout):\n        cid = self.upload(filename, codetype, timeout)\n        if (cid > 0):\n            for i in range(0, timeout):\n                result = self.result(cid)\n                if (result != ''):\n                    return cid, result\n                else:\n                    time.sleep(1)\n            return -3003, ''\n        else:\n            return cid, ''\n\n    def post_url(self, url, fields, files=[]):\n        for key in files:\n            files[key] = open(files[key], 'rb');\n        res = requests.post(url, files=files, data=fields)\n        return res.text\n\n\n######################################################################\n\n\ndef identify():\n    if (username == 'username'):\n        print ('请设置好相关参数再测试')\n    else:\n        #pdb.set_trace()\n        # 初始化\n        yundama = YDMHttp(username, password, appid, appkey)\n\n        # 登陆云打码\n        uid = yundama.login()\n        # print 'uid: %s' % uid\n\n  
      # 查询余额\n        balance = yundama.balance()\n        # print 'balance: %s' % balance\n\n        # 开始识别，图片路径，验证码类型ID，超时时间（秒），识别结果\n        cid, result = yundama.decode(filename, codetype, timeout)\n        # print 'cid: %s, result: %s' % (cid, result)\n        return result\n"
  }
]