[
  {
    "path": ".gitignore",
    "content": "*.json\r\n*.gif\r\n*.pyc\r\n"
  },
  {
    "path": "README.md",
    "content": "2016-04-10\r\nScrapy爬虫 - 获取知乎用户数据\r\n======\r\n### 安装Scrapy爬虫框架\r\n\r\n关于如何安装Python以及Scrapy框架，这里不做介绍，请自行网上搜索。\r\n\r\n### 初始化\r\n\r\n安装好Scrapy后，执行 `scrapy startproject myspider`\r\n接下来你会看到 myspider 文件夹，目录结构如下：\r\n\r\n- scrapy.cfg\r\n- myspider\r\n    - items.py\r\n    - pipelines.py\r\n    - settings.py\r\n    - __init__.py\r\n    - spiders\r\n        - __init__.py\r\n\r\n### 编写爬虫文件\r\n\r\n在spiders目录下新建 users.py\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nimport scrapy\r\nimport os\r\nimport time\r\nfrom zhihu.items import UserItem\r\nfrom zhihu.myconfig import UsersConfig # 爬虫配置\r\n\r\nclass UsersSpider(scrapy.Spider):\r\n    name = 'users'\r\n    domain = 'https://www.zhihu.com'\r\n    login_url = 'https://www.zhihu.com/login/email'\r\n    headers = {\r\n        \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\",\r\n        \"Accept-Language\": \"zh-CN,zh;q=0.8\",\r\n        \"Connection\": \"keep-alive\",\r\n        \"Host\": \"www.zhihu.com\",\r\n        \"Upgrade-Insecure-Requests\": \"1\",\r\n        \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36\"\r\n    }\r\n\r\n    def __init__(self, url = None):\r\n        self.user_url = url\r\n\r\n    def start_requests(self):\r\n        yield scrapy.Request(\r\n            url = self.domain,\r\n            headers = self.headers,\r\n            meta = {\r\n                'proxy': UsersConfig['proxy'],\r\n                'cookiejar': 1\r\n            },\r\n            callback = self.request_captcha\r\n        )\r\n\r\n    def request_captcha(self, response):\r\n        # 获取_xsrf值\r\n        _xsrf = response.css('input[name=\"_xsrf\"]::attr(value)').extract()[0]\r\n        # 获取验证码地址\r\n        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)\r\n        # 准备下载验证码\r\n        yield scrapy.Request(\r\n            url = captcha_url,\r\n            headers = self.headers,\r\n            meta = {\r\n                'proxy': UsersConfig['proxy'],\r\n                'cookiejar': response.meta['cookiejar'],\r\n                '_xsrf': _xsrf\r\n            },\r\n            callback = self.download_captcha\r\n        )\r\n\r\n    def download_captcha(self, response):\r\n        # 下载验证码\r\n        with open('captcha.gif', 'wb') as fp:\r\n            fp.write(response.body)\r\n        # 用软件打开验证码图片\r\n        os.system('start captcha.gif')\r\n        # 输入验证码\r\n        print 'Please enter captcha: '\r\n        captcha = raw_input()\r\n\r\n        yield scrapy.FormRequest(\r\n            url = self.login_url,\r\n            headers = self.headers,\r\n            formdata = {\r\n                'email': UsersConfig['email'],\r\n                'password': UsersConfig['password'],\r\n                '_xsrf': response.meta['_xsrf'],\r\n                'remember_me': 'true',\r\n                'captcha': captcha\r\n            },\r\n            meta = {\r\n                'proxy': UsersConfig['proxy'],\r\n                'cookiejar': response.meta['cookiejar']\r\n            },\r\n            callback = self.request_zhihu\r\n        )\r\n\r\n    def request_zhihu(self, response):\r\n        yield scrapy.Request(\r\n            url = self.user_url + '/about',\r\n            headers = self.headers,\r\n            meta = {\r\n                'proxy': UsersConfig['proxy'],\r\n                'cookiejar': response.meta['cookiejar'],\r\n                'from': {\r\n                    'sign': 'else',\r\n                    'data': {}\r\n                }\r\n            },\r\n            callback = self.user_item,\r\n            dont_filter = True\r\n        )\r\n\r\n        yield scrapy.Request(\r\n            url = self.user_url + '/followees',\r\n            headers = self.headers,\r\n            meta = {\r\n                'proxy': UsersConfig['proxy'],\r\n                'cookiejar': response.meta['cookiejar'],\r\n                'from': {\r\n                    'sign': 'else',\r\n                    'data': {}\r\n                }\r\n            },\r\n            callback = self.user_start,\r\n            dont_filter = True\r\n        )\r\n\r\n        yield scrapy.Request(\r\n            url = self.user_url + '/followers',\r\n            headers = self.headers,\r\n            meta = {\r\n                'proxy': UsersConfig['proxy'],\r\n                'cookiejar': response.meta['cookiejar'],\r\n                'from': {\r\n                    'sign': 'else',\r\n                    'data': {}\r\n                }\r\n            },\r\n            callback = self.user_start,\r\n            dont_filter = True\r\n        )\r\n\r\n    def user_start(self, response):\r\n        sel_root = response.xpath('//h2[@class=\"zm-list-content-title\"]')\r\n        # 判断关注列表是否为空\r\n        if len(sel_root):\r\n            for sel in sel_root:\r\n                people_url = sel.xpath('a/@href').extract()[0]\r\n\r\n                yield scrapy.Request(\r\n                    url = people_url + '/about',\r\n                    headers = self.headers,\r\n                    meta = {\r\n                        'proxy': UsersConfig['proxy'],\r\n                        'cookiejar': response.meta['cookiejar'],\r\n                        'from': {\r\n                            'sign': 'else',\r\n                            'data': {}\r\n                        }\r\n                    },\r\n                    callback = self.user_item,\r\n                    dont_filter = True\r\n                )\r\n\r\n                yield scrapy.Request(\r\n                    url = people_url + '/followees',\r\n                    headers = self.headers,\r\n                    meta = {\r\n                        'proxy': UsersConfig['proxy'],\r\n                        'cookiejar': response.meta['cookiejar'],\r\n                        'from': {\r\n                            'sign': 'else',\r\n                            'data': {}\r\n                        }\r\n                    },\r\n                    callback = self.user_start,\r\n                    dont_filter = True\r\n                )\r\n\r\n                yield scrapy.Request(\r\n                    url = people_url + '/followers',\r\n                    headers = self.headers,\r\n                    meta = {\r\n                        'proxy': UsersConfig['proxy'],\r\n                        'cookiejar': response.meta['cookiejar'],\r\n                        'from': {\r\n                            'sign': 'else',\r\n                            'data': {}\r\n                        }\r\n                    },\r\n                    callback = self.user_start,\r\n                    dont_filter = True\r\n                )\r\n\r\n    def user_item(self, response):\r\n        def value(list):\r\n            return list[0] if len(list) else ''\r\n\r\n        sel = response.xpath('//div[@class=\"zm-profile-header ProfileCard\"]')\r\n\r\n        item = UserItem()\r\n        item['url'] = response.url[:-6]\r\n        item['name'] = sel.xpath('//a[@class=\"name\"]/text()').extract()[0].encode('utf-8')\r\n        item['bio'] = value(sel.xpath('//span[@class=\"bio\"]/@title').extract()).encode('utf-8')\r\n        item['location'] = value(sel.xpath('//span[contains(@class, \"location\")]/@title').extract()).encode('utf-8')\r\n        item['business'] = value(sel.xpath('//span[contains(@class, \"business\")]/@title').extract()).encode('utf-8')\r\n        item['gender'] = 0 if sel.xpath('//i[contains(@class, \"icon-profile-female\")]') else 1\r\n        item['avatar'] = value(sel.xpath('//img[@class=\"Avatar Avatar--l\"]/@src').extract())\r\n        item['education'] = value(sel.xpath('//span[contains(@class, \"education\")]/@title').extract()).encode('utf-8')\r\n        item['major'] = value(sel.xpath('//span[contains(@class, \"education-extra\")]/@title').extract()).encode('utf-8')\r\n        item['employment'] = value(sel.xpath('//span[contains(@class, \"employment\")]/@title').extract()).encode('utf-8')\r\n        item['position'] = value(sel.xpath('//span[contains(@class, \"position\")]/@title').extract()).encode('utf-8')\r\n        item['content'] = value(sel.xpath('//span[@class=\"content\"]/text()').extract()).strip().encode('utf-8')\r\n        item['ask'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[2]/span[@class=\"num\"]/text()').extract()[0])\r\n        item['answer'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[3]/span[@class=\"num\"]/text()').extract()[0])\r\n        item['agree'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-agree\"]/strong/text()').extract()[0])\r\n        item['thanks'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-thanks\"]/strong/text()').extract()[0])\r\n\r\n        yield item\r\n```\r\n\r\n### 添加爬虫配置文件\r\n\r\n在myspider目录下新建myconfig.py，并添加以下内容，将你的配置信息填入相应位置\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nUsersConfig = {\r\n    # 代理\r\n    'proxy': '',\r\n\r\n    # 知乎用户名和密码\r\n    'email': 'your email',\r\n    'password': 'your password',\r\n}\r\n\r\nDbConfig = {\r\n    # db config\r\n    'user': 'db user',\r\n    'passwd': 'db password',\r\n    'db': 'db name',\r\n    'host': 'db host',\r\n}\r\n```\r\n\r\n### 修改items.py\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nimport scrapy\r\n\r\nclass UserItem(scrapy.Item):\r\n    # define the fields for your item here like:\r\n    url = scrapy.Field()\r\n    name = scrapy.Field()\r\n    bio = scrapy.Field()\r\n    location = scrapy.Field()\r\n    business = scrapy.Field()\r\n    gender = scrapy.Field()\r\n    avatar = scrapy.Field()\r\n    education = scrapy.Field()\r\n    major = scrapy.Field()\r\n    employment = scrapy.Field()\r\n    position = scrapy.Field()\r\n    content = scrapy.Field()\r\n    ask = scrapy.Field()\r\n    answer = scrapy.Field()\r\n    agree = scrapy.Field()\r\n    thanks = scrapy.Field()\r\n```\r\n\r\n### 将用户数据存入mysql数据库\r\n\r\n修改pipelines.py\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nimport MySQLdb\r\nimport datetime\r\nfrom zhihu.myconfig import DbConfig\r\n\r\nclass UserPipeline(object):\r\n    def __init__(self):\r\n        self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'], db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8', use_unicode = True)\r\n        self.cursor = self.conn.cursor()\r\n        # 清空表\r\n        # self.cursor.execute('truncate table weather;')\r\n        # self.conn.commit()\r\n\r\n    def process_item(self, item, spider):\r\n        curTime = datetime.datetime.now()\r\n        try:\r\n            self.cursor.execute(\r\n                \"\"\"INSERT IGNORE INTO users (url, name, bio, location, business, gender, avatar, education, major, employment, position, content, ask, answer, agree, thanks, create_at)\r\n                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\"\",\r\n                (\r\n                    item['url'],\r\n                    item['name'],\r\n                    item['bio'],\r\n                    item['location'],\r\n                    item['business'],\r\n                    item['gender'],\r\n                    item['avatar'],\r\n                    item['education'],\r\n                    item['major'],\r\n                    item['employment'],\r\n                    item['position'],\r\n                    item['content'],\r\n                    item['ask'],\r\n                    item['answer'],\r\n                    item['agree'],\r\n                    item['thanks'],\r\n                    curTime\r\n                )\r\n            )\r\n            self.conn.commit()\r\n        except MySQLdb.Error, e:\r\n            print 'Error %d %s' % (e.args[0], e.args[1])\r\n\r\n        return item\r\n```\r\n\r\n### 修改settings.py\r\n\r\n找到 `ITEM_PIPELINES`，改为：\r\n```py\r\nITEM_PIPELINES = {\r\n   'myspider.pipelines.UserPipeline': 300,\r\n}\r\n```\r\n\r\n在末尾添加，设置爬虫的深度\r\n```py\r\nDEPTH_LIMIT=10\r\n```\r\n\r\n### 爬取知乎用户数据\r\n\r\n确保MySQL已经打开，在项目根目录下打开终端，\r\n执行 `scrapy crawl users -a url=https://www.zhihu.com/people/<user>`，\r\n其中user为爬虫的第一个用户，之后会根据该用户关注的人和被关注的人进行爬取数据\r\n接下来会下载验证码图片，若未自动打开，请到根目录下打开 captcha.gif，在终端输入验证码\r\n数据爬取Loading...\r\n\r\n### 源码\r\n\r\n源码可以在这里找到 [github](https://github.com/ansenhuang/scrapy-zhihu-users)\r\n"
  },
  {
    "path": "scrapy.cfg",
    "content": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrapyd.readthedocs.org/en/latest/deploy.html\n\n[settings]\ndefault = zhihu.settings\n\n[deploy]\n#url = http://localhost:6800/\nproject = zhihu\n"
  },
  {
    "path": "zhihu/__init__.py",
    "content": ""
  },
  {
    "path": "zhihu/items.py",
    "content": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# http://doc.scrapy.org/en/latest/topics/items.html\n\nimport scrapy\n\n\nclass UserItem(scrapy.Item):\n    # define the fields for your item here like:\n    url = scrapy.Field()\n    name = scrapy.Field()\n    bio = scrapy.Field()\n    location = scrapy.Field()\n    business = scrapy.Field()\n    gender = scrapy.Field()\n    avatar = scrapy.Field()\n    education = scrapy.Field()\n    major = scrapy.Field()\n    employment = scrapy.Field()\n    position = scrapy.Field()\n    content = scrapy.Field()\n    ask = scrapy.Field()\n    answer = scrapy.Field()\n    agree = scrapy.Field()\n    thanks = scrapy.Field()\n"
  },
  {
    "path": "zhihu/myconfig.py",
    "content": "# -*- coding: utf-8 -*-\r\nUsersConfig = {\r\n    # 代理\r\n    'proxy': '',\r\n\r\n    # 知乎用户名和密码\r\n    'email': '',\r\n    'password': '',\r\n}\r\n\r\nDbConfig = {\r\n    # db config\r\n    'user': 'root',\r\n    'passwd': '123456',\r\n    'db': 'zhihu',\r\n    'host': '127.0.0.1',\r\n}\r\n"
  },
  {
    "path": "zhihu/pipelines.py",
    "content": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES setting\n# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html\nimport MySQLdb\nimport datetime\nfrom zhihu.myconfig import DbConfig\n\nclass UserPipeline(object):\n    def __init__(self):\n        self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'], db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8', use_unicode = True)\n        self.cursor = self.conn.cursor()\n        # 清空表\n        # self.cursor.execute('truncate table weather;')\n        # self.conn.commit()\n\n    def process_item(self, item, spider):\n        curTime = datetime.datetime.now()\n        try:\n            self.cursor.execute(\n                \"\"\"INSERT IGNORE INTO users (url, name, bio, location, business, gender, avatar, education, major, employment, position, content, ask, answer, agree, thanks, create_at)\n                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\"\",\n                (\n                    item['url'],\n                    item['name'],\n                    item['bio'],\n                    item['location'],\n                    item['business'],\n                    item['gender'],\n                    item['avatar'],\n                    item['education'],\n                    item['major'],\n                    item['employment'],\n                    item['position'],\n                    item['content'],\n                    item['ask'],\n                    item['answer'],\n                    item['agree'],\n                    item['thanks'],\n                    curTime\n                )\n            )\n            self.conn.commit()\n        except MySQLdb.Error, e:\n            print 'Error %d %s' % (e.args[0], e.args[1])\n\n        return item\n"
  },
  {
    "path": "zhihu/settings.py",
    "content": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for zhihu project\n#\n# For simplicity, this file contains only settings considered important or\n# commonly used. You can find more settings consulting the documentation:\n#\n#     http://doc.scrapy.org/en/latest/topics/settings.html\n#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html\n#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html\n\nBOT_NAME = 'zhihu'\n\nSPIDER_MODULES = ['zhihu.spiders']\nNEWSPIDER_MODULE = 'zhihu.spiders'\n\n\n# Crawl responsibly by identifying yourself (and your website) on the user-agent\n#USER_AGENT = 'zhihu (+http://www.yourdomain.com)'\n\n# Configure maximum concurrent requests performed by Scrapy (default: 16)\n#CONCURRENT_REQUESTS=32\n\n# Configure a delay for requests for the same website (default: 0)\n# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay\n# See also autothrottle settings and docs\n#DOWNLOAD_DELAY=3\n# The download delay setting will honor only one of:\n#CONCURRENT_REQUESTS_PER_DOMAIN=16\n#CONCURRENT_REQUESTS_PER_IP=16\n\n# Disable cookies (enabled by default)\nCOOKIES_ENABLED=True\nCOOKIES_DEBUG=False\n\n# Disable Telnet Console (enabled by default)\n#TELNETCONSOLE_ENABLED=False\n\n# Override the default request headers:\n#DEFAULT_REQUEST_HEADERS = {\n#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n#   'Accept-Language': 'en',\n#}\n\n# Enable or disable spider middlewares\n# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html\n#SPIDER_MIDDLEWARES = {\n#    'zhihu.middlewares.MyCustomSpiderMiddleware': 543,\n#}\n\n# Enable or disable downloader middlewares\n# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html\n#DOWNLOADER_MIDDLEWARES = {\n#    'zhihu.middlewares.MyCustomDownloaderMiddleware': 543,\n#}\n\n# Enable or disable extensions\n# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html\n#EXTENSIONS = {\n#    'scrapy.telnet.TelnetConsole': None,\n#}\n\n# Configure item pipelines\n# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html\nITEM_PIPELINES = {\n   'zhihu.pipelines.UserPipeline': 300,\n}\n\n# Enable and configure the AutoThrottle extension (disabled by default)\n# See http://doc.scrapy.org/en/latest/topics/autothrottle.html\n# NOTE: AutoThrottle will honour the standard settings for concurrency and delay\n#AUTOTHROTTLE_ENABLED=True\n# The initial download delay\n#AUTOTHROTTLE_START_DELAY=5\n# The maximum download delay to be set in case of high latencies\n#AUTOTHROTTLE_MAX_DELAY=60\n# Enable showing throttling stats for every response received:\n#AUTOTHROTTLE_DEBUG=False\n\n# Enable and configure HTTP caching (disabled by default)\n# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings\n#HTTPCACHE_ENABLED=True\n#HTTPCACHE_EXPIRATION_SECS=0\n#HTTPCACHE_DIR='httpcache'\n#HTTPCACHE_IGNORE_HTTP_CODES=[]\n#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'\n\n# Extra\nDEPTH_LIMIT=3+10\n"
  },
  {
    "path": "zhihu/spiders/__init__.py",
    "content": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on how to create and manage\n# your spiders.\n"
  },
  {
    "path": "zhihu/spiders/users.py",
    "content": "# -*- coding: utf-8 -*-\nimport scrapy\nimport os\nimport time\nfrom zhihu.items import UserItem\nfrom zhihu.myconfig import UsersConfig\n\nclass UsersSpider(scrapy.Spider):\n    name = 'users'\n    domain = 'https://www.zhihu.com'\n    login_url = 'https://www.zhihu.com/login/email'\n    headers = {\n        \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\",\n        \"Accept-Language\": \"zh-CN,zh;q=0.8\",\n        \"Connection\": \"keep-alive\",\n        \"Host\": \"www.zhihu.com\",\n        \"Upgrade-Insecure-Requests\": \"1\",\n        \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36\"\n    }\n\n    def __init__(self, url = None):\n        self.user_url = url\n\n    def start_requests(self):\n        yield scrapy.Request(\n            url = self.domain,\n            headers = self.headers,\n            meta = {\n                'proxy': UsersConfig['proxy'],\n                'cookiejar': 1\n            },\n            callback = self.request_captcha\n        )\n\n    def request_captcha(self, response):\n        # 获取_xsrf值\n        _xsrf = response.css('input[name=\"_xsrf\"]::attr(value)').extract()[0]\n        # 获取验证码地址\n        captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)\n        # 准备下载验证码\n        yield scrapy.Request(\n            url = captcha_url,\n            headers = self.headers,\n            meta = {\n                'proxy': UsersConfig['proxy'],\n                'cookiejar': response.meta['cookiejar'],\n                '_xsrf': _xsrf\n            },\n            callback = self.download_captcha\n        )\n\n    def download_captcha(self, response):\n        # 下载验证码\n        with open('captcha.gif', 'wb') as fp:\n            fp.write(response.body)\n        # 用软件打开验证码图片\n        os.system('start captcha.gif')\n        # 输入验证码\n        print 'Please enter captcha: '\n        captcha = raw_input()\n\n        yield scrapy.FormRequest(\n            url = self.login_url,\n            headers = self.headers,\n            formdata = {\n                'email': UsersConfig['email'],\n                'password': UsersConfig['password'],\n                '_xsrf': response.meta['_xsrf'],\n                'remember_me': 'true',\n                'captcha': captcha\n            },\n            meta = {\n                'proxy': UsersConfig['proxy'],\n                'cookiejar': response.meta['cookiejar']\n            },\n            callback = self.request_zhihu\n        )\n\n    def request_zhihu(self, response):\n        yield scrapy.Request(\n            url = self.user_url + '/about',\n            headers = self.headers,\n            meta = {\n                'proxy': UsersConfig['proxy'],\n                'cookiejar': response.meta['cookiejar'],\n                'from': {\n                    'sign': 'else',\n                    'data': {}\n                }\n            },\n            callback = self.user_item,\n            dont_filter = True\n        )\n\n        yield scrapy.Request(\n            url = self.user_url + '/followees',\n            headers = self.headers,\n            meta = {\n                'proxy': UsersConfig['proxy'],\n                'cookiejar': response.meta['cookiejar'],\n                'from': {\n                    'sign': 'else',\n                    'data': {}\n                }\n            },\n            callback = self.user_start,\n            dont_filter = True\n        )\n\n        yield scrapy.Request(\n            url = self.user_url + '/followers',\n            headers = self.headers,\n            meta = {\n                'proxy': UsersConfig['proxy'],\n                'cookiejar': response.meta['cookiejar'],\n                'from': {\n                    'sign': 'else',\n                    'data': {}\n                }\n            },\n            callback = self.user_start,\n            dont_filter = True\n        )\n\n    def user_start(self, response):\n        sel_root = response.xpath('//h2[@class=\"zm-list-content-title\"]')\n        # 判断关注列表是否为空\n        if len(sel_root):\n            for sel in sel_root:\n                people_url = sel.xpath('a/@href').extract()[0]\n\n                yield scrapy.Request(\n                    url = people_url + '/about',\n                    headers = self.headers,\n                    meta = {\n                        'proxy': UsersConfig['proxy'],\n                        'cookiejar': response.meta['cookiejar'],\n                        'from': {\n                            'sign': 'else',\n                            'data': {}\n                        }\n                    },\n                    callback = self.user_item,\n                    dont_filter = True\n                )\n\n                yield scrapy.Request(\n                    url = people_url + '/followees',\n                    headers = self.headers,\n                    meta = {\n                        'proxy': UsersConfig['proxy'],\n                        'cookiejar': response.meta['cookiejar'],\n                        'from': {\n                            'sign': 'else',\n                            'data': {}\n                        }\n                    },\n                    callback = self.user_start,\n                    dont_filter = True\n                )\n\n                yield scrapy.Request(\n                    url = people_url + '/followers',\n                    headers = self.headers,\n                    meta = {\n                        'proxy': UsersConfig['proxy'],\n                        'cookiejar': response.meta['cookiejar'],\n                        'from': {\n                            'sign': 'else',\n                            'data': {}\n                        }\n                    },\n                    callback = self.user_start,\n                    dont_filter = True\n                )\n\n    def user_item(self, response):\n        def value(list):\n            return list[0] if len(list) else ''\n\n        sel = response.xpath('//div[@class=\"zm-profile-header ProfileCard\"]')\n\n        item = UserItem()\n        item['url'] = response.url[:-6]\n        item['name'] = sel.xpath('//a[@class=\"name\"]/text()').extract()[0].encode('utf-8')\n        item['bio'] = value(sel.xpath('//span[@class=\"bio\"]/@title').extract()).encode('utf-8')\n        item['location'] = value(sel.xpath('//span[contains(@class, \"location\")]/@title').extract()).encode('utf-8')\n        item['business'] = value(sel.xpath('//span[contains(@class, \"business\")]/@title').extract()).encode('utf-8')\n        item['gender'] = 0 if sel.xpath('//i[contains(@class, \"icon-profile-female\")]') else 1\n        item['avatar'] = value(sel.xpath('//img[@class=\"Avatar Avatar--l\"]/@src').extract())\n        item['education'] = value(sel.xpath('//span[contains(@class, \"education\")]/@title').extract()).encode('utf-8')\n        item['major'] = value(sel.xpath('//span[contains(@class, \"education-extra\")]/@title').extract()).encode('utf-8')\n        item['employment'] = value(sel.xpath('//span[contains(@class, \"employment\")]/@title').extract()).encode('utf-8')\n        item['position'] = value(sel.xpath('//span[contains(@class, \"position\")]/@title').extract()).encode('utf-8')\n        item['content'] = value(sel.xpath('//span[@class=\"content\"]/text()').extract()).strip().encode('utf-8')\n        item['ask'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[2]/span[@class=\"num\"]/text()').extract()[0])\n        item['answer'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[3]/span[@class=\"num\"]/text()').extract()[0])\n        item['agree'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-agree\"]/strong/text()').extract()[0])\n        item['thanks'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-thanks\"]/strong/text()').extract()[0])\n\n        yield item\n"
  }
]