[ { "path": ".gitignore", "content": "*.json\r\n*.gif\r\n*.pyc\r\n" }, { "path": "README.md", "content": "2016-04-10\r\nScrapy爬虫 - 获取知乎用户数据\r\n======\r\n### 安装Scrapy爬虫框架\r\n\r\n关于如何安装Python以及Scrapy框架，这里不做介绍，请自行网上搜索。\r\n\r\n### 初始化\r\n\r\n安装好Scrapy后，执行 `scrapy startproject myspider`\r\n接下来你会看到 myspider 文件夹，目录结构如下：\r\n\r\n- scrapy.cfg\r\n- myspider\r\n - items.py\r\n - pipelines.py\r\n - settings.py\r\n - __init__.py\r\n - spiders\r\n - __init__.py\r\n\r\n### 编写爬虫文件\r\n\r\n在spiders目录下新建 users.py\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nimport scrapy\r\nimport os\r\nimport time\r\nfrom zhihu.items import UserItem\r\nfrom zhihu.myconfig import UsersConfig # 爬虫配置\r\n\r\nclass UsersSpider(scrapy.Spider):\r\n name = 'users'\r\n domain = 'https://www.zhihu.com'\r\n login_url = 'https://www.zhihu.com/login/email'\r\n headers = {\r\n \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\",\r\n \"Accept-Language\": \"zh-CN,zh;q=0.8\",\r\n \"Connection\": \"keep-alive\",\r\n \"Host\": \"www.zhihu.com\",\r\n \"Upgrade-Insecure-Requests\": \"1\",\r\n \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36\"\r\n }\r\n\r\n def __init__(self, url = None):\r\n self.user_url = url\r\n\r\n def start_requests(self):\r\n yield scrapy.Request(\r\n url = self.domain,\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': 1\r\n },\r\n callback = self.request_captcha\r\n )\r\n\r\n def request_captcha(self, response):\r\n # 获取_xsrf值\r\n _xsrf = response.css('input[name=\"_xsrf\"]::attr(value)').extract()[0]\r\n # 获取验证码地址\r\n captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)\r\n # 准备下载验证码\r\n yield scrapy.Request(\r\n url = captcha_url,\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n '_xsrf': _xsrf\r\n },\r\n callback = self.download_captcha\r\n )\r\n\r\n def download_captcha(self, response):\r\n # 下载验证码\r\n with open('captcha.gif', 'wb') as fp:\r\n fp.write(response.body)\r\n # 用软件打开验证码图片\r\n os.system('start captcha.gif')\r\n # 输入验证码\r\n print 'Please enter captcha: '\r\n captcha = raw_input()\r\n\r\n yield scrapy.FormRequest(\r\n url = self.login_url,\r\n headers = self.headers,\r\n formdata = {\r\n 'email': UsersConfig['email'],\r\n 'password': UsersConfig['password'],\r\n '_xsrf': response.meta['_xsrf'],\r\n 'remember_me': 'true',\r\n 'captcha': captcha\r\n },\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar']\r\n },\r\n callback = self.request_zhihu\r\n )\r\n\r\n def request_zhihu(self, response):\r\n yield scrapy.Request(\r\n url = self.user_url + '/about',\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n 'from': {\r\n 'sign': 'else',\r\n 'data': {}\r\n }\r\n },\r\n callback = self.user_item,\r\n dont_filter = True\r\n )\r\n\r\n yield scrapy.Request(\r\n url = self.user_url + '/followees',\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n 'from': {\r\n 'sign': 'else',\r\n 'data': {}\r\n }\r\n },\r\n callback = self.user_start,\r\n dont_filter = True\r\n )\r\n\r\n yield scrapy.Request(\r\n url = self.user_url + '/followers',\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n 'from': {\r\n 'sign': 'else',\r\n 'data': {}\r\n }\r\n },\r\n callback = self.user_start,\r\n dont_filter = True\r\n )\r\n\r\n def user_start(self, response):\r\n sel_root = response.xpath('//h2[@class=\"zm-list-content-title\"]')\r\n # 判断关注列表是否为空\r\n if len(sel_root):\r\n for sel in sel_root:\r\n people_url = sel.xpath('a/@href').extract()[0]\r\n\r\n yield scrapy.Request(\r\n url = people_url + '/about',\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n 'from': {\r\n 'sign': 'else',\r\n 'data': {}\r\n }\r\n },\r\n callback = self.user_item,\r\n dont_filter = True\r\n )\r\n\r\n yield scrapy.Request(\r\n url = people_url + '/followees',\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n 'from': {\r\n 'sign': 'else',\r\n 'data': {}\r\n }\r\n },\r\n callback = self.user_start,\r\n dont_filter = True\r\n )\r\n\r\n yield scrapy.Request(\r\n url = people_url + '/followers',\r\n headers = self.headers,\r\n meta = {\r\n 'proxy': UsersConfig['proxy'],\r\n 'cookiejar': response.meta['cookiejar'],\r\n 'from': {\r\n 'sign': 'else',\r\n 'data': {}\r\n }\r\n },\r\n callback = self.user_start,\r\n dont_filter = True\r\n )\r\n\r\n def user_item(self, response):\r\n def value(list):\r\n return list[0] if len(list) else ''\r\n\r\n sel = response.xpath('//div[@class=\"zm-profile-header ProfileCard\"]')\r\n\r\n item = UserItem()\r\n item['url'] = response.url[:-6]\r\n item['name'] = sel.xpath('//a[@class=\"name\"]/text()').extract()[0].encode('utf-8')\r\n item['bio'] = value(sel.xpath('//span[@class=\"bio\"]/@title').extract()).encode('utf-8')\r\n item['location'] = value(sel.xpath('//span[contains(@class, \"location\")]/@title').extract()).encode('utf-8')\r\n item['business'] = value(sel.xpath('//span[contains(@class, \"business\")]/@title').extract()).encode('utf-8')\r\n item['gender'] = 0 if sel.xpath('//i[contains(@class, \"icon-profile-female\")]') else 1\r\n item['avatar'] = value(sel.xpath('//img[@class=\"Avatar Avatar--l\"]/@src').extract())\r\n item['education'] = value(sel.xpath('//span[contains(@class, \"education\")]/@title').extract()).encode('utf-8')\r\n item['major'] = value(sel.xpath('//span[contains(@class, \"education-extra\")]/@title').extract()).encode('utf-8')\r\n item['employment'] = value(sel.xpath('//span[contains(@class, \"employment\")]/@title').extract()).encode('utf-8')\r\n item['position'] = value(sel.xpath('//span[contains(@class, \"position\")]/@title').extract()).encode('utf-8')\r\n item['content'] = value(sel.xpath('//span[@class=\"content\"]/text()').extract()).strip().encode('utf-8')\r\n item['ask'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[2]/span[@class=\"num\"]/text()').extract()[0])\r\n item['answer'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[3]/span[@class=\"num\"]/text()').extract()[0])\r\n item['agree'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-agree\"]/strong/text()').extract()[0])\r\n item['thanks'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-thanks\"]/strong/text()').extract()[0])\r\n\r\n yield item\r\n```\r\n\r\n### 添加爬虫配置文件\r\n\r\n在myspider目录下新建myconfig.py，并添加以下内容，将你的配置信息填入相应位置\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nUsersConfig = {\r\n # 代理\r\n 'proxy': '',\r\n\r\n # 知乎用户名和密码\r\n 'email': 'your email',\r\n 'password': 'your password',\r\n}\r\n\r\nDbConfig = {\r\n # db config\r\n 'user': 'db user',\r\n 'passwd': 'db password',\r\n 'db': 'db name',\r\n 'host': 'db host',\r\n}\r\n```\r\n\r\n### 修改items.py\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nimport scrapy\r\n\r\nclass UserItem(scrapy.Item):\r\n # define the fields for your item here like:\r\n url = scrapy.Field()\r\n name = scrapy.Field()\r\n bio = scrapy.Field()\r\n location = scrapy.Field()\r\n business = scrapy.Field()\r\n gender = scrapy.Field()\r\n avatar = scrapy.Field()\r\n education = scrapy.Field()\r\n major = scrapy.Field()\r\n employment = scrapy.Field()\r\n position = scrapy.Field()\r\n content = scrapy.Field()\r\n ask = scrapy.Field()\r\n answer = scrapy.Field()\r\n agree = scrapy.Field()\r\n thanks = scrapy.Field()\r\n```\r\n\r\n### 将用户数据存入mysql数据库\r\n\r\n修改pipelines.py\r\n\r\n```py\r\n# -*- coding: utf-8 -*-\r\nimport MySQLdb\r\nimport datetime\r\nfrom zhihu.myconfig import DbConfig\r\n\r\nclass UserPipeline(object):\r\n def __init__(self):\r\n self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'], db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8', use_unicode = True)\r\n self.cursor = self.conn.cursor()\r\n # 清空表\r\n # self.cursor.execute('truncate table weather;')\r\n # self.conn.commit()\r\n\r\n def process_item(self, item, spider):\r\n curTime = datetime.datetime.now()\r\n try:\r\n self.cursor.execute(\r\n \"\"\"INSERT IGNORE INTO users (url, name, bio, location, business, gender, avatar, education, major, employment, position, content, ask, answer, agree, thanks, create_at)\r\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\"\",\r\n (\r\n item['url'],\r\n item['name'],\r\n item['bio'],\r\n item['location'],\r\n item['business'],\r\n item['gender'],\r\n item['avatar'],\r\n item['education'],\r\n item['major'],\r\n item['employment'],\r\n item['position'],\r\n item['content'],\r\n item['ask'],\r\n item['answer'],\r\n item['agree'],\r\n item['thanks'],\r\n curTime\r\n )\r\n )\r\n self.conn.commit()\r\n except MySQLdb.Error, e:\r\n print 'Error %d %s' % (e.args[0], e.args[1])\r\n\r\n return item\r\n```\r\n\r\n### 修改settings.py\r\n\r\n找到 `ITEM_PIPELINES`，改为：\r\n```py\r\nITEM_PIPELINES = {\r\n 'myspider.pipelines.UserPipeline': 300,\r\n}\r\n```\r\n\r\n在末尾添加，设置爬虫的深度\r\n```py\r\nDEPTH_LIMIT=10\r\n```\r\n\r\n### 爬取知乎用户数据\r\n\r\n确保MySQL已经打开，在项目根目录下打开终端，\r\n执行 `scrapy crawl users -a url=https://www.zhihu.com/people/`，\r\n其中user为爬虫的第一个用户，之后会根据该用户关注的人和被关注的人进行爬取数据\r\n接下来会下载验证码图片，若未自动打开，请到根目录下打开 captcha.gif，在终端输入验证码\r\n数据爬取Loading...\r\n\r\n### 源码\r\n\r\n源码可以在这里找到 [github](https://github.com/ansenhuang/scrapy-zhihu-users)\r\n" }, { "path": "scrapy.cfg", "content": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrapyd.readthedocs.org/en/latest/deploy.html\n\n[settings]\ndefault = zhihu.settings\n\n[deploy]\n#url = http://localhost:6800/\nproject = zhihu\n" }, { "path": "zhihu/__init__.py", "content": "" }, { "path": "zhihu/items.py", "content": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# http://doc.scrapy.org/en/latest/topics/items.html\n\nimport scrapy\n\n\nclass UserItem(scrapy.Item):\n # define the fields for your item here like:\n url = scrapy.Field()\n name = scrapy.Field()\n bio = scrapy.Field()\n location = scrapy.Field()\n business = scrapy.Field()\n gender = scrapy.Field()\n avatar = scrapy.Field()\n education = scrapy.Field()\n major = scrapy.Field()\n employment = scrapy.Field()\n position = scrapy.Field()\n content = scrapy.Field()\n ask = scrapy.Field()\n answer = scrapy.Field()\n agree = scrapy.Field()\n thanks = scrapy.Field()\n" }, { "path": "zhihu/myconfig.py", "content": "# -*- coding: utf-8 -*-\r\nUsersConfig = {\r\n # 代理\r\n 'proxy': '',\r\n\r\n # 知乎用户名和密码\r\n 'email': '',\r\n 'password': '',\r\n}\r\n\r\nDbConfig = {\r\n # db config\r\n 'user': 'root',\r\n 'passwd': '123456',\r\n 'db': 'zhihu',\r\n 'host': '127.0.0.1',\r\n}\r\n" }, { "path": "zhihu/pipelines.py", "content": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES setting\n# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html\nimport MySQLdb\nimport datetime\nfrom zhihu.myconfig import DbConfig\n\nclass UserPipeline(object):\n def __init__(self):\n self.conn = MySQLdb.connect(user = DbConfig['user'], passwd = DbConfig['passwd'], db = DbConfig['db'], host = DbConfig['host'], charset = 'utf8', use_unicode = True)\n self.cursor = self.conn.cursor()\n # 清空表\n # self.cursor.execute('truncate table weather;')\n # self.conn.commit()\n\n def process_item(self, item, spider):\n curTime = datetime.datetime.now()\n try:\n self.cursor.execute(\n \"\"\"INSERT IGNORE INTO users (url, name, bio, location, business, gender, avatar, education, major, employment, position, content, ask, answer, agree, thanks, create_at)\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\"\"\",\n (\n item['url'],\n item['name'],\n item['bio'],\n item['location'],\n item['business'],\n item['gender'],\n item['avatar'],\n item['education'],\n item['major'],\n item['employment'],\n item['position'],\n item['content'],\n item['ask'],\n item['answer'],\n item['agree'],\n item['thanks'],\n curTime\n )\n )\n self.conn.commit()\n except MySQLdb.Error, e:\n print 'Error %d %s' % (e.args[0], e.args[1])\n\n return item\n" }, { "path": "zhihu/settings.py", "content": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for zhihu project\n#\n# For simplicity, this file contains only settings considered important or\n# commonly used. You can find more settings consulting the documentation:\n#\n# http://doc.scrapy.org/en/latest/topics/settings.html\n# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html\n# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html\n\nBOT_NAME = 'zhihu'\n\nSPIDER_MODULES = ['zhihu.spiders']\nNEWSPIDER_MODULE = 'zhihu.spiders'\n\n\n# Crawl responsibly by identifying yourself (and your website) on the user-agent\n#USER_AGENT = 'zhihu (+http://www.yourdomain.com)'\n\n# Configure maximum concurrent requests performed by Scrapy (default: 16)\n#CONCURRENT_REQUESTS=32\n\n# Configure a delay for requests for the same website (default: 0)\n# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay\n# See also autothrottle settings and docs\n#DOWNLOAD_DELAY=3\n# The download delay setting will honor only one of:\n#CONCURRENT_REQUESTS_PER_DOMAIN=16\n#CONCURRENT_REQUESTS_PER_IP=16\n\n# Disable cookies (enabled by default)\nCOOKIES_ENABLED=True\nCOOKIES_DEBUG=False\n\n# Disable Telnet Console (enabled by default)\n#TELNETCONSOLE_ENABLED=False\n\n# Override the default request headers:\n#DEFAULT_REQUEST_HEADERS = {\n# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n# 'Accept-Language': 'en',\n#}\n\n# Enable or disable spider middlewares\n# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html\n#SPIDER_MIDDLEWARES = {\n# 'zhihu.middlewares.MyCustomSpiderMiddleware': 543,\n#}\n\n# Enable or disable downloader middlewares\n# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html\n#DOWNLOADER_MIDDLEWARES = {\n# 'zhihu.middlewares.MyCustomDownloaderMiddleware': 543,\n#}\n\n# Enable or disable extensions\n# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html\n#EXTENSIONS = {\n# 'scrapy.telnet.TelnetConsole': None,\n#}\n\n# Configure item pipelines\n# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html\nITEM_PIPELINES = {\n 'zhihu.pipelines.UserPipeline': 300,\n}\n\n# Enable and configure the AutoThrottle extension (disabled by default)\n# See http://doc.scrapy.org/en/latest/topics/autothrottle.html\n# NOTE: AutoThrottle will honour the standard settings for concurrency and delay\n#AUTOTHROTTLE_ENABLED=True\n# The initial download delay\n#AUTOTHROTTLE_START_DELAY=5\n# The maximum download delay to be set in case of high latencies\n#AUTOTHROTTLE_MAX_DELAY=60\n# Enable showing throttling stats for every response received:\n#AUTOTHROTTLE_DEBUG=False\n\n# Enable and configure HTTP caching (disabled by default)\n# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings\n#HTTPCACHE_ENABLED=True\n#HTTPCACHE_EXPIRATION_SECS=0\n#HTTPCACHE_DIR='httpcache'\n#HTTPCACHE_IGNORE_HTTP_CODES=[]\n#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'\n\n# Extra\nDEPTH_LIMIT=3+10\n" }, { "path": "zhihu/spiders/__init__.py", "content": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on how to create and manage\n# your spiders.\n" }, { "path": "zhihu/spiders/users.py", "content": "# -*- coding: utf-8 -*-\nimport scrapy\nimport os\nimport time\nfrom zhihu.items import UserItem\nfrom zhihu.myconfig import UsersConfig\n\nclass UsersSpider(scrapy.Spider):\n name = 'users'\n domain = 'https://www.zhihu.com'\n login_url = 'https://www.zhihu.com/login/email'\n headers = {\n \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\",\n \"Accept-Language\": \"zh-CN,zh;q=0.8\",\n \"Connection\": \"keep-alive\",\n \"Host\": \"www.zhihu.com\",\n \"Upgrade-Insecure-Requests\": \"1\",\n \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36\"\n }\n\n def __init__(self, url = None):\n self.user_url = url\n\n def start_requests(self):\n yield scrapy.Request(\n url = self.domain,\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': 1\n },\n callback = self.request_captcha\n )\n\n def request_captcha(self, response):\n # 获取_xsrf值\n _xsrf = response.css('input[name=\"_xsrf\"]::attr(value)').extract()[0]\n # 获取验证码地址\n captcha_url = 'http://www.zhihu.com/captcha.gif?r=' + str(time.time() * 1000)\n # 准备下载验证码\n yield scrapy.Request(\n url = captcha_url,\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n '_xsrf': _xsrf\n },\n callback = self.download_captcha\n )\n\n def download_captcha(self, response):\n # 下载验证码\n with open('captcha.gif', 'wb') as fp:\n fp.write(response.body)\n # 用软件打开验证码图片\n os.system('start captcha.gif')\n # 输入验证码\n print 'Please enter captcha: '\n captcha = raw_input()\n\n yield scrapy.FormRequest(\n url = self.login_url,\n headers = self.headers,\n formdata = {\n 'email': UsersConfig['email'],\n 'password': UsersConfig['password'],\n '_xsrf': response.meta['_xsrf'],\n 'remember_me': 'true',\n 'captcha': captcha\n },\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar']\n },\n callback = self.request_zhihu\n )\n\n def request_zhihu(self, response):\n yield scrapy.Request(\n url = self.user_url + '/about',\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n 'from': {\n 'sign': 'else',\n 'data': {}\n }\n },\n callback = self.user_item,\n dont_filter = True\n )\n\n yield scrapy.Request(\n url = self.user_url + '/followees',\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n 'from': {\n 'sign': 'else',\n 'data': {}\n }\n },\n callback = self.user_start,\n dont_filter = True\n )\n\n yield scrapy.Request(\n url = self.user_url + '/followers',\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n 'from': {\n 'sign': 'else',\n 'data': {}\n }\n },\n callback = self.user_start,\n dont_filter = True\n )\n\n def user_start(self, response):\n sel_root = response.xpath('//h2[@class=\"zm-list-content-title\"]')\n # 判断关注列表是否为空\n if len(sel_root):\n for sel in sel_root:\n people_url = sel.xpath('a/@href').extract()[0]\n\n yield scrapy.Request(\n url = people_url + '/about',\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n 'from': {\n 'sign': 'else',\n 'data': {}\n }\n },\n callback = self.user_item,\n dont_filter = True\n )\n\n yield scrapy.Request(\n url = people_url + '/followees',\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n 'from': {\n 'sign': 'else',\n 'data': {}\n }\n },\n callback = self.user_start,\n dont_filter = True\n )\n\n yield scrapy.Request(\n url = people_url + '/followers',\n headers = self.headers,\n meta = {\n 'proxy': UsersConfig['proxy'],\n 'cookiejar': response.meta['cookiejar'],\n 'from': {\n 'sign': 'else',\n 'data': {}\n }\n },\n callback = self.user_start,\n dont_filter = True\n )\n\n def user_item(self, response):\n def value(list):\n return list[0] if len(list) else ''\n\n sel = response.xpath('//div[@class=\"zm-profile-header ProfileCard\"]')\n\n item = UserItem()\n item['url'] = response.url[:-6]\n item['name'] = sel.xpath('//a[@class=\"name\"]/text()').extract()[0].encode('utf-8')\n item['bio'] = value(sel.xpath('//span[@class=\"bio\"]/@title').extract()).encode('utf-8')\n item['location'] = value(sel.xpath('//span[contains(@class, \"location\")]/@title').extract()).encode('utf-8')\n item['business'] = value(sel.xpath('//span[contains(@class, \"business\")]/@title').extract()).encode('utf-8')\n item['gender'] = 0 if sel.xpath('//i[contains(@class, \"icon-profile-female\")]') else 1\n item['avatar'] = value(sel.xpath('//img[@class=\"Avatar Avatar--l\"]/@src').extract())\n item['education'] = value(sel.xpath('//span[contains(@class, \"education\")]/@title').extract()).encode('utf-8')\n item['major'] = value(sel.xpath('//span[contains(@class, \"education-extra\")]/@title').extract()).encode('utf-8')\n item['employment'] = value(sel.xpath('//span[contains(@class, \"employment\")]/@title').extract()).encode('utf-8')\n item['position'] = value(sel.xpath('//span[contains(@class, \"position\")]/@title').extract()).encode('utf-8')\n item['content'] = value(sel.xpath('//span[@class=\"content\"]/text()').extract()).strip().encode('utf-8')\n item['ask'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[2]/span[@class=\"num\"]/text()').extract()[0])\n item['answer'] = int(sel.xpath('//div[contains(@class, \"profile-navbar\")]/a[3]/span[@class=\"num\"]/text()').extract()[0])\n item['agree'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-agree\"]/strong/text()').extract()[0])\n item['thanks'] = int(sel.xpath('//span[@class=\"zm-profile-header-user-thanks\"]/strong/text()').extract()[0])\n\n yield item\n" } ]