[
  {
    "path": ".gitignore",
    "content": "*.pyc\n*.swp\nstorage/\n__pycache__/\n.idea/\n.vscode/"
  },
  {
    "path": "README.md",
    "content": "## ScrapyDouban\n\n[![ScrapyDouban Demo](https://img.youtube.com/vi/Fyrvrb0rqvE/0.jpg)](https://www.youtube.com/watch?v=Fyrvrb0rqvE) [演示视频](https://www.youtube.com/watch?v=Fyrvrb0rqvE)\n\n基于 Python3 的豆瓣电影/豆瓣读书 Scarpy 爬虫，实现封面下载+元数据抓取+评论入库。\n\n维护这个项目的目的是分享一些我在使用 Scrapy 过程中的实践，该项目大概涵盖了 80% 我所使用到的 Scrapy 知识，希望能帮助到正在学习 Scrapy 的朋友，也希望大家在阅读[ Scrapy 官方文档](https://scrapy.readthedocs.io/en/stable/index.html)后食用，但是请注意目前项目所使用版本为 Scrapy 2.5.0。\n\n![Python3](https://www.python.org/static/community_logos/python-powered-w-200x80.png)\n![MySQL](https://www.mysql.com/common/logos/powered-by-mysql-167x86.png)\n![Docker](https://www.docker.com/sites/default/files/horizontal.png)\n\n### Docker\n-------\n项目包含了 douban_scrapyd douban_db douban_adminer 三个容器。\n\ndouban_scrapyd 容器基于 [python:3.9-slim-buster](https://pythonspeed.com/articles/base-image-python-docker-images/)，默认安装的 Python3 库有 scrapy scrapyd pymysql pillow arrow，默认映射端口 6800:6800 以方便用户通过宿主机 IP:6800 访问 scrapyd 管理界面，登陆所需参数，用户名:scrapyd 密码:public。\n\ndouban_db 容器基于 mysql:8，root 密码为 public，默认初始化时导入 docker/mysql/douban.sql 文件到 douban 数据库。\n\ndouban_adminer 容器基于 adminer:4，默认映射端口 8080:8080 以方便用户通过宿主机 IP:8080 访问数据库管理界面，登陆所需参数，服务器:mysql 用户名:root 密码:public。\n\n\n### 项目 SQL\n------\n\n项目所使用的 SQL 文件存放路径为 docker/mysql/douban.sql 。\n\n### 收集流程\n-------\n\n    首先收集 Subject ID --> 然后通过 Subject ID 抓取详情页面，收集元数据 --> 最后通过 Subject ID 来收集评论\n\n### 使用方法\n-------\n    $ git clone https://github.com/baabaaox/ScrapyDouban.git\n    # 构建并运行容器\n    $ cd ./ScrapyDouban/docker\n    $ sudo docker-compose up --build -d\n    # 进入 douban_scrapyd 容器\n    $ sudo docker exec -it douban_scrapyd bash\n    # 进入 scrapy 目录\n    $ cd /srv/ScrapyDouban/scrapy\n    $ scrapy list\n    # 抓取电影数据\n    $ scrapy crawl movie_subject # 收集电影 Subject ID\n    $ scrapy crawl movie_meta # 收集电影元数据\n    $ scrapy crawl movie_comment # 收集电影评论\n    # 抓取书籍数据\n    $ scrapy crawl book_subject # 收集书籍 Subject ID\n    $ scrapy crawl book_meta # 收集书籍元数据\n    $ scrapy crawl 
book_comment # 收集书籍评论\n\n如果你想在测试的时候比较方便的修改代码，你可以把项目所在路径 scrapy 目录挂载到 douban_scrapyd 容器。\n如果你习惯使用 scrapyd 进行操作，可以通过 scrapyd-client 直接将项目部署到 douban_scrapyd 容器。\n\n### 代理 IP\n--------\n\n由于豆瓣的反爬虫机制，现在只能通过代理 IP 来绕过。默认 settings.py 里面并未启用 douban.middlewares.ProxyMiddleware 中间件，如果你真的需要使用豆瓣的数据来进行一些研究，可以去租用付费的代理池。\n\n\n### 图片下载\n--------\n\ndouban.pipelines.CoverPipeline 通过对 spider.name 进行过滤来处理封面下载逻辑，所下载图片文件的保存路径为 douban_scrapyd 容器的 /srv/ScrapyDouban/storage 目录。\n"
  },
  {
    "path": "docker/docker-compose.yml",
    "content": "services:\n  mysql:\n    build: ./mysql\n    container_name: douban_mysql\n    environment:\n      - MYSQL_ROOT_PASSWORD=public\n      - MYSQL_DATABASE=douban\n    command: mysqld --default-authentication-plugin=mysql_native_password\n  adminer:\n    image: adminer:4\n    container_name: douban_adminer\n    ports:\n      - 8080:8080\n    links:\n      - mysql\n  scrapyd:\n    build: ./scrapyd\n    container_name: douban_scrapyd\n    ports:\n      - 6800:6800\n    environment:\n      - TZ=Asia/Chongqing\n      - MYSQL_HOST=mysql\n      - MYSQL_USER=root\n      - MYSQL_PASS=public\n      - MYSQL_DB=douban\n    links:\n      - mysql\n"
  },
  {
    "path": "docker/mysql/Dockerfile",
    "content": "FROM mysql:8\r\n\r\nADD douban.sql /docker-entrypoint-initdb.d\r\n"
  },
  {
    "path": "docker/mysql/douban.sql",
    "content": "-- Adminer 4.6.3 MySQL dump\n\nSET NAMES utf8;\nSET time_zone = '+00:00';\nSET foreign_key_checks = 0;\nSET sql_mode = 'NO_AUTO_VALUE_ON_ZERO';\n\nSET NAMES utf8mb4;\n\nCREATE DATABASE IF NOT EXISTS `douban` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci */;\nUSE `douban`;\n\nDROP TABLE IF EXISTS `books`;\nCREATE TABLE `books` (\n  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,\n  `slug` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `sub_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `alt_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `cover` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,\n  `authors` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `author_intro` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,\n  `translators` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `series` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `publisher` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `publish_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `pages` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `price` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `binding` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `isbn` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `tags` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  
`douban_id` int(10) unsigned NOT NULL DEFAULT '0',\n  `douban_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',\n  `douban_votes` int(10) unsigned NOT NULL DEFAULT '0',\n  `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',\n  `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',\n  PRIMARY KEY (`id`),\n  KEY `books_slug_index` (`slug`),\n  KEY `books_name_index` (`name`),\n  KEY `books_douban_id_index` (`douban_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;\n\n\nDROP TABLE IF EXISTS `comments`;\nCREATE TABLE `comments` (\n  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,\n  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',\n  `douban_comment_id` int(10) unsigned NOT NULL DEFAULT '0',\n  `douban_user_nickname` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `douban_user_avatar` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `douban_user_url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,\n  `votes` int(10) unsigned NOT NULL DEFAULT '0',\n  `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',\n  `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',\n  PRIMARY KEY (`id`),\n  KEY `comments_douban_id_index` (`douban_id`),\n  KEY `comments_douban_comment_id_index` (`douban_comment_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;\n\n\nDROP TABLE IF EXISTS `movies`;\nCREATE TABLE `movies` (\n  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,\n  `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `slug` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `alias` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL 
DEFAULT '',\n  `cover` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `year` smallint(5) unsigned NOT NULL DEFAULT '0',\n  `regions` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `genres` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `languages` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `release_date` date DEFAULT NULL,\n  `official_site` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `directors` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `writers` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `actors` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,\n  `storyline` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,\n  `mins` smallint(5) unsigned NOT NULL DEFAULT '0',\n  `recommend_tip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `tags` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `avg_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',\n  `imdb_id` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',\n  `imdb_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',\n  `imdb_votes` int(10) unsigned NOT NULL DEFAULT '0',\n  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',\n  `douban_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',\n  `douban_votes` int(10) unsigned NOT NULL DEFAULT '0',\n  `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',\n  `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',\n  PRIMARY KEY (`id`),\n  KEY `movies_slug_index` (`slug`),\n  KEY `movies_name_index` (`name`),\n  KEY `movies_imdb_id_index` (`imdb_id`),\n  KEY `movies_douban_id_index` (`douban_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 
COLLATE=utf8mb4_unicode_ci;\n\n\nDROP TABLE IF EXISTS `subjects`;\nCREATE TABLE `subjects` (\n  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,\n  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',\n  `type` enum('movie','book') CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT 'movie',\n  PRIMARY KEY (`id`),\n  UNIQUE KEY `subjects_douban_id_unique` (`douban_id`)\n) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;\n\n\n-- 2018-11-27 16:52:54\n"
  },
  {
    "path": "docker/scrapyd/Dockerfile",
    "content": "FROM python:3.9-slim-buster\n\nARG SCRAPY_VERSION=2.5.0\n\nRUN apt-get update \\\n    && apt-get install -y --no-install-recommends git \\\n    && pip install -i https://mirrors.aliyun.com/pypi/simple/ --upgrade pip \\\n    && pip install -i https://mirrors.aliyun.com/pypi/simple/ scrapy==$SCRAPY_VERSION pymysql==1.0.2 pillow==8.2.0 arrow==1.0.3 \\\n    && pip install -U git+https://github.com/scrapy/scrapyd.git \\\n    && git clone https://github.com/baabaaox/ScrapyDouban.git /srv/ScrapyDouban\n\nCOPY scrapyd.conf /etc/scrapyd/\n\nEXPOSE 6800\n\nCMD [\"scrapyd\"]\n"
  },
  {
    "path": "docker/scrapyd/scrapyd.conf",
    "content": "[scrapyd]\nbind_address = 0.0.0.0\nusername = scrapyd\npassword = public\n"
  },
  {
    "path": "requirements.txt",
    "content": "arrow==1.2.1\nPillow==9.0.0\nPyMySQL==1.0.2\nScrapy==2.5.1\n"
  },
  {
    "path": "scrapy/douban/__init__.py",
    "content": ""
  },
  {
    "path": "scrapy/douban/database.py",
    "content": "import os\n\nimport pymysql\n\nMYSQL_HOST = os.environ.get(\"MYSQL_HOST\", \"localhost\")\nMYSQL_USER = os.environ.get(\"MYSQL_USER\", \"root\")\nMYSQL_PASS = os.environ.get(\"MYSQL_PASS\", \"public\")\nMYSQL_DB = os.environ.get(\"MYSQL_DB\", \"douban\")\n\nconnection = pymysql.connect(\n    host=MYSQL_HOST,\n    user=MYSQL_USER,\n    password=MYSQL_PASS,\n    db=MYSQL_DB,\n    charset=\"utf8mb4\",\n    cursorclass=pymysql.cursors.DictCursor,\n)\n"
  },
  {
    "path": "scrapy/douban/items.py",
    "content": "from scrapy import Field, Item\n\n\nclass Subject(Item):\n    douban_id = Field()\n    type = Field()\n\n\nclass MovieMeta(Item):\n    douban_id = Field()\n    type = Field()\n    cover = Field()\n    name = Field()\n    slug = Field()\n    year = Field()\n    directors = Field()\n    writers = Field()\n    actors = Field()\n    genres = Field()\n    official_site = Field()\n    regions = Field()\n    languages = Field()\n    release_date = Field()\n    mins = Field()\n    alias = Field()\n    imdb_id = Field()\n    douban_id = Field()\n    douban_score = Field()\n    douban_votes = Field()\n    tags = Field()\n    storyline = Field()\n\n\nclass BookMeta(Item):\n    douban_id = Field()\n    slug = Field()\n    name = Field()\n    sub_name = Field()\n    alt_name = Field()\n    cover = Field()\n    summary = Field()\n    authors = Field()\n    author_intro = Field()\n    translators = Field()\n    series = Field()\n    publisher = Field()\n    publish_date = Field()\n    pages = Field()\n    price = Field()\n    binding = Field()\n    isbn = Field()\n    douban_id = Field()\n    douban_score = Field()\n    douban_votes = Field()\n    tags = Field()\n\n\nclass Comment(Item):\n    douban_id = Field()\n    douban_comment_id = Field()\n    douban_user_nickname = Field()\n    douban_user_avatar = Field()\n    douban_user_url = Field()\n    content = Field()\n    votes = Field()\n"
  },
  {
    "path": "scrapy/douban/middlewares.py",
    "content": "# Define here the models for your spider middleware\n#\n# See documentation in:\n# https://docs.scrapy.org/en/latest/topics/spider-middleware.html\n\n# useful for handling different item types with a single interface\nfrom itemadapter import ItemAdapter, is_item\nfrom scrapy import signals\n\n\nclass ProxyMiddleware(object):\n    def process_request(self, request, spider):\n        # curl https://m.douban.com/book/subject/26628811/ -x http://127.0.0.1:8081\n        request.meta[\"proxy\"] = \"http://127.0.0.1:8081\"\n\n\nclass DoubanSpiderMiddleware:\n    # Not all methods need to be defined. If a method is not defined,\n    # scrapy acts as if the spider middleware does not modify the\n    # passed objects.\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        # This method is used by Scrapy to create your spiders.\n        s = cls()\n        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)\n        return s\n\n    def process_spider_input(self, response, spider):\n        # Called for each response that goes through the spider\n        # middleware and into the spider.\n\n        # Should return None or raise an exception.\n        return None\n\n    def process_spider_output(self, response, result, spider):\n        # Called with the results returned from the Spider, after\n        # it has processed the response.\n\n        # Must return an iterable of Request, or item objects.\n        for i in result:\n            yield i\n\n    def process_spider_exception(self, response, exception, spider):\n        # Called when a spider or process_spider_input() method\n        # (from other spider middleware) raises an exception.\n\n        # Should return either None or an iterable of Request or item objects.\n        pass\n\n    def process_start_requests(self, start_requests, spider):\n        # Called with the start requests of the spider, and works\n        # similarly to the process_spider_output() method, except\n  
      # that it doesn’t have a response associated.\n\n        # Must return only requests (not items).\n        for r in start_requests:\n            yield r\n\n    def spider_opened(self, spider):\n        spider.logger.info(\"Spider opened: %s\" % spider.name)\n\n\nclass DoubanDownloaderMiddleware:\n    # Not all methods need to be defined. If a method is not defined,\n    # scrapy acts as if the downloader middleware does not modify the\n    # passed objects.\n\n    @classmethod\n    def from_crawler(cls, crawler):\n        # This method is used by Scrapy to create your spiders.\n        s = cls()\n        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)\n        return s\n\n    def process_request(self, request, spider):\n        # Called for each request that goes through the downloader\n        # middleware.\n\n        # Must either:\n        # - return None: continue processing this request\n        # - or return a Response object\n        # - or return a Request object\n        # - or raise IgnoreRequest: process_exception() methods of\n        #   installed downloader middleware will be called\n        return None\n\n    def process_response(self, request, response, spider):\n        # Called with the response returned from the downloader.\n\n        # Must either;\n        # - return a Response object\n        # - return a Request object\n        # - or raise IgnoreRequest\n        return response\n\n    def process_exception(self, request, exception, spider):\n        # Called when a download handler or a process_request()\n        # (from other downloader middleware) raises an exception.\n\n        # Must either:\n        # - return None: continue processing this exception\n        # - return a Response object: stops process_exception() chain\n        # - return a Request object: stops process_exception() chain\n        pass\n\n    def spider_opened(self, spider):\n        spider.logger.info(\"Spider opened: %s\" % spider.name)\n"
  },
  {
    "path": "scrapy/douban/pipelines.py",
    "content": "import hashlib\nimport logging\n\nfrom scrapy import Request\nfrom scrapy.pipelines.images import ImagesPipeline\nfrom scrapy.utils.misc import arg_to_iter\nfrom scrapy.utils.python import to_bytes\nfrom twisted.internet.defer import DeferredList\n\nimport douban.database as db\nfrom douban.items import BookMeta, Comment, MovieMeta, Subject\n\ncursor = db.connection.cursor()\n\n\nclass DoubanPipeline(object):\n    def get_subject(self, item):\n        sql = \"SELECT id FROM subjects WHERE douban_id=%s\" % item[\"douban_id\"]\n        cursor.execute(sql)\n        return cursor.fetchone()\n\n    def save_subject(self, item):\n        keys = item.keys()\n        values = tuple(item.values())\n        fields = \",\".join(keys)\n        temp = \",\".join([\"%s\"] * len(keys))\n        sql = \"INSERT INTO subjects (%s) VALUES (%s)\" % (fields, temp)\n        cursor.execute(sql, values)\n        return db.connection.commit()\n\n    def get_movie_meta(self, item):\n        sql = \"SELECT id FROM movies WHERE douban_id=%s\" % item[\"douban_id\"]\n        cursor.execute(sql)\n        return cursor.fetchone()\n\n    def save_movie_meta(self, item):\n        keys = item.keys()\n        values = tuple(item.values())\n        fields = \",\".join(keys)\n        temp = \",\".join([\"%s\"] * len(keys))\n        sql = \"INSERT INTO movies (%s) VALUES (%s)\" % (fields, temp)\n        cursor.execute(sql, tuple(i.strip() for i in values))\n        return db.connection.commit()\n\n    def update_movie_meta(self, item):\n        douban_id = item.pop(\"douban_id\")\n        keys = item.keys()\n        values = tuple(item.values())\n        values.append(douban_id)\n        fields = [\"%s=\" % i + \"%s\" for i in keys]\n        sql = \"UPDATE movies SET %s WHERE douban_id=%s\" % (\",\".join(fields), \"%s\")\n        cursor.execute(sql, tuple(i.strip() for i in values))\n        return db.connection.commit()\n\n    def get_book_meta(self, item):\n        sql = \"SELECT id 
FROM books WHERE douban_id=%s\" % item[\"douban_id\"]\n        cursor.execute(sql)\n        return cursor.fetchone()\n\n    def save_book_meta(self, item):\n        keys = item.keys()\n        values = tuple(item.values())\n        fields = \",\".join(keys)\n        temp = \",\".join([\"%s\"] * len(keys))\n        sql = \"INSERT INTO books (%s) VALUES (%s)\" % (fields, temp)\n        cursor.execute(sql, tuple(i.strip() for i in values))\n        return db.connection.commit()\n\n    def update_book_meta(self, item):\n        douban_id = item.pop(\"douban_id\")\n        keys = item.keys()\n        values = tuple(item.values())\n        values.append(douban_id)\n        fields = [\"%s=\" % i + \"%s\" for i in keys]\n        sql = \"UPDATE books SET %s WHERE douban_id=%s\" % (\",\".join(fields), \"%s\")\n        cursor.execute(sql, values)\n        return db.connection.commit()\n\n    def get_comment(self, item):\n        sql = \"SELECT * FROM comments WHERE douban_comment_id=%s\" % item[\"douban_comment_id\"]\n        cursor.execute(sql)\n        return cursor.fetchone()\n\n    def save_comment(self, item):\n        keys = item.keys()\n        values = tuple(item.values())\n        fields = \",\".join(keys)\n        temp = \",\".join([\"%s\"] * len(keys))\n        sql = \"INSERT INTO comments (%s) VALUES (%s)\" % (fields, temp)\n        cursor.execute(sql, values)\n        return db.connection.commit()\n\n    def process_item(self, item, spider):\n        try:\n            if isinstance(item, Subject):\n                \"\"\"\n                subject\n                \"\"\"\n                exist = self.get_subject(item)\n                if not exist:\n                    self.save_subject(item)\n            elif isinstance(item, MovieMeta):\n                \"\"\"\n                meta\n                \"\"\"\n                exist = self.get_movie_meta(item)\n                if not exist:\n                    self.save_movie_meta(item)\n                else:\n       
             self.update_movie_meta(item)\n            elif isinstance(item, BookMeta):\n                \"\"\"\n                meta\n                \"\"\"\n                exist = self.get_book_meta(item)\n                if not exist:\n                    self.save_book_meta(item)\n                else:\n                    self.update_book_meta(item)\n            elif isinstance(item, Comment):\n                \"\"\"\n                comment\n                \"\"\"\n                exist = self.get_comment(item)\n                if not exist:\n                    self.save_comment(item)\n        except Exception as e:\n            logging.warn(item)\n            logging.error(e)\n        return item\n\n\nclass CoverPipeline(ImagesPipeline):\n    def process_item(self, item, spider):\n        if \"meta\" not in spider.name:\n            return item\n        info = self.spiderinfo\n        requests = arg_to_iter(self.get_media_requests(item, info))\n        dlist = [self._process_request(r, info, item) for r in requests]\n        dfd = DeferredList(dlist, consumeErrors=1)\n        return dfd.addCallback(self.item_completed, item, info)\n\n    def file_path(self, request, response=None, info=None, *, item=None):\n        guid = hashlib.sha1(to_bytes(request.url)).hexdigest()\n        return \"%s%s/%s%s/%s.jpg\" % (guid[9], guid[19], guid[29], guid[39], guid)\n\n    def get_media_requests(self, item, info):\n        if item[\"cover\"]:\n            return Request(item[\"cover\"])\n\n    def item_completed(self, results, item, info):\n        image_paths = [x[\"path\"] for ok, x in results if ok]\n        if image_paths:\n            item[\"cover\"] = image_paths[0]\n        else:\n            item[\"cover\"] = \"\"\n        return item\n"
  },
  {
    "path": "scrapy/douban/settings.py",
    "content": "# Scrapy settings for douban project\n#\n# For simplicity, this file contains only settings considered important or\n# commonly used. You can find more settings consulting the documentation:\n#\n#     https://docs.scrapy.org/en/latest/topics/settings.html\n#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html\n#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html\n\nBOT_NAME = \"douban\"\n\nSPIDER_MODULES = [\"douban.spiders\"]\nNEWSPIDER_MODULE = \"douban.spiders\"\n\nLOG_LEVEL = \"DEBUG\"\nIMAGES_STORE = \"../storage/\"\n\n# Crawl responsibly by identifying yourself (and your website) on the user-agent\nUSER_AGENT = (\n    \"Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148\"\n)\n\n# Obey robots.txt rules\nROBOTSTXT_OBEY = False\n\n# Configure maximum concurrent requests performed by Scrapy (default: 16)\n# CONCURRENT_REQUESTS = 32\n\n# Configure a delay for requests for the same website (default: 0)\n# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay\n# See also autothrottle settings and docs\n# DOWNLOAD_DELAY = 3\n# The download delay setting will honor only one of:\n# CONCURRENT_REQUESTS_PER_DOMAIN = 16\nCONCURRENT_REQUESTS_PER_IP = 1\n\n# Disable cookies (enabled by default)\nCOOKIES_ENABLED = True\n\n# Disable Telnet Console (enabled by default)\n# TELNETCONSOLE_ENABLED = False\n\n# Override the default request headers:\n# DEFAULT_REQUEST_HEADERS = {\n#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n#   'Accept-Language': 'en',\n# }\n\n# Enable or disable spider middlewares\n# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html\n# SPIDER_MIDDLEWARES = {\n#    'douban.middlewares.DoubanSpiderMiddleware': 543,\n# }\n\n# Enable or disable downloader middlewares\n# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html\n# DOWNLOADER_MIDDLEWARES = {\n#    
'douban.middlewares.DoubanDownloaderMiddleware': 543,\n# }\n\n# Enable or disable extensions\n# See https://docs.scrapy.org/en/latest/topics/extensions.html\n# EXTENSIONS = {\n#    'scrapy.extensions.telnet.TelnetConsole': None,\n# }\n\n# Configure item pipelines\n# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html\nITEM_PIPELINES = {\n    \"douban.pipelines.CoverPipeline\": 1,\n    \"douban.pipelines.DoubanPipeline\": 300,\n}\n\n# Enable and configure the AutoThrottle extension (disabled by default)\n# See https://docs.scrapy.org/en/latest/topics/autothrottle.html\n# AUTOTHROTTLE_ENABLED = True\n# The initial download delay\n# AUTOTHROTTLE_START_DELAY = 5\n# The maximum download delay to be set in case of high latencies\n# AUTOTHROTTLE_MAX_DELAY = 60\n# The average number of requests Scrapy should be sending in parallel to\n# each remote server\n# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0\n# Enable showing throttling stats for every response received:\n# AUTOTHROTTLE_DEBUG = False\n\n# Enable and configure HTTP caching (disabled by default)\n# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings\n# HTTPCACHE_ENABLED = True\n# HTTPCACHE_EXPIRATION_SECS = 0\n# HTTPCACHE_DIR = 'httpcache'\n# HTTPCACHE_IGNORE_HTTP_CODES = []\n# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'\n"
  },
  {
    "path": "scrapy/douban/spiders/__init__.py",
    "content": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on how to create and manage\n# your spiders.\n"
  },
  {
    "path": "scrapy/douban/spiders/book_comment.py",
    "content": "import json\n\nimport douban.database as db\nfrom douban.items import Comment\nfrom scrapy import Request, Spider\n\ncursor = db.connection.cursor()\n\n\nclass BookCommentSpider(Spider):\n    name = \"book_comment\"\n    allowed_domains = [\"book.douban.com\"]\n\n    def start_requests(self):\n        sql = \"SELECT douban_id FROM books WHERE douban_id NOT IN \\\n            (SELECT douban_id FROM comments GROUP BY douban_id) ORDER BY douban_id DESC\"\n        cursor.execute(sql)\n        books = cursor.fetchall()\n        baseurl = \"https://m.douban.com/rexxar/api/v2/book/%s/interests?count=5&order_by=hot\"\n        referer = \"https://m.douban.com/book/subject/%s/?from=showing\"\n        for book in books:\n            yield Request(\n                baseurl % book[\"douban_id\"], headers={\"Referer\": referer % book[\"douban_id\"]},\n            )\n\n    def parse(self, response):\n        douban_id = response.url.split(\"/\")[-2]\n        items = json.loads(response.body)[\"interests\"]\n        for item in items:\n            comment = Comment()\n            comment[\"douban_id\"] = douban_id\n            comment[\"douban_comment_id\"] = item[\"id\"]\n            comment[\"douban_user_nickname\"] = item[\"user\"][\"name\"]\n            comment[\"douban_user_avatar\"] = item[\"user\"][\"avatar\"]\n            comment[\"douban_user_url\"] = item[\"user\"][\"url\"]\n            comment[\"content\"] = item[\"comment\"]\n            comment[\"votes\"] = item[\"vote_count\"]\n            yield comment\n"
  },
  {
    "path": "scrapy/douban/spiders/book_meta.py",
    "content": "import douban.database as db\nimport douban.util as util\nfrom douban.items import BookMeta\nfrom scrapy import Spider\n\ncursor = db.connection.cursor()\n\n\nclass BookMetaSpider(Spider):\n    name = \"book_meta\"\n    user_agent = \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \\\n                  (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36\"\n    allowed_domains = [\"book.douban.com\"]\n    sql = 'SELECT * FROM subjects WHERE type=\"book\" AND douban_id NOT IN \\\n           (SELECT douban_id FROM books) ORDER BY douban_id'\n    cursor.execute(sql)\n    books = cursor.fetchall()\n    start_urls = (\"https://book.douban.com/subject/%s/\" % i[\"douban_id\"] for i in books)\n\n    def set_douban_id(self, meta, response):\n        meta[\"douban_id\"] = response.url[32:-1]\n        return meta\n\n    def set_cover(self, meta, response):\n        regex = '//img[@rel=\"v:photo\"]/@src'\n        match = response.xpath(regex).get()\n        if match:\n            if match.find(\"default\") == -1:\n                meta[\"cover\"] = match.replace(\"spst\", \"lpst\").replace(\"mpic\", \"lpic\")\n            else:\n                meta[\"cover\"] = \"\"\n        return meta\n\n    def set_slug(self, meta, response):\n        meta[\"slug\"] = util.shorturl(meta[\"douban_id\"])\n        return meta\n\n    def set_name(self, meta, response):\n        regex = \"//title/text()\"\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"name\"] = match[:-5].strip()\n        return meta\n\n    def set_alt_name(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"原作名:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"alt_name\"] = match\n        return meta\n\n    def set_sub_name(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"副标题:\"]][following\\\n-sibling::br]'\n        match = 
response.xpath(regex).get()\n        if match:\n            meta[\"sub_name\"] = match\n        return meta\n\n    def set_author(self, meta, response):\n        regex = '//a[parent::span[child::span[text()=\" 作者\"]]]/text()'\n        matches = response.xpath(regex).getall()\n        if matches:\n            meta[\"authors\"] = \"/\".join((i.strip() for i in matches))\n        return meta\n\n    def set_summary(self, meta, response):\n        regex = '//div[@id=\"link-report\"]//div[@class=\"intro\"]'\n        matches = response.xpath(regex)\n        if matches:\n            items = matches[-1].xpath(\"p/text()\").getall()\n            meta[\"summary\"] = \"\".join((\"<p>%s</p>\" % i for i in items))\n        return meta\n\n    def set_author_intro(self, meta, response):\n        regex = '//div[@class=\"indent \"]//div[@class=\"intro\"]'\n        matches = response.xpath(regex)\n        if matches:\n            items = matches[-1].xpath(\"p/text()\").getall()\n            meta[\"author_intro\"] = \"\".join((\"<p>%s</p>\" % i for i in items))\n        return meta\n\n    def set_translator(self, meta, response):\n        regex = '//a[parent::span[child::span[text()=\" 译者\"]]]/text()'\n        matches = response.xpath(regex).getall()\n        if matches:\n            meta[\"translators\"] = \"/\".join((i.strip() for i in matches))\n        return meta\n\n    def set_series(self, meta, response):\n        regex = '//a[preceding-sibling::span[text()=\"丛书:\"]][following\\\n-sibling::br]/text()'\n        matches = response.xpath(regex).getall()\n        if matches:\n            meta[\"series\"] = \"/\".join((i.strip() for i in matches))\n        return meta\n\n    def set_publisher(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"出版社:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"publisher\"] = match\n        return meta\n\n    def set_publish_date(self, meta, 
response):\n        regex = '//text()[preceding-sibling::span[text()=\"出版年:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"publish_date\"] = match\n        return meta\n\n    def set_pages(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"页数:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"pages\"] = match\n        return meta\n\n    def set_price(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"定价:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"price\"] = match\n        return meta\n\n    def set_binding(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"装帧:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"binding\"] = match\n        return meta\n\n    def set_isbn(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"ISBN:\"]][following\\\n-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"isbn\"] = match\n        return meta\n\n    def set_score(self, meta, response):\n        regex = '//strong[@property=\"v:average\"]/text()'\n        match = response.xpath(regex).get()\n        if match:\n            score = match.strip()\n            if score:\n                meta[\"douban_score\"] = score\n        return meta\n\n    def set_votes(self, meta, response):\n        regex = '//span[@property=\"v:votes\"]/text()'\n        match = response.xpath(regex).get()\n        if match:\n            votes = match.strip()\n            if votes:\n                meta[\"douban_votes\"] = votes\n        return meta\n\n    def set_tags(self, meta, response):\n        regex = '//a[@class=\"  tag\"]/text()'\n        matches = 
response.xpath(regex).getall()\n        if matches:\n            meta[\"tags\"] = \"/\".join((i.strip() for i in matches))\n        return meta\n\n    def parse(self, response):\n        meta = BookMeta()\n        self.set_douban_id(meta, response)\n        self.set_cover(meta, response)\n        self.set_name(meta, response)\n        self.set_sub_name(meta, response)\n        self.set_alt_name(meta, response)\n        self.set_summary(meta, response)\n        self.set_author(meta, response)\n        self.set_author_intro(meta, response)\n        self.set_translator(meta, response)\n        self.set_series(meta, response)\n        self.set_publisher(meta, response)\n        self.set_publish_date(meta, response)\n        self.set_pages(meta, response)\n        self.set_price(meta, response)\n        self.set_binding(meta, response)\n        self.set_isbn(meta, response)\n        self.set_score(meta, response)\n        self.set_votes(meta, response)\n        self.set_tags(meta, response)\n        self.set_slug(meta, response)\n        return meta\n"
  },
  {
    "path": "scrapy/douban/spiders/book_subject.py",
    "content": "import random\nimport string\n\nfrom douban.items import Subject\nfrom scrapy.linkextractors import LinkExtractor\nfrom scrapy.spiders import CrawlSpider, Request, Rule\n\n\nclass BookSubjectSpider(CrawlSpider):\n    name = \"book_subject\"\n    user_agent = \"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko\"\n    allowed_domains = [\"book.douban.com\"]\n    start_urls = [\"https://book.douban.com/subject/26628811/\"]\n    rules = (\n        Rule(\n            LinkExtractor(allow=(\"https://book.douban.com/subject/(\\\\d)+/$\")),\n            callback=\"parse_item\",\n            follow=True,\n            process_request=\"cookie\",\n        ),\n    )\n\n    def cookie(self, request, response):\n        bid = \"\".join(random.choice(string.ascii_letters + string.digits) for x in range(11))\n        request.cookies[\"bid\"] = bid\n        request = request.replace(url=request.url.replace(\"?\", \"/?\"))\n        return request\n\n    def start_requests(self):\n        for url in self.start_urls:\n            bid = \"\".join(random.choice(string.ascii_letters + string.digits) for x in range(11))\n            yield Request(url, cookies={\"bid\": bid})\n\n    def set_douban_id(self, subject, response):\n        subject[\"douban_id\"] = response.url[32:-1]\n        return subject\n\n    def parse_item(self, response):\n        subject = Subject()\n        self.set_douban_id(subject, response)\n        subject[\"type\"] = \"book\"\n        return subject\n"
  },
  {
    "path": "scrapy/douban/spiders/movie_comment.py",
    "content": "import json\r\n\r\nimport douban.database as db\r\nfrom douban.items import Comment\r\nfrom scrapy import Request, Spider\r\n\r\ncursor = db.connection.cursor()\r\n\r\n\r\nclass MovieCommentSpider(Spider):\r\n    name = \"movie_comment\"\r\n    allowed_domains = [\"movie.douban.com\"]\r\n\r\n    def start_requests(self):\r\n        sql = \"SELECT douban_id FROM movies WHERE douban_id NOT IN \\\r\n            (SELECT douban_id FROM comments GROUP BY douban_id) ORDER BY douban_id DESC\"\r\n        cursor.execute(sql)\r\n        movies = cursor.fetchall()\r\n        baseurl = \"https://m.douban.com/rexxar/api/v2/movie/%s/interests?count=5&order_by=hot\"\r\n        referer = \"https://m.douban.com/movie/subject/%s/?from=showing\"\r\n        for movie in movies:\r\n            yield Request(\r\n                baseurl % movie[\"douban_id\"], headers={\"Referer\": referer % movie[\"douban_id\"]},\r\n            )\r\n\r\n    def parse(self, response):\r\n        douban_id = response.url.split(\"/\")[-2]\r\n        items = json.loads(response.body)[\"interests\"]\r\n        for item in items:\r\n            comment = Comment()\r\n            comment[\"douban_id\"] = douban_id\r\n            comment[\"douban_comment_id\"] = item[\"id\"]\r\n            comment[\"douban_user_nickname\"] = item[\"user\"][\"name\"]\r\n            comment[\"douban_user_avatar\"] = item[\"user\"][\"avatar\"]\r\n            comment[\"douban_user_url\"] = item[\"user\"][\"url\"]\r\n            comment[\"content\"] = item[\"comment\"]\r\n            comment[\"votes\"] = item[\"vote_count\"]\r\n            yield comment\r\n"
  },
  {
    "path": "scrapy/douban/spiders/movie_meta.py",
    "content": "import douban.database as db\nimport douban.util as util\nimport douban.validator as validator\nfrom douban.items import MovieMeta\nfrom scrapy import Spider\n\ncursor = db.connection.cursor()\n\n\nclass MovieMetaSpider(Spider):\n    name = \"movie_meta\"\n    allowed_domains = [\"movie.douban.com\"]\n    user_agent = \"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko\"\n    sql = 'SELECT * FROM subjects WHERE type=\"movie\" AND douban_id NOT IN \\\n(SELECT douban_id FROM movies) ORDER BY douban_id DESC'\n    cursor.execute(sql)\n    movies = cursor.fetchall()\n    start_urls = (\"https://movie.douban.com/subject/%s/\" % i[\"douban_id\"] for i in movies)\n\n    def set_douban_id(self, meta, response):\n        meta[\"douban_id\"] = response.url[33:-1]\n        return meta\n\n    def set_type(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"集数:\"]][fo\\\nllowing-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"type\"] = \"tv\"\n        else:\n            meta[\"type\"] = \"movie\"\n        return meta\n\n    def set_cover(self, meta, response):\n        regex = '//img[@rel=\"v:image\"]/@src'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"cover\"] = match.replace(\"s_ratio_poster\", \"l_ratio_poster\")\n        else:\n            meta[\"cover\"] = \"\"\n        return meta\n\n    def set_name(self, meta, response):\n        regex = \"//title/text()\"\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"name\"] = match[:-5].strip()\n        return meta\n\n    def set_slug(self, meta, response):\n        meta[\"slug\"] = util.shorturl(meta[\"douban_id\"])\n        return meta\n\n    def set_year(self, meta, response):\n        regex = '//span[@class=\"year\"]/text()'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"year\"] = 
validator.match_year(match)\n        return meta\n\n    def set_directors(self, meta, response):\n        regex = '//a[@rel=\"v:directedBy\"]/text()'\n        matches = response.xpath(regex).getall()\n        meta[\"directors\"] = validator.process_slash_str(\"/\".join(matches))\n        return meta\n\n    def set_writers(self, meta, response):\n        regex = '//span[preceding-sibling::span[text()=\"编剧\"]]/a/text()'\n        matches = response.xpath(regex).getall()\n        meta[\"writers\"] = validator.process_slash_str(\"/\".join(matches))\n        return meta\n\n    def set_actors(self, meta, response):\n        regex = '//a[@rel=\"v:starring\"]/text()'\n        matches = response.xpath(regex).getall()\n        meta[\"actors\"] = validator.process_slash_str(\"/\".join(matches))\n        return meta\n\n    def set_genres(self, meta, response):\n        regex = '//span[@property=\"v:genre\"]/text()'\n        matches = response.xpath(regex).getall()\n        meta[\"genres\"] = \"/\".join(matches)\n        return meta\n\n    def set_official_site(self, meta, response):\n        regex = '//a[preceding-sibling::span[text()=\"官方网站:\"]][following-si\\\nbling::br]/@href'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"official_site\"] = validator.process_url(match)\n        return meta\n\n    def set_regions(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"制片国家/地区:\"]][fo\\\nllowing-sibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"regions\"] = match\n        return meta\n\n    def set_languages(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"语言:\"]][following-s\\\nibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"languages\"] = match\n        return meta\n\n    def set_release_date(self, meta, response):\n        regex = 
'//span[@property=\"v:initialReleaseDate\"]/@content'\n        match = response.xpath(regex).get()\n        if match:\n            release_date = validator.str_to_date(validator.match_date(match))\n            if release_date:\n                meta[\"release_date\"] = release_date\n        return meta\n\n    def set_runtime(self, meta, response):\n        regex = '//span[@property=\"v:runtime\"]/@content'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"mins\"] = match\n        return meta\n\n    def set_alias(self, meta, response):\n        regex = '//text()[preceding-sibling::span[text()=\"又名:\"]][following-s\\\nibling::br]'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"alias\"] = validator.process_slash_str(match)\n        return meta\n\n    def set_imdb_id(self, meta, response):\n        regex = '//a[preceding-sibling::span[text()=\"IMDb链接:\"]][following-si\\\nbling::br]/@href'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"imdb_id\"] = match.strip().split(\"?\")[0][27:]\n        return meta\n\n    def set_score(self, meta, response):\n        regex = '//strong[@property=\"v:average\"]/text()'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"douban_score\"] = match\n        return meta\n\n    def set_votes(self, meta, response):\n        regex = '//span[@property=\"v:votes\"]/text()'\n        match = response.xpath(regex).get()\n        if match:\n            meta[\"douban_votes\"] = match\n        return meta\n\n    def set_tags(self, meta, response):\n        regex = '//div[@class=\"tags-body\"]/a/text()'\n        matches = response.xpath(regex).getall()\n        meta[\"tags\"] = \"/\".join(matches)\n        return meta\n\n    def set_storyline(self, meta, response):\n        regex = '//span[@class=\"all hidden\"]/text()'\n        matches = response.xpath(regex).getall()\n        if matches:\n            
meta[\"storyline\"] = \"<br>\".join([item.strip() for item in matches])\n        else:\n            regex = '//span[@property=\"v:summary\"]/text()'\n            matches = response.xpath(regex).getall()\n            if matches:\n                meta[\"storyline\"] = \"<br>\".join([item.strip() for item in matches])\n        return meta\n\n    def set_comments(self, meta, response):\n        regex = '//div[@class=\"comment\"]/p/text()'\n        matches = response.xpath(regex).getall()\n        meta[\"comments\"] = \"/\".join((i.strip() for i in matches))\n        return meta\n\n    def parse(self, response):\n        meta = MovieMeta()\n        self.set_douban_id(meta, response)\n        self.set_type(meta, response)\n        self.set_cover(meta, response)\n        self.set_name(meta, response)\n        self.set_year(meta, response)\n        self.set_directors(meta, response)\n        self.set_writers(meta, response)\n        self.set_actors(meta, response)\n        self.set_genres(meta, response)\n        self.set_official_site(meta, response)\n        self.set_regions(meta, response)\n        self.set_languages(meta, response)\n        self.set_release_date(meta, response)\n        self.set_runtime(meta, response)\n        self.set_alias(meta, response)\n        self.set_imdb_id(meta, response)\n        self.set_score(meta, response)\n        self.set_votes(meta, response)\n        self.set_tags(meta, response)\n        self.set_storyline(meta, response)\n        self.set_slug(meta, response)\n        return meta\n"
  },
  {
    "path": "scrapy/douban/spiders/movie_subject.py",
    "content": "import random\r\nimport string\r\n\r\nfrom douban.items import Subject\r\nfrom scrapy.linkextractors import LinkExtractor\r\nfrom scrapy.spiders import CrawlSpider, Request, Rule\r\n\r\n\r\nclass MovieSubjectSpider(CrawlSpider):\r\n    name = \"movie_subject\"\r\n    allowed_domains = [\"m.douban.com\"]\r\n    start_urls = [\"https://m.douban.com/movie/subject/1292052/\"]\r\n    rules = (\r\n        Rule(\r\n            LinkExtractor(allow=(\"movie/subject/(\\\\d)+\\\\?from=rec$\")),\r\n            callback=\"parse_item\",\r\n            follow=True,\r\n            process_request=\"cookie\",\r\n        ),\r\n    )\r\n\r\n    def cookie(self, request, response):\r\n        bid = \"\".join(random.choice(string.ascii_letters + string.digits) for x in range(11))\r\n        request.cookies[\"bid\"] = bid\r\n        request = request.replace(url=request.url.replace(\"?\", \"/?\"))\r\n        return request\r\n\r\n    def start_requests(self):\r\n        for url in self.start_urls:\r\n            bid = \"\".join(random.choice(string.ascii_letters + string.digits) for x in range(11))\r\n            yield Request(url, cookies={\"bid\": bid})\r\n\r\n    def set_douban_id(self, subject, response):\r\n        subject[\"douban_id\"] = response.url[35:-10]\r\n        return subject\r\n\r\n    def parse_item(self, response):\r\n        subject = Subject()\r\n        self.set_douban_id(subject, response)\r\n        subject[\"type\"] = \"movie\"\r\n        return subject\r\n"
  },
  {
    "path": "scrapy/douban/util.py",
    "content": "import hashlib\n\n\ndef shorturl(url):\n    chars = \"abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ\"\n    _hex = 0x7FFFFFF & int(str(hashlib.md5(url.encode()).hexdigest()), 16)\n    code = \"\"\n    for i in range(9):\n        index = 0x0000003D & _hex\n        code += chars[index]\n        _hex = _hex >> 3\n    return code\n"
  },
  {
    "path": "scrapy/douban/validator.py",
    "content": "import re\n\nimport arrow\n\n\ndef match_year(s):\n    matches = re.findall(\"[\\\\d]{4}\", s)\n    if matches:\n        return matches[0]\n    else:\n        return \"0\"\n\n\ndef match_date(s):\n    matches = re.findall(\"[\\\\d-]{8,10}\", s)\n    if matches:\n        return matches[0]\n    else:\n        return False\n\n\ndef str_to_date(s):\n    try:\n        return str(arrow.get(s, \"YYYY-M-D\").format(\"YYYY-MM-DD\"))\n    except Exception:\n        return False\n\n\ndef is_match_chinese(s):\n    matches = re.findall(\"[\\u4e00-\\u9fa5]+\", s)\n    if matches:\n        return True\n    else:\n        return False\n\n\ndef process_slash_str(s):\n    alias = []\n    items = s.split(\"/\")\n    for item in items:\n        if is_match_chinese(item):\n            alias.append(item)\n    return \"/\".join(alias[0:30])\n\n\ndef process_url(s):\n    if len(s) < 255:\n        return s\n    return \"\"\n"
  },
  {
    "path": "scrapy/scrapy.cfg",
    "content": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrapyd.readthedocs.io/en/latest/deploy.html\n\n[settings]\ndefault = douban.settings\n\n[deploy]\nurl = http://localhost:6800/\nproject = douban\nusername = scrapyd\npassword = public\n"
  }
]