Repository: 40robber/ScrapyDouban
Branch: master
Commit: b6d2cced7fc1
Files: 24
Total size: 44.5 KB

Directory structure:
gitextract_lbt66xi9/
├── .gitignore
├── README.md
├── docker/
│   ├── docker-compose.yml
│   ├── mysql/
│   │   ├── Dockerfile
│   │   └── douban.sql
│   └── scrapyd/
│       ├── Dockerfile
│       └── scrapyd.conf
├── requirements.txt
└── scrapy/
    ├── douban/
    │   ├── __init__.py
    │   ├── database.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   ├── spiders/
    │   │   ├── __init__.py
    │   │   ├── book_comment.py
    │   │   ├── book_meta.py
    │   │   ├── book_subject.py
    │   │   ├── movie_comment.py
    │   │   ├── movie_meta.py
    │   │   └── movie_subject.py
    │   ├── util.py
    │   └── validator.py
    └── scrapy.cfg

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pyc
*.swp
storage/
__pycache__/
.idea/
.vscode/

================================================
FILE: README.md
================================================
## ScrapyDouban

[Demo video](https://www.youtube.com/watch?v=Fyrvrb0rqvE)

A Python3-based Scrapy crawler for Douban Movie / Douban Book that downloads covers, scrapes metadata, and stores comments in the database.

I maintain this project to share some of the practices I have picked up while using Scrapy. It covers roughly 80% of the Scrapy knowledge I use in practice, and I hope it helps people who are learning Scrapy. It is best read after the [official Scrapy documentation](https://scrapy.readthedocs.io/en/stable/index.html); note that the project currently targets Scrapy 2.5.0.

### Docker
-------
The project ships three containers: douban_scrapyd, douban_mysql, and douban_adminer.

The douban_scrapyd container is based on [python:3.9-slim-buster](https://pythonspeed.com/articles/base-image-python-docker-images/); the preinstalled Python3 libraries are scrapy, scrapyd, pymysql, pillow, and arrow. Port 6800 is mapped to 6800 on the host so you can reach the scrapyd web UI at IP:6800; log in with username scrapyd and password public.

The douban_mysql container is based on mysql:8; the root password is public, and docker/mysql/douban.sql is imported into the douban database when the container is first initialized.

The douban_adminer container is based on adminer:4; port 8080 is mapped to 8080 on the host so you can reach the database management UI at IP:8080; log in with server mysql, username root, and password public.

### Project SQL
------
The SQL file used by the project is stored at docker/mysql/douban.sql.

### Collection flow
-------
First collect Subject IDs --> then fetch each detail page by Subject ID and collect its metadata --> finally collect comments by Subject ID.

### Usage
-------
    $ git clone https://github.com/baabaaox/ScrapyDouban.git
    # Build and run the containers
    $ cd ./ScrapyDouban/docker
    $ sudo docker-compose up --build -d
    # Enter the douban_scrapyd container
    $ sudo docker exec -it douban_scrapyd bash
    # Enter the scrapy directory
    $ cd /srv/ScrapyDouban/scrapy
    $ scrapy list
    # Scrape movie data
    $ scrapy crawl movie_subject  # Collect movie Subject IDs
    $ scrapy crawl movie_meta  # Collect movie metadata
    $ scrapy crawl movie_comment  # Collect movie comments
    # Scrape book data
    $ scrapy crawl book_subject  # Collect book Subject IDs
    $ scrapy crawl book_meta  # Collect book metadata
    $ scrapy crawl book_comment  # Collect book comments

If you want to modify the code conveniently while testing, you can mount the project's scrapy directory into the douban_scrapyd container.

If you are used to working with scrapyd, you can deploy the project directly to the douban_scrapyd container with scrapyd-client (see the scheduling sketch after this file).

### Proxy IP
--------
Because of Douban's anti-crawling measures, scraping now only works through proxy IPs. The douban.middlewares.ProxyMiddleware middleware is not enabled in the default settings.py; if you really need Douban's data for research, consider renting a paid proxy pool.

### Image download
--------
douban.pipelines.CoverPipeline handles cover downloads by filtering on spider.name; downloaded image files are saved under the /srv/ScrapyDouban/storage directory of the douban_scrapyd container.
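Beyond running scrapy crawl inside the container, a deployed project can be driven from the host through scrapyd's HTTP JSON API. The sketch below is illustrative only: it assumes the project was deployed under the name douban via scrapyd-client and that the scrapyd/public credentials from docker/scrapyd/scrapyd.conf are still in effect.

    # schedule_spider.py -- minimal sketch of scrapyd's schedule.json endpoint.
    # Assumes a deployed project named "douban" and the default credentials
    # from docker/scrapyd/scrapyd.conf (scrapyd / public).
    import base64
    import json
    import urllib.parse
    import urllib.request

    def schedule(spider, host="http://localhost:6800", project="douban"):
        data = urllib.parse.urlencode({"project": project, "spider": spider}).encode()
        request = urllib.request.Request(host + "/schedule.json", data=data)
        token = base64.b64encode(b"scrapyd:public").decode()
        request.add_header("Authorization", "Basic " + token)
        with urllib.request.urlopen(request) as response:
            return json.load(response)

    if __name__ == "__main__":
        # A successful call returns something like {"status": "ok", "jobid": "..."}.
        print(schedule("movie_subject"))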
================================================
FILE: docker/docker-compose.yml
================================================
services:
  mysql:
    build: ./mysql
    container_name: douban_mysql
    environment:
      - MYSQL_ROOT_PASSWORD=public
      - MYSQL_DATABASE=douban
    command: mysqld --default-authentication-plugin=mysql_native_password
  adminer:
    image: adminer:4
    container_name: douban_adminer
    ports:
      - 8080:8080
    links:
      - mysql
  scrapyd:
    build: ./scrapyd
    container_name: douban_scrapyd
    ports:
      - 6800:6800
    environment:
      - TZ=Asia/Chongqing
      - MYSQL_HOST=mysql
      - MYSQL_USER=root
      - MYSQL_PASS=public
      - MYSQL_DB=douban
    links:
      - mysql

================================================
FILE: docker/mysql/Dockerfile
================================================
FROM mysql:8

ADD douban.sql /docker-entrypoint-initdb.d

================================================
FILE: docker/mysql/douban.sql
================================================
-- Adminer 4.6.3 MySQL dump

SET NAMES utf8;
SET time_zone = '+00:00';
SET foreign_key_checks = 0;
SET sql_mode = 'NO_AUTO_VALUE_ON_ZERO';

SET NAMES utf8mb4;

CREATE DATABASE IF NOT EXISTS `douban` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci */;
USE `douban`;

DROP TABLE IF EXISTS `books`;
CREATE TABLE `books` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `slug` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `sub_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `alt_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `cover` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
  `authors` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `author_intro` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
  `translators` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `series` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `publisher` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `publish_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `pages` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `price` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `binding` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `isbn` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `tags` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',
  `douban_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',
  `douban_votes` int(10) unsigned NOT NULL DEFAULT '0',
  `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  PRIMARY KEY (`id`),
  KEY `books_slug_index` (`slug`),
  KEY `books_name_index` (`name`),
  KEY `books_douban_id_index` (`douban_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

DROP TABLE IF EXISTS `comments`;
CREATE TABLE `comments` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',
  `douban_comment_id` int(10) unsigned NOT NULL DEFAULT '0',
  `douban_user_nickname` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `douban_user_avatar` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `douban_user_url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `votes` int(10) unsigned NOT NULL DEFAULT '0',
  `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  PRIMARY KEY (`id`),
  KEY `comments_douban_id_index` (`douban_id`),
  KEY `comments_douban_comment_id_index` (`douban_comment_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

DROP TABLE IF EXISTS `movies`;
CREATE TABLE `movies` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `slug` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `alias` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `cover` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `year` smallint(5) unsigned NOT NULL DEFAULT '0',
  `regions` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `genres` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `languages` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `release_date` date DEFAULT NULL,
  `official_site` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `directors` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `writers` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `actors` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
  `storyline` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
  `mins` smallint(5) unsigned NOT NULL DEFAULT '0',
  `recommend_tip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `tags` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `avg_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',
  `imdb_id` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `imdb_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',
  `imdb_votes` int(10) unsigned NOT NULL DEFAULT '0',
  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',
  `douban_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0',
  `douban_votes` int(10) unsigned NOT NULL DEFAULT '0',
  `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
  PRIMARY KEY (`id`),
  KEY `movies_slug_index` (`slug`),
  KEY `movies_name_index` (`name`),
  KEY `movies_imdb_id_index` (`imdb_id`),
  KEY `movies_douban_id_index` (`douban_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

DROP TABLE IF EXISTS `subjects`;
CREATE TABLE `subjects` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `douban_id` int(10) unsigned NOT NULL DEFAULT '0',
  `type` enum('movie','book') CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT 'movie',
  PRIMARY KEY (`id`),
  UNIQUE KEY `subjects_douban_id_unique` (`douban_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- 2018-11-27 16:52:54
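The UNIQUE KEY on subjects.douban_id above is what makes subject collection idempotent. The same constraint would also support doing the deduplication in a single statement instead of the SELECT-then-INSERT pair used in douban/pipelines.py. A minimal sketch, assuming the connection defaults from scrapy/douban/database.py:

    # upsert_subject.py -- sketch: let MySQL enforce deduplication through the
    # subjects_douban_id_unique key instead of SELECT-then-INSERT.
    # Connection parameters mirror the defaults in scrapy/douban/database.py.
    import pymysql

    connection = pymysql.connect(
        host="localhost", user="root", password="public",
        db="douban", charset="utf8mb4",
    )

    def save_subject(douban_id, subject_type):
        with connection.cursor() as cursor:
            # INSERT IGNORE silently skips rows that would violate the unique key.
            cursor.execute(
                "INSERT IGNORE INTO subjects (douban_id, type) VALUES (%s, %s)",
                (douban_id, subject_type),
            )
        connection.commit()

    save_subject(26628811, "book")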
================================================
FILE: docker/scrapyd/Dockerfile
================================================
FROM python:3.9-slim-buster

ARG SCRAPY_VERSION=2.5.0

RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && pip install -i https://mirrors.aliyun.com/pypi/simple/ --upgrade pip \
    && pip install -i https://mirrors.aliyun.com/pypi/simple/ scrapy==$SCRAPY_VERSION pymysql==1.0.2 pillow==8.2.0 arrow==1.0.3 \
    && pip install -U git+https://github.com/scrapy/scrapyd.git \
    && git clone https://github.com/baabaaox/ScrapyDouban.git /srv/ScrapyDouban

COPY scrapyd.conf /etc/scrapyd/

EXPOSE 6800

CMD ["scrapyd"]

================================================
FILE: docker/scrapyd/scrapyd.conf
================================================
[scrapyd]
bind_address = 0.0.0.0
username = scrapyd
password = public

================================================
FILE: requirements.txt
================================================
arrow==1.2.1
Pillow==9.0.0
PyMySQL==1.0.2
Scrapy==2.5.1

================================================
FILE: scrapy/douban/__init__.py
================================================

================================================
FILE: scrapy/douban/database.py
================================================
import os

import pymysql

MYSQL_HOST = os.environ.get("MYSQL_HOST", "localhost")
MYSQL_USER = os.environ.get("MYSQL_USER", "root")
MYSQL_PASS = os.environ.get("MYSQL_PASS", "public")
MYSQL_DB = os.environ.get("MYSQL_DB", "douban")

connection = pymysql.connect(
    host=MYSQL_HOST,
    user=MYSQL_USER,
    password=MYSQL_PASS,
    db=MYSQL_DB,
    charset="utf8mb4",
    cursorclass=pymysql.cursors.DictCursor,
)

================================================
FILE: scrapy/douban/items.py
================================================
from scrapy import Field, Item


class Subject(Item):
    douban_id = Field()
    type = Field()


class MovieMeta(Item):
    douban_id = Field()
    type = Field()
    cover = Field()
    name = Field()
    slug = Field()
    year = Field()
    directors = Field()
    writers = Field()
    actors = Field()
    genres = Field()
    official_site = Field()
    regions = Field()
    languages = Field()
    release_date = Field()
    mins = Field()
    alias = Field()
    imdb_id = Field()
    douban_score = Field()
    douban_votes = Field()
    tags = Field()
    storyline = Field()


class BookMeta(Item):
    douban_id = Field()
    slug = Field()
    name = Field()
    sub_name = Field()
    alt_name = Field()
    cover = Field()
    summary = Field()
    authors = Field()
    author_intro = Field()
    translators = Field()
    series = Field()
    publisher = Field()
    publish_date = Field()
    pages = Field()
    price = Field()
    binding = Field()
    isbn = Field()
    douban_score = Field()
    douban_votes = Field()
    tags = Field()


class Comment(Item):
    douban_id = Field()
    douban_comment_id = Field()
    douban_user_nickname = Field()
    douban_user_avatar = Field()
    douban_user_url = Field()
    content = Field()
    votes = Field()
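DoubanPipeline (further down, in pipelines.py) builds its INSERT statements directly from whatever fields a spider filled in, so the Field names above must match the column names in docker/mysql/douban.sql. A quick illustration of that keys-to-columns mapping, using the Subject item; the values are made up:

    # item_to_sql.py -- illustrates how item keys become column names in the
    # INSERT statements that DoubanPipeline generates. Values are made up.
    from douban.items import Subject

    item = Subject(douban_id="26628811", type="book")
    fields = ",".join(item.keys())               # -> "douban_id,type"
    placeholders = ",".join(["%s"] * len(item))  # -> "%s,%s"
    sql = "INSERT INTO subjects (%s) VALUES (%s)" % (fields, placeholders)
    print(sql, tuple(item.values()))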
================================================
FILE: scrapy/douban/middlewares.py
================================================
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter, is_item
from scrapy import signals


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # curl https://m.douban.com/book/subject/26628811/ -x http://127.0.0.1:8081
        request.meta["proxy"] = "http://127.0.0.1:8081"


class DoubanSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class DoubanDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
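ProxyMiddleware above pins every request to one local endpoint (handy with a tunnel like the curl example in its comment). With the rented proxy pool the README recommends, rotating per request is more typical. A hedged sketch; RotatingProxyMiddleware and the PROXY_LIST setting are inventions for illustration, not part of this project:

    # Hypothetical rotating variant of ProxyMiddleware. PROXY_LIST is an
    # invented setting name; fill it with endpoints from your proxy vendor.
    import random

    class RotatingProxyMiddleware:
        def __init__(self, proxies):
            self.proxies = proxies

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler.settings.getlist("PROXY_LIST"))

        def process_request(self, request, spider):
            # Pick a fresh proxy for every outgoing request.
            if self.proxies:
                request.meta["proxy"] = random.choice(self.proxies)

Enabling it would follow the commented-out DOWNLOADER_MIDDLEWARES block in settings.py, with this class substituted in.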
================================================
FILE: scrapy/douban/pipelines.py
================================================
import hashlib
import logging

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.python import to_bytes
from twisted.internet.defer import DeferredList

import douban.database as db
from douban.items import BookMeta, Comment, MovieMeta, Subject

cursor = db.connection.cursor()


class DoubanPipeline(object):
    def get_subject(self, item):
        sql = "SELECT id FROM subjects WHERE douban_id=%s"
        cursor.execute(sql, (item["douban_id"],))
        return cursor.fetchone()

    def save_subject(self, item):
        keys = item.keys()
        values = tuple(item.values())
        fields = ",".join(keys)
        temp = ",".join(["%s"] * len(keys))
        sql = "INSERT INTO subjects (%s) VALUES (%s)" % (fields, temp)
        cursor.execute(sql, values)
        return db.connection.commit()

    def get_movie_meta(self, item):
        sql = "SELECT id FROM movies WHERE douban_id=%s"
        cursor.execute(sql, (item["douban_id"],))
        return cursor.fetchone()

    def save_movie_meta(self, item):
        keys = item.keys()
        values = tuple(item.values())
        fields = ",".join(keys)
        temp = ",".join(["%s"] * len(keys))
        sql = "INSERT INTO movies (%s) VALUES (%s)" % (fields, temp)
        cursor.execute(sql, tuple(i.strip() for i in values))
        return db.connection.commit()

    def update_movie_meta(self, item):
        douban_id = item.pop("douban_id")
        keys = item.keys()
        # a list, not a tuple: tuples have no append(), which crashed updates
        values = list(item.values())
        values.append(douban_id)
        fields = ["%s=" % i + "%s" for i in keys]
        sql = "UPDATE movies SET %s WHERE douban_id=%s" % (",".join(fields), "%s")
        cursor.execute(sql, tuple(i.strip() for i in values))
        return db.connection.commit()

    def get_book_meta(self, item):
        sql = "SELECT id FROM books WHERE douban_id=%s"
        cursor.execute(sql, (item["douban_id"],))
        return cursor.fetchone()

    def save_book_meta(self, item):
        keys = item.keys()
        values = tuple(item.values())
        fields = ",".join(keys)
        temp = ",".join(["%s"] * len(keys))
        sql = "INSERT INTO books (%s) VALUES (%s)" % (fields, temp)
        cursor.execute(sql, tuple(i.strip() for i in values))
        return db.connection.commit()

    def update_book_meta(self, item):
        douban_id = item.pop("douban_id")
        keys = item.keys()
        # same fix as update_movie_meta: list so douban_id can be appended
        values = list(item.values())
        values.append(douban_id)
        fields = ["%s=" % i + "%s" for i in keys]
        sql = "UPDATE books SET %s WHERE douban_id=%s" % (",".join(fields), "%s")
        cursor.execute(sql, values)
        return db.connection.commit()

    def get_comment(self, item):
        sql = "SELECT * FROM comments WHERE douban_comment_id=%s"
        cursor.execute(sql, (item["douban_comment_id"],))
        return cursor.fetchone()

    def save_comment(self, item):
        keys = item.keys()
        values = tuple(item.values())
        fields = ",".join(keys)
        temp = ",".join(["%s"] * len(keys))
        sql = "INSERT INTO comments (%s) VALUES (%s)" % (fields, temp)
        cursor.execute(sql, values)
        return db.connection.commit()

    def process_item(self, item, spider):
        try:
            if isinstance(item, Subject):
                """ subject """
                exist = self.get_subject(item)
                if not exist:
                    self.save_subject(item)
            elif isinstance(item, MovieMeta):
                """ meta """
                exist = self.get_movie_meta(item)
                if not exist:
                    self.save_movie_meta(item)
                else:
                    self.update_movie_meta(item)
            elif isinstance(item, BookMeta):
                """ meta """
                exist = self.get_book_meta(item)
                if not exist:
                    self.save_book_meta(item)
                else:
                    self.update_book_meta(item)
            elif isinstance(item, Comment):
                """ comment """
                exist = self.get_comment(item)
                if not exist:
                    self.save_comment(item)
        except Exception as e:
            logging.warning(item)
            logging.error(e)
        return item


class CoverPipeline(ImagesPipeline):
    def process_item(self, item, spider):
        if "meta" not in spider.name:
            return item
        info = self.spiderinfo
        requests = arg_to_iter(self.get_media_requests(item, info))
        dlist = [self._process_request(r, info, item) for r in requests]
        dfd = DeferredList(dlist, consumeErrors=1)
        return dfd.addCallback(self.item_completed, item, info)

    def file_path(self, request, response=None, info=None, *, item=None):
        guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return "%s%s/%s%s/%s.jpg" % (guid[9], guid[19], guid[29], guid[39], guid)

    def get_media_requests(self, item, info):
        if item["cover"]:
            return Request(item["cover"])

    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        if image_paths:
            item["cover"] = image_paths[0]
        else:
            item["cover"] = ""
        return item
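CoverPipeline.file_path shards downloaded covers into two directory levels derived from the SHA-1 of the image URL, which keeps any single directory under IMAGES_STORE from growing unbounded. The resulting layout can be previewed without running Scrapy (the URL below is just an example):

    # cover_path_demo.py -- previews the storage layout that
    # CoverPipeline.file_path produces (two shard levels, then the full hash).
    import hashlib

    url = "https://img1.doubanio.com/view/subject/l/public/s28378777.jpg"
    guid = hashlib.sha1(url.encode()).hexdigest()
    path = "%s%s/%s%s/%s.jpg" % (guid[9], guid[19], guid[29], guid[39], guid)
    print(path)  # e.g. "xy/zw/<40-hex-char-sha1>.jpg"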
================================================
FILE: scrapy/douban/settings.py
================================================
# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "douban"

SPIDER_MODULES = ["douban.spiders"]
NEWSPIDER_MODULE = "douban.spiders"

LOG_LEVEL = "DEBUG"

IMAGES_STORE = "../storage/"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
)

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 1

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'douban.middlewares.DoubanSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'douban.middlewares.DoubanDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "douban.pipelines.CoverPipeline": 1,
    "douban.pipelines.DoubanPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

================================================
FILE: scrapy/douban/spiders/__init__.py
================================================
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

================================================
FILE: scrapy/douban/spiders/book_comment.py
================================================
import json

import douban.database as db
from douban.items import Comment
from scrapy import Request, Spider

cursor = db.connection.cursor()


class BookCommentSpider(Spider):
    name = "book_comment"
    allowed_domains = ["book.douban.com"]

    def start_requests(self):
        sql = "SELECT douban_id FROM books WHERE douban_id NOT IN \
(SELECT douban_id FROM comments GROUP BY douban_id) ORDER BY douban_id DESC"
        cursor.execute(sql)
        books = cursor.fetchall()
        baseurl = "https://m.douban.com/rexxar/api/v2/book/%s/interests?count=5&order_by=hot"
        referer = "https://m.douban.com/book/subject/%s/?from=showing"
        for book in books:
            yield Request(
                baseurl % book["douban_id"],
                headers={"Referer": referer % book["douban_id"]},
            )

    def parse(self, response):
        douban_id = response.url.split("/")[-2]
        items = json.loads(response.body)["interests"]
        for item in items:
            comment = Comment()
            comment["douban_id"] = douban_id
            comment["douban_comment_id"] = item["id"]
            comment["douban_user_nickname"] = item["user"]["name"]
            comment["douban_user_avatar"] = item["user"]["avatar"]
            comment["douban_user_url"] = item["user"]["url"]
            comment["content"] = item["comment"]
            comment["votes"] = item["vote_count"]
            yield comment
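BookCommentSpider only ever requests the first page of the interests endpoint (count=5). The mobile API appears to accept start/count paging parameters, so deeper collection would likely look like the sketch below; treat the parameter names and limits as assumptions about an undocumented endpoint.

    # comment_paging.py -- hypothetical paging over the mobile interests API.
    # The start/count parameters are assumptions about an undocumented endpoint.
    def comment_page_urls(douban_id, pages=3, per_page=20):
        base = ("https://m.douban.com/rexxar/api/v2/book/%s/interests"
                "?start=%d&count=%d&order_by=hot")
        for page in range(pages):
            yield base % (douban_id, page * per_page, per_page)

    for url in comment_page_urls(26628811):
        print(url)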
"lpic") else: meta["cover"] = "" return meta def set_slug(self, meta, response): meta["slug"] = util.shorturl(meta["douban_id"]) return meta def set_name(self, meta, response): regex = "//title/text()" match = response.xpath(regex).get() if match: meta["name"] = match[:-5].strip() return meta def set_alt_name(self, meta, response): regex = '//text()[preceding-sibling::span[text()="原作名:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["alt_name"] = match return meta def set_sub_name(self, meta, response): regex = '//text()[preceding-sibling::span[text()="副标题:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["sub_name"] = match return meta def set_author(self, meta, response): regex = '//a[parent::span[child::span[text()=" 作者"]]]/text()' matches = response.xpath(regex).getall() if matches: meta["authors"] = "/".join((i.strip() for i in matches)) return meta def set_summary(self, meta, response): regex = '//div[@id="link-report"]//div[@class="intro"]' matches = response.xpath(regex) if matches: items = matches[-1].xpath("p/text()").getall() meta["summary"] = "".join(("
%s
" % i for i in items)) return meta def set_author_intro(self, meta, response): regex = '//div[@class="indent "]//div[@class="intro"]' matches = response.xpath(regex) if matches: items = matches[-1].xpath("p/text()").getall() meta["author_intro"] = "".join(("%s
" % i for i in items)) return meta def set_translator(self, meta, response): regex = '//a[parent::span[child::span[text()=" 译者"]]]/text()' matches = response.xpath(regex).getall() if matches: meta["translators"] = "/".join((i.strip() for i in matches)) return meta def set_series(self, meta, response): regex = '//a[preceding-sibling::span[text()="丛书:"]][following\ -sibling::br]/text()' matches = response.xpath(regex).getall() if matches: meta["series"] = "/".join((i.strip() for i in matches)) return meta def set_publisher(self, meta, response): regex = '//text()[preceding-sibling::span[text()="出版社:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["publisher"] = match return meta def set_publish_date(self, meta, response): regex = '//text()[preceding-sibling::span[text()="出版年:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["publish_date"] = match return meta def set_pages(self, meta, response): regex = '//text()[preceding-sibling::span[text()="页数:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["pages"] = match return meta def set_price(self, meta, response): regex = '//text()[preceding-sibling::span[text()="定价:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["price"] = match return meta def set_binding(self, meta, response): regex = '//text()[preceding-sibling::span[text()="装帧:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["binding"] = match return meta def set_isbn(self, meta, response): regex = '//text()[preceding-sibling::span[text()="ISBN:"]][following\ -sibling::br]' match = response.xpath(regex).get() if match: meta["isbn"] = match return meta def set_score(self, meta, response): regex = '//strong[@property="v:average"]/text()' match = response.xpath(regex).get() if match: score = match.strip() if score: meta["douban_score"] = score return meta def set_votes(self, meta, response): regex = '//span[@property="v:votes"]/text()' match = response.xpath(regex).get() if match: votes = match.strip() if votes: meta["douban_votes"] = votes return meta def set_tags(self, meta, response): regex = '//a[@class=" tag"]/text()' matches = response.xpath(regex).getall() if matches: meta["tags"] = "/".join((i.strip() for i in matches)) return meta def parse(self, response): meta = BookMeta() self.set_douban_id(meta, response) self.set_cover(meta, response) self.set_name(meta, response) self.set_sub_name(meta, response) self.set_alt_name(meta, response) self.set_summary(meta, response) self.set_author(meta, response) self.set_author_intro(meta, response) self.set_translator(meta, response) self.set_series(meta, response) self.set_publisher(meta, response) self.set_publish_date(meta, response) self.set_pages(meta, response) self.set_price(meta, response) self.set_binding(meta, response) self.set_isbn(meta, response) self.set_score(meta, response) self.set_votes(meta, response) self.set_tags(meta, response) self.set_slug(meta, response) return meta ================================================ FILE: scrapy/douban/spiders/book_subject.py ================================================ import random import string from douban.items import Subject from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Request, Rule class BookSubjectSpider(CrawlSpider): name = "book_subject" user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" allowed_domains = ["book.douban.com"] start_urls = 
["https://book.douban.com/subject/26628811/"] rules = ( Rule( LinkExtractor(allow=("https://book.douban.com/subject/(\\d)+/$")), callback="parse_item", follow=True, process_request="cookie", ), ) def cookie(self, request, response): bid = "".join(random.choice(string.ascii_letters + string.digits) for x in range(11)) request.cookies["bid"] = bid request = request.replace(url=request.url.replace("?", "/?")) return request def start_requests(self): for url in self.start_urls: bid = "".join(random.choice(string.ascii_letters + string.digits) for x in range(11)) yield Request(url, cookies={"bid": bid}) def set_douban_id(self, subject, response): subject["douban_id"] = response.url[32:-1] return subject def parse_item(self, response): subject = Subject() self.set_douban_id(subject, response) subject["type"] = "book" return subject ================================================ FILE: scrapy/douban/spiders/movie_comment.py ================================================ import json import douban.database as db from douban.items import Comment from scrapy import Request, Spider cursor = db.connection.cursor() class MovieCommentSpider(Spider): name = "movie_comment" allowed_domains = ["movie.douban.com"] def start_requests(self): sql = "SELECT douban_id FROM movies WHERE douban_id NOT IN \ (SELECT douban_id FROM comments GROUP BY douban_id) ORDER BY douban_id DESC" cursor.execute(sql) movies = cursor.fetchall() baseurl = "https://m.douban.com/rexxar/api/v2/movie/%s/interests?count=5&order_by=hot" referer = "https://m.douban.com/movie/subject/%s/?from=showing" for movie in movies: yield Request( baseurl % movie["douban_id"], headers={"Referer": referer % movie["douban_id"]}, ) def parse(self, response): douban_id = response.url.split("/")[-2] items = json.loads(response.body)["interests"] for item in items: comment = Comment() comment["douban_id"] = douban_id comment["douban_comment_id"] = item["id"] comment["douban_user_nickname"] = item["user"]["name"] comment["douban_user_avatar"] = item["user"]["avatar"] comment["douban_user_url"] = item["user"]["url"] comment["content"] = item["comment"] comment["votes"] = item["vote_count"] yield comment ================================================ FILE: scrapy/douban/spiders/movie_meta.py ================================================ import douban.database as db import douban.util as util import douban.validator as validator from douban.items import MovieMeta from scrapy import Spider cursor = db.connection.cursor() class MovieMetaSpider(Spider): name = "movie_meta" allowed_domains = ["movie.douban.com"] user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" sql = 'SELECT * FROM subjects WHERE type="movie" AND douban_id NOT IN \ (SELECT douban_id FROM movies) ORDER BY douban_id DESC' cursor.execute(sql) movies = cursor.fetchall() start_urls = ("https://movie.douban.com/subject/%s/" % i["douban_id"] for i in movies) def set_douban_id(self, meta, response): meta["douban_id"] = response.url[33:-1] return meta def set_type(self, meta, response): regex = '//text()[preceding-sibling::span[text()="集数:"]][fo\ llowing-sibling::br]' match = response.xpath(regex).get() if match: meta["type"] = "tv" else: meta["type"] = "movie" return meta def set_cover(self, meta, response): regex = '//img[@rel="v:image"]/@src' match = response.xpath(regex).get() if match: meta["cover"] = match.replace("s_ratio_poster", "l_ratio_poster") else: meta["cover"] = "" return meta def set_name(self, meta, response): regex = "//title/text()" match = 
================================================
FILE: scrapy/douban/spiders/movie_meta.py
================================================
import douban.database as db
import douban.util as util
import douban.validator as validator
from douban.items import MovieMeta
from scrapy import Spider

cursor = db.connection.cursor()


class MovieMetaSpider(Spider):
    name = "movie_meta"
    allowed_domains = ["movie.douban.com"]
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    sql = 'SELECT * FROM subjects WHERE type="movie" AND douban_id NOT IN \
(SELECT douban_id FROM movies) ORDER BY douban_id DESC'
    cursor.execute(sql)
    movies = cursor.fetchall()
    start_urls = ("https://movie.douban.com/subject/%s/" % i["douban_id"] for i in movies)

    def set_douban_id(self, meta, response):
        meta["douban_id"] = response.url[33:-1]
        return meta

    def set_type(self, meta, response):
        regex = '//text()[preceding-sibling::span[text()="集数:"]][fo\
llowing-sibling::br]'
        match = response.xpath(regex).get()
        if match:
            meta["type"] = "tv"
        else:
            meta["type"] = "movie"
        return meta

    def set_cover(self, meta, response):
        regex = '//img[@rel="v:image"]/@src'
        match = response.xpath(regex).get()
        if match:
            meta["cover"] = match.replace("s_ratio_poster", "l_ratio_poster")
        else:
            meta["cover"] = ""
        return meta

    def set_name(self, meta, response):
        regex = "//title/text()"
        match = response.xpath(regex).get()
        if match:
            meta["name"] = match[:-5].strip()
        return meta

    def set_slug(self, meta, response):
        meta["slug"] = util.shorturl(meta["douban_id"])
        return meta

    def set_year(self, meta, response):
        regex = '//span[@class="year"]/text()'
        match = response.xpath(regex).get()
        if match:
            meta["year"] = validator.match_year(match)
        return meta

    def set_directors(self, meta, response):
        regex = '//a[@rel="v:directedBy"]/text()'
        matches = response.xpath(regex).getall()
        meta["directors"] = validator.process_slash_str("/".join(matches))
        return meta

    def set_writers(self, meta, response):
        regex = '//span[preceding-sibling::span[text()="编剧"]]/a/text()'
        matches = response.xpath(regex).getall()
        meta["writers"] = validator.process_slash_str("/".join(matches))
        return meta

    def set_actors(self, meta, response):
        regex = '//a[@rel="v:starring"]/text()'
        matches = response.xpath(regex).getall()
        meta["actors"] = validator.process_slash_str("/".join(matches))
        return meta

    def set_genres(self, meta, response):
        regex = '//span[@property="v:genre"]/text()'
        matches = response.xpath(regex).getall()
        meta["genres"] = "/".join(matches)
        return meta

    def set_official_site(self, meta, response):
        regex = '//a[preceding-sibling::span[text()="官方网站:"]][following-si\
bling::br]/@href'
        match = response.xpath(regex).get()
        if match:
            meta["official_site"] = validator.process_url(match)
        return meta

    def set_regions(self, meta, response):
        regex = '//text()[preceding-sibling::span[text()="制片国家/地区:"]][fo\
llowing-sibling::br]'
        match = response.xpath(regex).get()
        if match:
            meta["regions"] = match
        return meta

    def set_languages(self, meta, response):
        regex = '//text()[preceding-sibling::span[text()="语言:"]][following-s\
ibling::br]'
        match = response.xpath(regex).get()
        if match:
            meta["languages"] = match
        return meta

    def set_release_date(self, meta, response):
        regex = '//span[@property="v:initialReleaseDate"]/@content'
        match = response.xpath(regex).get()
        if match:
            release_date = validator.str_to_date(validator.match_date(match))
            if release_date:
                meta["release_date"] = release_date
        return meta

    def set_runtime(self, meta, response):
        regex = '//span[@property="v:runtime"]/@content'
        match = response.xpath(regex).get()
        if match:
            meta["mins"] = match
        return meta

    def set_alias(self, meta, response):
        regex = '//text()[preceding-sibling::span[text()="又名:"]][following-s\
ibling::br]'
        match = response.xpath(regex).get()
        if match:
            meta["alias"] = validator.process_slash_str(match)
        return meta

    def set_imdb_id(self, meta, response):
        regex = '//a[preceding-sibling::span[text()="IMDb链接:"]][following-si\
bling::br]/@href'
        match = response.xpath(regex).get()
        if match:
            meta["imdb_id"] = match.strip().split("?")[0][27:]
        return meta

    def set_score(self, meta, response):
        regex = '//strong[@property="v:average"]/text()'
        match = response.xpath(regex).get()
        if match:
            meta["douban_score"] = match
        return meta

    def set_votes(self, meta, response):
        regex = '//span[@property="v:votes"]/text()'
        match = response.xpath(regex).get()
        if match:
            meta["douban_votes"] = match
        return meta

    def set_tags(self, meta, response):
        regex = '//div[@class="tags-body"]/a/text()'
        matches = response.xpath(regex).getall()
        meta["tags"] = "/".join(matches)
        return meta

    def set_storyline(self, meta, response):
        regex = '//span[@class="all hidden"]/text()'
        matches = response.xpath(regex).getall()
        if matches:
            meta["storyline"] = "