Repository: LinkedInLearning/web-scraping-with-python-2848331 Branch: master Commit: 841cdd162f40 Files: 202 Total size: 32.9 MB Directory structure: gitextract_jwpdz1w4/ ├── .github/ │ ├── CODEOWNERS │ ├── ISSUE_TEMPLATE.md │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ └── main.yml ├── .gitignore ├── 01_03/ │ └── ietf_scraper/ │ ├── ietf_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── ietf.py │ └── scrapy.cfg ├── 01_04_b/ │ └── ietf_scraper/ │ ├── ietf_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── ietf.py │ └── scrapy.cfg ├── 01_04_e/ │ └── ietf_scraper/ │ ├── ietf_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── ietf.py │ └── scrapy.cfg ├── 02_01/ │ └── article_scraper/ │ ├── article_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_02_b/ │ └── article_crawler/ │ ├── article_crawler/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_02_e/ │ └── article_crawler/ │ ├── article_crawler/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── articles.csv │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_03_b/ │ └── article_crawler/ │ ├── article_crawler/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── articles.csv │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_03_e/ │ └── article_crawler/ │ ├── article_crawler/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── articles.csv │ │ ├── articles.json │ │ ├── articles.xml │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_04_b/ │ └── article_crawler/ │ ├── article_crawler/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── articles.csv │ │ ├── articles.json │ │ ├── articles.xml │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_04_e/ │ └── article_crawler/ │ ├── article_crawler/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── articles.csv │ │ ├── articles.json │ │ ├── articles.xml │ │ └── wikipedia.py │ └── scrapy.cfg ├── 02_05/ │ └── news_scraper/ │ ├── news_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── associated_press.py │ │ ├── cnn.py │ │ ├── news_articles.json │ │ └── yahoo.py │ └── scrapy.cfg ├── 03_01_b/ │ └── form/ │ ├── form/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── get_form.py │ └── scrapy.cfg ├── 03_01_e/ │ └── form/ │ ├── form/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── get_form.py │ │ └── post_form.py │ └── scrapy.cfg ├── 03_03_b/ │ └── news_scraper/ │ ├── news_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── associated_press.py │ │ ├── cnn.py │ │ ├── news_articles.json │ │ └── yahoo.py │ └── scrapy.cfg ├── 03_03_e/ │ └── news_scraper/ │ ├── news_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── associated_press.py │ │ ├── cnn.py │ │ ├── news_articles.json │ │ └── yahoo.py │ └── scrapy.cfg ├── 03_04/ │ └── news_scraper/ │ ├── news_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── cnn.py │ │ └── counts.csv │ └── scrapy.cfg ├── 03_05/ │ └── news_scraper/ │ ├── news_scraper/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ ├── cnn.py │ │ └── counts.csv │ └── scrapy.cfg ├── 04_01_b/ │ └── profiles/ │ ├── profiles/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── pythonscraping.py │ └── scrapy.cfg ├── 04_01_e/ │ └── profiles/ │ ├── profiles/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── pythonscraping.py │ └── scrapy.cfg ├── 04_02_b/ │ ├── chromedriver │ └── locations/ │ ├── locations/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── dunkin.py │ └── scrapy.cfg ├── 04_02_e/ │ ├── chromedriver │ └── locations/ │ ├── locations/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── dunkin.py │ └── scrapy.cfg ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/CODEOWNERS ================================================ # Codeowners for these exercise files: # * (asterisk) deotes "all files and folders" # Example: * @producer @instructor ================================================ FILE: .github/ISSUE_TEMPLATE.md ================================================ ## Issue Overview ## Describe your environment ## Steps to Reproduce 1. 2. 3. 4. ## Expected Behavior ## Current Behavior ## Possible Solution ## Screenshots / Video ## Related Issues ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ================================================ FILE: .github/workflows/main.yml ================================================ name: Copy To Branches on: workflow_dispatch: jobs: copy-to-branches: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: fetch-depth: 0 - name: Copy To Branches Action uses: planetoftheweb/copy-to-branches@v1 ================================================ FILE: .gitignore ================================================ .DS_Store node_modules .tmp npm-debug.log ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/__init__.py ================================================ ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class IetfScraperItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class IetfScraperSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class IetfScraperDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class IetfScraperPipeline: def process_item(self, item, spider): return item ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for ietf_scraper project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'ietf_scraper' SPIDER_MODULES = ['ietf_scraper.spiders'] NEWSPIDER_MODULE = 'ietf_scraper.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'ietf_scraper (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'ietf_scraper.middlewares.IetfScraperSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'ietf_scraper.middlewares.IetfScraperDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'ietf_scraper.pipelines.IetfScraperPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 01_03/ietf_scraper/ietf_scraper/spiders/ietf.py ================================================ # -*- coding: utf-8 -*- import scrapy class IetfSpider(scrapy.Spider): name = 'ietf' allowed_domains = ['pythonscraping.com'] start_urls = ['http://pythonscraping.com/linkedin/ietf.html'] def parse(self, response): return {'title': response.xpath('//span[@class="title"]/text()').get()} ================================================ FILE: 01_03/ietf_scraper/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = ietf_scraper.settings [deploy] #url = http://localhost:6800/ project = ietf_scraper ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/__init__.py ================================================ ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class IetfScraperItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class IetfScraperSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class IetfScraperDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class IetfScraperPipeline: def process_item(self, item, spider): return item ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for ietf_scraper project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'ietf_scraper' SPIDER_MODULES = ['ietf_scraper.spiders'] NEWSPIDER_MODULE = 'ietf_scraper.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'ietf_scraper (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'ietf_scraper.middlewares.IetfScraperSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'ietf_scraper.middlewares.IetfScraperDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'ietf_scraper.pipelines.IetfScraperPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 01_04_b/ietf_scraper/ietf_scraper/spiders/ietf.py ================================================ # -*- coding: utf-8 -*- import scrapy class IetfSpider(scrapy.Spider): name = 'ietf' allowed_domains = ['pythonscraping.com'] start_urls = ['http://pythonscraping.com/linkedin/ietf.html'] def parse(self, response): #title = response.css('span.title::text').get() title = response.xpath('//span[@class="title"]/text()').get() return {"title": title} ================================================ FILE: 01_04_b/ietf_scraper/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = ietf_scraper.settings [deploy] #url = http://localhost:6800/ project = ietf_scraper ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/__init__.py ================================================ ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class IetfScraperItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class IetfScraperSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class IetfScraperDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class IetfScraperPipeline: def process_item(self, item, spider): return item ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for ietf_scraper project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'ietf_scraper' SPIDER_MODULES = ['ietf_scraper.spiders'] NEWSPIDER_MODULE = 'ietf_scraper.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'ietf_scraper (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'ietf_scraper.middlewares.IetfScraperSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'ietf_scraper.middlewares.IetfScraperDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'ietf_scraper.pipelines.IetfScraperPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 01_04_e/ietf_scraper/ietf_scraper/spiders/ietf.py ================================================ # -*- coding: utf-8 -*- import scrapy import w3lib.html class IetfSpider(scrapy.Spider): name = 'ietf' allowed_domains = ['pythonscraping.com'] start_urls = ['http://pythonscraping.com/linkedin/ietf.html'] def parse(self, response): return { 'number': response.xpath('//span[@class="rfc-no"]/text()').get(), 'title': response.xpath('//meta[@name="DC.Title"]/@content').get(), # 'title': response.xpath('//span[@class="title"]/text()').get(), 'date': response.xpath('//span[@class="date"]/text()').get(), # 'date': response.xpath('//meta[@name="DC.Date.Issued"]/@content').get(), 'description': response.xpath('//meta[@name="DC.Description.Abstract"]/@content').get(), 'author': response.xpath('//meta[@name="DC.Creator"]/@content').get(), # 'author': response.xpath('//span[@class="author-name"]/text()').get(), 'company': response.xpath('//span[@class="author-company"]/text()').get(), 'address': response.xpath('//span[@class="address"]/text()').get(), 'text': w3lib.html.remove_tags(response.xpath('//div[@class="text"]').get()), 'headings': response.xpath('//span[@class="subheading"]/text()').getall() } ================================================ FILE: 01_04_e/ietf_scraper/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = ietf_scraper.settings [deploy] #url = http://localhost:6800/ project = ietf_scraper ================================================ FILE: 02_01/article_scraper/article_scraper/__init__.py ================================================ ================================================ FILE: 02_01/article_scraper/article_scraper/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class ArticleScraperItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass ================================================ FILE: 02_01/article_scraper/article_scraper/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleScraperSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleScraperDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_01/article_scraper/article_scraper/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class ArticleScraperPipeline: def process_item(self, item, spider): return item ================================================ FILE: 02_01/article_scraper/article_scraper/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_scraper project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_scraper' SPIDER_MODULES = ['article_scraper.spiders'] NEWSPIDER_MODULE = 'article_scraper.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_scraper (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_scraper.middlewares.ArticleScraperSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_scraper.middlewares.ArticleScraperDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'article_scraper.pipelines.ArticleScraperPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_01/article_scraper/article_scraper/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_01/article_scraper/article_scraper/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True)] def parse_info(self, response): return { 'title': response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()').get(), 'url': response.url, 'last_edited': response.xpath('//li[@id="footer-info-lastmod"]/text()').get() } ================================================ FILE: 02_01/article_scraper/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_scraper.settings [deploy] #url = http://localhost:6800/ project = article_scraper ================================================ FILE: 02_02_b/article_crawler/article_crawler/__init__.py ================================================ ================================================ FILE: 02_02_b/article_crawler/article_crawler/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class ArticleCrawlerItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() pass ================================================ FILE: 02_02_b/article_crawler/article_crawler/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleCrawlerSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleCrawlerDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_02_b/article_crawler/article_crawler/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class ArticleCrawlerPipeline: def process_item(self, item, spider): return item ================================================ FILE: 02_02_b/article_crawler/article_crawler/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_crawler project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_crawler' SPIDER_MODULES = ['article_crawler.spiders'] NEWSPIDER_MODULE = 'article_crawler.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_crawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'article_crawler.pipelines.ArticleCrawlerPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_02_b/article_crawler/article_crawler/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_02_b/article_crawler/article_crawler/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [ Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True) ] def parse_info(self, response): return { "title": response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()'), "url": response.url, "last_edited": response.xpath('//li[@id="footer-info-lastmod"]/text()').get() } ================================================ FILE: 02_02_b/article_crawler/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_crawler.settings [deploy] #url = http://localhost:6800/ project = article_crawler ================================================ FILE: 02_02_e/article_crawler/article_crawler/__init__.py ================================================ ================================================ FILE: 02_02_e/article_crawler/article_crawler/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class Article(scrapy.Item): title = scrapy.Field() url = scrapy.Field() lastUpdated = scrapy.Field() ================================================ FILE: 02_02_e/article_crawler/article_crawler/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleCrawlerSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleCrawlerDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_02_e/article_crawler/article_crawler/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class ArticleCrawlerPipeline: def process_item(self, item, spider): return item ================================================ FILE: 02_02_e/article_crawler/article_crawler/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_crawler project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_crawler' SPIDER_MODULES = ['article_crawler.spiders'] NEWSPIDER_MODULE = 'article_crawler.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_crawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'article_crawler.pipelines.ArticleCrawlerPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_02_e/article_crawler/article_crawler/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_02_e/article_crawler/article_crawler/spiders/articles.csv ================================================ lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 26 May 2020, at 18:28",Obie Award,https://en.wikipedia.org/wiki/Obie_Award " This page was last edited on 5 October 2020, at 13:16",Richard Dean Anderson,https://en.wikipedia.org/wiki/Richard_Dean_Anderson " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO ================================================ FILE: 02_02_e/article_crawler/article_crawler/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from article_crawler.items import Article class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [ Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True) ] def parse_info(self, response): article = Article() article['title']= response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()') article['url'] = response.url article['lastUpdated'] = response.xpath('//li[@id="footer-info-lastmod"]/text()').get() return article ================================================ FILE: 02_02_e/article_crawler/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_crawler.settings [deploy] #url = http://localhost:6800/ project = article_crawler ================================================ FILE: 02_03_b/article_crawler/article_crawler/__init__.py ================================================ ================================================ FILE: 02_03_b/article_crawler/article_crawler/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class Article(scrapy.Item): title = scrapy.Field() url = scrapy.Field() lastUpdated = scrapy.Field() ================================================ FILE: 02_03_b/article_crawler/article_crawler/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleCrawlerSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleCrawlerDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_03_b/article_crawler/article_crawler/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class ArticleCrawlerPipeline: def process_item(self, item, spider): return item ================================================ FILE: 02_03_b/article_crawler/article_crawler/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_crawler project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_crawler' SPIDER_MODULES = ['article_crawler.spiders'] NEWSPIDER_MODULE = 'article_crawler.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_crawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'article_crawler.pipelines.ArticleCrawlerPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_03_b/article_crawler/article_crawler/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_03_b/article_crawler/article_crawler/spiders/articles.csv ================================================ lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 26 May 2020, at 18:28",Obie Award,https://en.wikipedia.org/wiki/Obie_Award " This page was last edited on 5 October 2020, at 13:16",Richard Dean Anderson,https://en.wikipedia.org/wiki/Richard_Dean_Anderson " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO ================================================ FILE: 02_03_b/article_crawler/article_crawler/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from article_crawler.items import Article class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [ Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True) ] def parse_info(self, response): article = Article() article['title']= response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()') article['url'] = response.url article['lastUpdated'] = response.xpath('//li[@id="footer-info-lastmod"]/text()').get() return article ================================================ FILE: 02_03_b/article_crawler/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_crawler.settings [deploy] #url = http://localhost:6800/ project = article_crawler ================================================ FILE: 02_03_e/article_crawler/article_crawler/__init__.py ================================================ ================================================ FILE: 02_03_e/article_crawler/article_crawler/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class Article(scrapy.Item): title = scrapy.Field() url = scrapy.Field() lastUpdated = scrapy.Field() ================================================ FILE: 02_03_e/article_crawler/article_crawler/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleCrawlerSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleCrawlerDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_03_e/article_crawler/article_crawler/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class ArticleCrawlerPipeline: def process_item(self, item, spider): return item ================================================ FILE: 02_03_e/article_crawler/article_crawler/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_crawler project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_crawler' CLOSESPIDER_PAGECOUNT=10 FEED_URI='articles.json' FEED_FORMAT='json' SPIDER_MODULES = ['article_crawler.spiders'] NEWSPIDER_MODULE = 'article_crawler.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_crawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'article_crawler.pipelines.ArticleCrawlerPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_03_e/article_crawler/article_crawler/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_03_e/article_crawler/article_crawler/spiders/articles.csv ================================================ lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 26 May 2020, at 18:28",Obie Award,https://en.wikipedia.org/wiki/Obie_Award " This page was last edited on 5 October 2020, at 13:16",Richard Dean Anderson,https://en.wikipedia.org/wiki/Richard_Dean_Anderson " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 25 September 2020, at 05:22","[]",https://en.wikipedia.org/wiki/She%27s_Having_a_Baby " This page was last edited on 2 July 2020, at 13:53",SNAC,https://en.wikipedia.org/wiki/SNAC " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 20 September 2020, at 07:23",[],https://en.wikipedia.org/wiki/Los_Angeles_Daily_News " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO " This page was last edited on 22 July 2020, at 18:39", (2020 TV series),https://en.wikipedia.org/wiki/Ana_(2020_TV_series) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 3 October 2020, at 11:46",WorldCat,https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO " This page was last edited on 6 June 2020, at 20:53",Bruce Gilbert,https://en.wikipedia.org/wiki/Bruce_Gilbert " This page was last edited on 23 June 2020, at 19:06", (TV series),https://en.wikipedia.org/wiki/The_Remix_(TV_series) " This page was last edited on 6 October 2020, at 13:04",[],https://en.wikipedia.org/wiki/The_New_York_Times lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 11 May 2020, at 14:47",National Library of Latvia,https://en.wikipedia.org/wiki/National_Library_of_Latvia " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 5 October 2020, at 15:12",Judy Garland,https://en.wikipedia.org/wiki/Judy_Garland " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 6 October 2020, at 03:55",IMDb,https://en.wikipedia.org/wiki/IMDb " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 3 October 2020, at 11:46",WorldCat,https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) " This page was last edited on 30 July 2020, at 18:19",Virtual International Authority File,https://en.wikipedia.org/wiki/VIAF_(identifier) " This page was last edited on 18 September 2020, at 03:04",Trove,https://en.wikipedia.org/wiki/Trove " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO ================================================ FILE: 02_03_e/article_crawler/article_crawler/spiders/articles.json ================================================ [ {"title": "Kevin Bacon", "url": "https://en.wikipedia.org/wiki/Kevin_Bacon", "lastUpdated": " This page was last edited on 19 September 2020, at 00:35"}, {"title": "Fox Broadcasting Company", "url": "https://en.wikipedia.org/wiki/Fox_Broadcasting_Company", "lastUpdated": " This page was last edited on 6 October 2020, at 15:27"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Patriots_Day_(film)", "lastUpdated": " This page was last edited on 23 September 2020, at 20:01"}, {"title": " (TV series)", "url": "https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series)", "lastUpdated": " This page was last edited on 18 August 2020, at 20:30"}, {"title": "Screen Actors Guild Awards", "url": "https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award", "lastUpdated": " This page was last edited on 21 July 2020, at 00:07"}, , , {"title": "Primetime Emmy Award", "url": "https://en.wikipedia.org/wiki/Primetime_Emmy_Award", "lastUpdated": " This page was last edited on 22 September 2020, at 10:27"}, {"title": "Golden Globe Awards", "url": "https://en.wikipedia.org/wiki/Golden_Globe_Award", "lastUpdated": " This page was last edited on 8 September 2020, at 12:45"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Black_Mass_(film)", "lastUpdated": " This page was last edited on 1 October 2020, at 17:17"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Frost/Nixon_(film)", "lastUpdated": " This page was last edited on 16 August 2020, at 00:08"}, , , {"title": "HBO", "url": "https://en.wikipedia.org/wiki/HBO", "lastUpdated": " This page was last edited on 7 October 2020, at 00:10"}, , {"title": "Circle in the Square Theatre", "url": "https://en.wikipedia.org/wiki/Circle_in_the_Square", "lastUpdated": " This page was last edited on 27 September 2020, at 21:06"}, {"title": "Main Page", "url": "https://en.wikipedia.org/wiki/Main_Page", "lastUpdated": " This page was last edited on 23 July 2020, at 12:44"}, {"title": "WorldCat", "url": "https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier)", "lastUpdated": " This page was last edited on 3 October 2020, at 11:46"}, {"title": "Virtual International Authority File", "url": "https://en.wikipedia.org/wiki/VIAF_(identifier)", "lastUpdated": " This page was last edited on 30 July 2020, at 18:19"}, {"title": "Trove", "url": "https://en.wikipedia.org/wiki/Trove", "lastUpdated": " This page was last edited on 18 September 2020, at 03:04"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Wild_Things_(film)", "lastUpdated": " This page was last edited on 29 September 2020, at 08:35"}, {"title": "Syst\u00e8me universitaire de documentation", "url": "https://en.wikipedia.org/wiki/SUDOC_(identifier)", "lastUpdated": " This page was last edited on 19 October 2019, at 13:42"}, {"title": "SNAC", "url": "https://en.wikipedia.org/wiki/SNAC", "lastUpdated": " This page was last edited on 2 July 2020, at 13:53"} ] ================================================ FILE: 02_03_e/article_crawler/article_crawler/spiders/articles.xml ================================================ Kevin Baconhttps://en.wikipedia.org/wiki/Kevin_Bacon This page was last edited on 19 September 2020, at 00:35 (TV series)https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) This page was last edited on 18 August 2020, at 20:30 Primetime Emmy Awardhttps://en.wikipedia.org/wiki/Primetime_Emmy_Award This page was last edited on 22 September 2020, at 10:27 SixDegrees.orghttps://en.wikipedia.org/wiki/SixDegrees.org This page was last edited on 20 March 2020, at 11:35 Golden Globe Award for Best Actor – Television Series Musical or Comedyhttps://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy This page was last edited on 14 August 2020, at 04:30 List of social networking websiteshttps://en.wikipedia.org/wiki/Social_networks This page was last edited on 6 September 2020, at 23:58 Six Degrees of Kevin Baconhttps://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon This page was last edited on 2 October 2020, at 20:10 Screen Actors Guild Awardshttps://en.wikipedia.org/wiki/Screen_Actors_Guild_Award This page was last edited on 21 July 2020, at 00:07 <value><Selector xpath='//h1/i/text()' data='The Guardian'></value>https://en.wikipedia.org/wiki/The_Guardian This page was last edited on 18 September 2020, at 16:08 Hollywood Walk of Famehttps://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame This page was last edited on 3 October 2020, at 12:56 Academy Awardshttps://en.wikipedia.org/wiki/Academy_Award This page was last edited on 1 October 2020, at 12:55 Cannes Film Festivalhttps://en.wikipedia.org/wiki/Cannes_Film_Festival This page was last edited on 5 October 2020, at 12:51 <value><Selector xpath='//h1/i/text()' data='Taking Chance'></value>https://en.wikipedia.org/wiki/Taking_Chance This page was last edited on 3 September 2020, at 14:05 Alan Rickmanhttps://en.wikipedia.org/wiki/Alan_Rickman This page was last edited on 7 October 2020, at 00:12 <value><Selector xpath='//h1/i/text()' data='The Following'></value>https://en.wikipedia.org/wiki/The_Following This page was last edited on 11 September 2020, at 16:17 Main Pagehttps://en.wikipedia.org/wiki/Main_Page This page was last edited on 23 July 2020, at 12:44 Fox Broadcasting Companyhttps://en.wikipedia.org/wiki/Fox_Broadcasting_Company This page was last edited on 6 October 2020, at 15:27 Golden Globe Awardshttps://en.wikipedia.org/wiki/Golden_Globe_Award This page was last edited on 8 September 2020, at 12:45 WorldCathttps://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) This page was last edited on 3 October 2020, at 11:46 Virtual International Authority Filehttps://en.wikipedia.org/wiki/VIAF_(identifier) This page was last edited on 30 July 2020, at 18:19 HBOhttps://en.wikipedia.org/wiki/HBO This page was last edited on 7 October 2020, at 00:10 Trovehttps://en.wikipedia.org/wiki/Trove This page was last edited on 18 September 2020, at 03:04 Système universitaire de documentationhttps://en.wikipedia.org/wiki/SUDOC_(identifier) This page was last edited on 19 October 2019, at 13:42 ================================================ FILE: 02_03_e/article_crawler/article_crawler/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from article_crawler.items import Article class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [ Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True) ] custom_settings={ 'FEED_URI': 'articles.xml', 'FEED_FORMAT': 'xml' } def parse_info(self, response): article = Article() article['title']= response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()') article['url'] = response.url article['lastUpdated'] = response.xpath('//li[@id="footer-info-lastmod"]/text()').get() return article ================================================ FILE: 02_03_e/article_crawler/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_crawler.settings [deploy] #url = http://localhost:6800/ project = article_crawler ================================================ FILE: 02_04_b/article_crawler/article_crawler/__init__.py ================================================ ================================================ FILE: 02_04_b/article_crawler/article_crawler/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class Article(scrapy.Item): title = scrapy.Field() url = scrapy.Field() lastUpdated = scrapy.Field() ================================================ FILE: 02_04_b/article_crawler/article_crawler/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleCrawlerSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleCrawlerDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_04_b/article_crawler/article_crawler/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class ArticleCrawlerPipeline: def process_item(self, item, spider): return item ================================================ FILE: 02_04_b/article_crawler/article_crawler/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_crawler project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_crawler' CLOSESPIDER_PAGECOUNT=10 FEED_URI='articles.json' FEED_FORMAT='json' SPIDER_MODULES = ['article_crawler.spiders'] NEWSPIDER_MODULE = 'article_crawler.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_crawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'article_crawler.pipelines.ArticleCrawlerPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_04_b/article_crawler/article_crawler/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_04_b/article_crawler/article_crawler/spiders/articles.csv ================================================ lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 26 May 2020, at 18:28",Obie Award,https://en.wikipedia.org/wiki/Obie_Award " This page was last edited on 5 October 2020, at 13:16",Richard Dean Anderson,https://en.wikipedia.org/wiki/Richard_Dean_Anderson " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 25 September 2020, at 05:22","[]",https://en.wikipedia.org/wiki/She%27s_Having_a_Baby " This page was last edited on 2 July 2020, at 13:53",SNAC,https://en.wikipedia.org/wiki/SNAC " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 20 September 2020, at 07:23",[],https://en.wikipedia.org/wiki/Los_Angeles_Daily_News " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO " This page was last edited on 22 July 2020, at 18:39", (2020 TV series),https://en.wikipedia.org/wiki/Ana_(2020_TV_series) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 3 October 2020, at 11:46",WorldCat,https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO " This page was last edited on 6 June 2020, at 20:53",Bruce Gilbert,https://en.wikipedia.org/wiki/Bruce_Gilbert " This page was last edited on 23 June 2020, at 19:06", (TV series),https://en.wikipedia.org/wiki/The_Remix_(TV_series) " This page was last edited on 6 October 2020, at 13:04",[],https://en.wikipedia.org/wiki/The_New_York_Times lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 11 May 2020, at 14:47",National Library of Latvia,https://en.wikipedia.org/wiki/National_Library_of_Latvia " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 5 October 2020, at 15:12",Judy Garland,https://en.wikipedia.org/wiki/Judy_Garland " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 6 October 2020, at 03:55",IMDb,https://en.wikipedia.org/wiki/IMDb " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 3 October 2020, at 11:46",WorldCat,https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) " This page was last edited on 30 July 2020, at 18:19",Virtual International Authority File,https://en.wikipedia.org/wiki/VIAF_(identifier) " This page was last edited on 18 September 2020, at 03:04",Trove,https://en.wikipedia.org/wiki/Trove " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO ================================================ FILE: 02_04_b/article_crawler/article_crawler/spiders/articles.json ================================================ [ {"title": "Kevin Bacon", "url": "https://en.wikipedia.org/wiki/Kevin_Bacon", "lastUpdated": " This page was last edited on 19 September 2020, at 00:35"}, {"title": "Fox Broadcasting Company", "url": "https://en.wikipedia.org/wiki/Fox_Broadcasting_Company", "lastUpdated": " This page was last edited on 6 October 2020, at 15:27"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Patriots_Day_(film)", "lastUpdated": " This page was last edited on 23 September 2020, at 20:01"}, {"title": " (TV series)", "url": "https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series)", "lastUpdated": " This page was last edited on 18 August 2020, at 20:30"}, {"title": "Screen Actors Guild Awards", "url": "https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award", "lastUpdated": " This page was last edited on 21 July 2020, at 00:07"}, , , {"title": "Primetime Emmy Award", "url": "https://en.wikipedia.org/wiki/Primetime_Emmy_Award", "lastUpdated": " This page was last edited on 22 September 2020, at 10:27"}, {"title": "Golden Globe Awards", "url": "https://en.wikipedia.org/wiki/Golden_Globe_Award", "lastUpdated": " This page was last edited on 8 September 2020, at 12:45"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Black_Mass_(film)", "lastUpdated": " This page was last edited on 1 October 2020, at 17:17"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Frost/Nixon_(film)", "lastUpdated": " This page was last edited on 16 August 2020, at 00:08"}, , , {"title": "HBO", "url": "https://en.wikipedia.org/wiki/HBO", "lastUpdated": " This page was last edited on 7 October 2020, at 00:10"}, , {"title": "Circle in the Square Theatre", "url": "https://en.wikipedia.org/wiki/Circle_in_the_Square", "lastUpdated": " This page was last edited on 27 September 2020, at 21:06"}, {"title": "Main Page", "url": "https://en.wikipedia.org/wiki/Main_Page", "lastUpdated": " This page was last edited on 23 July 2020, at 12:44"}, {"title": "WorldCat", "url": "https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier)", "lastUpdated": " This page was last edited on 3 October 2020, at 11:46"}, {"title": "Virtual International Authority File", "url": "https://en.wikipedia.org/wiki/VIAF_(identifier)", "lastUpdated": " This page was last edited on 30 July 2020, at 18:19"}, {"title": "Trove", "url": "https://en.wikipedia.org/wiki/Trove", "lastUpdated": " This page was last edited on 18 September 2020, at 03:04"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Wild_Things_(film)", "lastUpdated": " This page was last edited on 29 September 2020, at 08:35"}, {"title": "Syst\u00e8me universitaire de documentation", "url": "https://en.wikipedia.org/wiki/SUDOC_(identifier)", "lastUpdated": " This page was last edited on 19 October 2019, at 13:42"}, {"title": "SNAC", "url": "https://en.wikipedia.org/wiki/SNAC", "lastUpdated": " This page was last edited on 2 July 2020, at 13:53"} ] ================================================ FILE: 02_04_b/article_crawler/article_crawler/spiders/articles.xml ================================================ Kevin Baconhttps://en.wikipedia.org/wiki/Kevin_Bacon This page was last edited on 19 September 2020, at 00:35 (TV series)https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) This page was last edited on 18 August 2020, at 20:30 Primetime Emmy Awardhttps://en.wikipedia.org/wiki/Primetime_Emmy_Award This page was last edited on 22 September 2020, at 10:27 SixDegrees.orghttps://en.wikipedia.org/wiki/SixDegrees.org This page was last edited on 20 March 2020, at 11:35 Golden Globe Award for Best Actor – Television Series Musical or Comedyhttps://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy This page was last edited on 14 August 2020, at 04:30 List of social networking websiteshttps://en.wikipedia.org/wiki/Social_networks This page was last edited on 6 September 2020, at 23:58 Six Degrees of Kevin Baconhttps://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon This page was last edited on 2 October 2020, at 20:10 Screen Actors Guild Awardshttps://en.wikipedia.org/wiki/Screen_Actors_Guild_Award This page was last edited on 21 July 2020, at 00:07 <value><Selector xpath='//h1/i/text()' data='The Guardian'></value>https://en.wikipedia.org/wiki/The_Guardian This page was last edited on 18 September 2020, at 16:08 Hollywood Walk of Famehttps://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame This page was last edited on 3 October 2020, at 12:56 Academy Awardshttps://en.wikipedia.org/wiki/Academy_Award This page was last edited on 1 October 2020, at 12:55 Cannes Film Festivalhttps://en.wikipedia.org/wiki/Cannes_Film_Festival This page was last edited on 5 October 2020, at 12:51 <value><Selector xpath='//h1/i/text()' data='Taking Chance'></value>https://en.wikipedia.org/wiki/Taking_Chance This page was last edited on 3 September 2020, at 14:05 Alan Rickmanhttps://en.wikipedia.org/wiki/Alan_Rickman This page was last edited on 7 October 2020, at 00:12 <value><Selector xpath='//h1/i/text()' data='The Following'></value>https://en.wikipedia.org/wiki/The_Following This page was last edited on 11 September 2020, at 16:17 Main Pagehttps://en.wikipedia.org/wiki/Main_Page This page was last edited on 23 July 2020, at 12:44 Fox Broadcasting Companyhttps://en.wikipedia.org/wiki/Fox_Broadcasting_Company This page was last edited on 6 October 2020, at 15:27 Golden Globe Awardshttps://en.wikipedia.org/wiki/Golden_Globe_Award This page was last edited on 8 September 2020, at 12:45 WorldCathttps://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) This page was last edited on 3 October 2020, at 11:46 Virtual International Authority Filehttps://en.wikipedia.org/wiki/VIAF_(identifier) This page was last edited on 30 July 2020, at 18:19 HBOhttps://en.wikipedia.org/wiki/HBO This page was last edited on 7 October 2020, at 00:10 Trovehttps://en.wikipedia.org/wiki/Trove This page was last edited on 18 September 2020, at 03:04 Système universitaire de documentationhttps://en.wikipedia.org/wiki/SUDOC_(identifier) This page was last edited on 19 October 2019, at 13:42 ================================================ FILE: 02_04_b/article_crawler/article_crawler/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from article_crawler.items import Article class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [ Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True) ] custom_settings={ 'FEED_URI': 'articles.xml', 'FEED_FORMAT': 'xml' } def parse_info(self, response): article = Article() article['title']= response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()') article['url'] = response.url article['lastUpdated'] = response.xpath('//li[@id="footer-info-lastmod"]/text()').get() return article ================================================ FILE: 02_04_b/article_crawler/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_crawler.settings [deploy] #url = http://localhost:6800/ project = article_crawler ================================================ FILE: 02_04_e/article_crawler/article_crawler/__init__.py ================================================ ================================================ FILE: 02_04_e/article_crawler/article_crawler/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class Article(scrapy.Item): title = scrapy.Field() url = scrapy.Field() lastUpdated = scrapy.Field() ================================================ FILE: 02_04_e/article_crawler/article_crawler/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class ArticleCrawlerSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class ArticleCrawlerDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_04_e/article_crawler/article_crawler/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html from scrapy.exceptions import DropItem from datetime import datetime class CheckItemPipeline: def process_item(self, article, spider): if not article['lastUPdated'] or not article['url'] or not article['title']: raise DropItem('Missing something!') return article class CleanDatePipeline: def process_item(self, article, spider): article['lastUpdated'].replace('This page was last edited on', '').strip() article['lastUpdated'] = datetime.strptime(article['lastUpdated'], '%d %B %Y, at %H:%M') return article ================================================ FILE: 02_04_e/article_crawler/article_crawler/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for article_crawler project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'article_crawler' CLOSESPIDER_PAGECOUNT=10 FEED_URI='articles.json' FEED_FORMAT='json' SPIDER_MODULES = ['article_crawler.spiders'] NEWSPIDER_MODULE = 'article_crawler.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'article_crawler (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'article_crawler.middlewares.ArticleCrawlerDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { 'article_crawler.pipelines.CheckItemPipeline': 100, 'article_crawler.pipelines.CleanDatePipeline': 200, } # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_04_e/article_crawler/article_crawler/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_04_e/article_crawler/article_crawler/spiders/articles.csv ================================================ lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 26 May 2020, at 18:28",Obie Award,https://en.wikipedia.org/wiki/Obie_Award " This page was last edited on 5 October 2020, at 13:16",Richard Dean Anderson,https://en.wikipedia.org/wiki/Richard_Dean_Anderson " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 25 September 2020, at 05:22","[]",https://en.wikipedia.org/wiki/She%27s_Having_a_Baby " This page was last edited on 2 July 2020, at 13:53",SNAC,https://en.wikipedia.org/wiki/SNAC " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 20 September 2020, at 07:23",[],https://en.wikipedia.org/wiki/Los_Angeles_Daily_News " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO " This page was last edited on 22 July 2020, at 18:39", (2020 TV series),https://en.wikipedia.org/wiki/Ana_(2020_TV_series) " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 3 October 2020, at 11:46",WorldCat,https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO " This page was last edited on 6 June 2020, at 20:53",Bruce Gilbert,https://en.wikipedia.org/wiki/Bruce_Gilbert " This page was last edited on 23 June 2020, at 19:06", (TV series),https://en.wikipedia.org/wiki/The_Remix_(TV_series) " This page was last edited on 6 October 2020, at 13:04",[],https://en.wikipedia.org/wiki/The_New_York_Times lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 1 October 2020, at 17:17", (film),https://en.wikipedia.org/wiki/Black_Mass_(film) " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 11 May 2020, at 14:47",National Library of Latvia,https://en.wikipedia.org/wiki/National_Library_of_Latvia " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 23 September 2020, at 20:01", (film),https://en.wikipedia.org/wiki/Patriots_Day_(film) " This page was last edited on 5 October 2020, at 15:12",Judy Garland,https://en.wikipedia.org/wiki/Judy_Garland " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO lastUpdated,title,url " This page was last edited on 19 September 2020, at 00:35",Kevin Bacon,https://en.wikipedia.org/wiki/Kevin_Bacon " This page was last edited on 18 August 2020, at 20:30", (TV series),https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) " This page was last edited on 22 September 2020, at 10:27",Primetime Emmy Award,https://en.wikipedia.org/wiki/Primetime_Emmy_Award " This page was last edited on 20 March 2020, at 11:35",SixDegrees.org,https://en.wikipedia.org/wiki/SixDegrees.org " This page was last edited on 2 October 2020, at 20:10",Six Degrees of Kevin Bacon,https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon " This page was last edited on 1 October 2020, at 12:55",Academy Awards,https://en.wikipedia.org/wiki/Academy_Award " This page was last edited on 18 September 2020, at 16:08",[],https://en.wikipedia.org/wiki/The_Guardian " This page was last edited on 3 October 2020, at 12:56",Hollywood Walk of Fame,https://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame " This page was last edited on 14 August 2020, at 04:30",Golden Globe Award for Best Actor – Television Series Musical or Comedy,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy " This page was last edited on 3 September 2020, at 14:05",[],https://en.wikipedia.org/wiki/Taking_Chance " This page was last edited on 6 September 2020, at 23:58",List of social networking websites,https://en.wikipedia.org/wiki/Social_networks " This page was last edited on 21 July 2020, at 00:07",Screen Actors Guild Awards,https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award " This page was last edited on 8 September 2020, at 12:45",Golden Globe Awards,https://en.wikipedia.org/wiki/Golden_Globe_Award " This page was last edited on 11 September 2020, at 16:17",[],https://en.wikipedia.org/wiki/The_Following " This page was last edited on 6 October 2020, at 03:55",IMDb,https://en.wikipedia.org/wiki/IMDb " This page was last edited on 23 July 2020, at 12:44",Main Page,https://en.wikipedia.org/wiki/Main_Page " This page was last edited on 3 October 2020, at 11:46",WorldCat,https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) " This page was last edited on 30 July 2020, at 18:19",Virtual International Authority File,https://en.wikipedia.org/wiki/VIAF_(identifier) " This page was last edited on 18 September 2020, at 03:04",Trove,https://en.wikipedia.org/wiki/Trove " This page was last edited on 6 October 2020, at 15:27",Fox Broadcasting Company,https://en.wikipedia.org/wiki/Fox_Broadcasting_Company " This page was last edited on 7 October 2020, at 00:10",HBO,https://en.wikipedia.org/wiki/HBO ================================================ FILE: 02_04_e/article_crawler/article_crawler/spiders/articles.json ================================================ [ {"title": "Kevin Bacon", "url": "https://en.wikipedia.org/wiki/Kevin_Bacon", "lastUpdated": " This page was last edited on 19 September 2020, at 00:35"}, {"title": "Fox Broadcasting Company", "url": "https://en.wikipedia.org/wiki/Fox_Broadcasting_Company", "lastUpdated": " This page was last edited on 6 October 2020, at 15:27"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Patriots_Day_(film)", "lastUpdated": " This page was last edited on 23 September 2020, at 20:01"}, {"title": " (TV series)", "url": "https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series)", "lastUpdated": " This page was last edited on 18 August 2020, at 20:30"}, {"title": "Screen Actors Guild Awards", "url": "https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award", "lastUpdated": " This page was last edited on 21 July 2020, at 00:07"}, , , {"title": "Primetime Emmy Award", "url": "https://en.wikipedia.org/wiki/Primetime_Emmy_Award", "lastUpdated": " This page was last edited on 22 September 2020, at 10:27"}, {"title": "Golden Globe Awards", "url": "https://en.wikipedia.org/wiki/Golden_Globe_Award", "lastUpdated": " This page was last edited on 8 September 2020, at 12:45"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Black_Mass_(film)", "lastUpdated": " This page was last edited on 1 October 2020, at 17:17"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Frost/Nixon_(film)", "lastUpdated": " This page was last edited on 16 August 2020, at 00:08"}, , , {"title": "HBO", "url": "https://en.wikipedia.org/wiki/HBO", "lastUpdated": " This page was last edited on 7 October 2020, at 00:10"}, , {"title": "Circle in the Square Theatre", "url": "https://en.wikipedia.org/wiki/Circle_in_the_Square", "lastUpdated": " This page was last edited on 27 September 2020, at 21:06"}, {"title": "Main Page", "url": "https://en.wikipedia.org/wiki/Main_Page", "lastUpdated": " This page was last edited on 23 July 2020, at 12:44"}, {"title": "WorldCat", "url": "https://en.wikipedia.org/wiki/WorldCat_Identities_(identifier)", "lastUpdated": " This page was last edited on 3 October 2020, at 11:46"}, {"title": "Virtual International Authority File", "url": "https://en.wikipedia.org/wiki/VIAF_(identifier)", "lastUpdated": " This page was last edited on 30 July 2020, at 18:19"}, {"title": "Trove", "url": "https://en.wikipedia.org/wiki/Trove", "lastUpdated": " This page was last edited on 18 September 2020, at 03:04"}, {"title": " (film)", "url": "https://en.wikipedia.org/wiki/Wild_Things_(film)", "lastUpdated": " This page was last edited on 29 September 2020, at 08:35"}, {"title": "Syst\u00e8me universitaire de documentation", "url": "https://en.wikipedia.org/wiki/SUDOC_(identifier)", "lastUpdated": " This page was last edited on 19 October 2019, at 13:42"}, {"title": "SNAC", "url": "https://en.wikipedia.org/wiki/SNAC", "lastUpdated": " This page was last edited on 2 July 2020, at 13:53"} ] ================================================ FILE: 02_04_e/article_crawler/article_crawler/spiders/articles.xml ================================================ Kevin Baconhttps://en.wikipedia.org/wiki/Kevin_Bacon This page was last edited on 19 September 2020, at 00:35 (TV series)https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series) This page was last edited on 18 August 2020, at 20:30 Primetime Emmy Awardhttps://en.wikipedia.org/wiki/Primetime_Emmy_Award This page was last edited on 22 September 2020, at 10:27 SixDegrees.orghttps://en.wikipedia.org/wiki/SixDegrees.org This page was last edited on 20 March 2020, at 11:35 Golden Globe Award for Best Actor – Television Series Musical or Comedyhttps://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy This page was last edited on 14 August 2020, at 04:30 List of social networking websiteshttps://en.wikipedia.org/wiki/Social_networks This page was last edited on 6 September 2020, at 23:58 Six Degrees of Kevin Baconhttps://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon This page was last edited on 2 October 2020, at 20:10 Screen Actors Guild Awardshttps://en.wikipedia.org/wiki/Screen_Actors_Guild_Award This page was last edited on 21 July 2020, at 00:07 <value><Selector xpath='//h1/i/text()' data='The Guardian'></value>https://en.wikipedia.org/wiki/The_Guardian This page was last edited on 18 September 2020, at 16:08 Hollywood Walk of Famehttps://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame This page was last edited on 3 October 2020, at 12:56 Academy Awardshttps://en.wikipedia.org/wiki/Academy_Award This page was last edited on 1 October 2020, at 12:55 Cannes Film Festivalhttps://en.wikipedia.org/wiki/Cannes_Film_Festival This page was last edited on 5 October 2020, at 12:51 <value><Selector xpath='//h1/i/text()' data='Taking Chance'></value>https://en.wikipedia.org/wiki/Taking_Chance This page was last edited on 3 September 2020, at 14:05 Alan Rickmanhttps://en.wikipedia.org/wiki/Alan_Rickman This page was last edited on 7 October 2020, at 00:12 <value><Selector xpath='//h1/i/text()' data='The Following'></value>https://en.wikipedia.org/wiki/The_Following This page was last edited on 11 September 2020, at 16:17 Main Pagehttps://en.wikipedia.org/wiki/Main_Page This page was last edited on 23 July 2020, at 12:44 Fox Broadcasting Companyhttps://en.wikipedia.org/wiki/Fox_Broadcasting_Company This page was last edited on 6 October 2020, at 15:27 Golden Globe Awardshttps://en.wikipedia.org/wiki/Golden_Globe_Award This page was last edited on 8 September 2020, at 12:45 WorldCathttps://en.wikipedia.org/wiki/WorldCat_Identities_(identifier) This page was last edited on 3 October 2020, at 11:46 Virtual International Authority Filehttps://en.wikipedia.org/wiki/VIAF_(identifier) This page was last edited on 30 July 2020, at 18:19 HBOhttps://en.wikipedia.org/wiki/HBO This page was last edited on 7 October 2020, at 00:10 Trovehttps://en.wikipedia.org/wiki/Trove This page was last edited on 18 September 2020, at 03:04 Système universitaire de documentationhttps://en.wikipedia.org/wiki/SUDOC_(identifier) This page was last edited on 19 October 2019, at 13:42 Kevin Baconhttps://en.wikipedia.org/wiki/Kevin_Bacon2020-09-19 00:35:00 (TV series)https://en.wikipedia.org/wiki/I_Love_Dick_(TV_series)2020-08-18 20:30:00 SixDegrees.orghttps://en.wikipedia.org/wiki/SixDegrees.org2020-03-20 11:35:00 Golden Globe Award for Best Actor – Television Series Musical or Comedyhttps://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy2020-08-14 04:30:00 List of social networking websiteshttps://en.wikipedia.org/wiki/Social_networks2020-09-06 23:58:00 Six Degrees of Kevin Baconhttps://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon2020-10-02 20:10:00 Primetime Emmy Awardhttps://en.wikipedia.org/wiki/Primetime_Emmy_Award2020-09-22 10:27:00 Academy Awardshttps://en.wikipedia.org/wiki/Academy_Award2020-10-01 12:55:00 Hollywood Walk of Famehttps://en.wikipedia.org/wiki/Hollywood_Walk_of_Fame2020-10-03 12:56:00 <value><Selector xpath='//h1/i/text()' data='The Guardian'></value>https://en.wikipedia.org/wiki/The_Guardian2020-09-18 16:08:00 <value><Selector xpath='//h1/i/text()' data='The Following'></value>https://en.wikipedia.org/wiki/The_Following2020-09-11 16:17:00 <value><Selector xpath='//h1/i/text()' data='Taking Chance'></value>https://en.wikipedia.org/wiki/Taking_Chance2020-09-03 14:05:00 Screen Actors Guild Awardshttps://en.wikipedia.org/wiki/Screen_Actors_Guild_Award2020-07-21 00:07:00 Seattle International Film Festivalhttps://en.wikipedia.org/wiki/Seattle_International_Film_Festival2020-04-26 00:39:00 Golden Globe Awardshttps://en.wikipedia.org/wiki/Golden_Globe_Award2020-09-08 12:45:00 Main Pagehttps://en.wikipedia.org/wiki/Main_Page2020-07-23 12:44:00 WorldCathttps://en.wikipedia.org/wiki/WorldCat_Identities_(identifier)2020-10-03 11:46:00 Fox Broadcasting Companyhttps://en.wikipedia.org/wiki/Fox_Broadcasting_Company2020-10-06 15:27:00 Virtual International Authority Filehttps://en.wikipedia.org/wiki/VIAF_(identifier)2020-07-30 18:19:00 Trovehttps://en.wikipedia.org/wiki/Trove2020-09-18 03:04:00 Système universitaire de documentationhttps://en.wikipedia.org/wiki/SUDOC_(identifier)2019-10-19 13:42:00 SNAChttps://en.wikipedia.org/wiki/SNAC2020-07-02 13:53:00 HBOhttps://en.wikipedia.org/wiki/HBO2020-10-07 00:10:00 ================================================ FILE: 02_04_e/article_crawler/article_crawler/spiders/wikipedia.py ================================================ # -*- coding: utf-8 -*- import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from article_crawler.items import Article class WikipediaSpider(CrawlSpider): name = 'wikipedia' allowed_domains = ['en.wikipedia.org'] start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon'] rules = [ Rule(LinkExtractor(allow=r'wiki/((?!:).)*$'), callback='parse_info', follow=True) ] custom_settings={ 'FEED_URI': 'articles.xml', 'FEED_FORMAT': 'xml' } def parse_info(self, response): article = Article() article['title']= response.xpath('//h1/text()').get() or response.xpath('//h1/i/text()') article['url'] = response.url article['lastUpdated'] = response.xpath('//li[@id="footer-info-lastmod"]/text()').get() return article ================================================ FILE: 02_04_e/article_crawler/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = article_crawler.settings [deploy] #url = http://localhost:6800/ project = article_crawler ================================================ FILE: 02_05/news_scraper/news_scraper/__init__.py ================================================ ================================================ FILE: 02_05/news_scraper/news_scraper/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class NewsArticle(scrapy.Item): url = scrapy.Field() source = scrapy.Field() title = scrapy.Field() description = scrapy.Field() date = scrapy.Field() author = scrapy.Field() text = scrapy.Field() ================================================ FILE: 02_05/news_scraper/news_scraper/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class NewsScraperSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Request, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class NewsScraperDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: 02_05/news_scraper/news_scraper/pipelines.py ================================================ # -*- coding: utf-8 -*- from datetime import datetime class NewsScraperPipeline: def process_item(self, item, spider): item.date = datetime.strptime(item.date.split('T')[0], '%Y-%B-%D') item.author = item.author.replace(', CNN', '') item.text = [text.strip() for text in item.text] return item ================================================ FILE: 02_05/news_scraper/news_scraper/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for news_scraper project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'news_scraper' SPIDER_MODULES = ['news_scraper.spiders'] NEWSPIDER_MODULE = 'news_scraper.spiders' CLOSESPIDER_PAGECOUNT=10 FEED_URI='news_articles.json' FEED_FORMAT='json' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'news_scraper (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'news_scraper.middlewares.NewsScraperSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'news_scraper.middlewares.NewsScraperDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'news_scraper.pipelines.NewsScraperPipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: 02_05/news_scraper/news_scraper/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: 02_05/news_scraper/news_scraper/spiders/associated_press.py ================================================ # -*- coding: utf-8 -*- from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from news_scraper.items import NewsArticle import json class AssociatedPressSpider(CrawlSpider): name = 'associated_press' allowed_domains = ['apnews.com'] start_urls = ['http://apnews.com/'] rules = [Rule(LinkExtractor(allow=r'\/article\/[a-zA-Z\-]+\-[a-zA-Z0-9]{32}'), callback='parse_item', follow=True)] def parse_item(self, response): article = NewsArticle() #