Repository: yijingping/unicrawler Branch: master Commit: 64c1fd0ccabe Files: 56 Total size: 75.0 KB Directory structure: gitextract_cq7_c7ie/ ├── .gitignore ├── LICENSE ├── README.md ├── bin/ │ ├── downloader.py │ ├── extractor.py │ ├── processor.py │ └── scheduler.py ├── configs/ │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── management/ │ │ ├── __init__.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── checkproxies.py │ ├── migrations/ │ │ ├── 0001_initial.py │ │ ├── 0002_auto_20160201_1627.py │ │ ├── 0003_proxy_url.py │ │ ├── 0004_auto_20160202_1712.py │ │ └── __init__.py │ ├── models.py │ ├── proxies.py │ └── util.py ├── cores/ │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── constants.py │ ├── downloaders.py │ ├── extractors.py │ ├── migrations/ │ │ ├── 0001_initial.py │ │ ├── 0002_detailrule_exclude.py │ │ ├── 0003_auto_20160131_2226.py │ │ ├── 0004_auto_20160201_1035.py │ │ ├── 0005_detailrule_multi.py │ │ ├── 0006_detailrule_fresh_time.py │ │ ├── 0007_detailrule_multi_unique.py │ │ ├── 0008_auto_20160407_1426.py │ │ └── __init__.py │ ├── models.py │ ├── processors.py │ └── util.py ├── crontab ├── manage.py ├── monitors/ │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── management/ │ │ ├── __init__.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── monitor.py │ ├── migrations/ │ │ ├── 0001_initial.py │ │ └── __init__.py │ └── models.py ├── requirements.txt ├── supervisord.conf └── unicrawler/ ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea *.pyc logs local_settings.py .DS_Store /static/ bin/t.py ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 yijingping Permission is hereby granted, free of charge, to any person 
obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # unicrawler 一个通用的可配置的的爬虫 # 安装 1)python环境, 检查python的版本,是否为2.7.x,如果不是,安装2.7.6。 centos 6.x 升级python2.6到python2.7,参考教程 http://ruiaylin.github.io/2014/12/12/python%20update/ 2)安装依赖包, clone代码 Mysql-python依赖 ``` yum install python-devel mysql-devel gcc ``` lxml依赖 ``` yum install libxslt-devel libxml2-devel ``` 安装浏览器环境 selenium依赖 ``` yum install xorg-x11-server-Xvfb yum install firefox ``` clone代码,安装依赖python库 ``` $ git clone https://github.com/yijingping/unicrawler.git $ cd unicrawler $ pip install -r requirements.txt ``` 3) 初始化mysql a) 安装mysql-server后,记得设置字符为utf8mb4。在my.cnf中设置: ``` [client] default-character-set = utf8mb4 [mysql] default-character-set = utf8mb4 [mysqld] character-set-client-handshake = FALSE character-set-server = utf8mb4 collation-server = utf8mb4_unicode_ci ``` b) 重启数据库 c) 创建数据库unicrawler ``` mysql> CREATE DATABASE `unicrawler` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; ``` d) 初始化表 ``` $ 
python manage.py migrate ``` 5)运行 ``` python manage.py runserver 0.0.0.0:8001 ``` 访问 http://localhost:8001/。 测试没问题后,参考后面的supervisor脚本启动。 # 部署nginx 前期先用nginx将域名www.mydomain.com转发到8001端口。 # 部署supervisor脚本 参考文件 `supervisord.conf` # 部署crontab脚本 参考文件 `crontab` ================================================ FILE: bin/downloader.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' # 加载django环境 import sys import os reload(sys) sys.setdefaultencoding('utf8') sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) os.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings' import django django.setup() import time import json from django.conf import settings from cores.models import Site from cores.util import get_redis, get_uniqueid from cores.constants import KIND_DETAIL_URL from cores.downloaders import RequestsDownloaderBackend, SeleniumDownloaderBackend from configs.proxies import MysqlProxyBackend import logging logger = logging.getLogger() class Downloader(object): def __init__(self): self.redis = get_redis() def get_proxy(self, kind): if kind == Site.PROXY_MYSQL: return MysqlProxyBackend() else: return None def check_limit_speed(self, config): if config["limit_speed"] <= 0: return False, None else: proxy = self.get_proxy(config['proxy']) key = 'unicrawler:limit_speed:%s:%s' % (config['domain'], proxy) if self.redis.exists(key): return True, proxy else: self.redis.psetex(key, config["limit_speed"], config["limit_speed"]) return False, proxy def check_detail_fresh_time(self, data): unique_key, fresh_time, rule_id = data['unique_key'], data["detail_fresh_time"], data["rule_id"] if fresh_time <= 0: return False else: unique_value = ''.join([str(data.get(item)) for item in unique_key]) key = 'unicrawler:detail_fresh_time:%s:%s' % (rule_id, get_uniqueid(unique_value)) if self.redis.exists(key): return True else: self.redis.setex(key, fresh_time, fresh_time) return False def run(self): r = self.redis 
if settings.CRAWLER_DEBUG: r.delete(settings.CRAWLER_CONFIG["downloader"]) while True: try: resp_data = r.brpop(settings.CRAWLER_CONFIG["downloader"]) except Exception as e: print e continue try: data = json.loads(resp_data[1]) site_config = data['site_config'] logger.debug(data["url"]) is_limited, proxy = self.check_limit_speed(site_config) if is_limited: print '# 被限制, 放回去, 下次下载' time.sleep(1) # 休息一秒, 延迟放回去的时间 r.lpush(settings.CRAWLER_CONFIG["downloader"], resp_data[1]) elif (data["kind"] == KIND_DETAIL_URL and self.check_detail_fresh_time(data)): print '# 该详情页已下载过, 不下载了' else: print '# 未被限制,可以下载' if site_config['browser'] == Site.BROWSER_NONE: browser = RequestsDownloaderBackend(proxy=proxy) data['body'] = browser.download(data["url"]) elif site_config['browser'] == Site.BROWSER_NORMAL: with SeleniumDownloaderBackend(proxy=proxy) as browser: data['body'] = browser.download(data["url"]) else: return r.lpush(settings.CRAWLER_CONFIG["extractor"], json.dumps(data)) logger.debug(data) except Exception as e: print e raise if __name__ == '__main__': downloader = Downloader() downloader.run() ================================================ FILE: bin/extractor.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' # 加载django环境 import sys import os reload(sys) sys.setdefaultencoding('utf8') sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) os.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings' import django django.setup() import json from cores.constants import KIND_LIST_URL, KIND_DETAIL_URL from django.conf import settings from cores.util import get_redis, get_uniqueid from cores.extractors import XPathExtractor, PythonExtractor, ImageExtractor, VideoExtractor import logging logger = logging.getLogger() class Extractor(object): def __init__(self): self.redis = get_redis() def extract(self, content, rules, context): res = content for rule in rules: try: extractor = None if rule["kind"] == 
"xpath": extractor = XPathExtractor(res, rule["data"]) elif rule["kind"] == "python": extractor = PythonExtractor(rule["data"], res, context=context) elif rule["kind"] == "image": extractor = ImageExtractor(res) elif rule["kind"] == "video": extractor = VideoExtractor(res) res = extractor.extract() except Exception as e: logger.exception(e) return res def check_detail_fresh_time(self, unique_url, data): fresh_time, rule_id = data["detail_fresh_time"], data["rule_id"] if fresh_time <= 0: return False else: key = 'unicrawler:detail_fresh_time:%s:%s' % (rule_id, get_uniqueid(unique_url)) if self.redis.exists(key): return True else: self.redis.setex(key, fresh_time, fresh_time) return False def get_detail(self, content, data): # 检查是否在exclude规则内. 如果在,放弃存储 exclude_rules = data['detail_exclude'] excluded = self.extract(content, exclude_rules, {'data': data}) if excluded and excluded != content: logger.debug('# url in excludes, abort!') return # 不在exclude规则内,可以存储 result = { "url": data['url'], "seed_id": data['seed_id'], "rule_id": data['rule_id'], 'detail_multi': data['detail_multi'] } rules = data['detail_rules'] for item in rules: col = item["key"] print col col_rules = item["rules"] col_value = self.extract(content, col_rules, {'data': result}) result[col] = col_value # 提前检查多项详情新鲜度 if col == 'url': if data['detail_multi']: if self.check_detail_fresh_time(result['url'], data): # 未过期,不更新 logger.debug('检查多项详情未过期,不更新') return # 更新 self.redis.lpush(settings.CRAWLER_CONFIG["processor"], json.dumps(result)) logger.debug('extracted:%s' % result) def run(self): r = get_redis() if settings.CRAWLER_DEBUG: r.delete(settings.CRAWLER_CONFIG["extractor"]) while True: try: data = r.brpop(settings.CRAWLER_CONFIG["extractor"]) except Exception as e: print e continue #print data data = json.loads(data[1]) body = data['body'] # 1 如果当前接卸的页面是列表页 if data["kind"] == KIND_LIST_URL: # 1.1先找详情页 # 检查详情的内容是否都包含在列表页中 multi_rules = data['detail_multi'] if multi_rules: # 1.1.1 详情都包含在列表页中 multi_parts 
= self.extract(body, multi_rules, {'data': data}) for part in multi_parts: self.get_detail(part, data) else: # 1.1.2 详情不在列表中,通过列表url去访问详情 detail_urls = self.extract(body, data['list_rules'], {'data': data}) #logger.debug('detail_urls: %s' % detail_urls) for item in detail_urls: item_data = { "url": item, 'kind': KIND_DETAIL_URL, 'seed_id': data['seed_id'], 'rule_id': data['rule_id'], #'fresh_pages': '', #'list_rules': '', #'next_url_rules': '', 'site_config': data['site_config'], 'detail_rules': data['detail_rules'], 'detail_exclude': data['detail_exclude'], 'detail_multi': data['detail_multi'], 'detail_multi_unique': data['detail_multi_unique'], 'detail_fresh_time': data['detail_fresh_time'], 'unique_key': data['unique_key'] } r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data)) # 1.2后找下一页 next_urls = self.extract(body, data["next_url_rules"], {'data': data}) print 'next_urls: %s' % next_urls for item in next_urls: item_data = { "url": item, 'kind': KIND_LIST_URL, 'seed_id': data['seed_id'], 'rule_id': data['rule_id'], 'fresh_pages': data['fresh_pages'] - 1, 'site_config': data['site_config'], 'list_rules': data['list_rules'], 'next_url_rules': data['next_url_rules'], 'detail_rules': data['detail_rules'], 'detail_exclude': data['detail_exclude'], 'detail_multi': data['detail_multi'], 'detail_multi_unique': data['detail_multi_unique'], 'detail_fresh_time': data['detail_fresh_time'], 'unique_key': data['unique_key'] } if item_data['fresh_pages'] > 0: logger.debug('list:%s' % data['url']) r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(item_data)) # 2 如果当前解析的页面是详情页 elif data["kind"] == KIND_DETAIL_URL: logger.debug('detail:%s' % data['url']) # 如果没有多项详情,则只是单项 self.get_detail(body, data) if __name__ == '__main__': my_extractor = Extractor() my_extractor.run() ================================================ FILE: bin/processor.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals 
class Processor():
    """Worker that pops extracted records from the processor queue and hands
    each one to every storage backend configured for the record's seed."""

    def __init__(self):
        # seed_id -> [backend, ...]; built lazily, one DB lookup per seed.
        self.pools = {}

    def get_backends(self, seed_id):
        """Return (and cache) the storage backends configured for `seed_id`.

        Returns [] when the seed does not exist.
        BUGFIX: an unrecognized ``config["kind"]`` used to fall through with
        ``backend`` still bound to the previous iteration's value (appending
        a duplicate) — or raise NameError on the very first config.  Unknown
        kinds are now logged and skipped.
        """
        cache = self.pools.get(seed_id, None)
        if cache:
            return cache
        try:
            seed = Seed.objects.get(pk=seed_id)
        except Seed.DoesNotExist as e:
            logger.exception(e)
            return []
        for config in seed.data:
            if config["kind"] == "mysql":
                backend = MysqlBackend(config)
            elif config["kind"] == "mongodb":
                backend = MongoDBBackend(config)
            elif config["kind"] == "postgres":
                backend = PostgresBackend(config)
            elif config["kind"] == "DjangoModel":
                backend = DjangoModelBackend(config)
            else:
                logger.error('unknown backend kind: %s', config["kind"])
                continue
            my_config = self.pools.get(seed_id, [])
            my_config.append(backend)
            self.pools[seed_id] = my_config
        return self.pools.get(seed_id, [])

    def process(self, data):
        """Fan one record out to all backends of its seed."""
        backends = self.get_backends(data['seed_id'])
        for backend in backends:
            backend.process(data)

    def run(self):
        """Blocking worker loop: brpop a record and process it."""
        r = get_redis()
        if settings.CRAWLER_DEBUG:
            r.delete(settings.CRAWLER_CONFIG["processor"])
        while True:
            try:
                rsp = r.brpop(settings.CRAWLER_CONFIG["processor"])
            except Exception as e:
                logger.exception(e)  # was `print e`; keep the worker alive
                continue
            data = json.loads(rsp[1])
            self.process(data)


if __name__ == '__main__':
    processor = Processor()
    processor.run()
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) os.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings' import django django.setup() import json from cores.models import Seed, IndexRule, DetailRule from cores.constants import KIND_LIST_URL from django.conf import settings import logging logger = logging.getLogger() from datetime import datetime, timedelta import time from cores.util import get_redis class Scheduler(object): def run(self): r = get_redis() while True: now = datetime.now() for item in Seed.objects.filter(status=Seed.STATUS_ENABLE).order_by('-weight'): rules = IndexRule.objects.filter(seed=item, status=IndexRule.STATUS_ENABLE, next_crawl_time__lte=now) for rule in rules: try: detail_rule = DetailRule.objects.get(index_rule=rule) except DetailRule.DoesNotExist as e: print e continue base = { 'url': '', 'kind': KIND_LIST_URL, "seed_id": item.pk, 'rule_id': rule.pk, "fresh_pages": rule.fresh_pages, 'site_config': rule.site.get_config(), 'list_rules': rule.list_rules, 'next_url_rules': rule.next_url_rules, 'detail_rules': detail_rule.data, 'detail_exclude': detail_rule.exclude, 'detail_multi': detail_rule.multi, 'detail_multi_unique': detail_rule.multi_unique, 'detail_fresh_time': detail_rule.fresh_time, 'unique_key': item.data[0]["unique_key"] } for url in rule.url: data = base.copy() data['url'] = url r.lpush(settings.CRAWLER_CONFIG["downloader"], json.dumps(data)) # 更新index_rule rule.next_crawl_time = now + timedelta(seconds=rule.frequency) rule.save() logging.debug(data) #print r.rpop('unicrawler:urls') time.sleep(1) if __name__ == '__main__': scheduler = Scheduler() scheduler.run() ================================================ FILE: configs/__init__.py ================================================ default_app_config = 'configs.apps.ConfigsAppConfig' ================================================ FILE: configs/admin.py ================================================ from django.contrib import admin from 
class Command(BaseCommand):
    """Management command: re-check every candidate proxy once a minute."""
    help = 'check proxies'

    def handle(self, *args, **options):
        # Endless loop: one full sweep, then a minute of rest.
        while True:
            self.check_all_proxies()
            time.sleep(60)

    def check_all_proxies(self):
        """Probe new, previously-good and recently-failed (<3 retries)
        proxies and record speed / status / retry count on each row."""
        querysets = [
            Proxy.objects.filter(status=Proxy.STATUS_NEW),        # 检测新代理
            Proxy.objects.filter(status=Proxy.STATUS_SUCCESS),    # 检测成功代理
            Proxy.objects.filter(status=Proxy.STATUS_FAIL, retry__lt=3),  # 检测失败代理
        ]
        for queryset in querysets:
            for proxy in queryset:
                has_exception, proxy_detected, time_diff = check_proxy(
                    proxy.host, proxy.port)
                if has_exception or not proxy_detected:
                    proxy.status = Proxy.STATUS_FAIL
                    proxy.retry += 1
                else:
                    proxy.status = Proxy.STATUS_SUCCESS
                    proxy.speed = time_diff * 1000
                    proxy.retry = 0
                proxy.save()
max_length=100, verbose_name=b'\xe7\xab\x99\xe7\x82\xb9\xe5\x9f\x9f\xe5\x90\x8d')), ('proxy', models.IntegerField(default=1, verbose_name=b'\xe4\xbb\xa3\xe7\x90\x86', choices=[(1, b'\xe4\xb8\x8d\xe4\xbd\xbf\xe7\x94\xa8\xe4\xbb\xa3\xe7\x90\x86'), (2, b'\xe5\xad\x98\xe5\x82\xa8\xe5\x9c\xa8Mysql\xe6\x95\xb0\xe6\x8d\xae\xe5\xba\x93\xe4\xb8\xad\xe7\x9a\x84\xe4\xbb\xa3\xe7\x90\x86')])), ('browser', models.IntegerField(default=1, verbose_name=b'\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8\xe5\xa3\xb3', choices=[(1, b'\xe4\xb8\x8d\xe4\xbd\xbf\xe7\x94\xa8\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8\xe5\xa3\xb3'), (2, b'\xe6\x99\xae\xe9\x80\x9a\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8')])), ('limit_speed', models.IntegerField(default=100, verbose_name=b'\xe8\xae\xbf\xe9\x97\xae\xe9\x97\xb4\xe9\x9a\x94(\xe6\xaf\xab\xe7\xa7\x92)')), ('status', models.IntegerField(default=1, verbose_name=b'\xe6\x98\xaf\xe5\x90\xa6\xe5\x90\xaf\xe7\x94\xa8', choices=[(1, b'\xe5\x90\xaf\xe7\x94\xa8'), (2, b'\xe7\xa6\x81\xe7\x94\xa8')])), ], options={ 'verbose_name_plural': '1 \u7ad9\u70b9\u914d\u7f6e', }, ), ] ================================================ FILE: configs/migrations/0002_auto_20160201_1627.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import configs.models class Migration(migrations.Migration): dependencies = [ ('configs', '0001_initial'), ] operations = [ migrations.AddField( model_name='proxy', name='create_time', field=models.DateTimeField(default=None, verbose_name=b'\xe5\x88\x9b\xe5\xbb\xba\xe6\x97\xb6\xe9\x97\xb4', auto_now_add=True), preserve_default=False, ), migrations.AddField( model_name='proxy', name='uniqueid', field=models.CharField(default=configs.models.get_default_uniqueid, unique=True, max_length=100, verbose_name=b'url\xe7\x9a\x84md5\xe5\x80\xbc'), ), migrations.AddField( model_name='proxy', name='update_time', field=models.DateTimeField(default=None, 
verbose_name=b'\xe6\x9b\xb4\xe6\x96\xb0\xe6\x97\xb6\xe9\x97\xb4', auto_now=True), preserve_default=False, ), ] ================================================ FILE: configs/migrations/0003_proxy_url.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations class Migration(migrations.Migration): dependencies = [ ('configs', '0002_auto_20160201_1627'), ] operations = [ migrations.AddField( model_name='proxy', name='url', field=models.CharField(default=b'', max_length=500, verbose_name=b'\xe6\x96\x87\xe7\xab\xa0\xe7\x9a\x84url'), ), ] ================================================ FILE: configs/migrations/0004_auto_20160202_1712.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import configs.models class Migration(migrations.Migration): dependencies = [ ('configs', '0003_proxy_url'), ] operations = [ migrations.AlterField( model_name='proxy', name='uniqueid', field=models.CharField(default=configs.models.get_default_uniqueid, unique=True, max_length=100, verbose_name=b'\xe4\xbb\xa3\xe7\x90\x86\xe5\x8f\x82\xe6\x95\xb0\xe7\x9a\x84md5\xe5\x80\xbc'), ), migrations.AlterField( model_name='proxy', name='url', field=models.CharField(default=b'', max_length=500, verbose_name=b'url'), ), ] ================================================ FILE: configs/migrations/__init__.py ================================================ ================================================ FILE: configs/models.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' import time from django.db import models def get_default_uniqueid(): return str(long(time.time() * 1000000)) class Site(models.Model): STATUS_ENABLE = 1 STATUS_DISABLE = 2 STATUS_CHOICES = ( (STATUS_ENABLE, '启用'), (STATUS_DISABLE, '禁用') ) PROXY_NONE = 1 PROXY_MYSQL = 2 PROXY_CHOICES = 
( (PROXY_NONE, '不使用代理'), (PROXY_MYSQL, '存储在Mysql数据库中的代理') ) BROWSER_NONE = 1 BROWSER_NORMAL = 2 BROWSER_CHOICES = ( (BROWSER_NONE, '不使用浏览器壳'), (BROWSER_NORMAL, '普通浏览器') ) name = models.CharField(max_length=100, verbose_name='站点名称') domain = models.CharField(unique=True, max_length=100, verbose_name='站点域名') proxy = models.IntegerField(default=PROXY_NONE, choices=PROXY_CHOICES, verbose_name='代理') browser = models.IntegerField(default=BROWSER_NONE, choices=BROWSER_CHOICES, verbose_name='浏览器壳') limit_speed = models.IntegerField(default=100, verbose_name='访问间隔(毫秒)') status = models.IntegerField(default=STATUS_ENABLE, choices=STATUS_CHOICES, verbose_name="是否启用") def get_config(self): if self.status == self.STATUS_ENABLE: return { 'domain': self.domain, 'proxy': self.proxy, 'browser': self.browser, 'limit_speed': self.limit_speed } else: return { 'domain': self.domain, 'proxy': self.PROXY_NONE, 'browser': self.BROWSER_NONE, 'limit_speed': 0 } def __unicode__(self): return self.name class Meta: verbose_name_plural = "1 站点配置" class Proxy(models.Model): TYPE_TRANSPARENT = 0 TYPE_ANONYMOUS = 1 TYPE_CHOICES = ( (TYPE_TRANSPARENT, '透明代理'), (TYPE_ANONYMOUS, '高度匿名'), ) STATUS_NEW = 0 STATUS_SUCCESS = 1 STATUS_FAIL = 2 STATUS_CHOICES = ( (STATUS_NEW,'未检测'), (STATUS_SUCCESS,'检测成功'), (STATUS_FAIL,'检测失败'), ) uniqueid = models.CharField(unique=True, max_length=100, default=get_default_uniqueid, verbose_name='代理参数的md5值') url = models.CharField(max_length=500, default='', verbose_name='url') kind = models.IntegerField(default=TYPE_ANONYMOUS, choices=TYPE_CHOICES, verbose_name="代理类型") user = models.CharField(default='', blank=True, max_length=100) password = models.CharField(default='', blank=True, max_length=100) host = models.CharField(max_length=100) port = models.IntegerField(default=80) address = models.CharField(default='', blank=True, max_length=100, verbose_name="地理位置") speed = models.IntegerField(default=0, verbose_name="连接速度(ms)") status = 
class MysqlProxyBackend(object):
    """Picks one random verified proxy row from MySQL and exposes its
    connection fields; falls back to empty fields when none is available."""

    def __init__(self):
        # BUGFIX: the filter used Proxy.KIND_DOWNLOAD, an attribute that does
        # not exist on the Proxy model (its `kind` constants are
        # TYPE_TRANSPARENT / TYPE_ANONYMOUS), so every instantiation raised
        # AttributeError.  Select on status only: any verified proxy is usable.
        proxy = Proxy.objects.filter(
            status=Proxy.STATUS_SUCCESS).order_by('?').first()
        if proxy:
            self.user = proxy.user
            self.password = proxy.password
            self.host = proxy.host
            self.port = proxy.port
        else:
            self.user, self.password, self.host, self.port = '', '', '', ''

    def is_valid(self):
        """Truthy when both host and port are set."""
        return self.host and self.port

    def __str__(self):
        # Also interpolated into the downloader's redis rate-limit key.
        return ':'.join([str(self.user), str(self.password),
                         str(self.host), str(self.port)])
urllib2.urlopen(req, timeout=socket_timeout) time_end = time.time() detected_pip = conn.read() conn.close() # Calculate request time time_diff = time_end - time_start # Check if proxy is detected if detected_pip == real_pip: proxy_detected = False else: proxy_detected = True # Catch exceptions except urllib2.HTTPError, e: print "ERROR: Code ", e.code return (True, False, 999) except Exception, detail: print "ERROR: ", detail return (True, False, 999) # Return False if no exceptions, proxy_detected=True if proxy detected return (False, proxy_detected, time_diff) ================================================ FILE: cores/__init__.py ================================================ default_app_config = 'cores.apps.CoresAppConfig' ================================================ FILE: cores/admin.py ================================================ from django.contrib import admin from .models import Seed, IndexRule, DetailRule class SeedAdmin(admin.ModelAdmin): list_display = ('id', 'name', 'desc', 'weight', 'status') list_filter = ['status'] admin.site.register(Seed, SeedAdmin) class IndexRuleAdmin(admin.ModelAdmin): list_display = ('id', 'seed', 'name', 'site', 'url', 'frequency', 'update_time', 'next_crawl_time', 'fresh_pages', 'status') list_filter = ['status', 'update_time', 'next_crawl_time'] admin.site.register(IndexRule, IndexRuleAdmin) class DetailRuleAdmin(admin.ModelAdmin): list_display = ['index_rule'] admin.site.register(DetailRule, DetailRuleAdmin) ================================================ FILE: cores/apps.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' from django.apps import AppConfig class CoresAppConfig(AppConfig): name = 'cores' verbose_name = u'1 爬虫' ================================================ FILE: cores/constants.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' KIND_LIST_URL = 0 KIND_DETAIL_URL = 1 
class RequestsDownloaderBackend(object):
    """Plain HTTP downloader built on requests (no browser shell)."""

    # Pool of User-Agent headers; one is sampled per request.
    headers = [
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
        }
    ]

    def __init__(self, proxy=None):
        self.proxy = proxy

    def format_proxies(self):
        """Build a requests-style proxies mapping from self.proxy, or None.

        BUGFIX: the mapping now covers both http and https — previously only
        the "http" key was set, so https URLs silently bypassed the proxy.
        """
        p = self.proxy
        if not p:
            return None
        if p.user:
            data = 'http://%s:%s@%s:%s' % (p.user, p.password, p.host, p.port)
        else:
            data = 'http://%s:%s' % (p.host, p.port)
        return {"http": data, "https": data}

    def download(self, url):
        """Fetch `url` and return the decoded body.

        `url` is either a plain URL string, or a dict with keys
        url/method/data/dataType; dataType == 'json' returns parsed JSON.
        Raises ValueError on an unsupported method (BUGFIX: previously an
        unknown method crashed later with an UnboundLocalError on `rsp`).
        """
        header = sample(self.headers, 1)[0]
        proxies = self.format_proxies()
        if isinstance(url, basestring):
            rsp = requests.get(url, headers=header, proxies=proxies)
            rsp.close()
            # apparent_encoding sniffs the body; sites often mis-declare charset
            rsp.encoding = rsp.apparent_encoding
            return rsp.text
        elif isinstance(url, dict):
            link, method, data, data_type = (url.get('url'), url.get('method'),
                                             url.get('data'), url.get('dataType'))
            if method == 'GET':
                rsp = requests.get(link, params=data, headers=header, proxies=proxies)
            elif method == 'POST':
                rsp = requests.post(link, data=data, headers=header, proxies=proxies)
            else:
                raise ValueError('unsupported method: %s' % method)
            rsp.close()
            rsp.encoding = rsp.apparent_encoding
            if data_type == 'json':
                return rsp.json()
            else:
                return rsp.text
__enter__(self): # 打开界面 self.display = self.get_display() # 打开浏览器 self.browser = self.get_browser(self.proxy) return self def __exit__(self, exc_type, exc_val, exc_tb): # 关闭浏览器 try: if self.browser: self.browser.delete_all_cookies() self.browser.quit() except Exception as e: logging.exception(e) # 关闭界面 try: # 关闭浏览器,关闭窗口 self.display and self.display.stop() except Exception as e: logging.exception(e) def get_display(self): if platform.system() != 'Darwin': # 不是mac系统, 启动窗口 display = Display(visible=0, size=(1024, 768)) display.start() else: display = None return display def get_browser(self, proxy): # 启动浏览器 # 禁止加载image firefox_profile = webdriver.FirefoxProfile() #firefox_profile.set_preference('permissions.default.stylesheet', 2) #firefox_profile.set_preference('permissions.default.image', 2) #firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false') # 代理 if proxy and proxy.is_valid(): myProxy = '%s:%s' % (proxy.host, proxy.port) ff_proxy = Proxy({ 'proxyType': ProxyType.MANUAL, 'httpProxy': myProxy, 'ftpProxy': myProxy, 'sslProxy': myProxy, 'noProxy':''}) browser = webdriver.Firefox(firefox_profile=firefox_profile, proxy=ff_proxy) else: browser = webdriver.Firefox(firefox_profile=firefox_profile) return browser def download(self, url): browser = self.browser # 访问首页, 输入wchatid, 点击查询 browser.get(url) time.sleep(3) js = """ return document.documentElement.innerHTML; """ body = browser.execute_script(js) return body class BrowserDownloaderBackend(object): def download(self): pass ================================================ FILE: cores/extractors.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' from abc import ABCMeta from abc import abstractmethod import requests import oss2 from oss2.exceptions import NotFound from copy import copy from hashlib import md5 from lxml import etree from io import StringIO from django.conf import settings import logging logger = logging.getLogger() 
OSS2_CONF = settings.OSS2_CONFIG BUCKET = None def get_bucket(): global BUCKET if not BUCKET: auth = oss2.Auth(OSS2_CONF['ACCESS_KEY_ID'], OSS2_CONF['ACCESS_KEY_SECRET']) BUCKET = oss2.Bucket(auth, 'http://%s' % OSS2_CONF['BUCKET_DOMAIN'], OSS2_CONF['BUCKET_NAME']) return BUCKET def download_to_oss(url, path, timeout=3600): r = requests.get(url, timeout=timeout) r.close() key = path + md5(r.content).hexdigest() bucket = get_bucket() try: bucket.head_object(key) except NotFound as e: logging.exception(e) bucket.put_object(key, r, headers={'Content-Type': r.headers.get('Content-Type', '')}) return 'http://%s/%s' % (OSS2_CONF["CDN_DOMAIN"], key) class BaseExtractor(object): __metaclass__ = ABCMeta @abstractmethod def __init__(self): pass @abstractmethod def extract(self): pass class ImageExtractor(BaseExtractor): def __init__(self, data): """ data 是图片url,或者图片url的列表,或者包含img标签的内容 :param data: :return: 如果是url,返回新的url; 如果是列表,返回新的url列表 """ self.data = data def extract(self): d = self.data res = None if not d: return d elif isinstance(d, basestring): if d.startswith('http'): ## 内容是图片地址 res = download_to_oss(d, OSS2_CONF["IMAGES_PATH"], timeout=120) else: ## 内容是包含图片的文字 htmlparser = etree.HTMLParser() tree = etree.parse(StringIO(d), htmlparser) # 找出所有图片src srcs = tree.xpath("//img[starts-with(@src,'http')]/@src") data_srcs = tree.xpath("//img[starts-with(@data-src,'http')]/@data-src") srcs = list(set(srcs + data_srcs)) # 下载并传到OSS中 new_srcs = [download_to_oss(item, OSS2_CONF["IMAGES_PATH"], timeout=120) for item in srcs] # 替换掉原文中的图片src res = self.replace_all(d, srcs, new_srcs) elif isinstance(d, list): res = [download_to_oss(item, OSS2_CONF["IMAGES_PATH"], timeout=120) for item in d] return res def replace_all(self, content, srcs, new_srcs): """ 将content中的srcs全部替换成new_srcs """ replaces = zip(srcs, new_srcs) for src, new_src in replaces: content = content.replace(src.split('?')[0], new_src) return content class VideoExtractor(BaseExtractor): def __init__(self, data): """ data 
是视频url,或者视频url的列表 :param data: :return: 如果是url,返回新的url; 如果是列表,返回新的url列表 """ self.data = data def extract(self): d = self.data new_url = None if not d: return d elif isinstance(d, basestring): new_url = download_to_oss(d, OSS2_CONF["VIDEOS_PATH"]) elif isinstance(d, list): new_url = [download_to_oss(item, OSS2_CONF["VIDEOS_PATH"]) for item in d] return new_url class XPathExtractor(BaseExtractor): def __init__(self, content, rule): htmlparser = etree.HTMLParser() self.tree = etree.parse(StringIO(content), htmlparser) self.rule = rule def extract(self): return self.tree.xpath(self.rule) class PythonExtractor(BaseExtractor): def __init__(self, code, in_val, context): self.code = code self.in_val = in_val self.context = copy(context) self.context.update({'in_val': in_val}) def extract(self): res = self.in_val g, l = {}, self.context try: exec(self.code, g, l) res = l["out_val"] except Exception as e: logger.exception(e) finally: return res ================================================ FILE: cores/migrations/0001_initial.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import jsonfield.fields class Migration(migrations.Migration): dependencies = [ ] operations = [ migrations.CreateModel( name='DetailRule', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('data', jsonfield.fields.JSONField(verbose_name=b'\xe8\xaf\xa6\xe6\x83\x85\xe9\xa1\xb5\xe8\xa7\x84\xe5\x88\x99')), ], options={ 'verbose_name_plural': '4 \u8be6\u60c5\u9875\u722c\u53d6\u89c4\u5219', }, ), migrations.CreateModel( name='IndexRule', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('name', models.CharField(max_length=100, verbose_name=b'\xe6\x9d\xa5\xe6\xba\x90')), ('url', jsonfield.fields.JSONField(verbose_name=b'\xe7\xb4\xa2\xe5\xbc\x95url\xe5\x88\x97\xe8\xa1\xa8')), 
('list_rules', jsonfield.fields.JSONField(verbose_name=b'\xe8\x8e\xb7\xe5\x8f\x96\xe5\x88\x97\xe8\xa1\xa8\xe9\xa1\xb9\xe7\x9a\x84\xe8\xa7\x84\xe5\x88\x99')), ('next_url_rules', jsonfield.fields.JSONField(default=[], verbose_name=b'\xe4\xb8\x8b\xe4\xb8\x80\xe9\xa1\xb5\xe7\xb4\xa2\xe5\xbc\x95\xe7\x9a\x84\xe8\xa7\x84\xe5\x88\x99\xe5\x88\x97\xe8\xa1\xa8', blank=True)), ('frequency', models.IntegerField(default=60, verbose_name=b'\xe7\x88\xac\xe5\x8f\x96\xe9\xa2\x91\xe7\x8e\x87,\xe5\x8d\x95\xe4\xbd\x8d\xe7\xa7\x92')), ('update_time', models.DateTimeField(auto_now=True, verbose_name=b'\xe6\x9b\xb4\xe6\x96\xb0\xe6\x97\xb6\xe9\x97\xb4')), ('next_crawl_time', models.DateTimeField(verbose_name=b'\xe4\xb8\x8b\xe6\xac\xa1\xe7\x88\xac\xe5\x8f\x96\xe6\x97\xb6\xe9\x97\xb4')), ('fresh_pages', models.IntegerField(default=2, verbose_name=b'\xe7\x88\xac\xe5\x8f\x96\xe9\xa1\xb5\xe9\x9d\xa2\xe6\x95\xb0')), ('status', models.IntegerField(default=1, verbose_name=b'\xe6\x98\xaf\xe5\x90\xa6\xe5\x90\xaf\xe7\x94\xa8', choices=[(1, b'\xe5\x90\xaf\xe7\x94\xa8'), (2, b'\xe7\xa6\x81\xe7\x94\xa8')])), ], options={ 'verbose_name_plural': '3 \u7d22\u5f15\u548c\u5217\u8868\u89c4\u5219', }, ), migrations.CreateModel( name='Proxy', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('kind', models.IntegerField(default=1, verbose_name=b'\xe4\xbb\xa3\xe7\x90\x86\xe7\xb1\xbb\xe5\x9e\x8b', choices=[(0, b'\xe9\x80\x8f\xe6\x98\x8e\xe4\xbb\xa3\xe7\x90\x86'), (1, b'\xe9\xab\x98\xe5\xba\xa6\xe5\x8c\xbf\xe5\x90\x8d')])), ('user', models.CharField(default=b'', max_length=100, blank=True)), ('password', models.CharField(default=b'', max_length=100, blank=True)), ('host', models.CharField(max_length=100)), ('port', models.IntegerField(default=80)), ('address', models.CharField(default=b'', max_length=100, verbose_name=b'\xe5\x9c\xb0\xe7\x90\x86\xe4\xbd\x8d\xe7\xbd\xae', blank=True)), ('speed', models.IntegerField(default=0, 
verbose_name=b'\xe8\xbf\x9e\xe6\x8e\xa5\xe9\x80\x9f\xe5\xba\xa6(ms)')), ('status', models.IntegerField(default=0, verbose_name=b'\xe7\x8a\xb6\xe6\x80\x81', choices=[(0, b'\xe6\x9c\xaa\xe6\xa3\x80\xe6\xb5\x8b'), (1, b'\xe6\xa3\x80\xe6\xb5\x8b\xe6\x88\x90\xe5\x8a\x9f'), (2, b'\xe6\xa3\x80\xe6\xb5\x8b\xe5\xa4\xb1\xe8\xb4\xa5')])), ('retry', models.IntegerField(default=0, verbose_name=b'\xe5\xb0\x9d\xe8\xaf\x95\xe6\xac\xa1\xe6\x95\xb0')), ], options={ 'verbose_name_plural': '5 \u8bbf\u95ee\u4ee3\u7406', }, ), migrations.CreateModel( name='Seed', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('name', models.CharField(max_length=100, verbose_name=b'\xe6\xa8\xa1\xe6\x9d\xbf\xe5\x90\x8d\xe7\xa7\xb0')), ('desc', models.TextField(verbose_name=b'\xe7\xae\x80\xe4\xbb\x8b')), ('data', jsonfield.fields.JSONField(default=[], verbose_name=b'\xe5\xad\x98\xe5\x82\xa8\xe6\x95\xb0\xe6\x8d\xae\xe9\x85\x8d\xe7\xbd\xae', blank=True)), ('weight', models.IntegerField(default=0, verbose_name=b'\xe6\x9d\x83\xe9\x87\x8d')), ('status', models.IntegerField(default=1, verbose_name=b'\xe6\x98\xaf\xe5\x90\xa6\xe5\x90\xaf\xe7\x94\xa8', choices=[(1, b'\xe5\x90\xaf\xe7\x94\xa8'), (2, b'\xe7\xa6\x81\xe7\x94\xa8')])), ], options={ 'verbose_name_plural': '1 \u79cd\u5b50', }, ), migrations.CreateModel( name='Site', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('name', models.CharField(max_length=100, verbose_name=b'\xe7\xab\x99\xe7\x82\xb9\xe5\x90\x8d\xe7\xa7\xb0')), ('domain', models.CharField(unique=True, max_length=100, verbose_name=b'\xe7\xab\x99\xe7\x82\xb9\xe5\x9f\x9f\xe5\x90\x8d')), ('proxy', models.IntegerField(default=1, verbose_name=b'\xe4\xbb\xa3\xe7\x90\x86', choices=[(1, b'\xe4\xb8\x8d\xe4\xbd\xbf\xe7\x94\xa8\xe4\xbb\xa3\xe7\x90\x86'), (2, 
b'\xe5\xad\x98\xe5\x82\xa8\xe5\x9c\xa8Mysql\xe6\x95\xb0\xe6\x8d\xae\xe5\xba\x93\xe4\xb8\xad\xe7\x9a\x84\xe4\xbb\xa3\xe7\x90\x86')])), ('browser', models.IntegerField(default=1, verbose_name=b'\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8\xe5\xa3\xb3', choices=[(1, b'\xe4\xb8\x8d\xe4\xbd\xbf\xe7\x94\xa8\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8\xe5\xa3\xb3'), (2, b'\xe6\x99\xae\xe9\x80\x9a\xe6\xb5\x8f\xe8\xa7\x88\xe5\x99\xa8')])), ('limit_speed', models.IntegerField(default=100, verbose_name=b'\xe8\xae\xbf\xe9\x97\xae\xe9\x97\xb4\xe9\x9a\x94(\xe6\xaf\xab\xe7\xa7\x92)')), ('status', models.IntegerField(default=1, verbose_name=b'\xe6\x98\xaf\xe5\x90\xa6\xe5\x90\xaf\xe7\x94\xa8', choices=[(1, b'\xe5\x90\xaf\xe7\x94\xa8'), (2, b'\xe7\xa6\x81\xe7\x94\xa8')])), ], options={ 'verbose_name_plural': '2 \u7ad9\u70b9\u914d\u7f6e', }, ), migrations.AddField( model_name='indexrule', name='seed', field=models.ForeignKey(to='cores.Seed'), ), migrations.AddField( model_name='indexrule', name='site', field=models.ForeignKey(to='cores.Site'), ), migrations.AddField( model_name='detailrule', name='index_rule', field=models.ForeignKey(to='cores.IndexRule'), ), ] ================================================ FILE: cores/migrations/0002_detailrule_exclude.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import jsonfield.fields class Migration(migrations.Migration): dependencies = [ ('cores', '0001_initial'), ] operations = [ migrations.AddField( model_name='detailrule', name='exclude', field=jsonfield.fields.JSONField(verbose_name=b'\xe6\x8e\x92\xe9\x99\xa4\xe8\xa7\x84\xe5\x88\x99', blank=True), ), ] ================================================ FILE: cores/migrations/0003_auto_20160131_2226.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import jsonfield.fields class 
Migration(migrations.Migration): dependencies = [ ('cores', '0002_detailrule_exclude'), ] operations = [ migrations.AlterField( model_name='detailrule', name='exclude', field=jsonfield.fields.JSONField(default=[], verbose_name=b'\xe6\x8e\x92\xe9\x99\xa4\xe8\xa7\x84\xe5\x88\x99', blank=True), ), ] ================================================ FILE: cores/migrations/0004_auto_20160201_1035.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations class Migration(migrations.Migration): dependencies = [ ('cores', '0003_auto_20160131_2226'), ] operations = [ migrations.DeleteModel( name='Proxy', ), migrations.AlterModelOptions( name='detailrule', options={'verbose_name_plural': '3 \u8be6\u60c5\u9875\u722c\u53d6\u89c4\u5219'}, ), migrations.AlterModelOptions( name='indexrule', options={'verbose_name_plural': '2 \u7d22\u5f15\u548c\u5217\u8868\u89c4\u5219'}, ), migrations.AlterField( model_name='indexrule', name='site', field=models.ForeignKey(to='configs.Site'), ), migrations.DeleteModel( name='Site', ), ] ================================================ FILE: cores/migrations/0005_detailrule_multi.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import jsonfield.fields class Migration(migrations.Migration): dependencies = [ ('cores', '0004_auto_20160201_1035'), ] operations = [ migrations.AddField( model_name='detailrule', name='multi', field=jsonfield.fields.JSONField(verbose_name=b'\xe5\xa4\x9a\xe8\xaf\xa6\xe6\x83\x85\xe8\xa7\x84\xe5\x88\x99', blank=True), ), ] ================================================ FILE: cores/migrations/0006_detailrule_fresh_time.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations class Migration(migrations.Migration): dependencies 
= [ ('cores', '0005_detailrule_multi'), ] operations = [ migrations.AddField( model_name='detailrule', name='fresh_time', field=models.IntegerField(default=2592000, verbose_name=b'\xe6\x96\xb0\xe9\xb2\x9c\xe5\xba\xa6\xe7\xbb\xb4\xe6\x8c\x81\xe6\x97\xb6\xe9\x97\xb4(\xe7\xa7\x92),\xe9\xbb\x98\xe8\xae\xa4\xe4\xb8\x80\xe4\xb8\xaa\xe6\x9c\x88'), ), ] ================================================ FILE: cores/migrations/0007_detailrule_multi_unique.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import jsonfield.fields class Migration(migrations.Migration): dependencies = [ ('cores', '0006_detailrule_fresh_time'), ] operations = [ migrations.AddField( model_name='detailrule', name='multi_unique', field=jsonfield.fields.JSONField(verbose_name=b'\xe5\xa4\x9a\xe8\xaf\xa6\xe6\x83\x85\xe5\x94\xaf\xe4\xb8\x80\xe9\x94\xae\xe8\xa7\x84\xe5\x88\x99', blank=True), ), ] ================================================ FILE: cores/migrations/0008_auto_20160407_1426.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations import jsonfield.fields class Migration(migrations.Migration): dependencies = [ ('cores', '0007_detailrule_multi_unique'), ] operations = [ migrations.AlterField( model_name='detailrule', name='multi', field=jsonfield.fields.JSONField(default=[], verbose_name=b'\xe5\xa4\x9a\xe8\xaf\xa6\xe6\x83\x85\xe8\xa7\x84\xe5\x88\x99', blank=True), ), migrations.AlterField( model_name='detailrule', name='multi_unique', field=jsonfield.fields.JSONField(default=[], verbose_name=b'\xe5\xa4\x9a\xe8\xaf\xa6\xe6\x83\x85\xe5\x94\xaf\xe4\xb8\x80\xe9\x94\xae\xe8\xa7\x84\xe5\x88\x99', blank=True), ), migrations.AlterField( model_name='indexrule', name='list_rules', field=jsonfield.fields.JSONField(default=[], 
verbose_name=b'\xe8\x8e\xb7\xe5\x8f\x96\xe5\x88\x97\xe8\xa1\xa8\xe9\xa1\xb9\xe7\x9a\x84\xe8\xa7\x84\xe5\x88\x99', blank=True), ), ] ================================================ FILE: cores/migrations/__init__.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' ================================================ FILE: cores/models.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' import collections from django.db import models from jsonfield import JSONField from configs.models import Site class Seed(models.Model): STATUS_ENABLE = 1 STATUS_DISABLE = 2 STATUS_CHOICES = ( (STATUS_ENABLE, '启用'), (STATUS_DISABLE, '禁用') ) name = models.CharField(max_length=100, verbose_name='模板名称') desc = models.TextField(verbose_name='简介') data = JSONField(verbose_name='存储数据配置', load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[]) weight = models.IntegerField(default=0, verbose_name='权重') status = models.IntegerField(default=STATUS_ENABLE, choices=STATUS_CHOICES, verbose_name="是否启用") def __unicode__(self): return self.name class Meta: verbose_name_plural = "1 种子" class IndexRule(models.Model): STATUS_ENABLE = 1 STATUS_DISABLE = 2 STATUS_CHOICES = ( (STATUS_ENABLE, '启用'), (STATUS_DISABLE, '禁用') ) seed = models.ForeignKey(Seed) name = models.CharField(max_length=100, verbose_name='来源') site = models.ForeignKey(Site) url = JSONField(verbose_name='索引url列表', load_kwargs={'object_pairs_hook': collections.OrderedDict}) list_rules = JSONField(verbose_name='获取列表项的规则', load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[]) next_url_rules = JSONField(verbose_name='下一页索引的规则列表', load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[]) frequency = models.IntegerField(default=60, verbose_name='爬取频率,单位秒') update_time = models.DateTimeField(auto_now=True, verbose_name='更新时间') next_crawl_time = 
models.DateTimeField(verbose_name='下次爬取时间') fresh_pages = models.IntegerField(default=2, verbose_name='爬取页面数') status = models.IntegerField(default=STATUS_ENABLE, choices=STATUS_CHOICES, verbose_name="是否启用") def __unicode__(self): return self.name class Meta: verbose_name_plural = "2 索引和列表规则" class DetailRule(models.Model): index_rule = models.ForeignKey(IndexRule) data = JSONField(verbose_name='详情页规则', load_kwargs={'object_pairs_hook': collections.OrderedDict}) exclude = JSONField(verbose_name='排除规则', load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[]) multi = JSONField(verbose_name='多详情规则', load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[]) multi_unique = JSONField(verbose_name='多详情唯一键规则', load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[]) fresh_time = models.IntegerField(default=2592000, verbose_name='新鲜度维持时间(秒),默认一个月') def __unicode__(self): return '%s, %s' % (self.index_rule.name, self.index_rule.url) class Meta: verbose_name_plural = "3 详情页爬取规则" ================================================ FILE: cores/processors.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' from abc import ABCMeta from abc import abstractmethod import _mysql import torndb from datetime import datetime from django.utils.encoding import smart_str, smart_unicode from django.db import models from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from cores.util import get_uniqueid import logging logger = logging.getLogger() class BaseProcessorBackend(object): __metaclass__ = ABCMeta @abstractmethod def __init__(self): pass @abstractmethod def process(self, data): pass class MysqlBackend(BaseProcessorBackend): @property def _table(self): return self.db_table def __init__(self, config): db_config = config['database'] self.db = torndb.Connection( host=db_config.get("host"), database=db_config.get("name"), 
user=db_config.get("user"), password=db_config.get("password"), charset=db_config.get("charset") ) self.db_table = config['table'] self.defaults = config['defaults'] self.unique_key = config["unique_key"] def process(self, params, filters=None): # 加上默认值 data = params.copy() for k, v in self.defaults.iteritems(): data.setdefault(k, v) # 设置唯一键 unique_value = ':'.join(['%s' % data[k] for k in self.unique_key]) data['uniqueid'] = get_uniqueid(unique_value) data['update_time'] = str(datetime.now()) # 清除数据 data.pop('seed_id', None) data.pop('rule_id', None) data.pop('detail_multi', None) # 更新或插入数据库 #print data try: # try update affected = self.update(data, {'uniqueid': data['uniqueid']}) if affected == 0: # row not exists, try create data['create_time'] = str(datetime.now()) self.create(data) except Exception as e: logger.exception(e) finally: logger.debug(data['url']) def create(self, params): keys = params.keys() values = params.values() cols = ','.join(map(lambda s:str(s), keys)) placeholder = ','.join(['%s' for _ in range(len(keys))]) sql = 'INSERT INTO ' + self._table + ' (' + cols + ') ' + ' VALUES (' + placeholder + ');' return self.db.insert(sql, *values) def update(self, params, filters=None): set_keys = params.keys() values = params.values() set_placeholder = ', '.join(['`'+item+'`=%s' for item in set_keys]) sql = 'UPDATE ' + self._table + ' SET ' + set_placeholder if filters: where_keys = filters.keys() where_values = filters.values() where_placeholder = ', '.join(['`'+item+'`=%s' for item in where_keys]) sql = sql + ' WHERE ' + where_placeholder values += where_values return self.db.update(sql, *values) @staticmethod def dict_to_sql(params, sep=', '): cols = [] for k, v in params.iteritems(): k2 = _mysql.escape_string(str(k)) if v is None: col = '`%s`=NULL' % k2 elif isinstance(v, (int, long, float)): col = '`%s`=%s' % (k2, v) elif isinstance(v, unicode): v2 = v.encode('utf-8') col = '`%s`="%s"' % (k2, smart_unicode(_mysql.escape_string(smart_str(v)))) else: 
col = '`%s`="%s"' % (k2, v) cols.append(col) return smart_unicode(sep.join(cols)) @staticmethod def fields_to_sql(fields): f2 = ["`%s`" % item if item != "*" else "*" for item in fields] return _mysql.escape_string(', '.join(f2)) class DjangoModelBackend(BaseProcessorBackend): def __init__(self, config): self.defaults = config['defaults'] self.unique_key = config["unique_key"] modelstr = config["DjangoModel"] modelclass = models.get_model(modelstr.split('.')[0], modelstr.split('.')[-1]) self._class = modelclass def process(self, params): C = self._class params['uniqueid'] = get_uniqueid('%s:%s' % (params['wechat_id'], params['title'])) # 加上默认值 data = params.copy() for k, v in self.defaults.iteritems(): data.setdefault(k, v) # 设置唯一键 unique_value = ':'.join(['%s' % data[k] for k in self.unique_key]) data['uniqueid'] = get_uniqueid(unique_value) data['update_time'] = str(datetime.now()) # 清除数据 data.pop('seed_id', None) data.pop('rule_id', None) data.pop('detail_multi', None) # 更新或插入数据库 try: C.objects.update_or_create(uniqueid=data['uniqueid'], defaults=data) except Exception as e: logger.exception(e) finally: logger.debug(data['url']) class MongoDBBackend(BaseProcessorBackend): pass class PostgresBackend(BaseProcessorBackend): @property def _table(self): return self.db_table def __init__(self, config): db_config = config['database'] conn_url = "postgresql://%s:%s@%s/%s" % ( db_config.get("user"), db_config.get("password"), db_config.get("host"), db_config.get("name") ) self.engine = create_engine(conn_url) self.db_table = config['table'] self.defaults = config['defaults'] self.unique_key = config["unique_key"] def process(self, params, filters=None): # 加上默认值 data = params.copy() for k, v in self.defaults.iteritems(): data.setdefault(k, v) # 设置唯一键 unique_value = ':'.join(['%s' % data[k] for k in self.unique_key]) data['uniqueid'] = get_uniqueid(unique_value) data['update_time'] = str(datetime.now()) # 清除数据 data.pop('seed_id', None) data.pop('rule_id', None) 
data.pop('detail_multi', None) # 更新或插入数据库 #print data try: # try update affected = self.update(data, {'uniqueid': data['uniqueid']}) if affected == 0: # row not exists, try create data['create_time'] = str(datetime.now()) self.create(data) except Exception as e: logger.exception(e) finally: logger.debug(data['url']) def create(self, params): keys = params.keys() values = params.values() cols = ','.join(map(lambda s:str(s), keys)) placeholder = ','.join(['%s' for _ in range(len(keys))]) sql = 'INSERT INTO ' + self._table + ' (' + cols + ') ' + ' VALUES (' + placeholder + ');' with self.engine.connect() as con: res = con.execute(sql, *values) def update(self, params, filters=None): set_keys = params.keys() values = params.values() set_placeholder = ', '.join([item+'=%s' for item in set_keys]) sql = 'UPDATE ' + self._table + ' SET ' + set_placeholder if filters: where_keys = filters.keys() where_values = filters.values() where_placeholder = ', '.join([item+'=%s' for item in where_keys]) sql = sql + ' WHERE ' + where_placeholder values += where_values rowcount = 0 with self.engine.connect() as con: res = con.execute(sql, *values) rowcount = res.rowcount return rowcount @staticmethod def dict_to_sql(params, sep=', '): cols = [] for k, v in params.iteritems(): k2 = _mysql.escape_string(str(k)) if v is None: col = '%s=NULL' % k2 elif isinstance(v, (int, long, float)): col = '%s=%s' % (k2, v) elif isinstance(v, unicode): v2 = v.encode('utf-8') col = '%s="%s"' % (k2, smart_unicode(_mysql.escape_string(smart_str(v)))) else: col = '%s="%s"' % (k2, v) cols.append(col) return smart_unicode(sep.join(cols)) @staticmethod def fields_to_sql(fields): f2 = ["%s" % item if item != "*" else "*" for item in fields] return _mysql.escape_string(', '.join(f2)) ================================================ FILE: cores/util.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' import redis import json from django.conf import settings from 
hashlib import md5 REDIS_POOL = None def get_redis_pool(): global REDIS_POOL if not REDIS_POOL: REDIS_POOL = redis.ConnectionPool(**settings.REDIS_OPTIONS) return REDIS_POOL def get_redis(): return redis.Redis(connection_pool=get_redis_pool()) def get_uniqueid(url): link = get_link_from_url(url) return md5(link).hexdigest() def get_link_from_url(url): if isinstance(url, basestring): return url elif isinstance(url, dict): return json.dumps(url) ================================================ FILE: crontab ================================================ # 监控服务队列积压数 */1 * * * * cd /var/www/pythonzone/unicrawler; python ./manage.py monitor service >> /var/log/pythonzone/monitor_service.log 2>&1 ================================================ FILE: manage.py ================================================ #!/usr/bin/env python import os import sys if __name__ == "__main__": os.environ.setdefault("DJANGO_SETTINGS_MODULE", "unicrawler.settings") from django.core.management import execute_from_command_line execute_from_command_line(sys.argv) ================================================ FILE: monitors/__init__.py ================================================ default_app_config = 'monitors.apps.MonitorsAppConfig' ================================================ FILE: monitors/admin.py ================================================ from django.contrib import admin from .models import Service class ServiceAdmin(admin.ModelAdmin): list_display = ('id', 'scheduler', 'downloader', 'extractor', 'processor', 'create_time') admin.site.register(Service, ServiceAdmin) ================================================ FILE: monitors/apps.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' from django.apps import AppConfig class MonitorsAppConfig(AppConfig): name = 'monitors' verbose_name = u'3 爬虫监控' ================================================ FILE: monitors/management/__init__.py 
================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' ================================================ FILE: monitors/management/commands/__init__.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' ================================================ FILE: monitors/management/commands/monitor.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' import time from datetime import datetime from django.core.management.base import BaseCommand, CommandError from monitors.models import Service from cores.util import get_redis from django.conf import settings from cores.models import Seed, IndexRule class Command(BaseCommand): help = '获取监控数据' def add_arguments(self, parser): # Positional arguments parser.add_argument('action', choices=['service', 'stats'], help='选择要监控的服务类型') def handle(self, *args, **options): if options["action"] == 'service': self.monitor_service() def monitor_service(self): conf = settings.CRAWLER_CONFIG r = get_redis() now = datetime.now().replace(second=0, microsecond=0) pipe = r.pipeline() result = pipe.llen(conf['downloader']).llen(conf['extractor']).llen(conf['processor']).execute() scheduler = IndexRule.objects.filter(seed__status=Seed.STATUS_ENABLE, status=IndexRule.STATUS_ENABLE, next_crawl_time__lte=now).count() print result Service.objects.create( scheduler=scheduler, downloader=result[0], extractor=result[1], processor=result[2], create_time=now ) ================================================ FILE: monitors/migrations/0001_initial.py ================================================ # -*- coding: utf-8 -*- from __future__ import unicode_literals from django.db import models, migrations class Migration(migrations.Migration): dependencies = [ ] operations = [ migrations.CreateModel( name='Service', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), 
('scheduler', models.IntegerField(default=0, verbose_name=b'\xe6\x9c\xaa\xe6\x89\xa7\xe8\xa1\x8c\xe7\x9a\x84\xe8\xae\xa1\xe5\x88\x92\xe6\x95\xb0')), ('downloader', models.IntegerField(default=0, verbose_name=b'\xe6\x9c\xaa\xe4\xb8\x8b\xe8\xbd\xbd\xe7\x9a\x84url\xe6\x95\xb0')), ('extractor', models.IntegerField(default=0, verbose_name=b'\xe6\x9c\xaa\xe6\x8a\xbd\xe5\x8f\x96\xe7\x9a\x84url\xe6\x95\xb0')), ('processor', models.IntegerField(default=0, verbose_name=b'\xe6\x9c\xaa\xe5\x85\xa5\xe5\xba\x93\xe7\x9a\x84\xe8\xae\xb0\xe5\xbd\x95\xe6\x95\xb0')), ('create_time', models.DateTimeField(unique=True, verbose_name=b'\xe5\x88\x9b\xe5\xbb\xba\xe6\x97\xb6\xe9\x97\xb4')), ], options={ 'verbose_name_plural': '1 \u670d\u52a1\u961f\u5217\u79ef\u538b', }, ), ] ================================================ FILE: monitors/migrations/__init__.py ================================================ ================================================ FILE: monitors/models.py ================================================ # -*- coding: utf-8 -*- __author__ = 'yijingping' import collections from django.db import models from jsonfield import JSONField class Service(models.Model): scheduler = models.IntegerField(default=0, verbose_name='未执行的计划数') downloader = models.IntegerField(default=0, verbose_name='未下载的url数') extractor = models.IntegerField(default=0, verbose_name='未抽取的url数') processor = models.IntegerField(default=0, verbose_name='未入库的记录数') create_time = models.DateTimeField(verbose_name='创建时间', unique=True) def __unicode__(self): return self.name class Meta: verbose_name_plural = "1 服务队列积压" ================================================ FILE: requirements.txt ================================================ Django==1.8.1 MySQL-python==1.2.5 requests==2.7.0 lxml==3.4.4 jsonfield==1.0.3 hiredis==0.2.0 redis==2.10.3 torndb==0.3 oss2==2.0.5 selenium==2.52.0 PyVirtualDisplay==0.1.5 psycopg2==2.6.1 sqlalchemy==1.1.4 ================================================ FILE: 
supervisord.conf ================================================ [program:unicrawler.bowenpay.com] command=/bin/python /var/www/pythonzone/unicrawler/manage.py runserver 127.0.0.1:8889 umask=022 user=ripple startsecs=0 stopwaitsecs=0 autostart=true autorestart=true stdout_logfile=/var/log/pythonzone/unicrawler.stdout.log stderr_logfile=/var/log/pythonzone/unicrawler.stderr.log stopsignal=KILL killasgroup=true [program:unicrawler_scheduler] command=/bin/python /var/www/pythonzone/unicrawler/bin/scheduler.py umask=022 user=ripple startsecs=0 stopwaitsecs=0 autostart=true autorestart=true stdout_logfile=/var/log/pythonzone/unicrawler_scheduler.stdout.log stderr_logfile=/var/log/pythonzone/unicrawler_scheduler.stderr.log stopsignal=KILL killasgroup=true [program:unicrawler_downloader] command=/bin/python /var/www/pythonzone/unicrawler/bin/downloader.py umask=022 user=ripple startsecs=0 stopwaitsecs=0 autostart=true autorestart=true stdout_logfile=/var/log/pythonzone/unicrawler_downloader.stdout.log stderr_logfile=/var/log/pythonzone/unicrawler_downloader.stderr.log stopsignal=KILL killasgroup=true process_name=%(process_num)s numprocs=4 [program:unicrawler_extractor] command=/bin/python /var/www/pythonzone/unicrawler/bin/extractor.py umask=022 user=ripple startsecs=0 stopwaitsecs=0 autostart=true autorestart=true stdout_logfile=/var/log/pythonzone/unicrawler_extractor.stdout.log stderr_logfile=/var/log/pythonzone/unicrawler_extractor.stderr.log stopsignal=KILL killasgroup=true process_name=%(process_num)s numprocs=2 [program:unicrawler_processor] command=/bin/python /var/www/pythonzone/unicrawler/bin/processor.py umask=022 user=ripple startsecs=0 stopwaitsecs=0 autostart=true autorestart=true stdout_logfile=/var/log/pythonzone/unicrawler_processor.stdout.log stderr_logfile=/var/log/pythonzone/unicrawler_processor.stderr.log stopsignal=KILL killasgroup=true [program:unicrawler_checkproxies] command=/bin/python /var/www/pythonzone/unicrawler/manage.py checkproxies 
umask=022 user=ripple startsecs=0 stopwaitsecs=0 autostart=true autorestart=true stdout_logfile=/var/log/pythonzone/unicrawler_checkproxies.stdout.log stderr_logfile=/var/log/pythonzone/unicrawler_checkproxies.stderr.log stopsignal=KILL killasgroup=true ================================================ FILE: unicrawler/__init__.py ================================================ ================================================ FILE: unicrawler/settings.py ================================================ """ Django settings for unicrawler project. Generated by 'django-admin startproject' using Django 1.8.1. For more information on this file, see https://docs.djangoproject.com/en/1.8/topics/settings/ For the full list of settings and their values, see https://docs.djangoproject.com/en/1.8/ref/settings/ """ # Build paths inside the project like this: os.path.join(BASE_DIR, ...) import os BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = 'bd+g*191h+l1=0e%32h_i8gyrk!v#3(kyxy7^$kq&w=p(q(h0)' # SECURITY WARNING: don't run with debug turned on in production! 
DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = (
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    # Project apps: crawler core, rule/proxy configuration, monitoring.
    'unicrawler',
    'cores',
    'configs',
    'monitors'
)

MIDDLEWARE_CLASSES = (
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
    'django.middleware.security.SecurityMiddleware',
)

ROOT_URLCONF = 'unicrawler.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [os.path.join(BASE_DIR, 'templates')],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'unicrawler.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.8/ref/settings/#databases
# NOTE(review): credentials are committed here; presumably overridden by
# local_settings.py per deployment -- confirm. utf8mb4 matches the README's
# database-creation instructions.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'HOST': '127.0.0.1',
        'NAME': 'unicrawler',
        'USER': 'root',
        'PASSWORD': '123456',
        'OPTIONS':{
            'charset':'utf8mb4',
        },
    }
}


# Internationalization
# https://docs.djangoproject.com/en/1.8/topics/i18n/

LANGUAGE_CODE = 'zh-hans'

TIME_ZONE = 'Asia/Shanghai'

USE_I18N = True

USE_L10N = True

# Naive local datetimes are stored/compared (no timezone conversion).
USE_TZ = False


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.8/howto/static-files/

STATIC_URL = '/static/'

# Connection options for the Redis instance used as the crawler's queue
# backend (host/port/password/db passed to the Redis client).
REDIS_OPTIONS = {
    'host': 'localhost',
    'port': 6379,
    'password': '',
    'db': 3
}

# Redis queue names for each crawler pipeline stage
# (consumed by bin/downloader.py, bin/extractor.py, bin/processor.py).
CRAWLER_CONFIG = {
    #'scheduler': 'unicrawler:scheduler',
    'downloader': 'unicrawler:downloader',
    'extractor': 'unicrawler:extractor',
    'processor': 'unicrawler:processor'
}

CRAWLER_DEBUG = False

# Log everything to the console; supervisord captures stdout/stderr to files.
LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'verbose': {
            'format': '%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s'
        },
        'simple': {
            'format': '%(levelname)s %(message)s'
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'verbose'
        },
    },
    'loggers': {
        'django': {
            'handlers': ['console'],
            'level': os.getenv('DJANGO_LOG_LEVEL', 'INFO'),
        },
        # Root logger: catch-all at DEBUG level.
        '': {
            'handlers': ['console'],
            'level': 'DEBUG',
        },
    },
}

# aliyun oss2
# Aliyun OSS storage settings (keys intentionally blank in the repo;
# presumably filled in via local_settings.py -- confirm).
OSS2_CONFIG = {
    "ACCESS_KEY_ID": "",
    "ACCESS_KEY_SECRET": "",
    "ENDPOINT": "",
    "BUCKET_DOMAIN": "oss-cn-beijing.aliyuncs.com",
    "BUCKET_NAME": "pythonzone",
    "IMAGES_PATH": "images/",
    "VIDEOS_PATH": "videos/",
    "CDN_DOMAIN": "pystats.bowenpay.com"
}

## Import local settings
# Per-deployment overrides; a missing local_settings.py is tolerated with a
# warning printed to stderr rather than a hard failure.
try:
    from local_settings import *
except ImportError:
    import sys, traceback
    sys.stderr.write("Warning: Can't find the file 'local_settings.py' in the directory containing %r. It appears you've customized things.\nYou'll have to run django-admin.py, passing it your settings module.\n(If the file settings.py does indeed exist, it's causing an ImportError somehow.)\n" % __file__)
    sys.stderr.write("\nFor debugging purposes, the exception was:\n\n")
    traceback.print_exc()

================================================ FILE: unicrawler/urls.py ================================================

"""unicrawler URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/1.8/topics/http/urls/
Examples:
Function views
    1. Add an import: from my_app import views
    2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
Class-based views
    1. Add an import: from other_app.views import Home
    2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
Including another URLconf
    1. Add an import: from blog import urls as blog_urls
    2.
Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
"""
from django.conf.urls import include, url
from django.contrib import admin
from django.views.generic.base import RedirectView

urlpatterns = [
    # Site root redirects to the admin UI (temporary redirect, so the
    # target can change without browsers caching it).
    url(r'^$', RedirectView.as_view(url='admin/', permanent=False)),
    # The admin is the project's only real UI.
    url(r'^admin/', include(admin.site.urls))
]

================================================ FILE: unicrawler/wsgi.py ================================================

"""
WSGI config for unicrawler project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
"""

import os

from django.core.wsgi import get_wsgi_application

# Default settings module for the WSGI server; setdefault keeps an
# externally supplied DJANGO_SETTINGS_MODULE intact.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "unicrawler.settings")

application = get_wsgi_application()