[
  {
    "path": ".gitignore",
    "content": ".idea\n*.pyc\nlogs\nlocal_settings.py\n.DS_Store\n/static/\nbin/t.py\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2023 yijingping\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# unicrawler\n一个通用的可配置的的爬虫\n\n# 安装\n\n1）python环境, 检查python的版本，是否为2.7.x，如果不是，安装2.7.6。\n\ncentos 6.x 升级python2.6到python2.7,参考教程 http://ruiaylin.github.io/2014/12/12/python%20update/\n\n2）安装依赖包, clone代码\n\nMysql-python依赖\n```\nyum install python-devel mysql-devel gcc\n```\n\nlxml依赖\n```\nyum install libxslt-devel libxml2-devel\n```\n\n安装浏览器环境 selenium依赖\n```\nyum install xorg-x11-server-Xvfb\nyum install firefox\n```\n\nclone代码,安装依赖python库\n```\n$ git clone https://github.com/yijingping/unicrawler.git\n$ cd unicrawler\n$ pip install -r requirements.txt\n```\n\n3) 初始化mysql\n\na) 安装mysql-server后，记得设置字符为utf8mb4。在my.cnf中设置：\n\n```\n[client]\ndefault-character-set = utf8mb4\n\n[mysql]\ndefault-character-set = utf8mb4\n\n[mysqld]\ncharacter-set-client-handshake = FALSE\ncharacter-set-server = utf8mb4\ncollation-server = utf8mb4_unicode_ci\n```\n\nb) 重启数据库\n\nc) 创建数据库unicrawler\n\n```\nmysql> CREATE DATABASE `unicrawler` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;\n```\n\nd) 初始化表\n```\n$ python manage.py migrate\n```\n\n5）运行\n\n```\npython manage.py runserver 0.0.0.0:8001\n```\n访问 http://localhost:8001/。 测试没问题后，参考后面的supervisor脚本启动。\n\n# 部署nginx\n前期先用nginx将域名www.mydomain.com转发到8001端口。\n\n# 部署supervisor脚本\n参考文件 `supervisord.conf`\n\n# 部署crontab脚本\n参考文件 `crontab`\n"
  },
  {
    "path": "bin/downloader.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n# 加载django环境\nimport sys\nimport os\nreload(sys)\nsys.setdefaultencoding('utf8') \nsys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))\nos.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings'\nimport django\ndjango.setup()\n\nimport time\nimport json\nfrom django.conf import settings\nfrom cores.models import Site\nfrom cores.util import get_redis, get_uniqueid\nfrom cores.constants import KIND_DETAIL_URL\nfrom cores.downloaders import RequestsDownloaderBackend, SeleniumDownloaderBackend\nfrom configs.proxies import MysqlProxyBackend\n\nimport logging\nlogger = logging.getLogger()\n\n\nclass Downloader(object):\n    def __init__(self):\n        self.redis = get_redis()\n\n    def get_proxy(self, kind):\n        if kind == Site.PROXY_MYSQL:\n            return MysqlProxyBackend()\n        else:\n            return None\n\n    def check_limit_speed(self, config):\n        if config[\"limit_speed\"] <= 0:\n            return False, None\n        else:\n            proxy = self.get_proxy(config['proxy'])\n            key = 'unicrawler:limit_speed:%s:%s' % (config['domain'], proxy)\n            if self.redis.exists(key):\n                return True, proxy\n            else:\n                self.redis.psetex(key, config[\"limit_speed\"], config[\"limit_speed\"])\n                return False, proxy\n\n    def check_detail_fresh_time(self, data):\n        unique_key, fresh_time, rule_id = data['unique_key'], data[\"detail_fresh_time\"], data[\"rule_id\"]\n        if fresh_time <= 0:\n            return False\n        else:\n            unique_value = ''.join([str(data.get(item)) for item in unique_key])\n            key = 'unicrawler:detail_fresh_time:%s:%s' % (rule_id, get_uniqueid(unique_value))\n            if self.redis.exists(key):\n                return True\n            else:\n                self.redis.setex(key, fresh_time, fresh_time)\n                return False\n\n    def run(self):\n        r = self.redis\n        if settings.CRAWLER_DEBUG:\n            r.delete(settings.CRAWLER_CONFIG[\"downloader\"])\n        while True:\n            try:\n                resp_data = r.brpop(settings.CRAWLER_CONFIG[\"downloader\"])\n            except Exception as e:\n                print e\n                continue\n\n            try:\n                data = json.loads(resp_data[1])\n                site_config = data['site_config']\n                logger.debug(data[\"url\"])\n                is_limited, proxy = self.check_limit_speed(site_config)\n                if is_limited:\n                    print '# 被限制, 放回去, 下次下载'\n                    time.sleep(1)  # 休息一秒, 延迟放回去的时间\n                    r.lpush(settings.CRAWLER_CONFIG[\"downloader\"], resp_data[1])\n                elif (data[\"kind\"] == KIND_DETAIL_URL\n                    and self.check_detail_fresh_time(data)):\n                    print '# 该详情页已下载过, 不下载了'\n                else:\n                    print '# 未被限制,可以下载'\n                    if site_config['browser'] == Site.BROWSER_NONE:\n                        browser = RequestsDownloaderBackend(proxy=proxy)\n                        data['body'] = browser.download(data[\"url\"])\n                    elif site_config['browser'] == Site.BROWSER_NORMAL:\n                        with SeleniumDownloaderBackend(proxy=proxy) as browser:\n                            data['body'] = browser.download(data[\"url\"])\n                    else:\n                        return\n\n                    r.lpush(settings.CRAWLER_CONFIG[\"extractor\"], json.dumps(data))\n                    logger.debug(data)\n            except Exception as e:\n                print e\n                raise\n\n\nif __name__ == '__main__':\n    downloader = Downloader()\n    downloader.run()\n"
  },
  {
    "path": "bin/extractor.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n# 加载django环境\nimport sys\nimport os\nreload(sys)\nsys.setdefaultencoding('utf8') \nsys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))\nos.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings'\nimport django\ndjango.setup()\n\nimport json\nfrom cores.constants import KIND_LIST_URL, KIND_DETAIL_URL\nfrom django.conf import settings\nfrom cores.util import get_redis, get_uniqueid\nfrom cores.extractors import XPathExtractor, PythonExtractor, ImageExtractor, VideoExtractor\nimport logging\nlogger = logging.getLogger()\n\n\nclass Extractor(object):\n    def __init__(self):\n        self.redis = get_redis()\n\n    def extract(self, content, rules, context):\n        res = content\n        for rule in rules:\n            try:\n                extractor = None\n                if rule[\"kind\"] == \"xpath\":\n                    extractor = XPathExtractor(res, rule[\"data\"])\n                elif rule[\"kind\"] == \"python\":\n                    extractor = PythonExtractor(rule[\"data\"], res, context=context)\n                elif rule[\"kind\"] == \"image\":\n                    extractor = ImageExtractor(res)\n                elif rule[\"kind\"] == \"video\":\n                    extractor = VideoExtractor(res)\n\n                res = extractor.extract()\n            except Exception as e:\n                logger.exception(e)\n\n        return res\n\n    def check_detail_fresh_time(self, unique_url, data):\n        fresh_time, rule_id = data[\"detail_fresh_time\"], data[\"rule_id\"]\n        if fresh_time <= 0:\n            return False\n        else:\n            key = 'unicrawler:detail_fresh_time:%s:%s' % (rule_id, get_uniqueid(unique_url))\n            if self.redis.exists(key):\n                return True\n            else:\n                self.redis.setex(key, fresh_time, fresh_time)\n                return False\n\n    def get_detail(self, content, data):\n        # 检查是否在exclude规则内. 如果在,放弃存储\n        exclude_rules = data['detail_exclude']\n        excluded = self.extract(content, exclude_rules, {'data': data})\n        if excluded and excluded != content:\n            logger.debug('# url in excludes, abort!')\n            return\n\n        # 不在exclude规则内,可以存储\n        result = {\n            \"url\": data['url'],\n            \"seed_id\": data['seed_id'],\n            \"rule_id\": data['rule_id'],\n            'detail_multi': data['detail_multi']\n        }\n        rules = data['detail_rules']\n        for item in rules:\n            col = item[\"key\"]\n            print col\n            col_rules = item[\"rules\"]\n            col_value = self.extract(content, col_rules, {'data': result})\n            result[col] = col_value\n            # 提前检查多项详情新鲜度\n            if col == 'url':\n                if data['detail_multi']:\n                    if self.check_detail_fresh_time(result['url'], data):\n                        # 未过期,不更新\n                        logger.debug('检查多项详情未过期,不更新')\n                        return\n\n        # 更新\n        self.redis.lpush(settings.CRAWLER_CONFIG[\"processor\"], json.dumps(result))\n        logger.debug('extracted:%s' % result)\n\n    def run(self):\n        r = get_redis()\n        if settings.CRAWLER_DEBUG:\n            r.delete(settings.CRAWLER_CONFIG[\"extractor\"])\n        while True:\n            try:\n                data = r.brpop(settings.CRAWLER_CONFIG[\"extractor\"])\n            except Exception as e:\n                print e\n                continue\n            #print data\n            data = json.loads(data[1])\n            body = data['body']\n            # 1 如果当前接卸的页面是列表页\n            if data[\"kind\"] == KIND_LIST_URL:\n                # 1.1先找详情页\n                # 检查详情的内容是否都包含在列表页中\n                multi_rules = data['detail_multi']\n                if multi_rules:\n                    # 1.1.1 详情都包含在列表页中\n                    multi_parts = self.extract(body, multi_rules, {'data': data})\n                    for part in multi_parts:\n                        self.get_detail(part, data)\n                else:\n                    # 1.1.2 详情不在列表中,通过列表url去访问详情\n                    detail_urls = self.extract(body, data['list_rules'], {'data': data})\n                    #logger.debug('detail_urls: %s' % detail_urls)\n                    for item in detail_urls:\n                        item_data = {\n                            \"url\": item,\n                            'kind': KIND_DETAIL_URL,\n                            'seed_id': data['seed_id'],\n                            'rule_id': data['rule_id'],\n                            #'fresh_pages': '',\n                            #'list_rules': '',\n                            #'next_url_rules': '',\n                            'site_config': data['site_config'],\n                            'detail_rules': data['detail_rules'],\n                            'detail_exclude': data['detail_exclude'],\n                            'detail_multi': data['detail_multi'],\n                            'detail_multi_unique': data['detail_multi_unique'],\n                            'detail_fresh_time': data['detail_fresh_time'],\n                            'unique_key': data['unique_key']\n                        }\n                        r.lpush(settings.CRAWLER_CONFIG[\"downloader\"], json.dumps(item_data))\n\n                # 1.2后找下一页\n                next_urls = self.extract(body, data[\"next_url_rules\"], {'data': data})\n                print 'next_urls: %s' % next_urls\n                for item in next_urls:\n                    item_data = {\n                        \"url\": item,\n                        'kind': KIND_LIST_URL,\n                        'seed_id': data['seed_id'],\n                        'rule_id': data['rule_id'],\n                        'fresh_pages': data['fresh_pages'] - 1,\n                        'site_config': data['site_config'],\n                        'list_rules': data['list_rules'],\n                        'next_url_rules': data['next_url_rules'],\n                        'detail_rules': data['detail_rules'],\n                        'detail_exclude': data['detail_exclude'],\n                        'detail_multi': data['detail_multi'],\n                        'detail_multi_unique': data['detail_multi_unique'],\n                        'detail_fresh_time': data['detail_fresh_time'],\n                        'unique_key': data['unique_key']\n                    }\n                    if item_data['fresh_pages'] > 0:\n                        logger.debug('list:%s' % data['url'])\n                        r.lpush(settings.CRAWLER_CONFIG[\"downloader\"], json.dumps(item_data))\n            # 2 如果当前解析的页面是详情页\n            elif data[\"kind\"] == KIND_DETAIL_URL:\n                logger.debug('detail:%s' % data['url'])\n                # 如果没有多项详情,则只是单项\n                self.get_detail(body, data)\n\n\nif __name__ == '__main__':\n    my_extractor = Extractor()\n    my_extractor.run()\n"
  },
  {
    "path": "bin/processor.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n__author__ = 'yijingping'\n# 加载django环境\nimport sys\nimport os\nreload(sys)\nsys.setdefaultencoding('utf8') \nsys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))\nos.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings'\nimport django\ndjango.setup()\n\nimport json\nfrom django.conf import settings\nfrom cores.models import Seed\nfrom cores.util import get_redis\nfrom cores.processors import MysqlBackend, PostgresBackend, DjangoModelBackend, MongoDBBackend\n\nimport logging\nlogger = logging.getLogger()\n\n\nclass Processor():\n    def __init__(self):\n        self.pools = {}\n\n    def get_backends(self, seed_id):\n        cache = self.pools.get(seed_id, None)\n        if cache:\n            return cache\n        else:\n            try:\n               seed = Seed.objects.get(pk=seed_id)\n            except Seed.DoesNotExist as e:\n                logger.exception(e)\n                return []\n            else:\n                for config in seed.data:\n                    if config[\"kind\"] == \"mysql\":\n                        backend = MysqlBackend(config)\n                    elif config[\"kind\"] == \"mongodb\":\n                        backend = MongoDBBackend(config)\n                    elif config[\"kind\"] == \"postgres\":\n                        backend = PostgresBackend(config)\n                    elif config[\"kind\"] == \"DjangoModel\":\n                        backend = DjangoModelBackend(config)\n\n                    my_config = self.pools.get(seed_id, [])\n                    my_config.append(backend)\n                    self.pools[seed_id] = my_config\n\n                return self.pools.get(seed_id, [])\n\n    def process(self, data):\n        backends = self.get_backends(data['seed_id'])\n        for backend in backends:\n            backend.process(data)\n\n    def run(self):\n        r = get_redis()\n        if settings.CRAWLER_DEBUG:\n            r.delete(settings.CRAWLER_CONFIG[\"processor\"])\n        while True:\n            try:\n                rsp = r.brpop(settings.CRAWLER_CONFIG[\"processor\"])\n            except Exception as e:\n                print e\n                continue\n\n            data = json.loads(rsp[1])\n            #logger.info(json.dumps(data, encoding=\"UTF-8\", ensure_ascii=False))\n            self.process(data)\n\n\nif __name__ == '__main__':\n    processor = Processor()\n    processor.run()\n"
  },
  {
    "path": "bin/scheduler.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n# 加载django环境\nimport sys\nimport os\nreload(sys)\nsys.setdefaultencoding('utf8') \nsys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))\nos.environ['DJANGO_SETTINGS_MODULE'] = 'unicrawler.settings'\nimport django\ndjango.setup()\n\nimport json\nfrom cores.models import Seed, IndexRule, DetailRule\nfrom cores.constants import KIND_LIST_URL\nfrom django.conf import settings\nimport logging\nlogger = logging.getLogger()\nfrom datetime import datetime, timedelta\nimport time\nfrom cores.util import get_redis\n\n\nclass Scheduler(object):\n    def run(self):\n        r = get_redis()\n        while True:\n            now = datetime.now()\n            for item in Seed.objects.filter(status=Seed.STATUS_ENABLE).order_by('-weight'):\n                rules = IndexRule.objects.filter(seed=item, status=IndexRule.STATUS_ENABLE, next_crawl_time__lte=now)\n                for rule in rules:\n                    try:\n                        detail_rule = DetailRule.objects.get(index_rule=rule)\n                    except DetailRule.DoesNotExist as e:\n                        print e\n                        continue\n\n                    base = {\n                        'url': '',\n                        'kind': KIND_LIST_URL,\n                        \"seed_id\": item.pk,\n                        'rule_id': rule.pk,\n                        \"fresh_pages\": rule.fresh_pages,\n                        'site_config': rule.site.get_config(),\n                        'list_rules': rule.list_rules,\n                        'next_url_rules': rule.next_url_rules,\n                        'detail_rules': detail_rule.data,\n                        'detail_exclude': detail_rule.exclude,\n                        'detail_multi': detail_rule.multi,\n                        'detail_multi_unique': detail_rule.multi_unique,\n                        'detail_fresh_time': detail_rule.fresh_time,\n                        'unique_key': item.data[0][\"unique_key\"]\n                    }\n                    for url in rule.url:\n                        data = base.copy()\n                        data['url'] = url\n                        r.lpush(settings.CRAWLER_CONFIG[\"downloader\"], json.dumps(data))\n\n                    # 更新index_rule\n                    rule.next_crawl_time = now + timedelta(seconds=rule.frequency)\n                    rule.save()\n\n                    logging.debug(data)\n\n            #print r.rpop('unicrawler:urls')\n            time.sleep(1)\n\nif __name__ == '__main__':\n    scheduler = Scheduler()\n    scheduler.run()\n"
  },
  {
    "path": "configs/__init__.py",
    "content": "default_app_config = 'configs.apps.ConfigsAppConfig'"
  },
  {
    "path": "configs/admin.py",
    "content": "from django.contrib import admin\nfrom .models import Site, Proxy\n\n\nclass SiteAdmin(admin.ModelAdmin):\n    list_display = ('id', 'name', 'domain', 'proxy', 'browser', 'limit_speed', 'status')\n    list_filter = ['proxy', 'browser', 'status']\n\nadmin.site.register(Site, SiteAdmin)\n\n\nclass ProxyAdmin(admin.ModelAdmin):\n    list_display = ('host', 'port', 'speed', 'status', 'retry', 'address')\n    list_filter = ('status',)\n\nadmin.site.register(Proxy, ProxyAdmin)"
  },
  {
    "path": "configs/apps.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nfrom django.apps import AppConfig\n\n\nclass ConfigsAppConfig(AppConfig):\n    name = 'configs'\n    verbose_name = u'2 爬虫配置'"
  },
  {
    "path": "configs/management/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n"
  },
  {
    "path": "configs/management/commands/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n"
  },
  {
    "path": "configs/management/commands/checkproxies.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport time\nfrom django.core.management.base import BaseCommand\nfrom configs.models import Proxy\nfrom configs.util import check_proxy\n\n\nclass Command(BaseCommand):\n    help = 'check proxies'\n\n    def handle(self, *args, **options):\n        while True:\n            self.check_all_proxies()\n            time.sleep(60)\n\n    def check_all_proxies(self):\n        # 检测新代理\n        qs1 = Proxy.objects.filter(status=Proxy.STATUS_NEW)\n        # 检测成功代理\n        qs2 = Proxy.objects.filter(status=Proxy.STATUS_SUCCESS)\n        # 检测失败代理\n        qs3 = Proxy.objects.filter(status=Proxy.STATUS_FAIL, retry__lt=3)\n        for qs in [qs1, qs2, qs3]:\n            for item in qs:\n                has_exception, proxy_detected, time_diff = check_proxy(item.host, item.port)\n                if has_exception or not proxy_detected:\n                    item.status = Proxy.STATUS_FAIL\n                    item.retry += 1\n                    item.save()\n                else:\n                    item.status = Proxy.STATUS_SUCCESS\n                    item.speed = time_diff * 1000\n                    item.retry = 0\n                    item.save()\n"
  },
  {
    "path": "configs/migrations/0001_initial.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n    ]\n\n    operations = [\n        migrations.CreateModel(\n            name='Proxy',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('kind', models.IntegerField(default=1, verbose_name=b'\\xe4\\xbb\\xa3\\xe7\\x90\\x86\\xe7\\xb1\\xbb\\xe5\\x9e\\x8b', choices=[(0, b'\\xe9\\x80\\x8f\\xe6\\x98\\x8e\\xe4\\xbb\\xa3\\xe7\\x90\\x86'), (1, b'\\xe9\\xab\\x98\\xe5\\xba\\xa6\\xe5\\x8c\\xbf\\xe5\\x90\\x8d')])),\n                ('user', models.CharField(default=b'', max_length=100, blank=True)),\n                ('password', models.CharField(default=b'', max_length=100, blank=True)),\n                ('host', models.CharField(max_length=100)),\n                ('port', models.IntegerField(default=80)),\n                ('address', models.CharField(default=b'', max_length=100, verbose_name=b'\\xe5\\x9c\\xb0\\xe7\\x90\\x86\\xe4\\xbd\\x8d\\xe7\\xbd\\xae', blank=True)),\n                ('speed', models.IntegerField(default=0, verbose_name=b'\\xe8\\xbf\\x9e\\xe6\\x8e\\xa5\\xe9\\x80\\x9f\\xe5\\xba\\xa6(ms)')),\n                ('status', models.IntegerField(default=0, verbose_name=b'\\xe7\\x8a\\xb6\\xe6\\x80\\x81', choices=[(0, b'\\xe6\\x9c\\xaa\\xe6\\xa3\\x80\\xe6\\xb5\\x8b'), (1, b'\\xe6\\xa3\\x80\\xe6\\xb5\\x8b\\xe6\\x88\\x90\\xe5\\x8a\\x9f'), (2, b'\\xe6\\xa3\\x80\\xe6\\xb5\\x8b\\xe5\\xa4\\xb1\\xe8\\xb4\\xa5')])),\n                ('retry', models.IntegerField(default=0, verbose_name=b'\\xe5\\xb0\\x9d\\xe8\\xaf\\x95\\xe6\\xac\\xa1\\xe6\\x95\\xb0')),\n            ],\n            options={\n                'verbose_name_plural': '2 \\u8bbf\\u95ee\\u4ee3\\u7406',\n            },\n        ),\n        migrations.CreateModel(\n            name='Site',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('name', models.CharField(max_length=100, verbose_name=b'\\xe7\\xab\\x99\\xe7\\x82\\xb9\\xe5\\x90\\x8d\\xe7\\xa7\\xb0')),\n                ('domain', models.CharField(unique=True, max_length=100, verbose_name=b'\\xe7\\xab\\x99\\xe7\\x82\\xb9\\xe5\\x9f\\x9f\\xe5\\x90\\x8d')),\n                ('proxy', models.IntegerField(default=1, verbose_name=b'\\xe4\\xbb\\xa3\\xe7\\x90\\x86', choices=[(1, b'\\xe4\\xb8\\x8d\\xe4\\xbd\\xbf\\xe7\\x94\\xa8\\xe4\\xbb\\xa3\\xe7\\x90\\x86'), (2, b'\\xe5\\xad\\x98\\xe5\\x82\\xa8\\xe5\\x9c\\xa8Mysql\\xe6\\x95\\xb0\\xe6\\x8d\\xae\\xe5\\xba\\x93\\xe4\\xb8\\xad\\xe7\\x9a\\x84\\xe4\\xbb\\xa3\\xe7\\x90\\x86')])),\n                ('browser', models.IntegerField(default=1, verbose_name=b'\\xe6\\xb5\\x8f\\xe8\\xa7\\x88\\xe5\\x99\\xa8\\xe5\\xa3\\xb3', choices=[(1, b'\\xe4\\xb8\\x8d\\xe4\\xbd\\xbf\\xe7\\x94\\xa8\\xe6\\xb5\\x8f\\xe8\\xa7\\x88\\xe5\\x99\\xa8\\xe5\\xa3\\xb3'), (2, b'\\xe6\\x99\\xae\\xe9\\x80\\x9a\\xe6\\xb5\\x8f\\xe8\\xa7\\x88\\xe5\\x99\\xa8')])),\n                ('limit_speed', models.IntegerField(default=100, verbose_name=b'\\xe8\\xae\\xbf\\xe9\\x97\\xae\\xe9\\x97\\xb4\\xe9\\x9a\\x94(\\xe6\\xaf\\xab\\xe7\\xa7\\x92)')),\n                ('status', models.IntegerField(default=1, verbose_name=b'\\xe6\\x98\\xaf\\xe5\\x90\\xa6\\xe5\\x90\\xaf\\xe7\\x94\\xa8', choices=[(1, b'\\xe5\\x90\\xaf\\xe7\\x94\\xa8'), (2, b'\\xe7\\xa6\\x81\\xe7\\x94\\xa8')])),\n            ],\n            options={\n                'verbose_name_plural': '1 \\u7ad9\\u70b9\\u914d\\u7f6e',\n            },\n        ),\n    ]\n"
  },
  {
    "path": "configs/migrations/0002_auto_20160201_1627.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport configs.models\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('configs', '0001_initial'),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name='proxy',\n            name='create_time',\n            field=models.DateTimeField(default=None, verbose_name=b'\\xe5\\x88\\x9b\\xe5\\xbb\\xba\\xe6\\x97\\xb6\\xe9\\x97\\xb4', auto_now_add=True),\n            preserve_default=False,\n        ),\n        migrations.AddField(\n            model_name='proxy',\n            name='uniqueid',\n            field=models.CharField(default=configs.models.get_default_uniqueid, unique=True, max_length=100, verbose_name=b'url\\xe7\\x9a\\x84md5\\xe5\\x80\\xbc'),\n        ),\n        migrations.AddField(\n            model_name='proxy',\n            name='update_time',\n            field=models.DateTimeField(default=None, verbose_name=b'\\xe6\\x9b\\xb4\\xe6\\x96\\xb0\\xe6\\x97\\xb6\\xe9\\x97\\xb4', auto_now=True),\n            preserve_default=False,\n        ),\n    ]\n"
  },
  {
    "path": "configs/migrations/0003_proxy_url.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('configs', '0002_auto_20160201_1627'),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name='proxy',\n            name='url',\n            field=models.CharField(default=b'', max_length=500, verbose_name=b'\\xe6\\x96\\x87\\xe7\\xab\\xa0\\xe7\\x9a\\x84url'),\n        ),\n    ]\n"
  },
  {
    "path": "configs/migrations/0004_auto_20160202_1712.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport configs.models\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('configs', '0003_proxy_url'),\n    ]\n\n    operations = [\n        migrations.AlterField(\n            model_name='proxy',\n            name='uniqueid',\n            field=models.CharField(default=configs.models.get_default_uniqueid, unique=True, max_length=100, verbose_name=b'\\xe4\\xbb\\xa3\\xe7\\x90\\x86\\xe5\\x8f\\x82\\xe6\\x95\\xb0\\xe7\\x9a\\x84md5\\xe5\\x80\\xbc'),\n        ),\n        migrations.AlterField(\n            model_name='proxy',\n            name='url',\n            field=models.CharField(default=b'', max_length=500, verbose_name=b'url'),\n        ),\n    ]\n"
  },
  {
    "path": "configs/migrations/__init__.py",
    "content": ""
  },
  {
    "path": "configs/models.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport time\nfrom django.db import models\n\n\ndef get_default_uniqueid():\n    return str(long(time.time() * 1000000))\n\n\nclass Site(models.Model):\n    STATUS_ENABLE = 1\n    STATUS_DISABLE = 2\n    STATUS_CHOICES = (\n        (STATUS_ENABLE, '启用'),\n        (STATUS_DISABLE, '禁用')\n    )\n    PROXY_NONE = 1\n    PROXY_MYSQL = 2\n    PROXY_CHOICES = (\n        (PROXY_NONE, '不使用代理'),\n        (PROXY_MYSQL, '存储在Mysql数据库中的代理')\n    )\n    BROWSER_NONE = 1\n    BROWSER_NORMAL = 2\n    BROWSER_CHOICES = (\n        (BROWSER_NONE, '不使用浏览器壳'),\n        (BROWSER_NORMAL, '普通浏览器')\n    )\n    name = models.CharField(max_length=100, verbose_name='站点名称')\n    domain = models.CharField(unique=True, max_length=100, verbose_name='站点域名')\n    proxy = models.IntegerField(default=PROXY_NONE, choices=PROXY_CHOICES, verbose_name='代理')\n    browser = models.IntegerField(default=BROWSER_NONE, choices=BROWSER_CHOICES, verbose_name='浏览器壳')\n    limit_speed = models.IntegerField(default=100, verbose_name='访问间隔(毫秒)')\n    status = models.IntegerField(default=STATUS_ENABLE, choices=STATUS_CHOICES, verbose_name=\"是否启用\")\n\n    def get_config(self):\n        if self.status == self.STATUS_ENABLE:\n            return {\n                'domain': self.domain,\n                'proxy': self.proxy,\n                'browser': self.browser,\n                'limit_speed': self.limit_speed\n            }\n        else:\n            return {\n                'domain': self.domain,\n                'proxy': self.PROXY_NONE,\n                'browser': self.BROWSER_NONE,\n                'limit_speed': 0\n            }\n\n    def __unicode__(self):\n        return self.name\n\n    class Meta:\n        verbose_name_plural = \"1 站点配置\"\n\n\nclass Proxy(models.Model):\n    TYPE_TRANSPARENT = 0\n    TYPE_ANONYMOUS = 1\n    TYPE_CHOICES = (\n        (TYPE_TRANSPARENT, '透明代理'),\n        (TYPE_ANONYMOUS, '高度匿名'),\n    )\n\n    STATUS_NEW = 0\n    STATUS_SUCCESS = 1\n    STATUS_FAIL = 2\n    STATUS_CHOICES = (\n        (STATUS_NEW,'未检测'),\n        (STATUS_SUCCESS,'检测成功'),\n        (STATUS_FAIL,'检测失败'),\n    )\n    uniqueid = models.CharField(unique=True, max_length=100, default=get_default_uniqueid, verbose_name='代理参数的md5值')\n    url = models.CharField(max_length=500, default='', verbose_name='url')\n    kind = models.IntegerField(default=TYPE_ANONYMOUS, choices=TYPE_CHOICES, verbose_name=\"代理类型\")\n    user = models.CharField(default='', blank=True, max_length=100)\n    password = models.CharField(default='', blank=True, max_length=100)\n    host = models.CharField(max_length=100)\n    port = models.IntegerField(default=80)\n    address = models.CharField(default='', blank=True, max_length=100, verbose_name=\"地理位置\")\n    speed = models.IntegerField(default=0, verbose_name=\"连接速度(ms)\")\n    status = models.IntegerField(default=STATUS_NEW, choices=STATUS_CHOICES, verbose_name=\"状态\")\n    retry = models.IntegerField(default=0, verbose_name=\"尝试次数\")\n    create_time = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')\n    update_time = models.DateTimeField(auto_now=True, verbose_name='更新时间')\n\n    class Meta:\n        verbose_name_plural = \"2 访问代理\"\n\n"
  },
  {
    "path": "configs/proxies.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nfrom .models import Proxy\n\nclass MysqlProxyBackend(object):\n    def __init__(self):\n        proxy = Proxy.objects.filter(kind=Proxy.KIND_DOWNLOAD, status=Proxy.STATUS_SUCCESS).order_by('?').first()\n        if proxy:\n            self.user = proxy.user\n            self.password = proxy.password\n            self.host = proxy.host\n            self.port = proxy.port\n        else:\n            self.user, self.password, self.host, self.port = '', '', '', ''\n\n    def is_valid(self):\n        return self.host and self.port\n\n    def __str__(self):\n        return ':'.join([str(self.user), str(self.password), str(self.host), str(self.port)])\n"
  },
  {
    "path": "configs/util.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport time\nimport urllib2\n\nip_check_url = 'http://api.ipify.org'\nuser_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'\nsocket_timeout = 3\n\n\n# Get real public IP address\ndef get_real_pip():\n    req = urllib2.Request(ip_check_url)\n    req.add_header('User-agent', user_agent)\n    conn = urllib2.urlopen(req)\n    page = conn.read()\n    conn.close()\n    return page\n\n# Set global variable containing \"real\" public IP address\nreal_pip = get_real_pip()\n\n\ndef check_proxy(host, port):\n    try:\n        # Build opener\n        proxy_handler = urllib2.ProxyHandler({'http': '%s:%s' % (host, port)})\n        opener = urllib2.build_opener(proxy_handler)\n        opener.addheaders = [('User-agent', user_agent)]\n        urllib2.install_opener(opener)\n\n        # Build, time, and execute request\n        req = urllib2.Request(ip_check_url)\n        time_start = time.time()\n        conn = urllib2.urlopen(req, timeout=socket_timeout)\n        time_end = time.time()\n        detected_pip = conn.read()\n        conn.close()\n\n        # Calculate request time\n        time_diff = time_end - time_start\n\n        # Check if proxy is detected\n        if detected_pip == real_pip:\n            proxy_detected = False\n        else:\n            proxy_detected = True\n\n    # Catch exceptions\n    except urllib2.HTTPError, e:\n        print \"ERROR: Code \", e.code\n        return (True, False, 999)\n    except Exception, detail:\n        print \"ERROR: \", detail\n        return (True, False, 999)\n\n    # Return False if no exceptions, proxy_detected=True if proxy detected\n    return (False, proxy_detected, time_diff)"
  },
  {
    "path": "cores/__init__.py",
    "content": "default_app_config = 'cores.apps.CoresAppConfig'"
  },
  {
    "path": "cores/admin.py",
    "content": "from django.contrib import admin\nfrom .models import Seed, IndexRule, DetailRule\n\n\nclass SeedAdmin(admin.ModelAdmin):\n    list_display = ('id', 'name', 'desc', 'weight', 'status')\n    list_filter = ['status']\n\nadmin.site.register(Seed, SeedAdmin)\n\n\nclass IndexRuleAdmin(admin.ModelAdmin):\n    list_display = ('id', 'seed', 'name', 'site', 'url', 'frequency', 'update_time', 'next_crawl_time', 'fresh_pages', 'status')\n    list_filter = ['status', 'update_time', 'next_crawl_time']\n\nadmin.site.register(IndexRule, IndexRuleAdmin)\n\n\nclass DetailRuleAdmin(admin.ModelAdmin):\n    list_display = ['index_rule']\n\nadmin.site.register(DetailRule, DetailRuleAdmin)\n"
  },
  {
    "path": "cores/apps.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nfrom django.apps import AppConfig\n\n\nclass CoresAppConfig(AppConfig):\n    name = 'cores'\n    verbose_name = u'1 爬虫'"
  },
  {
    "path": "cores/constants.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n\nKIND_LIST_URL = 0\nKIND_DETAIL_URL = 1\n\n"
  },
  {
    "path": "cores/downloaders.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport time\nimport requests\nimport platform\nfrom random import sample\nfrom pyvirtualdisplay import Display\nfrom selenium import webdriver\nfrom selenium.webdriver.common.proxy import Proxy, ProxyType\nfrom django.conf import settings\n\nimport logging\nlogger = logging.getLogger()\nCRAWLER_CONFIG = settings.CRAWLER_CONFIG\n\n\nclass RequestsDownloaderBackend(object):\n    \"\"\"\n    使用requests直接访问\n    \"\"\"\n    headers = [\n        {\n            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'\n        }\n    ]\n\n    def __init__(self, proxy=None):\n        self.proxy = proxy\n\n    def format_proxies(self):\n        p = self.proxy\n        if self.proxy:\n            if p.user:\n                data = 'http://%s:%s@%s:%s' % (p.user, p.password, p.host, p.port)\n            else:\n                data = 'http://%s:%s' % (p.host, p.port)\n            return {\n                \"http\": data\n            }\n        else:\n            return None\n\n    def download(self, url):\n        header = sample(self.headers, 1)[0]\n        proxies = self.format_proxies()\n        #print url\n        if isinstance(url, basestring):\n            rsp = requests.get(url, headers=header, proxies=proxies)\n            rsp.close()\n            rsp.encoding = rsp.apparent_encoding\n            return rsp.text\n        elif isinstance(url, dict):\n            link, method, data, data_type = url.get('url'), url.get('method'), url.get('data'), url.get('dataType')\n            req = {'GET': requests.get, 'POST': requests.post}.get(method)\n            if method == 'GET':\n                rsp = req(link, params=data, headers=header, proxies=proxies)\n            elif method == 'POST':\n                rsp = req(link, data=data, headers=header, proxies=proxies)\n            rsp.close()\n            rsp.encoding = rsp.apparent_encoding\n            if data_type == 'json':\n                return rsp.json()\n            else:\n                return rsp.text\n\n\nclass SeleniumDownloaderBackend(object):\n    \"\"\"\n    使用Selenium模拟浏览器访问\n    \"\"\"\n    headers = [\n        {\n            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'\n        }\n    ]\n\n    def __init__(self, proxy=None):\n        # 设置代理\n        self.proxy = proxy\n\n    def __enter__(self):\n        # 打开界面\n        self.display = self.get_display()\n        #  打开浏览器\n        self.browser = self.get_browser(self.proxy)\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        # 关闭浏览器\n        try:\n            if self.browser:\n                self.browser.delete_all_cookies()\n                self.browser.quit()\n        except Exception as e:\n            logging.exception(e)\n        # 关闭界面\n        try:\n            # 关闭浏览器,关闭窗口\n            self.display and self.display.stop()\n        except Exception as e:\n            logging.exception(e)\n\n    def get_display(self):\n        if platform.system() != 'Darwin':\n            # 不是mac系统, 启动窗口\n            display = Display(visible=0, size=(1024, 768))\n            display.start()\n        else:\n            display = None\n        return display\n\n    def get_browser(self, proxy):\n        # 启动浏览器\n        # 禁止加载image\n        firefox_profile = webdriver.FirefoxProfile()\n        #firefox_profile.set_preference('permissions.default.stylesheet', 2)\n        #firefox_profile.set_preference('permissions.default.image', 2)\n        #firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')\n        # 代理\n        if proxy and proxy.is_valid():\n            myProxy = '%s:%s' % (proxy.host, proxy.port)\n            ff_proxy = Proxy({\n                'proxyType': ProxyType.MANUAL,\n                'httpProxy': myProxy,\n                'ftpProxy': myProxy,\n                'sslProxy': myProxy,\n            'noProxy':''})\n\n            browser = webdriver.Firefox(firefox_profile=firefox_profile, proxy=ff_proxy)\n        else:\n            browser = webdriver.Firefox(firefox_profile=firefox_profile)\n\n        return browser\n\n    def download(self, url):\n        browser = self.browser\n        # 访问首页, 输入wchatid, 点击查询\n        browser.get(url)\n        time.sleep(3)\n        js = \"\"\"\n            return document.documentElement.innerHTML;\n        \"\"\"\n        body = browser.execute_script(js)\n        return body\n\n\nclass BrowserDownloaderBackend(object):\n    def download(self):\n        pass\n"
  },
  {
    "path": "cores/extractors.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nfrom abc import ABCMeta\nfrom abc import abstractmethod\nimport requests\nimport oss2\nfrom oss2.exceptions import NotFound\nfrom copy import copy\nfrom hashlib import md5\nfrom lxml import etree\nfrom io import StringIO\nfrom django.conf import settings\nimport logging\nlogger = logging.getLogger()\n\n\nOSS2_CONF = settings.OSS2_CONFIG\nBUCKET = None\n\n\ndef get_bucket():\n    global BUCKET\n    if not BUCKET:\n        auth = oss2.Auth(OSS2_CONF['ACCESS_KEY_ID'], OSS2_CONF['ACCESS_KEY_SECRET'])\n        BUCKET = oss2.Bucket(auth, 'http://%s' % OSS2_CONF['BUCKET_DOMAIN'], OSS2_CONF['BUCKET_NAME'])\n\n    return BUCKET\n\n\ndef download_to_oss(url, path, timeout=3600):\n    r = requests.get(url, timeout=timeout)\n    r.close()\n    key = path + md5(r.content).hexdigest()\n    bucket = get_bucket()\n    try:\n        bucket.head_object(key)\n    except NotFound as e:\n        logging.exception(e)\n        bucket.put_object(key, r, headers={'Content-Type': r.headers.get('Content-Type', '')})\n\n    return 'http://%s/%s' % (OSS2_CONF[\"CDN_DOMAIN\"], key)\n\n\nclass BaseExtractor(object):\n    __metaclass__ = ABCMeta\n\n    @abstractmethod\n    def __init__(self):\n        pass\n\n    @abstractmethod\n    def extract(self):\n        pass\n\n\nclass ImageExtractor(BaseExtractor):\n    def __init__(self, data):\n        \"\"\" data 是图片url,或者图片url的列表,或者包含img标签的内容\n        :param data:\n        :return: 如果是url,返回新的url; 如果是列表,返回新的url列表\n        \"\"\"\n        self.data = data\n\n    def extract(self):\n        d = self.data\n        res = None\n        if not d:\n            return d\n        elif isinstance(d, basestring):\n            if d.startswith('http'):\n                ## 内容是图片地址\n                res = download_to_oss(d, OSS2_CONF[\"IMAGES_PATH\"], timeout=120)\n            else:\n                ## 内容是包含图片的文字\n                htmlparser = etree.HTMLParser()\n                tree = etree.parse(StringIO(d), htmlparser)\n                # 找出所有图片src\n                srcs = tree.xpath(\"//img[starts-with(@src,'http')]/@src\")\n                data_srcs = tree.xpath(\"//img[starts-with(@data-src,'http')]/@data-src\")\n                srcs = list(set(srcs + data_srcs))\n                # 下载并传到OSS中\n                new_srcs = [download_to_oss(item, OSS2_CONF[\"IMAGES_PATH\"], timeout=120) for item in srcs]\n                # 替换掉原文中的图片src\n                res = self.replace_all(d, srcs, new_srcs)\n        elif isinstance(d, list):\n            res = [download_to_oss(item, OSS2_CONF[\"IMAGES_PATH\"], timeout=120) for item in d]\n\n        return res\n\n\n    def replace_all(self, content, srcs, new_srcs):\n        \"\"\" 将content中的srcs全部替换成new_srcs\n        \"\"\"\n        replaces = zip(srcs, new_srcs)\n        for src, new_src in replaces:\n            content = content.replace(src.split('?')[0], new_src)\n        return content\n\n\nclass VideoExtractor(BaseExtractor):\n    def __init__(self, data):\n        \"\"\" data 是视频url,或者视频url的列表\n        :param data:\n        :return: 如果是url,返回新的url; 如果是列表,返回新的url列表\n        \"\"\"\n        self.data = data\n\n    def extract(self):\n        d = self.data\n        new_url = None\n        if not d:\n            return d\n        elif isinstance(d, basestring):\n            new_url = download_to_oss(d, OSS2_CONF[\"VIDEOS_PATH\"])\n        elif isinstance(d, list):\n            new_url = [download_to_oss(item, OSS2_CONF[\"VIDEOS_PATH\"]) for item in d]\n\n        return new_url\n\n\nclass XPathExtractor(BaseExtractor):\n    def __init__(self, content, rule):\n        htmlparser = etree.HTMLParser()\n        self.tree = etree.parse(StringIO(content), htmlparser)\n        self.rule = rule\n\n    def extract(self):\n        return self.tree.xpath(self.rule)\n\n\nclass PythonExtractor(BaseExtractor):\n    def __init__(self, code, in_val, context):\n        self.code = code\n        self.in_val = in_val\n        self.context = copy(context)\n        self.context.update({'in_val': in_val})\n\n    def extract(self):\n        res = self.in_val\n        g, l = {}, self.context\n        try:\n            exec(self.code, g, l)\n            res = l[\"out_val\"]\n        except Exception as e:\n            logger.exception(e)\n        finally:\n            return res\n"
  },
  {
    "path": "cores/migrations/0001_initial.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport jsonfield.fields\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n    ]\n\n    operations = [\n        migrations.CreateModel(\n            name='DetailRule',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('data', jsonfield.fields.JSONField(verbose_name=b'\\xe8\\xaf\\xa6\\xe6\\x83\\x85\\xe9\\xa1\\xb5\\xe8\\xa7\\x84\\xe5\\x88\\x99')),\n            ],\n            options={\n                'verbose_name_plural': '4 \\u8be6\\u60c5\\u9875\\u722c\\u53d6\\u89c4\\u5219',\n            },\n        ),\n        migrations.CreateModel(\n            name='IndexRule',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('name', models.CharField(max_length=100, verbose_name=b'\\xe6\\x9d\\xa5\\xe6\\xba\\x90')),\n                ('url', jsonfield.fields.JSONField(verbose_name=b'\\xe7\\xb4\\xa2\\xe5\\xbc\\x95url\\xe5\\x88\\x97\\xe8\\xa1\\xa8')),\n                ('list_rules', jsonfield.fields.JSONField(verbose_name=b'\\xe8\\x8e\\xb7\\xe5\\x8f\\x96\\xe5\\x88\\x97\\xe8\\xa1\\xa8\\xe9\\xa1\\xb9\\xe7\\x9a\\x84\\xe8\\xa7\\x84\\xe5\\x88\\x99')),\n                ('next_url_rules', jsonfield.fields.JSONField(default=[], verbose_name=b'\\xe4\\xb8\\x8b\\xe4\\xb8\\x80\\xe9\\xa1\\xb5\\xe7\\xb4\\xa2\\xe5\\xbc\\x95\\xe7\\x9a\\x84\\xe8\\xa7\\x84\\xe5\\x88\\x99\\xe5\\x88\\x97\\xe8\\xa1\\xa8', blank=True)),\n                ('frequency', models.IntegerField(default=60, verbose_name=b'\\xe7\\x88\\xac\\xe5\\x8f\\x96\\xe9\\xa2\\x91\\xe7\\x8e\\x87,\\xe5\\x8d\\x95\\xe4\\xbd\\x8d\\xe7\\xa7\\x92')),\n                ('update_time', models.DateTimeField(auto_now=True, verbose_name=b'\\xe6\\x9b\\xb4\\xe6\\x96\\xb0\\xe6\\x97\\xb6\\xe9\\x97\\xb4')),\n                ('next_crawl_time', models.DateTimeField(verbose_name=b'\\xe4\\xb8\\x8b\\xe6\\xac\\xa1\\xe7\\x88\\xac\\xe5\\x8f\\x96\\xe6\\x97\\xb6\\xe9\\x97\\xb4')),\n                ('fresh_pages', models.IntegerField(default=2, verbose_name=b'\\xe7\\x88\\xac\\xe5\\x8f\\x96\\xe9\\xa1\\xb5\\xe9\\x9d\\xa2\\xe6\\x95\\xb0')),\n                ('status', models.IntegerField(default=1, verbose_name=b'\\xe6\\x98\\xaf\\xe5\\x90\\xa6\\xe5\\x90\\xaf\\xe7\\x94\\xa8', choices=[(1, b'\\xe5\\x90\\xaf\\xe7\\x94\\xa8'), (2, b'\\xe7\\xa6\\x81\\xe7\\x94\\xa8')])),\n            ],\n            options={\n                'verbose_name_plural': '3 \\u7d22\\u5f15\\u548c\\u5217\\u8868\\u89c4\\u5219',\n            },\n        ),\n        migrations.CreateModel(\n            name='Proxy',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('kind', models.IntegerField(default=1, verbose_name=b'\\xe4\\xbb\\xa3\\xe7\\x90\\x86\\xe7\\xb1\\xbb\\xe5\\x9e\\x8b', choices=[(0, b'\\xe9\\x80\\x8f\\xe6\\x98\\x8e\\xe4\\xbb\\xa3\\xe7\\x90\\x86'), (1, b'\\xe9\\xab\\x98\\xe5\\xba\\xa6\\xe5\\x8c\\xbf\\xe5\\x90\\x8d')])),\n                ('user', models.CharField(default=b'', max_length=100, blank=True)),\n                ('password', models.CharField(default=b'', max_length=100, blank=True)),\n                ('host', models.CharField(max_length=100)),\n                ('port', models.IntegerField(default=80)),\n                ('address', models.CharField(default=b'', max_length=100, verbose_name=b'\\xe5\\x9c\\xb0\\xe7\\x90\\x86\\xe4\\xbd\\x8d\\xe7\\xbd\\xae', blank=True)),\n                ('speed', models.IntegerField(default=0, verbose_name=b'\\xe8\\xbf\\x9e\\xe6\\x8e\\xa5\\xe9\\x80\\x9f\\xe5\\xba\\xa6(ms)')),\n                ('status', models.IntegerField(default=0, verbose_name=b'\\xe7\\x8a\\xb6\\xe6\\x80\\x81', choices=[(0, b'\\xe6\\x9c\\xaa\\xe6\\xa3\\x80\\xe6\\xb5\\x8b'), (1, b'\\xe6\\xa3\\x80\\xe6\\xb5\\x8b\\xe6\\x88\\x90\\xe5\\x8a\\x9f'), (2, b'\\xe6\\xa3\\x80\\xe6\\xb5\\x8b\\xe5\\xa4\\xb1\\xe8\\xb4\\xa5')])),\n                ('retry', models.IntegerField(default=0, verbose_name=b'\\xe5\\xb0\\x9d\\xe8\\xaf\\x95\\xe6\\xac\\xa1\\xe6\\x95\\xb0')),\n            ],\n            options={\n                'verbose_name_plural': '5 \\u8bbf\\u95ee\\u4ee3\\u7406',\n            },\n        ),\n        migrations.CreateModel(\n            name='Seed',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('name', models.CharField(max_length=100, verbose_name=b'\\xe6\\xa8\\xa1\\xe6\\x9d\\xbf\\xe5\\x90\\x8d\\xe7\\xa7\\xb0')),\n                ('desc', models.TextField(verbose_name=b'\\xe7\\xae\\x80\\xe4\\xbb\\x8b')),\n                ('data', jsonfield.fields.JSONField(default=[], verbose_name=b'\\xe5\\xad\\x98\\xe5\\x82\\xa8\\xe6\\x95\\xb0\\xe6\\x8d\\xae\\xe9\\x85\\x8d\\xe7\\xbd\\xae', blank=True)),\n                ('weight', models.IntegerField(default=0, verbose_name=b'\\xe6\\x9d\\x83\\xe9\\x87\\x8d')),\n                ('status', models.IntegerField(default=1, verbose_name=b'\\xe6\\x98\\xaf\\xe5\\x90\\xa6\\xe5\\x90\\xaf\\xe7\\x94\\xa8', choices=[(1, b'\\xe5\\x90\\xaf\\xe7\\x94\\xa8'), (2, b'\\xe7\\xa6\\x81\\xe7\\x94\\xa8')])),\n            ],\n            options={\n                'verbose_name_plural': '1 \\u79cd\\u5b50',\n            },\n        ),\n        migrations.CreateModel(\n            name='Site',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('name', models.CharField(max_length=100, verbose_name=b'\\xe7\\xab\\x99\\xe7\\x82\\xb9\\xe5\\x90\\x8d\\xe7\\xa7\\xb0')),\n                ('domain', models.CharField(unique=True, max_length=100, verbose_name=b'\\xe7\\xab\\x99\\xe7\\x82\\xb9\\xe5\\x9f\\x9f\\xe5\\x90\\x8d')),\n                ('proxy', models.IntegerField(default=1, verbose_name=b'\\xe4\\xbb\\xa3\\xe7\\x90\\x86', choices=[(1, b'\\xe4\\xb8\\x8d\\xe4\\xbd\\xbf\\xe7\\x94\\xa8\\xe4\\xbb\\xa3\\xe7\\x90\\x86'), (2, b'\\xe5\\xad\\x98\\xe5\\x82\\xa8\\xe5\\x9c\\xa8Mysql\\xe6\\x95\\xb0\\xe6\\x8d\\xae\\xe5\\xba\\x93\\xe4\\xb8\\xad\\xe7\\x9a\\x84\\xe4\\xbb\\xa3\\xe7\\x90\\x86')])),\n                ('browser', models.IntegerField(default=1, verbose_name=b'\\xe6\\xb5\\x8f\\xe8\\xa7\\x88\\xe5\\x99\\xa8\\xe5\\xa3\\xb3', choices=[(1, b'\\xe4\\xb8\\x8d\\xe4\\xbd\\xbf\\xe7\\x94\\xa8\\xe6\\xb5\\x8f\\xe8\\xa7\\x88\\xe5\\x99\\xa8\\xe5\\xa3\\xb3'), (2, b'\\xe6\\x99\\xae\\xe9\\x80\\x9a\\xe6\\xb5\\x8f\\xe8\\xa7\\x88\\xe5\\x99\\xa8')])),\n                ('limit_speed', models.IntegerField(default=100, verbose_name=b'\\xe8\\xae\\xbf\\xe9\\x97\\xae\\xe9\\x97\\xb4\\xe9\\x9a\\x94(\\xe6\\xaf\\xab\\xe7\\xa7\\x92)')),\n                ('status', models.IntegerField(default=1, verbose_name=b'\\xe6\\x98\\xaf\\xe5\\x90\\xa6\\xe5\\x90\\xaf\\xe7\\x94\\xa8', choices=[(1, b'\\xe5\\x90\\xaf\\xe7\\x94\\xa8'), (2, b'\\xe7\\xa6\\x81\\xe7\\x94\\xa8')])),\n            ],\n            options={\n                'verbose_name_plural': '2 \\u7ad9\\u70b9\\u914d\\u7f6e',\n            },\n        ),\n        migrations.AddField(\n            model_name='indexrule',\n            name='seed',\n            field=models.ForeignKey(to='cores.Seed'),\n        ),\n        migrations.AddField(\n            model_name='indexrule',\n            name='site',\n            field=models.ForeignKey(to='cores.Site'),\n        ),\n        migrations.AddField(\n            model_name='detailrule',\n            name='index_rule',\n            field=models.ForeignKey(to='cores.IndexRule'),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0002_detailrule_exclude.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport jsonfield.fields\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0001_initial'),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name='detailrule',\n            name='exclude',\n            field=jsonfield.fields.JSONField(verbose_name=b'\\xe6\\x8e\\x92\\xe9\\x99\\xa4\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0003_auto_20160131_2226.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport jsonfield.fields\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0002_detailrule_exclude'),\n    ]\n\n    operations = [\n        migrations.AlterField(\n            model_name='detailrule',\n            name='exclude',\n            field=jsonfield.fields.JSONField(default=[], verbose_name=b'\\xe6\\x8e\\x92\\xe9\\x99\\xa4\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0004_auto_20160201_1035.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0003_auto_20160131_2226'),\n    ]\n\n    operations = [\n        migrations.DeleteModel(\n            name='Proxy',\n        ),\n        migrations.AlterModelOptions(\n            name='detailrule',\n            options={'verbose_name_plural': '3 \\u8be6\\u60c5\\u9875\\u722c\\u53d6\\u89c4\\u5219'},\n        ),\n        migrations.AlterModelOptions(\n            name='indexrule',\n            options={'verbose_name_plural': '2 \\u7d22\\u5f15\\u548c\\u5217\\u8868\\u89c4\\u5219'},\n        ),\n        migrations.AlterField(\n            model_name='indexrule',\n            name='site',\n            field=models.ForeignKey(to='configs.Site'),\n        ),\n        migrations.DeleteModel(\n            name='Site',\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0005_detailrule_multi.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport jsonfield.fields\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0004_auto_20160201_1035'),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name='detailrule',\n            name='multi',\n            field=jsonfield.fields.JSONField(verbose_name=b'\\xe5\\xa4\\x9a\\xe8\\xaf\\xa6\\xe6\\x83\\x85\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0006_detailrule_fresh_time.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0005_detailrule_multi'),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name='detailrule',\n            name='fresh_time',\n            field=models.IntegerField(default=2592000, verbose_name=b'\\xe6\\x96\\xb0\\xe9\\xb2\\x9c\\xe5\\xba\\xa6\\xe7\\xbb\\xb4\\xe6\\x8c\\x81\\xe6\\x97\\xb6\\xe9\\x97\\xb4(\\xe7\\xa7\\x92),\\xe9\\xbb\\x98\\xe8\\xae\\xa4\\xe4\\xb8\\x80\\xe4\\xb8\\xaa\\xe6\\x9c\\x88'),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0007_detailrule_multi_unique.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport jsonfield.fields\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0006_detailrule_fresh_time'),\n    ]\n\n    operations = [\n        migrations.AddField(\n            model_name='detailrule',\n            name='multi_unique',\n            field=jsonfield.fields.JSONField(verbose_name=b'\\xe5\\xa4\\x9a\\xe8\\xaf\\xa6\\xe6\\x83\\x85\\xe5\\x94\\xaf\\xe4\\xb8\\x80\\xe9\\x94\\xae\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/0008_auto_20160407_1426.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\nimport jsonfield.fields\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n        ('cores', '0007_detailrule_multi_unique'),\n    ]\n\n    operations = [\n        migrations.AlterField(\n            model_name='detailrule',\n            name='multi',\n            field=jsonfield.fields.JSONField(default=[], verbose_name=b'\\xe5\\xa4\\x9a\\xe8\\xaf\\xa6\\xe6\\x83\\x85\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n        migrations.AlterField(\n            model_name='detailrule',\n            name='multi_unique',\n            field=jsonfield.fields.JSONField(default=[], verbose_name=b'\\xe5\\xa4\\x9a\\xe8\\xaf\\xa6\\xe6\\x83\\x85\\xe5\\x94\\xaf\\xe4\\xb8\\x80\\xe9\\x94\\xae\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n        migrations.AlterField(\n            model_name='indexrule',\n            name='list_rules',\n            field=jsonfield.fields.JSONField(default=[], verbose_name=b'\\xe8\\x8e\\xb7\\xe5\\x8f\\x96\\xe5\\x88\\x97\\xe8\\xa1\\xa8\\xe9\\xa1\\xb9\\xe7\\x9a\\x84\\xe8\\xa7\\x84\\xe5\\x88\\x99', blank=True),\n        ),\n    ]\n"
  },
  {
    "path": "cores/migrations/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n"
  },
  {
    "path": "cores/models.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport collections\nfrom django.db import models\nfrom jsonfield import JSONField\nfrom configs.models import Site\n\n\nclass Seed(models.Model):\n    STATUS_ENABLE = 1\n    STATUS_DISABLE = 2\n    STATUS_CHOICES = (\n        (STATUS_ENABLE, '启用'),\n        (STATUS_DISABLE, '禁用')\n    )\n    name = models.CharField(max_length=100, verbose_name='模板名称')\n    desc = models.TextField(verbose_name='简介')\n    data = JSONField(verbose_name='存储数据配置', load_kwargs={'object_pairs_hook': collections.OrderedDict},\n                     blank=True, default=[])\n    weight = models.IntegerField(default=0, verbose_name='权重')\n    status = models.IntegerField(default=STATUS_ENABLE, choices=STATUS_CHOICES, verbose_name=\"是否启用\")\n\n    def __unicode__(self):\n        return self.name\n\n    class Meta:\n        verbose_name_plural = \"1 种子\"\n\n\nclass IndexRule(models.Model):\n    STATUS_ENABLE = 1\n    STATUS_DISABLE = 2\n    STATUS_CHOICES = (\n        (STATUS_ENABLE, '启用'),\n        (STATUS_DISABLE, '禁用')\n    )\n    seed = models.ForeignKey(Seed)\n    name = models.CharField(max_length=100, verbose_name='来源')\n    site = models.ForeignKey(Site)\n    url = JSONField(verbose_name='索引url列表', load_kwargs={'object_pairs_hook': collections.OrderedDict})\n    list_rules = JSONField(verbose_name='获取列表项的规则', load_kwargs={'object_pairs_hook': collections.OrderedDict},\n                           blank=True, default=[])\n    next_url_rules = JSONField(verbose_name='下一页索引的规则列表',\n                               load_kwargs={'object_pairs_hook': collections.OrderedDict}, blank=True, default=[])\n    frequency = models.IntegerField(default=60, verbose_name='爬取频率,单位秒')\n    update_time = models.DateTimeField(auto_now=True, verbose_name='更新时间')\n    next_crawl_time = models.DateTimeField(verbose_name='下次爬取时间')\n    fresh_pages = models.IntegerField(default=2, verbose_name='爬取页面数')\n    status = models.IntegerField(default=STATUS_ENABLE, choices=STATUS_CHOICES, verbose_name=\"是否启用\")\n\n    def __unicode__(self):\n        return self.name\n\n    class Meta:\n        verbose_name_plural = \"2 索引和列表规则\"\n\n\nclass DetailRule(models.Model):\n    index_rule = models.ForeignKey(IndexRule)\n    data = JSONField(verbose_name='详情页规则', load_kwargs={'object_pairs_hook': collections.OrderedDict})\n    exclude = JSONField(verbose_name='排除规则', load_kwargs={'object_pairs_hook': collections.OrderedDict},\n                        blank=True, default=[])\n    multi = JSONField(verbose_name='多详情规则', load_kwargs={'object_pairs_hook': collections.OrderedDict},\n                        blank=True, default=[])\n    multi_unique = JSONField(verbose_name='多详情唯一键规则', load_kwargs={'object_pairs_hook': collections.OrderedDict},\n                        blank=True, default=[])\n    fresh_time = models.IntegerField(default=2592000, verbose_name='新鲜度维持时间(秒),默认一个月')\n\n    def __unicode__(self):\n        return '%s, %s' % (self.index_rule.name, self.index_rule.url)\n\n    class Meta:\n        verbose_name_plural = \"3 详情页爬取规则\"\n\n"
  },
  {
    "path": "cores/processors.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nfrom abc import ABCMeta\nfrom abc import abstractmethod\n\nimport _mysql\nimport torndb\nfrom datetime import datetime\nfrom django.utils.encoding import smart_str, smart_unicode\nfrom django.db import models\nfrom sqlalchemy import create_engine\nfrom sqlalchemy.orm import sessionmaker\nfrom cores.util import get_uniqueid\nimport logging\nlogger = logging.getLogger()\n\n\nclass BaseProcessorBackend(object):\n    __metaclass__ = ABCMeta\n\n    @abstractmethod\n    def __init__(self):\n        pass\n\n    @abstractmethod\n    def process(self, data):\n        pass\n\n\nclass MysqlBackend(BaseProcessorBackend):\n    @property\n    def _table(self):\n        return self.db_table\n\n    def __init__(self, config):\n        db_config = config['database']\n        self.db = torndb.Connection(\n            host=db_config.get(\"host\"),\n            database=db_config.get(\"name\"),\n            user=db_config.get(\"user\"),\n            password=db_config.get(\"password\"),\n            charset=db_config.get(\"charset\")\n        )\n        self.db_table = config['table']\n        self.defaults = config['defaults']\n        self.unique_key = config[\"unique_key\"]\n\n    def process(self, params, filters=None):\n        # 加上默认值\n        data = params.copy()\n        for k, v in self.defaults.iteritems():\n            data.setdefault(k, v)\n\n        # 设置唯一键\n        unique_value = ':'.join(['%s' % data[k] for k in self.unique_key])\n        data['uniqueid'] = get_uniqueid(unique_value)\n        data['update_time'] = str(datetime.now())\n        # 清除数据\n        data.pop('seed_id', None)\n        data.pop('rule_id', None)\n        data.pop('detail_multi', None)\n        # 更新或插入数据库\n        #print data\n        try:\n            # try update\n            affected = self.update(data, {'uniqueid': data['uniqueid']})\n            if affected == 0:\n                # row not exists, try create\n                data['create_time'] = str(datetime.now())\n                self.create(data)\n        except Exception as e:\n            logger.exception(e)\n        finally:\n            logger.debug(data['url'])\n\n    def create(self, params):\n        keys = params.keys()\n        values = params.values()\n        cols = ','.join(map(lambda s:str(s), keys))\n        placeholder = ','.join(['%s' for _ in range(len(keys))])\n        sql = 'INSERT INTO ' + self._table + ' (' + cols + ') ' + ' VALUES (' + placeholder + ');'\n\n        return self.db.insert(sql, *values)\n\n    def update(self, params, filters=None):\n        set_keys = params.keys()\n        values = params.values()\n        set_placeholder = ', '.join(['`'+item+'`=%s' for item in set_keys])\n        sql = 'UPDATE ' + self._table + ' SET ' + set_placeholder\n        if filters:\n            where_keys = filters.keys()\n            where_values = filters.values()\n            where_placeholder = ', '.join(['`'+item+'`=%s' for item in where_keys])\n            sql = sql + ' WHERE ' + where_placeholder\n            values += where_values\n        return self.db.update(sql, *values)\n\n    @staticmethod\n    def dict_to_sql(params, sep=', '):\n        cols = []\n        for k, v in params.iteritems():\n            k2 = _mysql.escape_string(str(k))\n            if v is None:\n                col = '`%s`=NULL' % k2\n            elif isinstance(v, (int, long, float)):\n                col = '`%s`=%s' % (k2, v)\n            elif isinstance(v, unicode):\n                v2 =  v.encode('utf-8')\n                col = '`%s`=\"%s\"' % (k2, smart_unicode(_mysql.escape_string(smart_str(v))))\n            else:\n                col = '`%s`=\"%s\"' % (k2, v)\n            cols.append(col)\n        return smart_unicode(sep.join(cols))\n\n    @staticmethod\n    def fields_to_sql(fields):\n        f2 = [\"`%s`\" % item if item != \"*\" else \"*\" for item in fields]\n        return _mysql.escape_string(', '.join(f2))\n\n\nclass DjangoModelBackend(BaseProcessorBackend):\n    def __init__(self, config):\n        self.defaults = config['defaults']\n        self.unique_key = config[\"unique_key\"]\n        modelstr = config[\"DjangoModel\"]\n        modelclass = models.get_model(modelstr.split('.')[0], modelstr.split('.')[-1])\n        self._class = modelclass\n\n    def process(self, params):\n        C = self._class\n        params['uniqueid'] = get_uniqueid('%s:%s' % (params['wechat_id'], params['title']))\n\n        # 加上默认值\n        data = params.copy()\n        for k, v in self.defaults.iteritems():\n            data.setdefault(k, v)\n\n        # 设置唯一键\n        unique_value = ':'.join(['%s' % data[k] for k in self.unique_key])\n        data['uniqueid'] = get_uniqueid(unique_value)\n        data['update_time'] = str(datetime.now())\n        # 清除数据\n        data.pop('seed_id', None)\n        data.pop('rule_id', None)\n        data.pop('detail_multi', None)\n        # 更新或插入数据库\n        try:\n            C.objects.update_or_create(uniqueid=data['uniqueid'], defaults=data)\n        except Exception as e:\n            logger.exception(e)\n        finally:\n            logger.debug(data['url'])\n\n\n\nclass MongoDBBackend(BaseProcessorBackend):\n    pass\n\n\nclass PostgresBackend(BaseProcessorBackend):\n    @property\n    def _table(self):\n        return self.db_table\n\n    def __init__(self, config):\n        db_config = config['database']\n        conn_url = \"postgresql://%s:%s@%s/%s\" % (\n            db_config.get(\"user\"), db_config.get(\"password\"),\n            db_config.get(\"host\"), db_config.get(\"name\")\n        )\n        self.engine = create_engine(conn_url)\n        self.db_table = config['table']\n        self.defaults = config['defaults']\n        self.unique_key = config[\"unique_key\"]\n\n\n    def process(self, params, filters=None):\n        # 加上默认值\n        data = params.copy()\n        for k, v in self.defaults.iteritems():\n            data.setdefault(k, v)\n\n        # 设置唯一键\n        unique_value = ':'.join(['%s' % data[k] for k in self.unique_key])\n        data['uniqueid'] = get_uniqueid(unique_value)\n        data['update_time'] = str(datetime.now())\n        # 清除数据\n        data.pop('seed_id', None)\n        data.pop('rule_id', None)\n        data.pop('detail_multi', None)\n        # 更新或插入数据库\n        #print data\n        try:\n            # try update\n            affected = self.update(data, {'uniqueid': data['uniqueid']})\n            if affected == 0:\n                # row not exists, try create\n                data['create_time'] = str(datetime.now())\n                self.create(data)\n        except Exception as e:\n            logger.exception(e)\n        finally:\n            logger.debug(data['url'])\n\n    def create(self, params):\n        keys = params.keys()\n        values = params.values()\n        cols = ','.join(map(lambda s:str(s), keys))\n        placeholder = ','.join(['%s' for _ in range(len(keys))])\n        sql = 'INSERT INTO ' + self._table + ' (' + cols + ') ' + ' VALUES (' + placeholder + ');'\n\n        with self.engine.connect() as con:\n            res = con.execute(sql, *values)\n\n    def update(self, params, filters=None):\n        set_keys = params.keys()\n        values = params.values()\n        set_placeholder = ', '.join([item+'=%s' for item in set_keys])\n        sql = 'UPDATE ' + self._table + ' SET ' + set_placeholder\n        if filters:\n            where_keys = filters.keys()\n            where_values = filters.values()\n            where_placeholder = ', '.join([item+'=%s' for item in where_keys])\n            sql = sql + ' WHERE ' + where_placeholder\n            values += where_values\n        rowcount = 0\n        with self.engine.connect() as con:\n            res = con.execute(sql, *values)\n            rowcount = res.rowcount\n        return rowcount\n\n    @staticmethod\n    def dict_to_sql(params, sep=', '):\n        cols = []\n        for k, v in params.iteritems():\n            k2 = _mysql.escape_string(str(k))\n            if v is None:\n                col = '%s=NULL' % k2\n            elif isinstance(v, (int, long, float)):\n                col = '%s=%s' % (k2, v)\n            elif isinstance(v, unicode):\n                v2 =  v.encode('utf-8')\n                col = '%s=\"%s\"' % (k2, smart_unicode(_mysql.escape_string(smart_str(v))))\n            else:\n                col = '%s=\"%s\"' % (k2, v)\n            cols.append(col)\n        return smart_unicode(sep.join(cols))\n\n    @staticmethod\n    def fields_to_sql(fields):\n        f2 = [\"%s\" % item if item != \"*\" else \"*\" for item in fields]\n        return _mysql.escape_string(', '.join(f2))\n\n"
  },
  {
    "path": "cores/util.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport redis\nimport json\nfrom django.conf import settings\nfrom hashlib import md5\nREDIS_POOL = None\n\n\ndef get_redis_pool():\n    global REDIS_POOL\n    if not REDIS_POOL:\n        REDIS_POOL = redis.ConnectionPool(**settings.REDIS_OPTIONS)\n\n    return REDIS_POOL\n\n\ndef get_redis():\n    return redis.Redis(connection_pool=get_redis_pool())\n\n\ndef get_uniqueid(url):\n    link = get_link_from_url(url)\n    return md5(link).hexdigest()\n\n\ndef get_link_from_url(url):\n    if isinstance(url, basestring):\n        return url\n    elif isinstance(url, dict):\n        return json.dumps(url)\n\n"
  },
  {
    "path": "crontab",
    "content": "# 监控服务队列积压数\n*/1 * * * * cd /var/www/pythonzone/unicrawler; python ./manage.py monitor service >> /var/log/pythonzone/monitor_service.log 2>&1\n"
  },
  {
    "path": "manage.py",
    "content": "#!/usr/bin/env python\nimport os\nimport sys\n\nif __name__ == \"__main__\":\n    os.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"unicrawler.settings\")\n\n    from django.core.management import execute_from_command_line\n\n    execute_from_command_line(sys.argv)\n"
  },
  {
    "path": "monitors/__init__.py",
    "content": "default_app_config = 'monitors.apps.MonitorsAppConfig'"
  },
  {
    "path": "monitors/admin.py",
    "content": "from django.contrib import admin\nfrom .models import Service\n\n\nclass ServiceAdmin(admin.ModelAdmin):\n    list_display = ('id', 'scheduler', 'downloader', 'extractor', 'processor', 'create_time')\n\nadmin.site.register(Service, ServiceAdmin)"
  },
  {
    "path": "monitors/apps.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nfrom django.apps import AppConfig\n\n\nclass MonitorsAppConfig(AppConfig):\n    name = 'monitors'\n    verbose_name = u'3 爬虫监控'"
  },
  {
    "path": "monitors/management/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n"
  },
  {
    "path": "monitors/management/commands/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\n"
  },
  {
    "path": "monitors/management/commands/monitor.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport time\nfrom datetime import datetime\nfrom django.core.management.base import BaseCommand, CommandError\nfrom monitors.models import Service\nfrom cores.util import get_redis\nfrom django.conf import settings\nfrom cores.models import Seed, IndexRule\n\nclass Command(BaseCommand):\n    help = '获取监控数据'\n\n    def add_arguments(self, parser):\n        # Positional arguments\n        parser.add_argument('action', choices=['service', 'stats'], help='选择要监控的服务类型')\n\n    def handle(self, *args, **options):\n        if options[\"action\"] == 'service':\n            self.monitor_service()\n\n    def monitor_service(self):\n        conf = settings.CRAWLER_CONFIG\n        r = get_redis()\n        now = datetime.now().replace(second=0, microsecond=0)\n        pipe = r.pipeline()\n        result = pipe.llen(conf['downloader']).llen(conf['extractor']).llen(conf['processor']).execute()\n        scheduler = IndexRule.objects.filter(seed__status=Seed.STATUS_ENABLE, status=IndexRule.STATUS_ENABLE,\n                                             next_crawl_time__lte=now).count()\n        print result\n        Service.objects.create(\n            scheduler=scheduler,\n            downloader=result[0],\n            extractor=result[1],\n            processor=result[2],\n            create_time=now\n        )\n\n"
  },
  {
    "path": "monitors/migrations/0001_initial.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import unicode_literals\n\nfrom django.db import models, migrations\n\n\nclass Migration(migrations.Migration):\n\n    dependencies = [\n    ]\n\n    operations = [\n        migrations.CreateModel(\n            name='Service',\n            fields=[\n                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),\n                ('scheduler', models.IntegerField(default=0, verbose_name=b'\\xe6\\x9c\\xaa\\xe6\\x89\\xa7\\xe8\\xa1\\x8c\\xe7\\x9a\\x84\\xe8\\xae\\xa1\\xe5\\x88\\x92\\xe6\\x95\\xb0')),\n                ('downloader', models.IntegerField(default=0, verbose_name=b'\\xe6\\x9c\\xaa\\xe4\\xb8\\x8b\\xe8\\xbd\\xbd\\xe7\\x9a\\x84url\\xe6\\x95\\xb0')),\n                ('extractor', models.IntegerField(default=0, verbose_name=b'\\xe6\\x9c\\xaa\\xe6\\x8a\\xbd\\xe5\\x8f\\x96\\xe7\\x9a\\x84url\\xe6\\x95\\xb0')),\n                ('processor', models.IntegerField(default=0, verbose_name=b'\\xe6\\x9c\\xaa\\xe5\\x85\\xa5\\xe5\\xba\\x93\\xe7\\x9a\\x84\\xe8\\xae\\xb0\\xe5\\xbd\\x95\\xe6\\x95\\xb0')),\n                ('create_time', models.DateTimeField(unique=True, verbose_name=b'\\xe5\\x88\\x9b\\xe5\\xbb\\xba\\xe6\\x97\\xb6\\xe9\\x97\\xb4')),\n            ],\n            options={\n                'verbose_name_plural': '1 \\u670d\\u52a1\\u961f\\u5217\\u79ef\\u538b',\n            },\n        ),\n    ]\n"
  },
  {
    "path": "monitors/migrations/__init__.py",
    "content": ""
  },
  {
    "path": "monitors/models.py",
    "content": "# -*- coding: utf-8 -*-\n__author__ = 'yijingping'\nimport collections\nfrom django.db import models\nfrom jsonfield import JSONField\n\n\nclass Service(models.Model):\n    scheduler = models.IntegerField(default=0, verbose_name='未执行的计划数')\n    downloader = models.IntegerField(default=0, verbose_name='未下载的url数')\n    extractor = models.IntegerField(default=0, verbose_name='未抽取的url数')\n    processor = models.IntegerField(default=0, verbose_name='未入库的记录数')\n    create_time = models.DateTimeField(verbose_name='创建时间', unique=True)\n\n    def __unicode__(self):\n        return self.name\n\n    class Meta:\n        verbose_name_plural = \"1 服务队列积压\"\n\n\n"
  },
  {
    "path": "requirements.txt",
    "content": "Django==1.8.1\nMySQL-python==1.2.5\nrequests==2.7.0\nlxml==3.4.4\njsonfield==1.0.3\nhiredis==0.2.0\nredis==2.10.3\ntorndb==0.3\noss2==2.0.5\nselenium==2.52.0\nPyVirtualDisplay==0.1.5\npsycopg2==2.6.1\nsqlalchemy==1.1.4\n\n"
  },
  {
    "path": "supervisord.conf",
    "content": "[program:unicrawler.bowenpay.com]\ncommand=/bin/python /var/www/pythonzone/unicrawler/manage.py runserver 127.0.0.1:8889\numask=022\nuser=ripple\nstartsecs=0\nstopwaitsecs=0\nautostart=true\nautorestart=true\nstdout_logfile=/var/log/pythonzone/unicrawler.stdout.log\nstderr_logfile=/var/log/pythonzone/unicrawler.stderr.log\nstopsignal=KILL\nkillasgroup=true\n\n[program:unicrawler_scheduler]\ncommand=/bin/python /var/www/pythonzone/unicrawler/bin/scheduler.py\numask=022\nuser=ripple\nstartsecs=0\nstopwaitsecs=0\nautostart=true\nautorestart=true\nstdout_logfile=/var/log/pythonzone/unicrawler_scheduler.stdout.log\nstderr_logfile=/var/log/pythonzone/unicrawler_scheduler.stderr.log\nstopsignal=KILL\nkillasgroup=true\n\n[program:unicrawler_downloader]\ncommand=/bin/python /var/www/pythonzone/unicrawler/bin/downloader.py\numask=022\nuser=ripple\nstartsecs=0\nstopwaitsecs=0\nautostart=true\nautorestart=true\nstdout_logfile=/var/log/pythonzone/unicrawler_downloader.stdout.log\nstderr_logfile=/var/log/pythonzone/unicrawler_downloader.stderr.log\nstopsignal=KILL\nkillasgroup=true\nprocess_name=%(process_num)s\nnumprocs=4\n\n[program:unicrawler_extractor]\ncommand=/bin/python /var/www/pythonzone/unicrawler/bin/extractor.py\numask=022\nuser=ripple\nstartsecs=0\nstopwaitsecs=0\nautostart=true\nautorestart=true\nstdout_logfile=/var/log/pythonzone/unicrawler_extractor.stdout.log\nstderr_logfile=/var/log/pythonzone/unicrawler_extractor.stderr.log\nstopsignal=KILL\nkillasgroup=true\nprocess_name=%(process_num)s\nnumprocs=2\n\n[program:unicrawler_processor]\ncommand=/bin/python /var/www/pythonzone/unicrawler/bin/processor.py\numask=022\nuser=ripple\nstartsecs=0\nstopwaitsecs=0\nautostart=true\nautorestart=true\nstdout_logfile=/var/log/pythonzone/unicrawler_processor.stdout.log\nstderr_logfile=/var/log/pythonzone/unicrawler_processor.stderr.log\nstopsignal=KILL\nkillasgroup=true\n\n[program:unicrawler_checkproxies]\ncommand=/bin/python /var/www/pythonzone/unicrawler/manage.py checkproxies \numask=022\nuser=ripple\nstartsecs=0\nstopwaitsecs=0\nautostart=true\nautorestart=true\nstdout_logfile=/var/log/pythonzone/unicrawler_checkproxies.stdout.log\nstderr_logfile=/var/log/pythonzone/unicrawler_checkproxies.stderr.log\nstopsignal=KILL\nkillasgroup=true\n"
  },
  {
    "path": "unicrawler/__init__.py",
    "content": ""
  },
  {
    "path": "unicrawler/settings.py",
    "content": "\"\"\"\nDjango settings for unicrawler project.\n\nGenerated by 'django-admin startproject' using Django 1.8.1.\n\nFor more information on this file, see\nhttps://docs.djangoproject.com/en/1.8/topics/settings/\n\nFor the full list of settings and their values, see\nhttps://docs.djangoproject.com/en/1.8/ref/settings/\n\"\"\"\n\n# Build paths inside the project like this: os.path.join(BASE_DIR, ...)\nimport os\n\nBASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n\n\n# Quick-start development settings - unsuitable for production\n# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/\n\n# SECURITY WARNING: keep the secret key used in production secret!\nSECRET_KEY = 'bd+g*191h+l1=0e%32h_i8gyrk!v#3(kyxy7^$kq&w=p(q(h0)'\n\n# SECURITY WARNING: don't run with debug turned on in production!\nDEBUG = True\n\nALLOWED_HOSTS = []\n\n\n# Application definition\n\nINSTALLED_APPS = (\n    'django.contrib.admin',\n    'django.contrib.auth',\n    'django.contrib.contenttypes',\n    'django.contrib.sessions',\n    'django.contrib.messages',\n    'django.contrib.staticfiles',\n\n    'unicrawler',\n    'cores',\n    'configs',\n    'monitors'\n)\n\nMIDDLEWARE_CLASSES = (\n    'django.contrib.sessions.middleware.SessionMiddleware',\n    'django.middleware.common.CommonMiddleware',\n    'django.middleware.csrf.CsrfViewMiddleware',\n    'django.contrib.auth.middleware.AuthenticationMiddleware',\n    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',\n    'django.contrib.messages.middleware.MessageMiddleware',\n    'django.middleware.clickjacking.XFrameOptionsMiddleware',\n    'django.middleware.security.SecurityMiddleware',\n)\n\nROOT_URLCONF = 'unicrawler.urls'\n\nTEMPLATES = [\n    {\n        'BACKEND': 'django.template.backends.django.DjangoTemplates',\n        'DIRS': [os.path.join(BASE_DIR, 'templates')]\n        ,\n        'APP_DIRS': True,\n        'OPTIONS': {\n            'context_processors': [\n                'django.template.context_processors.debug',\n                'django.template.context_processors.request',\n                'django.contrib.auth.context_processors.auth',\n                'django.contrib.messages.context_processors.messages',\n            ],\n        },\n    },\n]\n\nWSGI_APPLICATION = 'unicrawler.wsgi.application'\n\n\n# Database\n# https://docs.djangoproject.com/en/1.8/ref/settings/#databases\n\nDATABASES = {\n    'default': {\n        'ENGINE': 'django.db.backends.mysql',\n        'HOST': '127.0.0.1',\n        'NAME': 'unicrawler',\n        'USER': 'root',\n        'PASSWORD': '123456',\n        'OPTIONS':{\n            'charset':'utf8mb4',\n        },\n    }\n}\n\n\n# Internationalization\n# https://docs.djangoproject.com/en/1.8/topics/i18n/\n\nLANGUAGE_CODE = 'zh-hans'\n\nTIME_ZONE = 'Asia/Shanghai'\n\nUSE_I18N = True\n\nUSE_L10N = True\n\nUSE_TZ = False\n\n\n# Static files (CSS, JavaScript, Images)\n# https://docs.djangoproject.com/en/1.8/howto/static-files/\n\nSTATIC_URL = '/static/'\n\nREDIS_OPTIONS = {\n    'host': 'localhost',\n    'port': 6379,\n    'password': '',\n    'db': 3\n}\n\nCRAWLER_CONFIG = {\n    #'scheduler': 'unicrawler:scheduler',\n    'downloader': 'unicrawler:downloader',\n    'extractor': 'unicrawler:extractor',\n    'processor': 'unicrawler:processor'\n}\n\nCRAWLER_DEBUG = False\n\nLOGGING = {\n    'version': 1,\n    'disable_existing_loggers': False,\n    'formatters': {\n        'verbose': {\n            'format': '%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s'\n        },\n        'simple': {\n            'format': '%(levelname)s %(message)s'\n        },\n    },\n    'handlers': {\n        'console': {\n            'class': 'logging.StreamHandler',\n            'formatter': 'verbose'\n        },\n    },\n    'loggers': {\n        'django': {\n            'handlers': ['console'],\n            'level': os.getenv('DJANGO_LOG_LEVEL', 'INFO'),\n        },\n        '': {\n            'handlers': ['console'],\n            'level': 'DEBUG',\n        },\n    },\n}\n\n# aliyun oss2\nOSS2_CONFIG = {\n    \"ACCESS_KEY_ID\": \"\",\n    \"ACCESS_KEY_SECRET\": \"\",\n    \"ENDPOINT\": \"\",\n    \"BUCKET_DOMAIN\": \"oss-cn-beijing.aliyuncs.com\",\n    \"BUCKET_NAME\": \"pythonzone\",\n    \"IMAGES_PATH\": \"images/\",\n    \"VIDEOS_PATH\": \"videos/\",\n    \"CDN_DOMAIN\": \"pystats.bowenpay.com\"\n}\n\n## Import local settings\ntry:\n    from local_settings import *\nexcept ImportError:\n    import sys, traceback\n    sys.stderr.write(\"Warning: Can't find the file 'local_settings.py' in the directory containing %r. It appears you've customized things.\\nYou'll have to run django-admin.py, passing it your settings module.\\n(If the file settings.py does indeed exist, it's causing an ImportError somehow.)\\n\" % __file__)\n    sys.stderr.write(\"\\nFor debugging purposes, the exception was:\\n\\n\")\n    traceback.print_exc()\n\n\n"
  },
  {
    "path": "unicrawler/urls.py",
    "content": "\"\"\"unicrawler URL Configuration\n\nThe `urlpatterns` list routes URLs to views. For more information please see:\n    https://docs.djangoproject.com/en/1.8/topics/http/urls/\nExamples:\nFunction views\n    1. Add an import:  from my_app import views\n    2. Add a URL to urlpatterns:  url(r'^$', views.home, name='home')\nClass-based views\n    1. Add an import:  from other_app.views import Home\n    2. Add a URL to urlpatterns:  url(r'^$', Home.as_view(), name='home')\nIncluding another URLconf\n    1. Add an import:  from blog import urls as blog_urls\n    2. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls))\n\"\"\"\nfrom django.conf.urls import include, url\nfrom django.contrib import admin\nfrom django.views.generic.base import RedirectView\n\nurlpatterns = [\n    url(r'^$', RedirectView.as_view(url='admin/', permanent=False)),\n    url(r'^admin/', include(admin.site.urls))\n\n]\n"
  },
  {
    "path": "unicrawler/wsgi.py",
    "content": "\"\"\"\nWSGI config for unicrawler project.\n\nIt exposes the WSGI callable as a module-level variable named ``application``.\n\nFor more information on this file, see\nhttps://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/\n\"\"\"\n\nimport os\n\nfrom django.core.wsgi import get_wsgi_application\n\nos.environ.setdefault(\"DJANGO_SETTINGS_MODULE\", \"unicrawler.settings\")\n\napplication = get_wsgi_application()\n"
  }
]