Repository: zhanghe06/news_spider Branch: master Commit: 9e29525a8bcb Files: 105 Total size: 206.8 KB Directory structure: gitextract_ti7etvkp/ ├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── apps/ │ ├── __init__.py │ ├── client_db.py │ └── client_rk.py ├── config/ │ ├── __init__.py │ └── default.py ├── db/ │ ├── data/ │ │ └── mysql.sql │ └── schema/ │ └── mysql.sql ├── docs/ │ ├── Architecture.md │ ├── Components/ │ │ ├── MariaDB.md │ │ ├── Redis.md │ │ ├── SeaweedFS.md │ │ └── Squid.md │ ├── README.md │ ├── SUMMARY.md │ ├── Spiders/ │ │ ├── README.md │ │ ├── Toutiao.md │ │ ├── Weibo.md │ │ └── Weixin.md │ └── book.json ├── env_default.sh ├── etc/ │ ├── scrapy.ini │ ├── scrapyd.ini │ ├── supervisord.conf │ ├── tasks.ini │ └── toutiao.ini ├── libs/ │ ├── __init__.py │ ├── counter.py │ ├── ft.py │ ├── optical_modem.py │ ├── redis_pub_sub.py │ ├── redis_queue.py │ ├── rk.py │ └── weed_fs.py ├── logs/ │ └── index.html ├── maps/ │ ├── __init__.py │ ├── channel.py │ └── platform.py ├── models/ │ ├── __init__.py │ └── news.py ├── news/ │ ├── __init__.py │ ├── items.py │ ├── middlewares/ │ │ ├── __init__.py │ │ ├── anti_spider.py │ │ ├── content_type.py │ │ ├── de_duplication_request.py │ │ ├── httpproxy.py │ │ └── useragent.py │ ├── middlewares.py │ ├── pipelines/ │ │ ├── __init__.py │ │ ├── de_duplication_request.py │ │ ├── de_duplication_store_mysql.py │ │ ├── exporter_csv.py │ │ ├── img_remote_to_local_fs.py │ │ └── store_mysql.py │ ├── pipelines.py │ ├── settings.py │ └── spiders/ │ ├── __init__.py │ ├── ip.py │ ├── toutiao_m.py │ ├── weibo.py │ └── weixin.py ├── requirements-py2.txt ├── requirements-py3.txt ├── scrapy.cfg ├── tasks/ │ ├── __init__.py │ ├── job_put_tasks.py │ ├── job_reboot_net_china_net.py │ ├── jobs_proxies.py │ ├── jobs_sogou.py │ ├── jobs_weixin.py │ ├── run_job_counter_clear.py │ ├── run_job_put_tasks_toutiao.py │ ├── run_job_put_tasks_weibo.py │ ├── run_job_put_tasks_weixin.py │ ├── run_job_reboot_net_china_net.py │ ├── run_job_sogou_cookies.py │ ├── run_job_weixin_cookies.py │ ├── run_jobs.py │ └── run_jobs_apscheduler.py ├── tests/ │ ├── __init__.py │ ├── test_date_time.py │ └── test_finger.py └── tools/ ├── __init__.py ├── anti_spider_sogou.py ├── anti_spider_weixin.py ├── char.py ├── cookies.py ├── date_time.py ├── duplicate.py ├── gen.py ├── img.py ├── import_task.py ├── net_status.py ├── proxies.py ├── scrapy_tasks.py ├── sys_monitor.py ├── toutiao_m.py ├── url.py ├── weibo.py └── weixin.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ [run] include = tests/* omit = __init__.py [report] exclude_lines = pragma: no cover def __repr__ def __str__ if self.debug: if settings.DEBUG except ImportError raise AssertionError raise NotImplementedError if 0: if __name__ == .__main__.: ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) *.py[cod] *.env .idea .DS_Store logs/* !logs/index.html .coverage htmlcov/ .coveralls.yml csv/* # toutiao news/spiders/toutiao.py tools/toutiao.py # middlewares news/middlewares/httpproxy_vps.py # config #config/* #!config/__init__.py #!config/default.py # #env_*.sh #!env_default.sh # gitbook docs/_book/* docs/node_modules docs/package-lock.json ================================================ FILE: .travis.yml 
================================================ sudo: no dist: trusty language: python python: - "2.7" - "3.6" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install -r requirements-py2.txt; fi - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install -r requirements-py3.txt; fi - pip install coveralls - pip install pyyaml # command to run tests script: - export PYTHONPATH=${PWD} - coverage run -a tests/test_date_time.py - coverage run -a tests/test_finger.py - coverage report after_success: # upload test report - coveralls ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 碎ping子 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ ## 新闻抓取 [![Build Status](https://travis-ci.org/zhanghe06/news_spider.svg?branch=master)](https://travis-ci.org/zhanghe06/news_spider) [![Coverage Status](https://coveralls.io/repos/github/zhanghe06/news_spider/badge.svg?branch=master)](https://coveralls.io/github/zhanghe06/news_spider?branch=master) ### 项目演示 服务依赖: - MariaDB - Redis - NodeJS 本项目依赖第三方验证码识别服务 更新配置 config/default.py 用户名和密码 ``` RK_CONFIG = { 'username': '******', 'password': '******', 'soft_id': '93676', 'soft_key': '5d0e00b196c244cb9d8413809c62f9d5', } # 斐斐打码 FF_CONFIG = { 'pd_id': '******', 'pd_key': '******', 'app_id': '312451', 'app_key': '5YuN+6isLserKBZti4hoaI6UR2N5UT2j', } ``` ```bash # python2 virtualenv news_spider.env # 创建虚拟环境 # python3 virtualenv news_spider.env -p python3 # 创建虚拟环境 source env_default.sh # 激活虚拟环境 pip install -r requirements-py2.txt # 安装环境依赖 # 开发环境 模拟单次抓取 python tasks/job_put_tasks.py wx # 初次创建任务 python tasks/jobs_sogou.py # 初次应对反爬 scrapy crawl weixin # 开启微信爬虫 # 生产环境 开启持续抓取 supervisord # 开启守护进程 supervisorctl start all # 开启工作进程 ``` - env_develop.sh # 开发环境 - env_product.sh # 生产环境 ### 项目创建过程记录 项目依赖明细 ```bash pip install requests pip install scrapy pip install sqlalchemy pip install mysqlclient pip install sqlacodegen==1.1.6 # 注意: 最新版 sqlacodegen==2.0 有bug pip install redis pip install PyExecJS pip install Pillow pip install psutil pip install schedule pip install future # 兼容py2、py3 pip install supervisor # 当前主版本3只支持py2,将来主版本4(未发布)会支持py3 ``` 因当前`supervisor`不支持`python3`,故在`requirements.txt`中将其去掉 由于任务调度`apscheduler`不支持Py3(其中的依赖`futures`不支持),这里采用`schedule` `scrapy`的依赖`cryptography`在`2.2.2`版本中有[安全性问题](https://nvd.nist.gov/vuln/detail/CVE-2018-10903), 强烈建议更新至`2.3`及以上版本, 
可以通过更新`scrapy`的方式升级 `scrapy`的依赖`parsel`使用了`functools`的`lru_cache`方法( python2 是`functools32`的`lru_cache`方法;`functools32`是`functools`的反向移植) Mac 系统环境依赖(mariadb) ```bash brew unlink mariadb brew install mariadb-connector-c ln -s /usr/local/opt/mariadb-connector-c/bin/mariadb_config /usr/local/bin/mysql_config # pip install MySQL-python pip install mysqlclient # 基于 MySQL-python 兼容py2、py3 rm /usr/local/bin/mysql_config brew unlink mariadb-connector-c brew link mariadb ``` CentOS 系统环境依赖 ```bash yum install gcc yum install mysql-devel yum install python-devel yum install epel-release yum install redis yum install nodejs ``` CentOS 安装 python3 环境(CentOS 默认是不带 python3 的) ```bash yum install python34 yum install python34-devel ``` CentOS 安装 pip & virtualenv & git & vim ```bash yum install python-pip pip install --upgrade pip pip install virtualenv yum install git yum install vim ``` 创建项目 ```bash scrapy startproject news . scrapy genspider weixin mp.weixin.qq.com ``` 启动蜘蛛 ```bash scrapy crawl weixin ``` 如需测试微博, 修改以下方法, 更改正确用户名和密码 tools/weibo.py ``` def get_login_data(): return { 'username': '******', 'password': '******' } ``` ### 蜘蛛调试(以微博为例) 1. 清除中间件去重缓存, 重置调试任务 ``` 127.0.0.1:6379> DEL "dup:weibo:0" (integer) 1 127.0.0.1:6379> DEL "scrapy:tasks_set:weibo" (integer) 1 127.0.0.1:6379> SADD "scrapy:tasks_set:weibo" 130 (integer) 1 127.0.0.1:6379> ``` 2. 清除调试蜘蛛存储数据 ```mysql DELETE FROM fetch_result WHERE platform_id=2; ``` 3. 启动调试蜘蛛 ```bash scrapy crawl weibo ``` ### 验证码识别 ~~http://www.ruokuai.com/~~ ~~http://wiki.ruokuai.com/~~ ~~价格类型:~~ ~~http://www.ruokuai.com/home/pricetype~~ 热心网友反映`若快`已经关闭, 接下来会支持`斐斐打码`, 敬请期待 斐斐打码开发文档 [http://docs.fateadm.com](http://docs.fateadm.com) ### 索引说明 联合索引, 注意顺序, 同时注意查询条件字段类型需要与索引字段类型一致 实测, 数据量8万记录以上, 如果没有命中索引, 查询会很痛苦 ### 项目说明 亮点: 1. 支持分布式, 每个蜘蛛抓取进程对应一个独立的抓取任务 2. 
采用订阅发布模型的观察者模式, 处理并发场景的验证码识别任务, 避免无效的识别

备注: `mysql`中`text`最大长度为65,535(2的16次方–1)

类型 | 表达式 | 最大字节长度(bytes) | 大致容量
---: | ---: | ---: | ---:
TinyText | 2的8次方–1 | 255 | 255B
Text | 2的16次方–1 | 65,535 | 64KB
MediumText | 2的24次方–1 | 16,777,215 | 16MB
LongText | 2的32次方–1 | 4,294,967,295 | 4GB

由于微信公众号文章标签过多, 长度超过`Text`的最大值, 故建议采用`MediumText`

### 特别说明

头条请求签名

- M端需要2个参数: as、cp
- PC端需要3个参数: as、cp、_signature

M端2个参数获取方法已公开, 参考蜘蛛 toutiao_m

~~PC端3个参数获取方法已破解, 由于公开之后会引起头条反爬机制更新, 故没有公开, 如有需要, 敬请私聊, 仅供学习, 谢绝商用~~

因M端已满足数据获取要求, 不再开源PC端签名破解

### TODO

微博反爬处理


================================================
FILE: apps/__init__.py
================================================
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: __init__.py.py
@time: 2018-02-10 17:33
"""


def func():
    pass


class Main(object):
    def __init__(self):
        pass


if __name__ == '__main__':
    pass


================================================
FILE: apps/client_db.py
================================================
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: client_db.py
@time: 2018-02-10 17:34
"""

from sqlalchemy import create_engine
from sqlalchemy import distinct
from sqlalchemy import func
from sqlalchemy.orm import sessionmaker
import redis

from config import current_config

SQLALCHEMY_DATABASE_URI_MYSQL = current_config.SQLALCHEMY_DATABASE_URI_MYSQL
SQLALCHEMY_POOL_SIZE = current_config.SQLALCHEMY_POOL_SIZE
REDIS = current_config.REDIS

engine_mysql = create_engine(SQLALCHEMY_DATABASE_URI_MYSQL, pool_size=SQLALCHEMY_POOL_SIZE, max_overflow=0)
db_session_mysql = sessionmaker(bind=engine_mysql, autocommit=True)

redis_client = redis.Redis(**REDIS)


def get_item(model_class, pk_id):
    session = db_session_mysql()
    try:
        result = session.query(model_class).get(pk_id)
        return result
    finally:
        session.close()


def get_all(model_class, *args, **kwargs):
    session = db_session_mysql()
    try:
        result = session.query(model_class).filter(*args).filter_by(**kwargs).all()
        return result
    finally:
        session.close()


def get_distinct(model_class, field, *args, **kwargs):
    session = db_session_mysql()
    try:
        result = session.query(distinct(getattr(model_class, field)).label(field)).filter(*args).filter_by(**kwargs).all()
        return result
    finally:
        session.close()


def get_group(model_class, field, min_count=0, *args, **kwargs):
    field_obj = getattr(model_class, field)
    session = db_session_mysql()
    try:
        result = session.query(field_obj, func.count(field_obj).label('c')).filter(*args).filter_by(
            **kwargs).group_by(field_obj).having(func.count(field_obj) >= min_count).all()
        return result
    finally:
        session.close()


def add_item(model_class, data):
    session = db_session_mysql()
    try:
        ret = model_class(**data)
        session.add(ret)
        # 如需返回id, 需要手动flush
        session.flush()
        return ret.id
    finally:
        session.close()


================================================
FILE: apps/client_rk.py
================================================
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: client_rk.py
@time: 2018-02-10 17:34
"""

from libs.rk import RKClient
from libs.counter import CounterClient
from apps.client_db import redis_client
from tools.cookies import len_cookies

from config import current_config

RK_CONFIG = current_config.RK_CONFIG
BASE_DIR = current_config.BASE_DIR
RK_LIMIT_COUNT_DAILY = current_config.RK_LIMIT_COUNT_DAILY
COOKIES_QUEUE_COUNT = current_config.COOKIES_QUEUE_COUNT

rc_client = RKClient(**RK_CONFIG)

rk_counter_client = CounterClient(redis_client, 'rk')

# 正常图形验证码
# 'im_type_id':
1000 # 任意长度数字 # 'im_type_id': 2000 # 任意长度字母 # 'im_type_id': 3000 # 任意长度英数混合 # 'im_type_id': 4000 # 任意长度汉字 # 'im_type_id': 5000 # 任意长度中英数三混 def get_img_code(im, im_type_id): """ 获取验证码 :param im: :param im_type_id: :return: """ rc_result = rc_client.rk_create(im, im_type_id) print(rc_result) if 'Error_Code' in rc_result: print(rc_result.get('Error')) return None, None # {u'Result': u'6dx2t8', u'Id': u'c8a897f0-9825-41a1-b19e-6195ba8559ed'} return rc_result['Id'], rc_result['Result'] def img_report_error(im_id): rc_client.rk_report_error(im_id) def check_counter_limit(): """ 检查是否超过限制(True: 没有超过; False: 超过限制) :return: """ rk_counter = rk_counter_client.get() return rk_counter < RK_LIMIT_COUNT_DAILY def check_cookies_count(spider_name): """ 检查 cookies 长度是否达到要求(True: 没有达到; False: 达到要求) :param spider_name: :return: """ return len_cookies(spider_name) < COOKIES_QUEUE_COUNT def counter_clear(): """ 计数器清零(每天0点) :return: """ rk_counter_client.clear() ================================================ FILE: config/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py @time: 2018-02-10 15:02 """ from __future__ import unicode_literals from __future__ import print_function import os from importlib import import_module MODE = os.environ.get('MODE') or 'default' try: current_config = import_module('config.' + MODE) print('[√] 当前环境变量: %s' % MODE) except ImportError: print('[!] 配置错误,请初始化环境变量') print('source env_develop.sh # 开发环境') print('source env_product.sh # 生产环境') ================================================ FILE: config/default.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: default.py @time: 2018-07-02 17:57 """ from __future__ import print_function from __future__ import unicode_literals import os BASE_DIR = os.path.dirname(os.path.dirname(__file__)) # requests 超时设置 REQUESTS_TIME_OUT = (30, 30) HOST_IP = '0.0.0.0' # 数据库 MySQL DB_MYSQL = { 'host': HOST_IP, 'user': 'root', 'passwd': '123456', 'port': 3306, 'db': 'news_spider' } SQLALCHEMY_DATABASE_URI_MYSQL = \ 'mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8' % \ (DB_MYSQL['user'], DB_MYSQL['passwd'], DB_MYSQL['host'], DB_MYSQL['port'], DB_MYSQL['db']) SQLALCHEMY_POOL_SIZE = 5 # 默认 pool_size=5 # 缓存,队列 REDIS = { 'host': HOST_IP, 'port': 6379, # 'password': '123456' # redis-cli AUTH 123456 } # 若快验证码识别 RK_CONFIG = { 'username': '******', 'password': '******', 'soft_id': '93676', 'soft_key': '5d0e00b196c244cb9d8413809c62f9d5', } # 斐斐打码 FF_CONFIG = { 'pd_id': '******', 'pd_key': '******', 'app_id': '312451', 'app_key': '5YuN+6isLserKBZti4hoaI6UR2N5UT2j', } # 熔断机制 每天请求限制(200元==500000快豆) RK_LIMIT_COUNT_DAILY = 925 # 队列保留 cookies 数量 COOKIES_QUEUE_COUNT = 5 # 分布式文件系统 WEED_FS_URL = 'http://%s:9333' % HOST_IP # 优先级配置(深度优先) DEPTH_PRIORITY = 1 PRIORITY_CONFIG = { 'list': 600, 'next': 500, 'detail': 800, } # 启动时间(启动时间之前的内容不抓取, 适用于新闻) START_TIME = '2018-01-01 00:00:00' ================================================ FILE: db/data/mysql.sql ================================================ USE news_spider; -- 插入用频道信息 TRUNCATE TABLE `channel`; INSERT INTO `channel` VALUES (1, 'recommend', '推荐', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (2, 'hot', '热点', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (3, 'technology', '科技', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (4, 
'social', '社会', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (5, 'entertainment', '娱乐', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (6, 'game', '游戏', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (7, 'sports', '体育', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (8, 'car', '汽车', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (9, 'finance', '财经', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (10, 'military', '军事', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (11, 'international', '国际', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (12, 'fashion', '时尚', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (13, 'travel', '旅游', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (14, 'explore', '探索', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (15, 'childcare', '育儿', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (16, 'health', '养生', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (17, 'article', '美文', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (18, 'history', '历史', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (19, 'food', '美食', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (20, 'education', '教育', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (21, 'electrical', '电气', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (22, 'machine', '机械', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (23, 'medical', '医疗', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); -- 插入抓取任务信息 TRUNCATE TABLE `fetch_task`; INSERT INTO `fetch_task` VALUES (11, 3, 0, '6555293927', '制造业那些事儿', '', 'http://m.toutiao.com/profile/6555293927/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (12, 3, 0, '51555073058', '制造业福星高赵', '', 'http://m.toutiao.com/profile/51555073058/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (13, 3, 0, '58075853770', 'AI汽车制造业', '', 'http://m.toutiao.com/profile/58075853770/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (14, 3, 0, '51397533037', '制造业的云时代', '', 'http://m.toutiao.com/profile/51397533037/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (15, 3, 0, '6157673577', '电器制造业大事件', '', 'http://m.toutiao.com/profile/6157673577/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (16, 3, 0, '3810739482', '互联网扒皮王', '', 'http://m.toutiao.com/profile/3810739482/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (17, 3, 0, '5347877887', '互联网智慧驿站', '', 'http://m.toutiao.com/profile/5347877887/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (18, 1, 0, 'Root_Id', 'Website_Name', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (19, 1, 0, 'chuangbiandao', '创变岛', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (20, 1, 0, 
'changmaiw', '畅脉全球购', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (21, 1, 0, 'BizNext', '企鹅智酷', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (22, 1, 0, 'renhecom', '人和网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (23, 1, 0, 'rsqwyjs', '人生趣味研究所', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (24, 1, 0, 'shiyehome', '食业家', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (25, 1, 0, 'tyjzksp', '食品商', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (26, 1, 0, 'wisesale_lzzd', '联纵智达', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (27, 1, 0, 'sxlh002', '蓝海果业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (28, 1, 0, 'huxiu_com', '虎嗅网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (29, 1, 0, 'HZKSXFPJLQ', '华中快速消费品经理群', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (30, 1, 0, 'kuaixiao999888', '经销商那些事儿', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (31, 1, 0, 'jingxiaoshang168', '经销商', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (32, 1, 0, 'fmcgchina', '快消品网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (33, 1, 0, 'FMCG-CLUB', '快速消费品精英俱乐部', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (34, 1, 0, 'tyjspb', '食品板', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (35, 1, 0, 'yxts518', '营销透视镜', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (36, 1, 0, 'salesman66', '营销人', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (37, 1, 0, 'cn-beverage', '饮料行业网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (38, 1, 0, 'youshudejiu', '有数酒业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (39, 1, 0, 'i-yiou', '亿欧网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (40, 1, 0, 'CLFDA-001', '中国副食流通协会总监联盟', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (41, 1, 0, 'wbfood', '58食品网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (42, 1, 0, 'lanhaiyingxiao', '营销兵法', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (43, 1, 0, 'AutoMan-No1', 'AutoMan', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (44, 1, 0, 'leiphone-sz', '雷锋网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (45, 1, 0, 'coffeeO2O', '餐饮O2O', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (46, 1, 0, 'newso2o', '零售渠道观察', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (47, 1, 0, 'wwwcbocn', '化妆品财经在线', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES 
(48, 1, 0, 'dushekeji', '毒舌科技', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (49, 1, 0, 'zgsppj', '新食品评介', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (50, 1, 0, 'foodinc', '小食代', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (51, 1, 0, 'lookforfoods', '食品饮料新零售内参', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (52, 1, 0, 'wow36kr', '36氪', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (53, 1, 0, 'food-gnosis', '食悟', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (54, 1, 0, 'newfortune', '新财富杂志', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (55, 1, 0, 'lp800315111', '快消家', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (56, 1, 0, 'tancaijing', '叶檀财经', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (57, 1, 0, 'yigejubaopen', '市井财经', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (58, 1, 0, 'njss02584195518', '工程机械微管家', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (59, 1, 0, 'jiajucy', '家具产业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (60, 1, 0, 'chinafood365', '中国食品网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (61, 1, 0, 'dqjswol', '电气自动化控制网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (62, 1, 0, 'zgyybweixin', '中国医药报', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (63, 1, 0, 'fzfzzk', '纺织服装周刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (64, 1, 0, 'www-glass-com-cn', '中国玻璃网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (65, 1, 0, 'amdaily', '先进制造业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (66, 1, 0, 'cmpzhizao', '制造业那些事儿', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (67, 1, 0, 'zhishexueshuquan', '知社学术圈', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (68, 1, 0, 'keyanquan', '科研圈', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (69, 1, 0, 'iccafe-sh', 'IC咖啡', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (70, 1, 0, 'robotmagazine', '机器人技术与应用', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (71, 1, 0, 'productronicaChina', '慕尼黑上海电子生产设备展', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (72, 1, 0, 'electronicaChina', 'e星球', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (73, 1, 0, 'feelingcar666', '飞灵汽车', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (74, 1, 0, 'depo88', '分布式能源', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (75, 1, 0, 'jianyuecheping', '建约车评', '', '', 1, '', '2017-01-11 11:01:05', 
'2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (76, 1, 0, 'AECC-2016', '中国航发', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (77, 1, 0, 'mesbook', 'MES百科', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (78, 1, 0, 'mtmt-1951', '机床杂志社', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (79, 1, 0, 'AI_era', '新智元', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (80, 1, 0, 'ikanlixiang', '看理想', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (81, 1, 0, 'AVICESI', '中行伊萨', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (82, 1, 0, 'www_51shape_com', '3D科学谷', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (83, 1, 0, 'i-zhoushuo', '周说', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (84, 1, 0, 'guoguo_innovation', '蝈蝈创新随笔', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (85, 1, 0, 'e-zhizao', 'e制造', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (86, 1, 0, 'RoboSpeak', '机器人大讲堂', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (87, 1, 0, 'The-Intellectual', '知识分子', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (88, 1, 0, 'sdr-china', '软件定义世界', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (89, 1, 0, 'wufutu5', '洞见', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (90, 1, 0, 'siid_2inno', '之新网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (91, 1, 0, 'e-works', '数字化企业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (92, 1, 0, 'smr8700', '水木然', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (93, 1, 0, 'casic3s', '航天科工系统仿真科技', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (94, 1, 0, 'xiangxt1984', '向小田', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (95, 1, 0, 'gh_7157c03a9f49', '理深科技时评', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (96, 1, 0, 'gh_8189758efb1b', '国富资本熊焰', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (97, 1, 0, 'iscientists', '赛先生', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (98, 1, 0, 'bjcppmp', '中国造纸杂志社', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (99, 1, 0, 'CPA-PAPER', '中国造纸协会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (100, 1, 0, 'CTAPI-Paper', '中国造纸学会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (101, 1, 0, 'zzcywd', '造纸产业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (102, 1, 0, 'paperCEO', '造纸老板内刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (103, 1, 0, 'gh_28281e9f6cc4', '造纸助手', '', '', 1, '', 
'2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (104, 1, 0, 'qgzzbwh', '全国造纸工业标准化技术委员会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (105, 1, 0, 'waysmos', '造纸化学品', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (106, 1, 0, 'wff168_com', '第一家具网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (107, 1, 0, 'jiajuwxw', '家具微新闻', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (108, 1, 0, 'Furniture_China', '上海家具展', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (109, 1, 0, 'jiajuzhuliuMF', '家具主流', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (110, 1, 0, 'jjgle2015', '家具在线', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (111, 1, 0, 'nfsyyjjb', '医药经济报', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (112, 1, 0, 'iyiyaomofang', '医药魔方数据', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (113, 1, 0, 'gh_260ce2309fff', 'MIMS医药资讯', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (114, 1, 0, 'yyguancha', '医药观察家网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (115, 1, 0, 'yyshoujibao', '医药手机报', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (116, 1, 0, 'shstpa', '上海医药商业行业协会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (117, 1, 0, 'fangda_healthcare', '医药法律评论', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (118, 1, 0, 'cmpma1989', '中国医药物资协会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (119, 1, 0, 'yehenala_678', '医药那些事儿', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (120, 1, 0, 'imrobotic', '机器人在线', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (121, 1, 0, 'CSDN_Tech', 'CSDN技术头条', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (122, 1, 0, 'CSDN_BLOG', 'CSDN博客', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (123, 1, 0, 'CSDNLIB', 'CSDN知识库', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (124, 1, 0, 'csdn_iot', 'CSDN物联网开发', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (125, 2, 0, '1005051627825392', '互联网的那点事', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (126, 2, 0, '1006061787567623', '199IT-互联网数据中心', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (127, 2, 0, '1002061577794853', '互联网的一些事', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (128, 2, 0, '1002063318777442', '互联网创业刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (129, 2, 0, '1006061661377270', '互联网观察网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (130, 2, 0, '1002062210869832', '互联网新闻网', '', 
'', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (131, 2, 0, '1006063481197561', '中国互联网安全大会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (132, 2, 0, '1002061768025224', '互联网周刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (133, 2, 0, '1002063819805149', '互联网焦点网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (134, 3, 0, '55982516338', '奇文志怪', '', 'http://m.toutiao.com/profile/55982516338/', 1, '', '2018-09-06 14:01:05', '2018-09-06 14:01:05'); INSERT INTO `fetch_task` VALUES (135, 3, 0, '6014591174', '鹏君读书', '', 'http://m.toutiao.com/profile/6014591174/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); ================================================ FILE: db/schema/mysql.sql ================================================ DROP DATABASE IF EXISTS `news_spider`; CREATE DATABASE `news_spider` /*!40100 DEFAULT CHARACTER SET utf8 */; use news_spider; CREATE TABLE `channel` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `code` VARCHAR(20) COMMENT '频道编号', `name` VARCHAR(20) COMMENT '频道名称', `description` VARCHAR(500) DEFAULT '' COMMENT '描述', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`), UNIQUE KEY idx_code (`code`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='频道表'; CREATE TABLE `fetch_task` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `platform_id` TINYINT DEFAULT 0 COMMENT '平台id(1:微信;2:微博;3:头条)', `channel_id` TINYINT DEFAULT 0 COMMENT '频道id', `follow_id` VARCHAR(45) DEFAULT '' COMMENT '关注账号id', `follow_name` VARCHAR(45) DEFAULT '' COMMENT '关注账号名称', `avatar_url` VARCHAR(512) DEFAULT '' COMMENT '关注账号头像', `fetch_url` VARCHAR(512) DEFAULT '' COMMENT '抓取入口', `flag_enabled` TINYINT DEFAULT 0 COMMENT '启用标记(0:未启用;1:已启用)', `description` VARCHAR(500) DEFAULT '' COMMENT '描述', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`), UNIQUE KEY idx_platform_follow_id (`platform_id`, `follow_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='抓取任务表'; CREATE TABLE `fetch_result` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `task_id` INT NOT NULL COMMENT '任务id', `platform_id` TINYINT DEFAULT 0 COMMENT '平台id(1:微信;2:微博;3:头条)', `platform_name` VARCHAR(50) DEFAULT '' COMMENT '平台名称(1:微信;2:微博;3:头条)', `channel_id` TINYINT DEFAULT 0 COMMENT '频道id', `channel_name` VARCHAR(50) DEFAULT '' COMMENT '频道名称', `article_id` VARCHAR(50) DEFAULT '' COMMENT '文章id', `article_url` VARCHAR(512) DEFAULT '' COMMENT '文章链接', `article_title` VARCHAR(100) DEFAULT '' COMMENT '文章标题', `article_author_id` VARCHAR(100) DEFAULT '' COMMENT '文章作者id(对应follow_id)', `article_author_name` VARCHAR(100) DEFAULT '' COMMENT '文章作者名称(对应follow_name)', `article_tags` VARCHAR(100) DEFAULT '' COMMENT '文章标签(半角逗号分隔)', `article_abstract` VARCHAR(500) DEFAULT '' COMMENT '文章摘要', `article_content` MEDIUMTEXT COMMENT '文章内容', `article_pub_time` DATETIME DEFAULT '1000-01-01 00:00:00' COMMENT '文章发布时间', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`), KEY idx_task_id (`task_id`), UNIQUE KEY idx_platform_article_id (`platform_id`, `article_id`), KEY idx_platform_author_id (`platform_id`, `article_author_id`), KEY 
idx_article_pub_time (`article_pub_time`), KEY idx_create_time (`create_time`), KEY idx_update_time (`update_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='抓取结果表'; CREATE TABLE `log_task_scheduling` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `platform_id` TINYINT DEFAULT 0 COMMENT '平台id(1:微信;2:微博;3:头条)', `platform_name` VARCHAR(50) DEFAULT '' COMMENT '平台名称(1:微信;2:微博;3:头条)', `spider_name` VARCHAR(45) DEFAULT '' COMMENT '蜘蛛名称,一般同平台名称', `task_quantity` INT(11) DEFAULT 0 COMMENT '任务数量', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='任务调度日志表'; -- 更新记录[2018-02-13] # ALTER TABLE `fetch_result` MODIFY `article_content` MEDIUMTEXT COMMENT '文章内容'; -- 更新记录[2018-05-29] # DROP INDEX idx_platform_author_id ON `fetch_result`; # ALTER TABLE `fetch_result` ADD INDEX idx_platform_author_id (`platform_id`, `article_author_id`); # ALTER TABLE `fetch_result` MODIFY `article_pub_time` DATETIME DEFAULT '1000-01-01 00:00:00' COMMENT '文章发布时间'; # ALTER TABLE `fetch_result` ADD INDEX idx_article_pub_time (`article_pub_time`); # ALTER TABLE `fetch_result` ADD INDEX idx_create_time (`create_time`); # ALTER TABLE `fetch_result` ADD INDEX idx_update_time (`update_time`); ================================================ FILE: docs/Architecture.md ================================================ # 整体架构(Architecture) - MariaDB 每个公众号/发布号的首页(即爬虫抓取入口)存储于数据库中。 表结构 db/schema/mysql.sql 测试数据 db/data/mysql.sql - Redis 为了支持分布式, 抓取任务单独存放于缓存, 这样在调试时, 需要手动执行创建任务。 参考[启动说明](Spiders/README.md) 为了方便调试, 本项目所有缓存key均以`scrapy:`作为前缀 - NodeJS 部分详情页面的信息抽取, 本项目使用js处理, 避免正则表达式规则的不完全覆盖。 ================================================ FILE: docs/Components/MariaDB.md ================================================ # MariaDB ================================================ FILE: docs/Components/Redis.md ================================================ # Redis ================================================ FILE: docs/Components/SeaweedFS.md ================================================ # SeaweedFS [SeaweedFS 项目地址](https://github.com/chrislusf/seaweedfs) ## 安装 ### Go (Golang) 下载页面: https://golang.org/dl/ ``` $ wget https://dl.google.com/go/go1.11.1.linux-amd64.tar.gz $ sudo tar -C /usr/local -xzf go1.11.1.linux-amd64.tar.gz $ sudo vim /etc/profile export GOROOT=/usr/local/go export GOPATH=$HOME/work export PATH=$PATH:$GOROOT/bin:$GOPATH/bin $ source /etc/profile ``` 或者仅为当前用户设置环境变量 ``` $ vim ~/.bashrc $ source ~/.bashrc ``` 注意:使用 zsh 的用户, 需要为 zsh 设置环境变量 ``` $ vim ~/.zshrc $ source ~/.zshrc ``` ### Weed 依赖 git (版本控制工具) ``` go get github.com/chrislusf/seaweedfs/weed ``` ## 启动 Start Master Server ``` $ weed master ``` Start Volume Servers ``` $ mkdir /tmp/data1 /tmp/data2 $ chmod 777 /tmp/data1 /tmp/data2 $ weed volume -dir="/tmp/data1" -max=5 -mserver="localhost:9333" -port=8080 & $ weed volume -dir="/tmp/data2" -max=10 -mserver="localhost:9333" -port=8081 & ``` ``` $ weed volume -dir=/tmp/data1/ -mserver="localhost:9333" -ip="192.168.2.32" -port=8080 ``` ## 启动(方式二) ``` $ weed server -dir=/tmp/data1/ -filer -filer.port=8000 -master.port=9333 -volume.port=8001 ``` 集群管理: http://127.0.0.1:9333/ 归档管理: http://localhost:8000/ 卷积管理: http://localhost:8001/ui/index.html 图片地址: http://localhost:8001/ 上传文件请求 ``` $ curl http://localhost:9333/dir/assign {"fid":"2,055a54a8ec","url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080","count":1} ``` 上传文件 ``` $ curl -X PUT -F 
file=@/home/zhanghe/metro.jpg http://127.0.0.1:8080/2,055a54a8ec
{"name":"metro.jpg","size":1830848}
```

删除文件

```
$ curl -X DELETE http://127.0.0.1:8080/2,055a54a8ec
{"size":1830869}
```

文件读取

```
$ curl "http://localhost:9333/dir/lookup?volumeId=2"
{"volumeId":"2","locations":[{"url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080"}]}
```

访问文件

- [http://127.0.0.1:8080/2,055a54a8ec.jpg](http://127.0.0.1:8080/2,055a54a8ec.jpg)
- [http://127.0.0.1:8080/2/055a54a8ec.jpg](http://127.0.0.1:8080/2/055a54a8ec.jpg)
- [http://127.0.0.1:8080/2/055a54a8ec](http://127.0.0.1:8080/2/055a54a8ec)
- [http://127.0.0.1:8080/2/055a54a8ec?height=200&width=200](http://127.0.0.1:8080/2/055a54a8ec?height=200&width=200)

导出文件打包

```
$ weed export -dir=/tmp/data1 -volumeId=1 -o=/tmp/data1.tar -fileNameFormat={{.Name}} -newer='2006-01-02T15:04:05'
```

解包具体文件

```
$ tar -xvf data1.tar
```

## 快速安装

```bash
# Mac系统
$ wget -c https://github.com/chrislusf/seaweedfs/releases/download/0.76/darwin_amd64.tar.gz -O weed_darwin_amd64.tar.gz
$ tar -zxvf weed_darwin_amd64.tar.gz

# Linux系统
$ wget -c https://github.com/chrislusf/seaweedfs/releases/download/0.76/linux_arm64.tar.gz -O weed_linux_arm64.tar.gz
$ tar -zxvf weed_linux_arm64.tar.gz

# 启动
$ ./weed server -dir=weed_data/ -filer -filer.port=8000 -master.port=9333 -volume.port=8001 -volume.max=32
```


================================================
FILE: docs/Components/Squid.md
================================================
# Squid


================================================
FILE: docs/README.md
================================================
# scrapy最佳实践 - 新闻抓取

## GitBook 操作指南

初始化

```bash
cd docs
npm install -g gitbook-cli
npm install --save gitbook-plugin-todo
npm install --save gitbook-plugin-mermaid-full
gitbook init
# 或者
gitbook install
```

开启服务

```bash
gitbook serve
```

访问 [http://localhost:4000](http://localhost:4000)


================================================
FILE: docs/SUMMARY.md
================================================
# Summary

* [项目介绍](README.md)
* [项目架构](Architecture.md)
* [爬虫模块](Spiders/README.md)
    * [微信爬虫](Spiders/Weixin.md)
    * [微博爬虫](Spiders/Weibo.md)
    * [头条爬虫](Spiders/Toutiao.md)
* 组件服务
    * [MariaDB](Components/MariaDB.md)
    * [Redis](Components/Redis.md)
    * [SeaweedFS](Components/SeaweedFS.md)


================================================
FILE: docs/Spiders/README.md
================================================
# Spiders

1、部署系统依赖

- MariaDB
- Redis
- NodeJS

2、部署项目依赖

```
pip install -r requirements-py2.txt    # python3 环境使用 requirements-py3.txt
```

3、创建数据库, 建立抓取入口

- 建表结构 db/schema/mysql.sql
- 测试数据 db/data/mysql.sql

4、创建抓取任务, 写入缓存

```
(news_spider.env) ➜  news_spider git:(master) ✗ python tasks/job_put_tasks.py
[√] 当前环境变量: develop
缺失参数
Example:
python job_put_tasks.py wx    # 微信
python job_put_tasks.py wb    # 微博
python job_put_tasks.py tm    # 头条(M)
```

参考以上提示, 对应蜘蛛执行各自的脚本完成任务创建

5、微信抓取, 需要初始化cookie, 其他两个蜘蛛不需要

生产环境, 可以使用`supervisor`自动守护`scrapy.ini`、`tasks.ini`这两组进程, 根据需要自行修改


================================================
FILE: docs/Spiders/Toutiao.md
================================================
# 头条(M端)

创建任务详情

```mysql
INSERT INTO `fetch_task` VALUES (134, 3, 0, '55982516338', '奇文志怪', '', 'http://m.toutiao.com/profile/55982516338/', 1, '', '2018-09-06 14:01:05', '2018-09-06 14:01:05');
```

进入redis, 检查调度任务数量

```
127.0.0.1:6379> SCARD "scrapy:tasks_set:toutiao_m"
(integer) 439
```

如果没有调度任务, 需要创建调度任务

```
python tasks/job_put_tasks.py tm
```

开启爬虫

```
scrapy crawl toutiao_m
```


================================================
FILE: docs/Spiders/Weibo.md
================================================
# 微博 进入redis, 检查调度任务数量 ``` 127.0.0.1:6379> SCARD "scrapy:tasks_set:weibo" (integer) 0 ``` 如果没有调度任务, 需要创建调度任务 ``` python tasks/job_put_tasks.py wb ``` 开启爬虫 ``` scrapy crawl weibo ``` ================================================ FILE: docs/Spiders/Weixin.md ================================================ # 微信 进入redis, 检查调度任务数量 ``` 127.0.0.1:6379> SCARD "scrapy:tasks_set:weixin" (integer) 0 ``` 如果没有调度任务, 需要创建调度任务 ``` python tasks/job_put_tasks.py wx ``` 开启爬虫 ``` scrapy crawl weixin ``` ================================================ FILE: docs/book.json ================================================ { "language": "zh-hans", "author": "碎ping子", "plugins": [ "todo", "mermaid-full@>=0.5.1" ] } ================================================ FILE: env_default.sh ================================================ #!/usr/bin/env bash source news_spider.env/bin/activate export PATH=${PWD}:${PATH} export PYTHONPATH=${PWD} export PYTHONIOENCODING=utf-8 export MODE=default ================================================ FILE: etc/scrapy.ini ================================================ [group:scrapy] programs=weixin,weibo,toutiao [program:weixin] command=scrapy crawl weixin directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_weixin.log [program:weibo] command=scrapy crawl weibo directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_weibo.log [program:toutiao] command=scrapy crawl toutiao directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_toutiao.log ================================================ FILE: etc/scrapyd.ini ================================================ [program:scrapyd] command=scrapyd directory=news priority=200 startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapyd.log ================================================ FILE: etc/supervisord.conf ================================================ ; Sample supervisor config file. ; ; For more information on the config file, please see: ; http://supervisord.org/configuration.html ; ; Notes: ; - Shell expansion ("~" or "$HOME") is not supported. Environment ; variables can be expanded using this syntax: "%(ENV_HOME)s". ; - Comments must have a leading space: "a=b ;comment" not "a=b;comment". ;[unix_http_server] ;file=/tmp/supervisor.sock ; (the path to the socket file) ;chmod=0700 ; socket file mode (default 0700) ;chown=nobody:nogroup ; socket file uid:gid owner ;username=user ; (default is no username (open server)) ;password=123 ; (default is no password (open server)) [inet_http_server] ; inet (TCP) server disabled by default port=127.0.0.1:9001 ; (ip_address:port specifier, *:port for all iface) username=user ; (default is no username (open server)) password=123 ; (default is no password (open server)) [supervisord] logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log) logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) logfile_backups=10 ; (num of main logfile rotation backups;default 10) loglevel=info ; (log level;default info; others: debug,warn,trace) pidfile=/tmp/supervisord.pid ; (supervisord pidfile;default supervisord.pid) nodaemon=false ; (start in foreground if true;default false) minfds=1024 ; (min. avail startup file descriptors;default 1024) minprocs=200 ; (min. 
avail process descriptors;default 200) ;umask=022 ; (process file creation umask;default 022) ;user=chrism ; (default is current user, required if root) ;identifier=supervisor ; (supervisord identifier, default is 'supervisor') ;directory=/tmp ; (default is not to cd during start) ;nocleanup=true ; (don't clean up tempfiles at start;default false) ;childlogdir=/tmp ; ('AUTO' child log dir, default $TEMP) ;environment=KEY="value" ; (key value pairs to add to environment) ;strip_ansi=false ; (strip ansi escape codes in logs; def. false) ; the below section must remain in the config file for RPC ; (supervisorctl/web interface) to work, additional interfaces may be ; added by defining them in separate rpcinterface: sections [rpcinterface:supervisor] supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface [supervisorctl] ;serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket serverurl=http://127.0.0.1:9001 ; use an http:// url to specify an inet socket username=user ; should be same as http_username if set password=123 ; should be same as http_password if set ;prompt=mysupervisor ; cmd line prompt (default "supervisor") ;history_file=~/.sc_history ; use readline history if available ; The below sample program section shows all possible program subsection values, ; create one or more 'real' program: sections to be able to control them under ; supervisor. ;[program:theprogramname] ;command=/bin/cat ; the program (relative uses PATH, can take args) ;process_name=%(program_name)s ; process_name expr (default %(program_name)s) ;numprocs=1 ; number of processes copies to start (def 1) ;directory=/tmp ; directory to cwd to before exec (def no cwd) ;umask=022 ; umask for process (default None) ;priority=999 ; the relative start priority (default 999) ;autostart=true ; start at supervisord start (default: true) ;startsecs=1 ; # of secs prog must stay up to be running (def. 
1) ;startretries=3 ; max # of serial start failures when starting (default 3) ;autorestart=unexpected ; when to restart if exited after running (def: unexpected) ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2) ;stopsignal=QUIT ; signal used to kill process (default TERM) ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) ;stopasgroup=false ; send stop signal to the UNIX process group (default false) ;killasgroup=false ; SIGKILL the UNIX process group (def false) ;user=chrism ; setuid to this UNIX account to run the program ;redirect_stderr=true ; redirect proc stderr to stdout (default false) ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO ;stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stdout_logfile_backups=10 ; # of stdout logfile backups (default 10) ;stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0) ;stdout_events_enabled=false ; emit events on stdout writes (default false) ;stderr_logfile=/a/path ; stderr log path, NONE for none; default AUTO ;stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stderr_logfile_backups=10 ; # of stderr logfile backups (default 10) ;stderr_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0) ;stderr_events_enabled=false ; emit events on stderr writes (default false) ;environment=A="1",B="2" ; process environment additions (def no adds) ;serverurl=AUTO ; override serverurl computation (childutils) ; The below sample eventlistener section shows all possible ; eventlistener subsection values, create one or more 'real' ; eventlistener: sections to be able to handle event notifications ; sent by supervisor. ;[eventlistener:theeventlistenername] ;command=/bin/eventlistener ; the program (relative uses PATH, can take args) ;process_name=%(program_name)s ; process_name expr (default %(program_name)s) ;numprocs=1 ; number of processes copies to start (def 1) ;events=EVENT ; event notif. types to subscribe to (req'd) ;buffer_size=10 ; event buffer queue size (default 10) ;directory=/tmp ; directory to cwd to before exec (def no cwd) ;umask=022 ; umask for process (default None) ;priority=-1 ; the relative start priority (default -1) ;autostart=true ; start at supervisord start (default: true) ;startsecs=1 ; # of secs prog must stay up to be running (def. 
1) ;startretries=3 ; max # of serial start failures when starting (default 3) ;autorestart=unexpected ; autorestart if exited after running (def: unexpected) ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2) ;stopsignal=QUIT ; signal used to kill process (default TERM) ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) ;stopasgroup=false ; send stop signal to the UNIX process group (default false) ;killasgroup=false ; SIGKILL the UNIX process group (def false) ;user=chrism ; setuid to this UNIX account to run the program ;redirect_stderr=false ; redirect_stderr=true is not allowed for eventlisteners ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO ;stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stdout_logfile_backups=10 ; # of stdout logfile backups (default 10) ;stdout_events_enabled=false ; emit events on stdout writes (default false) ;stderr_logfile=/a/path ; stderr log path, NONE for none; default AUTO ;stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stderr_logfile_backups=10 ; # of stderr logfile backups (default 10) ;stderr_events_enabled=false ; emit events on stderr writes (default false) ;environment=A="1",B="2" ; process environment additions ;serverurl=AUTO ; override serverurl computation (childutils) ; The below sample group section shows all possible group values, ; create one or more 'real' group: sections to create "heterogeneous" ; process groups. ;[group:thegroupname] ;programs=progname1,progname2 ; each refers to 'x' in [program:x] definitions ;priority=999 ; the relative start priority (default 999) ; The [include] section can just contain the "files" setting. This ; setting can list multiple files (separated by whitespace or ; newlines). It can also contain wildcards. The filenames are ; interpreted as relative to this file. Included files *cannot* ; include files themselves. 
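; Project note (see docs/Spiders/README.md): for production this repository runs the
; spider group (scrapy.ini) and the task-scheduling group (tasks.ini) under supervisor,
; i.e. "files = scrapy.ini tasks.ini" as shown in the commented example below.
; The active [include] below only pulls in toutiao.ini (Toutiao spider plus its
; task-scheduling job); adjust the "files" setting as needed.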
;[include] ;files = relative/directory/*.ini ;[include] ;files = scrapy.ini tasks.ini [include] files = toutiao.ini ================================================ FILE: etc/tasks.ini ================================================ [group:tasks] programs=counter_clear,put_tasks_toutiao,put_tasks_weibo,put_tasks_weixin,sogou_cookies,weixin_cookies [program:counter_clear] command=python tasks/run_job_counter_clear.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/counter_clear.log [program:put_tasks_toutiao] command=python tasks/run_job_put_tasks_toutiao.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_toutiao.log [program:put_tasks_weibo] command=python tasks/run_job_put_tasks_weibo.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_weibo.log [program:put_tasks_weixin] command=python tasks/run_job_put_tasks_weixin.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_weixin.log [program:sogou_cookies] command=python tasks/run_job_sogou_cookies.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/sogou_cookies.log [program:weixin_cookies] command=python tasks/run_job_weixin_cookies.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/weixin_cookies.log ================================================ FILE: etc/toutiao.ini ================================================ [group:toutiao] programs=put_tasks,scrapy [program:put_tasks] command=python tasks/run_job_put_tasks_toutiao.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_toutiao.log [program:scrapy] command=scrapy crawl toutiao directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_toutiao.log ;[program:reboot_net] ;command=python tasks/run_job_reboot_net_china_net.py ;startsecs=0 ;stopwaitsecs=0 ;autostart=false ;autorestart=true ;redirect_stderr=true ;stdout_logfile=logs/reboot_net_china_net.log ================================================ FILE: libs/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 15:24 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: libs/counter.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: counter.py @time: 2018-02-10 15:24 """ from redis import Redis class CounterClient(object): """ 计数器 """ def __init__(self, redis_client, entity_name, prefix='counter'): """ :param redis_client: :param entity_name: :param prefix: """ self.redis_client = redis_client # type: Redis self.counter_key = "%s:%s" % (prefix, entity_name) def increase(self, amount=1): """ 增加计数 :param amount: :return: """ return int(self.redis_client.incr(self.counter_key, amount)) def decrease(self, amount=1): """ 减少计数 :param amount: :return: """ return int(self.redis_client.decr(self.counter_key, amount)) def get(self): """ 获取计数 :return: """ return int(self.redis_client.get(self.counter_key) or 0) def clear(self): """ 清除计数 :return: """ return 
self.redis_client.delete(self.counter_key) ================================================ FILE: libs/ft.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: ff.py @time: 2019-05-26 14:26 """ import base64 import hashlib import time import requests URL = "http://pred.fateadm.com" class FTClient(object): def __init__(self, pd_id, pd_key, app_id='', app_key=''): self.pd_id = pd_id self.pd_key = pd_key self.app_id = app_id self.app_key = app_key self.host = URL self.s = requests.session() self.timeout = 30 @staticmethod def calc_sign(pd_id, pd_key, timestamp): md5 = hashlib.md5() md5.update(timestamp + pd_key) sign_a = md5.hexdigest() md5 = hashlib.md5() md5.update(pd_id + timestamp + sign_a) sign_b = md5.hexdigest() return sign_b @staticmethod def calc_card_sign(card_id, card_key, timestamp, pd_key): md5 = hashlib.md5() md5.update(pd_key + timestamp + card_id + card_key) return md5.hexdigest() def query_balance(self): """查询余额""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign } url = self.host + "/api/custval" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def query_tts(self, predict_type): """查询网络延迟""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, "predict_type": predict_type, } if self.app_id != "": asign = self.calc_sign(self.app_id, self.app_key, tm) param["appid"] = self.app_id param["asign"] = asign url = self.host + "/api/qcrtt" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def predict(self, predict_type, img_data): """识别验证码""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) img_base64 = base64.b64encode(img_data) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, "predict_type": predict_type, "img_data": img_base64, } if self.app_id != "": asign = self.calc_sign(self.app_id, self.app_key, tm) param["appid"] = self.app_id param["asign"] = asign url = self.host + "/api/capreg" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def predict_from_file(self, predict_type, file_name): """从文件进行验证码识别""" with open(file_name, "rb+") as f: data = f.read() return self.predict(predict_type, data) def justice(self, request_id): """识别失败,进行退款请求""" if request_id == "": return tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, "request_id": request_id } url = self.host + "/api/capjust" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def charge(self, card_id, card_key): """充值接口""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) card_sign = self.calc_card_sign(card_id, card_key, tm, self.pd_key) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, 'cardid': card_id, 'csign': card_sign } url = self.host + "/api/charge" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def test_ft(): """ 测试 {u'RspData': u'{"cust_val":1010}', u'RetCode': u'0', u'ErrMsg': u'succ', u'RequestId': u''} {u'RspData': u'{"result": "8x4g"}', u'RetCode': u'0', u'ErrMsg': u'', u'RequestId': u'2019052615005042ad98b2000518d493'} :return: """ pd_id = "xxxxxx" pd_key = "xxxxxx" app_id = "312451" app_key = "5YuN+6isLserKBZti4hoaI6UR2N5UT2j" predict_type = "30400" api = FTClient(pd_id, 
pd_key, app_id, app_key) # 查询余额接口 res = api.query_balance() print(res) file_name = "img.jpg" rsp = api.predict_from_file(predict_type, file_name) print(rsp) if __name__ == "__main__": test_ft() ================================================ FILE: libs/optical_modem.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: optical_modem.py @time: 2018-05-27 00:24 """ import base64 import json import time import re import random import hashlib import requests from scrapy.selector import Selector class OpticalModemChinaNet(object): """ 电信光猫 """ s = requests.session() def __init__(self, host='192.168.1.1', username='useradmin', password='crcun'): self.host = host self.username = username self.password = password self.url_login = 'http://%s/login.cgi' % self.host self.url_get_wan_wifi_status = 'http://%s/gatewayManage.cmd' % self.host self.url_reboot = 'http://%s/gatewayManage.cmd' % self.host self.timeout = 180 self.net_ip_o = None self.net_ip_n = None @staticmethod def _get_tc(): tc = str('%13d' % (time.time() * 1000)) return tc def login(self): """ 登录 :return: """ params = { 'username': self.username, 'psd': self.password, } res = self.s.get(self.url_login, params=params, timeout=self.timeout) print(res.status_code, res.url) def get_wan_wifi_status(self): """ 获取wifi状态 :return: """ headers = { 'X-Requested-With': 'XMLHttpRequest', } params = { 'timeStamp': self._get_tc(), } json_cfg = { 'RPCMethod': 'Post1', 'ID': '123', 'Parameter': base64.urlsafe_b64encode("{'CmdType':'GET_WAN_WIFI_STATUS'}") } data = "jsonCfg=%s" % json.dumps(json_cfg) res = self.s.post(self.url_get_wan_wifi_status, headers=headers, params=params, data=data, timeout=self.timeout) print(res.status_code, res.url) return_parameter = json.loads(base64.decodestring(res.json().get('return_Parameter', ''))) print(return_parameter) print(return_parameter.get('ipAddr')) wan_ip = return_parameter.get('ipAddr') return wan_ip def reboot(self): """ 重启 :return: """ headers = {'X-Requested-With': 'XMLHttpRequest'} params = { 'timeStamp': self._get_tc(), } json_cfg = { 'RPCMethod': 'Post1', 'ID': '123', 'Parameter': base64.urlsafe_b64encode("{'CmdType':'HG_COMMAND_REBOOT'}") } data = "jsonCfg=%s" % json.dumps(json_cfg) res = self.s.post(self.url_reboot, headers=headers, params=params, data=data, timeout=self.timeout) print(res.status_code, res.url) return_parameter = json.loads(base64.decodestring(res.json().get('return_Parameter', ''))) print(return_parameter) def get_net_ip(self): """ 获取网络IP,这里使用requests不用session,因为重启之后,session会断开 :return: """ url = 'https://ip.cn/' res = requests.get(url, timeout=self.timeout) response = Selector(res) info = response.xpath('//div[@class="well"]//code/text()').extract() ip_info = dict(zip(['ip', 'address'], info)) net_ip = ip_info['ip'] print(net_ip) return net_ip def check_reboot_status(self): reboot_status = self.net_ip_o != self.net_ip_n print(reboot_status) return reboot_status class OpticalModemChinaMobile(object): """ 移动光猫 登录密码表单SHA256加密 """ s = requests.session() pid = 1002 session_token = 0 def __init__(self, host='192.168.1.1', username='user', password='gkw4p3uv'): self.host = host self.username = username self.password = password self.pwd_random = self._get_pwd_random() self.encryption_pwd = self._get_encryption_pwd(self.password, self.pwd_random) self.token = self._get_token() self.url_login = 'http://%s/' % self.host self.timeout = 180 self.net_ip_o = None self.net_ip_n = None @staticmethod def 
_get_pwd_random(): pwd_random = str(int(round(random.random() * 89999999)) + 10000000) return pwd_random @staticmethod def _get_encryption_pwd(pwd, r): encryption_pwd = hashlib.sha256(''.join([pwd, r])).hexdigest() return encryption_pwd def _get_token(self): url = 'http://%s' % self.host res = self.s.get(url) html_body = res.text token_re = re.compile(r'getObj\("Frm_Logintoken"\)\.value = "(\d+)";') token_list = re.findall(token_re, html_body) return int(token_list[0]) if token_list else 0 def _get_pid(self): url = 'http://%s/template.gch' % self.host res = self.s.get(url, timeout=self.timeout) html_body = res.text pid_re = re.compile(r'"getpage\.gch\?pid=(\d+)&nextpage="') pid_list = re.findall(pid_re, html_body) self.pid = int(pid_list[0]) if pid_list else self.pid return self.pid def _get_session_token(self): url = 'http://%s/getpage.gch?pid=%s&nextpage=manager_dev_restart_t.gch' % (self.host, self.pid) res = self.s.get(url, timeout=self.timeout) html_body = res.text session_token_re = re.compile(r'var session_token = "(\d+)";') session_token_list = re.findall(session_token_re, html_body) self.session_token = int(session_token_list[0]) if session_token_list else self.session_token return self.session_token def login(self): """ 登录 :return: """ payload = { 'frashnum': '', 'action': 'login', 'Frm_Logintoken': self.token, 'UserRandomNum': self.pwd_random, 'Username': self.username, 'Password': self.encryption_pwd, } res = self.s.post(self.url_login, data=payload, timeout=self.timeout) return 'mainFrame' in res.text def reboot(self): url = 'http://%s/getpage.gch?pid=%s&nextpage=manager_dev_restart_t.gch' % (self.host, self._get_pid()) payload = { 'IF_ACTION': 'devrestart', 'IF_ERRORSTR': 'SUCC', 'IF_ERRORPARAM': 'SUCC', 'IF_ERRORTYPE': -1, 'flag': 1, '_SESSION_TOKEN': self._get_session_token(), } res = self.s.post(url, data=payload, timeout=self.timeout) return '设备重启需要2~3分钟,请耐心等待。' in res.text def get_net_ip(self): """ 获取网络IP,这里使用requests不用session,因为重启之后,session会断开 :return: """ url = 'https://ip.cn/' res = requests.get(url, timeout=self.timeout) response = Selector(res) info = response.xpath('//div[@class="well"]//code/text()').extract() ip_info = dict(zip(['ip', 'address'], info)) net_ip = ip_info['ip'] print(net_ip) return net_ip def check_reboot_status(self): reboot_status = self.net_ip_o != self.net_ip_n print(reboot_status) return reboot_status def test_china_net(): om_cn = OpticalModemChinaNet() om_cn.net_ip_o = om_cn.get_net_ip() om_cn.login() # 默认用户名、密码 om_cn.reboot() time.sleep(10) c = 3 while 1: if c <= 0: break try: om_cn.net_ip_n = om_cn.get_net_ip() break except Exception as e: c -= 1 print(e) om_cn.check_reboot_status() def test_china_mobile(): om_cm = OpticalModemChinaMobile() om_cm.net_ip_o = om_cm.get_net_ip() om_cm.login() om_cm.reboot() time.sleep(10) c = 3 while 1: if c <= 0: break try: om_cm.net_ip_n = om_cm.get_net_ip() break except Exception as e: c -= 1 print(e) om_cm.check_reboot_status() if __name__ == '__main__': # test_china_net() test_china_mobile() ================================================ FILE: libs/redis_pub_sub.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: redis_pub_sub.py @time: 2018-02-10 15:24 """ import redis class RedisPubSub(object): """ Pub/Sub 队列中存储的数据必须是序列化之后的数据 生产消息: 入队前, 序列化 消费消息: 出队后, 反序列化 """ def __init__(self, name, namespace='pub/sub', redis_client=None, **redis_kwargs): """The default connection parameters are: host='localhost', port=6379, 
db=0""" self.__db = redis_client or redis.Redis(**redis_kwargs) self.key = '%s:%s' % (namespace, name) def pub(self, k, v): """ Pub :param k: :param v: :return: """ ch = '%s:%s' % (self.key, k) self.__db.publish(ch, v) def sub(self, k): """ Sub :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.subscribe(ch) for item in ps.listen(): # {'pattern': None, 'type': 'subscribe', 'channel': 'pub/sub:test:hh', 'data': 1L} yield item if item['type'] == 'message': yield item.get('data') def p_sub(self, k): """ PSub 订阅一个或多个符合给定模式的频道 每个模式以 * 作为匹配符 注意 psubscribe 与 subscribe 区别 :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.psubscribe(ch) for item in ps.listen(): # {'pattern': None, 'type': 'psubscribe', 'channel': 'pub/sub:test:*:hh', 'data': 1L} # yield item if item['type'] == 'pmessage': # {'pattern': 'pub/sub:test:*:hh', 'type': 'pmessage', 'channel': 'pub/sub:test:aa:hh', 'data': '123'} yield item.get('data') def sub_not_loop(self, k): """ Sub 非无限循环,取到结果即退出 :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.subscribe(ch) for item in ps.listen(): if item['type'] == 'message': return item.get('data') def p_sub_not_loop(self, k): """ PSub 非无限循环,取到结果即退出 :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.psubscribe(ch) for item in ps.listen(): if item['type'] == 'pmessage': return item.get('data') ================================================ FILE: libs/redis_queue.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: redis_queue.py @time: 2018-02-10 15:25 """ import redis class RedisQueue(object): """Simple Queue with Redis Backend""" def __init__(self, name, namespace='queue', redis_client=None, **redis_kwargs): """The default connection parameters are: host='localhost', port=6379, db=0""" self.__db = redis_client or redis.Redis(**redis_kwargs) self.key = '%s:%s' % (namespace, name) def qsize(self): """Return the approximate size of the queue.""" return self.__db.llen(self.key) def empty(self): """Return True if the queue is empty, False otherwise.""" return self.qsize() == 0 def put(self, item): """Put item into the queue.""" self.__db.rpush(self.key, item) def get(self, block=True, timeout=None): """Remove and return an item from the queue. 
If optional args block is true and timeout is None (the default), block if necessary until an item is available.""" if block: # ('queue:test', 'hello world') item = self.__db.blpop(self.key, timeout=timeout) else: # hello world item = self.__db.lpop(self.key) if isinstance(item, tuple): item = item[1] return item def get_nowait(self): """Equivalent to get(False).""" return self.get(False) ================================================ FILE: libs/rk.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: rk.py @time: 2018-02-10 15:25 """ from hashlib import md5 import requests class RKClient(object): def __init__(self, username, password, soft_id, soft_key): self.username = username self.password = md5(password).hexdigest() self.soft_id = soft_id self.soft_key = soft_key self.base_params = { 'username': self.username, 'password': self.password, 'softid': self.soft_id, 'softkey': self.soft_key, } self.headers = { 'Connection': 'Keep-Alive', 'Expect': '100-continue', 'User-Agent': 'ben', } def rk_create(self, im, im_type, timeout=60): """ im: 图片字节 im_type: 题目类型 """ params = { 'typeid': im_type, 'timeout': timeout, } params.update(self.base_params) files = {'image': ('a.jpg', im)} r = requests.post( 'http://api.ruokuai.com/create.json', data=params, files=files, headers=self.headers, timeout=timeout ) return r.json() def rk_report_error(self, im_id): """ im_id:报错题目的ID """ params = { 'id': im_id, } params.update(self.base_params) r = requests.post( 'http://api.ruokuai.com/reporterror.json', data=params, headers=self.headers, timeout=30 ) return r.json() if __name__ == '__main__': rc = RKClient('username', 'password', 'soft_id', 'soft_key') im = open('a.jpg', 'rb').read() print(rc.rk_create(im, 3040)) ================================================ FILE: libs/weed_fs.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: weed_fs.py @time: 2018-02-10 15:25 """ import csv # from urlparse import urlparse # PY2 # from urllib.parse import urlparse # PY3 from future.moves.urllib.parse import urlparse import requests from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT class WeedFSClient(object): request_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' } def __init__(self, weed_fs_url): self.weed_fs_url = weed_fs_url def _get_assign(self): """ 获取分配的资源(url fid) 接口消息 - 正确: {"fid":"1,014e123ade","url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080","count":1} 接口消息 - 错误: {"error":"No free volumes left!"} """ url = '%s/dir/assign' % self.weed_fs_url res = requests.get(url, timeout=REQUESTS_TIME_OUT).json() if 'error' in res: raise Exception(res['error']) return res def _get_locations(self, fid): """ 获取文件服务器列表 {"volumeId":"1","locations":[{"url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080"}]} """ volume_id = fid.split(',')[0] url = '%s/dir/lookup?volumeId=%s' % (self.weed_fs_url, volume_id) return requests.get(url, timeout=REQUESTS_TIME_OUT).json() def save_file(self, local_file_path=None, remote_file_path=None, file_obj=None): """ 保存本地文件至weed_fs文件系统 {"name":"test.csv","size":425429} """ assign = self._get_assign() url = 'http://%s/%s' % (assign['url'], assign['fid']) if local_file_path: file_obj = 
open(local_file_path, 'rb') elif remote_file_path: headers = {'Host': urlparse(remote_file_path).netloc} # 防反爬, 指定图片 Host headers.update(self.request_headers) res = requests.get(remote_file_path, headers=headers, timeout=REQUESTS_TIME_OUT) if res.status_code == 200: file_obj = res.content else: raise Exception('File does not exist') elif not file_obj: raise Exception('File does not exist') res = requests.post(url, files={'file': file_obj}, timeout=REQUESTS_TIME_OUT) return dict(res.json(), **assign) def get_file_url(self, fid, separator=None): """ 获取文件链接 """ locations = self._get_locations(fid) public_url = locations['locations'][0]['publicUrl'] return 'http://%s/%s' % (public_url, fid.replace(',', separator) if separator else fid) def read_csv(self, fid, encoding=None): """ 逐行读取远程csv文件 :param fid: :param encoding: 'gbk'/'utf-8' :return: """ file_url = self.get_file_url(fid) download = requests.get(file_url, timeout=REQUESTS_TIME_OUT) csv_rows = csv.reader(download.iter_lines(), delimiter=',', quotechar='"') for csv_row in csv_rows: line = [item.decode(encoding, 'ignore') if encoding else item for item in csv_row] yield line ================================================ FILE: logs/index.html ================================================ Title ================================================ FILE: maps/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:58 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: maps/channel.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: channel.py @time: 2018-02-10 18:13 """ channel_name_map = { } ================================================ FILE: maps/platform.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: platform.py @time: 2018-02-10 17:58 """ WEIXIN = 1 WEIBO = 2 TOUTIAO = 3 platform_name_map = { 1: u'微信', 2: u'微博', 3: u'头条', } ================================================ FILE: models/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: models/news.py ================================================ # coding: utf-8 from sqlalchemy import Column, DateTime, Index, Integer, String, text from sqlalchemy.ext.declarative import declarative_base Base = declarative_base() metadata = Base.metadata def to_dict(self): return {c.name: getattr(self, c.name, None) for c in self.__table__.columns} Base.to_dict = to_dict class Channel(Base): __tablename__ = 'channel' id = Column(Integer, primary_key=True) code = Column(String(20), unique=True) name = Column(String(20)) description = Column(String(500), server_default=text("''")) create_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) class FetchResult(Base): __tablename__ = 'fetch_result' __table_args__ = ( Index('idx_platform_author_id', 'platform_id', 
'article_author_id'), Index('idx_platform_article_id', 'platform_id', 'article_id', unique=True) ) id = Column(Integer, primary_key=True) task_id = Column(Integer, nullable=False, index=True) platform_id = Column(Integer, server_default=text("'0'")) platform_name = Column(String(50), server_default=text("''")) channel_id = Column(Integer, server_default=text("'0'")) channel_name = Column(String(50), server_default=text("''")) article_id = Column(String(50), server_default=text("''")) article_url = Column(String(512), server_default=text("''")) article_title = Column(String(100), server_default=text("''")) article_author_id = Column(String(100), server_default=text("''")) article_author_name = Column(String(100), server_default=text("''")) article_tags = Column(String(100), server_default=text("''")) article_abstract = Column(String(500), server_default=text("''")) article_content = Column(String) article_pub_time = Column(DateTime, index=True, server_default=text("'1000-01-01 00:00:00'")) create_time = Column(DateTime, nullable=False, index=True, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, index=True, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) class FetchTask(Base): __tablename__ = 'fetch_task' __table_args__ = ( Index('idx_platform_follow_id', 'platform_id', 'follow_id', unique=True), ) id = Column(Integer, primary_key=True) platform_id = Column(Integer, server_default=text("'0'")) channel_id = Column(Integer, server_default=text("'0'")) follow_id = Column(String(45), server_default=text("''")) follow_name = Column(String(45), server_default=text("''")) avatar_url = Column(String(512), server_default=text("''")) fetch_url = Column(String(512), server_default=text("''")) flag_enabled = Column(Integer, server_default=text("'0'")) description = Column(String(500), server_default=text("''")) create_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) class LogTaskScheduling(Base): __tablename__ = 'log_task_scheduling' id = Column(Integer, primary_key=True) platform_id = Column(Integer, server_default=text("'0'")) platform_name = Column(String(50), server_default=text("''")) spider_name = Column(String(45), server_default=text("''")) task_quantity = Column(Integer, server_default=text("'0'")) create_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) ================================================ FILE: news/__init__.py ================================================ ================================================ FILE: news/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html import scrapy class FetchTaskItem(scrapy.Item): """ table_name: fetch_task primary_key: id """ follow_id = scrapy.Field() fetch_url = scrapy.Field() description = scrapy.Field() platform_id = scrapy.Field() channel_id = scrapy.Field() avatar_url = scrapy.Field() flag_enabled = scrapy.Field() follow_name = scrapy.Field() class FetchResultItem(scrapy.Item): """ table_name: fetch_result primary_key: id """ article_title = scrapy.Field() platform_name = scrapy.Field() task_id = scrapy.Field() channel_id = 
scrapy.Field() article_author_name = scrapy.Field() article_content = scrapy.Field() platform_id = scrapy.Field() channel_name = scrapy.Field() article_url = scrapy.Field() article_abstract = scrapy.Field() article_author_id = scrapy.Field() article_tags = scrapy.Field() article_id = scrapy.Field() article_pub_time = scrapy.Field() class ChannelItem(scrapy.Item): """ table_name: channel primary_key: id """ code = scrapy.Field() description = scrapy.Field() name = scrapy.Field() ================================================ FILE: news/middlewares/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: news/middlewares/anti_spider.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from __future__ import unicode_literals import time from scrapy.exceptions import IgnoreRequest from scrapy.exceptions import NotConfigured from tools.cookies import del_cookies from tasks.jobs_weixin import set_anti_spider_task, sub_anti_spider class AntiSpiderMiddleware(object): """ 反爬中间件 配置说明: RETRY_ENABLED 默认: True RETRY_TIMES 默认: 2 RETRY_HTTP_CODES 默认: [500, 502, 503, 504, 400, 408] """ def __init__(self, settings): if not settings.getbool('RETRY_ENABLED'): raise NotConfigured self.max_retry_times = settings.getint('RETRY_TIMES') self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') or 1 @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def process_request(self, request, spider): # 处理微信反爬(反爬机制一, sogou) if spider.name in ['weixin'] and 'antispider' in request.url: # 获取来源链接 redirect_urls = request.meta['redirect_urls'] # 清理失效 cookies cookies_id = request.meta['cookiejar'] del_cookies(spider.name, cookies_id) # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0])) raise IgnoreRequest( 'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0])) def process_response(self, request, response, spider): # 处理微信反爬(反爬机制二, weixin) if spider.name in ['weixin']: title = response.xpath('//title/text()').extract_first(default='').strip() if title == '请输入验证码': # 设置反爬处理任务 msg = { 'url': response.url, 'time': time.strftime('%Y-%m-%d %H:%M:%S') } set_anti_spider_task(spider.name, msg) # 订阅处理结果 anti_spider_result = sub_anti_spider(spider.name) if not anti_spider_result.get('status'): return response # 请求重试 retry_req = request.copy() retry_req.dont_filter = True # 必须设置(禁止重复请求被过滤掉) retry_req.priority = request.priority + self.priority_adjust return retry_req return response ================================================ FILE: news/middlewares/content_type.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html class ContentTypeGb2312Middleware(object): """ 处理不规范的页面(优先级降低至580之后才能生效) 原因: 默认配置的 DOWNLOADER_MIDDLEWARES 包含 MetaRefreshMiddleware 当请求页面存在如 Content-Location 类似的 header 时, 会触发重定向请求 指定 Content-Type 为 gb2312 """ def process_response(self, 
request, response, spider): response.headers['Content-Type'] = 'text/html; charset=gb2312' return response ================================================ FILE: news/middlewares/de_duplication_request.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy.exceptions import IgnoreRequest from tools.duplicate import is_dup_detail class DeDuplicationRequestMiddleware(object): """ 去重 - 请求 (数据结构:集合) """ def process_request(self, request, spider): if not request.url: return None channel_id = request.meta.get('channel_id', 0) # 处理详情页面(忽略列表页面)与pipeline配合 if is_dup_detail(request.url, spider.name, channel_id): raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url)) ================================================ FILE: news/middlewares/httpproxy.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy.exceptions import NotConfigured from tools.proxies import get_proxy, del_proxy class HttpProxyMiddleware(object): """ 代理中间件 """ def __init__(self, settings): if not settings.getbool('RETRY_ENABLED'): raise NotConfigured self.max_retry_times = settings.getint('RETRY_TIMES') self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') or 1 @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def process_request(self, request, spider): # request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT" # 当前请求代理(保证重试过程,代理一致) request_proxy = request.meta.get('proxy') or get_proxy(spider.name) request.meta['proxy'] = request_proxy spider.log(request.meta) def process_exception(self, request, exception, spider): error_proxy = request.meta.get('proxy') if not error_proxy: return None # 重试失败(默认重试2次,共请求3次),删除代理 if request.meta.get('retry_times', 0) >= self.max_retry_times: del_proxy(spider.name, error_proxy) spider.log('%s del proxy: %s, error reason: %s' % (spider.name, error_proxy, exception)) return None ================================================ FILE: news/middlewares/useragent.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html import random class UserAgentMiddleware(object): """ Randomly rotate user agents based on a list of predefined ones """ def __init__(self, agents): self.agents = agents @classmethod def from_crawler(cls, crawler): return cls(crawler.settings.getlist('USER_AGENTS')) def process_request(self, request, spider): request.headers.setdefault('User-Agent', random.choice(self.agents)) # request.headers.setdefault('User-Agent', self.agents[0]) ================================================ FILE: news/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class NewsSpiderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. 
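# NOTE: this class and NewsDownloaderMiddleware further below are the default stubs
# generated by `scrapy startproject`; they do not appear to be enabled anywhere
# (SPIDER_MIDDLEWARES / DOWNLOADER_MIDDLEWARES stay commented out in news/settings.py).
# The middlewares the spiders actually use live in the news/middlewares/ package and
# are wired up per spider via custom_settings.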
@classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Response, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class NewsDownloaderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. 
# Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: news/pipelines/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: news/pipelines/de_duplication_request.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from news.items import FetchResultItem from tools.duplicate import is_dup_detail, add_dup_detail class DeDuplicationRequestPipeline(object): """ 去重 - 请求 注意: 1、置于数据存储 pipeline 之后 2、与 DeDuplicationRequestMiddleware 配合使用 """ def process_item(self, item, spider): spider_name = spider.name if isinstance(item, FetchResultItem): # 详细页url 加入去重集合 if not is_dup_detail(item['article_url'], spider_name, item['channel_id']): add_dup_detail(item['article_url'], spider_name, item['channel_id']) return item ================================================ FILE: news/pipelines/de_duplication_store_mysql.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from models.news import FetchResult from news.items import FetchResultItem from apps.client_db import db_session_mysql from tools.weixin import get_finger from maps.platform import WEIXIN, WEIBO from scrapy.exceptions import DropItem class DeDuplicationStoreMysqlPipeline(object): """ 去重 - 入库 注意: 1、置于数据存储 pipeline 之前 """ def process_item(self, item, spider): session = db_session_mysql() try: if isinstance(item, FetchResultItem): if spider.name == 'weixin': # 标题(微信只能通过标题去重, 因为链接带过期签名) article_id_count = session.query(FetchResult) \ .filter(FetchResult.platform_id == WEIXIN, FetchResult.article_id == get_finger(item['article_title'])) \ .count() if article_id_count: raise DropItem( '%s Has been duplication of article_title: %s' % (spider.name, item['article_title'])) if spider.name == 'weibo': # 详细链接(微博可以直接通过链接去重) article_url_count = session.query(FetchResult) \ .filter(FetchResult.platform_id == WEIBO, FetchResult.article_id == get_finger(item['article_url'])) \ .count() if article_url_count: raise DropItem( '%s Has been duplication of article_url: %s' % (spider.name, item['article_url'])) return item except Exception as e: raise e finally: session.close() ================================================ FILE: news/pipelines/exporter_csv.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from scrapy import signals from scrapy.exporters import CsvItemExporter class CsvExportPipeline(object): def __init__(self): self.files = {} self.exporter = None @classmethod def from_crawler(cls, crawler): pipeline = cls() 
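# Tie the exporter's lifecycle to the spider: spider_opened creates the
# per-spider <name>_items.csv file and starts exporting, spider_closed
# finishes exporting and closes the file.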
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file_csv = open('%s_items.csv' % spider.name, 'w+b') self.files[spider] = file_csv self.exporter = CsvItemExporter(file_csv) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file_csv = self.files.pop(spider) file_csv.close() def process_item(self, item, spider): self.exporter.export_item(item) return item ================================================ FILE: news/pipelines/img_remote_to_local_fs.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html import re # from urlparse import urljoin # PY2 # from urllib.parse import urljoin # PY3 from future.moves.urllib.parse import urljoin from news.items import FetchResultItem from libs.weed_fs import WeedFSClient from config import current_config WEED_FS_URL = current_config.WEED_FS_URL weed_fs_client = WeedFSClient(WEED_FS_URL) def remote_to_local(remote_file_path): """ 保存远程图片文件 :param remote_file_path: :return: """ remote_file_save_result = weed_fs_client.save_file(remote_file_path=remote_file_path) local_file_url = weed_fs_client.get_file_url(remote_file_save_result['fid'], '/') return local_file_url def add_src(html_body, base=''): """ 添加图片文件链接(1、添加真实链接;2、替换本地链接) :param html_body: :param base: :return: """ rule = r'data-src="(.*?)"' img_data_src_list = re.compile(rule, re.I).findall(html_body) for img_src in img_data_src_list: # 处理相对链接 if base: new_img_src = urljoin(base, img_src) if new_img_src.startswith('/'): continue # 远程转本地 local_img_src = remote_to_local(new_img_src) img_dict = { 'img_src': img_src, 'local_img_src': local_img_src } html_body = html_body.replace(img_src, '%(img_src)s" src="%(local_img_src)s' % img_dict) return html_body def replace_src(html_body, base=''): """ 替换图片文件链接(替换本地链接) :param html_body: :param base: :return: """ rule = r'src="(.*?)"' img_data_src_list = re.compile(rule, re.I).findall(html_body) for img_src in img_data_src_list: # 处理//,补充协议 if img_src.startswith('//'): img_src = 'http:%s' % img_src # 处理相对链接 if base: new_img_src = urljoin(base, img_src) if new_img_src.startswith('/'): continue # 远程转本地 local_img_src = remote_to_local(new_img_src) img_dict = { 'img_src': img_src, 'local_img_src': local_img_src } html_body = html_body.replace(img_src, '%(local_img_src)s" data-src="%(img_src)s' % img_dict) return html_body class ImgRemoteToLocalFSPipeline(object): """ 图片 远程链接 转 本地文件系统链接 注意: 1、置于数据存储 pipeline 之前 """ def process_item(self, item, spider): spider_name = spider.name # 读取抓取内容 if isinstance(item, FetchResultItem): if spider_name in ['weixin']: html_body = item['article_content'] base = item['article_url'] item['article_content'] = add_src(html_body, base) if spider_name in ['weibo']: html_body = item['article_content'] base = item['article_url'] item['article_content'] = replace_src(html_body, base) if spider_name in ['toutiao', 'toutiao_m']: html_body = item['article_content'] base = item['article_url'] item['article_content'] = replace_src(html_body, base) return item ================================================ FILE: news/pipelines/store_mysql.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your 
pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from models.news import FetchResult from news.items import FetchResultItem from apps.client_db import db_session_mysql class StoreMysqlPipeline(object): """ 基于 MySQL 的存储 """ def process_item(self, item, spider): session = db_session_mysql() try: if isinstance(item, FetchResultItem): fetch_result = FetchResult(**item) # 数据入库 session.add(fetch_result) session.flush() # session.commit() return item except Exception as e: raise e finally: session.close() ================================================ FILE: news/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html class NewsPipeline(object): def process_item(self, item, spider): return item ================================================ FILE: news/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for news project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'news' SPIDER_MODULES = ['news.spiders'] NEWSPIDER_MODULE = 'news.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'news (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs DOWNLOAD_DELAY = 2 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) COOKIES_ENABLED = True COOKIES_DEBUG = True # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'news.middlewares.NewsSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'news.middlewares.NewsDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'news.pipelines.NewsPipeline': 300, #} ITEM_PIPELINES = { 'news.pipelines.store_mysql.StoreMysqlPipeline': 400, } # Enable and configure the AutoThrottle extension (disabled by default) # See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' # USER_AGENTS USER_AGENTS = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52" ] ================================================ FILE: news/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. 
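The spiders defined in this package are normally started with `scrapy crawl <name>` (as in the README) or under supervisord via the etc/*.ini program sections. As a minimal sketch — assuming the script is run from the project root so that scrapy.cfg and news/settings.py are picked up — the same spiders can also be launched programmatically:

```python
# Minimal sketch: run the "ip" spider in-process instead of `scrapy crawl ip`.
# Assumes the working directory is the project root (where scrapy.cfg lives).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads news/settings.py
process.crawl('ip')   # spider name, resolved via SPIDER_MODULES
process.start()       # blocks until the crawl (and the Twisted reactor) finishes
```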
================================================ FILE: news/spiders/ip.py ================================================ # -*- coding: utf-8 -*- import scrapy class IpSpider(scrapy.Spider): """ IP代理测试 蜘蛛 重试3次,每次超时10秒 使用: 进入项目目录 $ scrapy crawl ip """ name = "ip" allowed_domains = ["ip.cn"] start_urls = ( 'https://ip.cn', ) custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, # 代理(cookie需要与代理IP关联) }, ITEM_PIPELINES={ 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, }, DOWNLOAD_TIMEOUT=10 ) def parse(self, response): info = response.xpath('//div[@class="well"]//code/text()').extract() ip_info = dict(zip(['ip', 'address'], info)) yield ip_info ================================================ FILE: news/spiders/toutiao_m.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import unicode_literals import json import time import scrapy from apps.client_db import get_item from maps.channel import channel_name_map from maps.platform import platform_name_map from models.news import FetchTask from news.items import FetchResultItem from tools.date_time import time_local_to_utc from tools.scrapy_tasks import pop_task from tools.toutiao_m import get_as_cp, ParseJsTt, parse_toutiao_js_body from tools.url import get_update_url class ToutiaoMSpider(scrapy.Spider): """ 头条蜘蛛 """ name = 'toutiao_m' allowed_domains = ['toutiao.com', 'snssdk.com'] custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ 'news.middlewares.de_duplication_request.DeDuplicationRequestMiddleware': 140, # 去重请求 # 'news.middlewares.anti_spider.AntiSpiderMiddleware': 160, # 反爬处理 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, # 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, }, ITEM_PIPELINES={ 'news.pipelines.de_duplication_store_mysql.DeDuplicationStoreMysqlPipeline': 400, # 去重存储 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, 'news.pipelines.de_duplication_request.DeDuplicationRequestPipeline': 500, # 去重请求 }, DOWNLOAD_DELAY=0.5 ) # start_urls = ['http://toutiao.com/'] # start_urls = ['https://www.toutiao.com/ch/news_finance/'] def start_requests(self): """ 入口准备 :return: """ url_params = { 'version_code': '6.4.2', 'version_name': '', 'device_platform': 'iphone', 'tt_from': 'weixin', 'utm_source': 'weixin', 'utm_medium': 'toutiao_ios', 'utm_campaign': 'client_share', 'wxshare_count': '1', } task_id = pop_task(self.name) if not task_id: print('%s task is empty' % self.name) return print('%s task id: %s' % (self.name, task_id)) task_item 
= get_item(FetchTask, task_id) fetch_url = 'http://m.toutiao.com/profile/%s/' % task_item.follow_id url_profile = get_update_url(fetch_url, url_params) meta = { 'task_id': task_item.id, 'platform_id': task_item.platform_id, 'channel_id': task_item.channel_id, 'follow_id': task_item.follow_id, 'follow_name': task_item.follow_name, } yield scrapy.Request(url=url_profile, callback=self.get_profile, meta=meta) def get_profile(self, response): userid = response.xpath('//button[@itemid="topsharebtn"]/@data-userid').extract_first(default='') mediaid = response.xpath('//button[@itemid="topsharebtn"]/@data-mediaid').extract_first(default='') meta = dict(response.meta, userid=userid, mediaid=mediaid) url = 'http://open.snssdk.com/jssdk_signature/' url_params = { 'appid': 'wxe8b89be1715734a6', 'noncestr': 'Wm3WZYTPz0wzccnW', 'timestamp': '%13d' % (time.time() * 1000), 'callback': 'jsonp2', } url_jssdk_signature = get_update_url(url, url_params) yield scrapy.Request(url=url_jssdk_signature, callback=self.jssdk_signature, meta=meta) def jssdk_signature(self, response): AS, CP = get_as_cp() jsonp_index = 3 url = 'https://www.toutiao.com/pgc/ma/' url_params = { 'page_type': 1, 'max_behot_time': '', 'uid': response.meta['userid'], 'media_id': response.meta['mediaid'], 'output': 'json', 'is_json': 1, 'count': 20, 'from': 'user_profile_app', 'version': 2, 'as': AS, 'cp': CP, 'callback': 'jsonp%d' % jsonp_index, } url_article_list = get_update_url(url, url_params) meta = dict(response.meta, jsonp_index=jsonp_index) yield scrapy.Request(url=url_article_list, callback=self.parse_article_list, meta=meta) def parse_article_list(self, response): """ 文章列表 :param response: :return: """ body = response.body_as_unicode() jsonp_text = 'jsonp%d' % response.meta.get('jsonp_index', 0) result = json.loads(body.lstrip('%s(' % jsonp_text).rstrip(')')) # 翻页 TODO FIX has_more = result.get('has_more') if has_more: max_behot_time = result['next']['max_behot_time'] AS, CP = get_as_cp() jsonp_index = response.meta.get('jsonp_index', 0) + 1 url_params_next = { 'max_behot_time': max_behot_time, 'as': AS, 'cp': CP, 'callback': 'jsonp%d' % jsonp_index, } url_article_list_next = get_update_url(response.url, url_params_next) meta = dict(response.meta, jsonp_index=jsonp_index) yield scrapy.Request(url=url_article_list_next, callback=self.parse_article_list, meta=meta) # 详情 data_list = result.get('data', []) for data_item in data_list: detail_url = data_item.get('source_url') meta = dict(response.meta, detail_url=detail_url) yield scrapy.Request(url=detail_url, callback=self.parse_article_detail, meta=meta) def parse_article_detail(self, response): """ 文章详情 :param response: :return: """ toutiao_body = response.body_as_unicode() js_body = parse_toutiao_js_body(toutiao_body, response.meta['detail_url']) if not js_body: return pj = ParseJsTt(js_body=js_body) article_id = pj.parse_js_item_id() article_title = pj.parse_js_title() article_abstract = pj.parse_js_abstract() article_content = pj.parse_js_content() article_pub_time = pj.parse_js_pub_time() article_tags = pj.parse_js_tags() fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = article_id 
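# The assignments below fill in the remaining fetch_result columns declared in
# models/news.py (FetchResult); article_pub_time is normalised to UTC via
# tools.date_time.time_local_to_utc before being formatted for storage.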
fetch_result_item['article_title'] = article_title fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S') fetch_result_item['article_url'] = response.url or response.meta['detail_url'] fetch_result_item['article_tags'] = article_tags fetch_result_item['article_abstract'] = article_abstract fetch_result_item['article_content'] = article_content yield fetch_result_item ================================================ FILE: news/spiders/weibo.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import unicode_literals import json import re import time from datetime import datetime import scrapy import six from lxml.html import fromstring, tostring from apps.client_db import get_item from maps.channel import channel_name_map from maps.platform import platform_name_map from models.news import FetchTask from news.items import FetchResultItem from tools.date_time import time_local_to_utc from tools.scrapy_tasks import pop_task from tools.url import get_update_url, get_request_finger from tools.weibo import get_su, get_login_data class WeiboSpider(scrapy.Spider): """ 微博蜘蛛 """ name = 'weibo' allowed_domains = ['weibo.com', 'weibo.cn', 'sina.com.cn', 'sina.cn'] custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ 'news.middlewares.de_duplication_request.DeDuplicationRequestMiddleware': 140, # 去重请求 # 'news.middlewares.anti_spider.AntiSpiderMiddleware': 160, # 反爬处理 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, # 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, }, ITEM_PIPELINES={ 'news.pipelines.de_duplication_store_mysql.DeDuplicationStoreMysqlPipeline': 400, # 去重存储 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, 'news.pipelines.de_duplication_request.DeDuplicationRequestPipeline': 500, # 去重请求 }, DOWNLOAD_DELAY=0.5 ) passport_weibo_login_url = 'https://passport.weibo.cn/signin/login' start_urls = ['http://weibo.cn/'] uid = 0 login_form_data = { 'username': '', 'password': '', 'savestate': '1', 'r': '', 'ec': '0', 'pagerefer': '', 'entry': 'mweibo', 'wentry': '', 'loginfrom': '', 'client_id': '', 'code': '', 'qq': '', 'mainpageflag': '1', 'hff': '', 'hfp': '' } def parse(self, response): return self.passport_weibo_login() def passport_weibo_login(self): yield scrapy.Request(url=self.passport_weibo_login_url, callback=self.login_sina_sso_prelogin) def login_sina_sso_prelogin(self, response): login_data = get_login_data() self.login_form_data.update(login_data) login_sina_sso_prelogin_url = 'https://login.sina.com.cn/sso/prelogin.php' query_payload = { 'checkpin': '1', 'entry': 'mweibo', 'su': get_su(login_data.get('username', '')), 'callback': 'jsonpcallback%13d' % (time.time()*1000), } request_url = get_update_url(login_sina_sso_prelogin_url, query_payload) yield scrapy.Request(url=request_url, callback=self.passport_weibo_sso_login) def passport_weibo_sso_login(self, 
response): passport_weibo_sso_login_url = 'https://passport.weibo.cn/sso/login' yield scrapy.FormRequest( url=passport_weibo_sso_login_url, formdata=self.login_form_data, callback=self.after_login ) def after_login(self, response): data = { 'savestate': '1', 'callback': 'jsonpcallback%13d' % (time.time()*1000), } res = response.body_as_unicode() info = json.loads(res) crossdomainlist = info['data']['crossdomainlist'] self.uid = info['data']['uid'] url_weibo_com = get_update_url(crossdomainlist['weibo.com'], data) url_sina_com_cn = get_update_url(crossdomainlist['sina.com.cn'], data) url_weibo_cn = get_update_url(crossdomainlist['weibo.cn'], data) url_items = { 'url_weibo_com': url_weibo_com, 'url_sina_com_cn': url_sina_com_cn, 'url_weibo_cn': url_weibo_cn, } meta = dict(response.meta, **url_items) # 跨域处理 weibo.com yield scrapy.Request(url=url_weibo_com, callback=self.crossdomain_weibo_com, meta=meta) def crossdomain_weibo_com(self, response): """ 跨域处理 weibo.com :param response: :return: """ # 跨域处理 sina.com.cn url_sina_com_cn = response.meta['url_sina_com_cn'] yield scrapy.Request(url=url_sina_com_cn, callback=self.crossdomain_sina_com_cn, meta=response.meta) def crossdomain_sina_com_cn(self, response): """ 跨域处理 sina.com.cn :param response: :return: """ # 跨域处理 weibo.cn url_weibo_cn = response.meta['url_weibo_cn'] yield scrapy.Request(url=url_weibo_cn, callback=self.crossdomain_weibo_cn, meta=response.meta) def crossdomain_weibo_cn(self, response): """ 跨域处理 weibo.cn :param response: :return: """ # 获取登录状态 weibo.cn yield scrapy.Request(url='https://weibo.cn/', callback=self.weibo_cn_index) def weibo_cn_index(self, response): """ 获取登录状态 :param response: :return: """ print(response.url) title = response.xpath('//title/text()').extract_first() if title == '我的首页': print('登录成功') # follow_url = 'https://weibo.cn/%s/follow' % self.uid # yield scrapy.Request(url=follow_url, callback=self.parse_follow_list) # 获取登录状态 weibo.com yield scrapy.Request(url='https://weibo.com/', callback=self.weibo_com_index) else: print('登录失败') def weibo_com_index(self, response): """ 获取登录状态 :param response: :return: """ print(response.url) title = response.xpath('//title/text()').extract_first() if '我的首页' in title: print('登录成功') # follow_url = 'https://weibo.cn/%s/follow' % self.uid # yield scrapy.Request(url=follow_url, callback=self.parse_follow_list) return self.get_article_task() else: print('登录失败') def parse_follow_list(self, response): """ 已关注列表 """ print(response.url) # 进入关注用户页面 follows = response.xpath('//table//tr/td/a[1]/@href').extract() for follow in follows: yield scrapy.Request(url=follow, callback=self.follow_home_list) # 关注列表翻页 next_url = response.xpath('//div[@id="pagelist"]//a[contains(text(), "下页")]/@href').extract_first(default='') next_url = response.urljoin(next_url) if next_url == response.url: print('当前条件列表页最后一页:%s' % response.url) else: yield scrapy.Request(url=next_url, callback=self.parse_follow_list) def follow_home_list(self, response): """ 已关注用户首页列表 """ contents = response.xpath('//div[@class="c"]//span[@class="ctt"]/text()').extract() for content in contents: print(content) def get_article_task(self): """ 文章抓取入口 :return: """ task_id = pop_task(self.name) if not task_id: print('%s task is empty' % self.name) return print('%s task id: %s' % (self.name, task_id)) task_item = get_item(FetchTask, task_id) article_id = task_item.follow_id article_list_url = 'https://weibo.com/p/%s/wenzhang' % article_id meta = { 'task_id': task_item.id, 'platform_id': task_item.platform_id, 'channel_id': 
task_item.channel_id, 'follow_id': task_item.follow_id, 'follow_name': task_item.follow_name, } yield scrapy.Request(url=article_list_url, callback=self.parse_article_list, meta=meta) @staticmethod def replace_all(input_html, replace_dict): """ 用字典实现批量替换 """ for k, v in six.iteritems(replace_dict): input_html = input_html.replace(k, v) return input_html def parse_article_list(self, response): """ 文章列表解析 没有翻页特征 下一页<\/span> 解析链接 href=\"\/p\/1005051627825392\/wenzhang?pids=Pl_Core_ArticleList__61&cfs=600&Pl_Core_ArticleList__61_filter=&Pl_Core_ArticleList__61_page=6#Pl_Core_ArticleList__61\" """ print('task_url: %s' % response.url) # 页面解析(微博是JS动态数据, 无法直接解析页面) article_list_body = response.body_as_unicode() article_list_rule = r'' article_list_re_parse = re.compile(article_list_rule, re.S).findall(article_list_body) if not article_list_re_parse: return article_list_html = ''.join(article_list_re_parse) # 转义字符处理 article_list_html = article_list_html.replace('\\r', '') article_list_html = article_list_html.replace('\\t', '') article_list_html = article_list_html.replace('\\n', '') article_list_html = article_list_html.replace('\\"', '"') article_list_html = article_list_html.replace('\\/', '/') article_list_doc = fromstring(article_list_html) article_list_doc_parse = article_list_doc.xpath('//div[@class="text_box"]') for article_item in article_list_doc_parse: article_detail_url = article_item.xpath('./div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/@href') article_detail_title = article_item.xpath('./div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/text()') article_detail_abstract = article_item.xpath('./div[@class="text"]/a[@class="S_txt1"]/text()') if not (article_detail_url and article_detail_title): continue article_detail_url = article_detail_url[0].strip() article_detail_url = response.urljoin(article_detail_url) article_detail_title = article_detail_title[0].strip() article_detail_abstract = article_detail_abstract[0].strip() if article_detail_abstract else '' meta_article_item = { 'article_url': article_detail_url, 'article_title': article_detail_title, 'article_abstract': article_detail_abstract, 'article_id': get_request_finger(article_detail_url), } meta = dict(response.meta, **meta_article_item) # 两种不同类型页面 if '/ttarticle/p/show?id=' in article_detail_url: yield scrapy.Request(url=article_detail_url, callback=self.parse_article_detail_html, meta=meta) else: yield scrapy.Request(url=article_detail_url, callback=self.parse_article_detail_js, meta=meta) # 翻页处理 next_url_parse = article_list_doc.xpath('//a[@class="page next S_txt1 S_line1"]/@href') if not next_url_parse: print('当前条件列表页最后一页:%s' % response.url) else: next_url = next_url_parse[0] next_url = response.urljoin(next_url) print(next_url) yield scrapy.Request(url=next_url, callback=self.parse_article_list, meta=response.meta) def parse_article_detail_html(self, response): """ 文章详情解析 html 版 :param response: :return: """ article_title = response.xpath('//div[@class="title"]/text()').extract_first(default='') article_pub_time = response.xpath('//span[@class="time"]/text()').extract_first(default='') article_content = response.xpath('//div[@class="WB_editor_iframe"]').extract_first(default='') fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] 
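# Editorial note (added comment, not in the original source): the Weibo article list and the
# JS-rendered detail pages arrive embedded in JavaScript string literals, so the backslash
# escapes must be undone before lxml's fromstring() can parse the markup, e.g.
#   'href=\"\/p\/1005051627825392\/wenzhang?...'  ->  'href="/p/1005051627825392/wenzhang?...'
# The chained .replace() calls in parse_article_list above (and again in
# parse_article_detail_js below) do exactly this; the replace_all() helper expresses the same
# substitutions as a single dict.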
fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = response.meta['article_id'] fetch_result_item['article_title'] = article_title fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = article_pub_time fetch_result_item['article_url'] = response.url fetch_result_item['article_tags'] = '' fetch_result_item['article_abstract'] = response.meta['article_abstract'] fetch_result_item['article_content'] = article_content yield fetch_result_item @staticmethod def trans_time(time_str): """ 时间转换 :param time_str: :return: """ time_rule = r'(\d+)年(\d+)月(\d+)日 (\d+):(\d+)' time_parse = re.compile(time_rule, re.S).findall(time_str) if not time_parse: return time.strftime('%Y-%m-%d %H:%M:%S') return datetime(*[int(i) for i in time_parse[0]]).strftime('%Y-%m-%d %H:%M:%S') def parse_article_detail_js(self, response): """ 文章详情解析 js 版 :param response: :return: """ article_detail_body = response.body_as_unicode() article_detail_rule = r'' article_detail_re_parse = re.compile(article_detail_rule, re.S).findall(article_detail_body) if not article_detail_re_parse: return article_detail_html = ''.join(article_detail_re_parse) # 转义字符处理 article_detail_html = article_detail_html.replace('\\r', '') article_detail_html = article_detail_html.replace('\\t', '') article_detail_html = article_detail_html.replace('\\n', '') article_detail_html = article_detail_html.replace('\\"', '"') article_detail_html = article_detail_html.replace('\\/', '/') article_detail_doc = fromstring(article_detail_html) article_title_parse = article_detail_doc.xpath('//h1[@class="title"]/text()') article_title = article_title_parse[0].strip() if article_title_parse else '' article_pub_time_parse = article_detail_doc.xpath('//span[@class="time"]/text()') article_pub_time = self.trans_time(article_pub_time_parse[0].strip()) if article_pub_time_parse else time.strftime('%Y-%m-%d %H:%M:%S') article_content_parse = article_detail_doc.xpath('//div[@class="WBA_content"]') article_content = tostring(article_content_parse[0], encoding='unicode').strip() if article_content_parse else '' fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = response.meta['article_id'] fetch_result_item['article_title'] = article_title fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S') fetch_result_item['article_url'] = response.url fetch_result_item['article_tags'] = '' fetch_result_item['article_abstract'] = response.meta['article_abstract'] fetch_result_item['article_content'] = article_content yield fetch_result_item ================================================ FILE: news/spiders/weixin.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import unicode_literals import scrapy from apps.client_db import get_item from maps.channel 
import channel_name_map from maps.platform import platform_name_map from models.news import FetchTask from news.items import FetchResultItem from tools.cookies import get_cookies from tools.date_time import time_local_to_utc from tools.scrapy_tasks import pop_task from tools.url import get_update_url from tools.weixin import parse_weixin_js_body, ParseJsWc, check_article_title_duplicate class WeixinSpider(scrapy.Spider): """ 微信公众号蜘蛛 因微信公众号详情链接是带有效期签名的动态链接, 故无法使用请求去重中间件 """ name = 'weixin' allowed_domains = ['mp.weixin.qq.com', 'weixin.qq.com', 'qq.com', 'sogou.com'] custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ # 'news.middlewares.de_duplication_request.DeDuplicationRequestMiddleware': 140, # 去重请求 'news.middlewares.anti_spider.AntiSpiderMiddleware': 160, # 反爬处理 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, # 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, # 代理(cookie需要与代理IP关联) }, ITEM_PIPELINES={ 'news.pipelines.de_duplication_store_mysql.DeDuplicationStoreMysqlPipeline': 400, # 去重存储 # 'news.pipelines.img_remote_to_local_fs.ImgRemoteToLocalFSPipeline': 440, 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, # 'news.pipelines.de_duplication_request.DeDuplicationRequestPipeline': 500, # 去重请求 }, DOWNLOAD_DELAY=0.5 ) def start_requests(self): """ 入口准备 :return: """ boot_url = 'http://weixin.sogou.com/weixin' task_id = pop_task(self.name) if not task_id: print('%s task is empty' % self.name) return print('%s task id: %s' % (self.name, task_id)) task_item = get_item(FetchTask, task_id) cookies_id, cookies = get_cookies(self.name) url_params = { 'type': 1, # 'query': task_item.follow_id, 'query': task_item.follow_name.encode('utf-8'), } url_profile = get_update_url(boot_url, url_params) meta = { 'cookiejar': cookies_id, 'task_id': task_item.id, 'platform_id': task_item.platform_id, 'channel_id': task_item.channel_id, 'follow_id': task_item.follow_id, 'follow_name': task_item.follow_name, } yield scrapy.Request(url=url_profile, cookies=cookies, callback=self.parse_account_search_list, meta=meta) def parse_article_search_list(self, response): """ 解析微信文章 搜索列表页面 (废弃) :param response: :return: """ news_links = response.xpath('//div[@class="txt-box"]/h3/a/@href').extract() for new_link in news_links: yield scrapy.Request(url=new_link, callback=self.parse_detail) def parse_account_search_list(self, response): """ 解析公众账号 搜索列表页面 :param response: :return: """ account_link = response.xpath('//div[@class="txt-box"]//a/@href').extract_first() if account_link: yield scrapy.Request(url=account_link, callback=self.parse_account_article_list, meta=response.meta) def parse_account_article_list(self, response): """ 解析公众账号 文章列表页面 :param response: :return: """ article_list_body = response.body_as_unicode() js_body = parse_weixin_js_body(article_list_body, response.url) if not js_body: return pj = ParseJsWc(js_body=js_body) article_list = pj.parse_js_msg_list() for article_item in article_list: # 标题去重 if check_article_title_duplicate(article_item['article_title']): continue meta = dict(response.meta, **article_item) yield 
scrapy.Request(url=article_item['article_url'], callback=self.parse_detail, meta=meta) def parse_detail(self, response): """ 详细页面 :param response: :return: """ article_content = ''.join([i.strip() for i in response.xpath('//div[@id="js_content"]/*').extract()]) # 原创内容处理(处理内容为空) if not article_content: share_source_url = response.xpath('//a[@id="js_share_source"]/@href').extract_first() yield scrapy.Request(url=share_source_url, callback=self.parse_detail, meta=response.meta) return fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = response.meta['article_id'] fetch_result_item['article_title'] = response.meta['article_title'] fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = time_local_to_utc(response.meta['article_pub_time']).strftime('%Y-%m-%d %H:%M:%S') fetch_result_item['article_url'] = response.meta['article_url'] fetch_result_item['article_tags'] = '' fetch_result_item['article_abstract'] = response.meta['article_abstract'] fetch_result_item['article_content'] = article_content yield fetch_result_item ================================================ FILE: requirements-py2.txt ================================================ asn1crypto==0.24.0 attrs==19.1.0 Automat==0.7.0 certifi==2019.3.9 cffi==1.12.3 chardet==3.0.4 constantly==15.1.0 cryptography==2.6.1 cssselect==1.0.3 enum34==1.1.6 functools32==3.2.3.post2 future==0.17.1 hyperlink==19.0.0 idna==2.8 incremental==17.5.0 inflect==2.1.0 ipaddress==1.0.22 lxml==4.3.3 mysqlclient==1.4.2.post1 parsel==1.5.1 Pillow==6.0.0 psutil==5.6.2 pyasn1==0.4.5 pyasn1-modules==0.2.5 pycparser==2.19 PyDispatcher==2.0.5 PyExecJS==1.5.1 PyHamcrest==1.9.0 pyOpenSSL==19.0.0 queuelib==1.5.0 redis==3.2.1 requests==2.22.0 schedule==0.6.0 Scrapy==1.6.0 service-identity==18.1.0 six==1.12.0 sqlacodegen==1.1.6 SQLAlchemy==1.3.3 Twisted==19.2.0 urllib3==1.25.3 w3lib==1.20.0 zope.interface==4.6.0 ================================================ FILE: requirements-py3.txt ================================================ asn1crypto==0.24.0 attrs==19.1.0 Automat==0.7.0 certifi==2019.3.9 cffi==1.12.3 chardet==3.0.4 constantly==15.1.0 cryptography==2.6.1 cssselect==1.0.3 future==0.17.1 hyperlink==19.0.0 idna==2.8 incremental==17.5.0 inflect==2.1.0 lxml==4.3.3 mysqlclient==1.4.2.post1 parsel==1.5.1 Pillow==6.0.0 psutil==5.6.2 pyasn1==0.4.5 pyasn1-modules==0.2.5 pycparser==2.19 PyDispatcher==2.0.5 PyExecJS==1.5.1 PyHamcrest==1.9.0 pyOpenSSL==19.0.0 queuelib==1.5.0 redis==3.2.1 requests==2.22.0 schedule==0.6.0 Scrapy==1.6.0 service-identity==18.1.0 six==1.12.0 sqlacodegen==1.1.6 SQLAlchemy==1.3.3 Twisted==19.2.0 urllib3==1.25.3 w3lib==1.20.0 zope.interface==4.6.0 ================================================ FILE: scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = news.settings [deploy] #url = http://localhost:6800/ project = news 
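Editor's note (not part of the repository): the spiders above take their work from a per-spider task queue via `pop_task()`, and the scheduler jobs in `tasks/` below refill it via `put_task()` / `get_tasks_count()`. The real helpers live in `tools/scrapy_tasks.py`, which is not reproduced in this excerpt, so the following is only a hedged sketch of the assumed contract; the Redis key name `scrapy:tasks:<spider_name>` and the connection parameters are invented for illustration.

```python
# Hypothetical sketch of the task-queue contract used by the spiders and jobs.
# Assumptions: a shared Redis client and one Redis list per spider (key name invented here).
import redis

redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)


def _task_key(spider_name):
    return 'scrapy:tasks:%s' % spider_name


def put_task(spider_name, task_id):
    # tasks/job_put_tasks.py pushes FetchTask ids for one platform at a time
    redis_client.rpush(_task_key(spider_name), task_id)


def get_tasks_count(spider_name):
    # job_put_tasks() refuses to refill the queue until it has been drained
    return redis_client.llen(_task_key(spider_name))


def pop_task(spider_name):
    # each spider consumes one task id per crawl (start_requests / get_article_task)
    task_id = redis_client.lpop(_task_key(spider_name))
    if task_id is None:
        return None
    return task_id.decode('utf-8') if isinstance(task_id, bytes) else task_id
```

Under this reading, `job_put_tasks()` only refills the list once `get_tasks_count()` reports it empty, which matches the early return at the top of that job.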
================================================ FILE: tasks/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: tasks/job_put_tasks.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: job_put_tasks.py @time: 2018-02-10 17:16 """ import sys from models.news import FetchTask, FetchResult, LogTaskScheduling from apps.client_db import get_group, get_all from maps.platform import WEIXIN, WEIBO, TOUTIAO from tools.scrapy_tasks import put_task, get_tasks_count def job_put_tasks(spider_name): # 如果任务队列没有消耗完毕, 不处理 tasks_count = get_tasks_count(spider_name) if tasks_count: return True spider_map = { 'weixin': WEIXIN, 'weibo': WEIBO, 'toutiao': TOUTIAO, 'toutiao_m': TOUTIAO, } # TODO 稳定运行之后需要去掉 # task_exclude = [i.task_id for i in get_group(FetchResult, 'task_id', min_count=1)] task_list = get_all(FetchTask, FetchTask.platform_id == spider_map.get(spider_name)) c = 0 for task in task_list: # 排除任务 # if task.id in task_exclude: # continue put_task(spider_name, task.id) c += 1 if c % 100 == 0: print(c) print('put %s tasks count: %s' % (spider_name, c)) return True def usage(): contents = [ 'Example:', '\tpython job_put_tasks.py wx # 微信', '\tpython job_put_tasks.py wb # 微博', '\tpython job_put_tasks.py tm # 头条(M)', '\tpython job_put_tasks.py tt # 头条(PC)', ] print('\n'.join(contents)) def run(): """ 入口 """ # print(sys.argv) spider_name_maps = { 'wx': 'weixin', 'wb': 'weibo', 'tt': 'toutiao', 'tm': 'toutiao_m', } try: if len(sys.argv) > 1: spider_name = spider_name_maps.get(sys.argv[1]) if not spider_name: raise Exception('参数错误') job_put_tasks(spider_name) else: raise Exception('缺失参数') except Exception as e: print(e.message) usage() if __name__ == '__main__': run() ================================================ FILE: tasks/job_reboot_net_china_net.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: job_reboot_net_china_net.py @time: 2018-05-28 19:40 """ import time from libs.optical_modem import OpticalModemChinaNet from tools.net_status import get_reboot_net_status, del_reboot_net_status net_name = 'optical_modem_china_net' def job_reboot_net_china_net(): """ 重启中国电信光猫 :return: """ # reboot_net_status = get_reboot_net_status(net_name) # if not reboot_net_status: # return om_cn = OpticalModemChinaNet() om_cn.net_ip_o = om_cn.get_net_ip() om_cn.login() # 默认用户名、密码 om_cn.reboot() time.sleep(10) om_cn.net_ip_n = om_cn.get_net_ip() om_cn.check_reboot_status() del_reboot_net_status(net_name) ================================================ FILE: tasks/jobs_proxies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: jobs_proxies.py @time: 2018-03-13 17:22 """ from __future__ import print_function import sys from tools.proxies import add_proxy, len_proxy, fetch_proxy def job_proxies(spider_name, mix_num=0): if len_proxy(spider_name) <= mix_num: proxy_list = fetch_proxy() if not proxy_list: return add_proxy(spider_name, *proxy_list) print('%s add proxies: %s' % (spider_name, len(proxy_list))) def usage(): contents = [ 'Example:', '\tpython jobs_proxies.py ip # 测试', '\tpython 
jobs_proxies.py wx # 微信', '\tpython jobs_proxies.py wb # 微博', '\tpython jobs_proxies.py tt # 头条', ] print('\n'.join(contents)) def run(): """ 入口 """ # print(sys.argv) spider_name_maps = { 'wx': 'weixin', 'wb': 'weibo', 'tt': 'toutiao', } try: if len(sys.argv) > 1: spider_name = spider_name_maps.get(sys.argv[1], sys.argv[1]) if not spider_name: raise Exception('参数错误') job_proxies(spider_name) else: raise Exception('缺失参数') except Exception as e: print(e.message) usage() if __name__ == '__main__': run() ================================================ FILE: tasks/jobs_sogou.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: jobs_sogou.py @time: 2018-02-10 18:05 """ from tools.cookies import add_cookies from tools.anti_spider_sogou import auto_cookies as sogou_cookies from apps.client_rk import rk_counter_client, check_counter_limit, check_cookies_count def job_sogou_cookies(spider_name): """ sogou cookies :return: """ # 判断每天限制额度 if not check_counter_limit(): print('spider_name: %s, There is not enough available quantity' % spider_name) return False # 判断 cookie 队列长度 if not check_cookies_count(spider_name): print('spider_name: %s, The quantity of cookies is enough' % spider_name) return False sogou_cookies_obj = sogou_cookies() if not sogou_cookies_obj: return False add_cookies(spider_name, sogou_cookies_obj) rk_counter_client.increase(1) return True if __name__ == '__main__': job_sogou_cookies('weixin') ================================================ FILE: tasks/jobs_weixin.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: jobs_weixin.py @time: 2018-02-10 18:06 """ import json import time import sys from libs.redis_pub_sub import RedisPubSub from libs.redis_queue import RedisQueue from tools.anti_spider_weixin import auto_cookies as weixin_cookies from apps.client_db import redis_client from apps.client_rk import rk_counter_client, check_counter_limit def set_anti_spider_task(spider_name, msg): """ 设置任务队列 msg = { 'url': url, 'time': time.strftime("%Y-%m-%d %H:%M:%S") } :param spider_name: :param msg: :return: """ key = 'scrapy:anti_spider_task_weixin:%s' % spider_name q_task = RedisQueue(key, redis_client=redis_client) q_msg = json.dumps(msg) if isinstance(msg, dict) else msg # 因为微信反爬策略是通过IP限制, 这里仅仅处理一个任务 if q_task.empty(): q_task.put(q_msg) def _get_anti_spider_task(spider_name): """获取任务队列""" key = 'scrapy:anti_spider_task_weixin:%s' % spider_name q_task = RedisQueue(key, redis_client=redis_client) result = q_task.get(timeout=60) return json.loads(result) if result else {} def _set_anti_spider_result(spider_name, msg): """设置结果队列""" key = 'scrapy:anti_spider_result_weixin:%s' % spider_name q_result = RedisQueue(key, redis_client=redis_client) q_msg = json.dumps(msg) if isinstance(msg, dict) else msg q_result.put(q_msg) def _get_anti_spider_result(spider_name): """获取任务队列""" key = 'scrapy:anti_spider_result_weixin:%s' % spider_name q_result = RedisQueue(key, redis_client=redis_client) result = q_result.get(timeout=60) return json.loads(result) if result else {} def sub_anti_spider(spider_name): """ 蜘蛛订阅验证码处理结果 :param spider_name: :return: """ q = RedisPubSub('scrapy:anti_spider', redis_client=redis_client) r = q.sub_not_loop(spider_name) return json.loads(r) if r else {} def _pub_anti_spider(spider_name, msg): """ 将对应蜘蛛的验证码处理结果发布给对应订阅者 :param spider_name: :return: """ q = RedisPubSub('scrapy:anti_spider', 
redis_client=redis_client) msg = json.dumps(msg) if isinstance(msg, dict) else msg q.pub(spider_name, msg) def job_weixin_cookies(spider_name): """ weixin cookies :return: """ # 判断每天限制额度 if not check_counter_limit(): print('spider_name: %s, There is not enough available quantity' % spider_name) return False # 读取验证码任务队列(超时1分钟) task = _get_anti_spider_task(spider_name) if not task: return False # 设置验证码结果队列 url = task.get('url') msg = { 'url': url, 'status': False, 'time': time.strftime("%Y-%m-%d %H:%M:%S") } try: weixin_cookies_status = weixin_cookies(url) msg['status'] = weixin_cookies_status _set_anti_spider_result(spider_name, msg) # 读取验证码结果队列(超时1分钟) msg = _get_anti_spider_result(spider_name) _pub_anti_spider(spider_name, msg) rk_counter_client.increase(1) return True except Exception as e: print(e.message) _pub_anti_spider(spider_name, msg) def usage(): print('python tasks/jobs_weixin.py ') print('\tpython tasks/jobs_weixin.py job_weixin_cookies weixin') def run(): """ 启动入口 """ # print sys.argv try: if len(sys.argv) >= 3: fun_name = globals()[sys.argv[1]] fun_name(sys.argv[2]) else: usage() except NameError as e: print(e) if __name__ == '__main__': job_weixin_cookies('weixin') # run() # python tasks/jobs_weixin.py job_weixin_cookies weixin ================================================ FILE: tasks/run_job_counter_clear.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_counter_clear.py @time: 2018-05-02 10:24 """ import time import schedule from apps.client_rk import counter_clear as job_counter_clear from tools import catch_keyboard_interrupt # 计数清零 schedule.every().day.at('00:00').do(job_counter_clear) @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_put_tasks_toutiao.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_put_tasks_toutiao.py @time: 2018-05-02 10:23 """ import time import schedule from tasks.job_put_tasks import job_put_tasks from tools import catch_keyboard_interrupt # 分布式任务调度 - 头条 schedule.every(1).minutes.do(job_put_tasks, spider_name='toutiao') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_put_tasks_weibo.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_put_tasks_weibo.py @time: 2018-05-02 10:23 """ import time import schedule from tasks.job_put_tasks import job_put_tasks from tools import catch_keyboard_interrupt # 分布式任务调度 - 微博 schedule.every(5).minutes.do(job_put_tasks, spider_name='weibo') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_put_tasks_weixin.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_put_tasks_weixin.py @time: 2018-05-02 10:23 """ import time import schedule from tasks.job_put_tasks import job_put_tasks from tools import catch_keyboard_interrupt # 分布式任务调度 - 微信 schedule.every(5).minutes.do(job_put_tasks, spider_name='weixin') @catch_keyboard_interrupt 
def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_reboot_net_china_net.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_optical_modem_china_net.py @time: 2018-05-28 19:35 """ import time import schedule from tasks.job_reboot_net_china_net import job_reboot_net_china_net from tools import catch_keyboard_interrupt # 电信光猫重启 schedule.every(15).minutes.do(job_reboot_net_china_net) @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_sogou_cookies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_sogou_cookies.py @time: 2018-05-02 10:21 """ import time import schedule from tasks.jobs_sogou import job_sogou_cookies from tools import catch_keyboard_interrupt # sogou 反爬任务 schedule.every(5).minutes.do(job_sogou_cookies, spider_name='weixin') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_weixin_cookies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_weixin_cookies.py @time: 2018-05-02 10:22 """ import time import schedule from tasks.jobs_weixin import job_weixin_cookies from tools import catch_keyboard_interrupt # weixin 反爬任务 schedule.every(5).minutes.do(job_weixin_cookies, spider_name='weixin') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_jobs.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_jobs.py @time: 2018-04-18 11:10 """ import schedule import time from tools import catch_keyboard_interrupt from tasks import job_put_tasks from tasks.jobs_sogou import job_sogou_cookies from tasks.jobs_weixin import job_weixin_cookies from apps.client_rk import counter_clear as job_counter_clear # sogou 反爬任务 schedule.every(5).minutes.do(job_sogou_cookies, spider_name='weixin') # weixin 反爬任务 schedule.every(5).minutes.do(job_weixin_cookies, spider_name='weixin') # 分布式任务调度 - 微信 schedule.every(5).minutes.do(job_put_tasks, spider_name='weixin') # 分布式任务调度 - 微博 schedule.every(5).minutes.do(job_put_tasks, spider_name='weibo') # 分布式任务调度 - 头条 schedule.every(5).minutes.do(job_put_tasks, spider_name='toutiao') # 计数清零 schedule.every().day.at('00:00').do(job_counter_clear) @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_jobs_apscheduler.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_jobs_apscheduler.py @time: 2018-02-10 18:01 """ # Deprecated from apscheduler.schedulers.blocking import BlockingScheduler from config import current_config from tasks import job_put_tasks from tasks.jobs_sogou import job_sogou_cookies from tasks.jobs_weixin import job_weixin_cookies from apps.client_rk import 
counter_clear as job_counter_clear REDIS = current_config.REDIS scheduler = BlockingScheduler() job_store_redis_alias = 'news_spider' def add_job_store_redis(): """ 127.0.0.1:6379> TYPE "example.jobs" hash 127.0.0.1:6379> TYPE "example.run_times" zset 127.0.0.1:6379> HGETALL "example.jobs" 1) "45431465e6104f3c924ec01852ed1aeb" 2) "\x80\x02}q\x01(U\x04argsq\x02)U\bexecutorq\x03U\adefaultq\x04U\rmax_instancesq\x05K\x01U\x04funcq\x06U\x10__main__:task_03q\aU\x02idq\bU 45431465e6104f3c924ec01852ed1aebq\tU\rnext_run_timeq\ncdatetime\ndatetime\nq\x0bU\n\a\xe1\x0c\b\x02\x01\x00\x00\x00\x00cpytz\n_p\nq\x0c(U\rAsia/Shanghaiq\rM\x80pK\x00U\x03CSTq\x0etRq\x0f\x86Rq\x10U\x04nameq\x11U\atask_03q\x12U\x12misfire_grace_timeq\x13K\x01U\atriggerq\x14capscheduler.triggers.cron\nCronTrigger\nq\x15)\x81q\x16}q\x17(U\btimezoneq\x18h\x0c(h\rM\xe8qK\x00U\x03LMTq\x19tRq\x1aU\aversionq\x1bK\x01U\nstart_dateq\x1cNU\bend_dateq\x1dNU\x06fieldsq\x1e]q\x1f(capscheduler.triggers.cron.fields\nBaseField\nq )\x81q!}q\"(U\nis_defaultq#\x88U\x0bexpressionsq$]q%capscheduler.triggers.cron.expressions\nAllExpression\nq&)\x81q'}q(U\x04stepq)Nsbah\x11U\x04yearq*ubh )\x81q+}q,(h#\x88h$]q-h&)\x81q.}q/h)Nsbah\x11U\x05monthq0ubcapscheduler.triggers.cron.fields\nDayOfMonthField\nq1)\x81q2}q3(h#\x88h$]q4h&)\x81q5}q6h)Nsbah\x11U\x03dayq7ubcapscheduler.triggers.cron.fields\nWeekField\nq8)\x81q9}q:(h#\x88h$]q;h&)\x81q<}q=h)Nsbah\x11U\x04weekq>ubcapscheduler.triggers.cron.fields\nDayOfWeekField\nq?)\x81q@}qA(h#\x88h$]qBh&)\x81qC}qDh)Nsbah\x11U\x0bday_of_weekqEubh )\x81qF}qG(h#\x89h$]qHcapscheduler.triggers.cron.expressions\nRangeExpression\nqI)\x81qJ}qK(h)NU\x04lastqLK\x16U\x05firstqMK\x00ubah\x11U\x04hourqNubh )\x81qO}qP(h#\x89h$]qQhI)\x81qR}qS(h)NhLK\x01hMK\x01ubah\x11U\x06minuteqTubh )\x81qU}qV(h#\x88h$]qWhI)\x81qX}qY(h)NhLK\x00hMK\x00ubah\x11U\x06secondqZubeubU\bcoalesceq[\x88h\x1bK\x01U\x06kwargsq\\}q]u." 3) "f5637d98946848c291da09a4ceb08027" 4) "\x80\x02}q\x01(U\x04argsq\x02)U\bexecutorq\x03U\adefaultq\x04U\rmax_instancesq\x05K\x01U\x04funcq\x06U\x10__main__:task_04q\aU\x02idq\bU f5637d98946848c291da09a4ceb08027q\tU\rnext_run_timeq\ncdatetime\ndatetime\nq\x0bU\n\a\xe1\x0c\b\x012\x00\x00\x00\x00cpytz\n_p\nq\x0c(U\rAsia/Shanghaiq\rM\x80pK\x00U\x03CSTq\x0etRq\x0f\x86Rq\x10U\x04nameq\x11U\atask_04q\x12U\x12misfire_grace_timeq\x13K\x01U\atriggerq\x14capscheduler.triggers.cron\nCronTrigger\nq\x15)\x81q\x16}q\x17(U\btimezoneq\x18h\x0c(h\rM\xe8qK\x00U\x03LMTq\x19tRq\x1aU\aversionq\x1bK\x01U\nstart_dateq\x1cNU\bend_dateq\x1dNU\x06fieldsq\x1e]q\x1f(capscheduler.triggers.cron.fields\nBaseField\nq )\x81q!}q\"(U\nis_defaultq#\x88U\x0bexpressionsq$]q%capscheduler.triggers.cron.expressions\nAllExpression\nq&)\x81q'}q(U\x04stepq)Nsbah\x11U\x04yearubh )\x81q*}q+(h#\x88h$]q,h&)\x81q-}q.h)Nsbah\x11U\x05monthubcapscheduler.triggers.cron.fields\nDayOfMonthField\nq/)\x81q0}q1(h#\x88h$]q2h&)\x81q3}q4h)Nsbah\x11U\x03dayubcapscheduler.triggers.cron.fields\nWeekField\nq5)\x81q6}q7(h#\x88h$]q8h&)\x81q9}q:h)Nsbah\x11U\x04weekubcapscheduler.triggers.cron.fields\nDayOfWeekField\nq;)\x81q<}q=(h#\x88h$]q>h&)\x81q?}q@h)Nsbah\x11U\x0bday_of_weekubh )\x81qA}qB(h#\x89h$]qCcapscheduler.triggers.cron.expressions\nRangeExpression\nqD)\x81qE}qF(h)NU\x04lastqGK\x16U\x05firstqHK\x00ubah\x11U\x04hourubh )\x81qI}qJ(h#\x89h$]qKh&)\x81qL}qMh)K\x01sbah\x11U\x06minuteubh )\x81qN}qO(h#\x88h$]qPhD)\x81qQ}qR(h)NhGK\x00hHK\x00ubah\x11U\x06secondubeubU\bcoalesceqS\x88h\x1bK\x01U\x06kwargsqT}qUu." 
5) "ba044f7b253a4cb1961e7abf036f8ef7" 6) "\x80\x02}q\x01(U\x04argsq\x02)U\bexecutorq\x03U\adefaultq\x04U\rmax_instancesq\x05K\x01U\x04funcq\x06U\x10__main__:task_02q\aU\x02idq\bU ba044f7b253a4cb1961e7abf036f8ef7q\tU\rnext_run_timeq\ncdatetime\ndatetime\nq\x0bU\n\a\xe1\x0c\b\x012\r\x0f5\xf9cpytz\n_p\nq\x0c(U\rAsia/Shanghaiq\rM\x80pK\x00U\x03CSTq\x0etRq\x0f\x86Rq\x10U\x04nameq\x11U\atask_02q\x12U\x12misfire_grace_timeq\x13K\x01U\atriggerq\x14capscheduler.triggers.interval\nIntervalTrigger\nq\x15)\x81q\x16}q\x17(U\btimezoneq\x18h\x0c(h\rM\xe8qK\x00U\x03LMTq\x19tRq\x1aU\aversionq\x1bK\x01U\nstart_dateq\x1ch\x0bU\n\a\xe1\x0c\b\x01.\r\x0f5\xf9h\x0f\x86Rq\x1dU\bend_dateq\x1eNU\bintervalq\x1fcdatetime\ntimedelta\nq K\x00K ZCARD "example.run_times" (integer) 3 127.0.0.1:6379> ZRANGE "example.run_times" 0 2 WITHSCORES 1) "f5637d98946848c291da09a4ceb08027" 2) "1512669060" 3) "ba044f7b253a4cb1961e7abf036f8ef7" 4) "1512669073.9968569" 5) "45431465e6104f3c924ec01852ed1aeb" 6) "1512669660" # 清理数据 127.0.0.1:6379> DEL example.jobs (integer) 1 127.0.0.1:6379> DEL example.run_times (integer) 1 :return: """ scheduler.add_jobstore( 'redis', alias=job_store_redis_alias, jobs_key='news_spider.jobs', run_times_key='news_spider.run_times', **REDIS ) def add_job(): # sogou 反爬任务 scheduler.add_job( job_sogou_cookies, 'interval', kwargs={'spider_name': 'weixin'}, minutes=5, id='job_sogou_cookies', replace_existing=True ) # weixin 反爬任务 scheduler.add_job( job_weixin_cookies, 'interval', kwargs={'spider_name': 'weixin'}, minutes=2, id='job_weixin_cookies', replace_existing=True ) # 分布式任务调度 - 微信 scheduler.add_job( job_put_tasks, 'interval', kwargs={'spider_name': 'weixin'}, minutes=5, id='job_put_tasks_weixin', replace_existing=True ) # 分布式任务调度 - 微博 scheduler.add_job( job_put_tasks, 'interval', kwargs={'spider_name': 'weibo'}, minutes=5, id='job_put_tasks_weibo', replace_existing=True ) # 分布式任务调度 - 头条 scheduler.add_job( job_put_tasks, 'interval', kwargs={'spider_name': 'toutiao'}, minutes=5, id='job_put_tasks_toutiao', replace_existing=True ) # 计数清零 scheduler.add_job( job_counter_clear, 'cron', day='*', hour='0', id='job_counter_clear', replace_existing=True ) def run_blocking(): try: # add_job_store_redis() # 后端存储 基于redis(可选) add_job() # 添加任务 scheduler.start() # 开启调度 except (KeyboardInterrupt, SystemExit): scheduler.shutdown() # 关闭调度 if __name__ == '__main__': run_blocking() ================================================ FILE: tests/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:39 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: tests/test_date_time.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: test_date_time.py @time: 2018-06-25 17:55 """ from __future__ import unicode_literals import unittest import time import datetime from tools.date_time import time_local_to_utc, time_utc_to_local class DateTimeTest(unittest.TestCase): """ 日期时间测试 """ def setUp(self): """ 获取系统时区, 设定一对本地时间和国际时间 1、断言转换后的时差是否正确 2、断言转换后的时间是否正确 :return: """ self.time_offset = time.timezone self.local_time = '2018-06-06 18:12:26' local_time_obj = datetime.datetime.strptime(self.local_time, '%Y-%m-%d %H:%M:%S') self.utc_time = (local_time_obj + datetime.timedelta(hours=self.time_offset/60/60)).strftime('%Y-%m-%d %H:%M:%S') def 
test_local_to_utc(self): """ 测试 :return: """ local_time_obj = datetime.datetime.strptime(self.local_time, '%Y-%m-%d %H:%M:%S') utc_time_obj = time_local_to_utc(self.local_time) self.assertEqual(utc_time_obj, local_time_obj + datetime.timedelta(seconds=self.time_offset)) self.assertEqual(self.utc_time, utc_time_obj.strftime('%Y-%m-%d %H:%M:%S')) def test_utc_to_local(self): """ 测试 :return: """ utc_time_obj = datetime.datetime.strptime(self.utc_time, '%Y-%m-%d %H:%M:%S') local_time_obj = time_utc_to_local(self.utc_time) self.assertEqual(utc_time_obj, local_time_obj + datetime.timedelta(seconds=self.time_offset)) self.assertEqual(self.local_time, local_time_obj.strftime('%Y-%m-%d %H:%M:%S')) def tearDown(self): pass if __name__ == '__main__': unittest.main() ================================================ FILE: tests/test_finger.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: test_finger.py @time: 2018-02-11 00:06 """ from __future__ import unicode_literals import hashlib import unittest from scrapy.http import Request from scrapy.utils import request class FingerTest(unittest.TestCase): """ 指纹测试 """ def setUp(self): self.url_01 = 'https://www.baidu.com/s?wd=openstack&rsv_spt=1' self.url_02 = 'https://www.baidu.com/s?rsv_spt=1&wd=openstack' def test_request(self): """ 测试请求 :return: """ req_01 = Request(url=self.url_01) result_01 = request.request_fingerprint(req_01) req_02 = Request(url=self.url_02) result_02 = request.request_fingerprint(req_02) self.assertEqual(result_01, result_02) def tearDown(self): pass class MD5Test(unittest.TestCase): """ md5测试 """ def setUp(self): self.url_01 = 'https://www.baidu.com/s?wd=openstack&rsv_spt=1' self.url_02 = 'https://www.baidu.com/s?rsv_spt=1&wd=openstack' def test_request(self): """ 测试请求 :return: """ m1 = hashlib.md5() m1.update(self.url_01.encode('utf-8')) result_01 = m1.hexdigest() m2 = hashlib.md5() m2.update(self.url_02.encode('utf-8')) result_02 = m2.hexdigest() self.assertNotEqual(result_01, result_02) def tearDown(self): pass if __name__ == '__main__': unittest.main() ================================================ FILE: tools/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ from functools import wraps def catch_keyboard_interrupt(func): @wraps(func) def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except KeyboardInterrupt: print('\n强制退出') return wrapper ================================================ FILE: tools/anti_spider_sogou.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: anti_spider_sogou.py @time: 2018-02-10 17:24 """ from __future__ import print_function from __future__ import unicode_literals from future.builtins import input # PY2(raw_input) import random import time import json import requests from apps.client_rk import get_img_code, img_report_error from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT cookies = {} s = requests.session() headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Connection': 'keep-alive', # 'Host': 'weixin.sogou.com', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel 
Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' } def _get_tc(): tc = str('%13d' % (time.time() * 1000)) return tc def _save_img(res): # 保存验证码图片 img_name = 'sogou_%s.jpg' % _get_tc() print('图片名称: %s' % img_name) img_content = res.content with open(img_name, b'w') as f: f.write(img_content) time.sleep(1) def anti_spider(): url = 'http://weixin.sogou.com/antispider/?from=/weixin?type=2&query=chuangbiandao' request_headers = headers.copy() request_headers['Host'] = 'weixin.sogou.com' request_cookie = { 'refresh': '1' } res = s.get(url, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') # print cookies def code_img_save(): url = 'http://weixin.sogou.com/antispider/util/seccode.php' request_headers = headers.copy() request_headers['Host'] = 'weixin.sogou.com' request_cookie = cookies.copy() params = { 'tc': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) # 保存图片 _save_img(res) cookies.update(res.cookies) print('.', end='') # print cookies def code_img_obj(): url = 'http://weixin.sogou.com/antispider/util/seccode.php' request_headers = headers.copy() request_headers['Host'] = 'weixin.sogou.com' request_cookie = cookies.copy() params = { 'tc': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) print('.', end='') return res.content def pv_refresh(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'refresh', 'domain': 'weixin', 'suv': '', 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_index(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'index', 'domain': 'weixin', 'suv': '', 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_img_cost(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'imgCost', 'domain': 'weixin', 'suv': '', 'snuid': '', 't': _get_tc(), 'cost': '27', } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_mouse(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'mouse', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_img_success(): url = 
'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'imgSuccess', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_real_index(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'realIndex', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_seccode_focus(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'seccodeFocus', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_seccode_input(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'seccodeInput', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_seccode_blur(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'seccodeBlur', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def thank(code_anti_spider): url = 'http://weixin.sogou.com/antispider/thank.php' request_headers = headers.copy() request_headers['X-Requested-With'] = 'XMLHttpRequest' request_cookie = { 'ABTEST': cookies['ABTEST'], 'IPLOC': cookies['IPLOC'], 'SUID': cookies['SUID'], 'PHPSESSID': cookies['PHPSESSID'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } data = { 'c': code_anti_spider, 'r': '%2Fweixin%3Ftype%3D2', 'v': '5', } res = s.post(url, data=data, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) # print cookies json_msg = json.loads(res.content) print(json_msg) return json_msg # {"code": 0,"msg": "解封成功,正在为您跳转来源地址...", "id": "ECB542781D1B4105B09FB4461E0587D4"} # {"code": 2,"msg": "未知访问来源"} # {"code": 3,"msg": "验证码输入错误, 请重新输入!"} def _get_cookies(): print(cookies) return cookies def check_n(): 
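    # Editorial comment (added): check_n() issues the Sogou Weixin search with a bare
    # requests.get (no anti-spider cookies), presumably to observe the blocked response,
    # while check_y() below repeats it on the shared session with the solved cookies to
    # confirm the block has been lifted.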
url = 'http://weixin.sogou.com/weixin?query=chuangbiandao&type=1' res = requests.get(url, headers=headers, timeout=REQUESTS_TIME_OUT) print(res.content) def check_y(): url = 'http://weixin.sogou.com/weixin?query=chuangbiandao&type=1' res = s.get(url, headers=headers, cookies=cookies, timeout=REQUESTS_TIME_OUT) print(res.content) def manual_cookies(): """ 获取 cookies - 手动填验证码 :return: """ anti_spider() code_img_save() # 模拟用户行为 pv_refresh() pv_index() pv_img_cost() # 模拟鼠标滑过 pv_mouse() pv_img_success() pv_real_index() # 模拟表单输入 pv_seccode_focus() pv_seccode_input() pv_seccode_blur() input_code = input('code << ') thank(input_code) return _get_cookies() def auto_cookies(): """ 获取 cookies - 第三方识别验证码 :return: """ anti_spider() im = code_img_obj() # 6位英数混合 白天:15快豆 夜间:18.75快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=3060) if not img_id: return None print(img_id, img_code) # 模拟用户行为 pv_refresh() pv_index() pv_img_cost() # 模拟鼠标滑过 pv_mouse() pv_img_success() pv_real_index() # 模拟表单输入 pv_seccode_focus() pv_seccode_input() pv_seccode_blur() # 重试3次 c = 3 while c > 0: c -= 1 res = thank(img_code) if res.get('code') == 0: # 识别成功 cookies['SNUID'] = res.get('id', '') break elif res.get('code') == 3: # 报告错误识别 img_report_error(img_id) # 出错随机等待后重试 time.sleep(random.randint(1, 5)) # 换张图片再来一次 im = code_img_obj() # 6位英数混合 白天:15快豆 夜间:18.75快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=3060) print(img_id, img_code) else: print('Error') print(res) return None return _get_cookies() if c > 0 else None if __name__ == '__main__': # manual_cookies() auto_cookies() # check_n() # check_y() ================================================ FILE: tools/anti_spider_weixin.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: anti_spider_weixin.py @time: 2018-02-10 17:24 """ from __future__ import print_function from __future__ import unicode_literals from future.builtins import input # PY2(raw_input) import random import time import json from lxml.html import fromstring import requests from apps.client_rk import get_img_code, img_report_error from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT cookies = {} s = requests.session() headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' } def _get_tc(): tc = str('%13d' % (time.time() * 1000)) return tc def _save_img(res): # 保存验证码图片 img_name = 'weixin_%s.jpg' % _get_tc() print('图片名称: %s' % img_name) img_content = res.content with open(img_name, b'w') as f: f.write(img_content) time.sleep(1) def anti_spider(url): # url = 'https://mp.weixin.qq.com/profile?src=3×tamp=1512923946&ver=1&signature=RZh61VIthXnp4HUsow1pgQXJbGxi*v-n4Pr1W6e5PVkmJSbRknd6LMT-EFoQqX4gaM6uGyHREmDPsN6lXkeYfg==' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' res = s.get(url, headers=request_headers, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') doc = fromstring(res.text) title = u''.join(i.strip() for i in doc.xpath('//title/text()')) print(title) return title == '请输入验证码' def code_img_save(): url = 'https://mp.weixin.qq.com/mp/verifycode' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' 
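    # Editorial comment (added): both anti_spider modules follow the same captcha protocol:
    # fetch the code image (code_img_obj), send it to the third-party recogniser via
    # get_img_code(), submit the answer (thank() for Sogou, verify_code() here), and on a
    # wrong answer report it with img_report_error() and retry with a fresh image, giving
    # up after three attempts.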
request_cookie = cookies.copy() params = { 'cert': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) # 保存图片 _save_img(res) cookies.update(res.cookies) print('.', end='') # print cookies def code_img_obj(): url = 'https://mp.weixin.qq.com/mp/verifycode' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' request_cookie = cookies.copy() params = { 'cert': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) print('.', end='') return res.content def verify_code(input_code): url = 'https://mp.weixin.qq.com/mp/verifycode' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' request_headers['X-Requested-With'] = 'XMLHttpRequest' request_cookie = cookies.copy() data = { 'cert': _get_tc(), 'input': input_code, 'appmsg_token': '', } res = s.post(url, data=data, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) # print cookies json_msg = json.loads(res.content) print(json_msg) return json_msg # {u'cookie_count': 0, u'errmsg': u'', u'ret': 0} # {u'cookie_count': 0, u'errmsg': u'', u'ret': 501} 验证码有误 def _get_cookies(): print(cookies) return cookies def manual_cookies(): url = input('url << ') anti_spider(url) code_img_save() input_code = input('code << ') verify_code(input_code) return _get_cookies() def auto_cookies(url): need_status = anti_spider(url) if not need_status: return True im = code_img_obj() # 4位纯英文字母 白天:10快豆 夜间:12.5快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=2040) print(img_id, img_code) # 重试3次 c = 3 while c > 0: c -= 1 res = verify_code(img_code) if res.get('ret') == 0: # 识别成功 break elif res.get('ret') == 501: # 报告错误识别 img_report_error(img_id) # 出错随机等待后重试 time.sleep(random.randint(1, 5)) # 换张图片再来一次 im = code_img_obj() # 4位纯英文字母 白天:10快豆 夜间:12.5快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=2040) print(img_id, img_code) else: print('Error') print(res) return False return True if c > 0 else False if __name__ == '__main__': # manual_cookies() anti_spider_url = 'http://mp.weixin.qq.com/profile?src=3×tamp=1513650933&ver=1&signature=zzgwSdnYIm68Nu5eFz1X8-Heqjojhy4ozHmg4cUz*hEo*QuXma9-qkMrOFxzOGDfzJHHfyechg0AVCFPpsXpuA==' print(auto_cookies(anti_spider_url)) ================================================ FILE: tools/char.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: char.py @time: 2018-02-10 17:48 """ import execjs # from HTMLParser import HTMLParser # PY2 # from html.parser import HTMLParser # PY3 from future.moves.html.parser import HTMLParser html_parser = HTMLParser() def un_escape(char_str): """ 反转译 :param char_str: :return: """ return html_parser.unescape(char_str) def get_js_36_str(i): """ 整数、浮点数 js方式转36进制 :param i: :return: """ js_body = ''' function get_36_str(i) { return i.toString(36); }; ''' ctx = execjs.compile(js_body) return ctx.call("get_36_str", i) if __name__ == '__main__': a = '加入到"我的书目选单"中' b = '\xe5\xbd\x93\xe5\x89\x8d\xe5\xb7\xb2\xe8\xbe\xbe\xe5\x88\xb0\xe6\x8a\x93\xe5\x8f\x96\xe9\x85\x8d\xe7\xbd\xae\xe7\x9a\x84\xe6\x9c\x80\xe5\xa4\xa7\xe9\xa1\xb5\xe7\xa0\x81' c = 'https://mp.weixin.qq.com/s?timestamp=1511432702&src=3&ver=1&signature=lAC8MtonFiHnlc5-j4z48WcPRpfP1Nn4zxCmY4ZjCjdXQscLcB5uyi5Jb395m5yaZQHTqqSlqzy*HRR0nAPZHsz0*Efu3w*Y2B8XbIL5v8pZQsGt9cwZQTuvI0GZqAsZobqzaeDptAQzHLB4QKL-qExOz0ANOTG*QAvJ7-ZurMg=' d = 
'http://mp.weixin.qq.com/mp/homepage?__biz=MzAxNzU2Mjc4NQ==&hid=2&sn=8177890cc7e468d3df6f3050d49951c5#wechat_redirect' print(un_escape(a)) print(un_escape(b)) print(un_escape(c)) print(un_escape(d)) ================================================ FILE: tools/cookies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: cookies.py @time: 2018-02-10 17:49 """ from __future__ import print_function from __future__ import unicode_literals import json import hashlib from apps.client_db import redis_client def _get_cookies_str(cookies_dict): """ In [1]: import json In [2]: sd = {'c':1, 'b':2, 'a':3} In [3]: sd Out[3]: {'a': 3, 'b': 2, 'c': 1} In [4]: items = sd.items() In [5]: items Out[5]: [('a', 3), ('c', 1), ('b', 2)] In [6]: sorted(items) Out[6]: [('a', 3), ('b', 2), ('c', 1)] In [7]: sorted(items, reverse=True) Out[7]: [('c', 1), ('b', 2), ('a', 3)] In [8]: json.dumps(sorted(items)) Out[8]: '[["a", 3], ["b", 2], ["c", 1]]' In [9]: json.loads(json.dumps(sorted(items))) Out[9]: [[u'a', 3], [u'b', 2], [u'c', 1]] In [10]: dict(json.loads(json.dumps(sorted(items)))) Out[10]: {u'a': 3, u'b': 2, u'c': 1} :param cookies_dict: :return: """ cookies_str = json.dumps(sorted(cookies_dict.items())) return cookies_str def _get_finger(cookies_str): """ :param cookies_str: :return: """ m = hashlib.md5() m.update(cookies_str.encode('utf-8') if isinstance(cookies_str, unicode) else cookies_str) finger = m.hexdigest() return finger def get_cookies(spider_name): """ 获取 cookies 兼容 redis 没有 cookies 池的情况 :param spider_name: :return: """ key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} cookies_id = redis_client.srandmember(key_set) key_id = 'scrapy:cookies_id:%(cookies_id)s' % {'cookies_id': cookies_id} cookies_str = redis_client.get(key_id) cookies_obj = dict(json.loads(cookies_str or '[]')) return cookies_id, cookies_obj def add_cookies(spider_name, cookies_obj): """ 添加 cookies :param spider_name: :param cookies_obj: :return: """ cookies_str = _get_cookies_str(cookies_obj) cookies_id = _get_finger(cookies_str) key_id = 'scrapy:cookies_id:%(cookies_id)s' % {'cookies_id': cookies_id} key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} if redis_client.sismember(key_set, cookies_id): return False redis_client.set(key_id, cookies_str) redis_client.sadd(key_set, cookies_id) return True def del_cookies(spider_name, cookies_id): """ 删除 cookies :param spider_name: :param cookies_id: :return: """ key_id = 'scrapy:cookies_id:%(cookies_id)s' % {'cookies_id': cookies_id} key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} redis_client.delete(key_id) redis_client.srem(key_set, cookies_id) def len_cookies(spider_name): """ 获取 cookies 长度 :param spider_name: :return: """ key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} cookies_len = redis_client.scard(key_set) return cookies_len """ 集合 key: cookies_id 字符串 cookies_id_key: cookies_obj """ ================================================ FILE: tools/date_time.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: date_time.py @time: 2018-06-25 16:44 """ from __future__ import unicode_literals import six import time import calendar from datetime import datetime, timedelta, date def get_tc(): """ 获取13位字符串时间戳 :return: """ tc = str('%13d' % (time.time() * 1000)) return tc def get_current_day_time_ends(): """ 
获取当天开始结束时刻 :return: """ today = datetime.today() start_time = datetime(today.year, today.month, today.day, 0, 0, 0) end_time = datetime(today.year, today.month, today.day, 23, 59, 59) return start_time, end_time def get_current_month_time_ends(): """ 获取当月开始结束时刻 :return: """ today = datetime.today() _, days = calendar.monthrange(today.year, today.month) start_time = datetime(today.year, today.month, 1, 0, 0, 0) end_time = datetime(today.year, today.month, days, 23, 59, 59) return start_time, end_time def get_current_year_time_ends(): """ 获取当年开始结束时刻 :return: """ today = datetime.today() start_time = datetime(today.year, 1, 1, 0, 0, 0) end_time = datetime(today.year, 12, 31, 23, 59, 59) return start_time, end_time def get_hours(zerofill=True): """ 列出1天所有24小时 :return: """ if zerofill: return ['%02d' % i for i in range(24)] else: return range(24) def get_days(year=1970, month=1, zerofill=True): """ 列出当月的所有日期 :param year: :param month: :param zerofill: :return: """ year = int(year) month = int(month) _, days = calendar.monthrange(year, month) if zerofill: return ['%02d' % i for i in range(1, days+1)] else: return range(1, days+1) def get_weeks(): """ 列出所有星期 :return: """ return ['周一', '周二', '周三', '周四', '周五', '周六', '周日'] def get_months(zerofill=True): """ 列出1年所有12月份 :return: """ if zerofill: return ['%02d' % i for i in range(1, 13)] else: return [i for i in range(1, 13)] def time_local_to_utc(local_time): """ 本地时间转UTC时间 :param local_time: :return: """ # 字符串处理 if isinstance(local_time, six.string_types) and len(local_time) == 10: local_time = datetime.strptime(local_time, '%Y-%m-%d') elif isinstance(local_time, six.string_types) and len(local_time) >= 19: local_time = datetime.strptime(local_time[:19], '%Y-%m-%d %H:%M:%S') elif not (isinstance(local_time, datetime) or isinstance(local_time, date)): local_time = datetime.now() # 时间转换 utc_time = local_time + timedelta(seconds=time.timezone) return utc_time def time_utc_to_local(utc_time): """ UTC时间转本地时间 :param utc_time: :return: """ # 字符串处理 if isinstance(utc_time, six.string_types) and len(utc_time) == 10: utc_time = datetime.strptime(utc_time, '%Y-%m-%d') elif isinstance(utc_time, six.string_types) and len(utc_time) >= 19: utc_time = datetime.strptime(utc_time[:19], '%Y-%m-%d %H:%M:%S') elif not (isinstance(utc_time, datetime) or isinstance(utc_time, date)): utc_time = datetime.utcnow() # 时间转换 local_time = utc_time - timedelta(seconds=time.timezone) return local_time if __name__ == '__main__': print(get_current_day_time_ends()) print(get_current_month_time_ends()) print(get_current_year_time_ends()) print(get_hours(zerofill=False)) print(get_hours(zerofill=True)) print(get_days(zerofill=False)) print(get_days(zerofill=True)) print(get_months(zerofill=False)) print(get_months(zerofill=True)) ================================================ FILE: tools/duplicate.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: duplicate.py @time: 2018-02-10 17:39 """ from __future__ import print_function from __future__ import unicode_literals from apps.client_db import redis_client from tools.url import get_request_finger def is_dup_detail(detail_url, spider_name, channel_id=0): """ 检查详细页是否重复 :param detail_url: :param spider_name: :param channel_id: :return: """ detail_dup_key = 'scrapy:dup:%s:%s' % (spider_name, channel_id) detail_url_finger = get_request_finger(detail_url) return redis_client.sismember(detail_dup_key, detail_url_finger) def add_dup_detail(detail_url, 
spider_name, channel_id=0): """ 把当前详细页加入集合 :param detail_url: :param spider_name: :param channel_id: :return: """ detail_dup_key = 'scrapy:dup:%s:%s' % (spider_name, channel_id) detail_url_finger = get_request_finger(detail_url) return redis_client.sadd(detail_dup_key, detail_url_finger) ================================================ FILE: tools/gen.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: gen.py @time: 2018-02-10 17:19 """ from __future__ import print_function from __future__ import unicode_literals import os import sys from sqlalchemy.ext.declarative.api import DeclarativeMeta from sqlalchemy.inspection import inspect from config import current_config BASE_DIR = current_config.BASE_DIR SQLALCHEMY_DATABASE_URI = current_config.SQLALCHEMY_DATABASE_URI_MYSQL def gen_models(): """ 创建 models $ python gen.py gen_models """ file_path = os.path.join(BASE_DIR, 'models/news.py') cmd = 'sqlacodegen %s --noinflect --outfile %s' % (SQLALCHEMY_DATABASE_URI, file_path) output = os.popen(cmd) result = output.read() print(result) # 更新 model 文件 with open(file_path, b'r') as f: lines = f.readlines() # 新增 model 转 dict 方法 with open(file_path, b'w') as f: lines.insert(9, b'def to_dict(self):\n') lines.insert(10, b' return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}\n') lines.insert(11, b'\n') lines.insert(12, b'Base.to_dict = to_dict\n') lines.insert(13, b'\n\n') f.write(b''.join(lines)) def gen_items(): """ 创建 items $ python gen.py gen_items 字段规则: 去除自增主键,非自增是需要的。 """ from models import news file_path = os.path.join(BASE_DIR, 'news/items.py') model_list = [(k, v) for k, v in news.__dict__.items() if isinstance(v, DeclarativeMeta) and k != 'Base'] with open(file_path, b'w') as f: f.write(b'# -*- coding: utf-8 -*-\n\n') f.write(b'# Define here the models for your scraped items\n#\n') f.write(b'# See documentation in:\n') f.write(b'# http://doc.scrapy.org/en/latest/topics/items.html\n\n') f.write(b'import scrapy\n') for model_name, model_class in model_list: result = model_class().to_dict() table_name = model_class().__tablename__ model_pk = inspect(model_class).primary_key[0].name f.write(b'\n\nclass %sItem(scrapy.Item):\n' % model_name) f.write(b' """\n') f.write(b' table_name: %s\n' % table_name) f.write(b' primary_key: %s\n' % model_pk) f.write(b' """\n') for field_name in list(result.keys()): if field_name in [model_pk, 'create_time', 'update_time']: continue f.write(b' %s = scrapy.Field()\n' % field_name) def run(): """ 入口 """ # print sys.argv try: if len(sys.argv) > 1: fun_name = globals()[sys.argv[1]] fun_name() else: print('缺失参数\n') usage() except NameError as e: print(e) print('未定义的方法[%s]' % sys.argv[1]) def usage(): print(""" 创建 models $ python gen.py gen_models 创建 items $ python gen.py gen_items """) if __name__ == '__main__': run() # print BASE_DIR # print SQLALCHEMY_DATABASE_URI ================================================ FILE: tools/img.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: img.py @time: 2018-03-20 14:24 """ from __future__ import print_function from __future__ import unicode_literals import imghdr import requests from PIL import Image from six import BytesIO from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT def filter_img_size(min_width=0, min_height=0, *img_url): """ 过滤尺寸不符要求的图片 :param min_width: :param min_height: :param img_url: 
:return: """ result = [] for i in img_url: try: img_res = requests.get(i, stream=True, timeout=REQUESTS_TIME_OUT) if img_res.status_code == 200: orig_image = Image.open(BytesIO(img_res.content)) img_width, img_height = orig_image.size if img_width >= min_width and img_height >= min_height: result.append(i) except Exception as e: print('check images error: %s' % img_url) print(e.message) continue return result def filter_local_img_type(ignore_type='gif', *img_path): """ 过滤指定类型本地图片 :param ignore_type: :param img_path: :return: """ result = [] for i in img_path: img_type = imghdr.what(i) # print(img_type, i) if img_type == ignore_type: continue result.append(i) return result def filter_remote_img_type(ignore_type='gif', *img_url): """ 过滤指定类型远程图片 :param ignore_type: :param img_url: :return: """ result = [] for i in img_url: img_type = imghdr.what(None, requests.get(i).content) # print(img_type, i) if img_type == ignore_type: continue result.append(i) return result if __name__ == '__main__': pass ================================================ FILE: tools/import_task.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: import_csv.py @time: 2018-05-17 18:46 """ from __future__ import print_function from __future__ import unicode_literals import sys import csv import json from apps.client_db import add_item from models.news import FetchTask def read_csv(filename): """ 读取csv :param filename: :return: """ count = 0 with open(filename) as f: reader = csv.DictReader(f) for line in reader: print(json.dumps(line, indent=4, ensure_ascii=False)) count += 1 yield line print('读取数量: %s' % count) def import_csv(filename): """ 导入csv :param filename: :return: """ count = 0 for item in read_csv(filename): result = add_item(FetchTask, item) print(result) count += 1 print('导入数量: %s' % count) def usage(): print(''' 导入 csv 注意 csv 格式, 表头与数据库任务表的字段对应(去掉主键) $ python tools/import_task.py example.csv ''') def run(): """ 入口 """ # print sys.argv try: if len(sys.argv) < 2: raise Exception('缺失参数\n') import_csv(sys.argv[1]) except Exception as e: print('导入异常') print(e) usage() if __name__ == '__main__': run() ================================================ FILE: tools/net_status.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: net_status.py @time: 2018-05-28 20:45 """ import time from apps.client_db import redis_client def get_reboot_net_status(net_name='optical_modem_china_net'): key_reboot_net = 'scrapy:reboot_net:%s' % net_name reboot_net_status = redis_client.get(key_reboot_net) return reboot_net_status def set_reboot_net_status(net_name='optical_modem_china_net'): key_reboot_net = 'scrapy:reboot_net:%s' % net_name reboot_net_status = time.strftime('%Y-%m-%d %H:%M:%S') redis_client.set(key_reboot_net, reboot_net_status) def del_reboot_net_status(net_name='optical_modem_china_net'): key_reboot_net = 'scrapy:reboot_net:%s' % net_name redis_client.delete(key_reboot_net) ================================================ FILE: tools/proxies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: proxies.py @time: 2018-03-13 16:37 """ import json import requests from apps.client_db import redis_client from tools.url import get_update_url from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT def add_proxy(spider_name, *proxy): key_set = 
'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.sadd(key_set, *proxy) def del_proxy(spider_name, proxy): key_set = 'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.srem(key_set, proxy) def get_proxy(spider_name): key_set = 'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.srandmember(key_set) def len_proxy(spider_name): key_set = 'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.scard(key_set) def fetch_proxy(country='China', scheme='http'): """ 获取代理 :param country: :param scheme: :return: """ data = {} if country: data['country'] = country if scheme: data['type'] = scheme url = 'http://proxy.nghuyong.top/' url = get_update_url(url, data) res = requests.get(url, timeout=REQUESTS_TIME_OUT).json() return ['%s://%s' % (i['type'], i['ip_and_port']) for i in res.get('data', [])] if __name__ == '__main__': proxy_result = fetch_proxy() print(json.dumps(proxy_result, indent=4)) ================================================ FILE: tools/scrapy_tasks.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: scrapy_tasks.py @time: 2018-02-10 17:42 """ from apps.client_db import redis_client def pop_task(spider_name): key_set = 'scrapy:tasks_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.spop(key_set) def put_task(spider_name, *task_ids): key_set = 'scrapy:tasks_set:%(spider_name)s' % {'spider_name': spider_name} redis_client.sadd(key_set, *task_ids) def get_tasks_count(spider_name): key_set = 'scrapy:tasks_set:%(spider_name)s' % {'spider_name': spider_name} cookies_len = redis_client.scard(key_set) return cookies_len ================================================ FILE: tools/sys_monitor.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: sys_monitor.py @time: 2018-02-10 17:43 """ import psutil import time def bytes2human(n): """ >>> bytes2human(10000) '9.8 K' >>> bytes2human(100001221) '95.4 M' """ symbols = ('K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y') prefix = {} for i, s in enumerate(symbols): prefix[s] = 1 << (i + 1) * 10 for s in reversed(symbols): if n >= prefix[s]: value = float(n) / prefix[s] return '%.2f %s' % (value, s) return '%.2f B' % n def _format_info(k, v): if len(str(v)) <= 5: return '%-25s %5s' % (k, v) elif len(str(v)) <= 10: return '%-20s %10s' % (k, v) else: return '%-15s %15s' % (k, v) def _print_info(contents, topic=''): if topic: print('\n[%s]' % topic) contents.insert(0, '-' * 31) contents.append('-' * 31) print('\n'.join(contents)) def _cpu(): contents = [ _format_info('cpu_count_logical', psutil.cpu_count()), _format_info('cpu_count_physical', psutil.cpu_count(logical=False)), ] _print_info(contents, 'CPU') def _memory(): mem_virtual = psutil.virtual_memory() mem_swap = psutil.swap_memory() contents = [_format_info('mem_virtual_total', bytes2human(mem_virtual.total)), _format_info('mem_virtual_free', bytes2human(mem_virtual.free)), _format_info('mem_virtual_percent', '%s %%' % mem_virtual.percent), _format_info('mem_swap_total', bytes2human(mem_swap.total)), _format_info('mem_swap_free', bytes2human(mem_swap.free)), _format_info('mem_swap_percent', '%s %%' % mem_swap.percent)] _print_info(contents, 'Memory') def _disks(): sdisk_part = psutil.disk_partitions() contents = [] for i in sdisk_part: contents.append(_format_info(i.device, 
i.mountpoint)) sdisk_usage = psutil.disk_usage(i.mountpoint) contents.append(_format_info('disk_usage_total', bytes2human(sdisk_usage.total))) contents.append(_format_info('disk_usage_free', bytes2human(sdisk_usage.free))) contents.append(_format_info('disk_usage_percent', '%s %%' % sdisk_usage.percent)) _print_info(contents, 'Disks') def _network(speed=True): snetio = psutil.net_io_counters() contents = [_format_info('bytes_sent', bytes2human(snetio.bytes_sent)), _format_info('bytes_recv', bytes2human(snetio.bytes_recv))] if speed: time.sleep(1) snetio_after = psutil.net_io_counters() contents.append(_format_info('speed_sent', '%s/S' % bytes2human(snetio_after.bytes_sent - snetio.bytes_sent))) contents.append(_format_info('speed_recv', '%s/S' % bytes2human(snetio_after.bytes_recv - snetio.bytes_recv))) _print_info(contents, 'Network') def _sensors(): contents = [] if hasattr(psutil, "sensors_temperatures"): sensors_temperatures = psutil.sensors_temperatures() for name, entries in sensors_temperatures.items(): for entry in entries: contents.append( _format_info(entry.label or name, '%s °C' % entry.current)) sbattery = psutil.sensors_battery() if sbattery: contents.append(_format_info('battery_percent', '%s %%' % sbattery.percent)) contents.append(_format_info('secsleft', sbattery.secsleft)) contents.append(_format_info('power_plugged', sbattery.power_plugged)) _print_info(contents, 'Sensors') def stats(): _cpu() _memory() _disks() _network() _sensors() if __name__ == '__main__': stats() ================================================ FILE: tools/toutiao_m.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: toutiao_m.py @time: 2018-02-28 14:14 """ import hashlib import math import re import time import execjs from tools.char import un_escape def get_as_cp(): t = int(math.floor(time.time())) e = hex(t).upper()[2:] m = hashlib.md5() m.update(str(t).encode(encoding='utf-8')) i = m.hexdigest().upper() if len(e) != 8: AS = '479BB4B7254C150' CP = '7E0AC8874BB0985' return AS, CP n = i[0:5] a = i[-5:] s = '' r = '' for o in range(5): s += n[o] + e[o] r += e[o + 3] + a[o] AS = 'A1' + s + e[-3:] CP = e[0:3] + r + 'E1' return AS, CP def parse_toutiao_js_body(html_body, url=''): """ 解析js :param html_body: :param url: :return: """ rule = r'' js_list = re.compile(rule, re.S).findall(html_body) if not js_list: print('parse error url: %s' % url) print(html_body) return ''.join(js_list) class ParseJsTt(object): """ 解析头条动态数据 """ def __init__(self, js_body): self.js_body = js_body self._add_js_item_id_fn() self._add_js_title_fn() self._add_js_abstract_fn() self._add_js_content_fn() self._add_js_pub_time() self._add_js_tags_fn() self.ctx = execjs.compile(self.js_body) def _add_js_item_id_fn(self): js_item_id_fn = """ function r_item_id() { return BASE_DATA.articleInfo.itemId; }; """ self.js_body += js_item_id_fn def _add_js_title_fn(self): js_title_fn = """ function r_title() { return BASE_DATA.articleInfo.title; }; """ self.js_body += js_title_fn def _add_js_abstract_fn(self): js_abstract_fn = """ function r_abstract() { return BASE_DATA.shareInfo.abstract; }; """ self.js_body += js_abstract_fn def _add_js_content_fn(self): js_content_fn = """ function r_content() { return BASE_DATA.articleInfo.content; }; """ self.js_body += js_content_fn def _add_js_pub_time(self): js_pub_time_fn = """ function r_pub_time() { return BASE_DATA.articleInfo.subInfo.time; }; """ self.js_body += js_pub_time_fn def _add_js_tags_fn(self): 
        js_tags_fn = """
            function r_tags() {
                return BASE_DATA.articleInfo.tagInfo.tags;
            };
        """
        self.js_body += js_tags_fn

    def parse_js_item_id(self):
        return self.ctx.call('r_item_id') or ''

    def parse_js_title(self):
        return self.ctx.call('r_title') or ''

    def parse_js_abstract(self):
        return self.ctx.call('r_abstract') or ''

    def parse_js_content(self):
        return un_escape(self.ctx.call('r_content')) or ''

    def parse_js_pub_time(self):
        return self.ctx.call('r_pub_time') or time.strftime('%Y-%m-%d %H:%M:%S')

    def parse_js_tags(self):
        return ','.join([tag['name'] or '' for tag in self.ctx.call('r_tags')])


if __name__ == '__main__':
    print(get_as_cp())


================================================
FILE: tools/url.py
================================================

#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: url.py
@time: 2018-02-10 17:38
"""

# from urllib import urlencode  # PY2
# from urlparse import urlparse, urlunparse, parse_qsl  # PY2
# from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode  # PY3
from future.moves.urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

from scrapy.utils import request
from scrapy.http import Request


def get_update_url(url, data):
    """
    Rebuild a url with updated query parameters
    :param url:
    :param data:
    :return:
    """
    result = urlparse(url)
    query_payload = dict(parse_qsl(result.query), **data)
    query_param = urlencode(query_payload)
    return urlunparse((result.scheme, result.netloc, result.path, result.params, query_param, result.fragment))


def get_url_query_param(url, param):
    """
    Get the value of a url query parameter
    :param url:
    :param param:
    :return:
    """
    result = urlparse(url)
    return dict(parse_qsl(result.query)).get(param)


def get_request_finger(url):
    """
    Get the request fingerprint of a url (query parameter order does not matter)
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req)


def allow_url(url, allow_domains):
    url_parse = urlparse(url)
    result = False
    for domain in allow_domains:
        if url_parse.netloc.endswith(domain):
            result = True
    return result


if __name__ == '__main__':
    print(get_update_url('http://www.abc.com/def/', {'b': 2}))
    print(get_update_url('http://www.abc.com/def/?a=1', {'b': 2}))
    print(get_update_url('http://www.abc.com/def/?a=1', {'a': 2}))
    print(get_url_query_param('http://www.abc.com/def/?a=1&b=2', 'a'))
    print(allow_url('http://www.abc.com', ['abc.com']))
    print(allow_url('http://www.abc.com', ['b.com']))


================================================
FILE: tools/weibo.py
================================================

#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: weibo.py
@time: 2018-02-13 16:20
"""

import base64

# from urllib import quote  # PY2
# from urllib.parse import quote  # PY3
from future.moves.urllib.parse import quote
from future.builtins import input  # PY2(raw_input)


def get_su(user_name):
    # quote() returns text; b64encode requires bytes on Python 3, so encode first and decode back to text
    return base64.b64encode(quote(user_name.strip()).encode('utf-8')).decode('utf-8')


def get_login_data():
    print('Please type username and password!')
    username = input('username < ')
    password = input('password < ')
    if not (username and password):
        raise Exception('Method or function hasn\'t been implemented yet.')
    return {
        'username': username,
        'password': password
    }


if __name__ == '__main__':
    pass


================================================
FILE: tools/weixin.py
================================================

#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: weixin.py
@time: 2018-02-10 17:55
"""

import re
import time
import hashlib

# from urlparse import urljoin  # PY2
# from urllib.parse import urljoin  # PY3
from future.moves.urllib.parse import urljoin

import execjs

from tools.char import un_escape
from config import current_config
from models.news import FetchResult
from news.items import FetchResultItem
from apps.client_db import db_session_mysql
from maps.platform import WEIXIN, WEIBO

BASE_DIR = current_config.BASE_DIR


def get_finger(content_str):
    """
    :param content_str:
    :return:
    """
    m = hashlib.md5()
    # encode text to bytes before hashing (works on both Python 2 and Python 3)
    m.update(content_str if isinstance(content_str, bytes) else content_str.encode('utf-8'))
    finger = m.hexdigest()
    return finger


def parse_weixin_js_body(html_body, url=''):
    """
    Parse the js block
    :param html_body:
    :param url:
    :return:
    """
    rule = r''
    js_list = re.compile(rule, re.S).findall(html_body)
    if not js_list:
        print('parse error url: %s' % url)
    return ''.join(js_list)


def parse_weixin_article_id(html_body):
    rule = r'