Repository: zhanghe06/news_spider Branch: master Commit: 9e29525a8bcb Files: 105 Total size: 206.8 KB Directory structure: gitextract_ti7etvkp/ ├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── apps/ │ ├── __init__.py │ ├── client_db.py │ └── client_rk.py ├── config/ │ ├── __init__.py │ └── default.py ├── db/ │ ├── data/ │ │ └── mysql.sql │ └── schema/ │ └── mysql.sql ├── docs/ │ ├── Architecture.md │ ├── Components/ │ │ ├── MariaDB.md │ │ ├── Redis.md │ │ ├── SeaweedFS.md │ │ └── Squid.md │ ├── README.md │ ├── SUMMARY.md │ ├── Spiders/ │ │ ├── README.md │ │ ├── Toutiao.md │ │ ├── Weibo.md │ │ └── Weixin.md │ └── book.json ├── env_default.sh ├── etc/ │ ├── scrapy.ini │ ├── scrapyd.ini │ ├── supervisord.conf │ ├── tasks.ini │ └── toutiao.ini ├── libs/ │ ├── __init__.py │ ├── counter.py │ ├── ft.py │ ├── optical_modem.py │ ├── redis_pub_sub.py │ ├── redis_queue.py │ ├── rk.py │ └── weed_fs.py ├── logs/ │ └── index.html ├── maps/ │ ├── __init__.py │ ├── channel.py │ └── platform.py ├── models/ │ ├── __init__.py │ └── news.py ├── news/ │ ├── __init__.py │ ├── items.py │ ├── middlewares/ │ │ ├── __init__.py │ │ ├── anti_spider.py │ │ ├── content_type.py │ │ ├── de_duplication_request.py │ │ ├── httpproxy.py │ │ └── useragent.py │ ├── middlewares.py │ ├── pipelines/ │ │ ├── __init__.py │ │ ├── de_duplication_request.py │ │ ├── de_duplication_store_mysql.py │ │ ├── exporter_csv.py │ │ ├── img_remote_to_local_fs.py │ │ └── store_mysql.py │ ├── pipelines.py │ ├── settings.py │ └── spiders/ │ ├── __init__.py │ ├── ip.py │ ├── toutiao_m.py │ ├── weibo.py │ └── weixin.py ├── requirements-py2.txt ├── requirements-py3.txt ├── scrapy.cfg ├── tasks/ │ ├── __init__.py │ ├── job_put_tasks.py │ ├── job_reboot_net_china_net.py │ ├── jobs_proxies.py │ ├── jobs_sogou.py │ ├── jobs_weixin.py │ ├── run_job_counter_clear.py │ ├── run_job_put_tasks_toutiao.py │ ├── run_job_put_tasks_weibo.py │ ├── run_job_put_tasks_weixin.py │ ├── run_job_reboot_net_china_net.py │ ├── run_job_sogou_cookies.py │ ├── run_job_weixin_cookies.py │ ├── run_jobs.py │ └── run_jobs_apscheduler.py ├── tests/ │ ├── __init__.py │ ├── test_date_time.py │ └── test_finger.py └── tools/ ├── __init__.py ├── anti_spider_sogou.py ├── anti_spider_weixin.py ├── char.py ├── cookies.py ├── date_time.py ├── duplicate.py ├── gen.py ├── img.py ├── import_task.py ├── net_status.py ├── proxies.py ├── scrapy_tasks.py ├── sys_monitor.py ├── toutiao_m.py ├── url.py ├── weibo.py └── weixin.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ [run] include = tests/* omit = __init__.py [report] exclude_lines = pragma: no cover def __repr__ def __str__ if self.debug: if settings.DEBUG except ImportError raise AssertionError raise NotImplementedError if 0: if __name__ == .__main__.: ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) *.py[cod] *.env .idea .DS_Store logs/* !logs/index.html .coverage htmlcov/ .coveralls.yml csv/* # toutiao news/spiders/toutiao.py tools/toutiao.py # middlewares news/middlewares/httpproxy_vps.py # config #config/* #!config/__init__.py #!config/default.py # #env_*.sh #!env_default.sh # gitbook docs/_book/* docs/node_modules docs/package-lock.json ================================================ FILE: .travis.yml 
================================================ sudo: no dist: trusty language: python python: - "2.7" - "3.6" # command to install dependencies install: - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install -r requirements-py2.txt; fi - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install -r requirements-py3.txt; fi - pip install coveralls - pip install pyyaml # command to run tests script: - export PYTHONPATH=${PWD} - coverage run -a tests/test_date_time.py - coverage run -a tests/test_finger.py - coverage report after_success: # upload test report - coveralls ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018 碎ping子 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ ## 新闻抓取 [![Build Status](https://travis-ci.org/zhanghe06/news_spider.svg?branch=master)](https://travis-ci.org/zhanghe06/news_spider) [![Coverage Status](https://coveralls.io/repos/github/zhanghe06/news_spider/badge.svg?branch=master)](https://coveralls.io/github/zhanghe06/news_spider?branch=master) ### 项目演示 服务依赖: - MariaDB - Redis - NodeJS 本项目依赖第三方验证码识别服务 更新配置 config/default.py 用户名和密码 ``` RK_CONFIG = { 'username': '******', 'password': '******', 'soft_id': '93676', 'soft_key': '5d0e00b196c244cb9d8413809c62f9d5', } # 斐斐打码 FF_CONFIG = { 'pd_id': '******', 'pd_key': '******', 'app_id': '312451', 'app_key': '5YuN+6isLserKBZti4hoaI6UR2N5UT2j', } ``` ```bash # python2 virtualenv news_spider.env # 创建虚拟环境 # python3 virtualenv news_spider.env -p python3 # 创建虚拟环境 source env_default.sh # 激活虚拟环境 pip install -r requirements-py2.txt # 安装环境依赖 # 开发环境 模拟单次抓取 python tasks/job_put_tasks.py wx # 初次创建任务 python tasks/jobs_sogou.py # 初次应对反爬 scrapy crawl weixin # 开启微信爬虫 # 生产环境 开启持续抓取 supervisord # 开启守护进程 supervisorctl start all # 开启工作进程 ``` - env_develop.sh # 开发环境 - env_product.sh # 生产环境 ### 项目创建过程记录 项目依赖明细 ```bash pip install requests pip install scrapy pip install sqlalchemy pip install mysqlclient pip install sqlacodegen==1.1.6 # 注意: 最新版 sqlacodegen==2.0 有bug pip install redis pip install PyExecJS pip install Pillow pip install psutil pip install schedule pip install future # 兼容py2、py3 pip install supervisor # 当前主版本3只支持py2,将来主版本4(未发布)会支持py3 ``` 因当前`supervisor`不支持`python3`,故在`requirements.txt`中将其去掉 由于任务调度`apscheduler`不支持Py3(其中的依赖`futures`不支持),这里采用`schedule` `scrapy`的依赖`cryptography`在`2.2.2`版本中有[安全性问题](https://nvd.nist.gov/vuln/detail/CVE-2018-10903), 强烈建议更新至`2.3`及以上版本, 
可以通过更新`scrapy`的方式升级 `scrapy`的依赖`parsel`使用了`functools`的`lru_cache`方法( python2 是`functools32`的`lru_cache`方法;`functools32`是`functools`的反向移植) Mac 系统环境依赖(mariadb) ```bash brew unlink mariadb brew install mariadb-connector-c ln -s /usr/local/opt/mariadb-connector-c/bin/mariadb_config /usr/local/bin/mysql_config # pip install MySQL-python pip install mysqlclient # 基于 MySQL-python 兼容py2、py3 rm /usr/local/bin/mysql_config brew unlink mariadb-connector-c brew link mariadb ``` CentOS 系统环境依赖 ```bash yum install gcc yum install mysql-devel yum install python-devel yum install epel-release yum install redis yum install nodejs ``` CentOS 安装 python3 环境(CentOS 默认是不带 python3 的) ```bash yum install python34 yum install python34-devel ``` CentOS 安装 pip & virtualenv & git & vim ```bash yum install python-pip pip install --upgrade pip pip install virtualenv yum install git yum install vim ``` 创建项目 ```bash scrapy startproject news . scrapy genspider weixin mp.weixin.qq.com ``` 启动蜘蛛 ```bash scrapy crawl weixin ``` 如需测试微博, 修改以下方法, 更改正确用户名和密码 tools/weibo.py ``` def get_login_data(): return { 'username': '******', 'password': '******' } ``` ### 蜘蛛调试(以微博为例) 1. 清除中间件去重缓存, 重置调试任务 ``` 127.0.0.1:6379> DEL "dup:weibo:0" (integer) 1 127.0.0.1:6379> DEL "scrapy:tasks_set:weibo" (integer) 1 127.0.0.1:6379> SADD "scrapy:tasks_set:weibo" 130 (integer) 1 127.0.0.1:6379> ``` 2. 清除调试蜘蛛存储数据 ```mysql DELETE FROM fetch_result WHERE platform_id=2; ``` 3. 启动调试蜘蛛 ```bash scrapy crawl weibo ``` ### 验证码识别 ~~http://www.ruokuai.com/~~ ~~http://wiki.ruokuai.com/~~ ~~价格类型:~~ ~~http://www.ruokuai.com/home/pricetype~~ 热心网友反映`若快`已经关闭, 接下来会支持`斐斐打码`, 敬请期待 斐斐打码开发文档 [http://docs.fateadm.com](http://docs.fateadm.com) ### 索引说明 联合索引, 注意顺序, 同时注意查询条件字段类型需要与索引字段类型一致 实测, 数据量8万记录以上, 如果没有命中索引, 查询会很痛苦 ### 项目说明 亮点: 1. 支持分布式, 每个蜘蛛抓取进程对应一个独立的抓取任务 2. 
采用订阅发布模型的观察者模式, 处理并发场景的验证码识别任务, 避免无效的识别

备注: `mysql`中`text`最大长度为65,535(2的16次方–1)

类型 | 表达式 | 最大字节长度(bytes) | 大致容量
---: | ---: | ---: | ---:
TinyText | 2的8次方–1 | 255 | 255B
Text | 2的16次方–1 | 65,535 | 64KB
MediumText | 2的24次方–1 | 16,777,215 | 16MB
LongText | 2的32次方–1 | 4,294,967,295 | 4GB

由于微信公众号文章标签过多, 长度超过`Text`的最大值, 故建议采用`MediumText`

### 特别说明

头条请求签名

- M端需要2个参数: as、cp
- PC端需要3个参数: as、cp、_signature

M端2个参数获取方法已公开, 参考蜘蛛 toutiao_m

~~PC端3个参数获取方法已破解, 由于公开之后会引起头条反爬机制更新, 故没有公开, 如有需要, 敬请私聊, 仅供学习, 谢绝商用~~

因M端已满足数据获取要求, 不再开源PC端签名破解

### TODO

微博反爬处理


================================================
FILE: apps/__init__.py
================================================
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: __init__.py.py
@time: 2018-02-10 17:33
"""


def func():
    pass


class Main(object):
    def __init__(self):
        pass


if __name__ == '__main__':
    pass


================================================
FILE: apps/client_db.py
================================================
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: client_db.py
@time: 2018-02-10 17:34
"""

from sqlalchemy import create_engine
from sqlalchemy import distinct
from sqlalchemy import func
from sqlalchemy.orm import sessionmaker
import redis

from config import current_config

SQLALCHEMY_DATABASE_URI_MYSQL = current_config.SQLALCHEMY_DATABASE_URI_MYSQL
SQLALCHEMY_POOL_SIZE = current_config.SQLALCHEMY_POOL_SIZE
REDIS = current_config.REDIS

engine_mysql = create_engine(SQLALCHEMY_DATABASE_URI_MYSQL, pool_size=SQLALCHEMY_POOL_SIZE, max_overflow=0)
db_session_mysql = sessionmaker(bind=engine_mysql, autocommit=True)

redis_client = redis.Redis(**REDIS)


def get_item(model_class, pk_id):
    session = db_session_mysql()
    try:
        result = session.query(model_class).get(pk_id)
        return result
    finally:
        session.close()


def get_all(model_class, *args, **kwargs):
    session = db_session_mysql()
    try:
        result = session.query(model_class).filter(*args).filter_by(**kwargs).all()
        return result
    finally:
        session.close()


def get_distinct(model_class, field, *args, **kwargs):
    session = db_session_mysql()
    try:
        result = session.query(distinct(getattr(model_class, field)).label(field)).filter(*args).filter_by(**kwargs).all()
        return result
    finally:
        session.close()


def get_group(model_class, field, min_count=0, *args, **kwargs):
    field_obj = getattr(model_class, field)
    session = db_session_mysql()
    try:
        result = session.query(field_obj, func.count(field_obj).label('c')).filter(*args).filter_by(
            **kwargs).group_by(field_obj).having(func.count(field_obj) >= min_count).all()
        return result
    finally:
        session.close()


def add_item(model_class, data):
    session = db_session_mysql()
    try:
        ret = model_class(**data)
        session.add(ret)
        # 如需返回id, 需要手动flush
        session.flush()
        return ret.id
    finally:
        session.close()


================================================
FILE: apps/client_rk.py
================================================
#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: client_rk.py
@time: 2018-02-10 17:34
"""

from libs.rk import RKClient
from libs.counter import CounterClient
from apps.client_db import redis_client
from tools.cookies import len_cookies

from config import current_config

RK_CONFIG = current_config.RK_CONFIG
BASE_DIR = current_config.BASE_DIR
RK_LIMIT_COUNT_DAILY = current_config.RK_LIMIT_COUNT_DAILY
COOKIES_QUEUE_COUNT = current_config.COOKIES_QUEUE_COUNT

rc_client = RKClient(**RK_CONFIG)

rk_counter_client = CounterClient(redis_client, 'rk')

# 正常图形验证码
# 'im_type_id':
1000 # 任意长度数字 # 'im_type_id': 2000 # 任意长度字母 # 'im_type_id': 3000 # 任意长度英数混合 # 'im_type_id': 4000 # 任意长度汉字 # 'im_type_id': 5000 # 任意长度中英数三混 def get_img_code(im, im_type_id): """ 获取验证码 :param im: :param im_type_id: :return: """ rc_result = rc_client.rk_create(im, im_type_id) print(rc_result) if 'Error_Code' in rc_result: print(rc_result.get('Error')) return None, None # {u'Result': u'6dx2t8', u'Id': u'c8a897f0-9825-41a1-b19e-6195ba8559ed'} return rc_result['Id'], rc_result['Result'] def img_report_error(im_id): rc_client.rk_report_error(im_id) def check_counter_limit(): """ 检查是否超过限制(True: 没有超过; False: 超过限制) :return: """ rk_counter = rk_counter_client.get() return rk_counter < RK_LIMIT_COUNT_DAILY def check_cookies_count(spider_name): """ 检查 cookies 长度是否达到要求(True: 没有达到; False: 达到要求) :param spider_name: :return: """ return len_cookies(spider_name) < COOKIES_QUEUE_COUNT def counter_clear(): """ 计数器清零(每天0点) :return: """ rk_counter_client.clear() ================================================ FILE: config/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py @time: 2018-02-10 15:02 """ from __future__ import unicode_literals from __future__ import print_function import os from importlib import import_module MODE = os.environ.get('MODE') or 'default' try: current_config = import_module('config.' + MODE) print('[√] 当前环境变量: %s' % MODE) except ImportError: print('[!] 配置错误,请初始化环境变量') print('source env_develop.sh # 开发环境') print('source env_product.sh # 生产环境') ================================================ FILE: config/default.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: default.py @time: 2018-07-02 17:57 """ from __future__ import print_function from __future__ import unicode_literals import os BASE_DIR = os.path.dirname(os.path.dirname(__file__)) # requests 超时设置 REQUESTS_TIME_OUT = (30, 30) HOST_IP = '0.0.0.0' # 数据库 MySQL DB_MYSQL = { 'host': HOST_IP, 'user': 'root', 'passwd': '123456', 'port': 3306, 'db': 'news_spider' } SQLALCHEMY_DATABASE_URI_MYSQL = \ 'mysql+mysqldb://%s:%s@%s:%s/%s?charset=utf8' % \ (DB_MYSQL['user'], DB_MYSQL['passwd'], DB_MYSQL['host'], DB_MYSQL['port'], DB_MYSQL['db']) SQLALCHEMY_POOL_SIZE = 5 # 默认 pool_size=5 # 缓存,队列 REDIS = { 'host': HOST_IP, 'port': 6379, # 'password': '123456' # redis-cli AUTH 123456 } # 若快验证码识别 RK_CONFIG = { 'username': '******', 'password': '******', 'soft_id': '93676', 'soft_key': '5d0e00b196c244cb9d8413809c62f9d5', } # 斐斐打码 FF_CONFIG = { 'pd_id': '******', 'pd_key': '******', 'app_id': '312451', 'app_key': '5YuN+6isLserKBZti4hoaI6UR2N5UT2j', } # 熔断机制 每天请求限制(200元==500000快豆) RK_LIMIT_COUNT_DAILY = 925 # 队列保留 cookies 数量 COOKIES_QUEUE_COUNT = 5 # 分布式文件系统 WEED_FS_URL = 'http://%s:9333' % HOST_IP # 优先级配置(深度优先) DEPTH_PRIORITY = 1 PRIORITY_CONFIG = { 'list': 600, 'next': 500, 'detail': 800, } # 启动时间(启动时间之前的内容不抓取, 适用于新闻) START_TIME = '2018-01-01 00:00:00' ================================================ FILE: db/data/mysql.sql ================================================ USE news_spider; -- 插入用频道信息 TRUNCATE TABLE `channel`; INSERT INTO `channel` VALUES (1, 'recommend', '推荐', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (2, 'hot', '热点', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (3, 'technology', '科技', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (4, 
'social', '社会', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (5, 'entertainment', '娱乐', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (6, 'game', '游戏', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (7, 'sports', '体育', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (8, 'car', '汽车', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (9, 'finance', '财经', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (10, 'military', '军事', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (11, 'international', '国际', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (12, 'fashion', '时尚', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (13, 'travel', '旅游', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (14, 'explore', '探索', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (15, 'childcare', '育儿', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (16, 'health', '养生', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (17, 'article', '美文', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (18, 'history', '历史', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (19, 'food', '美食', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (20, 'education', '教育', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (21, 'electrical', '电气', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (22, 'machine', '机械', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); INSERT INTO `channel` VALUES (23, 'medical', '医疗', '', '2017-11-20 10:00:00', '2017-11-20 10:00:00'); -- 插入抓取任务信息 TRUNCATE TABLE `fetch_task`; INSERT INTO `fetch_task` VALUES (11, 3, 0, '6555293927', '制造业那些事儿', '', 'http://m.toutiao.com/profile/6555293927/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (12, 3, 0, '51555073058', '制造业福星高赵', '', 'http://m.toutiao.com/profile/51555073058/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (13, 3, 0, '58075853770', 'AI汽车制造业', '', 'http://m.toutiao.com/profile/58075853770/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (14, 3, 0, '51397533037', '制造业的云时代', '', 'http://m.toutiao.com/profile/51397533037/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (15, 3, 0, '6157673577', '电器制造业大事件', '', 'http://m.toutiao.com/profile/6157673577/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (16, 3, 0, '3810739482', '互联网扒皮王', '', 'http://m.toutiao.com/profile/3810739482/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (17, 3, 0, '5347877887', '互联网智慧驿站', '', 'http://m.toutiao.com/profile/5347877887/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (18, 1, 0, 'Root_Id', 'Website_Name', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (19, 1, 0, 'chuangbiandao', '创变岛', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (20, 1, 0, 
'changmaiw', '畅脉全球购', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (21, 1, 0, 'BizNext', '企鹅智酷', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (22, 1, 0, 'renhecom', '人和网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (23, 1, 0, 'rsqwyjs', '人生趣味研究所', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (24, 1, 0, 'shiyehome', '食业家', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (25, 1, 0, 'tyjzksp', '食品商', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (26, 1, 0, 'wisesale_lzzd', '联纵智达', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (27, 1, 0, 'sxlh002', '蓝海果业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (28, 1, 0, 'huxiu_com', '虎嗅网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (29, 1, 0, 'HZKSXFPJLQ', '华中快速消费品经理群', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (30, 1, 0, 'kuaixiao999888', '经销商那些事儿', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (31, 1, 0, 'jingxiaoshang168', '经销商', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (32, 1, 0, 'fmcgchina', '快消品网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (33, 1, 0, 'FMCG-CLUB', '快速消费品精英俱乐部', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (34, 1, 0, 'tyjspb', '食品板', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (35, 1, 0, 'yxts518', '营销透视镜', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (36, 1, 0, 'salesman66', '营销人', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (37, 1, 0, 'cn-beverage', '饮料行业网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (38, 1, 0, 'youshudejiu', '有数酒业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (39, 1, 0, 'i-yiou', '亿欧网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (40, 1, 0, 'CLFDA-001', '中国副食流通协会总监联盟', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (41, 1, 0, 'wbfood', '58食品网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (42, 1, 0, 'lanhaiyingxiao', '营销兵法', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (43, 1, 0, 'AutoMan-No1', 'AutoMan', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (44, 1, 0, 'leiphone-sz', '雷锋网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (45, 1, 0, 'coffeeO2O', '餐饮O2O', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (46, 1, 0, 'newso2o', '零售渠道观察', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (47, 1, 0, 'wwwcbocn', '化妆品财经在线', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES 
(48, 1, 0, 'dushekeji', '毒舌科技', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (49, 1, 0, 'zgsppj', '新食品评介', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (50, 1, 0, 'foodinc', '小食代', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (51, 1, 0, 'lookforfoods', '食品饮料新零售内参', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (52, 1, 0, 'wow36kr', '36氪', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (53, 1, 0, 'food-gnosis', '食悟', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (54, 1, 0, 'newfortune', '新财富杂志', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (55, 1, 0, 'lp800315111', '快消家', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (56, 1, 0, 'tancaijing', '叶檀财经', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (57, 1, 0, 'yigejubaopen', '市井财经', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (58, 1, 0, 'njss02584195518', '工程机械微管家', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (59, 1, 0, 'jiajucy', '家具产业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (60, 1, 0, 'chinafood365', '中国食品网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (61, 1, 0, 'dqjswol', '电气自动化控制网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (62, 1, 0, 'zgyybweixin', '中国医药报', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (63, 1, 0, 'fzfzzk', '纺织服装周刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (64, 1, 0, 'www-glass-com-cn', '中国玻璃网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (65, 1, 0, 'amdaily', '先进制造业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (66, 1, 0, 'cmpzhizao', '制造业那些事儿', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (67, 1, 0, 'zhishexueshuquan', '知社学术圈', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (68, 1, 0, 'keyanquan', '科研圈', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (69, 1, 0, 'iccafe-sh', 'IC咖啡', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (70, 1, 0, 'robotmagazine', '机器人技术与应用', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (71, 1, 0, 'productronicaChina', '慕尼黑上海电子生产设备展', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (72, 1, 0, 'electronicaChina', 'e星球', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (73, 1, 0, 'feelingcar666', '飞灵汽车', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (74, 1, 0, 'depo88', '分布式能源', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (75, 1, 0, 'jianyuecheping', '建约车评', '', '', 1, '', '2017-01-11 11:01:05', 
'2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (76, 1, 0, 'AECC-2016', '中国航发', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (77, 1, 0, 'mesbook', 'MES百科', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (78, 1, 0, 'mtmt-1951', '机床杂志社', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (79, 1, 0, 'AI_era', '新智元', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (80, 1, 0, 'ikanlixiang', '看理想', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (81, 1, 0, 'AVICESI', '中行伊萨', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (82, 1, 0, 'www_51shape_com', '3D科学谷', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (83, 1, 0, 'i-zhoushuo', '周说', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (84, 1, 0, 'guoguo_innovation', '蝈蝈创新随笔', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (85, 1, 0, 'e-zhizao', 'e制造', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (86, 1, 0, 'RoboSpeak', '机器人大讲堂', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (87, 1, 0, 'The-Intellectual', '知识分子', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (88, 1, 0, 'sdr-china', '软件定义世界', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (89, 1, 0, 'wufutu5', '洞见', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (90, 1, 0, 'siid_2inno', '之新网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (91, 1, 0, 'e-works', '数字化企业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (92, 1, 0, 'smr8700', '水木然', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (93, 1, 0, 'casic3s', '航天科工系统仿真科技', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (94, 1, 0, 'xiangxt1984', '向小田', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (95, 1, 0, 'gh_7157c03a9f49', '理深科技时评', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (96, 1, 0, 'gh_8189758efb1b', '国富资本熊焰', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (97, 1, 0, 'iscientists', '赛先生', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (98, 1, 0, 'bjcppmp', '中国造纸杂志社', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (99, 1, 0, 'CPA-PAPER', '中国造纸协会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (100, 1, 0, 'CTAPI-Paper', '中国造纸学会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (101, 1, 0, 'zzcywd', '造纸产业', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (102, 1, 0, 'paperCEO', '造纸老板内刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (103, 1, 0, 'gh_28281e9f6cc4', '造纸助手', '', '', 1, '', 
'2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (104, 1, 0, 'qgzzbwh', '全国造纸工业标准化技术委员会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (105, 1, 0, 'waysmos', '造纸化学品', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (106, 1, 0, 'wff168_com', '第一家具网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (107, 1, 0, 'jiajuwxw', '家具微新闻', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (108, 1, 0, 'Furniture_China', '上海家具展', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (109, 1, 0, 'jiajuzhuliuMF', '家具主流', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (110, 1, 0, 'jjgle2015', '家具在线', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (111, 1, 0, 'nfsyyjjb', '医药经济报', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (112, 1, 0, 'iyiyaomofang', '医药魔方数据', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (113, 1, 0, 'gh_260ce2309fff', 'MIMS医药资讯', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (114, 1, 0, 'yyguancha', '医药观察家网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (115, 1, 0, 'yyshoujibao', '医药手机报', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (116, 1, 0, 'shstpa', '上海医药商业行业协会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (117, 1, 0, 'fangda_healthcare', '医药法律评论', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (118, 1, 0, 'cmpma1989', '中国医药物资协会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (119, 1, 0, 'yehenala_678', '医药那些事儿', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (120, 1, 0, 'imrobotic', '机器人在线', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (121, 1, 0, 'CSDN_Tech', 'CSDN技术头条', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (122, 1, 0, 'CSDN_BLOG', 'CSDN博客', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (123, 1, 0, 'CSDNLIB', 'CSDN知识库', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (124, 1, 0, 'csdn_iot', 'CSDN物联网开发', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (125, 2, 0, '1005051627825392', '互联网的那点事', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (126, 2, 0, '1006061787567623', '199IT-互联网数据中心', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (127, 2, 0, '1002061577794853', '互联网的一些事', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (128, 2, 0, '1002063318777442', '互联网创业刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (129, 2, 0, '1006061661377270', '互联网观察网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (130, 2, 0, '1002062210869832', '互联网新闻网', '', 
'', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (131, 2, 0, '1006063481197561', '中国互联网安全大会', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (132, 2, 0, '1002061768025224', '互联网周刊', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (133, 2, 0, '1002063819805149', '互联网焦点网', '', '', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); INSERT INTO `fetch_task` VALUES (134, 3, 0, '55982516338', '奇文志怪', '', 'http://m.toutiao.com/profile/55982516338/', 1, '', '2018-09-06 14:01:05', '2018-09-06 14:01:05'); INSERT INTO `fetch_task` VALUES (135, 3, 0, '6014591174', '鹏君读书', '', 'http://m.toutiao.com/profile/6014591174/', 1, '', '2017-01-11 11:01:05', '2017-01-11 11:01:05'); ================================================ FILE: db/schema/mysql.sql ================================================ DROP DATABASE IF EXISTS `news_spider`; CREATE DATABASE `news_spider` /*!40100 DEFAULT CHARACTER SET utf8 */; use news_spider; CREATE TABLE `channel` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `code` VARCHAR(20) COMMENT '频道编号', `name` VARCHAR(20) COMMENT '频道名称', `description` VARCHAR(500) DEFAULT '' COMMENT '描述', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`), UNIQUE KEY idx_code (`code`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='频道表'; CREATE TABLE `fetch_task` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `platform_id` TINYINT DEFAULT 0 COMMENT '平台id(1:微信;2:微博;3:头条)', `channel_id` TINYINT DEFAULT 0 COMMENT '频道id', `follow_id` VARCHAR(45) DEFAULT '' COMMENT '关注账号id', `follow_name` VARCHAR(45) DEFAULT '' COMMENT '关注账号名称', `avatar_url` VARCHAR(512) DEFAULT '' COMMENT '关注账号头像', `fetch_url` VARCHAR(512) DEFAULT '' COMMENT '抓取入口', `flag_enabled` TINYINT DEFAULT 0 COMMENT '启用标记(0:未启用;1:已启用)', `description` VARCHAR(500) DEFAULT '' COMMENT '描述', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`), UNIQUE KEY idx_platform_follow_id (`platform_id`, `follow_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='抓取任务表'; CREATE TABLE `fetch_result` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `task_id` INT NOT NULL COMMENT '任务id', `platform_id` TINYINT DEFAULT 0 COMMENT '平台id(1:微信;2:微博;3:头条)', `platform_name` VARCHAR(50) DEFAULT '' COMMENT '平台名称(1:微信;2:微博;3:头条)', `channel_id` TINYINT DEFAULT 0 COMMENT '频道id', `channel_name` VARCHAR(50) DEFAULT '' COMMENT '频道名称', `article_id` VARCHAR(50) DEFAULT '' COMMENT '文章id', `article_url` VARCHAR(512) DEFAULT '' COMMENT '文章链接', `article_title` VARCHAR(100) DEFAULT '' COMMENT '文章标题', `article_author_id` VARCHAR(100) DEFAULT '' COMMENT '文章作者id(对应follow_id)', `article_author_name` VARCHAR(100) DEFAULT '' COMMENT '文章作者名称(对应follow_name)', `article_tags` VARCHAR(100) DEFAULT '' COMMENT '文章标签(半角逗号分隔)', `article_abstract` VARCHAR(500) DEFAULT '' COMMENT '文章摘要', `article_content` MEDIUMTEXT COMMENT '文章内容', `article_pub_time` DATETIME DEFAULT '1000-01-01 00:00:00' COMMENT '文章发布时间', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`), KEY idx_task_id (`task_id`), UNIQUE KEY idx_platform_article_id (`platform_id`, `article_id`), KEY idx_platform_author_id (`platform_id`, `article_author_id`), KEY 
idx_article_pub_time (`article_pub_time`), KEY idx_create_time (`create_time`), KEY idx_update_time (`update_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='抓取结果表'; CREATE TABLE `log_task_scheduling` ( `id` INT(11) NOT NULL AUTO_INCREMENT, `platform_id` TINYINT DEFAULT 0 COMMENT '平台id(1:微信;2:微博;3:头条)', `platform_name` VARCHAR(50) DEFAULT '' COMMENT '平台名称(1:微信;2:微博;3:头条)', `spider_name` VARCHAR(45) DEFAULT '' COMMENT '蜘蛛名称,一般同平台名称', `task_quantity` INT(11) DEFAULT 0 COMMENT '任务数量', `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='任务调度日志表'; -- 更新记录[2018-02-13] # ALTER TABLE `fetch_result` MODIFY `article_content` MEDIUMTEXT COMMENT '文章内容'; -- 更新记录[2018-05-29] # DROP INDEX idx_platform_author_id ON `fetch_result`; # ALTER TABLE `fetch_result` ADD INDEX idx_platform_author_id (`platform_id`, `article_author_id`); # ALTER TABLE `fetch_result` MODIFY `article_pub_time` DATETIME DEFAULT '1000-01-01 00:00:00' COMMENT '文章发布时间'; # ALTER TABLE `fetch_result` ADD INDEX idx_article_pub_time (`article_pub_time`); # ALTER TABLE `fetch_result` ADD INDEX idx_create_time (`create_time`); # ALTER TABLE `fetch_result` ADD INDEX idx_update_time (`update_time`); ================================================ FILE: docs/Architecture.md ================================================ # 整体架构(Architecture) - MariaDB 每个公众号/发布号的首页(即爬虫抓取入口)存储于数据库中。 表结构 db/schema/mysql.sql 测试数据 db/data/mysql.sql - Redis 为了支持分布式, 抓取任务单独存放于缓存, 这样在调试时, 需要手动执行创建任务。 参考[启动说明](Spiders/README.md) 为了方便调试, 本项目所有缓存key均以`scrapy:`作为前缀 - NodeJS 部分详情页面的信息抽取, 本项目使用js处理, 避免正则表达式规则的不完全覆盖。 ================================================ FILE: docs/Components/MariaDB.md ================================================ # MariaDB ================================================ FILE: docs/Components/Redis.md ================================================ # Redis ================================================ FILE: docs/Components/SeaweedFS.md ================================================ # SeaweedFS [SeaweedFS 项目地址](https://github.com/chrislusf/seaweedfs) ## 安装 ### Go (Golang) 下载页面: https://golang.org/dl/ ``` $ wget https://dl.google.com/go/go1.11.1.linux-amd64.tar.gz $ sudo tar -C /usr/local -xzf go1.11.1.linux-amd64.tar.gz $ sudo vim /etc/profile export GOROOT=/usr/local/go export GOPATH=$HOME/work export PATH=$PATH:$GOROOT/bin:$GOPATH/bin $ source /etc/profile ``` 或者仅为当前用户设置环境变量 ``` $ vim ~/.bashrc $ source ~/.bashrc ``` 注意:使用 zsh 的用户, 需要为 zsh 设置环境变量 ``` $ vim ~/.zshrc $ source ~/.zshrc ``` ### Weed 依赖 git (版本控制工具) ``` go get github.com/chrislusf/seaweedfs/weed ``` ## 启动 Start Master Server ``` $ weed master ``` Start Volume Servers ``` $ mkdir /tmp/data1 /tmp/data2 $ chmod 777 /tmp/data1 /tmp/data2 $ weed volume -dir="/tmp/data1" -max=5 -mserver="localhost:9333" -port=8080 & $ weed volume -dir="/tmp/data2" -max=10 -mserver="localhost:9333" -port=8081 & ``` ``` $ weed volume -dir=/tmp/data1/ -mserver="localhost:9333" -ip="192.168.2.32" -port=8080 ``` ## 启动(方式二) ``` $ weed server -dir=/tmp/data1/ -filer -filer.port=8000 -master.port=9333 -volume.port=8001 ``` 集群管理: http://127.0.0.1:9333/ 归档管理: http://localhost:8000/ 卷积管理: http://localhost:8001/ui/index.html 图片地址: http://localhost:8001/ 上传文件请求 ``` $ curl http://localhost:9333/dir/assign {"fid":"2,055a54a8ec","url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080","count":1} ``` 上传文件 ``` $ curl -X PUT -F 
file=@/home/zhanghe/metro.jpg http://127.0.0.1:8080/2,055a54a8ec
{"name":"metro.jpg","size":1830848}
```

删除文件

```
$ curl -X DELETE http://127.0.0.1:8080/2,055a54a8ec
{"size":1830869}
```

文件读取

```
$ curl "http://localhost:9333/dir/lookup?volumeId=2"
{"volumeId":"2","locations":[{"url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080"}]}
```

访问文件

- [http://127.0.0.1:8080/2,055a54a8ec.jpg](http://127.0.0.1:8080/2,055a54a8ec.jpg)
- [http://127.0.0.1:8080/2/055a54a8ec.jpg](http://127.0.0.1:8080/2/055a54a8ec.jpg)
- [http://127.0.0.1:8080/2/055a54a8ec](http://127.0.0.1:8080/2/055a54a8ec)
- [http://127.0.0.1:8080/2/055a54a8ec?height=200&width=200](http://127.0.0.1:8080/2/055a54a8ec?height=200&width=200)

导出文件打包

```
$ weed export -dir=/tmp/data1 -volumeId=1 -o=/tmp/data1.tar -fileNameFormat={{.Name}} -newer='2006-01-02T15:04:05'
```

解包具体文件

```
$ tar -xvf data1.tar
```

## 快速安装

```bash
# Mac系统
$ wget -c https://github.com/chrislusf/seaweedfs/releases/download/0.76/darwin_amd64.tar.gz -O weed_darwin_amd64.tar.gz
$ tar -zxvf weed_darwin_amd64.tar.gz

# Linux系统
$ wget -c https://github.com/chrislusf/seaweedfs/releases/download/0.76/linux_arm64.tar.gz -O weed_linux_arm64.tar.gz
$ tar -zxvf weed_linux_arm64.tar.gz

# 启动
$ ./weed server -dir=weed_data/ -filer -filer.port=8000 -master.port=9333 -volume.port=8001 -volume.max=32
```


================================================
FILE: docs/Components/Squid.md
================================================
# Squid


================================================
FILE: docs/README.md
================================================
# scrapy最佳实践 - 新闻抓取

## GitBook 操作指南

初始化

```bash
cd docs
npm install -g gitbook-cli
npm install --save gitbook-plugin-todo
npm install --save gitbook-plugin-mermaid-full
gitbook init
# 或者
gitbook install
```

开启服务

```bash
gitbook serve
```

访问 [http://localhost:4000](http://localhost:4000)


================================================
FILE: docs/SUMMARY.md
================================================
# Summary

* [项目介绍](README.md)
* [项目架构](Architecture.md)
* [爬虫模块](Spiders/README.md)
    * [微信爬虫](Spiders/Weixin.md)
    * [微博爬虫](Spiders/Weibo.md)
    * [头条爬虫](Spiders/Toutiao.md)
* 组件服务
    * [MariaDB](Components/MariaDB.md)
    * [Redis](Components/Redis.md)
    * [SeaweedFS](Components/SeaweedFS.md)


================================================
FILE: docs/Spiders/README.md
================================================
# Spiders

1、部署系统依赖

- MariaDB
- Redis
- NodeJS

2、部署项目依赖

```
pip install -r requirements-py2.txt    # python3 环境使用 requirements-py3.txt
```

3、创建数据库, 建立抓取入口

- 建表结构 db/schema/mysql.sql
- 测试数据 db/data/mysql.sql

4、创建抓取任务, 写入缓存

```
(news_spider.env) ➜  news_spider git:(master) ✗ python tasks/job_put_tasks.py
[√] 当前环境变量: develop
缺失参数
Example:
python job_put_tasks.py wx    # 微信
python job_put_tasks.py wb    # 微博
python job_put_tasks.py tm    # 头条(M)
```

参考以上提示, 对应蜘蛛执行各自的脚本完成任务创建

5、微信抓取, 需要初始化cookie, 其他两个蜘蛛不需要

生产环境, 可以使用`supervisor`自动守护`scrapy.ini`、`tasks.ini`这两组进程, 根据需要自行修改


================================================
FILE: docs/Spiders/Toutiao.md
================================================
# 头条(M端)

创建任务详情

```mysql
INSERT INTO `fetch_task` VALUES (134, 3, 0, '55982516338', '奇文志怪', '', 'http://m.toutiao.com/profile/55982516338/', 1, '', '2018-09-06 14:01:05', '2018-09-06 14:01:05');
```

进入redis, 检查调度任务数量

```
127.0.0.1:6379> SCARD "scrapy:tasks_set:toutiao_m"
(integer) 439
```

如果没有调度任务, 需要创建调度任务

```
python tasks/job_put_tasks.py tm
```

开启爬虫

```
scrapy crawl toutiao_m
```


================================================
FILE: docs/Spiders/Weibo.md
================================================
# 微博 进入redis, 检查调度任务数量 ``` 127.0.0.1:6379> SCARD "scrapy:tasks_set:weibo" (integer) 0 ``` 如果没有调度任务, 需要创建调度任务 ``` python tasks/job_put_tasks.py wb ``` 开启爬虫 ``` scrapy crawl weibo ``` ================================================ FILE: docs/Spiders/Weixin.md ================================================ # 微信 进入redis, 检查调度任务数量 ``` 127.0.0.1:6379> SCARD "scrapy:tasks_set:weixin" (integer) 0 ``` 如果没有调度任务, 需要创建调度任务 ``` python tasks/job_put_tasks.py wx ``` 开启爬虫 ``` scrapy crawl weixin ``` ================================================ FILE: docs/book.json ================================================ { "language": "zh-hans", "author": "碎ping子", "plugins": [ "todo", "mermaid-full@>=0.5.1" ] } ================================================ FILE: env_default.sh ================================================ #!/usr/bin/env bash source news_spider.env/bin/activate export PATH=${PWD}:${PATH} export PYTHONPATH=${PWD} export PYTHONIOENCODING=utf-8 export MODE=default ================================================ FILE: etc/scrapy.ini ================================================ [group:scrapy] programs=weixin,weibo,toutiao [program:weixin] command=scrapy crawl weixin directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_weixin.log [program:weibo] command=scrapy crawl weibo directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_weibo.log [program:toutiao] command=scrapy crawl toutiao directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_toutiao.log ================================================ FILE: etc/scrapyd.ini ================================================ [program:scrapyd] command=scrapyd directory=news priority=200 startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapyd.log ================================================ FILE: etc/supervisord.conf ================================================ ; Sample supervisor config file. ; ; For more information on the config file, please see: ; http://supervisord.org/configuration.html ; ; Notes: ; - Shell expansion ("~" or "$HOME") is not supported. Environment ; variables can be expanded using this syntax: "%(ENV_HOME)s". ; - Comments must have a leading space: "a=b ;comment" not "a=b;comment". ;[unix_http_server] ;file=/tmp/supervisor.sock ; (the path to the socket file) ;chmod=0700 ; socket file mode (default 0700) ;chown=nobody:nogroup ; socket file uid:gid owner ;username=user ; (default is no username (open server)) ;password=123 ; (default is no password (open server)) [inet_http_server] ; inet (TCP) server disabled by default port=127.0.0.1:9001 ; (ip_address:port specifier, *:port for all iface) username=user ; (default is no username (open server)) password=123 ; (default is no password (open server)) [supervisord] logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log) logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) logfile_backups=10 ; (num of main logfile rotation backups;default 10) loglevel=info ; (log level;default info; others: debug,warn,trace) pidfile=/tmp/supervisord.pid ; (supervisord pidfile;default supervisord.pid) nodaemon=false ; (start in foreground if true;default false) minfds=1024 ; (min. avail startup file descriptors;default 1024) minprocs=200 ; (min. 
avail process descriptors;default 200) ;umask=022 ; (process file creation umask;default 022) ;user=chrism ; (default is current user, required if root) ;identifier=supervisor ; (supervisord identifier, default is 'supervisor') ;directory=/tmp ; (default is not to cd during start) ;nocleanup=true ; (don't clean up tempfiles at start;default false) ;childlogdir=/tmp ; ('AUTO' child log dir, default $TEMP) ;environment=KEY="value" ; (key value pairs to add to environment) ;strip_ansi=false ; (strip ansi escape codes in logs; def. false) ; the below section must remain in the config file for RPC ; (supervisorctl/web interface) to work, additional interfaces may be ; added by defining them in separate rpcinterface: sections [rpcinterface:supervisor] supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface [supervisorctl] ;serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket serverurl=http://127.0.0.1:9001 ; use an http:// url to specify an inet socket username=user ; should be same as http_username if set password=123 ; should be same as http_password if set ;prompt=mysupervisor ; cmd line prompt (default "supervisor") ;history_file=~/.sc_history ; use readline history if available ; The below sample program section shows all possible program subsection values, ; create one or more 'real' program: sections to be able to control them under ; supervisor. ;[program:theprogramname] ;command=/bin/cat ; the program (relative uses PATH, can take args) ;process_name=%(program_name)s ; process_name expr (default %(program_name)s) ;numprocs=1 ; number of processes copies to start (def 1) ;directory=/tmp ; directory to cwd to before exec (def no cwd) ;umask=022 ; umask for process (default None) ;priority=999 ; the relative start priority (default 999) ;autostart=true ; start at supervisord start (default: true) ;startsecs=1 ; # of secs prog must stay up to be running (def. 
1) ;startretries=3 ; max # of serial start failures when starting (default 3) ;autorestart=unexpected ; when to restart if exited after running (def: unexpected) ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2) ;stopsignal=QUIT ; signal used to kill process (default TERM) ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) ;stopasgroup=false ; send stop signal to the UNIX process group (default false) ;killasgroup=false ; SIGKILL the UNIX process group (def false) ;user=chrism ; setuid to this UNIX account to run the program ;redirect_stderr=true ; redirect proc stderr to stdout (default false) ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO ;stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stdout_logfile_backups=10 ; # of stdout logfile backups (default 10) ;stdout_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0) ;stdout_events_enabled=false ; emit events on stdout writes (default false) ;stderr_logfile=/a/path ; stderr log path, NONE for none; default AUTO ;stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stderr_logfile_backups=10 ; # of stderr logfile backups (default 10) ;stderr_capture_maxbytes=1MB ; number of bytes in 'capturemode' (default 0) ;stderr_events_enabled=false ; emit events on stderr writes (default false) ;environment=A="1",B="2" ; process environment additions (def no adds) ;serverurl=AUTO ; override serverurl computation (childutils) ; The below sample eventlistener section shows all possible ; eventlistener subsection values, create one or more 'real' ; eventlistener: sections to be able to handle event notifications ; sent by supervisor. ;[eventlistener:theeventlistenername] ;command=/bin/eventlistener ; the program (relative uses PATH, can take args) ;process_name=%(program_name)s ; process_name expr (default %(program_name)s) ;numprocs=1 ; number of processes copies to start (def 1) ;events=EVENT ; event notif. types to subscribe to (req'd) ;buffer_size=10 ; event buffer queue size (default 10) ;directory=/tmp ; directory to cwd to before exec (def no cwd) ;umask=022 ; umask for process (default None) ;priority=-1 ; the relative start priority (default -1) ;autostart=true ; start at supervisord start (default: true) ;startsecs=1 ; # of secs prog must stay up to be running (def. 
1) ;startretries=3 ; max # of serial start failures when starting (default 3) ;autorestart=unexpected ; autorestart if exited after running (def: unexpected) ;exitcodes=0,2 ; 'expected' exit codes used with autorestart (default 0,2) ;stopsignal=QUIT ; signal used to kill process (default TERM) ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10) ;stopasgroup=false ; send stop signal to the UNIX process group (default false) ;killasgroup=false ; SIGKILL the UNIX process group (def false) ;user=chrism ; setuid to this UNIX account to run the program ;redirect_stderr=false ; redirect_stderr=true is not allowed for eventlisteners ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO ;stdout_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stdout_logfile_backups=10 ; # of stdout logfile backups (default 10) ;stdout_events_enabled=false ; emit events on stdout writes (default false) ;stderr_logfile=/a/path ; stderr log path, NONE for none; default AUTO ;stderr_logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) ;stderr_logfile_backups=10 ; # of stderr logfile backups (default 10) ;stderr_events_enabled=false ; emit events on stderr writes (default false) ;environment=A="1",B="2" ; process environment additions ;serverurl=AUTO ; override serverurl computation (childutils) ; The below sample group section shows all possible group values, ; create one or more 'real' group: sections to create "heterogeneous" ; process groups. ;[group:thegroupname] ;programs=progname1,progname2 ; each refers to 'x' in [program:x] definitions ;priority=999 ; the relative start priority (default 999) ; The [include] section can just contain the "files" setting. This ; setting can list multiple files (separated by whitespace or ; newlines). It can also contain wildcards. The filenames are ; interpreted as relative to this file. Included files *cannot* ; include files themselves. 
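; Project note (see docs/Spiders/README.md): for production this repository runs the
; spider group (scrapy.ini) and the task-scheduling group (tasks.ini) under supervisor,
; i.e. "files = scrapy.ini tasks.ini" as shown in the commented example below.
; The active [include] below only pulls in toutiao.ini (Toutiao spider plus its
; task-scheduling job); adjust the "files" setting as needed.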
;[include] ;files = relative/directory/*.ini ;[include] ;files = scrapy.ini tasks.ini [include] files = toutiao.ini ================================================ FILE: etc/tasks.ini ================================================ [group:tasks] programs=counter_clear,put_tasks_toutiao,put_tasks_weibo,put_tasks_weixin,sogou_cookies,weixin_cookies [program:counter_clear] command=python tasks/run_job_counter_clear.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/counter_clear.log [program:put_tasks_toutiao] command=python tasks/run_job_put_tasks_toutiao.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_toutiao.log [program:put_tasks_weibo] command=python tasks/run_job_put_tasks_weibo.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_weibo.log [program:put_tasks_weixin] command=python tasks/run_job_put_tasks_weixin.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_weixin.log [program:sogou_cookies] command=python tasks/run_job_sogou_cookies.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/sogou_cookies.log [program:weixin_cookies] command=python tasks/run_job_weixin_cookies.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/weixin_cookies.log ================================================ FILE: etc/toutiao.ini ================================================ [group:toutiao] programs=put_tasks,scrapy [program:put_tasks] command=python tasks/run_job_put_tasks_toutiao.py startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/put_tasks_toutiao.log [program:scrapy] command=scrapy crawl toutiao directory=news startsecs=0 stopwaitsecs=0 autostart=false autorestart=true redirect_stderr=true stdout_logfile=logs/scrapy_toutiao.log ;[program:reboot_net] ;command=python tasks/run_job_reboot_net_china_net.py ;startsecs=0 ;stopwaitsecs=0 ;autostart=false ;autorestart=true ;redirect_stderr=true ;stdout_logfile=logs/reboot_net_china_net.log ================================================ FILE: libs/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 15:24 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: libs/counter.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: counter.py @time: 2018-02-10 15:24 """ from redis import Redis class CounterClient(object): """ 计数器 """ def __init__(self, redis_client, entity_name, prefix='counter'): """ :param redis_client: :param entity_name: :param prefix: """ self.redis_client = redis_client # type: Redis self.counter_key = "%s:%s" % (prefix, entity_name) def increase(self, amount=1): """ 增加计数 :param amount: :return: """ return int(self.redis_client.incr(self.counter_key, amount)) def decrease(self, amount=1): """ 减少计数 :param amount: :return: """ return int(self.redis_client.decr(self.counter_key, amount)) def get(self): """ 获取计数 :return: """ return int(self.redis_client.get(self.counter_key) or 0) def clear(self): """ 清除计数 :return: """ return 
self.redis_client.delete(self.counter_key) ================================================ FILE: libs/ft.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: ff.py @time: 2019-05-26 14:26 """ import base64 import hashlib import time import requests URL = "http://pred.fateadm.com" class FTClient(object): def __init__(self, pd_id, pd_key, app_id='', app_key=''): self.pd_id = pd_id self.pd_key = pd_key self.app_id = app_id self.app_key = app_key self.host = URL self.s = requests.session() self.timeout = 30 @staticmethod def calc_sign(pd_id, pd_key, timestamp): md5 = hashlib.md5() md5.update(timestamp + pd_key) sign_a = md5.hexdigest() md5 = hashlib.md5() md5.update(pd_id + timestamp + sign_a) sign_b = md5.hexdigest() return sign_b @staticmethod def calc_card_sign(card_id, card_key, timestamp, pd_key): md5 = hashlib.md5() md5.update(pd_key + timestamp + card_id + card_key) return md5.hexdigest() def query_balance(self): """查询余额""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign } url = self.host + "/api/custval" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def query_tts(self, predict_type): """查询网络延迟""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, "predict_type": predict_type, } if self.app_id != "": asign = self.calc_sign(self.app_id, self.app_key, tm) param["appid"] = self.app_id param["asign"] = asign url = self.host + "/api/qcrtt" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def predict(self, predict_type, img_data): """识别验证码""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) img_base64 = base64.b64encode(img_data) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, "predict_type": predict_type, "img_data": img_base64, } if self.app_id != "": asign = self.calc_sign(self.app_id, self.app_key, tm) param["appid"] = self.app_id param["asign"] = asign url = self.host + "/api/capreg" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def predict_from_file(self, predict_type, file_name): """从文件进行验证码识别""" with open(file_name, "rb+") as f: data = f.read() return self.predict(predict_type, data) def justice(self, request_id): """识别失败,进行退款请求""" if request_id == "": return tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, "request_id": request_id } url = self.host + "/api/capjust" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def charge(self, card_id, card_key): """充值接口""" tm = str(int(time.time())) sign = self.calc_sign(self.pd_id, self.pd_key, tm) card_sign = self.calc_card_sign(card_id, card_key, tm, self.pd_key) param = { "user_id": self.pd_id, "timestamp": tm, "sign": sign, 'cardid': card_id, 'csign': card_sign } url = self.host + "/api/charge" rsp = self.s.post(url, param, timeout=self.timeout).json() return rsp def test_ft(): """ 测试 {u'RspData': u'{"cust_val":1010}', u'RetCode': u'0', u'ErrMsg': u'succ', u'RequestId': u''} {u'RspData': u'{"result": "8x4g"}', u'RetCode': u'0', u'ErrMsg': u'', u'RequestId': u'2019052615005042ad98b2000518d493'} :return: """ pd_id = "xxxxxx" pd_key = "xxxxxx" app_id = "312451" app_key = "5YuN+6isLserKBZti4hoaI6UR2N5UT2j" predict_type = "30400" api = FTClient(pd_id, 
pd_key, app_id, app_key) # 查询余额接口 res = api.query_balance() print(res) file_name = "img.jpg" rsp = api.predict_from_file(predict_type, file_name) print(rsp) if __name__ == "__main__": test_ft() ================================================ FILE: libs/optical_modem.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: optical_modem.py @time: 2018-05-27 00:24 """ import base64 import json import time import re import random import hashlib import requests from scrapy.selector import Selector class OpticalModemChinaNet(object): """ 电信光猫 """ s = requests.session() def __init__(self, host='192.168.1.1', username='useradmin', password='crcun'): self.host = host self.username = username self.password = password self.url_login = 'http://%s/login.cgi' % self.host self.url_get_wan_wifi_status = 'http://%s/gatewayManage.cmd' % self.host self.url_reboot = 'http://%s/gatewayManage.cmd' % self.host self.timeout = 180 self.net_ip_o = None self.net_ip_n = None @staticmethod def _get_tc(): tc = str('%13d' % (time.time() * 1000)) return tc def login(self): """ 登录 :return: """ params = { 'username': self.username, 'psd': self.password, } res = self.s.get(self.url_login, params=params, timeout=self.timeout) print(res.status_code, res.url) def get_wan_wifi_status(self): """ 获取wifi状态 :return: """ headers = { 'X-Requested-With': 'XMLHttpRequest', } params = { 'timeStamp': self._get_tc(), } json_cfg = { 'RPCMethod': 'Post1', 'ID': '123', 'Parameter': base64.urlsafe_b64encode("{'CmdType':'GET_WAN_WIFI_STATUS'}") } data = "jsonCfg=%s" % json.dumps(json_cfg) res = self.s.post(self.url_get_wan_wifi_status, headers=headers, params=params, data=data, timeout=self.timeout) print(res.status_code, res.url) return_parameter = json.loads(base64.decodestring(res.json().get('return_Parameter', ''))) print(return_parameter) print(return_parameter.get('ipAddr')) wan_ip = return_parameter.get('ipAddr') return wan_ip def reboot(self): """ 重启 :return: """ headers = {'X-Requested-With': 'XMLHttpRequest'} params = { 'timeStamp': self._get_tc(), } json_cfg = { 'RPCMethod': 'Post1', 'ID': '123', 'Parameter': base64.urlsafe_b64encode("{'CmdType':'HG_COMMAND_REBOOT'}") } data = "jsonCfg=%s" % json.dumps(json_cfg) res = self.s.post(self.url_reboot, headers=headers, params=params, data=data, timeout=self.timeout) print(res.status_code, res.url) return_parameter = json.loads(base64.decodestring(res.json().get('return_Parameter', ''))) print(return_parameter) def get_net_ip(self): """ 获取网络IP,这里使用requests不用session,因为重启之后,session会断开 :return: """ url = 'https://ip.cn/' res = requests.get(url, timeout=self.timeout) response = Selector(res) info = response.xpath('//div[@class="well"]//code/text()').extract() ip_info = dict(zip(['ip', 'address'], info)) net_ip = ip_info['ip'] print(net_ip) return net_ip def check_reboot_status(self): reboot_status = self.net_ip_o != self.net_ip_n print(reboot_status) return reboot_status class OpticalModemChinaMobile(object): """ 移动光猫 登录密码表单SHA256加密 """ s = requests.session() pid = 1002 session_token = 0 def __init__(self, host='192.168.1.1', username='user', password='gkw4p3uv'): self.host = host self.username = username self.password = password self.pwd_random = self._get_pwd_random() self.encryption_pwd = self._get_encryption_pwd(self.password, self.pwd_random) self.token = self._get_token() self.url_login = 'http://%s/' % self.host self.timeout = 180 self.net_ip_o = None self.net_ip_n = None @staticmethod def 
_get_pwd_random(): pwd_random = str(int(round(random.random() * 89999999)) + 10000000) return pwd_random @staticmethod def _get_encryption_pwd(pwd, r): encryption_pwd = hashlib.sha256(''.join([pwd, r])).hexdigest() return encryption_pwd def _get_token(self): url = 'http://%s' % self.host res = self.s.get(url) html_body = res.text token_re = re.compile(r'getObj\("Frm_Logintoken"\)\.value = "(\d+)";') token_list = re.findall(token_re, html_body) return int(token_list[0]) if token_list else 0 def _get_pid(self): url = 'http://%s/template.gch' % self.host res = self.s.get(url, timeout=self.timeout) html_body = res.text pid_re = re.compile(r'"getpage\.gch\?pid=(\d+)&nextpage="') pid_list = re.findall(pid_re, html_body) self.pid = int(pid_list[0]) if pid_list else self.pid return self.pid def _get_session_token(self): url = 'http://%s/getpage.gch?pid=%s&nextpage=manager_dev_restart_t.gch' % (self.host, self.pid) res = self.s.get(url, timeout=self.timeout) html_body = res.text session_token_re = re.compile(r'var session_token = "(\d+)";') session_token_list = re.findall(session_token_re, html_body) self.session_token = int(session_token_list[0]) if session_token_list else self.session_token return self.session_token def login(self): """ 登录 :return: """ payload = { 'frashnum': '', 'action': 'login', 'Frm_Logintoken': self.token, 'UserRandomNum': self.pwd_random, 'Username': self.username, 'Password': self.encryption_pwd, } res = self.s.post(self.url_login, data=payload, timeout=self.timeout) return 'mainFrame' in res.text def reboot(self): url = 'http://%s/getpage.gch?pid=%s&nextpage=manager_dev_restart_t.gch' % (self.host, self._get_pid()) payload = { 'IF_ACTION': 'devrestart', 'IF_ERRORSTR': 'SUCC', 'IF_ERRORPARAM': 'SUCC', 'IF_ERRORTYPE': -1, 'flag': 1, '_SESSION_TOKEN': self._get_session_token(), } res = self.s.post(url, data=payload, timeout=self.timeout) return '设备重启需要2~3分钟,请耐心等待。' in res.text def get_net_ip(self): """ 获取网络IP,这里使用requests不用session,因为重启之后,session会断开 :return: """ url = 'https://ip.cn/' res = requests.get(url, timeout=self.timeout) response = Selector(res) info = response.xpath('//div[@class="well"]//code/text()').extract() ip_info = dict(zip(['ip', 'address'], info)) net_ip = ip_info['ip'] print(net_ip) return net_ip def check_reboot_status(self): reboot_status = self.net_ip_o != self.net_ip_n print(reboot_status) return reboot_status def test_china_net(): om_cn = OpticalModemChinaNet() om_cn.net_ip_o = om_cn.get_net_ip() om_cn.login() # 默认用户名、密码 om_cn.reboot() time.sleep(10) c = 3 while 1: if c <= 0: break try: om_cn.net_ip_n = om_cn.get_net_ip() break except Exception as e: c -= 1 print(e) om_cn.check_reboot_status() def test_china_mobile(): om_cm = OpticalModemChinaMobile() om_cm.net_ip_o = om_cm.get_net_ip() om_cm.login() om_cm.reboot() time.sleep(10) c = 3 while 1: if c <= 0: break try: om_cm.net_ip_n = om_cm.get_net_ip() break except Exception as e: c -= 1 print(e) om_cm.check_reboot_status() if __name__ == '__main__': # test_china_net() test_china_mobile() ================================================ FILE: libs/redis_pub_sub.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: redis_pub_sub.py @time: 2018-02-10 15:24 """ import redis class RedisPubSub(object): """ Pub/Sub 队列中存储的数据必须是序列化之后的数据 生产消息: 入队前, 序列化 消费消息: 出队后, 反序列化 """ def __init__(self, name, namespace='pub/sub', redis_client=None, **redis_kwargs): """The default connection parameters are: host='localhost', port=6379, 
db=0""" self.__db = redis_client or redis.Redis(**redis_kwargs) self.key = '%s:%s' % (namespace, name) def pub(self, k, v): """ Pub :param k: :param v: :return: """ ch = '%s:%s' % (self.key, k) self.__db.publish(ch, v) def sub(self, k): """ Sub :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.subscribe(ch) for item in ps.listen(): # {'pattern': None, 'type': 'subscribe', 'channel': 'pub/sub:test:hh', 'data': 1L} yield item if item['type'] == 'message': yield item.get('data') def p_sub(self, k): """ PSub 订阅一个或多个符合给定模式的频道 每个模式以 * 作为匹配符 注意 psubscribe 与 subscribe 区别 :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.psubscribe(ch) for item in ps.listen(): # {'pattern': None, 'type': 'psubscribe', 'channel': 'pub/sub:test:*:hh', 'data': 1L} # yield item if item['type'] == 'pmessage': # {'pattern': 'pub/sub:test:*:hh', 'type': 'pmessage', 'channel': 'pub/sub:test:aa:hh', 'data': '123'} yield item.get('data') def sub_not_loop(self, k): """ Sub 非无限循环,取到结果即退出 :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.subscribe(ch) for item in ps.listen(): if item['type'] == 'message': return item.get('data') def p_sub_not_loop(self, k): """ PSub 非无限循环,取到结果即退出 :param k: :return: """ ps = self.__db.pubsub() ch = '%s:%s' % (self.key, k) ps.psubscribe(ch) for item in ps.listen(): if item['type'] == 'pmessage': return item.get('data') ================================================ FILE: libs/redis_queue.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: redis_queue.py @time: 2018-02-10 15:25 """ import redis class RedisQueue(object): """Simple Queue with Redis Backend""" def __init__(self, name, namespace='queue', redis_client=None, **redis_kwargs): """The default connection parameters are: host='localhost', port=6379, db=0""" self.__db = redis_client or redis.Redis(**redis_kwargs) self.key = '%s:%s' % (namespace, name) def qsize(self): """Return the approximate size of the queue.""" return self.__db.llen(self.key) def empty(self): """Return True if the queue is empty, False otherwise.""" return self.qsize() == 0 def put(self, item): """Put item into the queue.""" self.__db.rpush(self.key, item) def get(self, block=True, timeout=None): """Remove and return an item from the queue. 
If optional args block is true and timeout is None (the default), block if necessary until an item is available.""" if block: # ('queue:test', 'hello world') item = self.__db.blpop(self.key, timeout=timeout) else: # hello world item = self.__db.lpop(self.key) if isinstance(item, tuple): item = item[1] return item def get_nowait(self): """Equivalent to get(False).""" return self.get(False) ================================================ FILE: libs/rk.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: rk.py @time: 2018-02-10 15:25 """ from hashlib import md5 import requests class RKClient(object): def __init__(self, username, password, soft_id, soft_key): self.username = username self.password = md5(password).hexdigest() self.soft_id = soft_id self.soft_key = soft_key self.base_params = { 'username': self.username, 'password': self.password, 'softid': self.soft_id, 'softkey': self.soft_key, } self.headers = { 'Connection': 'Keep-Alive', 'Expect': '100-continue', 'User-Agent': 'ben', } def rk_create(self, im, im_type, timeout=60): """ im: 图片字节 im_type: 题目类型 """ params = { 'typeid': im_type, 'timeout': timeout, } params.update(self.base_params) files = {'image': ('a.jpg', im)} r = requests.post( 'http://api.ruokuai.com/create.json', data=params, files=files, headers=self.headers, timeout=timeout ) return r.json() def rk_report_error(self, im_id): """ im_id:报错题目的ID """ params = { 'id': im_id, } params.update(self.base_params) r = requests.post( 'http://api.ruokuai.com/reporterror.json', data=params, headers=self.headers, timeout=30 ) return r.json() if __name__ == '__main__': rc = RKClient('username', 'password', 'soft_id', 'soft_key') im = open('a.jpg', 'rb').read() print(rc.rk_create(im, 3040)) ================================================ FILE: libs/weed_fs.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: weed_fs.py @time: 2018-02-10 15:25 """ import csv # from urlparse import urlparse # PY2 # from urllib.parse import urlparse # PY3 from future.moves.urllib.parse import urlparse import requests from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT class WeedFSClient(object): request_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' } def __init__(self, weed_fs_url): self.weed_fs_url = weed_fs_url def _get_assign(self): """ 获取分配的资源(url fid) 接口消息 - 正确: {"fid":"1,014e123ade","url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080","count":1} 接口消息 - 错误: {"error":"No free volumes left!"} """ url = '%s/dir/assign' % self.weed_fs_url res = requests.get(url, timeout=REQUESTS_TIME_OUT).json() if 'error' in res: raise Exception(res['error']) return res def _get_locations(self, fid): """ 获取文件服务器列表 {"volumeId":"1","locations":[{"url":"127.0.0.1:8080","publicUrl":"127.0.0.1:8080"}]} """ volume_id = fid.split(',')[0] url = '%s/dir/lookup?volumeId=%s' % (self.weed_fs_url, volume_id) return requests.get(url, timeout=REQUESTS_TIME_OUT).json() def save_file(self, local_file_path=None, remote_file_path=None, file_obj=None): """ 保存本地文件至weed_fs文件系统 {"name":"test.csv","size":425429} """ assign = self._get_assign() url = 'http://%s/%s' % (assign['url'], assign['fid']) if local_file_path: file_obj = 
open(local_file_path, 'rb') elif remote_file_path: headers = {'Host': urlparse(remote_file_path).netloc} # 防反爬, 指定图片 Host headers.update(self.request_headers) res = requests.get(remote_file_path, headers=headers, timeout=REQUESTS_TIME_OUT) if res.status_code == 200: file_obj = res.content else: raise Exception('File does not exist') elif not file_obj: raise Exception('File does not exist') res = requests.post(url, files={'file': file_obj}, timeout=REQUESTS_TIME_OUT) return dict(res.json(), **assign) def get_file_url(self, fid, separator=None): """ 获取文件链接 """ locations = self._get_locations(fid) public_url = locations['locations'][0]['publicUrl'] return 'http://%s/%s' % (public_url, fid.replace(',', separator) if separator else fid) def read_csv(self, fid, encoding=None): """ 逐行读取远程csv文件 :param fid: :param encoding: 'gbk'/'utf-8' :return: """ file_url = self.get_file_url(fid) download = requests.get(file_url, timeout=REQUESTS_TIME_OUT) csv_rows = csv.reader(download.iter_lines(), delimiter=',', quotechar='"') for csv_row in csv_rows: line = [item.decode(encoding, 'ignore') if encoding else item for item in csv_row] yield line ================================================ FILE: logs/index.html ================================================ Title ================================================ FILE: maps/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:58 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: maps/channel.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: channel.py @time: 2018-02-10 18:13 """ channel_name_map = { } ================================================ FILE: maps/platform.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: platform.py @time: 2018-02-10 17:58 """ WEIXIN = 1 WEIBO = 2 TOUTIAO = 3 platform_name_map = { 1: u'微信', 2: u'微博', 3: u'头条', } ================================================ FILE: models/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: models/news.py ================================================ # coding: utf-8 from sqlalchemy import Column, DateTime, Index, Integer, String, text from sqlalchemy.ext.declarative import declarative_base Base = declarative_base() metadata = Base.metadata def to_dict(self): return {c.name: getattr(self, c.name, None) for c in self.__table__.columns} Base.to_dict = to_dict class Channel(Base): __tablename__ = 'channel' id = Column(Integer, primary_key=True) code = Column(String(20), unique=True) name = Column(String(20)) description = Column(String(500), server_default=text("''")) create_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) class FetchResult(Base): __tablename__ = 'fetch_result' __table_args__ = ( Index('idx_platform_author_id', 'platform_id', 
'article_author_id'), Index('idx_platform_article_id', 'platform_id', 'article_id', unique=True) ) id = Column(Integer, primary_key=True) task_id = Column(Integer, nullable=False, index=True) platform_id = Column(Integer, server_default=text("'0'")) platform_name = Column(String(50), server_default=text("''")) channel_id = Column(Integer, server_default=text("'0'")) channel_name = Column(String(50), server_default=text("''")) article_id = Column(String(50), server_default=text("''")) article_url = Column(String(512), server_default=text("''")) article_title = Column(String(100), server_default=text("''")) article_author_id = Column(String(100), server_default=text("''")) article_author_name = Column(String(100), server_default=text("''")) article_tags = Column(String(100), server_default=text("''")) article_abstract = Column(String(500), server_default=text("''")) article_content = Column(String) article_pub_time = Column(DateTime, index=True, server_default=text("'1000-01-01 00:00:00'")) create_time = Column(DateTime, nullable=False, index=True, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, index=True, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) class FetchTask(Base): __tablename__ = 'fetch_task' __table_args__ = ( Index('idx_platform_follow_id', 'platform_id', 'follow_id', unique=True), ) id = Column(Integer, primary_key=True) platform_id = Column(Integer, server_default=text("'0'")) channel_id = Column(Integer, server_default=text("'0'")) follow_id = Column(String(45), server_default=text("''")) follow_name = Column(String(45), server_default=text("''")) avatar_url = Column(String(512), server_default=text("''")) fetch_url = Column(String(512), server_default=text("''")) flag_enabled = Column(Integer, server_default=text("'0'")) description = Column(String(500), server_default=text("''")) create_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) class LogTaskScheduling(Base): __tablename__ = 'log_task_scheduling' id = Column(Integer, primary_key=True) platform_id = Column(Integer, server_default=text("'0'")) platform_name = Column(String(50), server_default=text("''")) spider_name = Column(String(45), server_default=text("''")) task_quantity = Column(Integer, server_default=text("'0'")) create_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP")) update_time = Column(DateTime, nullable=False, server_default=text("CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")) ================================================ FILE: news/__init__.py ================================================ ================================================ FILE: news/items.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html import scrapy class FetchTaskItem(scrapy.Item): """ table_name: fetch_task primary_key: id """ follow_id = scrapy.Field() fetch_url = scrapy.Field() description = scrapy.Field() platform_id = scrapy.Field() channel_id = scrapy.Field() avatar_url = scrapy.Field() flag_enabled = scrapy.Field() follow_name = scrapy.Field() class FetchResultItem(scrapy.Item): """ table_name: fetch_result primary_key: id """ article_title = scrapy.Field() platform_name = scrapy.Field() task_id = scrapy.Field() channel_id = 
scrapy.Field() article_author_name = scrapy.Field() article_content = scrapy.Field() platform_id = scrapy.Field() channel_name = scrapy.Field() article_url = scrapy.Field() article_abstract = scrapy.Field() article_author_id = scrapy.Field() article_tags = scrapy.Field() article_id = scrapy.Field() article_pub_time = scrapy.Field() class ChannelItem(scrapy.Item): """ table_name: channel primary_key: id """ code = scrapy.Field() description = scrapy.Field() name = scrapy.Field() ================================================ FILE: news/middlewares/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: news/middlewares/anti_spider.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from __future__ import unicode_literals import time from scrapy.exceptions import IgnoreRequest from scrapy.exceptions import NotConfigured from tools.cookies import del_cookies from tasks.jobs_weixin import set_anti_spider_task, sub_anti_spider class AntiSpiderMiddleware(object): """ 反爬中间件 配置说明: RETRY_ENABLED 默认: True RETRY_TIMES 默认: 2 RETRY_HTTP_CODES 默认: [500, 502, 503, 504, 400, 408] """ def __init__(self, settings): if not settings.getbool('RETRY_ENABLED'): raise NotConfigured self.max_retry_times = settings.getint('RETRY_TIMES') self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') or 1 @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def process_request(self, request, spider): # 处理微信反爬(反爬机制一, sogou) if spider.name in ['weixin'] and 'antispider' in request.url: # 获取来源链接 redirect_urls = request.meta['redirect_urls'] # 清理失效 cookies cookies_id = request.meta['cookiejar'] del_cookies(spider.name, cookies_id) # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0])) raise IgnoreRequest( 'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0])) def process_response(self, request, response, spider): # 处理微信反爬(反爬机制二, weixin) if spider.name in ['weixin']: title = response.xpath('//title/text()').extract_first(default='').strip() if title == '请输入验证码': # 设置反爬处理任务 msg = { 'url': response.url, 'time': time.strftime('%Y-%m-%d %H:%M:%S') } set_anti_spider_task(spider.name, msg) # 订阅处理结果 anti_spider_result = sub_anti_spider(spider.name) if not anti_spider_result.get('status'): return response # 请求重试 retry_req = request.copy() retry_req.dont_filter = True # 必须设置(禁止重复请求被过滤掉) retry_req.priority = request.priority + self.priority_adjust return retry_req return response ================================================ FILE: news/middlewares/content_type.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html class ContentTypeGb2312Middleware(object): """ 处理不规范的页面(优先级降低至580之后才能生效) 原因: 默认配置的 DOWNLOADER_MIDDLEWARES 包含 MetaRefreshMiddleware 当请求页面存在如 Content-Location 类似的 header 时, 会触发重定向请求 指定 Content-Type 为 gb2312 """ def process_response(self, 
request, response, spider): response.headers['Content-Type'] = 'text/html; charset=gb2312' return response ================================================ FILE: news/middlewares/de_duplication_request.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy.exceptions import IgnoreRequest from tools.duplicate import is_dup_detail class DeDuplicationRequestMiddleware(object): """ 去重 - 请求 (数据结构:集合) """ def process_request(self, request, spider): if not request.url: return None channel_id = request.meta.get('channel_id', 0) # 处理详情页面(忽略列表页面)与pipeline配合 if is_dup_detail(request.url, spider.name, channel_id): raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url)) ================================================ FILE: news/middlewares/httpproxy.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy.exceptions import NotConfigured from tools.proxies import get_proxy, del_proxy class HttpProxyMiddleware(object): """ 代理中间件 """ def __init__(self, settings): if not settings.getbool('RETRY_ENABLED'): raise NotConfigured self.max_retry_times = settings.getint('RETRY_TIMES') self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') or 1 @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def process_request(self, request, spider): # request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT" # 当前请求代理(保证重试过程,代理一致) request_proxy = request.meta.get('proxy') or get_proxy(spider.name) request.meta['proxy'] = request_proxy spider.log(request.meta) def process_exception(self, request, exception, spider): error_proxy = request.meta.get('proxy') if not error_proxy: return None # 重试失败(默认重试2次,共请求3次),删除代理 if request.meta.get('retry_times', 0) >= self.max_retry_times: del_proxy(spider.name, error_proxy) spider.log('%s del proxy: %s, error reason: %s' % (spider.name, error_proxy, exception)) return None ================================================ FILE: news/middlewares/useragent.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # http://doc.scrapy.org/en/latest/topics/spider-middleware.html import random class UserAgentMiddleware(object): """ Randomly rotate user agents based on a list of predefined ones """ def __init__(self, agents): self.agents = agents @classmethod def from_crawler(cls, crawler): return cls(crawler.settings.getlist('USER_AGENTS')) def process_request(self, request, spider): request.headers.setdefault('User-Agent', random.choice(self.agents)) # request.headers.setdefault('User-Agent', self.agents[0]) ================================================ FILE: news/middlewares.py ================================================ # -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals class NewsSpiderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. 
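# NOTE: this class and NewsDownloaderMiddleware further below are the default stubs
# generated by `scrapy startproject`; they do not appear to be enabled anywhere
# (SPIDER_MIDDLEWARES / DOWNLOADER_MIDDLEWARES stay commented out in news/settings.py).
# The middlewares the spiders actually use live in the news/middlewares/ package and
# are wired up per spider via custom_settings.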
@classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): # Called for each response that goes through the spider # middleware and into the spider. # Should return None or raise an exception. return None def process_spider_output(self, response, result, spider): # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, dict or Item objects. for i in result: yield i def process_spider_exception(self, response, exception, spider): # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. # Should return either None or an iterable of Response, dict # or Item objects. pass def process_start_requests(self, start_requests, spider): # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). for r in start_requests: yield r def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) class NewsDownloaderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. 
# Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) ================================================ FILE: news/pipelines/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: news/pipelines/de_duplication_request.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from news.items import FetchResultItem from tools.duplicate import is_dup_detail, add_dup_detail class DeDuplicationRequestPipeline(object): """ 去重 - 请求 注意: 1、置于数据存储 pipeline 之后 2、与 DeDuplicationRequestMiddleware 配合使用 """ def process_item(self, item, spider): spider_name = spider.name if isinstance(item, FetchResultItem): # 详细页url 加入去重集合 if not is_dup_detail(item['article_url'], spider_name, item['channel_id']): add_dup_detail(item['article_url'], spider_name, item['channel_id']) return item ================================================ FILE: news/pipelines/de_duplication_store_mysql.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from models.news import FetchResult from news.items import FetchResultItem from apps.client_db import db_session_mysql from tools.weixin import get_finger from maps.platform import WEIXIN, WEIBO from scrapy.exceptions import DropItem class DeDuplicationStoreMysqlPipeline(object): """ 去重 - 入库 注意: 1、置于数据存储 pipeline 之前 """ def process_item(self, item, spider): session = db_session_mysql() try: if isinstance(item, FetchResultItem): if spider.name == 'weixin': # 标题(微信只能通过标题去重, 因为链接带过期签名) article_id_count = session.query(FetchResult) \ .filter(FetchResult.platform_id == WEIXIN, FetchResult.article_id == get_finger(item['article_title'])) \ .count() if article_id_count: raise DropItem( '%s Has been duplication of article_title: %s' % (spider.name, item['article_title'])) if spider.name == 'weibo': # 详细链接(微博可以直接通过链接去重) article_url_count = session.query(FetchResult) \ .filter(FetchResult.platform_id == WEIBO, FetchResult.article_id == get_finger(item['article_url'])) \ .count() if article_url_count: raise DropItem( '%s Has been duplication of article_url: %s' % (spider.name, item['article_url'])) return item except Exception as e: raise e finally: session.close() ================================================ FILE: news/pipelines/exporter_csv.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from scrapy import signals from scrapy.exporters import CsvItemExporter class CsvExportPipeline(object): def __init__(self): self.files = {} self.exporter = None @classmethod def from_crawler(cls, crawler): pipeline = cls() 
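# Tie the exporter's lifecycle to the spider: spider_opened creates the
# per-spider <name>_items.csv file and starts exporting, spider_closed
# finishes exporting and closes the file.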
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file_csv = open('%s_items.csv' % spider.name, 'w+b') self.files[spider] = file_csv self.exporter = CsvItemExporter(file_csv) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file_csv = self.files.pop(spider) file_csv.close() def process_item(self, item, spider): self.exporter.export_item(item) return item ================================================ FILE: news/pipelines/img_remote_to_local_fs.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html import re # from urlparse import urljoin # PY2 # from urllib.parse import urljoin # PY3 from future.moves.urllib.parse import urljoin from news.items import FetchResultItem from libs.weed_fs import WeedFSClient from config import current_config WEED_FS_URL = current_config.WEED_FS_URL weed_fs_client = WeedFSClient(WEED_FS_URL) def remote_to_local(remote_file_path): """ 保存远程图片文件 :param remote_file_path: :return: """ remote_file_save_result = weed_fs_client.save_file(remote_file_path=remote_file_path) local_file_url = weed_fs_client.get_file_url(remote_file_save_result['fid'], '/') return local_file_url def add_src(html_body, base=''): """ 添加图片文件链接(1、添加真实链接;2、替换本地链接) :param html_body: :param base: :return: """ rule = r'data-src="(.*?)"' img_data_src_list = re.compile(rule, re.I).findall(html_body) for img_src in img_data_src_list: # 处理相对链接 if base: new_img_src = urljoin(base, img_src) if new_img_src.startswith('/'): continue # 远程转本地 local_img_src = remote_to_local(new_img_src) img_dict = { 'img_src': img_src, 'local_img_src': local_img_src } html_body = html_body.replace(img_src, '%(img_src)s" src="%(local_img_src)s' % img_dict) return html_body def replace_src(html_body, base=''): """ 替换图片文件链接(替换本地链接) :param html_body: :param base: :return: """ rule = r'src="(.*?)"' img_data_src_list = re.compile(rule, re.I).findall(html_body) for img_src in img_data_src_list: # 处理//,补充协议 if img_src.startswith('//'): img_src = 'http:%s' % img_src # 处理相对链接 if base: new_img_src = urljoin(base, img_src) if new_img_src.startswith('/'): continue # 远程转本地 local_img_src = remote_to_local(new_img_src) img_dict = { 'img_src': img_src, 'local_img_src': local_img_src } html_body = html_body.replace(img_src, '%(local_img_src)s" data-src="%(img_src)s' % img_dict) return html_body class ImgRemoteToLocalFSPipeline(object): """ 图片 远程链接 转 本地文件系统链接 注意: 1、置于数据存储 pipeline 之前 """ def process_item(self, item, spider): spider_name = spider.name # 读取抓取内容 if isinstance(item, FetchResultItem): if spider_name in ['weixin']: html_body = item['article_content'] base = item['article_url'] item['article_content'] = add_src(html_body, base) if spider_name in ['weibo']: html_body = item['article_content'] base = item['article_url'] item['article_content'] = replace_src(html_body, base) if spider_name in ['toutiao', 'toutiao_m']: html_body = item['article_content'] base = item['article_url'] item['article_content'] = replace_src(html_body, base) return item ================================================ FILE: news/pipelines/store_mysql.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your 
pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html from models.news import FetchResult from news.items import FetchResultItem from apps.client_db import db_session_mysql class StoreMysqlPipeline(object): """ 基于 MySQL 的存储 """ def process_item(self, item, spider): session = db_session_mysql() try: if isinstance(item, FetchResultItem): fetch_result = FetchResult(**item) # 数据入库 session.add(fetch_result) session.flush() # session.commit() return item except Exception as e: raise e finally: session.close() ================================================ FILE: news/pipelines.py ================================================ # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html class NewsPipeline(object): def process_item(self, item, spider): return item ================================================ FILE: news/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for news project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'news' SPIDER_MODULES = ['news.spiders'] NEWSPIDER_MODULE = 'news.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'news (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs DOWNLOAD_DELAY = 2 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) COOKIES_ENABLED = True COOKIES_DEBUG = True # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'news.middlewares.NewsSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'news.middlewares.NewsDownloaderMiddleware': 543, #} # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'news.pipelines.NewsPipeline': 300, #} ITEM_PIPELINES = { 'news.pipelines.store_mysql.StoreMysqlPipeline': 400, } # Enable and configure the AutoThrottle extension (disabled by default) # See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' # USER_AGENTS USER_AGENTS = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52" ] ================================================ FILE: news/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. 
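The spiders defined in this package are normally started with `scrapy crawl <name>` (as in the README) or under supervisord via the etc/*.ini program sections. As a minimal sketch — assuming the script is run from the project root so that scrapy.cfg and news/settings.py are picked up — the same spiders can also be launched programmatically:

```python
# Minimal sketch: run the "ip" spider in-process instead of `scrapy crawl ip`.
# Assumes the working directory is the project root (where scrapy.cfg lives).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads news/settings.py
process.crawl('ip')   # spider name, resolved via SPIDER_MODULES
process.start()       # blocks until the crawl (and the Twisted reactor) finishes
```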
================================================ FILE: news/spiders/ip.py ================================================ # -*- coding: utf-8 -*- import scrapy class IpSpider(scrapy.Spider): """ IP代理测试 蜘蛛 重试3次,每次超时10秒 使用: 进入项目目录 $ scrapy crawl ip """ name = "ip" allowed_domains = ["ip.cn"] start_urls = ( 'https://ip.cn', ) custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, # 代理(cookie需要与代理IP关联) }, ITEM_PIPELINES={ 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, }, DOWNLOAD_TIMEOUT=10 ) def parse(self, response): info = response.xpath('//div[@class="well"]//code/text()').extract() ip_info = dict(zip(['ip', 'address'], info)) yield ip_info ================================================ FILE: news/spiders/toutiao_m.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import unicode_literals import json import time import scrapy from apps.client_db import get_item from maps.channel import channel_name_map from maps.platform import platform_name_map from models.news import FetchTask from news.items import FetchResultItem from tools.date_time import time_local_to_utc from tools.scrapy_tasks import pop_task from tools.toutiao_m import get_as_cp, ParseJsTt, parse_toutiao_js_body from tools.url import get_update_url class ToutiaoMSpider(scrapy.Spider): """ 头条蜘蛛 """ name = 'toutiao_m' allowed_domains = ['toutiao.com', 'snssdk.com'] custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ 'news.middlewares.de_duplication_request.DeDuplicationRequestMiddleware': 140, # 去重请求 # 'news.middlewares.anti_spider.AntiSpiderMiddleware': 160, # 反爬处理 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, # 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, }, ITEM_PIPELINES={ 'news.pipelines.de_duplication_store_mysql.DeDuplicationStoreMysqlPipeline': 400, # 去重存储 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, 'news.pipelines.de_duplication_request.DeDuplicationRequestPipeline': 500, # 去重请求 }, DOWNLOAD_DELAY=0.5 ) # start_urls = ['http://toutiao.com/'] # start_urls = ['https://www.toutiao.com/ch/news_finance/'] def start_requests(self): """ 入口准备 :return: """ url_params = { 'version_code': '6.4.2', 'version_name': '', 'device_platform': 'iphone', 'tt_from': 'weixin', 'utm_source': 'weixin', 'utm_medium': 'toutiao_ios', 'utm_campaign': 'client_share', 'wxshare_count': '1', } task_id = pop_task(self.name) if not task_id: print('%s task is empty' % self.name) return print('%s task id: %s' % (self.name, task_id)) task_item 
= get_item(FetchTask, task_id) fetch_url = 'http://m.toutiao.com/profile/%s/' % task_item.follow_id url_profile = get_update_url(fetch_url, url_params) meta = { 'task_id': task_item.id, 'platform_id': task_item.platform_id, 'channel_id': task_item.channel_id, 'follow_id': task_item.follow_id, 'follow_name': task_item.follow_name, } yield scrapy.Request(url=url_profile, callback=self.get_profile, meta=meta) def get_profile(self, response): userid = response.xpath('//button[@itemid="topsharebtn"]/@data-userid').extract_first(default='') mediaid = response.xpath('//button[@itemid="topsharebtn"]/@data-mediaid').extract_first(default='') meta = dict(response.meta, userid=userid, mediaid=mediaid) url = 'http://open.snssdk.com/jssdk_signature/' url_params = { 'appid': 'wxe8b89be1715734a6', 'noncestr': 'Wm3WZYTPz0wzccnW', 'timestamp': '%13d' % (time.time() * 1000), 'callback': 'jsonp2', } url_jssdk_signature = get_update_url(url, url_params) yield scrapy.Request(url=url_jssdk_signature, callback=self.jssdk_signature, meta=meta) def jssdk_signature(self, response): AS, CP = get_as_cp() jsonp_index = 3 url = 'https://www.toutiao.com/pgc/ma/' url_params = { 'page_type': 1, 'max_behot_time': '', 'uid': response.meta['userid'], 'media_id': response.meta['mediaid'], 'output': 'json', 'is_json': 1, 'count': 20, 'from': 'user_profile_app', 'version': 2, 'as': AS, 'cp': CP, 'callback': 'jsonp%d' % jsonp_index, } url_article_list = get_update_url(url, url_params) meta = dict(response.meta, jsonp_index=jsonp_index) yield scrapy.Request(url=url_article_list, callback=self.parse_article_list, meta=meta) def parse_article_list(self, response): """ 文章列表 :param response: :return: """ body = response.body_as_unicode() jsonp_text = 'jsonp%d' % response.meta.get('jsonp_index', 0) result = json.loads(body.lstrip('%s(' % jsonp_text).rstrip(')')) # 翻页 TODO FIX has_more = result.get('has_more') if has_more: max_behot_time = result['next']['max_behot_time'] AS, CP = get_as_cp() jsonp_index = response.meta.get('jsonp_index', 0) + 1 url_params_next = { 'max_behot_time': max_behot_time, 'as': AS, 'cp': CP, 'callback': 'jsonp%d' % jsonp_index, } url_article_list_next = get_update_url(response.url, url_params_next) meta = dict(response.meta, jsonp_index=jsonp_index) yield scrapy.Request(url=url_article_list_next, callback=self.parse_article_list, meta=meta) # 详情 data_list = result.get('data', []) for data_item in data_list: detail_url = data_item.get('source_url') meta = dict(response.meta, detail_url=detail_url) yield scrapy.Request(url=detail_url, callback=self.parse_article_detail, meta=meta) def parse_article_detail(self, response): """ 文章详情 :param response: :return: """ toutiao_body = response.body_as_unicode() js_body = parse_toutiao_js_body(toutiao_body, response.meta['detail_url']) if not js_body: return pj = ParseJsTt(js_body=js_body) article_id = pj.parse_js_item_id() article_title = pj.parse_js_title() article_abstract = pj.parse_js_abstract() article_content = pj.parse_js_content() article_pub_time = pj.parse_js_pub_time() article_tags = pj.parse_js_tags() fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = article_id 
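# The assignments below fill in the remaining fetch_result columns declared in
# models/news.py (FetchResult); article_pub_time is normalised to UTC via
# tools.date_time.time_local_to_utc before being formatted for storage.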
fetch_result_item['article_title'] = article_title fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S') fetch_result_item['article_url'] = response.url or response.meta['detail_url'] fetch_result_item['article_tags'] = article_tags fetch_result_item['article_abstract'] = article_abstract fetch_result_item['article_content'] = article_content yield fetch_result_item ================================================ FILE: news/spiders/weibo.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import unicode_literals import json import re import time from datetime import datetime import scrapy import six from lxml.html import fromstring, tostring from apps.client_db import get_item from maps.channel import channel_name_map from maps.platform import platform_name_map from models.news import FetchTask from news.items import FetchResultItem from tools.date_time import time_local_to_utc from tools.scrapy_tasks import pop_task from tools.url import get_update_url, get_request_finger from tools.weibo import get_su, get_login_data class WeiboSpider(scrapy.Spider): """ 微博蜘蛛 """ name = 'weibo' allowed_domains = ['weibo.com', 'weibo.cn', 'sina.com.cn', 'sina.cn'] custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ 'news.middlewares.de_duplication_request.DeDuplicationRequestMiddleware': 140, # 去重请求 # 'news.middlewares.anti_spider.AntiSpiderMiddleware': 160, # 反爬处理 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, # 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, }, ITEM_PIPELINES={ 'news.pipelines.de_duplication_store_mysql.DeDuplicationStoreMysqlPipeline': 400, # 去重存储 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, 'news.pipelines.de_duplication_request.DeDuplicationRequestPipeline': 500, # 去重请求 }, DOWNLOAD_DELAY=0.5 ) passport_weibo_login_url = 'https://passport.weibo.cn/signin/login' start_urls = ['http://weibo.cn/'] uid = 0 login_form_data = { 'username': '', 'password': '', 'savestate': '1', 'r': '', 'ec': '0', 'pagerefer': '', 'entry': 'mweibo', 'wentry': '', 'loginfrom': '', 'client_id': '', 'code': '', 'qq': '', 'mainpageflag': '1', 'hff': '', 'hfp': '' } def parse(self, response): return self.passport_weibo_login() def passport_weibo_login(self): yield scrapy.Request(url=self.passport_weibo_login_url, callback=self.login_sina_sso_prelogin) def login_sina_sso_prelogin(self, response): login_data = get_login_data() self.login_form_data.update(login_data) login_sina_sso_prelogin_url = 'https://login.sina.com.cn/sso/prelogin.php' query_payload = { 'checkpin': '1', 'entry': 'mweibo', 'su': get_su(login_data.get('username', '')), 'callback': 'jsonpcallback%13d' % (time.time()*1000), } request_url = get_update_url(login_sina_sso_prelogin_url, query_payload) yield scrapy.Request(url=request_url, callback=self.passport_weibo_sso_login) def passport_weibo_sso_login(self, 
response): passport_weibo_sso_login_url = 'https://passport.weibo.cn/sso/login' yield scrapy.FormRequest( url=passport_weibo_sso_login_url, formdata=self.login_form_data, callback=self.after_login ) def after_login(self, response): data = { 'savestate': '1', 'callback': 'jsonpcallback%13d' % (time.time()*1000), } res = response.body_as_unicode() info = json.loads(res) crossdomainlist = info['data']['crossdomainlist'] self.uid = info['data']['uid'] url_weibo_com = get_update_url(crossdomainlist['weibo.com'], data) url_sina_com_cn = get_update_url(crossdomainlist['sina.com.cn'], data) url_weibo_cn = get_update_url(crossdomainlist['weibo.cn'], data) url_items = { 'url_weibo_com': url_weibo_com, 'url_sina_com_cn': url_sina_com_cn, 'url_weibo_cn': url_weibo_cn, } meta = dict(response.meta, **url_items) # 跨域处理 weibo.com yield scrapy.Request(url=url_weibo_com, callback=self.crossdomain_weibo_com, meta=meta) def crossdomain_weibo_com(self, response): """ 跨域处理 weibo.com :param response: :return: """ # 跨域处理 sina.com.cn url_sina_com_cn = response.meta['url_sina_com_cn'] yield scrapy.Request(url=url_sina_com_cn, callback=self.crossdomain_sina_com_cn, meta=response.meta) def crossdomain_sina_com_cn(self, response): """ 跨域处理 sina.com.cn :param response: :return: """ # 跨域处理 weibo.cn url_weibo_cn = response.meta['url_weibo_cn'] yield scrapy.Request(url=url_weibo_cn, callback=self.crossdomain_weibo_cn, meta=response.meta) def crossdomain_weibo_cn(self, response): """ 跨域处理 weibo.cn :param response: :return: """ # 获取登录状态 weibo.cn yield scrapy.Request(url='https://weibo.cn/', callback=self.weibo_cn_index) def weibo_cn_index(self, response): """ 获取登录状态 :param response: :return: """ print(response.url) title = response.xpath('//title/text()').extract_first() if title == '我的首页': print('登录成功') # follow_url = 'https://weibo.cn/%s/follow' % self.uid # yield scrapy.Request(url=follow_url, callback=self.parse_follow_list) # 获取登录状态 weibo.com yield scrapy.Request(url='https://weibo.com/', callback=self.weibo_com_index) else: print('登录失败') def weibo_com_index(self, response): """ 获取登录状态 :param response: :return: """ print(response.url) title = response.xpath('//title/text()').extract_first() if '我的首页' in title: print('登录成功') # follow_url = 'https://weibo.cn/%s/follow' % self.uid # yield scrapy.Request(url=follow_url, callback=self.parse_follow_list) return self.get_article_task() else: print('登录失败') def parse_follow_list(self, response): """ 已关注列表 """ print(response.url) # 进入关注用户页面 follows = response.xpath('//table//tr/td/a[1]/@href').extract() for follow in follows: yield scrapy.Request(url=follow, callback=self.follow_home_list) # 关注列表翻页 next_url = response.xpath('//div[@id="pagelist"]//a[contains(text(), "下页")]/@href').extract_first(default='') next_url = response.urljoin(next_url) if next_url == response.url: print('当前条件列表页最后一页:%s' % response.url) else: yield scrapy.Request(url=next_url, callback=self.parse_follow_list) def follow_home_list(self, response): """ 已关注用户首页列表 """ contents = response.xpath('//div[@class="c"]//span[@class="ctt"]/text()').extract() for content in contents: print(content) def get_article_task(self): """ 文章抓取入口 :return: """ task_id = pop_task(self.name) if not task_id: print('%s task is empty' % self.name) return print('%s task id: %s' % (self.name, task_id)) task_item = get_item(FetchTask, task_id) article_id = task_item.follow_id article_list_url = 'https://weibo.com/p/%s/wenzhang' % article_id meta = { 'task_id': task_item.id, 'platform_id': task_item.platform_id, 'channel_id': 
task_item.channel_id, 'follow_id': task_item.follow_id, 'follow_name': task_item.follow_name, } yield scrapy.Request(url=article_list_url, callback=self.parse_article_list, meta=meta) @staticmethod def replace_all(input_html, replace_dict): """ 用字典实现批量替换 """ for k, v in six.iteritems(replace_dict): input_html = input_html.replace(k, v) return input_html def parse_article_list(self, response): """ 文章列表解析 没有翻页特征 下一页<\/span> 解析链接 href=\"\/p\/1005051627825392\/wenzhang?pids=Pl_Core_ArticleList__61&cfs=600&Pl_Core_ArticleList__61_filter=&Pl_Core_ArticleList__61_page=6#Pl_Core_ArticleList__61\" """ print('task_url: %s' % response.url) # 页面解析(微博是JS动态数据, 无法直接解析页面) article_list_body = response.body_as_unicode() article_list_rule = r'' article_list_re_parse = re.compile(article_list_rule, re.S).findall(article_list_body) if not article_list_re_parse: return article_list_html = ''.join(article_list_re_parse) # 转义字符处理 article_list_html = article_list_html.replace('\\r', '') article_list_html = article_list_html.replace('\\t', '') article_list_html = article_list_html.replace('\\n', '') article_list_html = article_list_html.replace('\\"', '"') article_list_html = article_list_html.replace('\\/', '/') article_list_doc = fromstring(article_list_html) article_list_doc_parse = article_list_doc.xpath('//div[@class="text_box"]') for article_item in article_list_doc_parse: article_detail_url = article_item.xpath('./div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/@href') article_detail_title = article_item.xpath('./div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/text()') article_detail_abstract = article_item.xpath('./div[@class="text"]/a[@class="S_txt1"]/text()') if not (article_detail_url and article_detail_title): continue article_detail_url = article_detail_url[0].strip() article_detail_url = response.urljoin(article_detail_url) article_detail_title = article_detail_title[0].strip() article_detail_abstract = article_detail_abstract[0].strip() if article_detail_abstract else '' meta_article_item = { 'article_url': article_detail_url, 'article_title': article_detail_title, 'article_abstract': article_detail_abstract, 'article_id': get_request_finger(article_detail_url), } meta = dict(response.meta, **meta_article_item) # 两种不同类型页面 if '/ttarticle/p/show?id=' in article_detail_url: yield scrapy.Request(url=article_detail_url, callback=self.parse_article_detail_html, meta=meta) else: yield scrapy.Request(url=article_detail_url, callback=self.parse_article_detail_js, meta=meta) # 翻页处理 next_url_parse = article_list_doc.xpath('//a[@class="page next S_txt1 S_line1"]/@href') if not next_url_parse: print('当前条件列表页最后一页:%s' % response.url) else: next_url = next_url_parse[0] next_url = response.urljoin(next_url) print(next_url) yield scrapy.Request(url=next_url, callback=self.parse_article_list, meta=response.meta) def parse_article_detail_html(self, response): """ 文章详情解析 html 版 :param response: :return: """ article_title = response.xpath('//div[@class="title"]/text()').extract_first(default='') article_pub_time = response.xpath('//span[@class="time"]/text()').extract_first(default='') article_content = response.xpath('//div[@class="WB_editor_iframe"]').extract_first(default='') fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] 
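# Editorial note (added comment, not in the original source): the Weibo article list and the
# JS-rendered detail pages arrive embedded in JavaScript string literals, so the backslash
# escapes must be undone before lxml's fromstring() can parse the markup, e.g.
#   'href=\"\/p\/1005051627825392\/wenzhang?...'  ->  'href="/p/1005051627825392/wenzhang?...'
# The chained .replace() calls in parse_article_list above (and again in
# parse_article_detail_js below) do exactly this; the replace_all() helper expresses the same
# substitutions as a single dict.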
fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = response.meta['article_id'] fetch_result_item['article_title'] = article_title fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = article_pub_time fetch_result_item['article_url'] = response.url fetch_result_item['article_tags'] = '' fetch_result_item['article_abstract'] = response.meta['article_abstract'] fetch_result_item['article_content'] = article_content yield fetch_result_item @staticmethod def trans_time(time_str): """ 时间转换 :param time_str: :return: """ time_rule = r'(\d+)年(\d+)月(\d+)日 (\d+):(\d+)' time_parse = re.compile(time_rule, re.S).findall(time_str) if not time_parse: return time.strftime('%Y-%m-%d %H:%M:%S') return datetime(*[int(i) for i in time_parse[0]]).strftime('%Y-%m-%d %H:%M:%S') def parse_article_detail_js(self, response): """ 文章详情解析 js 版 :param response: :return: """ article_detail_body = response.body_as_unicode() article_detail_rule = r'' article_detail_re_parse = re.compile(article_detail_rule, re.S).findall(article_detail_body) if not article_detail_re_parse: return article_detail_html = ''.join(article_detail_re_parse) # 转义字符处理 article_detail_html = article_detail_html.replace('\\r', '') article_detail_html = article_detail_html.replace('\\t', '') article_detail_html = article_detail_html.replace('\\n', '') article_detail_html = article_detail_html.replace('\\"', '"') article_detail_html = article_detail_html.replace('\\/', '/') article_detail_doc = fromstring(article_detail_html) article_title_parse = article_detail_doc.xpath('//h1[@class="title"]/text()') article_title = article_title_parse[0].strip() if article_title_parse else '' article_pub_time_parse = article_detail_doc.xpath('//span[@class="time"]/text()') article_pub_time = self.trans_time(article_pub_time_parse[0].strip()) if article_pub_time_parse else time.strftime('%Y-%m-%d %H:%M:%S') article_content_parse = article_detail_doc.xpath('//div[@class="WBA_content"]') article_content = tostring(article_content_parse[0], encoding='unicode').strip() if article_content_parse else '' fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = response.meta['article_id'] fetch_result_item['article_title'] = article_title fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S') fetch_result_item['article_url'] = response.url fetch_result_item['article_tags'] = '' fetch_result_item['article_abstract'] = response.meta['article_abstract'] fetch_result_item['article_content'] = article_content yield fetch_result_item ================================================ FILE: news/spiders/weixin.py ================================================ # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import unicode_literals import scrapy from apps.client_db import get_item from maps.channel 
import channel_name_map from maps.platform import platform_name_map from models.news import FetchTask from news.items import FetchResultItem from tools.cookies import get_cookies from tools.date_time import time_local_to_utc from tools.scrapy_tasks import pop_task from tools.url import get_update_url from tools.weixin import parse_weixin_js_body, ParseJsWc, check_article_title_duplicate class WeixinSpider(scrapy.Spider): """ 微信公众号蜘蛛 因微信公众号详情链接是带有效期签名的动态链接, 故无法使用请求去重中间件 """ name = 'weixin' allowed_domains = ['mp.weixin.qq.com', 'weixin.qq.com', 'qq.com', 'sogou.com'] custom_settings = dict( COOKIES_ENABLED=True, DEFAULT_REQUEST_HEADERS={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' }, USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0', DOWNLOADER_MIDDLEWARES={ # 'news.middlewares.de_duplication_request.DeDuplicationRequestMiddleware': 140, # 去重请求 'news.middlewares.anti_spider.AntiSpiderMiddleware': 160, # 反爬处理 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 'news.middlewares.useragent.UserAgentMiddleware': 500, # 'news.middlewares.httpproxy.HttpProxyMiddleware': 720, # 代理(cookie需要与代理IP关联) }, ITEM_PIPELINES={ 'news.pipelines.de_duplication_store_mysql.DeDuplicationStoreMysqlPipeline': 400, # 去重存储 # 'news.pipelines.img_remote_to_local_fs.ImgRemoteToLocalFSPipeline': 440, 'news.pipelines.store_mysql.StoreMysqlPipeline': 450, # 'news.pipelines.de_duplication_request.DeDuplicationRequestPipeline': 500, # 去重请求 }, DOWNLOAD_DELAY=0.5 ) def start_requests(self): """ 入口准备 :return: """ boot_url = 'http://weixin.sogou.com/weixin' task_id = pop_task(self.name) if not task_id: print('%s task is empty' % self.name) return print('%s task id: %s' % (self.name, task_id)) task_item = get_item(FetchTask, task_id) cookies_id, cookies = get_cookies(self.name) url_params = { 'type': 1, # 'query': task_item.follow_id, 'query': task_item.follow_name.encode('utf-8'), } url_profile = get_update_url(boot_url, url_params) meta = { 'cookiejar': cookies_id, 'task_id': task_item.id, 'platform_id': task_item.platform_id, 'channel_id': task_item.channel_id, 'follow_id': task_item.follow_id, 'follow_name': task_item.follow_name, } yield scrapy.Request(url=url_profile, cookies=cookies, callback=self.parse_account_search_list, meta=meta) def parse_article_search_list(self, response): """ 解析微信文章 搜索列表页面 (废弃) :param response: :return: """ news_links = response.xpath('//div[@class="txt-box"]/h3/a/@href').extract() for new_link in news_links: yield scrapy.Request(url=new_link, callback=self.parse_detail) def parse_account_search_list(self, response): """ 解析公众账号 搜索列表页面 :param response: :return: """ account_link = response.xpath('//div[@class="txt-box"]//a/@href').extract_first() if account_link: yield scrapy.Request(url=account_link, callback=self.parse_account_article_list, meta=response.meta) def parse_account_article_list(self, response): """ 解析公众账号 文章列表页面 :param response: :return: """ article_list_body = response.body_as_unicode() js_body = parse_weixin_js_body(article_list_body, response.url) if not js_body: return pj = ParseJsWc(js_body=js_body) article_list = pj.parse_js_msg_list() for article_item in article_list: # 标题去重 if check_article_title_duplicate(article_item['article_title']): continue meta = dict(response.meta, **article_item) yield 
scrapy.Request(url=article_item['article_url'], callback=self.parse_detail, meta=meta) def parse_detail(self, response): """ 详细页面 :param response: :return: """ article_content = ''.join([i.strip() for i in response.xpath('//div[@id="js_content"]/*').extract()]) # 原创内容处理(处理内容为空) if not article_content: share_source_url = response.xpath('//a[@id="js_share_source"]/@href').extract_first() yield scrapy.Request(url=share_source_url, callback=self.parse_detail, meta=response.meta) return fetch_result_item = FetchResultItem() fetch_result_item['task_id'] = response.meta['task_id'] fetch_result_item['platform_id'] = response.meta['platform_id'] fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '') fetch_result_item['channel_id'] = response.meta['channel_id'] fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '') fetch_result_item['article_id'] = response.meta['article_id'] fetch_result_item['article_title'] = response.meta['article_title'] fetch_result_item['article_author_id'] = response.meta['follow_id'] fetch_result_item['article_author_name'] = response.meta['follow_name'] fetch_result_item['article_pub_time'] = time_local_to_utc(response.meta['article_pub_time']).strftime('%Y-%m-%d %H:%M:%S') fetch_result_item['article_url'] = response.meta['article_url'] fetch_result_item['article_tags'] = '' fetch_result_item['article_abstract'] = response.meta['article_abstract'] fetch_result_item['article_content'] = article_content yield fetch_result_item ================================================ FILE: requirements-py2.txt ================================================ asn1crypto==0.24.0 attrs==19.1.0 Automat==0.7.0 certifi==2019.3.9 cffi==1.12.3 chardet==3.0.4 constantly==15.1.0 cryptography==2.6.1 cssselect==1.0.3 enum34==1.1.6 functools32==3.2.3.post2 future==0.17.1 hyperlink==19.0.0 idna==2.8 incremental==17.5.0 inflect==2.1.0 ipaddress==1.0.22 lxml==4.3.3 mysqlclient==1.4.2.post1 parsel==1.5.1 Pillow==6.0.0 psutil==5.6.2 pyasn1==0.4.5 pyasn1-modules==0.2.5 pycparser==2.19 PyDispatcher==2.0.5 PyExecJS==1.5.1 PyHamcrest==1.9.0 pyOpenSSL==19.0.0 queuelib==1.5.0 redis==3.2.1 requests==2.22.0 schedule==0.6.0 Scrapy==1.6.0 service-identity==18.1.0 six==1.12.0 sqlacodegen==1.1.6 SQLAlchemy==1.3.3 Twisted==19.2.0 urllib3==1.25.3 w3lib==1.20.0 zope.interface==4.6.0 ================================================ FILE: requirements-py3.txt ================================================ asn1crypto==0.24.0 attrs==19.1.0 Automat==0.7.0 certifi==2019.3.9 cffi==1.12.3 chardet==3.0.4 constantly==15.1.0 cryptography==2.6.1 cssselect==1.0.3 future==0.17.1 hyperlink==19.0.0 idna==2.8 incremental==17.5.0 inflect==2.1.0 lxml==4.3.3 mysqlclient==1.4.2.post1 parsel==1.5.1 Pillow==6.0.0 psutil==5.6.2 pyasn1==0.4.5 pyasn1-modules==0.2.5 pycparser==2.19 PyDispatcher==2.0.5 PyExecJS==1.5.1 PyHamcrest==1.9.0 pyOpenSSL==19.0.0 queuelib==1.5.0 redis==3.2.1 requests==2.22.0 schedule==0.6.0 Scrapy==1.6.0 service-identity==18.1.0 six==1.12.0 sqlacodegen==1.1.6 SQLAlchemy==1.3.3 Twisted==19.2.0 urllib3==1.25.3 w3lib==1.20.0 zope.interface==4.6.0 ================================================ FILE: scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = news.settings [deploy] #url = http://localhost:6800/ project = news 
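Editor's note (not part of the repository): the spiders above take their work from a per-spider task queue via `pop_task()`, and the scheduler jobs in `tasks/` below refill it via `put_task()` / `get_tasks_count()`. The real helpers live in `tools/scrapy_tasks.py`, which is not reproduced in this excerpt, so the following is only a hedged sketch of the assumed contract; the Redis key name `scrapy:tasks:<spider_name>` and the connection parameters are invented for illustration.

```python
# Hypothetical sketch of the task-queue contract used by the spiders and jobs.
# Assumptions: a shared Redis client and one Redis list per spider (key name invented here).
import redis

redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)


def _task_key(spider_name):
    return 'scrapy:tasks:%s' % spider_name


def put_task(spider_name, task_id):
    # tasks/job_put_tasks.py pushes FetchTask ids for one platform at a time
    redis_client.rpush(_task_key(spider_name), task_id)


def get_tasks_count(spider_name):
    # job_put_tasks() refuses to refill the queue until it has been drained
    return redis_client.llen(_task_key(spider_name))


def pop_task(spider_name):
    # each spider consumes one task id per crawl (start_requests / get_article_task)
    task_id = redis_client.lpop(_task_key(spider_name))
    if task_id is None:
        return None
    return task_id.decode('utf-8') if isinstance(task_id, bytes) else task_id
```

Under this reading, `job_put_tasks()` only refills the list once `get_tasks_count()` reports it empty, which matches the early return at the top of that job.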
================================================ FILE: tasks/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: tasks/job_put_tasks.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: job_put_tasks.py @time: 2018-02-10 17:16 """ import sys from models.news import FetchTask, FetchResult, LogTaskScheduling from apps.client_db import get_group, get_all from maps.platform import WEIXIN, WEIBO, TOUTIAO from tools.scrapy_tasks import put_task, get_tasks_count def job_put_tasks(spider_name): # 如果任务队列没有消耗完毕, 不处理 tasks_count = get_tasks_count(spider_name) if tasks_count: return True spider_map = { 'weixin': WEIXIN, 'weibo': WEIBO, 'toutiao': TOUTIAO, 'toutiao_m': TOUTIAO, } # TODO 稳定运行之后需要去掉 # task_exclude = [i.task_id for i in get_group(FetchResult, 'task_id', min_count=1)] task_list = get_all(FetchTask, FetchTask.platform_id == spider_map.get(spider_name)) c = 0 for task in task_list: # 排除任务 # if task.id in task_exclude: # continue put_task(spider_name, task.id) c += 1 if c % 100 == 0: print(c) print('put %s tasks count: %s' % (spider_name, c)) return True def usage(): contents = [ 'Example:', '\tpython job_put_tasks.py wx # 微信', '\tpython job_put_tasks.py wb # 微博', '\tpython job_put_tasks.py tm # 头条(M)', '\tpython job_put_tasks.py tt # 头条(PC)', ] print('\n'.join(contents)) def run(): """ 入口 """ # print(sys.argv) spider_name_maps = { 'wx': 'weixin', 'wb': 'weibo', 'tt': 'toutiao', 'tm': 'toutiao_m', } try: if len(sys.argv) > 1: spider_name = spider_name_maps.get(sys.argv[1]) if not spider_name: raise Exception('参数错误') job_put_tasks(spider_name) else: raise Exception('缺失参数') except Exception as e: print(e.message) usage() if __name__ == '__main__': run() ================================================ FILE: tasks/job_reboot_net_china_net.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: job_reboot_net_china_net.py @time: 2018-05-28 19:40 """ import time from libs.optical_modem import OpticalModemChinaNet from tools.net_status import get_reboot_net_status, del_reboot_net_status net_name = 'optical_modem_china_net' def job_reboot_net_china_net(): """ 重启中国电信光猫 :return: """ # reboot_net_status = get_reboot_net_status(net_name) # if not reboot_net_status: # return om_cn = OpticalModemChinaNet() om_cn.net_ip_o = om_cn.get_net_ip() om_cn.login() # 默认用户名、密码 om_cn.reboot() time.sleep(10) om_cn.net_ip_n = om_cn.get_net_ip() om_cn.check_reboot_status() del_reboot_net_status(net_name) ================================================ FILE: tasks/jobs_proxies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: jobs_proxies.py @time: 2018-03-13 17:22 """ from __future__ import print_function import sys from tools.proxies import add_proxy, len_proxy, fetch_proxy def job_proxies(spider_name, mix_num=0): if len_proxy(spider_name) <= mix_num: proxy_list = fetch_proxy() if not proxy_list: return add_proxy(spider_name, *proxy_list) print('%s add proxies: %s' % (spider_name, len(proxy_list))) def usage(): contents = [ 'Example:', '\tpython jobs_proxies.py ip # 测试', '\tpython 
jobs_proxies.py wx # 微信', '\tpython jobs_proxies.py wb # 微博', '\tpython jobs_proxies.py tt # 头条', ] print('\n'.join(contents)) def run(): """ 入口 """ # print(sys.argv) spider_name_maps = { 'wx': 'weixin', 'wb': 'weibo', 'tt': 'toutiao', } try: if len(sys.argv) > 1: spider_name = spider_name_maps.get(sys.argv[1], sys.argv[1]) if not spider_name: raise Exception('参数错误') job_proxies(spider_name) else: raise Exception('缺失参数') except Exception as e: print(e.message) usage() if __name__ == '__main__': run() ================================================ FILE: tasks/jobs_sogou.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: jobs_sogou.py @time: 2018-02-10 18:05 """ from tools.cookies import add_cookies from tools.anti_spider_sogou import auto_cookies as sogou_cookies from apps.client_rk import rk_counter_client, check_counter_limit, check_cookies_count def job_sogou_cookies(spider_name): """ sogou cookies :return: """ # 判断每天限制额度 if not check_counter_limit(): print('spider_name: %s, There is not enough available quantity' % spider_name) return False # 判断 cookie 队列长度 if not check_cookies_count(spider_name): print('spider_name: %s, The quantity of cookies is enough' % spider_name) return False sogou_cookies_obj = sogou_cookies() if not sogou_cookies_obj: return False add_cookies(spider_name, sogou_cookies_obj) rk_counter_client.increase(1) return True if __name__ == '__main__': job_sogou_cookies('weixin') ================================================ FILE: tasks/jobs_weixin.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: jobs_weixin.py @time: 2018-02-10 18:06 """ import json import time import sys from libs.redis_pub_sub import RedisPubSub from libs.redis_queue import RedisQueue from tools.anti_spider_weixin import auto_cookies as weixin_cookies from apps.client_db import redis_client from apps.client_rk import rk_counter_client, check_counter_limit def set_anti_spider_task(spider_name, msg): """ 设置任务队列 msg = { 'url': url, 'time': time.strftime("%Y-%m-%d %H:%M:%S") } :param spider_name: :param msg: :return: """ key = 'scrapy:anti_spider_task_weixin:%s' % spider_name q_task = RedisQueue(key, redis_client=redis_client) q_msg = json.dumps(msg) if isinstance(msg, dict) else msg # 因为微信反爬策略是通过IP限制, 这里仅仅处理一个任务 if q_task.empty(): q_task.put(q_msg) def _get_anti_spider_task(spider_name): """获取任务队列""" key = 'scrapy:anti_spider_task_weixin:%s' % spider_name q_task = RedisQueue(key, redis_client=redis_client) result = q_task.get(timeout=60) return json.loads(result) if result else {} def _set_anti_spider_result(spider_name, msg): """设置结果队列""" key = 'scrapy:anti_spider_result_weixin:%s' % spider_name q_result = RedisQueue(key, redis_client=redis_client) q_msg = json.dumps(msg) if isinstance(msg, dict) else msg q_result.put(q_msg) def _get_anti_spider_result(spider_name): """获取任务队列""" key = 'scrapy:anti_spider_result_weixin:%s' % spider_name q_result = RedisQueue(key, redis_client=redis_client) result = q_result.get(timeout=60) return json.loads(result) if result else {} def sub_anti_spider(spider_name): """ 蜘蛛订阅验证码处理结果 :param spider_name: :return: """ q = RedisPubSub('scrapy:anti_spider', redis_client=redis_client) r = q.sub_not_loop(spider_name) return json.loads(r) if r else {} def _pub_anti_spider(spider_name, msg): """ 将对应蜘蛛的验证码处理结果发布给对应订阅者 :param spider_name: :return: """ q = RedisPubSub('scrapy:anti_spider', 
redis_client=redis_client) msg = json.dumps(msg) if isinstance(msg, dict) else msg q.pub(spider_name, msg) def job_weixin_cookies(spider_name): """ weixin cookies :return: """ # 判断每天限制额度 if not check_counter_limit(): print('spider_name: %s, There is not enough available quantity' % spider_name) return False # 读取验证码任务队列(超时1分钟) task = _get_anti_spider_task(spider_name) if not task: return False # 设置验证码结果队列 url = task.get('url') msg = { 'url': url, 'status': False, 'time': time.strftime("%Y-%m-%d %H:%M:%S") } try: weixin_cookies_status = weixin_cookies(url) msg['status'] = weixin_cookies_status _set_anti_spider_result(spider_name, msg) # 读取验证码结果队列(超时1分钟) msg = _get_anti_spider_result(spider_name) _pub_anti_spider(spider_name, msg) rk_counter_client.increase(1) return True except Exception as e: print(e.message) _pub_anti_spider(spider_name, msg) def usage(): print('python tasks/jobs_weixin.py ') print('\tpython tasks/jobs_weixin.py job_weixin_cookies weixin') def run(): """ 启动入口 """ # print sys.argv try: if len(sys.argv) >= 3: fun_name = globals()[sys.argv[1]] fun_name(sys.argv[2]) else: usage() except NameError as e: print(e) if __name__ == '__main__': job_weixin_cookies('weixin') # run() # python tasks/jobs_weixin.py job_weixin_cookies weixin ================================================ FILE: tasks/run_job_counter_clear.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_counter_clear.py @time: 2018-05-02 10:24 """ import time import schedule from apps.client_rk import counter_clear as job_counter_clear from tools import catch_keyboard_interrupt # 计数清零 schedule.every().day.at('00:00').do(job_counter_clear) @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_put_tasks_toutiao.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_put_tasks_toutiao.py @time: 2018-05-02 10:23 """ import time import schedule from tasks.job_put_tasks import job_put_tasks from tools import catch_keyboard_interrupt # 分布式任务调度 - 头条 schedule.every(1).minutes.do(job_put_tasks, spider_name='toutiao') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_put_tasks_weibo.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_put_tasks_weibo.py @time: 2018-05-02 10:23 """ import time import schedule from tasks.job_put_tasks import job_put_tasks from tools import catch_keyboard_interrupt # 分布式任务调度 - 微博 schedule.every(5).minutes.do(job_put_tasks, spider_name='weibo') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_put_tasks_weixin.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_put_tasks_weixin.py @time: 2018-05-02 10:23 """ import time import schedule from tasks.job_put_tasks import job_put_tasks from tools import catch_keyboard_interrupt # 分布式任务调度 - 微信 schedule.every(5).minutes.do(job_put_tasks, spider_name='weixin') @catch_keyboard_interrupt 
def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_reboot_net_china_net.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_optical_modem_china_net.py @time: 2018-05-28 19:35 """ import time import schedule from tasks.job_reboot_net_china_net import job_reboot_net_china_net from tools import catch_keyboard_interrupt # 电信光猫重启 schedule.every(15).minutes.do(job_reboot_net_china_net) @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_sogou_cookies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_sogou_cookies.py @time: 2018-05-02 10:21 """ import time import schedule from tasks.jobs_sogou import job_sogou_cookies from tools import catch_keyboard_interrupt # sogou 反爬任务 schedule.every(5).minutes.do(job_sogou_cookies, spider_name='weixin') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_job_weixin_cookies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_job_weixin_cookies.py @time: 2018-05-02 10:22 """ import time import schedule from tasks.jobs_weixin import job_weixin_cookies from tools import catch_keyboard_interrupt # weixin 反爬任务 schedule.every(5).minutes.do(job_weixin_cookies, spider_name='weixin') @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_jobs.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_jobs.py @time: 2018-04-18 11:10 """ import schedule import time from tools import catch_keyboard_interrupt from tasks import job_put_tasks from tasks.jobs_sogou import job_sogou_cookies from tasks.jobs_weixin import job_weixin_cookies from apps.client_rk import counter_clear as job_counter_clear # sogou 反爬任务 schedule.every(5).minutes.do(job_sogou_cookies, spider_name='weixin') # weixin 反爬任务 schedule.every(5).minutes.do(job_weixin_cookies, spider_name='weixin') # 分布式任务调度 - 微信 schedule.every(5).minutes.do(job_put_tasks, spider_name='weixin') # 分布式任务调度 - 微博 schedule.every(5).minutes.do(job_put_tasks, spider_name='weibo') # 分布式任务调度 - 头条 schedule.every(5).minutes.do(job_put_tasks, spider_name='toutiao') # 计数清零 schedule.every().day.at('00:00').do(job_counter_clear) @catch_keyboard_interrupt def run(): while True: schedule.run_pending() time.sleep(1) if __name__ == '__main__': run() ================================================ FILE: tasks/run_jobs_apscheduler.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: run_jobs_apscheduler.py @time: 2018-02-10 18:01 """ # Deprecated from apscheduler.schedulers.blocking import BlockingScheduler from config import current_config from tasks import job_put_tasks from tasks.jobs_sogou import job_sogou_cookies from tasks.jobs_weixin import job_weixin_cookies from apps.client_rk import 
counter_clear as job_counter_clear REDIS = current_config.REDIS scheduler = BlockingScheduler() job_store_redis_alias = 'news_spider' def add_job_store_redis(): """ 127.0.0.1:6379> TYPE "example.jobs" hash 127.0.0.1:6379> TYPE "example.run_times" zset 127.0.0.1:6379> HGETALL "example.jobs" 1) "45431465e6104f3c924ec01852ed1aeb" 2) "\x80\x02}q\x01(U\x04argsq\x02)U\bexecutorq\x03U\adefaultq\x04U\rmax_instancesq\x05K\x01U\x04funcq\x06U\x10__main__:task_03q\aU\x02idq\bU 45431465e6104f3c924ec01852ed1aebq\tU\rnext_run_timeq\ncdatetime\ndatetime\nq\x0bU\n\a\xe1\x0c\b\x02\x01\x00\x00\x00\x00cpytz\n_p\nq\x0c(U\rAsia/Shanghaiq\rM\x80pK\x00U\x03CSTq\x0etRq\x0f\x86Rq\x10U\x04nameq\x11U\atask_03q\x12U\x12misfire_grace_timeq\x13K\x01U\atriggerq\x14capscheduler.triggers.cron\nCronTrigger\nq\x15)\x81q\x16}q\x17(U\btimezoneq\x18h\x0c(h\rM\xe8qK\x00U\x03LMTq\x19tRq\x1aU\aversionq\x1bK\x01U\nstart_dateq\x1cNU\bend_dateq\x1dNU\x06fieldsq\x1e]q\x1f(capscheduler.triggers.cron.fields\nBaseField\nq )\x81q!}q\"(U\nis_defaultq#\x88U\x0bexpressionsq$]q%capscheduler.triggers.cron.expressions\nAllExpression\nq&)\x81q'}q(U\x04stepq)Nsbah\x11U\x04yearq*ubh )\x81q+}q,(h#\x88h$]q-h&)\x81q.}q/h)Nsbah\x11U\x05monthq0ubcapscheduler.triggers.cron.fields\nDayOfMonthField\nq1)\x81q2}q3(h#\x88h$]q4h&)\x81q5}q6h)Nsbah\x11U\x03dayq7ubcapscheduler.triggers.cron.fields\nWeekField\nq8)\x81q9}q:(h#\x88h$]q;h&)\x81q<}q=h)Nsbah\x11U\x04weekq>ubcapscheduler.triggers.cron.fields\nDayOfWeekField\nq?)\x81q@}qA(h#\x88h$]qBh&)\x81qC}qDh)Nsbah\x11U\x0bday_of_weekqEubh )\x81qF}qG(h#\x89h$]qHcapscheduler.triggers.cron.expressions\nRangeExpression\nqI)\x81qJ}qK(h)NU\x04lastqLK\x16U\x05firstqMK\x00ubah\x11U\x04hourqNubh )\x81qO}qP(h#\x89h$]qQhI)\x81qR}qS(h)NhLK\x01hMK\x01ubah\x11U\x06minuteqTubh )\x81qU}qV(h#\x88h$]qWhI)\x81qX}qY(h)NhLK\x00hMK\x00ubah\x11U\x06secondqZubeubU\bcoalesceq[\x88h\x1bK\x01U\x06kwargsq\\}q]u." 3) "f5637d98946848c291da09a4ceb08027" 4) "\x80\x02}q\x01(U\x04argsq\x02)U\bexecutorq\x03U\adefaultq\x04U\rmax_instancesq\x05K\x01U\x04funcq\x06U\x10__main__:task_04q\aU\x02idq\bU f5637d98946848c291da09a4ceb08027q\tU\rnext_run_timeq\ncdatetime\ndatetime\nq\x0bU\n\a\xe1\x0c\b\x012\x00\x00\x00\x00cpytz\n_p\nq\x0c(U\rAsia/Shanghaiq\rM\x80pK\x00U\x03CSTq\x0etRq\x0f\x86Rq\x10U\x04nameq\x11U\atask_04q\x12U\x12misfire_grace_timeq\x13K\x01U\atriggerq\x14capscheduler.triggers.cron\nCronTrigger\nq\x15)\x81q\x16}q\x17(U\btimezoneq\x18h\x0c(h\rM\xe8qK\x00U\x03LMTq\x19tRq\x1aU\aversionq\x1bK\x01U\nstart_dateq\x1cNU\bend_dateq\x1dNU\x06fieldsq\x1e]q\x1f(capscheduler.triggers.cron.fields\nBaseField\nq )\x81q!}q\"(U\nis_defaultq#\x88U\x0bexpressionsq$]q%capscheduler.triggers.cron.expressions\nAllExpression\nq&)\x81q'}q(U\x04stepq)Nsbah\x11U\x04yearubh )\x81q*}q+(h#\x88h$]q,h&)\x81q-}q.h)Nsbah\x11U\x05monthubcapscheduler.triggers.cron.fields\nDayOfMonthField\nq/)\x81q0}q1(h#\x88h$]q2h&)\x81q3}q4h)Nsbah\x11U\x03dayubcapscheduler.triggers.cron.fields\nWeekField\nq5)\x81q6}q7(h#\x88h$]q8h&)\x81q9}q:h)Nsbah\x11U\x04weekubcapscheduler.triggers.cron.fields\nDayOfWeekField\nq;)\x81q<}q=(h#\x88h$]q>h&)\x81q?}q@h)Nsbah\x11U\x0bday_of_weekubh )\x81qA}qB(h#\x89h$]qCcapscheduler.triggers.cron.expressions\nRangeExpression\nqD)\x81qE}qF(h)NU\x04lastqGK\x16U\x05firstqHK\x00ubah\x11U\x04hourubh )\x81qI}qJ(h#\x89h$]qKh&)\x81qL}qMh)K\x01sbah\x11U\x06minuteubh )\x81qN}qO(h#\x88h$]qPhD)\x81qQ}qR(h)NhGK\x00hHK\x00ubah\x11U\x06secondubeubU\bcoalesceqS\x88h\x1bK\x01U\x06kwargsqT}qUu." 
5) "ba044f7b253a4cb1961e7abf036f8ef7" 6) "\x80\x02}q\x01(U\x04argsq\x02)U\bexecutorq\x03U\adefaultq\x04U\rmax_instancesq\x05K\x01U\x04funcq\x06U\x10__main__:task_02q\aU\x02idq\bU ba044f7b253a4cb1961e7abf036f8ef7q\tU\rnext_run_timeq\ncdatetime\ndatetime\nq\x0bU\n\a\xe1\x0c\b\x012\r\x0f5\xf9cpytz\n_p\nq\x0c(U\rAsia/Shanghaiq\rM\x80pK\x00U\x03CSTq\x0etRq\x0f\x86Rq\x10U\x04nameq\x11U\atask_02q\x12U\x12misfire_grace_timeq\x13K\x01U\atriggerq\x14capscheduler.triggers.interval\nIntervalTrigger\nq\x15)\x81q\x16}q\x17(U\btimezoneq\x18h\x0c(h\rM\xe8qK\x00U\x03LMTq\x19tRq\x1aU\aversionq\x1bK\x01U\nstart_dateq\x1ch\x0bU\n\a\xe1\x0c\b\x01.\r\x0f5\xf9h\x0f\x86Rq\x1dU\bend_dateq\x1eNU\bintervalq\x1fcdatetime\ntimedelta\nq K\x00K ZCARD "example.run_times" (integer) 3 127.0.0.1:6379> ZRANGE "example.run_times" 0 2 WITHSCORES 1) "f5637d98946848c291da09a4ceb08027" 2) "1512669060" 3) "ba044f7b253a4cb1961e7abf036f8ef7" 4) "1512669073.9968569" 5) "45431465e6104f3c924ec01852ed1aeb" 6) "1512669660" # 清理数据 127.0.0.1:6379> DEL example.jobs (integer) 1 127.0.0.1:6379> DEL example.run_times (integer) 1 :return: """ scheduler.add_jobstore( 'redis', alias=job_store_redis_alias, jobs_key='news_spider.jobs', run_times_key='news_spider.run_times', **REDIS ) def add_job(): # sogou 反爬任务 scheduler.add_job( job_sogou_cookies, 'interval', kwargs={'spider_name': 'weixin'}, minutes=5, id='job_sogou_cookies', replace_existing=True ) # weixin 反爬任务 scheduler.add_job( job_weixin_cookies, 'interval', kwargs={'spider_name': 'weixin'}, minutes=2, id='job_weixin_cookies', replace_existing=True ) # 分布式任务调度 - 微信 scheduler.add_job( job_put_tasks, 'interval', kwargs={'spider_name': 'weixin'}, minutes=5, id='job_put_tasks_weixin', replace_existing=True ) # 分布式任务调度 - 微博 scheduler.add_job( job_put_tasks, 'interval', kwargs={'spider_name': 'weibo'}, minutes=5, id='job_put_tasks_weibo', replace_existing=True ) # 分布式任务调度 - 头条 scheduler.add_job( job_put_tasks, 'interval', kwargs={'spider_name': 'toutiao'}, minutes=5, id='job_put_tasks_toutiao', replace_existing=True ) # 计数清零 scheduler.add_job( job_counter_clear, 'cron', day='*', hour='0', id='job_counter_clear', replace_existing=True ) def run_blocking(): try: # add_job_store_redis() # 后端存储 基于redis(可选) add_job() # 添加任务 scheduler.start() # 开启调度 except (KeyboardInterrupt, SystemExit): scheduler.shutdown() # 关闭调度 if __name__ == '__main__': run_blocking() ================================================ FILE: tests/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:39 """ def func(): pass class Main(object): def __init__(self): pass if __name__ == '__main__': pass ================================================ FILE: tests/test_date_time.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: test_date_time.py @time: 2018-06-25 17:55 """ from __future__ import unicode_literals import unittest import time import datetime from tools.date_time import time_local_to_utc, time_utc_to_local class DateTimeTest(unittest.TestCase): """ 日期时间测试 """ def setUp(self): """ 获取系统时区, 设定一对本地时间和国际时间 1、断言转换后的时差是否正确 2、断言转换后的时间是否正确 :return: """ self.time_offset = time.timezone self.local_time = '2018-06-06 18:12:26' local_time_obj = datetime.datetime.strptime(self.local_time, '%Y-%m-%d %H:%M:%S') self.utc_time = (local_time_obj + datetime.timedelta(hours=self.time_offset/60/60)).strftime('%Y-%m-%d %H:%M:%S') def 
test_local_to_utc(self): """ 测试 :return: """ local_time_obj = datetime.datetime.strptime(self.local_time, '%Y-%m-%d %H:%M:%S') utc_time_obj = time_local_to_utc(self.local_time) self.assertEqual(utc_time_obj, local_time_obj + datetime.timedelta(seconds=self.time_offset)) self.assertEqual(self.utc_time, utc_time_obj.strftime('%Y-%m-%d %H:%M:%S')) def test_utc_to_local(self): """ 测试 :return: """ utc_time_obj = datetime.datetime.strptime(self.utc_time, '%Y-%m-%d %H:%M:%S') local_time_obj = time_utc_to_local(self.utc_time) self.assertEqual(utc_time_obj, local_time_obj + datetime.timedelta(seconds=self.time_offset)) self.assertEqual(self.local_time, local_time_obj.strftime('%Y-%m-%d %H:%M:%S')) def tearDown(self): pass if __name__ == '__main__': unittest.main() ================================================ FILE: tests/test_finger.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: test_finger.py @time: 2018-02-11 00:06 """ from __future__ import unicode_literals import hashlib import unittest from scrapy.http import Request from scrapy.utils import request class FingerTest(unittest.TestCase): """ 指纹测试 """ def setUp(self): self.url_01 = 'https://www.baidu.com/s?wd=openstack&rsv_spt=1' self.url_02 = 'https://www.baidu.com/s?rsv_spt=1&wd=openstack' def test_request(self): """ 测试请求 :return: """ req_01 = Request(url=self.url_01) result_01 = request.request_fingerprint(req_01) req_02 = Request(url=self.url_02) result_02 = request.request_fingerprint(req_02) self.assertEqual(result_01, result_02) def tearDown(self): pass class MD5Test(unittest.TestCase): """ md5测试 """ def setUp(self): self.url_01 = 'https://www.baidu.com/s?wd=openstack&rsv_spt=1' self.url_02 = 'https://www.baidu.com/s?rsv_spt=1&wd=openstack' def test_request(self): """ 测试请求 :return: """ m1 = hashlib.md5() m1.update(self.url_01.encode('utf-8')) result_01 = m1.hexdigest() m2 = hashlib.md5() m2.update(self.url_02.encode('utf-8')) result_02 = m2.hexdigest() self.assertNotEqual(result_01, result_02) def tearDown(self): pass if __name__ == '__main__': unittest.main() ================================================ FILE: tools/__init__.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: __init__.py.py @time: 2018-02-10 17:10 """ from functools import wraps def catch_keyboard_interrupt(func): @wraps(func) def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except KeyboardInterrupt: print('\n强制退出') return wrapper ================================================ FILE: tools/anti_spider_sogou.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: anti_spider_sogou.py @time: 2018-02-10 17:24 """ from __future__ import print_function from __future__ import unicode_literals from future.builtins import input # PY2(raw_input) import random import time import json import requests from apps.client_rk import get_img_code, img_report_error from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT cookies = {} s = requests.session() headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Connection': 'keep-alive', # 'Host': 'weixin.sogou.com', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel 
Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' } def _get_tc(): tc = str('%13d' % (time.time() * 1000)) return tc def _save_img(res): # 保存验证码图片 img_name = 'sogou_%s.jpg' % _get_tc() print('图片名称: %s' % img_name) img_content = res.content with open(img_name, b'w') as f: f.write(img_content) time.sleep(1) def anti_spider(): url = 'http://weixin.sogou.com/antispider/?from=/weixin?type=2&query=chuangbiandao' request_headers = headers.copy() request_headers['Host'] = 'weixin.sogou.com' request_cookie = { 'refresh': '1' } res = s.get(url, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') # print cookies def code_img_save(): url = 'http://weixin.sogou.com/antispider/util/seccode.php' request_headers = headers.copy() request_headers['Host'] = 'weixin.sogou.com' request_cookie = cookies.copy() params = { 'tc': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) # 保存图片 _save_img(res) cookies.update(res.cookies) print('.', end='') # print cookies def code_img_obj(): url = 'http://weixin.sogou.com/antispider/util/seccode.php' request_headers = headers.copy() request_headers['Host'] = 'weixin.sogou.com' request_cookie = cookies.copy() params = { 'tc': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) print('.', end='') return res.content def pv_refresh(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'refresh', 'domain': 'weixin', 'suv': '', 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_index(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'index', 'domain': 'weixin', 'suv': '', 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_img_cost(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'imgCost', 'domain': 'weixin', 'suv': '', 'snuid': '', 't': _get_tc(), 'cost': '27', } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_mouse(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'mouse', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_img_success(): url = 
'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'imgSuccess', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_real_index(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'realIndex', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_seccode_focus(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'seccodeFocus', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_seccode_input(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'seccodeInput', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def pv_seccode_blur(): url = 'http://pb.sogou.com/pv.gif' request_headers = headers.copy() request_headers['Host'] = 'pb.sogou.com' request_cookie = { 'IPLOC': cookies['IPLOC'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } params = { 'uigs_productid': 'webapp', 'type': 'antispider', 'subtype': 'seccodeBlur', 'domain': 'weixin', 'suv': cookies['SUV'], 'snuid': '', 't': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') def thank(code_anti_spider): url = 'http://weixin.sogou.com/antispider/thank.php' request_headers = headers.copy() request_headers['X-Requested-With'] = 'XMLHttpRequest' request_cookie = { 'ABTEST': cookies['ABTEST'], 'IPLOC': cookies['IPLOC'], 'SUID': cookies['SUID'], 'PHPSESSID': cookies['PHPSESSID'], 'SUIR': cookies['SUIR'], 'SUV': cookies['SUV'], } data = { 'c': code_anti_spider, 'r': '%2Fweixin%3Ftype%3D2', 'v': '5', } res = s.post(url, data=data, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) # print cookies json_msg = json.loads(res.content) print(json_msg) return json_msg # {"code": 0,"msg": "解封成功,正在为您跳转来源地址...", "id": "ECB542781D1B4105B09FB4461E0587D4"} # {"code": 2,"msg": "未知访问来源"} # {"code": 3,"msg": "验证码输入错误, 请重新输入!"} def _get_cookies(): print(cookies) return cookies def check_n(): 
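    # Editorial comment (added): check_n() issues the Sogou Weixin search with a bare
    # requests.get (no anti-spider cookies), presumably to observe the blocked response,
    # while check_y() below repeats it on the shared session with the solved cookies to
    # confirm the block has been lifted.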
url = 'http://weixin.sogou.com/weixin?query=chuangbiandao&type=1' res = requests.get(url, headers=headers, timeout=REQUESTS_TIME_OUT) print(res.content) def check_y(): url = 'http://weixin.sogou.com/weixin?query=chuangbiandao&type=1' res = s.get(url, headers=headers, cookies=cookies, timeout=REQUESTS_TIME_OUT) print(res.content) def manual_cookies(): """ 获取 cookies - 手动填验证码 :return: """ anti_spider() code_img_save() # 模拟用户行为 pv_refresh() pv_index() pv_img_cost() # 模拟鼠标滑过 pv_mouse() pv_img_success() pv_real_index() # 模拟表单输入 pv_seccode_focus() pv_seccode_input() pv_seccode_blur() input_code = input('code << ') thank(input_code) return _get_cookies() def auto_cookies(): """ 获取 cookies - 第三方识别验证码 :return: """ anti_spider() im = code_img_obj() # 6位英数混合 白天:15快豆 夜间:18.75快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=3060) if not img_id: return None print(img_id, img_code) # 模拟用户行为 pv_refresh() pv_index() pv_img_cost() # 模拟鼠标滑过 pv_mouse() pv_img_success() pv_real_index() # 模拟表单输入 pv_seccode_focus() pv_seccode_input() pv_seccode_blur() # 重试3次 c = 3 while c > 0: c -= 1 res = thank(img_code) if res.get('code') == 0: # 识别成功 cookies['SNUID'] = res.get('id', '') break elif res.get('code') == 3: # 报告错误识别 img_report_error(img_id) # 出错随机等待后重试 time.sleep(random.randint(1, 5)) # 换张图片再来一次 im = code_img_obj() # 6位英数混合 白天:15快豆 夜间:18.75快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=3060) print(img_id, img_code) else: print('Error') print(res) return None return _get_cookies() if c > 0 else None if __name__ == '__main__': # manual_cookies() auto_cookies() # check_n() # check_y() ================================================ FILE: tools/anti_spider_weixin.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: anti_spider_weixin.py @time: 2018-02-10 17:24 """ from __future__ import print_function from __future__ import unicode_literals from future.builtins import input # PY2(raw_input) import random import time import json from lxml.html import fromstring import requests from apps.client_rk import get_img_code, img_report_error from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT cookies = {} s = requests.session() headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0' } def _get_tc(): tc = str('%13d' % (time.time() * 1000)) return tc def _save_img(res): # 保存验证码图片 img_name = 'weixin_%s.jpg' % _get_tc() print('图片名称: %s' % img_name) img_content = res.content with open(img_name, b'w') as f: f.write(img_content) time.sleep(1) def anti_spider(url): # url = 'https://mp.weixin.qq.com/profile?src=3×tamp=1512923946&ver=1&signature=RZh61VIthXnp4HUsow1pgQXJbGxi*v-n4Pr1W6e5PVkmJSbRknd6LMT-EFoQqX4gaM6uGyHREmDPsN6lXkeYfg==' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' res = s.get(url, headers=request_headers, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) print('.', end='') doc = fromstring(res.text) title = u''.join(i.strip() for i in doc.xpath('//title/text()')) print(title) return title == '请输入验证码' def code_img_save(): url = 'https://mp.weixin.qq.com/mp/verifycode' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' 
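    # Editorial comment (added): both anti_spider modules follow the same captcha protocol:
    # fetch the code image (code_img_obj), send it to the third-party recogniser via
    # get_img_code(), submit the answer (thank() for Sogou, verify_code() here), and on a
    # wrong answer report it with img_report_error() and retry with a fresh image, giving
    # up after three attempts.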
request_cookie = cookies.copy() params = { 'cert': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) # 保存图片 _save_img(res) cookies.update(res.cookies) print('.', end='') # print cookies def code_img_obj(): url = 'https://mp.weixin.qq.com/mp/verifycode' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' request_cookie = cookies.copy() params = { 'cert': _get_tc(), } res = s.get(url, params=params, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) print('.', end='') return res.content def verify_code(input_code): url = 'https://mp.weixin.qq.com/mp/verifycode' request_headers = headers.copy() request_headers['Host'] = 'mp.weixin.qq.com' request_headers['X-Requested-With'] = 'XMLHttpRequest' request_cookie = cookies.copy() data = { 'cert': _get_tc(), 'input': input_code, 'appmsg_token': '', } res = s.post(url, data=data, headers=request_headers, cookies=request_cookie, timeout=REQUESTS_TIME_OUT) cookies.update(res.cookies) # print cookies json_msg = json.loads(res.content) print(json_msg) return json_msg # {u'cookie_count': 0, u'errmsg': u'', u'ret': 0} # {u'cookie_count': 0, u'errmsg': u'', u'ret': 501} 验证码有误 def _get_cookies(): print(cookies) return cookies def manual_cookies(): url = input('url << ') anti_spider(url) code_img_save() input_code = input('code << ') verify_code(input_code) return _get_cookies() def auto_cookies(url): need_status = anti_spider(url) if not need_status: return True im = code_img_obj() # 4位纯英文字母 白天:10快豆 夜间:12.5快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=2040) print(img_id, img_code) # 重试3次 c = 3 while c > 0: c -= 1 res = verify_code(img_code) if res.get('ret') == 0: # 识别成功 break elif res.get('ret') == 501: # 报告错误识别 img_report_error(img_id) # 出错随机等待后重试 time.sleep(random.randint(1, 5)) # 换张图片再来一次 im = code_img_obj() # 4位纯英文字母 白天:10快豆 夜间:12.5快豆 超时:60秒 img_id, img_code = get_img_code(im, im_type_id=2040) print(img_id, img_code) else: print('Error') print(res) return False return True if c > 0 else False if __name__ == '__main__': # manual_cookies() anti_spider_url = 'http://mp.weixin.qq.com/profile?src=3×tamp=1513650933&ver=1&signature=zzgwSdnYIm68Nu5eFz1X8-Heqjojhy4ozHmg4cUz*hEo*QuXma9-qkMrOFxzOGDfzJHHfyechg0AVCFPpsXpuA==' print(auto_cookies(anti_spider_url)) ================================================ FILE: tools/char.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: char.py @time: 2018-02-10 17:48 """ import execjs # from HTMLParser import HTMLParser # PY2 # from html.parser import HTMLParser # PY3 from future.moves.html.parser import HTMLParser html_parser = HTMLParser() def un_escape(char_str): """ 反转译 :param char_str: :return: """ return html_parser.unescape(char_str) def get_js_36_str(i): """ 整数、浮点数 js方式转36进制 :param i: :return: """ js_body = ''' function get_36_str(i) { return i.toString(36); }; ''' ctx = execjs.compile(js_body) return ctx.call("get_36_str", i) if __name__ == '__main__': a = '加入到"我的书目选单"中' b = '\xe5\xbd\x93\xe5\x89\x8d\xe5\xb7\xb2\xe8\xbe\xbe\xe5\x88\xb0\xe6\x8a\x93\xe5\x8f\x96\xe9\x85\x8d\xe7\xbd\xae\xe7\x9a\x84\xe6\x9c\x80\xe5\xa4\xa7\xe9\xa1\xb5\xe7\xa0\x81' c = 'https://mp.weixin.qq.com/s?timestamp=1511432702&src=3&ver=1&signature=lAC8MtonFiHnlc5-j4z48WcPRpfP1Nn4zxCmY4ZjCjdXQscLcB5uyi5Jb395m5yaZQHTqqSlqzy*HRR0nAPZHsz0*Efu3w*Y2B8XbIL5v8pZQsGt9cwZQTuvI0GZqAsZobqzaeDptAQzHLB4QKL-qExOz0ANOTG*QAvJ7-ZurMg=' d = 
'http://mp.weixin.qq.com/mp/homepage?__biz=MzAxNzU2Mjc4NQ==&hid=2&sn=8177890cc7e468d3df6f3050d49951c5#wechat_redirect' print(un_escape(a)) print(un_escape(b)) print(un_escape(c)) print(un_escape(d)) ================================================ FILE: tools/cookies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: cookies.py @time: 2018-02-10 17:49 """ from __future__ import print_function from __future__ import unicode_literals import json import hashlib from apps.client_db import redis_client def _get_cookies_str(cookies_dict): """ In [1]: import json In [2]: sd = {'c':1, 'b':2, 'a':3} In [3]: sd Out[3]: {'a': 3, 'b': 2, 'c': 1} In [4]: items = sd.items() In [5]: items Out[5]: [('a', 3), ('c', 1), ('b', 2)] In [6]: sorted(items) Out[6]: [('a', 3), ('b', 2), ('c', 1)] In [7]: sorted(items, reverse=True) Out[7]: [('c', 1), ('b', 2), ('a', 3)] In [8]: json.dumps(sorted(items)) Out[8]: '[["a", 3], ["b", 2], ["c", 1]]' In [9]: json.loads(json.dumps(sorted(items))) Out[9]: [[u'a', 3], [u'b', 2], [u'c', 1]] In [10]: dict(json.loads(json.dumps(sorted(items)))) Out[10]: {u'a': 3, u'b': 2, u'c': 1} :param cookies_dict: :return: """ cookies_str = json.dumps(sorted(cookies_dict.items())) return cookies_str def _get_finger(cookies_str): """ :param cookies_str: :return: """ m = hashlib.md5() m.update(cookies_str.encode('utf-8') if isinstance(cookies_str, unicode) else cookies_str) finger = m.hexdigest() return finger def get_cookies(spider_name): """ 获取 cookies 兼容 redis 没有 cookies 池的情况 :param spider_name: :return: """ key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} cookies_id = redis_client.srandmember(key_set) key_id = 'scrapy:cookies_id:%(cookies_id)s' % {'cookies_id': cookies_id} cookies_str = redis_client.get(key_id) cookies_obj = dict(json.loads(cookies_str or '[]')) return cookies_id, cookies_obj def add_cookies(spider_name, cookies_obj): """ 添加 cookies :param spider_name: :param cookies_obj: :return: """ cookies_str = _get_cookies_str(cookies_obj) cookies_id = _get_finger(cookies_str) key_id = 'scrapy:cookies_id:%(cookies_id)s' % {'cookies_id': cookies_id} key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} if redis_client.sismember(key_set, cookies_id): return False redis_client.set(key_id, cookies_str) redis_client.sadd(key_set, cookies_id) return True def del_cookies(spider_name, cookies_id): """ 删除 cookies :param spider_name: :param cookies_id: :return: """ key_id = 'scrapy:cookies_id:%(cookies_id)s' % {'cookies_id': cookies_id} key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} redis_client.delete(key_id) redis_client.srem(key_set, cookies_id) def len_cookies(spider_name): """ 获取 cookies 长度 :param spider_name: :return: """ key_set = 'scrapy:cookies_set:%(spider_name)s' % {'spider_name': spider_name} cookies_len = redis_client.scard(key_set) return cookies_len """ 集合 key: cookies_id 字符串 cookies_id_key: cookies_obj """ ================================================ FILE: tools/date_time.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: date_time.py @time: 2018-06-25 16:44 """ from __future__ import unicode_literals import six import time import calendar from datetime import datetime, timedelta, date def get_tc(): """ 获取13位字符串时间戳 :return: """ tc = str('%13d' % (time.time() * 1000)) return tc def get_current_day_time_ends(): """ 
获取当天开始结束时刻 :return: """ today = datetime.today() start_time = datetime(today.year, today.month, today.day, 0, 0, 0) end_time = datetime(today.year, today.month, today.day, 23, 59, 59) return start_time, end_time def get_current_month_time_ends(): """ 获取当月开始结束时刻 :return: """ today = datetime.today() _, days = calendar.monthrange(today.year, today.month) start_time = datetime(today.year, today.month, 1, 0, 0, 0) end_time = datetime(today.year, today.month, days, 23, 59, 59) return start_time, end_time def get_current_year_time_ends(): """ 获取当年开始结束时刻 :return: """ today = datetime.today() start_time = datetime(today.year, 1, 1, 0, 0, 0) end_time = datetime(today.year, 12, 31, 23, 59, 59) return start_time, end_time def get_hours(zerofill=True): """ 列出1天所有24小时 :return: """ if zerofill: return ['%02d' % i for i in range(24)] else: return range(24) def get_days(year=1970, month=1, zerofill=True): """ 列出当月的所有日期 :param year: :param month: :param zerofill: :return: """ year = int(year) month = int(month) _, days = calendar.monthrange(year, month) if zerofill: return ['%02d' % i for i in range(1, days+1)] else: return range(1, days+1) def get_weeks(): """ 列出所有星期 :return: """ return ['周一', '周二', '周三', '周四', '周五', '周六', '周日'] def get_months(zerofill=True): """ 列出1年所有12月份 :return: """ if zerofill: return ['%02d' % i for i in range(1, 13)] else: return [i for i in range(1, 13)] def time_local_to_utc(local_time): """ 本地时间转UTC时间 :param local_time: :return: """ # 字符串处理 if isinstance(local_time, six.string_types) and len(local_time) == 10: local_time = datetime.strptime(local_time, '%Y-%m-%d') elif isinstance(local_time, six.string_types) and len(local_time) >= 19: local_time = datetime.strptime(local_time[:19], '%Y-%m-%d %H:%M:%S') elif not (isinstance(local_time, datetime) or isinstance(local_time, date)): local_time = datetime.now() # 时间转换 utc_time = local_time + timedelta(seconds=time.timezone) return utc_time def time_utc_to_local(utc_time): """ UTC时间转本地时间 :param utc_time: :return: """ # 字符串处理 if isinstance(utc_time, six.string_types) and len(utc_time) == 10: utc_time = datetime.strptime(utc_time, '%Y-%m-%d') elif isinstance(utc_time, six.string_types) and len(utc_time) >= 19: utc_time = datetime.strptime(utc_time[:19], '%Y-%m-%d %H:%M:%S') elif not (isinstance(utc_time, datetime) or isinstance(utc_time, date)): utc_time = datetime.utcnow() # 时间转换 local_time = utc_time - timedelta(seconds=time.timezone) return local_time if __name__ == '__main__': print(get_current_day_time_ends()) print(get_current_month_time_ends()) print(get_current_year_time_ends()) print(get_hours(zerofill=False)) print(get_hours(zerofill=True)) print(get_days(zerofill=False)) print(get_days(zerofill=True)) print(get_months(zerofill=False)) print(get_months(zerofill=True)) ================================================ FILE: tools/duplicate.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: duplicate.py @time: 2018-02-10 17:39 """ from __future__ import print_function from __future__ import unicode_literals from apps.client_db import redis_client from tools.url import get_request_finger def is_dup_detail(detail_url, spider_name, channel_id=0): """ 检查详细页是否重复 :param detail_url: :param spider_name: :param channel_id: :return: """ detail_dup_key = 'scrapy:dup:%s:%s' % (spider_name, channel_id) detail_url_finger = get_request_finger(detail_url) return redis_client.sismember(detail_dup_key, detail_url_finger) def add_dup_detail(detail_url, 
spider_name, channel_id=0): """ 把当前详细页加入集合 :param detail_url: :param spider_name: :param channel_id: :return: """ detail_dup_key = 'scrapy:dup:%s:%s' % (spider_name, channel_id) detail_url_finger = get_request_finger(detail_url) return redis_client.sadd(detail_dup_key, detail_url_finger) ================================================ FILE: tools/gen.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: gen.py @time: 2018-02-10 17:19 """ from __future__ import print_function from __future__ import unicode_literals import os import sys from sqlalchemy.ext.declarative.api import DeclarativeMeta from sqlalchemy.inspection import inspect from config import current_config BASE_DIR = current_config.BASE_DIR SQLALCHEMY_DATABASE_URI = current_config.SQLALCHEMY_DATABASE_URI_MYSQL def gen_models(): """ 创建 models $ python gen.py gen_models """ file_path = os.path.join(BASE_DIR, 'models/news.py') cmd = 'sqlacodegen %s --noinflect --outfile %s' % (SQLALCHEMY_DATABASE_URI, file_path) output = os.popen(cmd) result = output.read() print(result) # 更新 model 文件 with open(file_path, b'r') as f: lines = f.readlines() # 新增 model 转 dict 方法 with open(file_path, b'w') as f: lines.insert(9, b'def to_dict(self):\n') lines.insert(10, b' return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}\n') lines.insert(11, b'\n') lines.insert(12, b'Base.to_dict = to_dict\n') lines.insert(13, b'\n\n') f.write(b''.join(lines)) def gen_items(): """ 创建 items $ python gen.py gen_items 字段规则: 去除自增主键,非自增是需要的。 """ from models import news file_path = os.path.join(BASE_DIR, 'news/items.py') model_list = [(k, v) for k, v in news.__dict__.items() if isinstance(v, DeclarativeMeta) and k != 'Base'] with open(file_path, b'w') as f: f.write(b'# -*- coding: utf-8 -*-\n\n') f.write(b'# Define here the models for your scraped items\n#\n') f.write(b'# See documentation in:\n') f.write(b'# http://doc.scrapy.org/en/latest/topics/items.html\n\n') f.write(b'import scrapy\n') for model_name, model_class in model_list: result = model_class().to_dict() table_name = model_class().__tablename__ model_pk = inspect(model_class).primary_key[0].name f.write(b'\n\nclass %sItem(scrapy.Item):\n' % model_name) f.write(b' """\n') f.write(b' table_name: %s\n' % table_name) f.write(b' primary_key: %s\n' % model_pk) f.write(b' """\n') for field_name in list(result.keys()): if field_name in [model_pk, 'create_time', 'update_time']: continue f.write(b' %s = scrapy.Field()\n' % field_name) def run(): """ 入口 """ # print sys.argv try: if len(sys.argv) > 1: fun_name = globals()[sys.argv[1]] fun_name() else: print('缺失参数\n') usage() except NameError as e: print(e) print('未定义的方法[%s]' % sys.argv[1]) def usage(): print(""" 创建 models $ python gen.py gen_models 创建 items $ python gen.py gen_items """) if __name__ == '__main__': run() # print BASE_DIR # print SQLALCHEMY_DATABASE_URI ================================================ FILE: tools/img.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: img.py @time: 2018-03-20 14:24 """ from __future__ import print_function from __future__ import unicode_literals import imghdr import requests from PIL import Image from six import BytesIO from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT def filter_img_size(min_width=0, min_height=0, *img_url): """ 过滤尺寸不符要求的图片 :param min_width: :param min_height: :param img_url: 
:return: """ result = [] for i in img_url: try: img_res = requests.get(i, stream=True, timeout=REQUESTS_TIME_OUT) if img_res.status_code == 200: orig_image = Image.open(BytesIO(img_res.content)) img_width, img_height = orig_image.size if img_width >= min_width and img_height >= min_height: result.append(i) except Exception as e: print('check images error: %s' % img_url) print(e.message) continue return result def filter_local_img_type(ignore_type='gif', *img_path): """ 过滤指定类型本地图片 :param ignore_type: :param img_path: :return: """ result = [] for i in img_path: img_type = imghdr.what(i) # print(img_type, i) if img_type == ignore_type: continue result.append(i) return result def filter_remote_img_type(ignore_type='gif', *img_url): """ 过滤指定类型远程图片 :param ignore_type: :param img_url: :return: """ result = [] for i in img_url: img_type = imghdr.what(None, requests.get(i).content) # print(img_type, i) if img_type == ignore_type: continue result.append(i) return result if __name__ == '__main__': pass ================================================ FILE: tools/import_task.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: import_csv.py @time: 2018-05-17 18:46 """ from __future__ import print_function from __future__ import unicode_literals import sys import csv import json from apps.client_db import add_item from models.news import FetchTask def read_csv(filename): """ 读取csv :param filename: :return: """ count = 0 with open(filename) as f: reader = csv.DictReader(f) for line in reader: print(json.dumps(line, indent=4, ensure_ascii=False)) count += 1 yield line print('读取数量: %s' % count) def import_csv(filename): """ 导入csv :param filename: :return: """ count = 0 for item in read_csv(filename): result = add_item(FetchTask, item) print(result) count += 1 print('导入数量: %s' % count) def usage(): print(''' 导入 csv 注意 csv 格式, 表头与数据库任务表的字段对应(去掉主键) $ python tools/import_task.py example.csv ''') def run(): """ 入口 """ # print sys.argv try: if len(sys.argv) < 2: raise Exception('缺失参数\n') import_csv(sys.argv[1]) except Exception as e: print('导入异常') print(e) usage() if __name__ == '__main__': run() ================================================ FILE: tools/net_status.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: net_status.py @time: 2018-05-28 20:45 """ import time from apps.client_db import redis_client def get_reboot_net_status(net_name='optical_modem_china_net'): key_reboot_net = 'scrapy:reboot_net:%s' % net_name reboot_net_status = redis_client.get(key_reboot_net) return reboot_net_status def set_reboot_net_status(net_name='optical_modem_china_net'): key_reboot_net = 'scrapy:reboot_net:%s' % net_name reboot_net_status = time.strftime('%Y-%m-%d %H:%M:%S') redis_client.set(key_reboot_net, reboot_net_status) def del_reboot_net_status(net_name='optical_modem_china_net'): key_reboot_net = 'scrapy:reboot_net:%s' % net_name redis_client.delete(key_reboot_net) ================================================ FILE: tools/proxies.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: proxies.py @time: 2018-03-13 16:37 """ import json import requests from apps.client_db import redis_client from tools.url import get_update_url from config import current_config REQUESTS_TIME_OUT = current_config.REQUESTS_TIME_OUT def add_proxy(spider_name, *proxy): key_set = 
'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.sadd(key_set, *proxy) def del_proxy(spider_name, proxy): key_set = 'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.srem(key_set, proxy) def get_proxy(spider_name): key_set = 'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.srandmember(key_set) def len_proxy(spider_name): key_set = 'scrapy:proxies_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.scard(key_set) def fetch_proxy(country='China', scheme='http'): """ 获取代理 :param country: :param scheme: :return: """ data = {} if country: data['country'] = country if scheme: data['type'] = scheme url = 'http://proxy.nghuyong.top/' url = get_update_url(url, data) res = requests.get(url, timeout=REQUESTS_TIME_OUT).json() return ['%s://%s' % (i['type'], i['ip_and_port']) for i in res.get('data', [])] if __name__ == '__main__': proxy_result = fetch_proxy() print(json.dumps(proxy_result, indent=4)) ================================================ FILE: tools/scrapy_tasks.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: scrapy_tasks.py @time: 2018-02-10 17:42 """ from apps.client_db import redis_client def pop_task(spider_name): key_set = 'scrapy:tasks_set:%(spider_name)s' % {'spider_name': spider_name} return redis_client.spop(key_set) def put_task(spider_name, *task_ids): key_set = 'scrapy:tasks_set:%(spider_name)s' % {'spider_name': spider_name} redis_client.sadd(key_set, *task_ids) def get_tasks_count(spider_name): key_set = 'scrapy:tasks_set:%(spider_name)s' % {'spider_name': spider_name} cookies_len = redis_client.scard(key_set) return cookies_len ================================================ FILE: tools/sys_monitor.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: sys_monitor.py @time: 2018-02-10 17:43 """ import psutil import time def bytes2human(n): """ >>> bytes2human(10000) '9.8 K' >>> bytes2human(100001221) '95.4 M' """ symbols = ('K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y') prefix = {} for i, s in enumerate(symbols): prefix[s] = 1 << (i + 1) * 10 for s in reversed(symbols): if n >= prefix[s]: value = float(n) / prefix[s] return '%.2f %s' % (value, s) return '%.2f B' % n def _format_info(k, v): if len(str(v)) <= 5: return '%-25s %5s' % (k, v) elif len(str(v)) <= 10: return '%-20s %10s' % (k, v) else: return '%-15s %15s' % (k, v) def _print_info(contents, topic=''): if topic: print('\n[%s]' % topic) contents.insert(0, '-' * 31) contents.append('-' * 31) print('\n'.join(contents)) def _cpu(): contents = [ _format_info('cpu_count_logical', psutil.cpu_count()), _format_info('cpu_count_physical', psutil.cpu_count(logical=False)), ] _print_info(contents, 'CPU') def _memory(): mem_virtual = psutil.virtual_memory() mem_swap = psutil.swap_memory() contents = [_format_info('mem_virtual_total', bytes2human(mem_virtual.total)), _format_info('mem_virtual_free', bytes2human(mem_virtual.free)), _format_info('mem_virtual_percent', '%s %%' % mem_virtual.percent), _format_info('mem_swap_total', bytes2human(mem_swap.total)), _format_info('mem_swap_free', bytes2human(mem_swap.free)), _format_info('mem_swap_percent', '%s %%' % mem_swap.percent)] _print_info(contents, 'Memory') def _disks(): sdisk_part = psutil.disk_partitions() contents = [] for i in sdisk_part: contents.append(_format_info(i.device, 
i.mountpoint)) sdisk_usage = psutil.disk_usage(i.mountpoint) contents.append(_format_info('disk_usage_total', bytes2human(sdisk_usage.total))) contents.append(_format_info('disk_usage_free', bytes2human(sdisk_usage.free))) contents.append(_format_info('disk_usage_percent', '%s %%' % sdisk_usage.percent)) _print_info(contents, 'Disks') def _network(speed=True): snetio = psutil.net_io_counters() contents = [_format_info('bytes_sent', bytes2human(snetio.bytes_sent)), _format_info('bytes_recv', bytes2human(snetio.bytes_recv))] if speed: time.sleep(1) snetio_after = psutil.net_io_counters() contents.append(_format_info('speed_sent', '%s/S' % bytes2human(snetio_after.bytes_sent - snetio.bytes_sent))) contents.append(_format_info('speed_recv', '%s/S' % bytes2human(snetio_after.bytes_recv - snetio.bytes_recv))) _print_info(contents, 'Network') def _sensors(): contents = [] if hasattr(psutil, "sensors_temperatures"): sensors_temperatures = psutil.sensors_temperatures() for name, entries in sensors_temperatures.items(): for entry in entries: contents.append( _format_info(entry.label or name, '%s °C' % entry.current)) sbattery = psutil.sensors_battery() if sbattery: contents.append(_format_info('battery_percent', '%s %%' % sbattery.percent)) contents.append(_format_info('secsleft', sbattery.secsleft)) contents.append(_format_info('power_plugged', sbattery.power_plugged)) _print_info(contents, 'Sensors') def stats(): _cpu() _memory() _disks() _network() _sensors() if __name__ == '__main__': stats() ================================================ FILE: tools/toutiao_m.py ================================================ #!/usr/bin/env python # encoding: utf-8 """ @author: zhanghe @software: PyCharm @file: toutiao_m.py @time: 2018-02-28 14:14 """ import hashlib import math import re import time import execjs from tools.char import un_escape def get_as_cp(): t = int(math.floor(time.time())) e = hex(t).upper()[2:] m = hashlib.md5() m.update(str(t).encode(encoding='utf-8')) i = m.hexdigest().upper() if len(e) != 8: AS = '479BB4B7254C150' CP = '7E0AC8874BB0985' return AS, CP n = i[0:5] a = i[-5:] s = '' r = '' for o in range(5): s += n[o] + e[o] r += e[o + 3] + a[o] AS = 'A1' + s + e[-3:] CP = e[0:3] + r + 'E1' return AS, CP def parse_toutiao_js_body(html_body, url=''): """ 解析js :param html_body: :param url: :return: """ rule = r'' js_list = re.compile(rule, re.S).findall(html_body) if not js_list: print('parse error url: %s' % url) print(html_body) return ''.join(js_list) class ParseJsTt(object): """ 解析头条动态数据 """ def __init__(self, js_body): self.js_body = js_body self._add_js_item_id_fn() self._add_js_title_fn() self._add_js_abstract_fn() self._add_js_content_fn() self._add_js_pub_time() self._add_js_tags_fn() self.ctx = execjs.compile(self.js_body) def _add_js_item_id_fn(self): js_item_id_fn = """ function r_item_id() { return BASE_DATA.articleInfo.itemId; }; """ self.js_body += js_item_id_fn def _add_js_title_fn(self): js_title_fn = """ function r_title() { return BASE_DATA.articleInfo.title; }; """ self.js_body += js_title_fn def _add_js_abstract_fn(self): js_abstract_fn = """ function r_abstract() { return BASE_DATA.shareInfo.abstract; }; """ self.js_body += js_abstract_fn def _add_js_content_fn(self): js_content_fn = """ function r_content() { return BASE_DATA.articleInfo.content; }; """ self.js_body += js_content_fn def _add_js_pub_time(self): js_pub_time_fn = """ function r_pub_time() { return BASE_DATA.articleInfo.subInfo.time; }; """ self.js_body += js_pub_time_fn def _add_js_tags_fn(self): 
        js_tags_fn = """
            function r_tags() {
                return BASE_DATA.articleInfo.tagInfo.tags;
            };
        """
        self.js_body += js_tags_fn

    def parse_js_item_id(self):
        return self.ctx.call('r_item_id') or ''

    def parse_js_title(self):
        return self.ctx.call('r_title') or ''

    def parse_js_abstract(self):
        return self.ctx.call('r_abstract') or ''

    def parse_js_content(self):
        return un_escape(self.ctx.call('r_content')) or ''

    def parse_js_pub_time(self):
        return self.ctx.call('r_pub_time') or time.strftime('%Y-%m-%d %H:%M:%S')

    def parse_js_tags(self):
        return ','.join([tag['name'] or '' for tag in self.ctx.call('r_tags')])


if __name__ == '__main__':
    print(get_as_cp())


================================================
FILE: tools/url.py
================================================

#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: url.py
@time: 2018-02-10 17:38
"""

# from urllib import urlencode  # PY2
# from urlparse import urlparse, urlunparse, parse_qsl  # PY2
# from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode  # PY3
from future.moves.urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

from scrapy.utils import request
from scrapy.http import Request


def get_update_url(url, data):
    """
    Rebuild a url with updated query parameters
    :param url:
    :param data:
    :return:
    """
    result = urlparse(url)
    query_payload = dict(parse_qsl(result.query), **data)
    query_param = urlencode(query_payload)
    return urlunparse((result.scheme, result.netloc, result.path, result.params, query_param, result.fragment))


def get_url_query_param(url, param):
    """
    Get the value of a url query parameter
    :param url:
    :param param:
    :return:
    """
    result = urlparse(url)
    return dict(parse_qsl(result.query)).get(param)


def get_request_finger(url):
    """
    Get the request fingerprint of a url (query parameter order does not matter)
    :param url:
    :return:
    """
    req = Request(url=url)
    return request.request_fingerprint(req)


def allow_url(url, allow_domains):
    url_parse = urlparse(url)
    result = False
    for domain in allow_domains:
        if url_parse.netloc.endswith(domain):
            result = True
    return result


if __name__ == '__main__':
    print(get_update_url('http://www.abc.com/def/', {'b': 2}))
    print(get_update_url('http://www.abc.com/def/?a=1', {'b': 2}))
    print(get_update_url('http://www.abc.com/def/?a=1', {'a': 2}))
    print(get_url_query_param('http://www.abc.com/def/?a=1&b=2', 'a'))
    print(allow_url('http://www.abc.com', ['abc.com']))
    print(allow_url('http://www.abc.com', ['b.com']))


================================================
FILE: tools/weibo.py
================================================

#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: weibo.py
@time: 2018-02-13 16:20
"""

import base64

# from urllib import quote  # PY2
# from urllib.parse import quote  # PY3
from future.moves.urllib.parse import quote
from future.builtins import input  # PY2(raw_input)


def get_su(user_name):
    # quote() returns text; b64encode requires bytes on Python 3, so encode first and decode back to text
    return base64.b64encode(quote(user_name.strip()).encode('utf-8')).decode('utf-8')


def get_login_data():
    print('Please type username and password!')
    username = input('username < ')
    password = input('password < ')
    if not (username and password):
        raise Exception('Method or function hasn\'t been implemented yet.')
    return {
        'username': username,
        'password': password
    }


if __name__ == '__main__':
    pass


================================================
FILE: tools/weixin.py
================================================

#!/usr/bin/env python
# encoding: utf-8

"""
@author: zhanghe
@software: PyCharm
@file: weixin.py
@time: 2018-02-10 17:55
"""

import re
import time
import hashlib

# from urlparse import urljoin  # PY2
# from urllib.parse import urljoin  # PY3
from future.moves.urllib.parse import urljoin

import execjs

from tools.char import un_escape
from config import current_config
from models.news import FetchResult
from news.items import FetchResultItem
from apps.client_db import db_session_mysql
from maps.platform import WEIXIN, WEIBO

BASE_DIR = current_config.BASE_DIR


def get_finger(content_str):
    """
    :param content_str:
    :return:
    """
    m = hashlib.md5()
    # encode text to bytes before hashing (works on both Python 2 and Python 3)
    m.update(content_str if isinstance(content_str, bytes) else content_str.encode('utf-8'))
    finger = m.hexdigest()
    return finger


def parse_weixin_js_body(html_body, url=''):
    """
    Parse the js block
    :param html_body:
    :param url:
    :return:
    """
    rule = r''
    js_list = re.compile(rule, re.S).findall(html_body)
    if not js_list:
        print('parse error url: %s' % url)
    return ''.join(js_list)


def parse_weixin_article_id(html_body):
    rule = r'