Repository: jhao104/proxy_pool Branch: master Commit: 50cc52ea50da Files: 59 Total size: 98.2 KB Directory structure: gitextract_arxmexr3/ ├── .github/ │ └── workflows/ │ ├── docker-image-latest.yml │ └── docker-image-tags.yml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── _config.yml ├── api/ │ ├── __init__.py │ └── proxyApi.py ├── db/ │ ├── __init__.py │ ├── dbClient.py │ ├── redisClient.py │ └── ssdbClient.py ├── docker-compose.yml ├── docs/ │ ├── Makefile │ ├── changelog.rst │ ├── conf.py │ ├── dev/ │ │ ├── ext_fetcher.rst │ │ ├── ext_validator.rst │ │ └── index.rst │ ├── index.rst │ ├── make.bat │ └── user/ │ ├── how_to_config.rst │ ├── how_to_run.rst │ ├── how_to_use.rst │ └── index.rst ├── fetcher/ │ ├── __init__.py │ └── proxyFetcher.py ├── handler/ │ ├── __init__.py │ ├── configHandler.py │ ├── logHandler.py │ └── proxyHandler.py ├── helper/ │ ├── __init__.py │ ├── check.py │ ├── fetch.py │ ├── launcher.py │ ├── proxy.py │ ├── scheduler.py │ └── validator.py ├── proxyPool.py ├── requirements.txt ├── setting.py ├── start.sh ├── test/ │ ├── __init__.py │ ├── testConfigHandler.py │ ├── testDbClient.py │ ├── testLogHandler.py │ ├── testProxyClass.py │ ├── testProxyFetcher.py │ ├── testProxyValidator.py │ ├── testRedisClient.py │ └── testSsdbClient.py ├── test.py └── util/ ├── __init__.py ├── lazyProperty.py ├── singleton.py ├── six.py └── webRequest.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/docker-image-latest.yml ================================================ name: Publish Docker image latest on: push: branches: - 'master' jobs: push_to_registry: name: Push Docker image to Docker Hub runs-on: ubuntu-latest steps: - name: Check out the repo uses: actions/checkout@v2 - name: Log in to Docker Hub uses: docker/login-action@v1 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ 
secrets.DOCKERHUB_TOKEN }} - name: Extract metadata (tags, labels) for Docker id: meta uses: docker/metadata-action@v3 with: images: jhao104/proxy_pool - name: Build and push Docker image uses: docker/build-push-action@v2 with: context: . push: true tags: jhao104/proxy_pool:latest ================================================ FILE: .github/workflows/docker-image-tags.yml ================================================ name: Publish Docker image tags on: push: tags: - '*' jobs: push_to_registry: name: Push Docker image to Docker Hub runs-on: ubuntu-latest steps: - name: Check out the repo uses: actions/checkout@v2 - name: Log in to Docker Hub uses: docker/login-action@v1 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Extract metadata (tags, labels) for Docker id: meta uses: docker/metadata-action@v3 with: images: jhao104/proxy_pool - name: Build and push Docker image uses: docker/build-push-action@v2 with: context: . push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} ================================================ FILE: .gitignore ================================================ .idea/ docs/_build *.pyc *.log ================================================ FILE: .travis.yml ================================================ language: python python: - "2.7" - "3.5" - "3.6" - "3.7" - "3.8" - "3.9" - "3.10" - "3.11" os: - linux install: - pip install -r requirements.txt script: python test.py ================================================ FILE: Dockerfile ================================================ FROM python:3.6-alpine MAINTAINER jhao104 WORKDIR /app COPY ./requirements.txt . 
# apk repository RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositories # timezone RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata # runtime environment RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ pip install --no-cache-dir -r requirements.txt && \ apk del gcc musl-dev COPY . . EXPOSE 5010 ENTRYPOINT [ "sh", "start.sh" ] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 J_hao104 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ ProxyPool 爬虫代理IP池 ======= [![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) [![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) ______ ______ _ | ___ \_ | ___ \ | | | |_/ / \__ __ __ _ __ _ | |_/ /___ ___ | | | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____\ __ / / /___ / ### ProxyPool 爬虫代理IP池项目,主要功能为定时采集网上发布的免费代理验证入库,定时验证入库的代理保证代理的可用性,提供API和CLI两种使用方式。同时你也可以扩展代理源以增加代理池IP的质量和数量。 * 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest) * 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/) [![](https://img.shields.io/badge/Python-3.5-blue.svg)](https://docs.python.org/3.5/) [![](https://img.shields.io/badge/Python-3.6-blue.svg)](https://docs.python.org/3.6/) [![](https://img.shields.io/badge/Python-3.7-blue.svg)](https://docs.python.org/3.7/) [![](https://img.shields.io/badge/Python-3.8-blue.svg)](https://docs.python.org/3.8/) [![](https://img.shields.io/badge/Python-3.9-blue.svg)](https://docs.python.org/3.9/) [![](https://img.shields.io/badge/Python-3.10-blue.svg)](https://docs.python.org/3.10/) 
[![](https://img.shields.io/badge/Python-3.11-blue.svg)](https://docs.python.org/3.11/) * 测试地址: http://demo.spiderpy.cn (勿压谢谢) * 付费代理推荐: [luminati-china](https://get.brightdata.com/github_jh). 国外的亮数据BrightData(以前叫luminati)被认为是代理市场领导者,覆盖全球的7200万IP,大部分是真人住宅IP,成功率扛扛的。付费套餐多种,需要高质量代理IP的可以注册后联系中文客服。[申请免费试用](https://get.brightdata.com/github_jh) 目前有50%折扣优惠活动。(PS:用不明白的同学可以参考这个[使用教程](https://www.cnblogs.com/jhao/p/15611785.html))。 ### 运行项目 ##### 下载代码: * git clone ```bash git clone git@github.com:jhao104/proxy_pool.git ``` * releases ```bash https://github.com/jhao104/proxy_pool/releases 下载对应zip文件 ``` ##### 安装依赖: ```bash pip install -r requirements.txt ``` ##### 更新配置: ```python # setting.py 为项目配置文件 # 配置API服务 HOST = "0.0.0.0" # IP PORT = 5000 # 监听端口 # 配置数据库 DB_CONN = 'redis://:pwd@127.0.0.1:8888/0' # 配置 ProxyFetcher PROXY_FETCHER = [ "freeProxy01", # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py "freeProxy02", # .... ] ``` #### 启动项目: ```bash # 如果已经具备运行条件, 可以通过proxyPool.py启动。 # 程序分为: schedule 调度程序 和 server Api服务 # 启动调度程序 python proxyPool.py schedule # 启动webApi服务 python proxyPool.py server ``` ### Docker Image ```bash docker pull jhao104/proxy_pool docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest ``` ### docker-compose 项目目录下运行: ``` bash docker-compose up -d ``` ### 使用 * Api 启动web服务后, 默认配置下会开启 http://127.0.0.1:5010 的api接口服务: | api | method | Description | params| | ----| ---- | ---- | ----| | / | GET | api介绍 | None | | /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理| | /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理| | /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理| | /count | GET | 查看代理数量 |None| | /delete | GET | 删除代理 |`?proxy=host:port`| * 爬虫使用   如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: ```python import requests def get_proxy(): return requests.get("http://127.0.0.1:5010/get/").json() def delete_proxy(proxy): requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) # your spider code def getHtml(): # 
.... retry_count = 5 proxy = get_proxy().get("proxy") while retry_count > 0: try: html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) # 使用代理访问 return html except Exception: retry_count -= 1 # 删除代理池中代理 delete_proxy(proxy) return None ``` ### 扩展代理   项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。   添加一个新的代理源方法如下: * 1、首先在[ProxyFetcher](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L21)类中添加自定义的获取代理的静态方法, 该方法需要以生成器(yield)形式返回`host:port`格式的代理,例如: ```python class ProxyFetcher(object): # .... # 自定义代理源获取方法 @staticmethod def freeProxyCustom1(): # 命名不和已有重复即可 # 通过某网站或者某接口或某数据库获取代理 # 假设你已经拿到了一个代理列表 proxies = ["x.x.x.x:3128", "x.x.x.x:80"] for proxy in proxies: yield proxy # 确保每个proxy都以正确的 host:port 格式返回 ``` * 2、添加好方法后,修改[setting.py](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项:   在`PROXY_FETCHER`下添加自定义方法的名字: ```python PROXY_FETCHER = [ "freeProxy01", "freeProxy02", # .... 
"freeProxyCustom1" # 确保名字和你添加方法名字一致 ] ```   `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 ### 免费代理源 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): | 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 | |---------------| ---- | -------- | ------ | ----- |------------------------------------------------| | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | | 冰凌代理 | ✔ | ★★★ | * | [地址](https://www.binglx.cn/) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L123) | | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L133) | | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L143) | | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L154) | | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.net) | [`freeProxy11`](/fetcher/proxyFetcher.py#L164) | 如果还有其他好的免费代理网站, 可以提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 ### 问题反馈   任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,同时也可以到我的[博客](http://www.spiderpy.cn/blog/message)中留言。   你的反馈会让此项目变得更加完美。 ### 贡献代码   本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。   本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。   这里感谢以下contributor的无私奉献:   [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | 
[@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | [@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) ### Release Notes [changelog](https://github.com/jhao104/proxy_pool/blob/master/docs/changelog.rst) Featured|HelloGitHub ================================================ FILE: _config.yml ================================================ theme: jekyll-theme-cayman ================================================ FILE: api/__init__.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: __init__.py Description : Author : JHao date: 2016/12/3 ------------------------------------------------- Change Activity: 2016/12/3: ------------------------------------------------- """ __author__ = 'JHao' ================================================ FILE: api/proxyApi.py ================================================ # -*- coding: utf-8 -*- # !/usr/bin/env python """ ------------------------------------------------- File Name: ProxyApi.py Description : WebApi Author : JHao date: 2016/12/4 ------------------------------------------------- Change Activity: 2016/12/04: WebApi 2019/08/14: 集成Gunicorn启动方式 2020/06/23: 新增pop接口 2022/07/21: 更新count接口 ------------------------------------------------- """ __author__ = 'JHao' import platform from werkzeug.wrappers import Response from flask import Flask, jsonify, request from util.six import 
iteritems from helper.proxy import Proxy from handler.proxyHandler import ProxyHandler from handler.configHandler import ConfigHandler app = Flask(__name__) conf = ConfigHandler() proxy_handler = ProxyHandler() class JsonResponse(Response): @classmethod def force_type(cls, response, environ=None): if isinstance(response, (dict, list)): response = jsonify(response) return super(JsonResponse, cls).force_type(response, environ) app.response_class = JsonResponse api_list = [ {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"}, {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, {"url": "/count", "params": "", "desc": "return proxy count"} # 'refresh': 'refresh proxy pool', ] @app.route('/') def index(): return {'url': api_list} @app.route('/get/') def get(): https = request.args.get("type", "").lower() == 'https' proxy = proxy_handler.get(https) return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} @app.route('/pop/') def pop(): https = request.args.get("type", "").lower() == 'https' proxy = proxy_handler.pop(https) return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} @app.route('/refresh/') def refresh(): # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 return 'success' @app.route('/all/') def getAll(): https = request.args.get("type", "").lower() == 'https' proxies = proxy_handler.getAll(https) return jsonify([_.to_dict for _ in proxies]) @app.route('/delete/', methods=['GET']) def delete(): proxy = request.args.get('proxy') status = proxy_handler.delete(Proxy(proxy)) return {"code": 0, "src": status} @app.route('/count/') def getCount(): proxies = proxy_handler.getAll() http_type_dict = {} source_dict = {} for proxy in proxies: http_type = 'https' if proxy.https else 'http' http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 
1 for source in proxy.source.split('/'): source_dict[source] = source_dict.get(source, 0) + 1 return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)} def runFlask(): if platform.system() == "Windows": app.run(host=conf.serverHost, port=conf.serverPort) else: import gunicorn.app.base class StandaloneApplication(gunicorn.app.base.BaseApplication): def __init__(self, app, options=None): self.options = options or {} self.application = app super(StandaloneApplication, self).__init__() def load_config(self): _config = dict([(key, value) for key, value in iteritems(self.options) if key in self.cfg.settings and value is not None]) for key, value in iteritems(_config): self.cfg.set(key.lower(), value) def load(self): return self.application _options = { 'bind': '%s:%s' % (conf.serverHost, conf.serverPort), 'workers': 4, 'accesslog': '-', # log to stdout 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"' } StandaloneApplication(app, _options).run() if __name__ == '__main__': runFlask() ================================================ FILE: db/__init__.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: __init__.py.py Description : Author : JHao date: 2016/12/2 ------------------------------------------------- Change Activity: 2016/12/2: ------------------------------------------------- """ ================================================ FILE: db/dbClient.py ================================================ # -*- coding: utf-8 -*- # !/usr/bin/env python """ ------------------------------------------------- File Name: DbClient.py Description : DB工厂类 Author : JHao date: 2016/12/2 ------------------------------------------------- Change Activity: 2016/12/02: DB工厂类 2020/07/03: 取消raw_proxy储存 ------------------------------------------------- """ __author__ = 'JHao' import os import sys from util.six import urlparse, withMetaclass from util.singleton import 
Singleton sys.path.append(os.path.dirname(os.path.abspath(__file__))) class DbClient(withMetaclass(Singleton)): """ DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法 抽象方法定义: get(): 随机返回一个proxy; put(proxy): 存入一个proxy; pop(): 顺序返回并删除一个proxy; update(proxy): 更新指定proxy信息; delete(proxy): 删除指定proxy; exists(proxy): 判断指定proxy是否存在; getAll(): 返回所有代理; clean(): 清除所有proxy信息; getCount(): 返回proxy统计信息; changeTable(name): 切换操作对象 所有方法需要相应类去具体实现: ssdb: ssdbClient.py redis: redisClient.py mongodb: mongodbClient.py """ def __init__(self, db_conn): """ init :return: """ self.parseDbConn(db_conn) self.__initDbClient() @classmethod def parseDbConn(cls, db_conn): db_conf = urlparse(db_conn) cls.db_type = db_conf.scheme.upper().strip() cls.db_host = db_conf.hostname cls.db_port = db_conf.port cls.db_user = db_conf.username cls.db_pwd = db_conf.password cls.db_name = db_conf.path[1:] return cls def __initDbClient(self): """ init DB Client :return: """ __type = None if "SSDB" == self.db_type: __type = "ssdbClient" elif "REDIS" == self.db_type: __type = "redisClient" else: pass assert __type, 'type error, Not support DB type: {}'.format(self.db_type) self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host, port=self.db_port, username=self.db_user, password=self.db_pwd, db=self.db_name) def get(self, https, **kwargs): return self.client.get(https, **kwargs) def put(self, key, **kwargs): return self.client.put(key, **kwargs) def update(self, key, value, **kwargs): return self.client.update(key, value, **kwargs) def delete(self, key, **kwargs): return self.client.delete(key, **kwargs) def exists(self, key, **kwargs): return self.client.exists(key, **kwargs) def pop(self, https, **kwargs): return self.client.pop(https, **kwargs) def getAll(self, https): return self.client.getAll(https) def clear(self): return self.client.clear() def changeTable(self, name): self.client.changeTable(name) def getCount(self): return 
self.client.getCount() def test(self): return self.client.test() ================================================ FILE: db/redisClient.py ================================================ # -*- coding: utf-8 -*- """ ----------------------------------------------------- File Name: redisClient.py Description : 封装Redis相关操作 Author : JHao date: 2019/8/9 ------------------------------------------------------ Change Activity: 2019/08/09: 封装Redis相关操作 2020/06/23: 优化pop方法, 改用hscan命令 2021/05/26: 区别http/https代理 ------------------------------------------------------ """ __author__ = 'JHao' from redis.exceptions import TimeoutError, ConnectionError, ResponseError from redis.connection import BlockingConnectionPool from handler.logHandler import LogHandler from random import choice from redis import Redis import json class RedisClient(object): """ Redis client Redis中代理存放的结构为hash: key为ip:port, value为代理属性的字典; """ def __init__(self, **kwargs): """ init :param host: host :param port: port :param password: password :param db: db :return: """ self.name = "" kwargs.pop("username") self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, timeout=5, socket_timeout=5, **kwargs)) def get(self, https): """ 返回一个代理 :return: """ if https: items = self.__conn.hvals(self.name) proxies = list(filter(lambda x: json.loads(x).get("https"), items)) return choice(proxies) if proxies else None else: proxies = self.__conn.hkeys(self.name) proxy = choice(proxies) if proxies else None return self.__conn.hget(self.name, proxy) if proxy else None def put(self, proxy_obj): """ 将代理放入hash, 使用changeTable指定hash name :param proxy_obj: Proxy obj :return: """ data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) return data def pop(self, https): """ 弹出一个代理 :return: dict {proxy: value} """ proxy = self.get(https) if proxy: self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) return proxy if proxy else None def delete(self, proxy_str): """ 移除指定代理, 使用changeTable指定hash 
name :param proxy_str: proxy str :return: """ return self.__conn.hdel(self.name, proxy_str) def exists(self, proxy_str): """ 判断指定代理是否存在, 使用changeTable指定hash name :param proxy_str: proxy str :return: """ return self.__conn.hexists(self.name, proxy_str) def update(self, proxy_obj): """ 更新 proxy 属性 :param proxy_obj: :return: """ return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) def getAll(self, https): """ 字典形式返回所有代理, 使用changeTable指定hash name :return: """ items = self.__conn.hvals(self.name) if https: return list(filter(lambda x: json.loads(x).get("https"), items)) else: return items def clear(self): """ 清空所有代理, 使用changeTable指定hash name :return: """ return self.__conn.delete(self.name) def getCount(self): """ 返回代理数量 :return: """ proxies = self.getAll(https=False) return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} def changeTable(self, name): """ 切换操作对象 :param name: :return: """ self.name = name def test(self): log = LogHandler('redis_client') try: self.getCount() except TimeoutError as e: log.error('redis connection time out: %s' % str(e), exc_info=True) return e except ConnectionError as e: log.error('redis connection error: %s' % str(e), exc_info=True) return e except ResponseError as e: log.error('redis connection error: %s' % str(e), exc_info=True) return e ================================================ FILE: db/ssdbClient.py ================================================ # -*- coding: utf-8 -*- # !/usr/bin/env python """ ------------------------------------------------- File Name: ssdbClient.py Description : 封装SSDB操作 Author : JHao date: 2016/12/2 ------------------------------------------------- Change Activity: 2016/12/2: 2017/09/22: PY3中 redis-py返回的数据是bytes型 2017/09/27: 修改pop()方法 返回{proxy:value}字典 2020/07/03: 2.1.0 优化代码结构 2021/05/26: 区分http和https代理 ------------------------------------------------- """ __author__ = 'JHao' from redis.exceptions import TimeoutError, ConnectionError, 
ResponseError from redis.connection import BlockingConnectionPool from handler.logHandler import LogHandler from random import choice from redis import Redis import json class SsdbClient(object): """ SSDB client SSDB中代理存放的结构为hash: key为代理的ip:por, value为代理属性的字典; """ def __init__(self, **kwargs): """ init :param host: host :param port: port :param password: password :return: """ self.name = "" kwargs.pop("username") self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, timeout=5, socket_timeout=5, **kwargs)) def get(self, https): """ 从hash中随机返回一个代理 :return: """ if https: items_dict = self.__conn.hgetall(self.name) proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values())) return choice(proxies) if proxies else None else: proxies = self.__conn.hkeys(self.name) proxy = choice(proxies) if proxies else None return self.__conn.hget(self.name, proxy) if proxy else None def put(self, proxy_obj): """ 将代理放入hash :param proxy_obj: Proxy obj :return: """ result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) return result def pop(self, https): """ 顺序弹出一个代理 :return: proxy """ proxy = self.get(https) if proxy: self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) return proxy if proxy else None def delete(self, proxy_str): """ 移除指定代理, 使用changeTable指定hash name :param proxy_str: proxy str :return: """ self.__conn.hdel(self.name, proxy_str) def exists(self, proxy_str): """ 判断指定代理是否存在, 使用changeTable指定hash name :param proxy_str: proxy str :return: """ return self.__conn.hexists(self.name, proxy_str) def update(self, proxy_obj): """ 更新 proxy 属性 :param proxy_obj: :return: """ self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) def getAll(self, https): """ 字典形式返回所有代理, 使用changeTable指定hash name :return: """ item_dict = self.__conn.hgetall(self.name) if https: return list(filter(lambda x: json.loads(x).get("https"), item_dict.values())) else: return item_dict.values() def clear(self): """ 清空所有代理, 
使用changeTable指定hash name :return: """ return self.__conn.delete(self.name) def getCount(self): """ 返回代理数量 :return: """ proxies = self.getAll(https=False) return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} def changeTable(self, name): """ 切换操作对象 :param name: :return: """ self.name = name def test(self): log = LogHandler('ssdb_client') try: self.getCount() except TimeoutError as e: log.error('ssdb connection time out: %s' % str(e), exc_info=True) return e except ConnectionError as e: log.error('ssdb connection error: %s' % str(e), exc_info=True) return e except ResponseError as e: log.error('ssdb connection error: %s' % str(e), exc_info=True) return e ================================================ FILE: docker-compose.yml ================================================ version: '2' services: proxy_pool: build: . container_name: proxy_pool ports: - "5010:5010" links: - proxy_redis environment: DB_CONN: "redis://@proxy_redis:6379/0" proxy_redis: image: "redis" container_name: proxy_redis ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/changelog.rst ================================================ .. _changelog: ChangeLog ========== 2.4.2 (2024-01-18) ------------------ 1. 
代理格式检查支持需认证的代理格式 `username:password@ip:port` ; (2023-03-10) 2. 新增代理源 **稻壳代理**; (2023-05-15) 3. 新增代理源 **冰凌代理**; (2023-01-18) 2.4.1 (2022-07-17) ------------------ 1. 新增代理源 **FreeProxyList**; (2022-07-21) 2. 新增代理源 **FateZero**; (2022-08-01) 3. 新增代理属性 ``region``; (2022-08-16) 2.4.0 (2021-11-17) ------------------ 1. 移除无效代理源 **神鸡代理**; (2021-11-16) 2. 移除无效代理源 **极速代理**; (2021-11-16) 3. 移除代理源 **西拉代理**; (2021-11-16) 4. 新增代理源 **蝶鸟IP**; (2021-11-16) 5. 新增代理源 **PROXY11**; (2021-11-16) 6. 多线程采集代理; (2021-11-17) 2.3.0 (2021-05-27) ------------------ 1. 修复Dockerfile时区问题; (2021-04-12) 2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13) 3. 新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27) 2.2.0 (2021-04-08) ------------------ 1. 启动时检查数据库连通性; 2. 新增免费代理源 **米扑代理**; 3. 新增免费代理源 **Pzzqz**; 4. 新增免费代理源 **神鸡代理**; 5. 新增免费代理源 **极速代理**; 6. 新增免费代理源 **小幻代理**; 2.1.1 (2021-02-23) ------------------ 1. Fix Bug `#493`_, 新增时区配置; (2020-08-12) 2. 修复 **66代理** 采集; (2020-11-04) 3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) 4. 新增 **代理盒子** 免费源; (2020-11-04) 5. 新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) .. _#493: https://github.com/jhao104/proxy_pool/issues/493 2.1.0 (2020.07) ------------------ 1. 新增免费代理源 **西拉代理** (2020-03-30) 2. Fix Bug `#356`_ `#401`_ 3. 优化Docker镜像体积; (2020-06-19) 4. 优化配置方式; 5. 优化代码结构; 6. 不再储存raw_proxy, 抓取后直接验证入库; .. _#401: https://github.com/jhao104/proxy_pool/issues/401 .. _#356: https://github.com/jhao104/proxy_pool/issues/356 2.0.1 (2019.10) ----------------- 1. 新增免费代理源 **89免费代理**; #. 新增免费代理源 **齐云代理** 2.0.0 (2019.08) ------------------ 1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; #. 优化Proxy调度程序; #. 扩展Proxy属性; #. 新增cli工具, 更加方便启动proxyPool 1.14 (2019.07) ----------------- 1. 修复 Queue阻塞导致的 ``ProxyValidSchedule`` 假死bug; #. 修改代理源 **云代理** 抓取; #. 修改代理源 **码农代理** 抓取; #. 修改代理源 **代理66** 抓取, 引入 ``PyExecJS`` 模块破解加速乐动态Cookies加密; 1.13 (2019.02) ----------------- 1. 使用.py文件替换.ini作为配置文件; #. 优化代理采集部分; 1.12 (2018.04) ----------------- 1. 优化代理格式检查; #. 增加代理源; #. 
fix bug `#122`_ `#126`_ .. _#122: https://github.com/jhao104/proxy_pool/issues/122 .. _#126: https://github.com/jhao104/proxy_pool/issues/126 1.11 (2017.08) ----------------- 1. 使用多线程验证useful_pool; 1.10 (2016.11) ----------------- 1. 第一版; #. 支持PY2/PY3; #. 代理池基本功能; ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) import sphinx_rtd_theme # -- Project information ----------------------------------------------------- project = 'ProxyPool' copyright = '2020, jhao104' author = 'jhao104' master_doc = 'index' # The full version, including alpha/beta/rc tags release = '2.1.0' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ ] # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
# # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = 'zh_CN' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] ================================================ FILE: docs/dev/ext_fetcher.rst ================================================ .. ext_fetcher 扩展代理源 ----------- 项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 如果要添加一个新的代理获取方法, 过程如下: 1. 首先在 `ProxyFetcher`_ 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 ``host:port`` 格式的代理字符串, 例如: .. code-block:: python class ProxyFetcher(object): # .... # 自定义代理源获取方法 @staticmethod def freeProxyCustom01(): # 命名不和已有重复即可 # 通过某网站或者某接口或某数据库获取代理 # 假设你已经拿到了一个代理列表 proxies = ["x.x.x.x:3128", "x.x.x.x:80"] for proxy in proxies: yield proxy # 确保每个proxy都以正确的 host:port 格式返回 2. 添加好方法后,修改配置文件 `setting.py`_ 中的 ``PROXY_FETCHER`` 项, 加入刚才添加的自定义方法的名字: .. code-block:: python PROXY_FETCHER = [ # .... "freeProxyCustom01" # # 确保名字和你添加方法名字一致 ] .. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20 .. 
_setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47 ================================================ FILE: docs/dev/ext_validator.rst ================================================ .. ext_validator 代理校验 ----------- 内置校验 >>>>>>>>> 项目中使用的代理校验方法全部定义在 `validator.py`_ 中, 通过 `ProxyValidator`_ 类中提供的装饰器来区分。校验方法返回 ``True`` 表示 校验通过, 返回 ``False`` 表示校验不通过。 * 代理校验方法分为三类: ``preValidator`` 、 ``httpValidator`` 、 ``httpsValidator``: * **preValidator**: 预校验,在代理抓取后验证前调用,目前实现了 `formatValidator`_ 校验代理IP格式是否合法; * **httpValidator**: 代理可用性校验,通过则认为代理可用, 目前实现了 `httpTimeOutValidator`_ 校验; * **httpsValidator**: 校验代理是否支持https,目前实现了 `httpsTimeOutValidator`_ 校验。 .. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py .. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29 .. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51 .. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58 .. _httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71 每种校验可以定义多个方法,只有 **所有** 方法都返回 ``True`` 的情况下才视为该校验通过,校验方法执行顺序为: 先执行 **httpValidator** , 前者通过后再执行 **httpsValidator** 。 只有 ``preValidator`` 校验通过的代理才会进入可用性校验, ``httpValidator`` 校验通过后认为代理可用准备更新入代理池, ``httpsValidator`` 校验通过后视为代理支持https, 更新代理的 ``https`` 属性为 ``True`` 。 扩展校验 >>>>>>>>> 在 `validator.py`_ 已有自定义校验的示例,自定义函数需返回True或者False,使用 `ProxyValidator`_ 中提供的装饰器来区分校验类型。 下面是两个例子: * 1. 自定义一个代理可用性的校验(``addHttpValidator``): .. code-block:: python @ProxyValidator.addHttpValidator def customValidatorExample01(proxy): """自定义代理可用性校验函数""" proxies = {"http": "http://{proxy}".format(proxy=proxy)} try: r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) return True if r.status_code == 200 and len(r.content) > 200 else False except Exception as e: return False * 2. 
自定义一个代理是否支持https的校验(``addHttpsValidator``): .. code-block:: python @ProxyValidator.addHttpsValidator def customValidatorExample02(proxy): """自定义代理是否支持https校验函数""" proxies = {"https": "https://{proxy}".format(proxy=proxy)} try: r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) return True if r.status_code == 200 and len(r.content) > 200 else False except Exception as e: return False 注意,比如在运行代理可用性校验时,所有被 ``ProxyValidator.addHttpValidator`` 装饰的函数会被依次按定义顺序执行,只有当所有函数都返回True时才会判断代理可用。 ``HttpsValidator`` 运行机制也是如此。 ================================================ FILE: docs/dev/index.rst ================================================ ========= 开发指南 ========= .. module:: dev .. toctree:: :maxdepth: 2 ext_fetcher ext_validator ================================================ FILE: docs/index.rst ================================================ .. ProxyPool documentation master file, created by sphinx-quickstart on Wed Jul 8 16:13:42 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. ProxyPool ===================================== :: **************************************************************** *** ______ ********************* ______ *********** _ ******** *** | ___ \_ ******************** | ___ \ ********* | | ******** *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** **** __ / / ***** ************************* /___ / ******************************* ************************* ******************************** **************************************************************** Python爬虫代理IP池 安装 ----- * 下载代码 .. code-block:: console $ git clone git@github.com:jhao104/proxy_pool.git * 安装依赖 .. code-block:: console $ pip install -r requirements.txt * 更新配置 .. 
code-block:: python HOST = "0.0.0.0" PORT = 5000 DB_CONN = 'redis://@127.0.0.1:8888' PROXY_FETCHER = [ "freeProxy01", "freeProxy02", # .... ] * 启动项目 .. code-block:: console $ python proxyPool.py schedule $ python proxyPool.py server 使用 ______ * API ============ ======== ================ ============== Api Method Description Params ============ ======== ================ ============== / GET API介绍 无 /get GET 返回一个代理 可选参数: `?type=https` 过滤支持https的代理 /pop GET 返回并删除一个代理 可选参数: `?type=https` 过滤支持https的代理 /all GET 返回所有代理 可选参数: `?type=https` 过滤支持https的代理 /count GET 返回代理数量 无 /delete GET 删除指定代理 `?proxy=host:ip` ============ ======== ================ ============== * 爬虫 .. code-block:: python import requests def get_proxy(): return requests.get("http://127.0.0.1:5010/get?type=https").json() def delete_proxy(proxy): requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) # your spider code def getHtml(): # .... retry_count = 5 proxy = get_proxy().get("proxy") while retry_count > 0: try: html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)}) # 使用代理访问 return html except Exception: retry_count -= 1 # 删除代理池中代理 delete_proxy(proxy) return None Contents -------- .. toctree:: :maxdepth: 2 user/index dev/index changelog ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. 
echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/user/how_to_config.rst ================================================ .. how_to_config 配置参考 --------- 配置文件 ``setting.py`` 位于项目的主目录下, 配置主要分为四类: **服务配置** 、 **数据库配置** 、 **采集配置** 、 **校验配置**. 服务配置 >>>>>>>>> * ``HOST`` API服务监听的IP, 本机访问设置为 ``127.0.0.1``, 开启远程访问设置为: ``0.0.0.0``. * ``PORT`` API服务监听的端口. 数据库配置 >>>>>>>>>>> * ``DB_CONN`` 用于存放代理IP的数据库URI, 配置格式为: ``db_type://[[user]:[pwd]]@ip:port/[db]``. 目前支持的db_type有: ``ssdb`` 、 ``redis``. 配置示例: .. code-block:: python # SSDB IP: 127.0.0.1 Port: 8888 DB_CONN = 'ssdb://@127.0.0.1:8888' # SSDB IP: 127.0.0.1 Port: 8888 Password: 123456 DB_CONN = 'ssdb://:123456@127.0.0.1:8888' # Redis IP: 127.0.0.1 Port: 6379 DB_CONN = 'redis://@127.0.0.1:6379' # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB_CONN = 'redis://:123456@127.0.0.1:6379' # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15 DB_CONN = 'redis://:123456@127.0.0.1:6379/15' * ``TABLE_NAME`` 存放代理的数据载体名称, ssdb和redis的存放结构为hash. 采集配置 >>>>>>>>> * ``PROXY_FETCHER`` 启用的代理采集方法名, 代理采集方法位于 ``fetcher/proxyFetcher.py`` 的 ``ProxyFetcher`` 类中. 由于各个代理源的稳定性不容易掌握, 当某个代理采集方法失效时, 可以在该配置中注释掉其名称. 如果有增加某些代理采集方法, 也请在该配置中添加其方法名, 具体请参考 :doc:`/dev/ext_fetcher`. 调度程序每次执行采集任务时都会再次加载该配置, 保证每次运行的采集方法都是有效的. 校验配置 >>>>>>>>> * ``HTTP_URL`` 用于检验代理是否可用的地址, 默认为 ``http://httpbin.org``, 可根据使用场景修改为其他地址. * ``HTTPS_URL`` 用于检验代理是否支持HTTPS的地址, 默认为 ``https://www.qq.com``, 可根据使用场景修改为其他地址. * ``VERIFY_TIMEOUT`` 检验代理的超时时间, 默认为 ``10`` , 单位秒. 使用代理访问 ``HTTP(S)_URL`` 耗时超过 ``VERIFY_TIMEOUT`` 时, 视为代理不可用. * ``MAX_FAIL_COUNT`` 检验代理允许最大失败次数, 默认为 ``0``, 即出错一次即删除. * ``POOL_SIZE_MIN`` 代理检测定时任务运行前若代理数量小于 ``POOL_SIZE_MIN``, 则先运行抓取程序. 
================================================ FILE: docs/user/how_to_run.rst ================================================ .. how_to_run 如何运行 --------- 下载代码 >>>>>>>>> 本项目需要下载代码到本地运行, 通过 ``git`` 下载: .. code-block:: console $ git clone git@github.com:jhao104/proxy_pool.git 或者下载特定的 ``release`` 版本: .. code-block:: console https://github.com/jhao104/proxy_pool/releases 安装依赖 >>>>>>>>> 到项目目录下使用 ``pip`` 安装依赖库: .. code-block:: console $ pip install -r requirements.txt 更新配置 >>>>>>>>> 配置文件 ``setting.py`` 位于项目的主目录下: .. code-block:: python # 配置API服务 HOST = "0.0.0.0" # IP PORT = 5000 # 监听端口 # 配置数据库 DB_CONN = 'redis://@127.0.0.1:8888/0' # 配置 ProxyFetcher PROXY_FETCHER = [ "freeProxy01", # 这里是启用的代理抓取方法,所有fetch方法位于fetcher/proxyFetcher.py "freeProxy02", # .... ] 更多配置请参考 :doc:`/user/how_to_config` 启动项目 >>>>>>>>> 如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. 完整程序包含两部份: ``schedule`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. 通过命令行程序分别启动调度程序和API服务: .. code-block:: console # 启动调度程序 $ python proxyPool.py schedule # 启动webApi服务 $ python proxyPool.py server ================================================ FILE: docs/user/how_to_use.rst ================================================ .. how_to_use 如何使用 ---------- 爬虫代码要对接代理池目前有两种方式: 一是通过调用API接口使用, 二是直接读取数据库. 调用API >>>>>>>>> 启动ProxyPool的 ``server`` 后会提供如下几个http接口: ============ ======== ================ ============== Api Method Description Arg ============ ======== ================ ============== / GET API介绍 无 /get GET 随机返回一个代理 无 /get_all GET 返回所有代理 无 /get_status GET 返回代理数量 无 /delete GET 删除指定代理 proxy=host:ip ============ ======== ================ ============== 在代码中可以通过封装上面的API接口来使用代理, 例子: .. code-block:: python import requests def get_proxy(): return requests.get("http://127.0.0.1:5010/get/").json() def delete_proxy(proxy): requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) # your spider code def getHtml(): # .... 
retry_count = 5 proxy = get_proxy().get("proxy") while retry_count > 0: try: # 使用代理访问 html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) return html except Exception: retry_count -= 1 # 删除代理池中代理 delete_proxy(proxy) return None 本例中我们在本地 ``127.0.0.1`` 启动端口为 ``5010`` 的 ``server``, 使用 ``/get`` 接口获取代理, ``/delete`` 删除代理. 读数据库 >>>>>>>>> 目前支持配置两种数据库: ``REDIS`` 、 ``SSDB``. * **REDIS** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** * **SSDB** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 可以在代码中自行读取. ================================================ FILE: docs/user/index.rst ================================================ ========= 用户指南 ========= .. module:: user .. toctree:: :maxdepth: 2 how_to_run how_to_use how_to_config ================================================ FILE: fetcher/__init__.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: __init__.py Description : Author : JHao date: 2016/11/25 ------------------------------------------------- Change Activity: 2016/11/25: ------------------------------------------------- """ ================================================ FILE: fetcher/proxyFetcher.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: proxyFetcher Description : Author : JHao date: 2016/11/25 ------------------------------------------------- Change Activity: 2016/11/25: proxyFetcher ------------------------------------------------- """ __author__ = 'JHao' import re import json from time import sleep from util.webRequest import WebRequest class ProxyFetcher(object): """ proxy getter """ @staticmethod def freeProxy01(): """ 站大爷 https://www.zdaye.com/dayProxy.html """ start_url = "https://www.zdaye.com/dayProxy.html" html_tree = WebRequest().get(start_url, verify=False).tree latest_page_time = 
html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() from datetime import datetime interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") if interval.seconds < 300: # 只采集5分钟内的更新 target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() while target_url: _tree = WebRequest().get(target_url, verify=False).tree for tr in _tree.xpath("//table//tr"): ip = "".join(tr.xpath("./td[1]/text()")).strip() port = "".join(tr.xpath("./td[2]/text()")).strip() yield "%s:%s" % (ip, port) next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False sleep(5) @staticmethod def freeProxy02(): """ 代理66 http://www.66ip.cn/ """ url = "http://www.66ip.cn/" resp = WebRequest().get(url, timeout=10).tree for i, tr in enumerate(resp.xpath("(//table)[3]//tr")): if i > 0: ip = "".join(tr.xpath("./td[1]/text()")).strip() port = "".join(tr.xpath("./td[2]/text()")).strip() yield "%s:%s" % (ip, port) @staticmethod def freeProxy03(): """ 开心代理 """ target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"] for url in target_urls: tree = WebRequest().get(url).tree for tr in tree.xpath("//table[@class='active']//tr")[1:]: ip = "".join(tr.xpath('./td[1]/text()')).strip() port = "".join(tr.xpath('./td[2]/text()')).strip() yield "%s:%s" % (ip, port) @staticmethod def freeProxy04(): """ FreeProxyList https://www.freeproxylists.net/zh/ """ url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" tree = WebRequest().get(url, verify=False).tree from urllib import parse def parse_ip(input_str): html_str = parse.unquote(input_str) ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) return ips[0] if ips else None for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): ip = 
parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) port = "".join(tr.xpath('./td[2]/text()')).strip() if ip: yield "%s:%s" % (ip, port) @staticmethod def freeProxy05(page_count=1): """ 快代理 https://www.kuaidaili.com """ url_pattern = [ 'https://www.kuaidaili.com/free/inha/{}/', 'https://www.kuaidaili.com/free/intr/{}/' ] url_list = [] for page_index in range(1, page_count + 1): for pattern in url_pattern: url_list.append(pattern.format(page_index)) for url in url_list: tree = WebRequest().get(url).tree proxy_list = tree.xpath('.//table//tr') sleep(1) # 必须sleep 不然第二条请求不到数据 for tr in proxy_list[1:]: yield ':'.join(tr.xpath('./td/text()')[0:2]) @staticmethod def freeProxy06(): """ 冰凌代理 https://www.binglx.cn """ url = "https://www.binglx.cn/?page=1" try: tree = WebRequest().get(url).tree proxy_list = tree.xpath('.//table//tr') for tr in proxy_list[1:]: yield ':'.join(tr.xpath('./td/text()')[0:2]) except Exception as e: print(e) @staticmethod def freeProxy07(): """ 云代理 """ urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"] for url in urls: r = WebRequest().get(url, timeout=10) proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) for proxy in proxies: yield ":".join(proxy) @staticmethod def freeProxy08(): """ 小幻代理 """ urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html'] for url in urls: r = WebRequest().get(url, timeout=10) proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text) for proxy in proxies: yield ":".join(proxy) @staticmethod def freeProxy09(page_count=1): """ 免费代理库 """ for i in range(1, page_count + 1): url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) html_tree = WebRequest().get(url, verify=False).tree for index, tr in enumerate(html_tree.xpath("//table//tr")): if index == 0: continue yield ":".join(tr.xpath("./td/text()")[0:2]).strip() @staticmethod def freeProxy10(): """ 89免费代理 """ r = WebRequest().get("https://www.89ip.cn/index_1.html", 
timeout=10) proxies = re.findall( r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', r.text) for proxy in proxies: yield ':'.join(proxy) @staticmethod def freeProxy11(): """ 稻壳代理 https://www.docip.net/ """ r = WebRequest().get("https://www.docip.net/data/free.json", timeout=10) try: for each in r.json['data']: yield each['ip'] except Exception as e: print(e) # @staticmethod # def wallProxy01(): # """ # PzzQz https://pzzqz.com/ # """ # from requests import Session # from lxml import etree # session = Session() # try: # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) # if x_csrf_token: # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} # proxy_resp = session.post("https://pzzqz.com/", verify=False, # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() # tree = etree.HTML(proxy_resp["proxy_html"]) # for tr in tree.xpath("//tr"): # ip = "".join(tr.xpath("./td[1]/text()")) # port = "".join(tr.xpath("./td[2]/text()")) # yield "%s:%s" % (ip, port) # except Exception as e: # print(e) # @staticmethod # def freeProxy10(): # """ # 墙外网站 cn-proxy # :return: # """ # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] # request = WebRequest() # for url in urls: # r = request.get(url, timeout=10) # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) # for proxy in proxies: # yield ':'.join(proxy) # @staticmethod # def freeProxy11(): # """ # https://proxy-list.org/english/index.php # :return: # """ # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] # request = WebRequest() # import base64 # for url in urls: # r = request.get(url, timeout=10) # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) # for proxy in proxies: # yield base64.b64decode(proxy).decode() # @staticmethod # def freeProxy12(): # urls = 
['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] # request = WebRequest() # for url in urls: # r = request.get(url, timeout=10) # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) # for proxy in proxies: # yield ':'.join(proxy) if __name__ == '__main__': p = ProxyFetcher() for _ in p.freeProxy06(): print(_) # http://nntime.com/proxy-list-01.htm ================================================ FILE: handler/__init__.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: __init__.py Description : Author : JHao date: 2016/12/3 ------------------------------------------------- Change Activity: 2016/12/3: ------------------------------------------------- """ __author__ = 'JHao' # from handler.ProxyManager import ProxyManager ================================================ FILE: handler/configHandler.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: configHandler Description : Author : JHao date: 2020/6/22 ------------------------------------------------- Change Activity: 2020/6/22: ------------------------------------------------- """ __author__ = 'JHao' import os import setting from util.singleton import Singleton from util.lazyProperty import LazyProperty from util.six import reload_six, withMetaclass class ConfigHandler(withMetaclass(Singleton)): def __init__(self): pass @LazyProperty def serverHost(self): return os.environ.get("HOST", setting.HOST) @LazyProperty def serverPort(self): return os.environ.get("PORT", setting.PORT) @LazyProperty def dbConn(self): return os.getenv("DB_CONN", setting.DB_CONN) @LazyProperty def tableName(self): return os.getenv("TABLE_NAME", setting.TABLE_NAME) @property def fetchers(self): reload_six(setting) return setting.PROXY_FETCHER @LazyProperty def httpUrl(self): return os.getenv("HTTP_URL", setting.HTTP_URL) 
@LazyProperty def httpsUrl(self): return os.getenv("HTTPS_URL", setting.HTTPS_URL) @LazyProperty def verifyTimeout(self): return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT)) # @LazyProperty # def proxyCheckCount(self): # return int(os.getenv("PROXY_CHECK_COUNT", setting.PROXY_CHECK_COUNT)) @LazyProperty def maxFailCount(self): return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT)) # @LazyProperty # def maxFailRate(self): # return int(os.getenv("MAX_FAIL_RATE", setting.MAX_FAIL_RATE)) @LazyProperty def poolSizeMin(self): return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN)) @LazyProperty def proxyRegion(self): return bool(os.getenv("PROXY_REGION", setting.PROXY_REGION)) @LazyProperty def timezone(self): return os.getenv("TIMEZONE", setting.TIMEZONE) ================================================ FILE: handler/logHandler.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: LogHandler.py Description : 日志操作模块 Author : JHao date: 2017/3/6 ------------------------------------------------- Change Activity: 2017/03/06: log handler 2017/09/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) 2020/07/13: Windows下TimedRotatingFileHandler线程不安全, 不再使用 ------------------------------------------------- """ __author__ = 'JHao' import os import logging import platform from logging.handlers import TimedRotatingFileHandler # 日志级别 CRITICAL = 50 FATAL = CRITICAL ERROR = 40 WARNING = 30 WARN = WARNING INFO = 20 DEBUG = 10 NOTSET = 0 CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir) LOG_PATH = os.path.join(ROOT_PATH, 'log') if not os.path.exists(LOG_PATH): try: os.mkdir(LOG_PATH) except FileExistsError: pass class LogHandler(logging.Logger): """ LogHandler """ def __init__(self, name, level=DEBUG, stream=True, file=True): self.name = name self.level = level logging.Logger.__init__(self, self.name, level=level) if stream: 
self.__setStreamHandler__() if file: if platform.system() != "Windows": self.__setFileHandler__() def __setFileHandler__(self, level=None): """ set file handler :param level: :return: """ file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) file_handler.suffix = '%Y%m%d.log' if not level: file_handler.setLevel(self.level) else: file_handler.setLevel(level) formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') file_handler.setFormatter(formatter) self.file_handler = file_handler self.addHandler(file_handler) def __setStreamHandler__(self, level=None): """ set stream handler :param level: :return: """ stream_handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') stream_handler.setFormatter(formatter) if not level: stream_handler.setLevel(self.level) else: stream_handler.setLevel(level) self.addHandler(stream_handler) if __name__ == '__main__': log = LogHandler('test') log.info('this is a test msg') ================================================ FILE: handler/proxyHandler.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: ProxyHandler.py Description : Author : JHao date: 2016/12/3 ------------------------------------------------- Change Activity: 2016/12/03: 2020/05/26: 区分http和https ------------------------------------------------- """ __author__ = 'JHao' from helper.proxy import Proxy from db.dbClient import DbClient from handler.configHandler import ConfigHandler class ProxyHandler(object): """ Proxy CRUD operator""" def __init__(self): self.conf = ConfigHandler() self.db = DbClient(self.conf.dbConn) self.db.changeTable(self.conf.tableName) def get(self, https=False): """ return a proxy Args: https: 
True/False Returns: """ proxy = self.db.get(https) return Proxy.createFromJson(proxy) if proxy else None def pop(self, https): """ return and delete a useful proxy :return: """ proxy = self.db.pop(https) if proxy: return Proxy.createFromJson(proxy) return None def put(self, proxy): """ put proxy into use proxy :return: """ self.db.put(proxy) def delete(self, proxy): """ delete useful proxy :param proxy: :return: """ return self.db.delete(proxy.proxy) def getAll(self, https=False): """ get all proxy from pool as Proxy list :return: """ proxies = self.db.getAll(https) return [Proxy.createFromJson(_) for _ in proxies] def exists(self, proxy): """ check proxy exists :param proxy: :return: """ return self.db.exists(proxy.proxy) def getCount(self): """ return raw_proxy and use_proxy count :return: """ total_use_proxy = self.db.getCount() return {'count': total_use_proxy} ================================================ FILE: helper/__init__.py ================================================ ================================================ FILE: helper/check.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: check Description : 执行代理校验 Author : JHao date: 2019/8/6 ------------------------------------------------- Change Activity: 2019/08/06: 执行代理校验 2021/05/25: 分别校验http和https 2022/08/16: 获取代理Region信息 ------------------------------------------------- """ __author__ = 'JHao' from util.six import Empty from threading import Thread from datetime import datetime from util.webRequest import WebRequest from handler.logHandler import LogHandler from helper.validator import ProxyValidator from handler.proxyHandler import ProxyHandler from handler.configHandler import ConfigHandler class DoValidator(object): """ 执行校验 """ conf = ConfigHandler() @classmethod def validator(cls, proxy, work_type): """ 校验入口 Args: proxy: Proxy Object work_type: raw/use Returns: Proxy Object """ http_r = 
cls.httpValidator(proxy) https_r = False if not http_r else cls.httpsValidator(proxy) proxy.check_count += 1 proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") proxy.last_status = True if http_r else False if http_r: if proxy.fail_count > 0: proxy.fail_count -= 1 proxy.https = True if https_r else False if work_type == "raw": proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else "" else: proxy.fail_count += 1 return proxy @classmethod def httpValidator(cls, proxy): for func in ProxyValidator.http_validator: if not func(proxy.proxy): return False return True @classmethod def httpsValidator(cls, proxy): for func in ProxyValidator.https_validator: if not func(proxy.proxy): return False return True @classmethod def preValidator(cls, proxy): for func in ProxyValidator.pre_validator: if not func(proxy): return False return True @classmethod def regionGetter(cls, proxy): try: url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0] r = WebRequest().get(url=url, retry_time=1, timeout=2).json return r['data']['address'] except: return 'error' class _ThreadChecker(Thread): """ 多线程检测 """ def __init__(self, work_type, target_queue, thread_name): Thread.__init__(self, name=thread_name) self.work_type = work_type self.log = LogHandler("checker") self.proxy_handler = ProxyHandler() self.target_queue = target_queue self.conf = ConfigHandler() def run(self): self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name)) while True: try: proxy = self.target_queue.get(block=False) except Empty: self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name)) break proxy = DoValidator.validator(proxy, self.work_type) if self.work_type == "raw": self.__ifRaw(proxy) else: self.__ifUse(proxy) self.target_queue.task_done() def __ifRaw(self, proxy): if proxy.last_status: if self.proxy_handler.exists(proxy): self.log.info('RawProxyCheck - {}: {} exist'.format(self.name, proxy.proxy.ljust(23))) 
else: self.log.info('RawProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23))) self.proxy_handler.put(proxy) else: self.log.info('RawProxyCheck - {}: {} fail'.format(self.name, proxy.proxy.ljust(23))) def __ifUse(self, proxy): if proxy.last_status: self.log.info('UseProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23))) self.proxy_handler.put(proxy) else: if proxy.fail_count > self.conf.maxFailCount: self.log.info('UseProxyCheck - {}: {} fail, count {} delete'.format(self.name, proxy.proxy.ljust(23), proxy.fail_count)) self.proxy_handler.delete(proxy) else: self.log.info('UseProxyCheck - {}: {} fail, count {} keep'.format(self.name, proxy.proxy.ljust(23), proxy.fail_count)) self.proxy_handler.put(proxy) def Checker(tp, queue): """ run Proxy ThreadChecker :param tp: raw/use :param queue: Proxy Queue :return: """ thread_list = list() for index in range(20): thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2))) for thread in thread_list: thread.setDaemon(True) thread.start() for thread in thread_list: thread.join() ================================================ FILE: helper/fetch.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: fetchScheduler Description : Author : JHao date: 2019/8/6 ------------------------------------------------- Change Activity: 2021/11/18: 多线程采集 ------------------------------------------------- """ __author__ = 'JHao' from threading import Thread from helper.proxy import Proxy from helper.check import DoValidator from handler.logHandler import LogHandler from handler.proxyHandler import ProxyHandler from fetcher.proxyFetcher import ProxyFetcher from handler.configHandler import ConfigHandler class _ThreadFetcher(Thread): def __init__(self, fetch_source, proxy_dict): Thread.__init__(self) self.fetch_source = fetch_source self.proxy_dict = proxy_dict self.fetcher = getattr(ProxyFetcher, fetch_source, 
None) self.log = LogHandler("fetcher") self.conf = ConfigHandler() self.proxy_handler = ProxyHandler() def run(self): self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source)) try: for proxy in self.fetcher(): self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23))) proxy = proxy.strip() if proxy in self.proxy_dict: self.proxy_dict[proxy].add_source(self.fetch_source) else: self.proxy_dict[proxy] = Proxy( proxy, source=self.fetch_source) except Exception as e: self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source)) self.log.error(str(e)) class Fetcher(object): name = "fetcher" def __init__(self): self.log = LogHandler(self.name) self.conf = ConfigHandler() def run(self): """ fetch proxy with proxyFetcher :return: """ proxy_dict = dict() thread_list = list() self.log.info("ProxyFetch : start") for fetch_source in self.conf.fetchers: self.log.info("ProxyFetch - {func}: start".format(func=fetch_source)) fetcher = getattr(ProxyFetcher, fetch_source, None) if not fetcher: self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source)) continue if not callable(fetcher): self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source)) continue thread_list.append(_ThreadFetcher(fetch_source, proxy_dict)) for thread in thread_list: thread.setDaemon(True) thread.start() for thread in thread_list: thread.join() self.log.info("ProxyFetch - all complete!") for _ in proxy_dict.values(): if DoValidator.preValidator(_.proxy): yield _ ================================================ FILE: helper/launcher.py ================================================ # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name: launcher Description : 启动器 Author : JHao date: 2021/3/26 ------------------------------------------------- Change Activity: 2021/3/26: 启动器 ------------------------------------------------- """ __author__ = 'JHao' import sys from 
# ---- helper/launcher.py ----

def startServer():
    """Run the start-up checks, then start the API server via ``runFlask``."""
    __beforeStart()
    # Imported here rather than at module level, so api.proxyApi is only
    # loaded when the server is actually started.
    from api.proxyApi import runFlask
    runFlask()


def startScheduler():
    """Run the start-up checks, then start the scheduler via ``runScheduler``."""
    __beforeStart()
    from helper.scheduler import runScheduler
    runScheduler()


def __beforeStart():
    """Log version and configuration; terminate the process if the DB check reports a problem."""
    __showVersion()
    __showConfigure()
    if __checkDBConfig():
        log.info('exit!')
        # This is the abort path, so leave with a non-zero status; a bare
        # sys.exit() reports success (0) to the shell / process supervisor.
        sys.exit(1)


def __showVersion():
    """Log the ``VERSION`` constant from ``setting``."""
    from setting import VERSION
    log.info("ProxyPool Version: %s" % VERSION)


def __showConfigure():
    """Log the configured server host, port and fetcher list."""
    conf = ConfigHandler()
    log.info("ProxyPool configure HOST: %s" % conf.serverHost)
    log.info("ProxyPool configure PORT: %s" % conf.serverPort)
    log.info("ProxyPool configure PROXY_FETCHER: %s" % conf.fetchers)


def __checkDBConfig():
    """
    Log the database connection settings and test the connection.

    :return: whatever ``db.test()`` returns; ``__beforeStart`` treats a truthy
             value as "abort start-up"
    """
    conf = ConfigHandler()
    db = DbClient(conf.dbConn)
    log.info("============ DATABASE CONFIGURE ================")
    log.info("DB_TYPE: %s" % db.db_type)
    log.info("DB_HOST: %s" % db.db_host)
    log.info("DB_PORT: %s" % db.db_port)
    log.info("DB_NAME: %s" % db.db_name)
    log.info("DB_USER: %s" % db.db_user)
    log.info("=================================================")
    return db.test()


# ---- helper/proxy.py ----

class Proxy(object):
    """
    One proxy address together with its bookkeeping fields.

    Sources are kept internally as a list and exposed through ``source`` as a
    '/'-joined string, which is also the form the constructor accepts.
    """

    def __init__(self, proxy, fail_count=0, region="", anonymous="",
                 source="", check_count=0, last_status="", last_time="", https=False):
        self._proxy = proxy
        self._fail_count = fail_count
        self._region = region
        self._anonymous = anonymous
        # "".split('/') is [''], which used to leave an empty entry behind and
        # produced sources such as "/freeProxy01" once add_source() was called.
        # Empty segments are dropped, and None is accepted like "".
        self._source = [name for name in (source or "").split('/') if name]
        self._check_count = check_count
        self._last_status = last_status
        self._last_time = last_time
        self._https = https

    @classmethod
    def createFromJson(cls, proxy_json):
        """
        Build a Proxy from the JSON text produced by ``to_json``.

        Missing keys fall back to the same defaults as the constructor.
        """
        _dict = json.loads(proxy_json)
        return cls(proxy=_dict.get("proxy", ""),
                   fail_count=_dict.get("fail_count", 0),
                   region=_dict.get("region", ""),
                   anonymous=_dict.get("anonymous", ""),
                   source=_dict.get("source", ""),
                   check_count=_dict.get("check_count", 0),
                   last_status=_dict.get("last_status", ""),
                   last_time=_dict.get("last_time", ""),
                   https=_dict.get("https", False)
                   )

    @property
    def proxy(self):
        """ Proxy address, ip:port """
        return self._proxy

    @property
    def fail_count(self):
        """ Number of failed checks """
        return self._fail_count

    @fail_count.setter
    def fail_count(self, value):
        self._fail_count = value

    @property
    def region(self):
        """ Geographic location (country/city) """
        return self._region

    @region.setter
    def region(self, value):
        self._region = value

    @property
    def anonymous(self):
        """ Anonymity """
        return self._anonymous

    @property
    def source(self):
        """ Proxy source names joined with '/' (read-only; use add_source) """
        return '/'.join(self._source)

    @property
    def check_count(self):
        """ Number of checks performed """
        return self._check_count

    @check_count.setter
    def check_count(self, value):
        self._check_count = value

    @property
    def last_status(self):
        """ Result of the most recent check: True -> usable; False -> unusable """
        return self._last_status

    @last_status.setter
    def last_status(self, value):
        self._last_status = value

    @property
    def last_time(self):
        """ Time of the most recent check """
        return self._last_time

    @last_time.setter
    def last_time(self, value):
        self._last_time = value

    @property
    def https(self):
        """ Whether the proxy supports https """
        return self._https

    @https.setter
    def https(self, value):
        self._https = value

    @property
    def to_dict(self):
        """ Attributes as a dict """
        return {"proxy": self.proxy,
                "https": self.https,
                "fail_count": self.fail_count,
                "region": self.region,
                "anonymous": self.anonymous,
                "source": self.source,
                "check_count": self.check_count,
                "last_status": self.last_status,
                "last_time": self.last_time}

    @property
    def to_json(self):
        """ Attributes as a JSON string """
        return json.dumps(self.to_dict, ensure_ascii=False)

    def add_source(self, source_str):
        """
        Record one more source name; empty values are ignored.

        Duplicates are removed while keeping first-seen order. The previous
        ``list(set(...))`` shuffled the names, so the same proxy could
        serialise to a different ``source`` string from run to run.
        """
        if source_str:
            self._source.append(source_str)
            seen = set()
            ordered = []
            for name in self._source:
                if name not in seen:
                    seen.add(name)
                    ordered.append(name)
            self._source = ordered
def __runProxyFetch():
    """Queue everything ``Fetcher().run()`` yields and hand the queue to ``Checker("raw", ...)``."""
    pending = Queue()
    for fetched in Fetcher().run():
        pending.put(fetched)
    Checker("raw", pending)


def __runProxyCheck():
    """
    Queue every stored proxy and hand the queue to ``Checker("use", ...)``.

    When the stored total is below ``poolSizeMin`` a fetch round is run first.
    """
    handler = ProxyHandler()
    pending = Queue()

    total = handler.db.getCount().get("total", 0)
    if total < handler.conf.poolSizeMin:
        __runProxyFetch()

    for stored in handler.getAll():
        pending.put(stored)
    Checker("use", pending)


def runScheduler():
    """
    Run one fetch immediately, then start the blocking scheduler.

    Two interval jobs are registered: fetch every 4 minutes and check every
    2 minutes.
    """
    __runProxyFetch()

    tz = ConfigHandler().timezone
    scheduler = BlockingScheduler(logger=LogHandler("scheduler"), timezone=tz)

    scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集")
    scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查")

    scheduler.configure(
        executors={
            'default': {'type': 'threadpool', 'max_workers': 20},
            'processpool': ProcessPoolExecutor(max_workers=5),
        },
        job_defaults={'coalesce': False, 'max_instances': 10},
        timezone=tz,
    )
    scheduler.start()
# ---- helper/validator.py ----

class ProxyValidator(withMetaclass(Singleton)):
    """
    Registry of validator functions, grouped by stage.

    The three lists are class attributes, so everything registered through the
    ``add*Validator`` decorators is shared by all users of this class.
    """
    pre_validator = []
    http_validator = []
    https_validator = []

    @classmethod
    def addPreValidator(cls, func):
        """Register *func* as a pre-validator and return it unchanged (decorator use)."""
        cls.pre_validator.append(func)
        return func

    @classmethod
    def addHttpValidator(cls, func):
        """Register *func* as an HTTP validator and return it unchanged (decorator use)."""
        cls.http_validator.append(func)
        return func

    @classmethod
    def addHttpsValidator(cls, func):
        """Register *func* as an HTTPS validator and return it unchanged (decorator use)."""
        cls.https_validator.append(func)
        return func


@ProxyValidator.addPreValidator
def formatValidator(proxy):
    """
    Check the proxy format: ``ip:port`` or ``username:password@ip:port``.

    IP_REGEX only constrains the number of digits, so on its own it accepts
    strings such as "999.999.999.999:99999". The numeric ranges are therefore
    checked as well: every octet must be <= 255 and the port in 1..65535.

    :return: True when the format and the ranges are valid, else False
    """
    if not IP_REGEX.fullmatch(proxy):
        return False
    # After a full match the text following the last '@' is exactly "ip:port".
    address = proxy.rsplit("@", 1)[-1]
    host, port = address.rsplit(":", 1)
    if any(int(octet) > 255 for octet in host.split(".")):
        return False
    return 0 < int(port) <= 65535


@ProxyValidator.addHttpValidator
def httpTimeOutValidator(proxy):
    """
    HTTP check: HEAD ``conf.httpUrl`` through the proxy within ``conf.verifyTimeout``.

    :return: True only for a 200 response; any exception counts as failure
    """
    proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)}

    try:
        r = head(conf.httpUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout)
        return r.status_code == 200
    except Exception:
        # Deliberate best effort: whatever goes wrong, the proxy is unusable.
        return False


@ProxyValidator.addHttpsValidator
def httpsTimeOutValidator(proxy):
    """
    HTTPS check: HEAD ``conf.httpsUrl`` through the proxy within ``conf.verifyTimeout``.

    :return: True only for a 200 response; any exception counts as failure
    """
    proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)}
    try:
        # NOTE(review): verify=False turns off TLS certificate verification for
        # this probe - presumably intentional for testing proxies; confirm.
        r = head(conf.httpsUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout, verify=False)
        return r.status_code == 200
    except Exception:
        # Deliberate best effort: whatever goes wrong, the proxy is unusable.
        return False


@ProxyValidator.addHttpValidator
def customValidatorExample(proxy):
    """Example of a custom validator: decide whether the proxy is usable and return True/False."""
    return True


# ---- proxyPool.py ----
# The docstrings of the click commands below are shown by click as --help
# text, i.e. they are user-facing output, so they are kept verbatim.

@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option(version=VERSION)
def cli():
    """ProxyPool cli工具"""


@cli.command(name="schedule")
def schedule():
    """ 启动调度程序 """
    # Print the banner, then start the scheduler process.
    click.echo(BANNER)
    startScheduler()


@cli.command(name="server")
def server():
    """ 启动api服务 """
    # Print the banner, then start the API server.
    click.echo(BANNER)
    startServer()
------------------------------------------------- Change Activity: 2019/2/15: ------------------------------------------------- """ BANNER = r""" **************************************************************** *** ______ ********************* ______ *********** _ ******** *** | ___ \_ ******************** | ___ \ ********* | | ******** *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** **** __ / / ***** ************************* /___ / ******************************* ************************* ******************************** **************************************************************** """ VERSION = "2.4.0" # ############### server config ############### HOST = "0.0.0.0" PORT = 5010 # ############### database config ################### # db connection uri # example: # Redis: redis://:password@ip:port/db # Ssdb: ssdb://:password@ip:port DB_CONN = 'redis://:pwd@127.0.0.1:6379/0' # proxy table name TABLE_NAME = 'use_proxy' # ###### config the proxy fetch function ###### PROXY_FETCHER = [ "freeProxy01", "freeProxy02", "freeProxy03", "freeProxy04", "freeProxy05", "freeProxy06", "freeProxy07", "freeProxy08", "freeProxy09", "freeProxy10", "freeProxy11" ] # ############# proxy validator ################# # 代理验证目标网站 HTTP_URL = "http://httpbin.org" HTTPS_URL = "https://www.qq.com" # 代理验证时超时时间 VERIFY_TIMEOUT = 10 # 近PROXY_CHECK_COUNT次校验中允许的最大失败次数,超过则剔除代理 MAX_FAIL_COUNT = 0 # 近PROXY_CHECK_COUNT次校验中允许的最大失败率,超过则剔除代理 # MAX_FAIL_RATE = 0.1 # proxyCheck时代理数量少于POOL_SIZE_MIN触发抓取 POOL_SIZE_MIN = 20 # ############# proxy attributes ################# # 是否启用代理地域属性 PROXY_REGION = True # ############# scheduler config ################# # Set the timezone for the scheduler forcely (optional) # If it is running on a VM, and # "ValueError: Timezone offset does not match system offset" # was raised 
def testConfig():
    """
    Print the main configuration values and assert that ``fetchers`` is a list.

    The fetcher list is then printed twice more with a 5-second pause after
    each print.

    :return:
    """
    conf = ConfigHandler()
    for attr in ("dbConn", "serverPort", "serverHost", "tableName"):
        print(getattr(conf, attr))
    assert isinstance(conf.fetchers, list)
    print(conf.fetchers)

    remaining = 2
    while remaining:
        print(conf.fetchers)
        sleep(5)
        remaining -= 1
# ---- test/testDbClient.py ----

def testDbClient():
    """Check that ``DbClient.parseDbConn`` splits SSDB and Redis URIs into their parts."""
    # ############### ssdb ###############
    ssdb_uri = "ssdb://:password@127.0.0.1:8888"
    s = DbClient.parseDbConn(ssdb_uri)
    assert s.db_type == "SSDB"
    assert s.db_pwd == "password"
    assert s.db_host == "127.0.0.1"
    assert s.db_port == 8888

    # ############### redis ###############
    redis_uri = "redis://:password@127.0.0.1:6379/1"
    r = DbClient.parseDbConn(redis_uri)
    assert r.db_type == "REDIS"
    assert r.db_pwd == "password"
    assert r.db_host == "127.0.0.1"
    assert r.db_port == 6379
    assert r.db_name == "1"
    print("DbClient ok!")


# ---- test/testLogHandler.py ----

def testLogHandler():
    """Emit one info and one error record through a ``LogHandler`` named 'test'."""
    log = LogHandler('test')
    log.info('this is info')
    log.error('this is error')


# ---- test/testProxyClass.py ----

def testProxyClass():
    """Round-trip a Proxy through ``to_json`` / ``to_dict`` / ``createFromJson`` and print each step."""
    proxy = Proxy("127.0.0.1:8080")

    print(proxy.to_json)

    # ``Proxy.source`` is a property without a setter, so ``proxy.source = ...``
    # raises AttributeError; add_source() is the way to attach a source.
    proxy.add_source("test")

    proxy_str = json.dumps(proxy.to_dict, ensure_ascii=False)

    print(proxy_str)

    print(Proxy.createFromJson(proxy_str).to_dict)
# ---- test/testProxyFetcher.py ----

def testProxyFetcher():
    """Run every configured fetcher, print each non-empty proxy, then print a per-fetcher count."""
    fetcher_names = ConfigHandler().fetchers
    totals = {name: 0 for name in fetcher_names}

    for name in fetcher_names:
        fetch = getattr(ProxyFetcher, name.strip())
        for proxy in fetch():
            if not proxy:
                continue
            print('{func}: fetch proxy {proxy}'.format(func=name, proxy=proxy))
            totals[name] = totals.get(name) + 1

    for name, total in totals.items():
        print(name, total)


# ---- test/testProxyValidator.py ----

def testProxyValidator():
    """Print every registered validator: pre-validators, then HTTP, then HTTPS."""
    for stage in ("pre_validator", "http_validator", "https_validator"):
        for validator in getattr(ProxyValidator, stage):
            print(validator)
# ---- test/testRedisClient.py ----

def testRedisClient():
    """Exercise put/get/exists/pop/getAll/getCount against the hard-coded Redis URI and print the results."""
    from db.dbClient import DbClient
    from helper.proxy import Proxy

    db = DbClient("redis://:pwd@127.0.0.1:6379")
    db.changeTable("use_proxy")
    sample = Proxy.createFromJson(
        '{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')

    print("put: ", db.put(sample))

    print("get: ", db.get(https=None))

    for candidate in ("27.38.96.101:9797", "27.38.96.101:8888"):
        print("exists: ", db.exists(candidate))

    print("pop: ", db.pop(https=None))

    print("getAll: ", db.getAll(https=None))

    print("getCount", db.getCount())


# ---- test/testSsdbClient.py ----

def testSsdbClient():
    """Exercise put/get/exists/getAll/clear/getCount against the hard-coded SSDB URI and print the results."""
    from db.dbClient import DbClient
    from helper.proxy import Proxy

    db = DbClient("ssdb://@127.0.0.1:8888")
    db.changeTable("use_proxy")
    sample = Proxy.createFromJson(
        '{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')

    print("put: ", db.put(sample))

    print("get: ", db.get(https=None))

    for candidate in ("27.38.96.101:9797", "27.38.96.101:8888"):
        print("exists: ", db.exists(candidate))

    print("getAll: ", db.getAll(https=None))

    # print("pop: ", db.pop(https=None))

    print("clear: ", db.clear())

    print("getCount", db.getCount())


# ---- util/lazyProperty.py ----

class LazyProperty(object):
    """
    Descriptor that computes a value on first access and stores it on the instance.

    The result is written with ``setattr`` under the wrapped function's
    ``__name__``. This class defines no ``__set__``, so once the value is
    stored, normal attribute lookup finds it on the instance.
    """

    def __init__(self, func):
        self.func = func

    def __get__(self, instance, owner):
        # Accessed on the class itself: hand back the descriptor.
        if instance is None:
            return self
        computed = self.func(instance)
        setattr(instance, self.func.__name__, computed)
        return computed
# ---- util/singleton.py ----

class Singleton(type):
    """
    Singleton Metaclass

    ``_inst`` is one dict shared by every class using this metaclass, keyed by
    the class. The first call builds the instance; later calls return it and
    ignore their arguments.
    """

    _inst = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._inst:
            # Forward keyword arguments too; they used to be silently dropped,
            # so ``Cls(option=value)`` built the instance with its defaults.
            cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._inst[cls]


# ---- util/six.py ----

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

if PY3:
    def iteritems(d, **kw):
        """Return an iterator over the (key, value) pairs of *d*."""
        return iter(d.items(**kw))
else:
    def iteritems(d, **kw):
        """Return an iterator over the (key, value) pairs of *d*."""
        return d.iteritems(**kw)

if PY3:
    from urllib.parse import urlparse
else:
    from urlparse import urlparse

if PY3:
    # ``imp`` was removed in Python 3.12; importlib.reload is its replacement
    # and exists since 3.4. ``imp`` is only the fallback for older 3.x.
    try:
        from importlib import reload as reload_six
    except ImportError:
        from imp import reload as reload_six
else:
    reload_six = reload

if PY3:
    from queue import Empty, Queue
else:
    from Queue import Empty, Queue


def withMetaclass(meta, *bases):
    """Create a base class with a metaclass."""

    # This requires a bit of explanation: the basic idea is to make a dummy
    # metaclass for one level of class instantiation that replaces itself with
    # the actual metaclass.
    class MetaClass(meta):

        def __new__(cls, name, this_bases, d):
            return meta(name, bases, d)

    return type.__new__(MetaClass, 'temporary_class', (), {})


# ---- util/webRequest.py ----

class WebRequest(object):
    """Thin wrapper around ``requests.get`` with retries and response helpers."""
    name = "web_request"

    def __init__(self, *args, **kwargs):
        self.log = LogHandler(self.name, file=False)
        # Empty placeholder until get() stores a real response.
        self.response = Response()

    @property
    def user_agent(self):
        """
        return an User-Agent at random
        :return:
        """
        ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        ]
        return random.choice(ua_list)

    @property
    def header(self):
        """
        basic header (a new dict on every access, so callers may modify it)
        :return:
        """
        return {'User-Agent': self.user_agent,
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'}

    def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
        """
        get method
        :param url: target url
        :param header: extra headers (dict) merged over the basic header
        :param retry_time: total number of attempts
        :param retry_interval: seconds to sleep between attempts
        :param timeout: network timeout
        :return: self; the result is available through ``response``, ``text``,
                 ``tree`` and ``json``. No exception from ``requests.get``
                 propagates to the caller.
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        while True:
            try:
                self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs)
                return self
            except Exception as e:
                self.log.error("requests: %s error: %s" % (url, str(e)))
                retry_time -= 1
                if retry_time <= 0:
                    resp = Response()
                    resp.status_code = 200
                    # Store the fallback. It used to be built and discarded,
                    # so a reused instance kept exposing the response of an
                    # earlier, unrelated request.
                    # NOTE(review): reporting 200 for a failed request hides
                    # the failure from callers that check status_code - confirm.
                    self.response = resp
                    return self
                self.log.info("retry %s second after" % retry_interval)
                time.sleep(retry_interval)

    @property
    def tree(self):
        """Response content parsed with ``etree.HTML``."""
        return etree.HTML(self.response.content)

    @property
    def text(self):
        """Response body as text."""
        return self.response.text

    @property
    def json(self):
        """Response body decoded as JSON; ``{}`` (after logging) when decoding fails."""
        try:
            return self.response.json()
        except Exception as e:
            self.log.error(str(e))
            return {}