Repository: binux/pyspider Branch: master Commit: 897891cafb21 Files: 165 Total size: 775.9 KB Directory structure: gitextract_jmd7ykkk/ ├── .coveragerc ├── .github/ │ └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── config_example.json ├── docker-compose.yaml ├── docs/ │ ├── About-Projects.md │ ├── About-Tasks.md │ ├── Architecture.md │ ├── Command-Line.md │ ├── Deployment-demo.pyspider.org.md │ ├── Deployment.md │ ├── Frequently-Asked-Questions.md │ ├── Quickstart.md │ ├── Running-pyspider-with-Docker.md │ ├── Script-Environment.md │ ├── Working-with-Results.md │ ├── apis/ │ │ ├── @catch_status_code_error.md │ │ ├── @every.md │ │ ├── Response.md │ │ ├── index.md │ │ ├── self.crawl.md │ │ └── self.send_message.md │ ├── conf.py │ ├── index.md │ └── tutorial/ │ ├── AJAX-and-more-HTTP.md │ ├── HTML-and-CSS-Selector.md │ ├── Render-with-PhantomJS.md │ └── index.md ├── mkdocs.yml ├── pyspider/ │ ├── __init__.py │ ├── database/ │ │ ├── __init__.py │ │ ├── base/ │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── basedb.py │ │ ├── couchdb/ │ │ │ ├── __init__.py │ │ │ ├── couchdbbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── elasticsearch/ │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── local/ │ │ │ ├── __init__.py │ │ │ └── projectdb.py │ │ ├── mongodb/ │ │ │ ├── __init__.py │ │ │ ├── mongodbbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── mysql/ │ │ │ ├── __init__.py │ │ │ ├── mysqlbase.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ └── taskdb.py │ │ ├── redis/ │ │ │ ├── __init__.py │ │ │ └── taskdb.py │ │ ├── sqlalchemy/ │ │ │ ├── __init__.py │ │ │ ├── projectdb.py │ │ │ ├── resultdb.py │ │ │ ├── sqlalchemybase.py │ │ │ └── taskdb.py │ │ └── sqlite/ │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ ├── sqlitebase.py │ │ └── 
taskdb.py │ ├── fetcher/ │ │ ├── __init__.py │ │ ├── cookie_utils.py │ │ ├── phantomjs_fetcher.js │ │ ├── puppeteer_fetcher.js │ │ ├── splash_fetcher.lua │ │ └── tornado_fetcher.py │ ├── libs/ │ │ ├── ListIO.py │ │ ├── __init__.py │ │ ├── base_handler.py │ │ ├── bench.py │ │ ├── counter.py │ │ ├── dataurl.py │ │ ├── log.py │ │ ├── multiprocessing_queue.py │ │ ├── pprint.py │ │ ├── response.py │ │ ├── result_dump.py │ │ ├── sample_handler.py │ │ ├── url.py │ │ ├── utils.py │ │ └── wsgi_xmlrpc.py │ ├── logging.conf │ ├── message_queue/ │ │ ├── __init__.py │ │ ├── kombu_queue.py │ │ ├── rabbitmq.py │ │ └── redis_queue.py │ ├── processor/ │ │ ├── __init__.py │ │ ├── processor.py │ │ └── project_module.py │ ├── result/ │ │ ├── __init__.py │ │ └── result_worker.py │ ├── run.py │ ├── scheduler/ │ │ ├── __init__.py │ │ ├── scheduler.py │ │ ├── task_queue.py │ │ └── token_bucket.py │ └── webui/ │ ├── __init__.py │ ├── app.py │ ├── bench_test.py │ ├── debug.py │ ├── index.py │ ├── login.py │ ├── result.py │ ├── static/ │ │ ├── .babelrc │ │ ├── package.json │ │ ├── src/ │ │ │ ├── css_selector_helper.js │ │ │ ├── debug.js │ │ │ ├── debug.less │ │ │ ├── index.js │ │ │ ├── index.less │ │ │ ├── result.less │ │ │ ├── splitter.js │ │ │ ├── task.less │ │ │ ├── tasks.less │ │ │ └── variable.less │ │ └── webpack.config.js │ ├── task.py │ ├── templates/ │ │ ├── debug.html │ │ ├── index.html │ │ ├── result.html │ │ ├── task.html │ │ └── tasks.html │ └── webdav.py ├── requirements.txt ├── run.py ├── setup.py ├── tests/ │ ├── __init__.py │ ├── data_fetcher_processor_handler.py │ ├── data_handler.py │ ├── data_sample_handler.py │ ├── data_test_webpage.py │ ├── test_base_handler.py │ ├── test_bench.py │ ├── test_counter.py │ ├── test_database.py │ ├── test_fetcher.py │ ├── test_fetcher_processor.py │ ├── test_message_queue.py │ ├── test_processor.py │ ├── test_response.py │ ├── test_result_dump.py │ ├── test_result_worker.py │ ├── test_run.py │ ├── test_scheduler.py │ ├── test_task_queue.py 
│ ├── test_utils.py │ ├── test_webdav.py │ ├── test_webui.py │ └── test_xmlrpc.py ├── tools/ │ └── migrate.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ [run] source = pyspider parallel = True [report] omit = pyspider/libs/sample_handler.py pyspider/libs/pprint.py exclude_lines = pragma: no cover def __repr__ if self.debug: if settings.DEBUG raise AssertionError raise NotImplementedError if 0: if __name__ == .__main__.: except ImportError: pass ================================================ FILE: .github/ISSUE_TEMPLATE.md ================================================ * pyspider version: * Operating system: * Start up command: ### Expected behavior ### Actual behavior ### How to reproduce ================================================ FILE: .gitignore ================================================ *.py[cod] data/* .venv .idea # C extensions *.so # Packages *.egg *.egg-info dist build eggs parts bin var sdist develop-eggs .installed.cfg lib lib64 __pycache__ # Installer logs pip-log.txt # Unit test / coverage reports .coverage .tox nosetests.xml # Translations *.mo # Mr Developer .mr.developer.cfg .project .pydevproject .idea ================================================ FILE: .travis.yml ================================================ language: python cache: pip python: - 3.5 - 3.6 - 3.7 #- 3.8 services: - docker - mongodb - rabbitmq - redis - mysql # - elasticsearch - postgresql addons: postgresql: "9.4" apt: packages: - rabbitmq-server env: - IGNORE_COUCHDB=1 before_install: - sudo apt-get update -qq - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart - npm install express 
puppeteer - sudo docker pull scrapinghub/splash - sudo docker run -d --net=host scrapinghub/splash before_script: - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres - sleep 10 install: - pip install https://github.com/marcus67/easywebdav/archive/master.zip - sudo apt-get install libgnutls28-dev - pip install -e .[all,test] - pip install coveralls script: - coverage run setup.py test after_success: - coverage combine - coveralls ================================================ FILE: Dockerfile ================================================ FROM python:3.6 MAINTAINER binux # install phantomjs RUN mkdir -p /opt/phantomjs \ && cd /opt/phantomjs \ && wget -O phantomjs.tar.bz2 https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \ && tar xavf phantomjs.tar.bz2 --strip-components 1 \ && ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \ && rm phantomjs.tar.bz2 # Fix Error: libssl_conf.so: cannot open shared object file: No such file or directory ENV OPENSSL_CONF=/etc/ssl/ # install nodejs ENV NODEJS_VERSION=8.15.0 \ PATH=$PATH:/opt/node/bin WORKDIR "/opt/node" RUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \ curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \ rm -rf /var/lib/apt/lists/* RUN npm install puppeteer express # install requirements COPY requirements.txt /opt/pyspider/requirements.txt RUN pip install -r /opt/pyspider/requirements.txt # add all repo ADD ./ /opt/pyspider # run test WORKDIR /opt/pyspider RUN pip install -e .[all] # Create a symbolic link to node_modules RUN ln -s 
/opt/node/node_modules ./node_modules #VOLUME ["/opt/pyspider"] ENTRYPOINT ["pyspider"] EXPOSE 5000 23333 24444 25555 22222 ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2014 Binux Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include README.md include requirements.txt include Dockerfile include LICENSE include pyspider/logging.conf include pyspider/webui/static/* include pyspider/webui/templates/* ================================================ FILE: README.md ================================================ pyspider [![Build Status]][Travis CI] [![Coverage Status]][Coverage] ======== A Powerful Spider(Web Crawler) System in Python. 
- Write script in Python - Powerful WebUI with script editor, task monitor, project manager and result viewer - [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue - Task priority, retry, periodical, recrawl by age, etc... - Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc... Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases) Sample Code ----------- ```python from pyspider.libs.base_handler import * class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl('http://scrapy.org/', callback=self.index_page) @config(age=10 * 24 * 60 * 60) def index_page(self, response): for each in response.doc('a[href^="http"]').items(): self.crawl(each.attr.href, callback=self.detail_page) def detail_page(self, response): return { "url": response.url, "title": response.doc('title').text(), } ``` Installation ------------ * `pip install pyspider` * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) **WARNING:** WebUI is open to the public by default, it can be used to execute any command which may harm your system. Please use it in an internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config). 
Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/) Contribute ---------- * Use It * Open [Issue], send PR * [User Group] * [中文问答](http://segmentfault.com/t/pyspider) TODO ---- ### v0.4.0 - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) License ------- Licensed under the Apache License, Version 2.0 [Build Status]: https://img.shields.io/travis/binux/pyspider/master.svg?style=flat [Travis CI]: https://travis-ci.org/binux/pyspider [Coverage Status]: https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat [Coverage]: https://coveralls.io/r/binux/pyspider [Try]: https://img.shields.io/badge/try-pyspider-blue.svg?style=flat [Issue]: https://github.com/binux/pyspider/issues [User Group]: https://groups.google.com/group/pyspider-users ================================================ FILE: config_example.json ================================================ { "taskdb": "couchdb+taskdb://user:password@couchdb:5984", "projectdb": "couchdb+projectdb://user:password@couchdb:5984", "resultdb": "couchdb+resultdb://user:password@couchdb:5984", "message_queue": "amqp://rabbitmq:5672/%2F", "webui": { "username": "username", "password": "password", "need-auth": true, "scheduler-rpc": "http://scheduler:23333", "fetcher-rpc": "http://fetcher:24444" } } ================================================ FILE: docker-compose.yaml ================================================ version: "3.7" # replace /path/to/dir/ to point to config.json # The RabbitMQ and CouchDB services can take some time to startup. # During this time most of the pyspider services will exit and restart. # Once RabbitMQ and CouchDB are fully up and running everything should run as normal. 
services: rabbitmq: image: rabbitmq:alpine container_name: rabbitmq networks: - pyspider command: rabbitmq-server mysql: image: mysql:latest container_name: mysql volumes: - /tmp:/var/lib/mysql environment: - MYSQL_ALLOW_EMPTY_PASSWORD=yes networks: - pyspider phantomjs: image: pyspider:latest container_name: phantomjs networks: - pyspider volumes: - ./config_example.json:/opt/pyspider/config.json command: -c config.json phantomjs depends_on: - couchdb - rabbitmq restart: unless-stopped result: image: pyspider:latest container_name: result networks: - pyspider volumes: - ./config_example.json:/opt/pyspider/config.json command: -c config.json result_worker depends_on: - couchdb - rabbitmq restart: unless-stopped # Sometimes we'll get a connection refused error because couchdb has yet to fully start processor: container_name: processor image: pyspider:latest networks: - pyspider volumes: - ./config_example.json:/opt/pyspider/config.json command: -c config.json processor depends_on: - couchdb - rabbitmq restart: unless-stopped fetcher: image: pyspider:latest container_name: fetcher networks: - pyspider volumes: - ./config_example.json:/opt/pyspider/config.json command : -c config.json fetcher depends_on: - couchdb - rabbitmq restart: unless-stopped scheduler: image: pyspider:latest container_name: scheduler networks: - pyspider volumes: - ./config_example.json:/opt/pyspider/config.json command: -c config.json scheduler depends_on: - couchdb - rabbitmq restart: unless-stopped webui: image: pyspider:latest container_name: webui ports: - "5050:5000" networks: - pyspider volumes: - ./config_example.json:/opt/pyspider/config.json command: -c config.json webui depends_on: - couchdb - rabbitmq restart: unless-stopped networks: pyspider: external: name: pyspider default: driver: bridge ================================================ FILE: docs/About-Projects.md ================================================ About Projects ============== In most cases, a project is one 
script you write for one website. * Projects are independent, but you can import another project as a module with `from projects import other_project` * A project has 5 statuses: `TODO`, `STOP`, `CHECKING`, `DEBUG` and `RUNNING` - `TODO` - a script is just created to be written - `STOP` - you can mark a project as `STOP` if you want it to STOP (= =). - `CHECKING` - when a running project is modified, to prevent incomplete modification, project status will be set as `CHECKING` automatically. - `DEBUG`/`RUNNING` - these two statuses make no difference to the spider. But it's good to mark a project as `DEBUG` when it's running for the first time, then change it to `RUNNING` after it has been checked. * The crawl rate is controlled by `rate` and `burst` with the [token-bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm. - `rate` - how many requests in one second - `burst` - consider this situation: `rate/burst = 0.1/3` means that the spider crawls 1 page every 10 seconds. All tasks are finished, and the project is checking the last updated items every minute. Assume that 3 new items are found: pyspider will "burst" and crawl 3 tasks without waiting 3*10 seconds. However, the fourth task needs to wait 10 seconds. * To delete a project, set `group` to `delete` and status to `STOP`, then wait 24 hours. `on_finished` callback -------------------- You can override the `on_finished` method in the project; the method is triggered when the task_queue goes to 0. Example 1: When you start a project to crawl a website with 100 pages, the `on_finished` callback will be fired when all 100 pages have been successfully crawled or have failed after retries. Example 2: A project with `auto_recrawl` tasks will **NEVER** trigger the `on_finished` callback, because the time queue will never become 0 when there are auto_recrawl tasks in it. Example 3: A project with an `@every` decorated method will trigger the `on_finished` callback every time the newly submitted tasks are finished.
================================================ FILE: docs/About-Tasks.md ================================================ About Tasks =========== Tasks are the basic unit to be scheduled. Basis ----- * A task is differentiated by its `taskid`. (Default: `md5(url)`, can be changed by overriding the `def get_taskid(self, task)` method) * Tasks are isolated between different projects. * A task has 4 statuses: - active - failed - success - bad - not used * Only tasks in active status will be scheduled. * Tasks are served in order of `priority`. Schedule -------- #### new task When a new task (never seen before) comes in: * If `exetime` is set but has not arrived yet, it will be put into a time-based queue to wait. * Otherwise it will be accepted. When the task is already in the queue: * Ignored unless `force_update` When a completed task comes out: * If `age` is set and `last_crawl_time + age < now`, it will be accepted. Otherwise discarded. * If `itag` is set and not equal to its previous value, it will be accepted. Otherwise discarded. #### task retry When a fetch error or script error happens, the task will retry 3 times by default. Successive retries are delayed by 30 seconds, 1 hour, 6 hours and 12 hours; any further retries are postponed by 24 hours. If `age` is specified, the retry delay will not be larger than `age`. You can configure the retry delay by adding a variable named `retry_delay` to the handler. `retry_delay` is a dict that specifies retry intervals. The items in the dict are {retried: seconds}, and a special key: '' (empty string) is used to specify the default retry delay for retry counts that are not listed. e.g.
the default `retry_delay` is declared like: ``` class MyHandler(BaseHandler): retry_delay = { 0: 30, 1: 1*60*60, 2: 6*60*60, 3: 12*60*60, '': 24*60*60 } ``` ================================================ FILE: docs/Architecture.md ================================================ Architecture ============ This document describes the reason why I made pyspider and the architecture. Why --- Two years ago, I was working on a vertical search engine. We were facing the following needs for crawling: 1. collect 100-200 websites, they may go on/offline or change their templates at any time > We need a really powerful monitor to find out which website is changing, and a good tool to help us write a script/template for each website. 2. data should be collected within 5 minutes of a website being updated > We solved this problem by checking the index page frequently and using something like 'last update time' or 'last reply time' to determine which page has changed. In addition to this, we rechecked pages after X days to prevent omissions. > **pyspider will never stop as WWW is changing all the time** Furthermore, we have some APIs from our cooperators; these APIs may need POST, proxy, request signature etc. Full control from script is more convenient than some global parameters of components. Overview -------- The following diagram shows an overview of the pyspider architecture with its components and an outline of the data flow that takes place inside the system. ![pyspider](imgs/pyspider-arch.png) Components are connected by message queue. Every component, including the message queue, runs in its own process/thread and is replaceable. That means, when processing is slow, you can have many instances of processor and make full use of multiple CPUs, or deploy to multiple machines. This architecture makes pyspider really fast. [benchmarking](https://gist.github.com/binux/67b276c51e988f8e2c31#comment-1339242). Components ---------- ### Scheduler The Scheduler receives tasks from the processor through newtask_queue.
It decides whether the task is new or requires a re-crawl, sorts tasks according to priority and feeds them to the fetcher with traffic control ([token bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm). It takes care of periodic tasks, lost tasks and failed tasks, and retries them later. All of the above can be set via `self.crawl` [API](apis/). Note that in the current implementation of the scheduler, only one scheduler is allowed. ### Fetcher The Fetcher is responsible for fetching web pages and then sending the results to the processor. For flexibility, the fetcher supports [Data URI](http://en.wikipedia.org/wiki/Data_URI_scheme) and pages that are rendered by JavaScript (via [phantomjs](http://phantomjs.org/)). Fetch method, headers, cookies, proxy, etag etc can be controlled by script via [API](apis/self.crawl/#fetch). ### Phantomjs Fetcher Phantomjs Fetcher works like a proxy. It's connected to the general Fetcher, fetches and renders pages with JavaScript enabled, and outputs general HTML back to the Fetcher: ``` scheduler -> fetcher -> processor | phantomjs | internet ``` ### Processor The Processor is responsible for running the script written by users to parse and extract information. Your script runs in an unrestricted environment. Although we have various tools (like [PyQuery](https://pythonhosted.org/pyquery/)) for you to extract information and links, you can use anything you want to deal with the response. You may refer to [Script Environment](Script-Environment) and [API Reference](apis/) to get more information about scripts. Processor will capture the exceptions and logs, send status (task track) and new tasks to `scheduler`, and send results to `Result Worker`. ### Result Worker (optional) Result worker receives results from `Processor`. Pyspider has a built-in result worker to save results to `resultdb`. Overwrite it to deal with results according to your needs. ### WebUI WebUI is a web frontend for everything.
It contains: * script editor, debugger * project manager * task monitor * result viewer, exporter Maybe webui is the most attractive part of pyspider. With this powerful UI, you can debug your scripts step by step just as pyspider does, start or stop a project, and find which project is going wrong and which request failed, then try it again with the debugger. Data flow --------- The data flow in pyspider is just as you see in the diagram above: 1. Each script has a callback named `on_start`. When you press the `Run` button on WebUI, a new task of `on_start` is submitted to Scheduler as the entry point of the project. 2. Scheduler dispatches this `on_start` task with a Data URI as a normal task to Fetcher. 3. Fetcher makes a request and gets a response to it (for Data URI, it's a fake request and response, but it is no different from other normal tasks), then feeds it to Processor. 4. Processor calls the `on_start` method, which generates some new URLs to crawl. Processor sends a message to Scheduler via the message queue saying that this task is finished, along with the new tasks (there are no results for `on_start` in most cases. If there are results, Processor sends them to `result_queue`). 5. Scheduler receives the new tasks, looks them up in the database and determines whether each task is new or requires a re-crawl; if so, it puts them into the task queue. Tasks are dispatched in order. 6. The process repeats (from step 3) and won't stop till the WWW is dead ;-). Scheduler will check periodic tasks to crawl the latest data. ================================================ FILE: docs/Command-Line.md ================================================ Command Line ============ Global Config ------------- You can get command help via `pyspider --help` and `pyspider all --help` for subcommand help. Global options work for all subcommands. ``` Usage: pyspider [OPTIONS] COMMAND [ARGS]... A powerful spider system in python. Options: -c, --config FILENAME a json file with default values for subcommands.
{“webui”: {“port”:5001}} --logging-config TEXT logging config file for built-in python logging module [default: pyspider/pyspider/logging.conf] --debug debug mode --queue-maxsize INTEGER maxsize of queue --taskdb TEXT database url for taskdb, default: sqlite --projectdb TEXT database url for projectdb, default: sqlite --resultdb TEXT database url for resultdb, default: sqlite --message-queue TEXT connection url to message queue, default: builtin multiprocessing.Queue --amqp-url TEXT [deprecated] amqp url for rabbitmq. please use --message-queue instead. --beanstalk TEXT [deprecated] beanstalk config for beanstalk queue. please use --message-queue instead. --phantomjs-proxy TEXT phantomjs proxy ip:port --data-path TEXT data dir path --version Show the version and exit. --help Show this message and exit. ``` #### --config Config file is a JSON file with config values for global options or subcommands (a sub-dict named after subcommand). [example](/Deployment/#configjson) ``` json { "taskdb": "mysql+taskdb://username:password@host:port/taskdb", "projectdb": "mysql+projectdb://username:password@host:port/projectdb", "resultdb": "mysql+resultdb://username:password@host:port/resultdb", "message_queue": "amqp://username:password@host:port/%2F", "webui": { "username": "some_name", "password": "some_passwd", "need-auth": true } } ``` #### --queue-maxsize Queue size limit, 0 for not limit #### --taskdb, --projectdb, --resultdb ``` mysql: mysql+type://user:passwd@host:port/database sqlite: # relative path sqlite+type:///path/to/database.db # absolute path sqlite+type:////path/to/database.db # memory database sqlite+type:// mongodb: mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]] more: http://docs.mongodb.org/manual/reference/connection-string/ couchdb: couchdb+type://[username:password@]host[:port] sqlalchemy: sqlalchemy+postgresql+type://user:passwd@host:port/database 
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html local: local+projectdb://filepath,filepath type: should be one of `taskdb`, `projectdb`, `resultdb`. ``` #### --message-queue ``` rabbitmq: amqp://username:password@host:5672/%2F see https://www.rabbitmq.com/uri-spec.html redis: redis://host:6379/db redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode) kombu: kombu+transport://userid:password@hostname:port/virtual_host see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls builtin: None ``` #### --phantomjs-proxy The phantomjs proxy address, you need a phantomjs installed and running phantomjs proxy with command: [`pyspider phantomjs`](#phantomjs). #### --data-path SQLite database and counter dump files saved path all --- ``` Usage: pyspider all [OPTIONS] Run all the components in subprocess or thread Options: --fetcher-num INTEGER instance num of fetcher --processor-num INTEGER instance num of processor --result-worker-num INTEGER instance num of result worker --run-in [subprocess|thread] run each components in thread or subprocess. always using thread for windows. --help Show this message and exit. ``` one --- ``` Usage: pyspider one [OPTIONS] [SCRIPTS]... One mode not only means all-in-one, it runs every thing in one process over tornado.ioloop, for debug purpose Options: -i, --interactive enable interactive mode, you can choose crawl url. --phantomjs enable phantomjs, will spawn a subprocess for phantomjs --help Show this message and exit. ``` **NOTE: WebUI is not running in one mode.** In `one` mode, results will be written to stdout by default. You can capture them via `pyspider one > result.txt`. #### [SCRIPTS] The script file path of projects. 
Project status is RUNNING, `rate` and `burst` can be set via script comments: ``` # rate: 1.0 # burst: 3 ``` When SCRIPTS is set, `taskdb` and `resultdb` will use a in-memory sqlite db by default (can be overridden by global config `--taskdb`, `--resultdb`). on_start callback will be triggered on start. #### -i, --interactive With interactive mode, pyspider will start an interactive console asking what to do in next loop of process. In the console, you can use: ``` python crawl(url, project=None, **kwargs) Crawl given url, same parameters as BaseHandler.crawl url - url or taskid, parameters will be used if in taskdb project - can be omitted if only one project exists. quit_interactive() Quit interactive mode quit_pyspider() Close pyspider ``` You can use `pyspider.libs.utils.python_console()` to open an interactive console in your script. bench ----- ``` Usage: pyspider bench [OPTIONS] Run Benchmark test. In bench mode, in-memory sqlite database is used instead of on-disk sqlite database. Options: --fetcher-num INTEGER instance num of fetcher --processor-num INTEGER instance num of processor --result-worker-num INTEGER instance num of result worker --run-in [subprocess|thread] run each components in thread or subprocess. always using thread for windows. --total INTEGER total url in test page --show INTEGER show how many urls in a page --help Show this message and exit. ``` scheduler --------- ``` Usage: pyspider scheduler [OPTIONS] Run Scheduler, only one scheduler is allowed. Options: --xmlrpc / --no-xmlrpc --xmlrpc-host TEXT --xmlrpc-port INTEGER --inqueue-limit INTEGER size limit of task queue for each project, tasks will been ignored when overflow --delete-time INTEGER delete time before marked as delete --active-tasks INTEGER active log size --loop-limit INTEGER maximum number of tasks due with in a loop --scheduler-cls TEXT scheduler class to be used. --help Show this message and exit. 
``` #### --scheduler-cls set this option to use customized Scheduler class phantomjs --------- ``` Usage: run.py phantomjs [OPTIONS] [ARGS]... Run phantomjs fetcher if phantomjs is installed. Options: --phantomjs-path TEXT phantomjs path --port INTEGER phantomjs port --auto-restart TEXT auto restart phantomjs if crashed --help Show this message and exit. ``` #### ARGS Addition args pass to phantomjs command line. fetcher ------- ``` Usage: pyspider fetcher [OPTIONS] Run Fetcher. Options: --xmlrpc / --no-xmlrpc --xmlrpc-host TEXT --xmlrpc-port INTEGER --poolsize INTEGER max simultaneous fetches --proxy TEXT proxy host:port --user-agent TEXT user agent --timeout TEXT default fetch timeout --fetcher-cls TEXT Fetcher class to be used. --help Show this message and exit. ``` #### --proxy Default proxy used by fetcher, can been override by `self.crawl` option. [DOC](apis/self.crawl/#fetch) processor --------- ``` Usage: pyspider processor [OPTIONS] Run Processor. Options: --processor-cls TEXT Processor class to be used. --help Show this message and exit. ``` result_worker ------------- ``` Usage: pyspider result_worker [OPTIONS] Run result worker. Options: --result-cls TEXT ResultWorker class to be used. --help Show this message and exit. ``` webui ----- ``` Usage: pyspider webui [OPTIONS] Run WebUI Options: --host TEXT webui bind to host --port INTEGER webui bind to host --cdn TEXT js/css cdn server --scheduler-rpc TEXT xmlrpc path of scheduler --fetcher-rpc TEXT xmlrpc path of fetcher --max-rate FLOAT max rate for each project --max-burst FLOAT max burst for each project --username TEXT username of lock -ed projects --password TEXT password of lock -ed projects --need-auth need username and password --webui-instance TEXT webui Flask Application instance to be used. --help Show this message and exit. ``` #### --cdn JS/CSS libs CDN service, URL must compatible with [cdnjs](https://cdnjs.com/) #### --fetcher-rpc XML-RPC path URI for fetcher XMLRPC server. 
If not set, use a Fetcher instance. #### --need-auth If true, all pages require username and password specified via `--username` and `--password`. ================================================ FILE: docs/Deployment-demo.pyspider.org.md ================================================ Deployment of demo.pyspider.org =============================== [demo.pyspider.org](http://demo.pyspider.org/) is running on three VPSs connected together with private network using [tinc](http://www.tinc-vpn.org/). 1vCore 4GB RAM | 1vCore 2GB RAM * 2 ---------------|---------------- database
message queue
scheduler | phantomjs * 2
phantomjs-lb * 1
fetcher * 1
fetcher-lb * 1
processor * 2
result-worker * 1
webui * 4
webui-lb * 1
nginx * 1
All components are running inside docker containers. database / message queue / scheduler ------------------------------------ The database is postgresql and the message queue is redis. Scheduler may have a lot of database operations, it's better to put it close to the database. ```bash docker run --name postgres -v /data/postgres/:/var/lib/postgresql/data -d -p $LOCAL_IP:5432:5432 -e POSTGRES_PASSWORD="" postgres docker run --name redis -d -p $LOCAL_IP:6379:6379 redis docker run --name scheduler -d -p $LOCAL_IP:23333:23333 --restart=always binux/pyspider \ --taskdb "sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb" \ --resultdb "sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb" \ --projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" \ --message-queue "redis://10.21.0.7:6379/1" \ scheduler --inqueue-limit 5000 --delete-time 43200 ``` other components ---------------- fetcher, processor, result_worker are running on two boxes with same configuration managed with [docker-compose](https://docs.docker.com/compose/). 
```yaml phantomjs: image: 'binux/pyspider:latest' command: phantomjs cpu_shares: 512 environment: - 'EXCLUDE_PORTS=5000,23333,24444' expose: - '25555' mem_limit: 512m restart: always phantomjs-lb: image: 'dockercloud/haproxy:latest' links: - phantomjs restart: always fetcher: image: 'binux/pyspider:latest' command: '--message-queue "redis://10.21.0.7:6379/1" --phantomjs-proxy "phantomjs:80" fetcher --xmlrpc' cpu_shares: 512 environment: - 'EXCLUDE_PORTS=5000,25555,23333' links: - 'phantomjs-lb:phantomjs' mem_limit: 128m restart: always fetcher-lb: image: 'dockercloud/haproxy:latest' links: - fetcher restart: always processor: image: 'binux/pyspider:latest' command: '--projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" --message-queue "redis://10.21.0.7:6379/1" processor' cpu_shares: 512 mem_limit: 256m restart: always result-worker: image: 'binux/pyspider:latest' command: '--taskdb "sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb" --projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" --resultdb "sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb" --message-queue "redis://10.21.0.7:6379/1" result_worker' cpu_shares: 512 mem_limit: 256m restart: always webui: image: 'binux/pyspider:latest' command: '--taskdb "sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb" --projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" --resultdb "sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb" --message-queue "redis://10.21.0.7:6379/1" webui --max-rate 0.2 --max-burst 3 --scheduler-rpc "http://o4.i.binux.me:23333/" --fetcher-rpc "http://fetcher/"' cpu_shares: 512 environment: - 'EXCLUDE_PORTS=24444,25555,23333' links: - 'fetcher-lb:fetcher' mem_limit: 256m restart: always webui-lb: image: 'dockercloud/haproxy:latest' links: - webui restart: always nginx: image: 'nginx' links: - 'webui-lb:HAPROXY' ports: - '0.0.0.0:80:80' volumes: - /home/binux/nfs/profile/nginx/nginx.conf:/etc/nginx/nginx.conf - 
/home/binux/nfs/profile/nginx/conf.d/:/etc/nginx/conf.d/ restart: always ``` With the config, you can change the scale by `docker-compose scale phantomjs=2 processor=2 webui=4` when you need. #### load balance phantomjs-lb, fetcher-lb, webui-lb are automatically configured haproxy instances that allow any number of upstreams. #### phantomjs phantomjs has a memory leak issue, so a memory limit is applied, and it's recommended to restart it every hour. #### fetcher fetcher is implemented with async IO; it supports 100 concurrent connections. If the upstream queues are not choked, one fetcher should be enough. #### processor processor is a CPU-bound component; the recommended number of instances is number of CPU cores + 1~2 or CPU cores * 10%~15% when you have more than 20 cores. #### result-worker If you didn't override result-worker, it only writes results into the database, and should be very fast. ================================================ FILE: docs/Deployment.md ================================================ Deployment =========== Since pyspider has various components, you can just run `pyspider` to start a standalone instance that needs no third-party services, or use MySQL or MongoDB and RabbitMQ to deploy a distributed crawl cluster. To deploy pyspider in a production environment, running each component in its own process and storing data in a database service is more reliable and flexible. Installation ------------ To deploy pyspider components in separate processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them. You also need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as the message queue. `pip install --allow-all-external pyspider[all]` > Even if you had installed pyspider using `pip` before.
Installing with `pyspider[all]` is necessary to install the requirements for MySQL/MongoDB/RabbitMQ. If you are using Ubuntu, try: ``` apt-get install python python-dev python-distribute python-pip libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml ``` to install binary packages. Deployment ---------- **This document is based on MySQL + RabbitMQ** ### config.json Although you can use the command line to specify the parameters, a config file is a better choice. ``` { "taskdb": "mysql+taskdb://username:password@host:port/taskdb", "projectdb": "mysql+projectdb://username:password@host:port/projectdb", "resultdb": "mysql+resultdb://username:password@host:port/resultdb", "message_queue": "amqp://username:password@host:port/%2F", "webui": { "username": "some_name", "password": "some_passwd", "need-auth": true } } ``` You can get the complete options by running `pyspider --help` and `pyspider webui --help` for subcommands. `"webui"` in the JSON holds the configs for that subcommand. You can add parameters for other components in a similar way. #### Database Connection URI `"taskdb"`, `"projectdb"`, `"resultdb"` use a database connection URI with the format below: ``` mysql: mysql+type://user:passwd@host:port/database sqlite: # relative path sqlite+type:///path/to/database.db # absolute path sqlite+type:////path/to/database.db # memory database sqlite+type:// mongodb: mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]] more: http://docs.mongodb.org/manual/reference/connection-string/ couchdb: couchdb+type://[username:password@]host[:port][?options] sqlalchemy: sqlalchemy+postgresql+type://user:passwd@host:port/database sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html local: local+projectdb://filepath,filepath type: should be one of `taskdb`, `projectdb`, `resultdb`.
``` #### Message Queue URL You can use connection URL to specify the message queue: ``` rabbitmq: amqp://username:password@host:5672/%2F Refer: https://www.rabbitmq.com/uri-spec.html redis: redis://host:6379/db redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode) builtin: None ``` > Hint for postgresql: you need to create database with encoding utf8 by your own. pyspider will not create database for you. running ------- You should run components alone with subcommands. You may add `&` after command to make it running in background and use [screen](http://linux.die.net/man/1/screen) or [nohup](http://linux.die.net/man/1/nohup) to prevent exit after your ssh session ends. **It's recommended to manage components with [Supervisor](http://supervisord.org/).** ``` # start **only one** scheduler instance pyspider -c config.json scheduler # phantomjs pyspider -c config.json phantomjs # start fetcher / processor / result_worker instances as many as your needs pyspider -c config.json --phantomjs-proxy="localhost:25555" fetcher pyspider -c config.json processor pyspider -c config.json result_worker # start webui, set `--scheduler-rpc` if scheduler is not running on the same host as webui pyspider -c config.json webui ``` Running with Docker ------------------- [Running pyspider with Docker](Running-pyspider-with-Docker) Deployment of demo.pyspider.org ------------------------------- [Deployment of demo.pyspider.org](Deployment-demo.pyspider.org) ================================================ FILE: docs/Frequently-Asked-Questions.md ================================================ Frequently Asked Questions ========================== Does pyspider Work with Windows? -------------------------------- Yes, it should, some users have made it work on Windows. But as I don't have windows development environment, I cannot test. Only some tips for users who want to use pyspider on Windows: - Some package needs binary libs (e.g. 
pycurl, lxml) that you may not be able to install from pip; Windows binary packages can be found at [http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/). - Make a clean environment with [virtualenv](https://virtualenv.readthedocs.org/en/latest/) - Try the 32bit version of Python, especially if you are facing crash issues. - Avoid using Python 3.4.1 ([#194](https://github.com/binux/pyspider/issues/194), [#217](https://github.com/binux/pyspider/issues/217)) Unreadable Code (乱码) Returned from Phantomjs --------------------------------------------- Phantomjs doesn't support gzip; don't set the `Accept-Encoding` header with `gzip`. How to Delete a Project? ------------------------ Set `group` to `delete` and `status` to `STOP`, then wait 24 hours. You can change the time before a project is deleted via `scheduler.DELETE_TIME`. How to Restart a Project? ------------------------- #### Why This happens after you have modified a script and want to crawl everything again with the new strategy. But as the [age](/apis/self.crawl/#age) of the urls has not expired, Scheduler will discard all of the new requests. #### Solution 1. Create a new project. 2. Use an [itag](/apis/self.crawl/#itag) within `Handler.crawl_config` to specify the version of your script. How to Use WebDAV Mode? ----------------------- Mount `http://hostname/dav/` to your filesystem, then edit or create scripts with your favourite editor. > OSX: `mount_webdav http://hostname/dav/ /Volumes/dav` > Linux: Install davfs2, `mount.davfs http://hostname/dav/ /mnt/dav` > VIM: `vim http://hostname/dav/script_name.py` When you are editing a script without WebUI, you need to change it to `WebDAV Mode` while debugging. After you have saved the script in your editor, WebUI can load and use the latest script to debug your code. What does the progress bar mean on the dashboard? ------------------------------------------------- When you move the mouse onto the progress bar, you can see the explanations.
For the 5m, 1h and 1d progress bars, the numbers are the events triggered within 5m, 1h and 1d. For the `all` progress bar, they are the total number of tasks in the corresponding status. Only the tasks in DEBUG/RUNNING status will show the progress. How many scheduler/fetcher/processor/result_worker do I need? or pyspider stop working -------------------------------------------------------------------------------------- You can only have one scheduler, and multiple fetchers/processors/result_workers depending on the bottleneck. You can use the queue status on the dashboard to view the bottleneck of the system: ![run one step](imgs/queue_status.png) For example, the number between scheduler and fetcher indicates the size of the queue from scheduler to fetchers. When it hits 100 (the default maximum queue size), the fetcher might have crashed, or you should consider adding more fetchers. The number `0+0` below fetcher indicates the queue size of new tasks and status packs between processors and scheduler. You can put your mouse over the numbers to see the tips. ================================================ FILE: docs/Quickstart.md ================================================ Quickstart ========== Installation ------------ * `pip install pyspider` * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) If you are using ubuntu, try: ``` apt-get install python python-dev python-distribute python-pip \ libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml \ libssl-dev zlib1g-dev ``` to install binary packages first. Please install PhantomJS if needed: http://phantomjs.org/build.html Note that PhantomJS will be enabled only if it is executable in the `PATH` or in the System Environment. **Note:** The `pyspider` command runs pyspider in `all` mode, which runs components in threads or subprocesses. For production environments, please refer to [Deployment](Deployment). **WARNING:** WebUI is open to the public by default; it can be used to execute any command, which may harm your system.
Please use it in internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config). Your First Script ----------------- ```python from pyspider.libs.base_handler import * class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl('http://scrapy.org/', callback=self.index_page) @config(age=10 * 24 * 60 * 60) def index_page(self, response): for each in response.doc('a[href^="http"]').items(): self.crawl(each.attr.href, callback=self.detail_page) @config(priority=2) def detail_page(self, response): return { "url": response.url, "title": response.doc('title').text(), } ``` > * `def on_start(self)` is the entry point of the script. It will be called when you click the `run` button on dashboard. > * [`self.crawl(url, callback=self.index_page)`*](/apis/self.crawl) is the most important API here. It will add a new task to be crawled. Most of the options will be spicified via `self.crawl` arguments. > * `def index_page(self, response)` get a [`Response`*](/apis/Response) object. [`response.doc`*](/apis/Response/#responsedoc) is a [pyquery](https://pythonhosted.org/pyquery/) object which has jQuery-like API to select elements to be extracted. > * `def detail_page(self, response)` return a `dict` object as result. The result will be captured into `resultdb` by default. You can override `on_result(self, result)` method to manage the result yourself. More things you may want to know: > * [`@every(minutes=24*60, seconds=0)`*](/apis/@every/) is a helper to tell the scheduler that `on_start` method should be called everyday. > * [`@config(age=10 * 24 * 60 * 60)`*](/apis/self.crawl/#configkwargs) specified the default `age` parameter of `self.crawl` with page type `index_page` (when `callback=self.index_page`). The parameter [`age`*](/apis/self.crawl/#age) can be specified via `self.crawl(url, age=10*24*60*60)` (highest priority) and `crawl_config` (lowest priority). 
> * [`age=10 * 24 * 60 * 60`*](/apis/self.crawl/#age) tell scheduler discard the request if it have been crawled in 10 days. pyspider will not crawl a same URL twice by default (discard forever), even you had modified the code, it's very common for beginners that runs the project the first time and modified it and run it the second time, it will not crawl again (read [`itag`](/apis/self.crawl/#itag) for solution) > * [`@config(priority=2)`*](/apis/self.crawl/#schedule) mark that detail pages should be crawled first. You can test your script step by step by click the green `run` button. Switch to `follows` panel, click the play button to move on. ![run one step](imgs/run_one_step.png) Start Running ------------- 1. Save your script. 2. Back to dashboard find your project. 3. Changing the `status` to `DEBUG` or `RUNNING`. 4. Click the `run` button. ![index demo](imgs/index_page.png) Your script is running now! ================================================ FILE: docs/Running-pyspider-with-Docker.md ================================================ ```shell # mysql docker run --name mysql -d -v /data/mysql:/var/lib/mysql -e MYSQL_ALLOW_EMPTY_PASSWORD=yes mysql:latest # rabbitmq docker run --name rabbitmq -d rabbitmq:latest # phantomjs docker run --name phantomjs -d binux/pyspider:latest phantomjs # result worker docker run --name result_worker -m 128m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest result_worker # processor, run multiple instance if needed. docker run --name processor -m 256m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest processor # fetcher, run multiple instance if needed. 
docker run --name fetcher -m 256m -d --link phantomjs:phantomjs --link rabbitmq:rabbitmq binux/pyspider:latest fetcher --no-xmlrpc # scheduler docker run --name scheduler -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest scheduler # webui docker run --name webui -m 256m -d -p 5000:5000 --link mysql:mysql --link rabbitmq:rabbitmq --link scheduler:scheduler --link phantomjs:phantomjs binux/pyspider:latest webui ``` or running with [Docker Compose](https://docs.docker.com/compose/) with `docker-compose.yml`: NOTE: It's recommended to run mysql and rabbitmq outside compose as they may not been restarted with pyspider. You can find commands to start mysql and rabbitmq service above. ``` phantomjs: image: binux/pyspider:latest command: phantomjs result: image: binux/pyspider:latest external_links: - mysql - rabbitmq command: result_worker processor: image: binux/pyspider:latest external_links: - mysql - rabbitmq command: processor fetcher: image: binux/pyspider:latest external_links: - rabbitmq links: - phantomjs command : fetcher scheduler: image: binux/pyspider:latest external_links: - mysql - rabbitmq command: scheduler webui: image: binux/pyspider:latest external_links: - mysql - rabbitmq links: - scheduler - phantomjs command: webui ports: - "5000:5000" ``` `docker-compose up` ================================================ FILE: docs/Script-Environment.md ================================================ Script Environment ================== Variables --------- * `self.project_name` * `self.project` information about current project * `self.response` * `self.task` About Script ------------ * The name of `Handler` is not matters, but you need at least one class inherit from `BaseHandler` * A third parameter can be set to get task object: `def callback(self, response, task)` * Non-200 response will not submit to callback by default. 
Use `@catch_status_code_error` About Environment ----------------- * `logging`, `print` and exceptions will be captured. * You can import other projects as module with `from projects import some_project` ### Web view * view the page as a browser would render (approximately) ### HTML view * view the HTML of the current callback (index_page, detail_page, etc.) ### Follows view * view the callbacks that can be made from the current callback * index_page follows view will show the detail_page callbacks that can be executed. ### Messages view * shows the messages send by [`self.send_message`](apis/self.send_message) API. ### Enable CSS Selector Helper * Enable a CSS Selector Helper of the Web view. It gets the CSS Selector of the element you clicked then add it to your script. ================================================ FILE: docs/Working-with-Results.md ================================================ Working with Results ==================== Downloading and viewing your data from WebUI is convenient, but may not suitable for computer. Working with ResultDB --------------------- Although resultdb is only designed for result preview, not suitable for large scale storage. But if you want to grab data from resultdb, there are some simple snippets using database API that can help you to connect and select the data. ``` from pyspider.database import connect_database resultdb = connect_database("") for project in resultdb.projects: for result in resultdb.select(project): assert result['taskid'] assert result['url'] assert result['result'] ``` The `result['result']` is the object submitted by `return` statement from your script. Working with ResultWorker ------------------------- In product environment, you may want to connect pyspider to your system / post-processing pipeline, rather than store it into resultdb. It's highly recommended to override ResultWorker. 
``` from pyspider.result import ResultWorker class MyResultWorker(ResultWorker): def on_result(self, task, result): assert task['taskid'] assert task['project'] assert task['url'] assert result # your processing code goes here ``` `result` is the object submitted by `return` statement from your script. You can put this script (e.g., `my_result_worker.py`) at the folder where you launch pyspider. Add argument for `result_worker` subcommand: `pyspider result_worker --result-cls=my_result_worker.MyResultWorker` Or ``` { ... "result_worker": { "result_cls": "my_result_worker.MyResultWorker" } ... } ``` if you are using config file. [Please refer to Deployment](/Deployment) Design Your Own Database Schema ------------------------------- The results stored in database is encoded as JSON for compatibility. It's highly recommended to design your own database, and override the ResultWorker described above. TIPS about Results ------------------- #### Want to return more than one result in callback? As resultdb de-duplicate results by taskid(url), the latest will overwrite previous results. One workaround is using `send_message` API to make a `fake` taskid for each result. ``` def detail_page(self, response): for li in response.doc('li').items(): self.send_message(self.project_name, { ... }, url=response.url+"#"+li('a.product-sku').text()) def on_message(self, project, msg): return msg ``` See Also: [apis/self.send_message](/apis/self.send_message) ================================================ FILE: docs/apis/@catch_status_code_error.md ================================================ @catch_status_code_error ======================== non-200 response will been regarded as fetch failed and will not pass to callback. use this decorator to override this feature. ```python def on_start(self): self.crawl('http://httpbin.org/status/404', self.callback) @catch_status_code_error def callback(self, response): ... 
``` > Without the decorator, the `callback` would not be executed as the request failed (with status code 404). With the `@catch_status_code_error` decorator, the `callback` would be executed even if the request failed. ================================================ FILE: docs/apis/@every.md ================================================ @every(minutes=0, seconds=0) ============================ The method will be called every `minutes` or `seconds` ```python @every(minutes=24 * 60) def on_start(self): for url in urllist: self.crawl(url, callback=self.index_page) ``` The urls would be restarted every 24 hours. Note that, if `age` is also used and the period is longer than `@every`, the crawl request would be discarded as it's regarded as not changed: ```python @every(minutes=24 * 60) def on_start(self): self.crawl('http://www.example.org/', callback=self.index_page) @config(age=10 * 24 * 60 * 60) def index_page(self): ... ``` > Even though the crawl request is triggered every day, it's discarded and only restarted every 10 days. ================================================ FILE: docs/apis/Response.md ================================================ Response ======== The attributes of Response object. ### Response.url final URL. ### Response.text Content of response, in unicode. If `Response.encoding` is None and `chardet` module is available, encoding of content will be guessed. ### Response.content Content of response, in bytes. ### Response.doc A [PyQuery](https://pythonhosted.org/pyquery/) object of the response's content. Links are made absolute by default. Refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/) It's important, so I will repeat: refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/) ### Response.etree A [lxml](http://lxml.de/) object of the response's content. ### Response.json The JSON-encoded content of the response, if any.
### Response.status_code ### Response.orig_url If there is any redirection during the request, here is the url you just submit via `self.crawl`. ### Response.headers A case insensitive dict holds the headers of response. ### Response.cookies ### Response.error Messages when fetch error ### Response.time Time used during fetching. ### Response.ok True if `status_code` is 200 and no error. ### Response.encoding Encoding of Response.content. If Response.encoding is None, encoding will be guessed by header or content or `chardet`(if available). Set encoding of content manually will overwrite the guessed encoding. ### Response.save The object saved by [`self.crawl`](/apis/self.crawl/#save) API ### Response.js_script_result content returned by JS script ### Response.raise_for_status() Raise HTTPError if status code is not 200 or `Response.error` exists. ================================================ FILE: docs/apis/index.md ================================================ API Reference ============= - [self.crawl](self.crawl) - [Response](Response) - [self.send_message](self.send_message) - [@every](@every) - [@catch_status_code_error](@catch_status_code_error) ================================================ FILE: docs/apis/self.crawl.md ================================================ self.crawl =========== self.crawl(url, **kwargs) ------------------------- `self.crawl` is the main interface to tell pyspider which url(s) should be crawled. ### Parameters: ##### url the url or url list to be crawled. ##### callback the method to parse the response. _default: `__call__` _ ```python def on_start(self): self.crawl('http://scrapy.org/', callback=self.index_page) ``` the following parameters are optional ##### age the period of validity of the task. The page would be regarded as not modified during the period. _default: -1(never recrawl)_ ```python @config(age=10 * 24 * 60 * 60) def index_page(self, response): ... 
``` > Every pages parsed by the callback `index_page` would be regarded not changed within 10 days. If you submit the task within 10 days since last crawled it would be discarded. ##### priority the priority of task to be scheduled, higher the better. _default: 0_ ```python def index_page(self): self.crawl('http://www.example.org/page2.html', callback=self.index_page) self.crawl('http://www.example.org/233.html', callback=self.detail_page, priority=1) ``` > The page `233.html` would be crawled before `page2.html`. Use this parameter can do a [BFS](http://en.wikipedia.org/wiki/Breadth-first_search) and reduce the number of tasks in queue(which may cost more memory resources). ##### exetime the executed time of task in unix timestamp. _default: 0(immediately)_ ```python import time def on_start(self): self.crawl('http://www.example.org/', callback=self.callback, exetime=time.time()+30*60) ``` > The page would be crawled 30 minutes later. ##### retries retry times while failed. _default: 3_ ##### itag a marker from frontier page to reveal the potential modification of the task. It will be compared to its last value, recrawl when it's changed. _default: None_ ```python def index_page(self, response): for item in response.doc('.item').items(): self.crawl(item.find('a').attr.url, callback=self.detail_page, itag=item.find('.update-time').text()) ``` > In the sample, `.update-time` is used as itag. If it's not changed, the request would be discarded. Or you can use `itag` with `Handler.crawl_config` to specify the script version if you want to restart all of the tasks. ```python class Handler(BaseHandler): crawl_config = { 'itag': 'v223' } ``` > Change the value of itag after you modified the script and click run button again. It doesn't matter if not set before. ##### auto_recrawl when enabled, task would be recrawled every `age` time. 
_default: False_ ```python def on_start(self): self.crawl('http://www.example.org/', callback=self.callback, age=5*60*60, auto_recrawl=True) ``` > The page would be restarted every `age` (5 hours). ##### method HTTP method to use. _default: GET_ ##### params dictionary of URL parameters to append to the URL. ```python def on_start(self): self.crawl('http://httpbin.org/get', callback=self.callback, params={'a': 123, 'b': 'c'}) self.crawl('http://httpbin.org/get?a=123&b=c', callback=self.callback) ``` > The two requests are the same. ##### data the body to attach to the request. If a dictionary is provided, form-encoding will take place. ```python def on_start(self): self.crawl('http://httpbin.org/post', callback=self.callback, method='POST', data={'a': 123, 'b': 'c'}) ``` ##### files dictionary of `{field: {filename: 'content'}}` files to multipart upload. ##### user_agent the User-Agent of the request ##### headers dictionary of headers to send. ##### cookies dictionary of cookies to attach to this request. ##### connect_timeout timeout for initial connection in seconds. _default: 20_ ##### timeout maximum time in seconds to fetch the page. _default: 120_ ##### allow_redirects follow `30x` redirect _default: True_ ##### validate_cert For HTTPS requests, validate the server’s certificate? _default: True_ ##### proxy proxy server of `username:password@hostname:port` to use, only http proxy is supported currently. ```python class Handler(BaseHandler): crawl_config = { 'proxy': 'localhost:8080' } ``` > `Handler.crawl_config` can be used with `proxy` to set a proxy for the whole project. ##### etag use the HTTP ETag mechanism to skip processing if the content of the page has not changed. _default: True_ ##### last_modified use the HTTP Last-Modified header mechanism to skip processing if the content of the page has not changed. _default: True_ ##### fetch_type set to `js` to enable JavaScript fetcher.
_default: None_ ##### js_script JavaScript to run before or after the page is loaded; it should be wrapped in a function like `function() { document.write("binux"); }`. ```python def on_start(self): self.crawl('http://www.example.org/', callback=self.callback, fetch_type='js', js_script=''' function() { window.scrollTo(0,document.body.scrollHeight); return 123; } ''') ``` > The script would scroll the page to the bottom. The value returned by the function could be captured via `Response.js_script_result`. ##### js_run_at run JavaScript specified via `js_script` at `document-start` or `document-end`. _default: `document-end`_ ##### js_viewport_width/js_viewport_height set the size of the viewport for the JavaScript fetcher of the layout process. ##### load_images load images when JavaScript fetcher is enabled. _default: False_ ##### save an object passed to the callback method; it can be accessed via `response.save`. ```python def on_start(self): self.crawl('http://www.example.org/', callback=self.callback, save={'a': 123}) def callback(self, response): return response.save['a'] ``` > `123` would be returned in `callback` ##### taskid unique id to identify the task, default is the MD5 checksum of the URL, can be overridden by method `def get_taskid(self, task)` ```python import json from pyspider.libs.utils import md5string def get_taskid(self, task): return md5string(task['url']+json.dumps(task['fetch'].get('data', ''))) ``` > Only the url is MD5-hashed as taskid by default; the code above adds the `data` of a POST request as part of the taskid. ##### force_update force update task params even if the task is in `ACTIVE` status. ##### cancel cancel a task, should be used with `force_update` to cancel an active task. To cancel an `auto_recrawl` task, you should set `auto_recrawl=False` as well. cURL command ------------ `self.crawl(curl_command)` cURL is a command line tool to make an HTTP request. The command can easily be obtained from Chrome Devtools > Network panel: right click the request and choose "Copy as cURL".
You can use a cURL command as the first argument of `self.crawl`. It will parse the command and make the HTTP request just like curl does. @config(**kwargs) ----------------- default parameters of `self.crawl` when the decorated method is used as the callback. For example: ```python @config(age=15*60) def index_page(self, response): self.crawl('http://www.example.org/list-1.html', callback=self.index_page) self.crawl('http://www.example.org/product-233', callback=self.detail_page) @config(age=10*24*60*60) def detail_page(self, response): return {...} ``` `age` of `list-1.html` is 15 minutes while the `age` of `product-233` is 10 days. Because the callback of `product-233` is `detail_page`, it is treated as a `detail_page` and shares the config of `detail_page`. Handler.crawl_config = {} ------------------------- default parameters of `self.crawl` for the whole project. The parameters in `crawl_config` for scheduler (priority, retries, exetime, age, itag, force_update, auto_recrawl, cancel) will be joined when the task is created; the parameters for fetcher and processor will be joined when the task is executed. You can use this mechanism to change the fetch config (e.g. cookies) afterwards. ```python class Handler(BaseHandler): crawl_config = { 'headers': { 'User-Agent': 'GoogleBot', } } ... ``` > crawl_config sets a project-level user-agent. ================================================ FILE: docs/apis/self.send_message.md ================================================ self.send_message ================= self.send_message(project, msg, [url]) -------------------------------------- Send messages to another project. They can be received by the `def on_message(self, project, message)` callback. - `project` - other project name - `msg` - any json-able object - `url` - results will be overwritten if they have the same `taskid`. `send_message` calls share the same `taskid` by default. Change this to return multiple results from one response.
```python def detail_page(self, response): for i, each in enumerate(response.json['products']): self.send_message(self.project_name, { "name": each['name'], 'price': each['prices'], }, url="%s#%s" % (response.url, i)) def on_message(self, project, msg): return msg ``` pyspider send_message [OPTIONS] PROJECT MESSAGE ----------------------------------------------- You can also send message from command line. ``` Usage: pyspider send_message [OPTIONS] PROJECT MESSAGE Send Message to project from command line Options: --scheduler-rpc TEXT xmlrpc path of scheduler --help Show this message and exit. ``` def on_message(self, project, message) -------------------------------------- receive message from other project ================================================ FILE: docs/conf.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-11-10 01:31:54 import sys from unittest.mock import MagicMock from recommonmark.parser import CommonMarkParser class Mock(MagicMock): @classmethod def __getattr__(cls, name): return Mock() MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) source_parsers = { '.md': CommonMarkParser, } source_suffix = ['.rst', '.md'] ================================================ FILE: docs/index.md ================================================ pyspider [![Build Status][Build Status]][Travis CI] [![Coverage Status][Coverage Status]][Coverage] [![Try][Try]][Demo] ======== A Powerful Spider(Web Crawler) System in Python. 
**[TRY IT NOW!][Demo]** - Write script in Python - Powerful WebUI with script editor, task monitor, project manager and result viewer - [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue - Task priority, retry, periodical, recrawl by age, etc... - Distributed architecture, Crawl Javascript pages, Python 2&3, etc... Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases) Sample Code ----------- ```python from pyspider.libs.base_handler import * class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl('http://scrapy.org/', callback=self.index_page) @config(age=10 * 24 * 60 * 60) def index_page(self, response): for each in response.doc('a[href^="http"]').items(): self.crawl(each.attr.href, callback=self.detail_page) def detail_page(self, response): return { "url": response.url, "title": response.doc('title').text(), } ``` [![Demo][Demo Img]][Demo] Installation ------------ * `pip install pyspider` * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/) Contribute ---------- * Use It * Open [Issue], send PR * [User Group] * [中文问答](http://segmentfault.com/t/pyspider) TODO ---- ### v0.4.0 - [x] local mode, load script from file. 
- [x] works as a framework (all components running in one process, no threads) - [x] redis - [x] shell mode like `scrapy shell` - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) ### more - [x] edit script with vim via [WebDAV](http://en.wikipedia.org/wiki/WebDAV) License ------- Licensed under the Apache License, Version 2.0 [Build Status]: https://img.shields.io/travis/binux/pyspider/master.svg?style=flat [Travis CI]: https://travis-ci.org/binux/pyspider [Coverage Status]: https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat [Coverage]: https://coveralls.io/r/binux/pyspider [Try]: https://img.shields.io/badge/try-pyspider-blue.svg?style=flat [Demo]: http://demo.pyspider.org/ [Demo Img]: imgs/demo.png [Issue]: https://github.com/binux/pyspider/issues [User Group]: https://groups.google.com/group/pyspider-users ================================================ FILE: docs/tutorial/AJAX-and-more-HTTP.md ================================================ Level 2: AJAX and More HTTP =========================== In the last article, we discussed how to extract links and information from HTML documents. However, web contents are becoming more complicated using some technology like AJAX. You may find that page looks different with it in browser, the information you want to extract is not in the HTML of the page. In this article, we will not write complete scrape scripts, but some snippets of web page cases using the technology like AJAX or needs some HTTP parameters besides URL. AJAX ---- [AJAX] is short for asynchronous JavaScript + XML. AJAX is using existing standards to update parts of a web page without loading the whole page. A common usage of AJAX is loading [JSON] data and render to HTML on the client side. You may find elements missing in HTML fetched by pyspider or [wget](https://www.gnu.org/software/wget/). 
When you open it in browser some elements appear after page loaded with(maybe not) a 'loading' animation or words. For example, we want to scrape all channels of Dota 2 from [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) ![twitch](../imgs/twitch.png) But you may find nothing in the page. ### Finding the request As [AJAX] data is transferred in [HTTP], we can find the real request with the help of [Chrome Developer Tools](https://developer.chrome.com/devtools). 0. Open a new tab. 1. Use `Ctrl`+`Shift`+`I` (or `Cmd`+`Opt`+`I` on Mac) to open the DevTools. 2. Switch to Network panel. 3. Open the URL [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) in this tab. While resources are been loaded, you may find a table of requested resources. ![developer tools network](../imgs/developer-tools-network.png) AJAX is using [XMLHttpRequest](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest) object to send and retrieve data which is generally shorted as "XHR". Use Filter (funnel icon) to filter out the XHR requests. Glance over each requests using preview: ![find request](../imgs/search-for-request.png) To determine which one is the key request, you can use a filter to reduce the number of requests, guess the usage of the request by this path and parameters, then view the response contents for confirmation. Here we found the request: [http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1](http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1) Now, open the URL in a new tab, you would see a [JSON] data containing channel list. You can use a extension [JSONView](https://chrome.google.com/webstore/detail/jsonview/chklaanhfefbnpoihckbnefhakgolnmc) ([for Firfox](http://jsonview.com/)) to have a pretty printed view of JSON. 
A sample code is trying extract the name, current title and viewers of each channel. ``` class Handler(BaseHandler): @every(minutes=10) def on_start(self): self.crawl('http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1', callback=self.index_page) @config(age=10*60) def index_page(self, response): return [{ "name": x['channel']['display_name'], "viewers": x['viewers'], "status": x['channel'].get('status'), } for x in response.json['streams']] ``` > * You can use `response.json` to convert content to a python `dict` object. > * As channel list is changing frequently, we update it every 10 minutes and use [`@config(age=10*60)`](/apis/self.crawl/#configkwargs) to set the age. Otherwise, it will be ignored as scheduler thinks it's new enough and refuse to update the content. Here is an online demo for twitch as well as a measure using [PhantomJS] which will be discussed in the next level: [http://demo.pyspider.org/debug/tutorial_twitch](http://demo.pyspider.org/debug/tutorial_twitch) HTTP ---- [HTTP] is the protocol to exchange or transfer hypertext. We had used it in last article, we used `self.crawl` and a URL to fetch HTML content which is transferred by [HTTP]. When you got `403 Forbidden` or needed login. You need right parameters of HTTP request. 
A typical HTTP request message to [http://example.com/](http://example.com/) looks like: ``` GET / HTTP/1.1 Host: example.com Connection: keep-alive Cache-Control: max-age=0 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.45 Safari/537.36 Referer: http://en.wikipedia.org/wiki/Example.com Accept-Encoding: gzip, deflate, sdch Accept-Language: zh-CN,zh;q=0.8 If-None-Match: "359670651" If-Modified-Since: Fri, 09 Aug 2013 23:54:35 GMT ``` > * the first line contains [HTTP method](http://www.w3schools.com/tags/ref_httpmethods.asp), path and HTTP version > * several lines of request header fields in `key: value` format. > * if has message body(say POST request), an empty line and message body would be appended to end of request message. You can get this with [Chrome Developer Tools](https://developer.chrome.com/devtools) - Network panel we used in above section: ![request header](../imgs/request-headers.png) In most case, the last thing you need is to copy right URL + method + headers + body from Network panel. cURL command ------------ `self.crawl` supports `cURL` command as argument to make the HTTP request. It will parse the arguments in the command and use it as fetch parameters. With `Copy as cURL` of a request, you can get a `cURL` command and paste to `self.crawl(command)` to make crawling easy. HTTP Method ----------- [HTTP] defines methods to indicate the desired action to be performed on the identified resource. Two commonly used methods are: GET and POST. GET is when you open a URL, requests the content of a specified resource. POST is used to submit data to server. TODO: need example here. HTTP Headers ------------ [HTTP Headers](http://en.wikipedia.org/wiki/List_of_HTTP_header_fields) is a list of parameters of a request. 
Some headers you need to attention while scraping: ### User-Agent A [user agent string](http://en.wikipedia.org/wiki/User_agent_string) tell server the application type, operating system or software revision who send the HTTP request. pyspider's default user agent string is: `pyspider/VERSION (+http://pyspider.org/)` ### Referer [Referer](http://en.wikipedia.org/wiki/HTTP_referer) is the address of the previous webpage from which a link to the currently requested page was followed. Some website uses this in image resources to prevent deep linking. TODO: need example here. HTTP Cookie ----------- [HTTP Cookie](http://en.wikipedia.org/wiki/HTTP_cookie) is a field in HTTP headers used for tracking which user is making the request. Generally used for user login and prevent unauthorized requests. You can use [`self.crawl(cookies={"key": value})`](/apis/self.crawl/#fetch) to set cookie via a dict like API. TODO: need example here. [PhantomJS]: http://phantomjs.org/ [AJAX]: http://en.wikipedia.org/wiki/Ajax_%28programming%29 [JSON]: http://en.wikipedia.org/wiki/JSON [HTTP]: http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol ================================================ FILE: docs/tutorial/HTML-and-CSS-Selector.md ================================================ Level 1: HTML and CSS Selector ============================== In this tutorial, we will scrape information of movies and TV from [IMDb]. An online demo with completed code is: [http://demo.pyspider.org/debug/tutorial_imdb](http://demo.pyspider.org/debug/tutorial_imdb) . Before Start ------------ You should have pyspider installed. You can refer to the documentation [QuickStart](Quickstart). Or test your code on [demo.pyspider.org](http://demo.pyspider.org). Some basic knowledges you should know before scraping: * [Web][WWW] is a system of interlinked hypertext pages. * Pages is identified on the Web via uniform resource locator ([URL]). * Pages transferred via the Hypertext Transfer Protocol ([HTTP]). 
* Web Pages structured using HyperText Markup Language ([HTML]). To scrape information from a web is 1. Finding URLs of the pages contain the information we want. 2. Fetching the pages via HTTP. 3. Extracting the information from HTML. 4. Finding more URL contains what we want, go back to 2. Pick a start URL ---------------- As we want to get all of the movies on [IMDb], the first thing is finding a list. A good list page may: * containing links to the [movies](http://www.imdb.com/title/tt0167260/) as many as possible. * by following next page, you can traverse all of the movies. * list sorted by last updated time would be a great help to get latest movies. By looking around at the index page of [IMDb], I found this: ![IMDb front page](../imgs/tutorial_imdb_front.png) [http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1](http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1) ### Creating a project You can find "Create" on the bottom right of baseboard. Click and name a project. ![Creating a project](../imgs/creating_a_project.png) Changing the crawl URL in `on_start` callback: ``` @every(minutes=24 * 60) def on_start(self): self.crawl('http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1', callback=self.index_page) ``` > * `self.crawl` would fetch the page and call the `callback` method to parse the response. > * The [`@every` decorator](http://docs.pyspider.org/en/latest/apis/@every/) represents `on_start` would execute every day, to make sure not missing any new movies. 
Click the green `run` button, you should find a red 1 above follows, switch to follows panel, click the green play button: ![Run one step](../imgs/run_one_step.png) Index Page ---------- From [index page](http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1), we need extract two things: * links of the movies like `http://www.imdb.com/title/tt0167260/` * links of [Next](http://www.imdb.com/search/title?count=100&ref_=nv_ch_mm_1&start=101&title_type=feature,tv_series,tv_movie) page ### Find Movies As you can see, the sample handler had already extracted 1900+ links from the page. A measure of extracting movie pages is filtering links with regular expression: ``` import re ... def index_page(self, response): for each in response.doc('a[href^="http"]').items(): if re.match("http://www.imdb.com/title/tt\d+/$", each.attr.href): self.crawl(each.attr.href, callback=self.detail_page) ``` > * `callback` is `self.detail_page` here to use another callback method to parse. Remember you can always use the power of python or anything you are familiar with to extract information. But using tools like CSS selector is recommended. ### Next page #### CSS Selectors CSS selectors are patterns used by [CSS] to select HTML elements which are wanted to style. As elements containing information may have different style in document, It's appropriate to use CSS Selector to select elements we want. More information about CSS selectors could be found in above links: * [CSS Selectors](http://www.w3schools.com/css/css_selectors.asp) * [CSS Selector Reference](http://www.w3schools.com/cssref/css_selectors.asp) You can use CSS Selector with built-in `response.doc` object, which is provided by [PyQuery], you may find the full reference there. #### CSS Selector Helper pyspider provide a tool called `CSS selector helper` to make it easier to generate a selector pattern to element you clicked. 
Enable CSS selector helper by click the button and switch to `web` panel. ![CSS Selector helper](../imgs/css_selector_helper.png) The element will be highlighted in yellow while mouse over. When you click it, a pre-selected CSS Selector pattern is shown on the bar above. You can edit the features to locate the element and add it to your source code. click "Next »" in the page and add selector pattern to your code: ``` def index_page(self, response): for each in response.doc('a[href^="http"]').items(): if re.match("http://www.imdb.com/title/tt\d+/$", each.attr.href): self.crawl(each.attr.href, callback=self.detail_page) self.crawl(response.doc('#right a').attr.href, callback=self.index_page) ``` Click `run` again and move to the next page, we found that "« Prev" has the same selector pattern as "Next »". When using above code you may find pyspider selected the link of "« Prev", not "Next »". A solution for this is select both of them: ``` self.crawl([x.attr.href for x in response.doc('#right a').items()], callback=self.index_page) ``` Extracting Information ---------------------- Click `run` again and follow to detail page. Add keys you need to result dict and collect value using `CSS selector helper` repeatedly: ``` def detail_page(self, response): return { "url": response.url, "title": response.doc('.header > [itemprop="name"]').text(), "rating": response.doc('.star-box-giga-star').text(), "director": [x.text() for x in response.doc('[itemprop="director"] span').items()], } ``` Note that, `CSS Selector helper` may not always work. You could write selector pattern manually with tools like [Chrome Dev Tools](https://developer.chrome.com/devtools): ![inspect element](../imgs/inspect_element.png) You doesn't need to write every ancestral element in selector pattern, only the elements which can differentiate with not needed elements, is enough. However, it needs experience on scraping or Web developing to know which attribute is important, can be used as locator. 
You can also test CSS Selector in the JavaScript Console by using `$$` like `$$('[itemprop="director"] span')` Running ------- 1. After tested you code, don't forget to save it. 2. Back to dashboard find your project. 3. Changing the `status` to `DEBUG` or `RUNNING`. 4. Press the `run` button. ![index demo](../imgs/index_page.png) Notes ----- The script is just a simple, you may found more issues when scraping IMDb: * ref in list page url is for tracing user, it's better remove it. * IMDb does not serve more than 100000 results for any query, you need find more lists with lesser results, like [this](http://www.imdb.com/search/title?genres=action&title_type=feature&sort=moviemeter,asc) * You may need a list sorted by last updated time and update it with a shorter interval. * Some attribute is hard to extract, you may need write selector pattern on hand or using [XPATH](http://www.w3schools.com/xpath/xpath_syntax.asp) and/or some python code to extract information. [IMDb]: http://www.imdb.com/ [WWW]: http://en.wikipedia.org/wiki/World_Wide_Web [HTTP]: http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol [HTML]: http://en.wikipedia.org/wiki/HTML [URL]: http://en.wikipedia.org/wiki/Uniform_resource_locator [CSS]: https://developer.mozilla.org/en-US/docs/Web/Guide/CSS/Getting_Started/What_is_CSS [PyQuery]: https://pythonhosted.org/pyquery/ ================================================ FILE: docs/tutorial/Render-with-PhantomJS.md ================================================ Level 3: Render with PhantomJS ============================== Sometimes web page is too complex to find out the API request. It's time to meet the power of [PhantomJS]. To use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if excutable in the `PATH`. 
Make sure phantomjs is working by running ``` $ pyspider phantomjs ``` Continue with the rest of the tutorial if the output is ``` Web server running on port 25555 ``` Use PhantomJS ------------- When pyspider with PhantomJS connected, you can enable this feature by adding a parameter `fetch_type='js'` to `self.crawl`. We use PhantomJS to scrape channel list of [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) which is loaded with AJAX we discussed in [Level 2](tutorial/AJAX-and-more-HTTP#ajax): ``` class Handler(BaseHandler): def on_start(self): self.crawl('http://www.twitch.tv/directory/game/Dota%202', fetch_type='js', callback=self.index_page) def index_page(self, response): return { "url": response.url, "channels": [{ "title": x('.title').text(), "viewers": x('.info').contents()[2], "name": x('.info a').text(), } for x in response.doc('.stream.item').items()] } ``` > I used some API to handle the list of streams. You can find complete API reference from [PyQuery complete API](https://pythonhosted.org/pyquery/api.html) Running JavaScript on Page -------------------------- We will try to scrape images from [http://www.pinterest.com/categories/popular/](http://www.pinterest.com/categories/popular/) in this section. Only 25 images is shown at the beginning, more images would be loaded when you scroll to the bottom of the page. 
To scrape as many images as possible, we can use the [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function-wrapped JavaScript code that simulates the scroll action: ``` class Handler(BaseHandler): def on_start(self): self.crawl('http://www.pinterest.com/categories/popular/', fetch_type='js', js_script=""" function() { window.scrollTo(0,document.body.scrollHeight); } """, callback=self.index_page) def index_page(self, response): return { "url": response.url, "images": [{ "title": x('.richPinGridTitle').text(), "img": x('.pinImg').attr('src'), "author": x('.creditName').text(), } for x in response.doc('.item').items() if x('.pinImg')] } ``` > * The script is executed after the page has loaded (this can be changed via the [`js_run_at` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher)) > * We scroll once after the page has loaded; you can scroll multiple times using [`setTimeout`](https://developer.mozilla.org/en-US/docs/Web/API/WindowTimers.setTimeout). PhantomJS will fetch as many items as possible before the timeout is reached. Online demo: [http://demo.pyspider.org/debug/tutorial_pinterest](http://demo.pyspider.org/debug/tutorial_pinterest) [PhantomJS]: http://phantomjs.org/ ================================================ FILE: docs/tutorial/index.md ================================================ pyspider Tutorial ================= > The best way to learn how to scrape is learning how to make it. * [Level 1: HTML and CSS Selector](HTML-and-CSS-Selector) * [Level 2: AJAX and More HTTP](AJAX-and-more-HTTP) * [Level 3: Render with PhantomJS](Render-with-PhantomJS) If you have problems using pyspider, the [user group](https://groups.google.com/group/pyspider-users) is a place for discussion. ================================================ FILE: mkdocs.yml ================================================ site_name: pyspider site_description: A Powerful Spider(Web Crawler) System in Python.
site_author: binux repo_url: https://github.com/binux/pyspider pages: - Introduction: index.md - Quickstart: Quickstart.md - Command Line: Command-Line.md - Tutorial: - Index: tutorial/index.md - 'Level 1: HTML and CSS Selector': tutorial/HTML-and-CSS-Selector.md - 'Level 2: AJAX and More HTTP': tutorial/AJAX-and-more-HTTP.md - 'Level 3: Render with PhantomJS': tutorial/Render-with-PhantomJS.md - About pyspider: - Architecture: Architecture.md - About Tasks: About-Tasks.md - About Projects: About-Projects.md - Script Environment: Script-Environment.md - Working with Results: Working-with-Results.md - API Reference: - Index: apis/index.md - self.crawl: apis/self.crawl.md - Response: apis/Response.md - self.send_message: apis/self.send_message.md - '@catch_status_code_error': apis/@catch_status_code_error.md - '@every': apis/@every.md - Deployment: Deployment.md - Running pyspider with Docker: Running-pyspider-with-Docker.md - Deployment of demo.pyspider.org: Deployment-demo.pyspider.org.md - Frequently Asked Questions: Frequently-Asked-Questions.md theme: readthedocs markdown_extensions: ['toc(permalink=true)', ] ================================================ FILE: pyspider/__init__.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-17 19:17:12 __version__ = '0.4.0' ================================================ FILE: pyspider/database/__init__.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-10-08 15:04:08 import os, requests, json from six.moves.urllib.parse import urlparse, parse_qs def connect_database(url): """ create database object by url mysql: mysql+type://user:passwd@host:port/database sqlite: # relative path sqlite+type:///path/to/database.db # 
absolute path sqlite+type:////path/to/database.db # memory database sqlite+type:// mongodb: mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]] more: http://docs.mongodb.org/manual/reference/connection-string/ sqlalchemy: sqlalchemy+postgresql+type://user:passwd@host:port/database sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html redis: redis+taskdb://host:port/db elasticsearch: elasticsearch+type://host:port/?index=pyspider couchdb: couchdb+type://[username:password@]host[:port] local: local+projectdb://filepath,filepath type: taskdb projectdb resultdb """ db = _connect_database(url) db.copy = lambda: _connect_database(url) return db def _connect_database(url): # NOQA parsed = urlparse(url) scheme = parsed.scheme.split('+') if len(scheme) == 1: raise Exception('wrong scheme format: %s' % parsed.scheme) else: engine, dbtype = scheme[0], scheme[-1] other_scheme = "+".join(scheme[1:-1]) if dbtype not in ('taskdb', 'projectdb', 'resultdb'): raise LookupError('unknown database type: %s, ' 'type should be one of ["taskdb", "projectdb", "resultdb"]', dbtype) if engine == 'mysql': return _connect_mysql(parsed,dbtype) elif engine == 'sqlite': return _connect_sqlite(parsed,dbtype) elif engine == 'mongodb': return _connect_mongodb(parsed,dbtype,url) elif engine == 'sqlalchemy': return _connect_sqlalchemy(parsed, dbtype, url, other_scheme) elif engine == 'redis': if dbtype == 'taskdb': from .redis.taskdb import TaskDB return TaskDB(parsed.hostname, parsed.port, int(parsed.path.strip('/') or 0)) else: raise LookupError('not supported dbtype: %s', dbtype) elif engine == 'local': scripts = url.split('//', 1)[1].split(',') if dbtype == 'projectdb': from .local.projectdb import ProjectDB return ProjectDB(scripts) else: raise LookupError('not supported dbtype: %s', dbtype) elif engine == 'elasticsearch' or engine == 'es': return 
_connect_elasticsearch(parsed, dbtype) elif engine == 'couchdb': return _connect_couchdb(parsed, dbtype, url) else: raise Exception('unknown engine: %s' % engine) def _connect_mysql(parsed,dbtype): parames = {} if parsed.username: parames['user'] = parsed.username if parsed.password: parames['passwd'] = parsed.password if parsed.hostname: parames['host'] = parsed.hostname if parsed.port: parames['port'] = parsed.port if parsed.path.strip('/'): parames['database'] = parsed.path.strip('/') if dbtype == 'taskdb': from .mysql.taskdb import TaskDB return TaskDB(**parames) elif dbtype == 'projectdb': from .mysql.projectdb import ProjectDB return ProjectDB(**parames) elif dbtype == 'resultdb': from .mysql.resultdb import ResultDB return ResultDB(**parames) else: raise LookupError def _connect_sqlite(parsed,dbtype): if parsed.path.startswith('//'): path = '/' + parsed.path.strip('/') elif parsed.path.startswith('/'): path = './' + parsed.path.strip('/') elif not parsed.path: path = ':memory:' else: raise Exception('error path: %s' % parsed.path) if dbtype == 'taskdb': from .sqlite.taskdb import TaskDB return TaskDB(path) elif dbtype == 'projectdb': from .sqlite.projectdb import ProjectDB return ProjectDB(path) elif dbtype == 'resultdb': from .sqlite.resultdb import ResultDB return ResultDB(path) else: raise LookupError def _connect_mongodb(parsed,dbtype,url): url = url.replace(parsed.scheme, 'mongodb') parames = {} if parsed.path.strip('/'): parames['database'] = parsed.path.strip('/') if dbtype == 'taskdb': from .mongodb.taskdb import TaskDB return TaskDB(url, **parames) elif dbtype == 'projectdb': from .mongodb.projectdb import ProjectDB return ProjectDB(url, **parames) elif dbtype == 'resultdb': from .mongodb.resultdb import ResultDB return ResultDB(url, **parames) else: raise LookupError def _connect_sqlalchemy(parsed, dbtype,url, other_scheme): if not other_scheme: raise Exception('wrong scheme format: %s' % parsed.scheme) url = url.replace(parsed.scheme, 
other_scheme) if dbtype == 'taskdb': from .sqlalchemy.taskdb import TaskDB return TaskDB(url) elif dbtype == 'projectdb': from .sqlalchemy.projectdb import ProjectDB return ProjectDB(url) elif dbtype == 'resultdb': from .sqlalchemy.resultdb import ResultDB return ResultDB(url) else: raise LookupError def _connect_elasticsearch(parsed, dbtype): # in python 2.6 url like "http://host/?query", query will not been splitted if parsed.path.startswith('/?'): index = parse_qs(parsed.path[2:]) else: index = parse_qs(parsed.query) if 'index' in index and index['index']: index = index['index'][0] else: index = 'pyspider' if dbtype == 'projectdb': from .elasticsearch.projectdb import ProjectDB return ProjectDB([parsed.netloc], index=index) elif dbtype == 'resultdb': from .elasticsearch.resultdb import ResultDB return ResultDB([parsed.netloc], index=index) elif dbtype == 'taskdb': from .elasticsearch.taskdb import TaskDB return TaskDB([parsed.netloc], index=index) def _connect_couchdb(parsed, dbtype, url): if os.environ.get('COUCHDB_HTTPS'): url = "https://" + parsed.netloc + "/" else: url = "http://" + parsed.netloc + "/" params = {} # default to env, then url, then hard coded params['username'] = os.environ.get('COUCHDB_USER') or parsed.username params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password if dbtype == 'taskdb': from .couchdb.taskdb import TaskDB return TaskDB(url, **params) elif dbtype == 'projectdb': from .couchdb.projectdb import ProjectDB return ProjectDB(url, **params) elif dbtype == 'resultdb': from .couchdb.resultdb import ResultDB return ResultDB(url, **params) else: raise LookupError ================================================ FILE: pyspider/database/base/__init__.py ================================================ ================================================ FILE: pyspider/database/base/projectdb.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 
class ProjectDB(object):
    """Abstract interface every project database backend implements.

    The storage methods raise NotImplementedError here and are provided by
    the concrete backends; only the small helpers ``split_group`` and
    ``verify_project_name`` are implemented in this base class.
    """

    # known project status names
    status_str = [
        'TODO',
        'STOP',
        'CHECKING',
        'DEBUG',
        'RUNNING',
    ]

    def insert(self, name, obj={}):
        """Store a new project ``name`` with the fields in ``obj``."""
        raise NotImplementedError

    def update(self, name, obj={}, **kwargs):
        """Update project ``name`` with the fields in ``obj`` and ``kwargs``."""
        raise NotImplementedError

    def get_all(self, fields=None):
        """Return all projects, optionally restricted to ``fields``."""
        raise NotImplementedError

    def get(self, name, fields=None):
        """Return project ``name``, optionally restricted to ``fields``."""
        raise NotImplementedError

    def drop(self, name):
        """Remove project ``name``."""
        raise NotImplementedError

    def check_update(self, timestamp, fields=None):
        """Return the projects that changed relative to ``timestamp``."""
        raise NotImplementedError

    def split_group(self, group, lower=True):
        """Split a group string on runs of non-word characters.

        :param group: group string, ``None`` is treated as an empty string
        :param lower: lower-case the string before splitting
        :return: list of group names
        """
        if lower:
            return re.split(r"\W+", (group or '').lower())
        else:
            return re.split(r"\W+", group or '')

    def verify_project_name(self, name):
        """Return True if ``name`` is at most 64 word characters long."""
        if len(name) > 64:
            return False
        if re.search(r"[^\w]", name):
            return False
        return True

    def copy(self):
        '''
        database should be able to copy itself to create new connection

        it's implemented automatically by pyspider.database.connect_database
        if you are not create database connection via connect_database method,
        you should implement this
        '''
        raise NotImplementedError
project, taskid, url, result): raise NotImplementedError def select(self, project, fields=None, offset=0, limit=None): raise NotImplementedError def count(self, project): raise NotImplementedError def get(self, project, taskid, fields=None): raise NotImplementedError def drop(self, project): raise NotImplementedError def copy(self): ''' database should be able to copy itself to create new connection it's implemented automatically by pyspider.database.connect_database if you are not create database connection via connect_database method, you should implement this ''' raise NotImplementedError ================================================ FILE: pyspider/database/base/taskdb.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-08 10:28:48 # task schema { 'task': { 'taskid': str, # new, not change 'project': str, # new, not change 'url': str, # new, not change 'status': int, # change 'schedule': { 'priority': int, 'retries': int, 'retried': int, 'exetime': int, 'age': int, 'itag': str, # 'recrawl': int }, # new and restart 'fetch': { 'method': str, 'headers': dict, 'data': str, 'timeout': int, 'save': dict, }, # new and restart 'process': { 'callback': str, }, # new and restart 'track': { 'fetch': { 'ok': bool, 'time': int, 'status_code': int, 'headers': dict, 'encoding': str, 'content': str, }, 'process': { 'ok': bool, 'time': int, 'follows': int, 'outputs': int, 'logs': str, 'exception': str, }, 'save': object, # jsonable object saved by processor }, # finish 'lastcrawltime': int, # keep between request 'updatetime': int, # keep between request } } class TaskDB(object): ACTIVE = 1 SUCCESS = 2 FAILED = 3 BAD = 4 projects = set() # projects in taskdb def load_tasks(self, status, project=None, fields=None): raise NotImplementedError def get_task(self, project, taskid, fields=None): raise NotImplementedError def 
class BaseDB:

    '''
    BaseDB

    Small helper layer that builds SELECT / REPLACE / INSERT / UPDATE /
    DELETE statements and runs them on the cursor returned by ``dbcur``.

    dbcur should be overwritten by subclasses.
    '''
    __tablename__ = None
    placeholder = '%s'  # parameter marker understood by the DB driver
    maxlimit = -1  # row count used in LIMIT when only an offset is given

    @staticmethod
    def escape(string):
        """Quote an identifier (table or column name) with backticks."""
        return '`%s`' % string

    @property
    def dbcur(self):
        """Cursor used to execute statements; provided by subclasses."""
        raise NotImplementedError

    def _execute(self, sql_query, values=[]):
        """Execute ``sql_query`` with ``values`` and return the cursor."""
        dbcur = self.dbcur
        dbcur.execute(sql_query, values)
        return dbcur

    def _select(self, tablename=None, what="*", where="", where_values=[], offset=0, limit=None):
        """Run a SELECT and yield each row as the cursor returns it.

        ``what`` may be a list/tuple of column names (escaped here), None
        (all columns) or a ready-made SQL string that is used verbatim.
        """
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        elif offset:
            sql_query += " LIMIT %d, %d" % (offset, self.maxlimit)
        # the format string needs a placeholder for the query argument
        logger.debug("sql: %s", sql_query)

        for row in self._execute(sql_query, where_values):
            yield row

    def _select2dic(self, tablename=None, what="*", where="", where_values=[],
                    order=None, offset=0, limit=None):
        """Run a SELECT and yield each row as a ``{column: value}`` dict."""
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if order:
            sql_query += ' ORDER BY %s' % order
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        elif offset:
            sql_query += " LIMIT %d, %d" % (offset, self.maxlimit)
        logger.debug("sql: %s", sql_query)

        dbcur = self._execute(sql_query, where_values)

        # f[0] may return bytes type
        # https://github.com/mysql/mysql-connector-python/pull/37
        fields = [utils.text(f[0]) for f in dbcur.description]

        for row in dbcur:
            yield dict(zip(fields, row))

    def _replace(self, tablename=None, **values):
        """Run a REPLACE INTO with ``values`` and return ``lastrowid``."""
        tablename = self.escape(tablename or self.__tablename__)
        if values:
            _keys = ", ".join(self.escape(k) for k in values)
            _values = ", ".join([self.placeholder, ] * len(values))
            sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        else:
            sql_query = "REPLACE INTO %s DEFAULT VALUES" % tablename
        logger.debug("sql: %s", sql_query)

        if values:
            dbcur = self._execute(sql_query, list(itervalues(values)))
        else:
            dbcur = self._execute(sql_query)
        return dbcur.lastrowid

    def _insert(self, tablename=None, **values):
        """Run an INSERT INTO with ``values`` and return ``lastrowid``."""
        tablename = self.escape(tablename or self.__tablename__)
        if values:
            _keys = ", ".join((self.escape(k) for k in values))
            _values = ", ".join([self.placeholder, ] * len(values))
            sql_query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        else:
            sql_query = "INSERT INTO %s DEFAULT VALUES" % tablename
        logger.debug("sql: %s", sql_query)

        if values:
            dbcur = self._execute(sql_query, list(itervalues(values)))
        else:
            dbcur = self._execute(sql_query)
        return dbcur.lastrowid

    def _update(self, tablename=None, where="1=0", where_values=[], **values):
        """Run an UPDATE setting ``values`` on rows matching ``where``.

        The default ``where`` of ``1=0`` matches nothing, so a forgotten
        condition does not touch every row.
        """
        tablename = self.escape(tablename or self.__tablename__)
        _key_values = ", ".join([
            "%s = %s" % (self.escape(k), self.placeholder) for k in values
        ])
        sql_query = "UPDATE %s SET %s WHERE %s" % (tablename, _key_values, where)
        logger.debug("sql: %s", sql_query)

        return self._execute(sql_query, list(itervalues(values)) + list(where_values))

    def _delete(self, tablename=None, where="1=0", where_values=[]):
        """Run a DELETE on rows matching ``where`` (default matches nothing)."""
        tablename = self.escape(tablename or self.__tablename__)
        sql_query = "DELETE FROM %s" % tablename
        if where:
            sql_query += " WHERE %s" % where
        logger.debug("sql: %s", sql_query)

        return self._execute(sql_query, where_values)


if __name__ == "__main__":
    # Minimal self-test against an in-memory sqlite database.
    import sqlite3

    class DB(BaseDB):
        __tablename__ = "test"
        placeholder = "?"

        def __init__(self):
            self.conn = sqlite3.connect(":memory:")
            cursor = self.conn.cursor()
            cursor.execute(
                '''CREATE TABLE `%s` (id INTEGER PRIMARY KEY AUTOINCREMENT, name, age)'''
                % self.__tablename__
            )

        @property
        def dbcur(self):
            return self.conn.cursor()

    db = DB()
    assert db._insert(db.__tablename__, name="binux", age=23) == 1
    # next(gen) works on both python 2 and 3, gen.next() only on python 2
    assert next(db._select(db.__tablename__, "name, age")) == ("binux", 23)
    assert next(db._select2dic(db.__tablename__, "name, age"))["name"] == "binux"
    assert next(db._select2dic(db.__tablename__, "name, age"))["age"] == 23
    db._replace(db.__tablename__, id=1, age=24)
    assert next(db._select(db.__tablename__, "name, age")) == (None, 24)
    db._update(db.__tablename__, "id = 1", age=16)
    assert next(db._select(db.__tablename__, "name, age")) == (None, 16)
    db._delete(db.__tablename__, "id = 1")
    assert [row for row in db._select(db.__tablename__)] == []
class SplitTableMixin(object):
    """Shared CouchDB plumbing for databases that keep one CouchDB database
    per project.

    The host class is expected to set ``username``, ``password``,
    ``base_url``, ``database``, ``collection_prefix`` and ``index`` before
    the methods that read them are used.
    """

    # seconds the cached project list stays valid
    UPDATE_PROJECTS_TIME = 10 * 60

    def __init__(self):
        self.session = requests.session()
        if self.username:
            self.session.auth = HTTPBasicAuth(self.username, self.password)
        self.session.headers.update({'Content-Type': 'application/json'})

    def _collection_name(self, project):
        """Return the project name, prefixed with ``collection_prefix`` if set."""
        if self.collection_prefix:
            return "%s_%s" % (self.collection_prefix, project)
        else:
            return project

    @property
    def projects(self):
        """Set of known project names, refreshed when the cache is stale."""
        if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        """Rebuild the project set from the server's ``_all_dbs`` listing."""
        self._last_update_projects = time.time()
        found = set()
        self.projects = found
        # Project databases are named "<database>_<collection name>", so the
        # same naming helper is used to build the prefix to look for.
        db_prefix = self.database + "_" + self._collection_name("")

        url = self.base_url + "_all_dbs"
        res = self.session.get(url, json={}).json()
        for each in res:
            if each.startswith('_'):
                continue
            # Require the full prefix and a non-empty remainder: the base
            # database itself and unrelated databases that merely share
            # leading characters are not projects.
            if each.startswith(db_prefix) and len(each) > len(db_prefix):
                found.add(each[len(db_prefix):])

    def create_database(self, name):
        """Create database ``name`` and return the decoded server response.

        :raises Exception: if the server answers with an ``unauthorized`` error
        """
        url = self.base_url + name
        res = self.session.put(url).json()
        if 'error' in res and res['error'] == 'unauthorized':
            # the password is deliberately kept out of the message so it
            # does not end up in logs and tracebacks
            raise Exception("Supplied credentials are incorrect. Reason: {} for User: {}".format(
                res['reason'], self.username))
        return res

    def get_doc(self, db_name, doc_id):
        """Return document ``doc_id`` of ``db_name`` or None if not found."""
        url = self.base_url + db_name + "/" + doc_id
        res = self.session.get(url).json()
        if "error" in res and res["error"] == "not_found":
            return None
        return res

    def get_docs(self, db_name, selector):
        """POST ``selector`` to ``_find`` and return the matching documents.

        ``use_index`` is added to a copy of ``selector`` so the caller's
        dict is left untouched. Returns an empty list when the server
        reports ``not_found``.
        """
        url = self.base_url + db_name + "/_find"
        query = dict(selector)
        query['use_index'] = self.index
        res = self.session.post(url, json=query).json()
        if 'error' in res and res['error'] == 'not_found':
            return []
        return res['docs']

    def get_all_docs(self, db_name):
        """Return the documents matched by an empty selector."""
        return self.get_docs(db_name, {"selector": {}})

    def insert_doc(self, db_name, doc_id, doc):
        """PUT ``doc`` as document ``doc_id`` and return the server response."""
        url = self.base_url + db_name + "/" + doc_id
        return self.session.put(url, json=doc).json()

    def update_doc(self, db_name, doc_id, new_doc):
        """Merge ``new_doc`` into the stored document, inserting if missing."""
        doc = self.get_doc(db_name, doc_id)
        if doc is None:
            return self.insert_doc(db_name, doc_id, new_doc)
        # the fetched document is reused as the base of the write so the
        # fields not present in new_doc are kept
        doc.update(new_doc)
        url = self.base_url + db_name + "/" + doc_id
        return self.session.put(url, json=doc).json()

    def delete(self, url):
        """Send DELETE to ``url`` and return the decoded response."""
        return self.session.delete(url).json()
incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password)) # create index payload = { 'index': { 'fields': ['name'] }, 'name': self.__collection_name__ + "_" + database } res = self.session.post(self.url + "_index", json=payload).json() self.index = res['id'] def _default_fields(self, each): if each is None: return each each.setdefault('group', None) each.setdefault('status', 'TODO') each.setdefault('script', '') each.setdefault('comments', None) each.setdefault('rate', 0) each.setdefault('burst', 0) each.setdefault('updatetime', 0) return each def insert(self, name, obj={}): url = self.url + name obj = dict(obj) obj['name'] = name obj['updatetime'] = time.time() res = self.session.put(url, json=obj).json() return res def update(self, name, obj={}, **kwargs): # object contains the fields to update and their new values update = self.get(name) # update will contain _rev if update is None: return None obj = dict(obj) obj['updatetime'] = time.time() obj.update(kwargs) for key in obj: update[key] = obj[key] return self.insert(name, update) def get_all(self, fields=None): if fields is None: fields = [] payload = { "selector": {}, "fields": fields, "use_index": self.index } url = self.url + "_find" res = self.session.post(url, json=payload).json() for doc in res['docs']: yield self._default_fields(doc) def get(self, name, fields=None): if fields is None: fields = [] payload = { "selector": {"name": name}, "fields": fields, "limit": 1, "use_index": self.index } url = self.url + "_find" res = self.session.post(url, json=payload).json() if len(res['docs']) == 0: return None return self._default_fields(res['docs'][0]) def check_update(self, timestamp, fields=None): if fields is None: fields = [] for project in self.get_all(fields=('updatetime', 'name')): if project['updatetime'] > timestamp: project = self.get(project['name'], fields) yield self._default_fields(project) def drop(self, name): doc = self.get(name) payload = {"rev": 
class ResultDB(SplitTableMixin, BaseResultDB):
    """CouchDB result storage that keeps one CouchDB database per project."""

    collection_prefix = ''

    def __init__(self, url, database='resultdb', username=None, password=None):
        self.username = username
        self.password = password
        self.base_url = url
        self.url = url + database + "/"
        self.database = database

        super().__init__()
        self.create_database(database)
        # set by _create_project once a project index has been created
        self.index = None

    def _get_collection_name(self, project):
        """Return the CouchDB database name used for ``project``."""
        return self.database + "_" + self._collection_name(project)

    def _create_project(self, project):
        """Create the project database and its ``taskid`` index."""
        collection_name = self._get_collection_name(project)
        self.create_database(collection_name)
        # create index
        payload = {
            'index': {
                'fields': ['taskid']
            },
            'name': collection_name
        }

        res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
        self.index = res['id']
        self._list_project()

    def save(self, project, taskid, url, result):
        """Store ``result`` for ``taskid``, creating the project if needed."""
        if project not in self.projects:
            self._create_project(project)
        collection_name = self._get_collection_name(project)
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.update_doc(collection_name, taskid, obj)

    def select(self, project, fields=None, offset=0, limit=0):
        """Yield stored results of ``project``.

        Yields nothing for an unknown project. ``limit`` is only sent to
        the server when it is non-zero.
        """
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        offset = offset or 0
        limit = limit or 0
        collection_name = self._get_collection_name(project)
        if fields is None:
            fields = []
        sel = {
            'selector': {},
            'fields': fields,
            'skip': offset
        }
        if limit:
            sel['limit'] = limit
        for result in self.get_docs(collection_name, sel):
            yield result

    def count(self, project):
        """Return the number of results fetched for ``project``.

        An unknown project has no results, so 0 is returned rather than
        None and callers always get an int.
        """
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return 0
        collection_name = self._get_collection_name(project)
        # NOTE(review): this counts the documents returned by one _find
        # request; if the server caps the page size the count is capped
        # too - confirm against the CouchDB _find documentation.
        return len(self.get_all_docs(collection_name))

    def get(self, project, taskid, fields=None):
        """Return the result of ``taskid`` or None if it does not exist."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._get_collection_name(project)
        if fields is None:
            fields = []
        sel = {
            'selector': {'taskid': taskid},
            'fields': fields
        }
        ret = self.get_docs(collection_name, sel)
        if not ret:
            return None
        return ret[0]

    def drop_database(self):
        """Delete the base result database."""
        return self.delete(self.url)

    def drop(self, project):
        """Delete the database that holds the results of ``project``."""
        # drop the project
        collection_name = self._get_collection_name(project)
        url = self.base_url + collection_name
        return self.delete(url)
self._list_project() if fields is None: fields = [] if project: projects = [project, ] else: projects = self.projects for project in projects: collection_name = self._get_collection_name(project) for task in self.get_docs(collection_name, {"selector": {"status": status}, "fields": fields}): yield task def get_task(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return if fields is None: fields = [] collection_name = self._get_collection_name(project) ret = self.get_docs(collection_name, {"selector": {"taskid": taskid}, "fields": fields}) if len(ret) == 0: return None return ret[0] def status_count(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return {} collection_name = self._get_collection_name(project) def _count_for_status(collection_name, status): total = len(self.get_docs(collection_name, {"selector": {'status': status}})) return {'total': total, "_id": status} if total else None c = collection_name ret = filter(lambda x: x,map(lambda s: _count_for_status(c, s), [self.ACTIVE, self.SUCCESS, self.FAILED])) result = {} if isinstance(ret, dict): ret = ret.get('result', []) for each in ret: result[each['_id']] = each['total'] return result def insert(self, project, taskid, obj={}): if project not in self.projects: self._create_project(project) obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() return self.update(project, taskid, obj=obj) def update(self, project, taskid, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() collection_name = self._get_collection_name(project) return self.update_doc(collection_name, taskid, obj) def drop_database(self): return self.delete(self.url) def drop(self, project): collection_name = self._get_collection_name(project) url = self.base_url + collection_name return self.delete(url) 
================================================ FILE: pyspider/database/elasticsearch/__init__.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2016-01-17 18:31:58 ================================================ FILE: pyspider/database/elasticsearch/projectdb.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2016-01-17 18:32:33 import time import elasticsearch.helpers from elasticsearch import Elasticsearch from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB class ProjectDB(BaseProjectDB): __type__ = 'project' def __init__(self, hosts, index='pyspider'): self.index = index self.es = Elasticsearch(hosts=hosts) self.es.indices.create(index=self.index, ignore=400) if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ "_all": {"enabled": False}, "properties": { "updatetime": {"type": "double"} } }) def insert(self, name, obj={}): obj = dict(obj) obj['name'] = name obj['updatetime'] = time.time() obj.setdefault('group', '') obj.setdefault('status', 'TODO') obj.setdefault('script', '') obj.setdefault('comments', '') obj.setdefault('rate', 0) obj.setdefault('burst', 0) return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, refresh=True) def update(self, name, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() return self.es.update(index=self.index, doc_type=self.__type__, body={'doc': obj}, id=name, refresh=True, ignore=404) def get_all(self, fields=None): for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, query={'query': {"match_all": {}}}, 
def get(self, name, fields=None):
    """Fetch the project document stored under ``name``.

    :param fields: optional list of source fields to include
    :return: the document's ``_source``, or None when the response has no
        ``_source`` key (404 responses are ignored rather than raised)
    """
    request = {
        'index': self.index,
        'doc_type': self.__type__,
        'id': name,
        '_source_include': fields or [],
        'ignore': 404,
    }
    response = self.es.get(**request)
    return response.get('_source', None)
def select(self, project, fields=None, offset=0, limit=0):
    """Yield the ``_source`` of the results stored for ``project``.

    With a non-zero ``limit`` a single search request is issued; otherwise
    the scan helper is used to iterate over the matching documents.
    """
    start = offset or 0
    page_size = limit or 0
    query = {'query': {'term': {'project': project}}}
    source_filter = fields or []

    if page_size:
        response = self.es.search(
            index=self.index,
            doc_type=self.__type__,
            body=query,
            _source_include=source_filter,
            from_=start,
            size=page_size,
            sort="updatetime:desc",
        )
        records = response.get('hits', {}).get('hits', [])
    else:
        records = elasticsearch.helpers.scan(
            self.es,
            index=self.index,
            doc_type=self.__type__,
            query=query,
            _source_include=source_filter,
            from_=start,
            sort="updatetime:desc",
        )

    for record in records:
        yield record['_source']
def _parse(self, data):
    """Decode the JSON-encoded task fields of ``data`` in place.

    Falsy ``data`` is returned untouched. For each of ``schedule``,
    ``fetch``, ``process`` and ``track`` that is present, a non-empty value
    is parsed as JSON and an empty one becomes ``{}``.
    """
    if not data:
        return data
    for field in ('schedule', 'fetch', 'process', 'track'):
        if field not in data:
            continue
        raw = data[field]
        data[field] = json.loads(raw) if raw else {}
    return data
def status_count(self, project):
    """Return ``{status: number of tasks}`` for ``project``.

    The index is refreshed first, then the counts are read from a terms
    aggregation on the ``status`` field.
    """
    self.refresh()
    request_body = {
        "query": {'term': {'project': project}},
        "aggs": {"status": {"terms": {"field": "status"}}},
    }
    response = self.es.search(
        index=self.index,
        doc_type=self.__type__,
        body=request_body,
        _source=False,
    )
    buckets = response['aggregations']['status'].get('buckets', [])
    return {bucket['key']: bucket['doc_count'] for bucket in buckets}
def _build_project(self, filename):
    """Build a project record from the script stored in ``filename``.

    The project name is the file name without its extension. ``rate`` and
    ``burst`` are taken from the first match of ``rate_re`` / ``burst_re``
    in the script and default to 1 and 3 when there is no match.

    :return: the project dict, or None when the file cannot be read (the
        error is logged)
    """
    # Only the file-system access can fail here, so keep the try minimal.
    # IOError is listed explicitly: on Python 2 it is what open() raises and
    # it is not a subclass of OSError there.
    try:
        with open(filename) as fp:
            script = fp.read()
        updatetime = os.path.getmtime(filename)
    except (IOError, OSError) as e:
        logging.error('loading project script error: %s', e)
        return None

    rate_match = self.rate_re.search(script)
    burst_match = self.burst_re.search(script)
    return {
        'name': os.path.splitext(os.path.basename(filename))[0],
        'group': None,
        'status': 'RUNNING',
        'script': script,
        'comments': None,
        'rate': float(rate_match.group(1)) if rate_match else 1,
        'burst': float(burst_match.group(1)) if burst_match else 3,
        'updatetime': updatetime,
    }
def get_all(self, fields=None):
    """Yield every project document with ``_id`` removed and defaults applied.

    :param fields: passed through to ``collection.find`` as its second
        argument
    """
    for document in self.collection.find({}, fields):
        if document:
            # drop the storage-level identifier before handing it out
            document.pop('_id', None)
        yield self._default_fields(document)
def check_update(self, timestamp, fields=None):
    """Yield the projects whose ``updatetime`` is greater than ``timestamp``.

    A first pass reads only ``updatetime`` and ``name`` of every project;
    each project that changed is then fetched again with the requested
    ``fields``.
    """
    for summary in self.get_all(fields=('updatetime', 'name')):
        if summary['updatetime'] <= timestamp:
            continue
        project = self.get(summary['name'], fields)
        # The project may have been removed between the two queries; never
        # hand a None to the consumer. ``get`` already applies the default
        # fields, so no further post-processing is needed.
        if project is not None:
            yield project
def count(self, project):
    """Return the number of results stored for ``project``.

    The project list is refreshed once when ``project`` is not known yet.
    An unknown project has no results, so 0 is returned for it instead of
    None; callers can then always treat the value as a number.
    """
    if project not in self.projects:
        self._list_project()
    if project not in self.projects:
        return 0
    collection_name = self._collection_name(project)
    return self.database[collection_name].count()
def _parse(self, data):
    """Turn a stored task document into a task dict, in place.

    ``_id`` is removed, and each of ``schedule``, ``fetch``, ``process``
    and ``track`` that is present is decoded from JSON (an empty value
    becomes ``{}``).

    :param data: document as read from the collection
    :return: the same dict, modified
    """
    data.pop('_id', None)
    for each in ('schedule', 'fetch', 'process', 'track'):
        if each not in data:
            continue
        value = data[each]
        if not value:
            data[each] = {}
            continue
        # Binary values must be decoded to text explicitly: str() on a
        # bytearray yields its repr on Python 3, which is not valid JSON.
        if isinstance(value, (bytes, bytearray)):
            value = value.decode('utf8')
        # json.loads() no longer accepts an ``encoding`` argument on
        # Python 3.9+; UTF-8 was already the default where it existed.
        data[each] = json.loads(value)
    return data
def update(self, project, taskid, obj={}, **kwargs):
    """Upsert the fields of ``obj`` and ``kwargs`` into task ``taskid``.

    The fields are merged into a fresh dict (keyword arguments win),
    stamped with the current ``updatetime`` and written with ``$set``.
    """
    changes = dict(obj, **kwargs)
    changes['updatetime'] = time.time()
    collection = self.database[self._collection_name(project)]
    return collection.update(
        {'taskid': taskid},
        {"$set": self._stringify(changes)},
        upsert=True
    )
def _tablename(self, project):
    """Return the table name used for ``project``.

    When ``__tablename__`` is non-empty it is used as a prefix joined with
    an underscore; otherwise the project name itself is the table name.
    """
    prefix = self.__tablename__
    return '%s_%s' % (prefix, project) if prefix else project
def check_update(self, timestamp, fields=None):
    """Return the projects whose ``updatetime`` is at or after ``timestamp``.

    The timestamp is bound as a query parameter, like the other lookups of
    this class, instead of being formatted into the SQL text.

    :param fields: columns to select, passed as ``what``
    """
    where = "`updatetime` >= %s" % self.placeholder
    return self._select2dic(what=fields, where=where, where_values=(timestamp, ))
def count(self, project):
    """Return the number of rows in the result table of ``project``.

    The project list is refreshed once when ``project`` is not known yet;
    0 is returned when it is still unknown afterwards.
    """
    if project not in self.projects:
        self._list_project()
    if project not in self.projects:
        return 0
    query = "SELECT count(1) FROM %s" % self.escape(self._tablename(project))
    cursor = self._execute(query)
    # the query yields one single-column row; return its value
    for (total,) in cursor:
        return total
def __init__(self, host='localhost', port=3306, database='taskdb',
             user='root', passwd=None):
    """Connect to MySQL and select ``database``, creating it when missing.

    The connection is opened with autocommit enabled, and the project list
    is loaded once the database has been selected.
    """
    self.database_name = database
    self.conn = mysql.connector.connect(
        user=user, password=passwd, host=host, port=port, autocommit=True)
    existing_databases = [row[0] for row in self._execute('show databases')]
    if database not in existing_databases:
        self._execute('CREATE DATABASE %s' % self.escape(database))
    self.conn.database = database
    self._list_project()
def load_tasks(self, status, project=None, fields=None):
    """Yield the parsed tasks that are in ``status``.

    :param status: status value to select
    :param project: restrict the lookup to this project; all known
        projects are scanned when it is empty
    :param fields: columns to select, passed as ``what``

    Nothing is yielded for a project that does not exist. As in the other
    lookups of this class, the project list is refreshed once before a
    project is considered unknown, so a table created by another process
    is picked up.
    """
    if project:
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        projects = [project, ]
    else:
        projects = self.projects

    where = "`status` = %s" % self.placeholder
    for project in projects:
        tablename = self._tablename(project)
        for each in self._select2dic(
            tablename, what=fields, where=where, where_values=(status, )
        ):
            yield self._parse(each)
def __init__(self, host='localhost', port=6379, db=0):
    """Create the Redis client and probe whether ``scan`` can be used.

    ``scan_available`` records the outcome of the probe; a failing probe is
    logged at debug level and is not an error.
    """
    self.redis = redis.StrictRedis(host=host, port=port, db=db)
    try:
        self.redis.scan(count=1)
    except Exception as e:
        logging.debug("redis_scan disabled: %r", e)
        self.scan_available = False
    else:
        self.scan_available = True
in data: data[each] = json.dumps(data[each]) return data @property def projects(self): if time.time() - getattr(self, '_last_update_projects', 0) \ > self.UPDATE_PROJECTS_TIME: self._projects = set(utils.text(x) for x in self.redis.smembers( self.__prefix__ + 'projects')) return self._projects def load_tasks(self, status, project=None, fields=None): if project is None: project = self.projects elif not isinstance(project, list): project = [project, ] if self.scan_available: scan_method = self.redis.sscan_iter else: scan_method = self.redis.smembers if fields: def get_method(key): obj = self.redis.hmget(key, fields) if all(x is None for x in obj): return None return dict(zip(fields, obj)) else: get_method = self.redis.hgetall for p in project: status_key = self._gen_status_key(p, status) for taskid in scan_method(status_key): obj = get_method(self._gen_key(p, utils.text(taskid))) if not obj: #self.redis.srem(status_key, taskid) continue else: yield self._parse(obj) def get_task(self, project, taskid, fields=None): if fields: obj = self.redis.hmget(self._gen_key(project, taskid), fields) if all(x is None for x in obj): return None obj = dict(zip(fields, obj)) else: obj = self.redis.hgetall(self._gen_key(project, taskid)) if not obj: return None return self._parse(obj) def status_count(self, project): ''' return a dict ''' pipe = self.redis.pipeline(transaction=False) for status in range(1, 5): pipe.scard(self._gen_status_key(project, status)) ret = pipe.execute() result = {} for status, count in enumerate(ret): if count > 0: result[status + 1] = count return result def insert(self, project, taskid, obj={}): obj = dict(obj) obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() obj.setdefault('status', self.ACTIVE) task_key = self._gen_key(project, taskid) pipe = self.redis.pipeline(transaction=False) if project not in self.projects: pipe.sadd(self.__prefix__ + 'projects', project) pipe.hmset(task_key, self._stringify(obj)) 
pipe.sadd(self._gen_status_key(project, obj['status']), taskid) pipe.execute() def update(self, project, taskid, obj={}, **kwargs): obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() pipe = self.redis.pipeline(transaction=False) pipe.hmset(self._gen_key(project, taskid), self._stringify(obj)) if 'status' in obj: for status in range(1, 5): if status == obj['status']: pipe.sadd(self._gen_status_key(project, status), taskid) else: pipe.srem(self._gen_status_key(project, status), taskid) pipe.execute() def drop(self, project): self.redis.srem(self.__prefix__ + 'projects', project) if self.scan_available: scan_method = self.redis.scan_iter else: scan_method = self.redis.keys for each in itertools.tee(scan_method("%s%s_*" % (self.__prefix__, project)), 100): each = list(each) if each: self.redis.delete(*each) ================================================ FILE: pyspider/database/sqlalchemy/__init__.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-04 20:11:04 ================================================ FILE: pyspider/database/sqlalchemy/projectdb.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-04 23:25:10 import six import time import sqlalchemy.exc from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text from sqlalchemy.engine.url import make_url from pyspider.libs import utils from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB from .sqlalchemybase import result2dict class ProjectDB(BaseProjectDB): __tablename__ = 'projectdb' def __init__(self, url): self.table = Table(self.__tablename__, MetaData(), Column('name', String(64), primary_key=True), Column('group', String(64)), Column('status', 
def get(self, name, fields=None):
    """Fetch the project called ``name``.

    ``fields`` optionally restricts the selected columns.  Returns the parsed
    row as a dict, or None when no such project exists.
    """
    if fields:
        columns = [getattr(self.table.c, field, field) for field in fields]
    else:
        columns = self.table.c

    query = self.table.select()
    query = query.where(self.table.c.name == name)
    query = query.limit(1)
    query = query.with_only_columns(columns)

    # At most one row comes back; hand out the first one, if any.
    for row in self.engine.execute(query):
        return self._parse(result2dict(columns, row))
    return None
class ResultDB(SplitTableMixin, BaseResultDB):
    """Result storage on SQLAlchemy, one table per project.

    A single ``Table`` object is shared by all projects; its ``name`` is
    switched to the project's table name right before each statement.
    """
    __tablename__ = ''

    def __init__(self, url):
        self.table = Table('__tablename__', MetaData(),
                           Column('taskid', String(64), primary_key=True, nullable=False),
                           Column('url', String(1024)),
                           Column('result', Text()),
                           Column('updatetime', Float(32)),
                           mysql_engine='InnoDB',
                           mysql_charset='utf8'
                           )

        self.url = make_url(url)
        if self.url.database:
            # Best effort: connect without a database selected and try to
            # create it; any SQLAlchemy error (e.g. it already exists) is
            # deliberately ignored.
            database = self.url.database
            self.url.database = None
            try:
                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)
                conn = engine.connect()
                conn.execute("commit")
                conn.execute("CREATE DATABASE %s" % database)
            except sqlalchemy.exc.SQLAlchemyError:
                pass
            self.url.database = database
        self.engine = create_engine(url, convert_unicode=True,
                                    pool_recycle=3600)

        self._list_project()

    def _create_project(self, project):
        """Create the result table of ``project`` unless it already exists."""
        assert re.match(r'^\w+$', project) is not None
        if project in self.projects:
            return
        self.table.name = self._tablename(project)
        # checkfirst=True (as the sibling TaskDB does): the cached project
        # list can be stale, in which case the table may already exist and a
        # plain CREATE TABLE would raise.
        self.table.create(self.engine, checkfirst=True)

    @staticmethod
    def _parse(data):
        """Decode bytes values to text and the JSON ``result`` column."""
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.binary_type):
                data[key] = utils.text(value)
        if 'result' in data:
            if data['result']:
                data['result'] = json.loads(data['result'])
            else:
                data['result'] = {}
        return data

    @staticmethod
    def _stringify(data):
        """JSON-encode the ``result`` column in place (empty -> ``{}``)."""
        if 'result' in data:
            if data['result']:
                data['result'] = json.dumps(data['result'])
            else:
                data['result'] = json.dumps({})
        return data

    def save(self, project, taskid, url, result):
        """Insert the result of a task, or update it when one is stored."""
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        self.table.name = self._tablename(project)
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        if self.get(project, taskid, ('taskid', )):
            # Row exists: update everything except the primary key.
            del obj['taskid']
            return self.engine.execute(self.table.update()
                                       .where(self.table.c.taskid == taskid)
                                       .values(**self._stringify(obj)))
        else:
            return self.engine.execute(self.table.insert()
                                       .values(**self._stringify(obj)))

    def select(self, project, fields=None, offset=0, limit=None):
        """Yield results of ``project``, most recently updated first."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        self.table.name = self._tablename(project)

        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns=columns)
                                        .order_by(self.table.c.updatetime.desc())
                                        .offset(offset).limit(limit)
                                        .execution_options(autocommit=True)):
            yield self._parse(result2dict(columns, task))

    def count(self, project):
        """Return the number of stored results (0 for unknown projects)."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return 0
        self.table.name = self._tablename(project)

        for count, in self.engine.execute(self.table.count()):
            return count

    def get(self, project, taskid, fields=None):
        """Return the result of one task as a dict, or None when absent."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        self.table.name = self._tablename(project)

        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns=columns)
                                        .where(self.table.c.taskid == taskid)
                                        .limit(1)):
            return self._parse(result2dict(columns, task))
class SplitTableMixin(object):
    """Helpers for databases that keep one table per project.

    The host class provides ``__tablename__`` (table-name prefix, may be
    empty), ``engine`` and ``table``.
    """
    # Seconds the cached project set is trusted before it is re-listed.
    UPDATE_PROJECTS_TIME = 10 * 60

    def _tablename(self, project):
        """Return the table name used for ``project``."""
        if self.__tablename__:
            return '%s_%s' % (self.__tablename__, project)
        else:
            return project

    @property
    def projects(self):
        """Cached set of project names, re-listed when older than the TTL."""
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        """Rebuild the project set from the table names in the database."""
        if self.__tablename__:
            prefix = '%s_' % self.__tablename__
        else:
            prefix = ''

        # Collect into a local set first and publish it only after the query
        # succeeded, so a failing listing neither empties the cache nor marks
        # it as freshly updated.
        found = set(
            name[len(prefix):]
            for name in self.engine.table_names()
            if name.startswith(prefix)
        )
        self._last_update_projects = time.time()
        self.projects = found

    def drop(self, project):
        """Drop the table of ``project``; unknown projects are ignored."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        self.table.name = self._tablename(project)
        self.table.drop(self.engine)
        self._list_project()
def load_tasks(self, status, project=None, fields=None):
    """Yield every task whose status equals ``status``.

    ``project`` limits the scan to one project; None scans all known
    projects.  ``fields`` optionally restricts the selected columns.
    Nothing is yielded for a project that has no table.
    """
    # Refresh the cached project list before giving up on an unknown
    # project, as get_task/status_count/update already do; otherwise a table
    # created by another process stays invisible until the cache expires.
    if project and project not in self.projects:
        self._list_project()
    if project and project not in self.projects:
        return

    if project:
        projects = [project, ]
    else:
        # Snapshot: the cached set may be replaced while this generator is
        # suspended between yields.
        projects = list(self.projects)

    columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
    for project in projects:
        self.table.name = self._tablename(project)
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns)
                                        .where(self.table.c.status == status)):
            yield self._parse(result2dict(columns, task))
class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB):
    """Project storage in a single SQLite table named ``projectdb``."""
    __tablename__ = 'projectdb'
    placeholder = '?'

    def __init__(self, path):
        self.path = path
        self.last_pid = 0
        self.conn = None
        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (
                name PRIMARY KEY,
                `group`,
                status, script, comments,
                rate, burst, updatetime
                )''' % self.__tablename__)

    def insert(self, name, obj={}):
        """Insert project ``name``; ``obj`` is copied, never modified."""
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        return self._insert(**obj)

    def update(self, name, obj={}, **kwargs):
        """Update project ``name`` and return the affected row count."""
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj)
        return ret.rowcount

    def get_all(self, fields=None):
        """Return all projects, optionally restricted to ``fields``."""
        return self._select2dic(what=fields)

    def get(self, name, fields=None):
        """Return project ``name`` as a dict, or None when it is unknown."""
        where = "`name` = %s" % self.placeholder
        for each in self._select2dic(what=fields, where=where, where_values=(name, )):
            return each
        return None

    def check_update(self, timestamp, fields=None):
        """Return the projects whose ``updatetime`` is >= ``timestamp``."""
        # Bind the timestamp instead of formatting it with "%f": the format
        # rounds to six decimals (possibly upwards, which would exclude a row
        # updated exactly at ``timestamp``) and this keeps the query
        # parameterized like get()/update()/drop().
        where = "`updatetime` >= %s" % self.placeholder
        return self._select2dic(what=fields, where=where, where_values=(timestamp, ))

    def drop(self, name):
        """Delete project ``name``."""
        where = "`name` = %s" % self.placeholder
        return self._delete(where=where, where_values=(name, ))
.sqlitebase import SQLiteMixin, SplitTableMixin from pyspider.database.base.resultdb import ResultDB as BaseResultDB from pyspider.database.basedb import BaseDB class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): __tablename__ = 'resultdb' placeholder = '?' def __init__(self, path): self.path = path self.last_pid = 0 self.conn = None self._list_project() def _create_project(self, project): assert re.match(r'^\w+$', project) is not None tablename = self._tablename(project) self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( taskid PRIMARY KEY, url, result, updatetime )''' % tablename) def _parse(self, data): if 'result' in data: data['result'] = json.loads(data['result']) return data def _stringify(self, data): if 'result' in data: data['result'] = json.dumps(data['result']) return data def save(self, project, taskid, url, result): tablename = self._tablename(project) if project not in self.projects: self._create_project(project) self._list_project() obj = { 'taskid': taskid, 'url': url, 'result': result, 'updatetime': time.time(), } return self._replace(tablename, **self._stringify(obj)) def select(self, project, fields=None, offset=0, limit=None): if project not in self.projects: self._list_project() if project not in self.projects: return tablename = self._tablename(project) for task in self._select2dic(tablename, what=fields, order='updatetime DESC', offset=offset, limit=limit): yield self._parse(task) def count(self, project): if project not in self.projects: self._list_project() if project not in self.projects: return 0 tablename = self._tablename(project) for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): return count def get(self, project, taskid, fields=None): if project not in self.projects: self._list_project() if project not in self.projects: return tablename = self._tablename(project) where = "`taskid` = %s" % self.placeholder for task in self._select2dic(tablename, what=fields, where=where, 
class SplitTableMixin(object):
    """Helpers for SQLite databases that keep one table per project.

    The host class provides ``__tablename__`` (table-name prefix, may be
    empty) plus the ``_select``, ``_execute`` and ``escape`` helpers.
    """
    # Seconds the cached project set is trusted before it is re-listed.
    UPDATE_PROJECTS_TIME = 10 * 60

    def _tablename(self, project):
        """Return the table name used for ``project``."""
        if self.__tablename__:
            return '%s_%s' % (self.__tablename__, project)
        else:
            return project

    @property
    def projects(self):
        """Cached set of project names, re-listed when older than the TTL."""
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        """Rebuild the project set from the tables in ``sqlite_master``."""
        if self.__tablename__:
            prefix = '%s_' % self.__tablename__
        else:
            prefix = ''

        # 'table' is single-quoted: SQL string literals use single quotes and
        # SQLite only accepts a double-quoted string as a legacy fallback
        # that can be disabled at build time.
        rows = self._select('sqlite_master', what='name',
                            where="type = 'table'")
        # Publish the new set only after the query succeeded so a failure
        # does not leave an empty cache that is marked as fresh.
        found = set(
            name[len(prefix):]
            for name, in rows
            if name.startswith(prefix)
        )
        self._last_update_projects = time.time()
        self.projects = found

    def drop(self, project):
        """Drop the table of ``project``; unknown projects are ignored."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        self._execute("DROP TABLE %s" % self.escape(tablename))
        self._list_project()
class TaskDB(SQLiteMixin, SplitTableMixin, BaseTaskDB, BaseDB):
    """Task storage in SQLite, one ``taskdb_<project>`` table per project."""
    __tablename__ = 'taskdb'
    placeholder = '?'

    def __init__(self, path):
        self.path = path
        self.last_pid = 0
        self.conn = None
        self._list_project()

    def _create_project(self, project):
        """Create the task table and its status index if they are missing."""
        assert re.match(r'^\w+$', project) is not None
        tablename = self._tablename(project)
        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (
                taskid PRIMARY KEY,
                project,
                url, status,
                schedule, fetch, process, track,
                lastcrawltime, updatetime
                )''' % tablename)
        # IF NOT EXISTS like the table above: with a stale project cache the
        # table (and index) may already exist and a plain CREATE INDEX fails.
        self._execute(
            '''CREATE INDEX IF NOT EXISTS `status_%s_index` ON %s (status)'''
            % (tablename, self.escape(tablename))
        )

    def _parse(self, data):
        """Decode the JSON columns of a row dict in place (empty -> {})."""
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        """JSON-encode the structured columns of ``data`` in place."""
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    def load_tasks(self, status, project=None, fields=None):
        """Yield every task in ``status``, for one project or all of them."""
        # Refresh the cached project list before giving up on an unknown
        # project, as the other readers of this class do.
        if project and project not in self.projects:
            self._list_project()
        if project and project not in self.projects:
            return
        where = "status = %d" % status

        if project:
            projects = [project, ]
        else:
            # Snapshot: the cached set may be replaced while this generator
            # is suspended between yields.
            projects = list(self.projects)

        for project in projects:
            tablename = self._tablename(project)
            for each in self._select2dic(tablename, what=fields, where=where):
                yield self._parse(each)

    def get_task(self, project, taskid, fields=None):
        """Return one task as a dict, or None when it does not exist."""
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return None
        where = "`taskid` = %s" % self.placeholder
        tablename = self._tablename(project)
        for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )):
            return self._parse(each)
        return None

    def status_count(self, project):
        '''
        return a dict
        '''
        result = dict()
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return result
        tablename = self._tablename(project)
        for status, count in self._execute("SELECT `status`, count(1) FROM %s GROUP BY `status`" %
                                           self.escape(tablename)):
            result[status] = count
        return result

    def insert(self, project, taskid, obj={}):
        """Insert a task, creating the project table on first use."""
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        tablename = self._tablename(project)
        return self._insert(tablename, **self._stringify(obj))

    def update(self, project, taskid, obj={}, **kwargs):
        """Update a task.

        Raises:
            LookupError: the project has no table, even after re-listing.
        """
        # Re-list before failing so a project created by another process is
        # not rejected because of a stale cache.
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            raise LookupError
        tablename = self._tablename(project)
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self._update(
            tablename,
            where="`taskid` = %s" % self.placeholder,
            where_values=(taskid, ),
            **self._stringify(obj)
        )
pyspider/fetcher/phantomjs_fetcher.js ================================================ // vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2014-10-29 22:12:14 var port, server, service, wait_before_end = 1000, system = require('system'), webpage = require('webpage'); if (system.args.length !== 2) { console.log('Usage: simpleserver.js '); phantom.exit(1); } else { port = system.args[1]; server = require('webserver').create(); console.debug = function(){}; service = server.listen(port, { 'keepAlive': false }, function (request, response) { phantom.clearCookies(); //console.debug(JSON.stringify(request, null, 4)); // check method if (request.method == 'GET') { body = "method not allowed!"; response.statusCode = 403; response.headers = { 'Cache': 'no-cache', 'Content-Length': body.length }; response.write(body); response.closeGracefully(); return; } var first_response = null, finished = false, page_loaded = false, start_time = Date.now(), end_time = null, script_executed = false, script_result = null; var fetch = JSON.parse(request.postRaw); console.debug(JSON.stringify(fetch, null, 2)); // create and set page var page = webpage.create(); if (fetch.proxy) { if (fetch.proxy.indexOf('://') == -1){ fetch.proxy = 'http://' + fetch.proxy } page.setProxy(fetch.proxy); } page.onConsoleMessage = function(msg) { console.log('console: ' + msg); }; page.viewportSize = { width: fetch.js_viewport_width || 1024, height: fetch.js_viewport_height || 768*3 } if (fetch.headers) { fetch.headers['Accept-Encoding'] = undefined; fetch.headers['Connection'] = undefined; fetch.headers['Content-Length'] = undefined; } if (fetch.headers && fetch.headers['User-Agent']) { page.settings.userAgent = fetch.headers['User-Agent']; } // this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903 page.settings.loadImages = fetch.load_images === undefined ? true : fetch.load_images; page.settings.resourceTimeout = fetch.timeout ? 
fetch.timeout * 1000 : 20*1000; if (fetch.headers) { page.customHeaders = fetch.headers; } // add callbacks page.onInitialized = function() { if (!script_executed && fetch.js_script && fetch.js_run_at === "document-start") { script_executed = true; console.log('running document-start script.'); script_result = page.evaluateJavaScript(fetch.js_script); } }; page.onLoadFinished = function(status) { page_loaded = true; if (!script_executed && fetch.js_script && fetch.js_run_at !== "document-start") { script_executed = true; console.log('running document-end script.'); script_result = page.evaluateJavaScript(fetch.js_script); } console.debug("waiting "+wait_before_end+"ms before finished."); end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); }; page.onResourceRequested = function(request) { console.debug("Starting request: #"+request.id+" ["+request.method+"]"+request.url); end_time = null; }; page.onResourceReceived = function(response) { console.debug("Request finished: #"+response.id+" ["+response.status+"]"+response.url); if (first_response === null && response.status != 301 && response.status != 302) { first_response = response; } if (page_loaded) { console.debug("waiting "+wait_before_end+"ms before finished."); end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); } } page.onResourceError = page.onResourceTimeout=function(response) { console.info("Request error: #"+response.id+" ["+response.errorCode+"="+response.errorString+"]"+response.url); if (first_response === null) { first_response = response; } if (page_loaded) { console.debug("waiting "+wait_before_end+"ms before finished."); end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); } } // make sure request will finished setTimeout(make_result, page.settings.resourceTimeout + 100, page); // send request page.open(fetch.url, { operation: fetch.method, data: fetch.data, }); // make response 
// Build the JSON result for the current fetch and send it to the client.
// Called from several timers; only the first call that is allowed to finish
// (page settled, or overall timeout reached) produces a response.
function make_result(page) {
  if (finished) {
    return;
  }
  if (Date.now() - start_time < page.settings.resourceTimeout) {
    // A request is still in flight (end_time is reset to null on each
    // request); a later callback will reschedule us.
    if (!end_time) {
      return;
    }
    // Still inside the quiet period: poll again after the remaining time,
    // at most 100ms.  The previous `Date.now() - end_time` was negative
    // here, i.e. a zero delay that re-polled in a tight loop.
    var remaining = end_time - Date.now();
    if (remaining > 0) {
      setTimeout(make_result, Math.min(remaining, 100), page);
      return;
    }
  }

  // Mark as finished on the success AND the error path so that timers which
  // are still pending cannot write to the response a second time.
  finished = true;

  var result = {};
  try {
    result = _make_result(page);
    console.log("["+result.status_code+"] "+result.orig_url+" "+result.time)
  } catch (e) {
    result = {
      orig_url: fetch.url,
      status_code: 599,
      error: e.toString(),
      content: page.content || "",
      headers: {},
      url: page.url || fetch.url,
      cookies: {},
      time: (Date.now() - start_time) / 1000,
      js_script_result: null,
      save: fetch.save
    }
  }

  // Release the page in both cases (previously it leaked on the error
  // path).  Done after the result is built because that reads page.content.
  try {
    page.close();
  } catch (e) {
    console.log('page close failed: ' + e);
  }

  var body = JSON.stringify(result, null, 2);
  response.writeHead(200, {
    'Cache': 'no-cache',
    'Content-Type': 'application/json',
  });
  response.write(body);
  response.closeGracefully();
}
// Configure `page` from the fetch `options`, navigate to options.url and
// store the navigation response (options.response) and the optional script
// result (options.script_result) on `options`.  Throws on navigation errors
// and on page crashes.
async function _fetch(page, options) {
    // Locals instead of the former implicit globals (width, height,
    // script_result) and without the never-read global `end_time`.
    const width = options.js_viewport_width || 1024;
    const height = options.js_viewport_height || 768 * 3;
    await page.setViewport({
        "width": width,
        "height": height
    });

    if (options.headers) {
        await page.setExtraHTTPHeaders(options.headers);
    }

    if (options.headers && options.headers["User-Agent"]) {
        // setUserAgent returns a promise; wait for it before navigating.
        await page.setUserAgent(options.headers["User-Agent"]);
    }

    page.on("console", msg => {
        console.log('console: ' + msg.args());
    });

    // Http post method: only the first (navigation) request is rewritten
    // into a POST.  request_reseted tells the generic handler below that
    // the current request has already been continued.
    let first_request = true;
    let request_reseted = false;
    await page.setRequestInterception(true);
    if (options.method && options.method.toLowerCase() === "post") {
        page.on("request", interceptedRequest => {
            request_reseted = false;
            if (first_request) {
                first_request = false;
                var data = {
                    "method": "POST",
                    "postData": options.data
                };
                console.log(data);
                interceptedRequest.continue(data);
                request_reseted = true
            }
        })
    } else {
        page.on("request", interceptedRequest => {
            request_reseted = false;
        })
    }

    // load images or not.  String() accepts both the boolean false of a JSON
    // body and the string "false"; calling .toLowerCase() directly threw a
    // TypeError for boolean true and ignored boolean false.
    const skip_images = String(options.load_images).toLowerCase() === "false";
    if (skip_images) {
        page.on("request", request => {
            if (!request_reseted) {
                if (request.resourceType() === 'image')
                    request.abort();
                else
                    request.continue();
            }
        })
    } else {
        page.on("request", request => {
            if (!request_reseted)
                request.continue()
        })
    }

    let error_message = null;
    page.on("error", e => {
        error_message = e
    });

    let page_settings = {};
    var page_timeout = options.timeout ? options.timeout * 1000 : 20 * 1000;
    page_settings["timeout"] = page_timeout
    page_settings["waitUntil"] = ["domcontentloaded", "networkidle0"];

    console.log('goto ', options.url)
    var response = await page.goto(options.url, page_settings);

    if (error_message) {
        throw error_message
    }

    if (options.js_script) {
        console.log('running document-end script.');
        const script_result = await page.evaluate(options.js_script);
        console.log("end script_result is: ", script_result);
        options.script_result = script_result
    }

    if (options.screenshot_path) {
        await page.screenshot({path: options.screenshot_path});
    }

    options.response = response
}
-- Render one page with Splash and return the pyspider fetch result table.
--
-- `fetch` carries the request options sent by the Python fetcher (url,
-- method, data, headers, timeout, proxy_*, js_script, js_run_at,
-- load_images, save, viewport size).  The function waits until no response
-- has arrived for `wait_before_end` seconds or until the timeout expires.
function render(splash, fetch)
  local debug = true
  local function log_message(message, level)
    if debug or level ~= nil then
      print(message)
    end
  end

  -- Fallback when splash:with_timeout is not available: run the function
  -- without a time limit but keep the same return shape.
  if not splash.with_timeout then
    splash.with_timeout = function(self, func, timeout)
      return true, func()
    end
  end

  log_message(json.encode(fetch))

  -- create and set page
  local start_time = os.time()

  splash:clear_cookies()
  splash:autoload_reset()
  splash:on_request_reset()
  splash:on_response_reset()

  splash:set_viewport_size(fetch.js_viewport_width or 1024, fetch.js_viewport_height or 768 * 3)
  if fetch.headers and fetch.headers["User-Agent"] ~= nil then
    splash:set_user_agent(fetch.headers["User-Agent"])
  end
  if fetch.headers then
    fetch.headers['Accept-Encoding'] = nil
    fetch.headers['Connection'] = nil
    fetch.headers['Content-Length'] = nil
    splash:set_custom_headers(fetch.headers)
  end
  splash.images_enabled = (fetch.load_images == true)
  splash.resource_timeout = math.min((fetch.timeout or 20), 58)
  fetch.timeout = splash.resource_timeout

  local wait_before_end = 1.0
  local end_time = start_time + fetch.timeout - 0.1

  -- callbacks
  splash:on_request(function(request)
    -- a new request is in flight: wait for it up to the hard deadline
    end_time = start_time + fetch.timeout - 0.1
    log_message("Starting request: [" .. tostring(request.method) .. "]" .. tostring(request.url))

    if fetch.proxy_host and fetch.proxy_port then
      request:set_proxy({
        host = fetch.proxy_host,
        port = tonumber(fetch.proxy_port),
        username = fetch.proxy_username,
        password = fetch.proxy_password,
        type = 'HTTP'
      })
    end
  end)

  local first_response = nil
  splash:on_response(function(response)
    if first_response == nil then
      first_response = response
    end
    -- wait a little for further responses and rendering
    end_time = math.min(os.time() + wait_before_end + 0.1, start_time + fetch.timeout - 0.1)
    log_message("Request finished: [" .. tostring(response.status) .. "]" .. tostring(response.url))
  end)

  -- send request
  local js_script_result = nil
  local timeout_ok, ok, reason = splash:with_timeout(function()
    local js_script = nil
    if fetch.js_script then
      local compiled_ok, compiled = pcall(function()
        return splash:jsfunc(fetch.js_script)
      end)
      if compiled_ok then
        js_script = compiled
      else
        log_message("js_script error: " .. tostring(compiled), 1)
      end
    end

    if js_script and fetch.js_run_at == "document-start" then
      log_message("running document-start script.");
      local script_ok
      script_ok, js_script_result = pcall(js_script)
      if not script_ok then
        log_message("running document-start script error: " .. tostring(js_script_result), 1)
      end
    end

    -- Keep the navigation outcome in its own locals so that running the user
    -- script afterwards cannot overwrite it.
    local go_ok, go_reason = splash:go{url=fetch.url, http_method=fetch.method, body=fetch.data}
    end_time = math.min(os.time() + wait_before_end + 0.1, start_time + fetch.timeout - 0.1)

    if js_script and fetch.js_run_at ~= "document-start" then
      splash:wait(0.5)
      log_message("running document-end script.");
      local script_ok
      script_ok, js_script_result = pcall(js_script)
      if not script_ok then
        log_message("running document-end script error: " .. tostring(js_script_result), 1)
      end
    end

    -- wait for all requests finished
    local now = os.time()
    while now <= end_time do
      splash:wait(math.min(end_time - now, 0.1))
      now = os.time()
    end

    return go_ok, go_reason
  end, fetch.timeout + 0.1)

  -- When with_timeout itself fails, its second return value is the error
  -- description (e.g. "timeout_over"), not the result of splash:go.
  if not timeout_ok then
    reason = ok
    ok = false
  end

  -- make response
  local cookies = {}
  for _, c in ipairs(splash:get_cookies()) do
    cookies[c.name] = c.value
  end

  local function build_result(status_code, headers, error)
    return {
      orig_url = fetch.url,
      status_code = status_code,
      error = error,
      content = splash:html(),
      headers = headers,
      url = splash:url(),
      cookies = cookies,
      time = os.time() - start_time,
      js_script_result = js_script_result and tostring(js_script_result),
      save = fetch.save
    }
  end

  if first_response == nil then
    -- nothing was received at all
    return build_result(599, {}, reason)
  end

  local status_code = first_response.status == 0 and 599 or first_response.status
  -- Success: the navigation finished fine, or we timed out while waiting for
  -- sub-resources although the main response itself was OK.
  local succeeded = (not timeout_ok and first_response.ok) or (timeout_ok and ok)
  if succeeded then
    return build_result(status_code, first_response.headers, nil)
  end
  return build_result(status_code, first_response.headers, reason)
end

-- Splash entry point: run render() and convert any Lua error into a 599
-- result so the Python side always receives a well-formed table.
function main(splash)
  local fetch = splash.args
  local start_time = os.time()

  local ok, result = pcall(function()
    return render(splash, fetch)
  end)

  if ok then
    return result
  end
  return {
    orig_url = fetch.url,
    status_code = 599,
    error = result,
    content = splash:html(),
    headers = {},
    url = splash:url(),
    cookies = {},
    time = os.time() - start_time,
    js_script_result = nil,
    save = fetch.save
  }
end
class MyCurlAsyncHTTPClient(CurlAsyncHTTPClient):
    """Curl based async client that can report how busy its pool is."""

    def free_size(self):
        """Return the number of idle curl handles."""
        idle_handles = self._free_list
        return len(idle_handles)

    def size(self):
        """Return the number of curl handles currently in use."""
        total_handles = len(self._curls)
        return total_handles - self.free_size()


class MySimpleAsyncHTTPClient(SimpleAsyncHTTPClient):
    """Pure-python async client exposing the same pool accounting."""

    def free_size(self):
        """Return how many more requests may run concurrently."""
        limit = self.max_clients
        return limit - self.size()

    def size(self):
        """Return the number of requests currently active."""
        running = self.active
        return len(running)
def send_result(self, type, task, result):
    '''Send fetch result to processor'''
    if not self.outqueue:
        return
    try:
        self.outqueue.put((task, result))
    except Exception as e:
        logger.exception(e)


def fetch(self, task, callback=None):
    '''Fetch a task: returns the future in async mode, else its result.'''
    run_async = self.async_mode
    future = self.async_fetch(task, callback)
    if run_async:
        return future
    return future.result()


@gen.coroutine
def async_fetch(self, task, callback=None):
    '''Do one fetch'''
    url = task.get('url', 'data:,')
    if callback is None:
        callback = self.send_result

    # fetch_type aliases -> fetcher name (the method is `<name>_fetch`)
    routes = (
        (('js', 'phantomjs'), 'phantomjs'),
        (('splash', ), 'splash'),
        (('puppeteer', ), 'puppeteer'),
    )

    type = 'None'
    start_time = time.time()
    try:
        if url.startswith('data:'):
            type = 'data'
            result = yield gen.maybe_future(self.data_fetch(url, task))
        else:
            fetch_type = task.get('fetch', {}).get('fetch_type')
            chosen = 'http'
            for aliases, fetcher_name in routes:
                if fetch_type in aliases:
                    chosen = fetcher_name
                    break
            # set before the call so a failure is reported under this type
            type = chosen
            result = yield getattr(self, chosen + '_fetch')(url, task)
    except Exception as e:
        logger.exception(e)
        result = self.handle_error(type, url, task, start_time, e)

    callback(type, task, result)
    self.on_result(type, task, result)
    raise gen.Return(result)
def data_fetch(self, url, task):
    '''A fake fetcher for dataurl'''
    self.on_fetch('data', task)
    content = dataurl.decode(url)
    result = {
        'orig_url': url,
        'content': content,
        'headers': {},
        'status_code': 200,
        'url': url,
        'cookies': {},
        'time': 0,
        'save': task.get('fetch', {}).get('save'),
    }

    size = len(content)
    if size < 70:
        logger.info("[200] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
    else:
        # long payloads are logged truncated, together with their length
        logger.info(
            "[200] %s:%s data:,%s...[content:%d] 0s",
            task.get('project'), task.get('taskid'),
            content[:70],
            size
        )
    return result


def handle_error(self, type, url, task, start_time, error):
    '''Build the result dict describing a failed fetch and log it.'''
    status_code = getattr(error, 'code', 599)
    message = utils.text(error)
    if sys.exc_info()[0]:
        trace = traceback.format_exc()
    else:
        trace = None
    elapsed = time.time() - start_time

    result = {
        'status_code': status_code,
        'error': message,
        'traceback': trace,
        'content': "",
        'time': elapsed,
        'orig_url': url,
        'url': url,
        "save": task.get('fetch', {}).get('save')
    }
    logger.error("[%d] %s:%s %s, %r %.2fs",
                 status_code, task.get('project'), task.get('taskid'),
                 url, error, elapsed)
    return result
@gen.coroutine
def can_fetch(self, user_agent, url):
    '''Check robots.txt of the url's domain.

    The parsed robots.txt is cached per domain for `robot_txt_age` seconds.
    When robots.txt cannot be loaded it is treated as empty, i.e. everything
    is allowed.

    Returns (through the coroutine) True if `user_agent` may fetch `url`.
    '''
    parsed = urlsplit(url)
    domain = parsed.netloc

    robot_txt = self.robots_txt_cache.get(domain)
    if robot_txt is not None and time.time() - robot_txt.mtime() > self.robot_txt_age:
        # cached copy expired
        robot_txt = None

    if robot_txt is None:
        robot_txt = RobotFileParser()
        try:
            response = yield gen.maybe_future(self.http_client.fetch(
                urljoin(url, '/robots.txt'), connect_timeout=10, request_timeout=30))
            content = response.body or b''
        except tornado.httpclient.HTTPError as e:
            logger.error('load robots.txt from %s error: %r', domain, e)
            # keep it bytes: text has no .decode() on Python 3
            content = b''

        if isinstance(content, bytes):
            content = content.decode('utf8', 'ignore')

        robot_txt.parse(content.splitlines())
        self.robots_txt_cache[domain] = robot_txt

    raise gen.Return(robot_txt.can_fetch(user_agent, url))


def clear_robot_txt_cache(self):
    '''Drop cached robots.txt entries older than `robot_txt_age` seconds.'''
    now = time.time()
    # Collect first: deleting while iterating the dict raises RuntimeError
    # on Python 3.
    expired = [domain for domain, robot_txt in self.robots_txt_cache.items()
               if now - robot_txt.mtime() > self.robot_txt_age]
    for domain in expired:
        self.robots_txt_cache.pop(domain, None)
@gen.coroutine
def phantomjs_fetch(self, url, task):
    '''Fetch with phantomjs proxy

    Posts the packed fetch options as JSON to `self.phantomjs_proxy` and
    returns (through the coroutine) the result dict produced by the proxy.
    Every failure is converted into an error result by `handle_error`; a 501
    result is returned when no proxy is configured.
    '''
    start_time = time.time()
    self.on_fetch('phantomjs', task)
    handle_error = lambda x: self.handle_error('phantomjs', url, task, start_time, x)

    # check phantomjs proxy is enabled
    if not self.phantomjs_proxy:
        result = {
            "orig_url": url,
            "content": "phantomjs is not enabled.",
            "headers": {},
            "status_code": 501,
            "url": url,
            "time": time.time() - start_time,
            "cookies": {},
            "save": task.get('fetch', {}).get('save')
        }
        logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
        raise gen.Return(result)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})
    # pass through every extra option (js_script, load_images, ...) untouched
    for each in task_fetch:
        if each not in fetch:
            fetch[each] = task_fetch[each]

    # robots.txt
    if task_fetch.get('robots_txt', False):
        user_agent = fetch['headers']['User-Agent']
        can_fetch = yield self.can_fetch(user_agent, url)
        if not can_fetch:
            error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
            raise gen.Return(handle_error(error))

    request_conf = {
        'follow_redirects': False
    }
    request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
    # give the proxy one extra second to report its own timeout
    request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

    # merge the Cookie header and the `cookies` option into one Cookie header
    session = cookies.RequestsCookieJar()
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    request = tornado.httpclient.HTTPRequest(url=fetch['url'])
    cookie_header = cookies.get_cookie_header(session, request)
    if cookie_header:
        fetch['headers']['Cookie'] = cookie_header

    # making requests
    fetch['headers'] = dict(fetch['headers'])
    try:
        request = tornado.httpclient.HTTPRequest(
            url=self.phantomjs_proxy, method="POST",
            body=json.dumps(fetch), **request_conf)
    except Exception as e:
        raise gen.Return(handle_error(e))

    try:
        response = yield gen.maybe_future(self.http_client.fetch(request))
    except tornado.httpclient.HTTPError as e:
        if e.response:
            response = e.response
        else:
            raise gen.Return(handle_error(e))

    if not response.body:
        raise gen.Return(handle_error(Exception('no response from phantomjs: %r' % response)))

    try:
        result = json.loads(utils.text(response.body))
    except Exception as e:
        raise gen.Return(handle_error(e))
    # explicit validation: an assert would be stripped under `python -O`
    if not isinstance(result, dict) or 'status_code' not in result:
        raise gen.Return(handle_error(
            Exception('invalid response from phantomjs: %r' % (result, ))))

    status_code = result['status_code']
    elapsed = result.get('time', time.time() - start_time)
    if 200 <= status_code < 300:
        logger.info("[%d] %s:%s %s %.2fs",
                    status_code, task.get('project'), task.get('taskid'), url, elapsed)
    else:
        logger.error("[%d] %s:%s %s, %r %.2fs",
                     status_code, task.get('project'), task.get('taskid'),
                     url, result.get('error'), elapsed)

    raise gen.Return(result)
@gen.coroutine
def puppeteer_fetch(self, url, task):
    '''Fetch with puppeteer proxy

    Posts the packed fetch options as JSON to `self.puppeteer_proxy` and
    returns (through the coroutine) the result dict produced by the proxy.
    Every failure is converted into an error result by `handle_error`; a 501
    result is returned when no proxy is configured.
    '''
    start_time = time.time()
    self.on_fetch('puppeteer', task)
    handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)

    # check puppeteer proxy is enabled; the attribute is only set by the
    # launcher, so it may be missing entirely
    puppeteer_proxy = getattr(self, 'puppeteer_proxy', None)
    if not puppeteer_proxy:
        result = {
            "orig_url": url,
            "content": "puppeteer is not enabled.",
            "headers": {},
            "status_code": 501,
            "url": url,
            "time": time.time() - start_time,
            "cookies": {},
            "save": task.get('fetch', {}).get('save')
        }
        logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
        raise gen.Return(result)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})
    # pass through every extra option (js_script, load_images, ...) untouched
    for each in task_fetch:
        if each not in fetch:
            fetch[each] = task_fetch[each]

    # robots.txt
    if task_fetch.get('robots_txt', False):
        user_agent = fetch['headers']['User-Agent']
        can_fetch = yield self.can_fetch(user_agent, url)
        if not can_fetch:
            error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
            raise gen.Return(handle_error(error))

    request_conf = {
        'follow_redirects': False
    }
    request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
    # give the proxy one extra second to report its own timeout
    request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

    # merge the Cookie header and the `cookies` option into one Cookie header
    session = cookies.RequestsCookieJar()
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    request = tornado.httpclient.HTTPRequest(url=fetch['url'])
    cookie_header = cookies.get_cookie_header(session, request)
    if cookie_header:
        fetch['headers']['Cookie'] = cookie_header

    logger.info("%s", puppeteer_proxy)
    # making requests
    fetch['headers'] = dict(fetch['headers'])
    headers = {}
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    try:
        request = tornado.httpclient.HTTPRequest(
            url=puppeteer_proxy, method="POST", headers=headers,
            body=json.dumps(fetch), **request_conf)
    except Exception as e:
        raise gen.Return(handle_error(e))

    try:
        response = yield gen.maybe_future(self.http_client.fetch(request))
    except tornado.httpclient.HTTPError as e:
        if e.response:
            response = e.response
        else:
            raise gen.Return(handle_error(e))

    if not response.body:
        raise gen.Return(handle_error(Exception('no response from puppeteer: %r' % response)))

    try:
        result = json.loads(utils.text(response.body))
    except Exception as e:
        raise gen.Return(handle_error(e))
    # explicit validation: an assert would be stripped under `python -O`
    if not isinstance(result, dict) or 'status_code' not in result:
        raise gen.Return(handle_error(
            Exception('invalid response from puppeteer: %r' % (result, ))))

    status_code = result['status_code']
    elapsed = result.get('time', time.time() - start_time)
    if 200 <= status_code < 300:
        logger.info("[%d] %s:%s %s %.2fs",
                    status_code, task.get('project'), task.get('taskid'), url, elapsed)
    else:
        logger.error("[%d] %s:%s %s, %r %.2fs",
                     status_code, task.get('project'), task.get('taskid'),
                     url, result.get('error'), elapsed)

    raise gen.Return(result)
def on_result(self, type, task, result):
    '''Called after task fetched

    Updates the 5-minute and 1-hour counters: one event per status class
    (200, 300, 400, ... or the raw 599), plus download speed and elapsed
    time for `http` / `phantomjs` fetches that report a non-zero time.
    '''
    project = task.get('project')

    status_code = result.get('status_code', 599)
    if status_code != 599:
        # Bucket to the hundreds.  Floor division is required: `/` is true
        # division on Python 3 and would leave e.g. 404 unbucketed.
        status_code = int(status_code) // 100 * 100
    self._cnt['5m'].event((project, status_code), +1)
    self._cnt['1h'].event((project, status_code), +1)

    elapsed = result.get('time')
    if type in ('http', 'phantomjs') and elapsed:
        content_len = len(result.get('content') or '')
        speed = float(content_len) / elapsed
        self._cnt['5m'].event((project, 'speed'), speed)
        self._cnt['1h'].event((project, 'speed'), speed)
        self._cnt['5m'].event((project, 'time'), elapsed)
        self._cnt['1h'].event((project, 'time'), elapsed)
class BaseHandlerMeta(type):
    """Metaclass that collects the cron jobs declared on a handler class.

    Every function in the class body marked with `is_cronjob = True` is
    gathered into `_cron_jobs`, and `_min_tick` is set to the greatest
    common divisor of their `tick` intervals (0 when there are none).
    """

    def __new__(cls, name, bases, attrs):
        def gcd(a, b):
            # Plain Euclid. `fractions.gcd` was removed in Python 3.9 and
            # `math.gcd` rejects floats, which a fractional `tick` may be.
            while b:
                a, b = b, a % b
            return a

        # A list of all functions which are marked as 'is_cronjob=True'
        cron_jobs = []

        # The min_tick is the greatest common divisor (GCD) of the intervals
        # of the cronjobs. This value is queried by the scheduler when the
        # project is initially loaded, so the scheduler only needs to send an
        # _on_cronjob task every min_tick seconds, which reduces the number
        # of tasks sent from the scheduler.
        min_tick = 0

        for each in attrs.values():
            if inspect.isfunction(each) and getattr(each, 'is_cronjob', False):
                cron_jobs.append(each)
                min_tick = gcd(min_tick, each.tick)
        newcls = type.__new__(cls, name, bases, attrs)
        newcls._cron_jobs = cron_jobs
        newcls._min_tick = min_tick
        return newcls
def _run_func(self, function, *arguments):
    """
    Run a callback with only as many arguments as it declares.

    `arguments` is the full positional argument list and its last element
    must be the task dict. `function` is expected to be a bound method: its
    argument spec still lists `self`, hence the `- 1` when slicing.

    When a positive `process_time_limit` is set (in `task['process']`, or
    else in `self.__env__`), the call is wrapped in `timeout(...)`.
    """
    # inspect.getargspec() was removed in Python 3.11; getfullargspec() is
    # the replacement and also lists the bound first parameter.
    getspec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    args = getspec(function).args
    task = arguments[-1]
    process_time_limit = task['process'].get('process_time_limit',
                                             self.__env__.get('process_time_limit', 0))
    call_args = arguments[:len(args) - 1]
    if process_time_limit > 0:
        with timeout(process_time_limit, 'process timeout'):
            ret = function(*call_args)
    else:
        ret = function(*call_args)
    return ret
@staticmethod
def task_join_crawl_config(task, crawl_config):
    """
    Fill the `fetch` and `process` sections of `task` from `crawl_config`.

    For each name in `BaseHandler.fetch_fields` / `process_fields` present in
    `crawl_config`, the value is used as a default for the task. When both
    sides hold dicts they are merged: for fetch fields the task's entries
    override the config's, for process fields the config's entries are
    written into the task's dict. A section is only stored back on the task
    when it ends up non-empty. Returns the (mutated) task.
    """
    fetch_opts = task.get('fetch', {})
    for field in BaseHandler.fetch_fields:
        if field not in crawl_config:
            continue
        default = crawl_config[field]
        if isinstance(default, dict) and isinstance(fetch_opts.get(field), dict):
            # copy first so the shared crawl_config dict is never modified
            merged = dict(default)
            merged.update(fetch_opts[field])
            fetch_opts[field] = merged
        else:
            fetch_opts.setdefault(field, default)
    if fetch_opts:
        task['fetch'] = fetch_opts

    process_opts = task.get('process', {})
    for field in BaseHandler.process_fields:
        if field not in crawl_config:
            continue
        default = crawl_config[field]
        if isinstance(default, dict) and isinstance(process_opts.get(field), dict):
            process_opts[field].update(default)
        else:
            process_opts.setdefault(field, default)
    if process_opts:
        task['process'] = process_opts

    return task
def crawl(self, url, **kwargs):
    '''
    Schedule one or more urls for crawling.

    `url` may be a single url string, a `curl ...` command line (its parsed
    arguments become defaults for `kwargs`), or an iterable of urls. Returns
    what `self._crawl` returns for a single url, a list of those results for
    an iterable, and None for any other kind of `url`.

    available params:
      url
      callback

      method
      params
      data
      files
      headers
      timeout
      allow_redirects
      cookies
      proxy
      etag
      last_modified
      auto_recrawl

      fetch_type
      js_run_at
      js_script
      js_viewport_width
      js_viewport_height
      load_images

      priority
      retries
      exetime
      age
      itag
      cancel

      save
      taskid

      full documents: http://pyspider.readthedocs.org/en/latest/apis/self.crawl/
    '''
    is_text = isinstance(url, six.string_types)
    if is_text and url.startswith('curl '):
        parsed = curl_to_arguments(url)
        url = parsed.pop('urls')
        # explicit keyword arguments win over what the curl command implied
        for option, setting in iteritems(parsed):
            kwargs.setdefault(option, setting)

    if isinstance(url, six.string_types):
        return self._crawl(url, **kwargs)
    if hasattr(url, "__iter__"):
        return [self._crawl(item, **kwargs) for item in url]
def _on_get_info(self, response, task):
    """Send runtime information about this script.

    `response.save` lists the requested item names; each recognised one
    ('min_tick', 'retry_delay', 'crawl_config') is copied into `self.save`.
    A non-dict `retry_delay` is first normalised to `{'': retry_delay}`
    and stored back on the handler. Unknown names are ignored.
    """
    requested = response.save or []
    for name in requested:
        if name == 'retry_delay':
            delay = self.retry_delay
            if not isinstance(delay, dict):
                delay = {'': delay}
                self.retry_delay = delay
            self.save[name] = delay
            continue
        if name == 'min_tick':
            self.save[name] = self._min_tick
        elif name == 'crawl_config':
            self.save[name] = self.crawl_config
time.time() for i in range(n): task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start) task['taskid'] = md5string(task['url']) task['track'] = {} taskdb.insert(task['project'], task['taskid'], task) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) def test_update(n, start=0): logger.info("taskdb update %d" % n) start_time = time.time() for i in range(n): task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start) task['taskid'] = md5string(task['url']) task['track'] = track taskdb.update(task['project'], task['taskid'], task) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) request_task_fields = [ 'taskid', 'project', 'url', 'status', 'fetch', 'process', 'track', 'lastcrawltime' ] def test_get(n, start=0, random=True, fields=request_task_fields): logger.info("taskdb get %d %s" % (n, "randomly" if random else "")) range_n = list(range(n)) if random: from random import shuffle shuffle(range_n) start_time = time.time() for i in range_n: task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start) task['taskid'] = md5string(task['url']) task['track'] = track taskdb.get_task(task['project'], task['taskid'], fields=fields) end_time = time.time() cost_time = end_time - start_time logger.info("cost %.2fs, %.2f/s %.2fms", cost_time, n * 1.0 / cost_time, cost_time / n * 1000) try: test_insert(1000) test_update(1000) test_get(1000) test_insert(10000, 1000) test_update(10000, 1000) test_get(10000, 1000) except Exception as e: logger.exception(e) finally: taskdb.drop(project_name) def bench_test_message_queue(queue): task = { "fetch": { "fetch_type": "js", "headers": { "User-Agent": "BaiDuSpider" } }, "process": { "callback": "detail_page" }, "project": "__bench_test__", "taskid": "553300d2582154413b4982c00c34a2d5", "url": 
class BenchMixin(object):
    """Mixin that counts finished items and logs throughput for bench tests."""

    def _bench_init(self):
        """Reset the counters used by `_bench_report`."""
        self.done_cnt = 0
        self.start_time = time.time()
        self.last_cnt = 0
        self.last_report = 0

    def _bench_report(self, name, prefix=0, rjust=0):
        """Count one finished item and log the rate at most once per second.

        `prefix` spaces are put in front of the message and the message
        itself is right-justified to `rjust` columns.
        """
        self.done_cnt += 1
        now = time.time()
        elapsed = now - self.last_report
        if not elapsed >= 1:
            return

        rps = float(self.done_cnt - self.last_cnt) / elapsed
        message = "%s %s pages (at %d pages/min)" % (name, self.done_cnt, rps * 60.0)
        padding = " " * prefix if prefix else ''
        logger.info(padding + message.rjust(rjust))
        self.last_cnt = self.done_cnt
        self.last_report = now
class BaseCounter(object):
    """Interface shared by all counters."""

    def __init__(self):
        pass

    def event(self, value=1):
        """Fire a event."""
        raise NotImplementedError

    def value(self, value):
        """Set counter value."""
        raise NotImplementedError

    @property
    def avg(self):
        """Get average value"""
        raise NotImplementedError

    @property
    def sum(self):
        """Get sum of counter"""
        raise NotImplementedError

    def empty(self):
        """Return true when the counter holds no data (it does not clear it)."""
        raise NotImplementedError


class TotalCounter(BaseCounter):
    """Total counter: a single running total."""

    def __init__(self):
        super(TotalCounter, self).__init__()
        self.cnt = 0

    def event(self, value=1):
        self.cnt += value

    def value(self, value):
        self.cnt = value

    @property
    def avg(self):
        return self.cnt

    @property
    def sum(self):
        return self.cnt

    def empty(self):
        return self.cnt == 0


class AverageWindowCounter(BaseCounter):
    """
    Record last N(window) value
    """

    def __init__(self, window_size=300):
        super(AverageWindowCounter, self).__init__()
        self.window_size = window_size
        # deque(maxlen=...) silently drops the oldest value once full
        self.values = deque(maxlen=window_size)

    def event(self, value=1):
        self.values.append(value)

    # setting a value is the same as recording one
    value = event

    @property
    def avg(self):
        """Mean of the recorded values, or 0 when nothing is recorded."""
        if not self.values:
            # avoid ZeroDivisionError on an empty window
            return 0
        return self.sum / len(self.values)

    @property
    def sum(self):
        return sum(self.values)

    def empty(self):
        return not self.values
def _trim_window(self):
    """Flush an expired cache slot and drop records older than the window.

    If the current cache slot is older than `window_interval`, its value,
    event count and start time are appended to the deques and the cache is
    reset. While the window is still growing, `window_size` is recomputed
    from the time since the first event (capped at `max_window_size`);
    records that started before the window are then discarded.
    """
    now = time.time()
    if self.cache_start and now - self.cache_start > self.window_interval:
        self.values.append(self.cache_value)
        self.events.append(self.cache_event)
        self.times.append(self.cache_start)
        self.on_append(self.cache_value, self.cache_start)
        self.cache_value = 0
        # The flushed events now live in self.events; leaving cache_event
        # set would make `avg` count them twice.
        self.cache_event = 0
        self.cache_start = None

    if self.window_size != self.max_window_size and self._first_data_time is not None:
        time_passed = now - self._first_data_time
        self.window_size = min(self.max_window_size, time_passed / self.window_interval)
    window_limit = now - self.window_size * self.window_interval
    while self.times and self.times[0] < window_limit:
        self.times.popleft()
        self.events.popleft()
        self.values.popleft()
def event(self, value=1):
    """Add `value` to the current cache slot and return `self`.

    The first event starts a slot. An event arriving more than
    `window_interval` seconds after the slot started flushes the slot into
    `values` / `times` (notifying `on_append`) and starts a new one; any
    other event is accumulated into the current slot.
    """
    timestamp = time.time()
    if self._first_data_time is None:
        self._first_data_time = timestamp

    started = self.cache_start
    if started is not None and not (timestamp - started > self.window_interval):
        # still inside the current slot
        self.cache_value += value
        return self

    if started is not None:
        # the slot expired: archive it before starting over
        self.values.append(self.cache_value)
        self.times.append(started)
        self.on_append(self.cache_value, started)

    self.cache_value = value
    self.cache_start = timestamp
    return self
def keys(self):
    """Return the set of next-level key names below this value's prefix.

    Every counter key of the manager that starts with `self._keys`
    contributes the element following the prefix, or '__value__' when the
    counter key equals the prefix exactly.
    """
    prefix = self._keys
    names = set()
    for full_key in list(self.manager.counters.keys()):
        if full_key[:len(prefix)] != prefix:
            continue
        rest = full_key[len(prefix):]
        names.add(rest[0] if rest else '__value__')
    return names
def keys(self):
    """Return the set of first elements of all counter keys.

    An empty tuple key is reported as `()`.
    """
    return set(key[0] if key else () for key in self.counters.keys())
def decode(data_url):
    """
    Decode DataURL data

    The part before the last ',' is the metadata (`data:[mime][;charset=..]
    [;base64]`), the rest is the payload. Base64 payloads are decoded to
    bytes, other payloads are percent-unquoted. When the metadata carries a
    `charset=` parameter the payload is returned as text in that charset.
    """
    try:
        from urllib.parse import unquote_to_bytes
    except ImportError:  # Python 2: unquote() already yields a byte string
        unquote_to_bytes = unquote

    metadata, data = data_url.rsplit(',', 1)
    _, metadata = metadata.split('data:', 1)
    parts = metadata.split(';')

    charset = None
    for part in parts:
        if part.startswith("charset="):
            charset = part[8:]
            break

    if parts[-1] == 'base64':
        data = b64decode(data)
        if charset:
            data = data.decode(charset)
    elif charset:
        # On Python 3 unquote() returns str, which has no .decode(); unquote
        # to raw bytes first so the declared charset is honoured.
        data = unquote_to_bytes(data).decode(charset)
    else:
        data = unquote(data)
    return data
class SaveLogHandler(logging.Handler):
    """Logging handler that appends every record to a caller-supplied list."""

    def __init__(self, saveto=None, *args, **kwargs):
        # `saveto` is the list to collect records in; None disables saving.
        self.saveto = saveto
        super(SaveLogHandler, self).__init__(*args, **kwargs)

    def emit(self, record):
        """Store `record` in `saveto`, if a target was given."""
        target = self.saveto
        if target is None:
            return
        target.append(record)

    # handle() is aliased to emit(), so records are stored directly.
    handle = emit
class MultiProcessingQueue(BaseQueue):
    """
    A portable implementation of multiprocessing.Queue.

    Because of multithreading / multiprocessing semantics, Queue.qsize() may
    raise the NotImplementedError exception on Unix platforms like Mac OS X
    where sem_getvalue() is not implemented. This subclass addresses this
    problem by using a synchronized shared counter (initialized to zero) and
    increasing / decreasing its value every time the put() and get() methods
    are called, respectively, so that qsize() can be answered from the
    counter instead.
    """

    def __init__(self, *args, **kwargs):
        super(MultiProcessingQueue, self).__init__(*args, **kwargs)
        self.size = SharedCounter(0)

    def put(self, *args, **kwargs):
        """Put an item and count it; the count is undone if the put fails."""
        self.size.increment(1)
        try:
            super(MultiProcessingQueue, self).put(*args, **kwargs)
        except BaseException:
            # Nothing was enqueued (e.g. the queue was full or the call
            # timed out): roll back so qsize() does not drift upwards.
            self.size.increment(-1)
            raise

    def get(self, *args, **kwargs):
        """Get an item; the count only drops once an item was obtained."""
        v = super(MultiProcessingQueue, self).get(*args, **kwargs)
        self.size.increment(-1)
        return v

    def qsize(self):
        """ Reliable implementation of multiprocessing.Queue.qsize() """
        return self.size.value
def pformat(object, indent=1, width=80, depth=None):
    """Format a Python object into a pretty-printed representation.

    indent, width and depth are handed unchanged to PrettyPrinter; the
    value returned is whatever its pformat() method produces.
    """
    printer = PrettyPrinter(indent=indent, width=width, depth=depth)
    return printer.pformat(object)
class PrettyPrinter:
    """Pretty-printer for nested dicts, lists, tuples, sets and frozensets.

    Containers whose one-line representation does not fit in the configured
    width are written one element per line; everything else is written
    using the representation returned by format().
    """

    def __init__(self, indent=1, width=80, depth=None, stream=None):
        """Handle pretty printing operations onto a stream using a set of
        configured parameters.

        indent
            Number of spaces to indent for each level of nesting.

        width
            Attempted maximum number of columns in the output.

        depth
            The maximum depth to print out nested structures.

        stream
            The desired output stream.  If omitted (or None), the standard
            output stream available at construction will be used.

        """
        indent = int(indent)
        width = int(width)
        assert indent >= 0, "indent must be >= 0"
        assert depth is None or depth > 0, "depth must be > 0"
        assert width, "width must be != 0"
        self._depth = depth
        self._indent_per_level = indent
        self._width = width
        if stream is not None:
            self._stream = stream
        else:
            self._stream = _sys.stdout

    def pprint(self, object):
        """Write the formatted object, followed by a newline, to the stream."""
        self._format(object, self._stream, 0, 0, {}, 0)
        self._stream.write("\n")

    def pformat(self, object):
        """Return the formatted object instead of writing it to the stream."""
        # _format() writes native ``str`` values: those are bytes on
        # python2 but text on python3, where a BytesIO would reject them.
        if six.PY2:
            sio = BytesIO()
        else:
            sio = StringIO()
        self._format(object, sio, 0, 0, {}, 0)
        return sio.getvalue()

    def isrecursive(self, object):
        """Return the 'recursive' flag reported by format() for object."""
        return self.format(object, {}, 0, 0)[2]

    def isreadable(self, object):
        """Return True when format() reports object as readable and not recursive."""
        s, readable, recursive = self.format(object, {}, 0, 0)
        return readable and not recursive

    def _format(self, object, stream, indent, allowance, context, level):
        """Write object to stream, breaking containers over several lines.

        indent is the current column, allowance the number of columns to
        keep free for closing brackets, context maps the ids of the
        containers currently being written (to detect recursion) and level
        is the current nesting depth.
        """
        level = level + 1
        objid = _id(object)
        if objid in context:
            # already inside this very object: emit a marker, do not recurse
            stream.write(_recursion(object))
            self._recursive = True
            self._readable = False
            return
        rep = self._repr(object, context, level - 1)
        typ = _type(object)
        # True when the one-line representation does not fit in the width
        sepLines = _len(rep) > (self._width - 1 - indent - allowance)
        write = stream.write

        if self._depth and level > self._depth:
            write(rep)
            return

        # only types that keep the builtin __repr__ are laid out by hand
        r = getattr(typ, "__repr__", None)
        if issubclass(typ, dict) and r is dict.__repr__:
            write('{')
            if self._indent_per_level > 1:
                write((self._indent_per_level - 1) * ' ')
            length = _len(object)
            if length:
                context[objid] = 1
                indent = indent + self._indent_per_level
                items = _sorted(object.items())
                key, ent = items[0]
                rep = self._repr(key, context, level)
                write(rep)
                write(': ')
                self._format(ent, stream, indent + _len(rep) + 2,
                             allowance + 1, context, level)
                if length > 1:
                    for key, ent in items[1:]:
                        rep = self._repr(key, context, level)
                        if sepLines:
                            write(',\n%s%s: ' % (' ' * indent, rep))
                        else:
                            write(', %s: ' % rep)
                        self._format(ent, stream, indent + _len(rep) + 2,
                                     allowance + 1, context, level)
                indent = indent - self._indent_per_level
                del context[objid]
            write('}')
            return

        if (
            (issubclass(typ, list) and r is list.__repr__) or
            (issubclass(typ, tuple) and r is tuple.__repr__) or
            (issubclass(typ, set) and r is set.__repr__) or
            (issubclass(typ, frozenset) and r is frozenset.__repr__)
        ):
            length = _len(object)
            if issubclass(typ, list):
                write('[')
                endchar = ']'
            elif issubclass(typ, set):
                if not length:
                    write('set()')
                    return
                write('set([')
                endchar = '])'
                object = _sorted(object)
                indent += 4  # len('set(')
            elif issubclass(typ, frozenset):
                if not length:
                    write('frozenset()')
                    return
                write('frozenset([')
                endchar = '])'
                object = _sorted(object)
                indent += 10  # len('frozenset(')
            else:
                write('(')
                endchar = ')'
            if self._indent_per_level > 1 and sepLines:
                write((self._indent_per_level - 1) * ' ')
            if length:
                context[objid] = 1
                indent = indent + self._indent_per_level
                self._format(object[0], stream, indent, allowance + 1,
                             context, level)
                if length > 1:
                    for ent in object[1:]:
                        if sepLines:
                            write(',\n' + ' ' * indent)
                        else:
                            write(', ')
                        self._format(ent, stream, indent,
                                     allowance + 1, context, level)
                indent = indent - self._indent_per_level
                del context[objid]
            if issubclass(typ, tuple) and length == 1:
                # a one-element tuple needs its trailing comma
                write(',')
            write(endchar)
            return

        write(rep)

    def _repr(self, object, context, level):
        """Return format()'s text for object and record its flags on self."""
        repr, readable, recursive = self.format(object, context.copy(),
                                                self._depth, level)
        if not readable:
            self._readable = False
        if recursive:
            self._recursive = True
        return repr

    def format(self, object, context, maxlevels, level):
        """Format object for a specific context, returning a string
        and flags indicating whether the representation is 'readable'
        and whether the object represents a recursive construct.
        """
        return _safe_repr(object, context, maxlevels, level)
def _recursion(object):
    """Return the marker text used where *object* contains itself.

    The marker names the object's type and id. It starts with ``<`` so it
    is never mistaken for a representation that eval() could read back.
    """
    # The format string must carry one placeholder per argument: applying
    # ``%`` to an empty string raises TypeError.
    return ("<Recursion on %s with id=%s>"
            % (type(object).__name__, id(object)))
class Response(object):
    """A fetched page: status, headers, raw content and lazily parsed views.

    The parsed views (``text``, ``json``, ``doc``, ``etree``) are computed
    on first access and cached on the instance.
    """

    def __init__(self, status_code=None, url=None, orig_url=None, headers=None,
                 content='', cookies=None, error=None, traceback=None, save=None,
                 js_script_result=None, time=0):
        # Build the default containers per instance: a mapping created in the
        # signature would be one object shared by every Response that was
        # constructed without explicit headers.
        if headers is None:
            headers = CaseInsensitiveDict()
        if cookies is None:
            cookies = {}
        self.status_code = status_code
        self.url = url
        self.orig_url = orig_url
        self.headers = headers
        self.content = content
        self.cookies = cookies
        self.error = error
        self.traceback = traceback
        self.save = save
        self.js_script_result = js_script_result
        self.time = time

    def __repr__(self):
        # %s (not %d) so a Response without a status code still has a repr
        return u'<Response [%s]>' % (self.status_code,)

    def __bool__(self):
        """Truth value of the response: same as `ok`."""
        return self.ok

    def __nonzero__(self):
        """Truth value of the response on python2: same as `ok`."""
        return self.ok

    @property
    def ok(self):
        """Return True if `raise_for_status()` does not raise."""
        try:
            self.raise_for_status()
        except Exception:
            return False
        return True

    @property
    def encoding(self):
        """
        encoding of Response.content.

        if Response.encoding is None, encoding will be guessed
        by header or content or chardet if available.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        # content is unicode
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Try charset from content-type or content
        encoding = get_encoding(self.headers, self.content)

        # Fallback to auto-detected encoding.
        if not encoding and chardet is not None:
            encoding = chardet.detect(self.content[:600])['encoding']

        # decode pages declared as gb2312 with gb18030 instead
        if encoding and encoding.lower() == 'gb2312':
            encoding = 'gb18030'

        self._encoding = encoding or 'utf-8'
        return self._encoding

    @encoding.setter
    def encoding(self, value):
        """
        set encoding of content manually
        it will overwrite the guessed encoding
        """
        self._encoding = value
        # drop the cached text so it is decoded again with the new encoding
        self._text = None

    @property
    def text(self):
        """
        Content of the response, in unicode.

        if Response.encoding is None and chardet module is available, encoding
        will be guessed.
        """
        if hasattr(self, '_text') and self._text:
            return self._text
        if not self.content:
            return u''
        if isinstance(self.content, six.text_type):
            return self.content

        content = None
        encoding = self.encoding

        # Decode unicode from given encoding.
        try:
            content = self.content.decode(encoding, 'replace')
        except LookupError:
            # A LookupError is raised if the encoding was not found which could
            # indicate a misspelling or similar mistake.
            #
            # So we fall back to decoding as utf-8.
            content = self.content.decode('utf-8', 'replace')

        self._text = content
        return content

    @property
    def json(self):
        """Return the content parsed as JSON, or None if it is not valid JSON."""
        if hasattr(self, '_json'):
            return self._json
        try:
            self._json = json.loads(self.text or self.content)
        except ValueError:
            self._json = None
        return self._json

    @property
    def doc(self):
        """Returns a PyQuery object of the response's content"""
        if hasattr(self, '_doc'):
            return self._doc
        elements = self.etree
        doc = self._doc = PyQuery(elements)
        doc.make_links_absolute(utils.text(self.url))
        return doc

    @property
    def etree(self):
        """Returns a lxml object of the response's content that can be selected by xpath"""
        if not hasattr(self, '_elements'):
            try:
                parser = lxml.html.HTMLParser(encoding=self.encoding)
                self._elements = lxml.html.fromstring(self.content, parser=parser)
            except LookupError:
                # lxml would raise LookupError when encoding not supported
                # try fromstring without encoding instead.
                # on windows, unicode is not available as encoding for lxml
                self._elements = lxml.html.fromstring(self.content)
        if isinstance(self._elements, lxml.etree._ElementTree):
            self._elements = self._elements.getroot()
        return self._elements

    def raise_for_status(self, allow_redirects=True):
        """Raise the stored error, or an HTTPError for a 4xx/5xx status.

        A 304 status never raises. A 3xx status raises only when
        `allow_redirects` is False. When a traceback string was stored with
        the error, the error is re-raised as an Exception carrying that
        traceback. The raised HTTPError has this object as `response`.
        """
        if self.status_code == 304:
            return
        elif self.error:
            if self.traceback:
                six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback())
            http_error = HTTPError(self.error)
        elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:
            http_error = HTTPError('%s Redirection' % (self.status_code))
        elif (self.status_code >= 400) and (self.status_code < 500):
            http_error = HTTPError('%s Client Error' % (self.status_code))
        elif (self.status_code >= 500) and (self.status_code < 600):
            http_error = HTTPError('%s Server Error' % (self.status_code))
        else:
            return

        http_error.response = self
        raise http_error

    def isok(self):
        """Method form of `ok`: True if `raise_for_status()` does not raise."""
        try:
            self.raise_for_status()
            return True
        except Exception:
            return False
def dump_as_txt(results):
    """Yield one ``url<TAB>json<NEWLINE>`` line for every result.

    The second column is the JSON encoding of the result's 'result' entry
    (``null`` when absent). A result without a 'url' entry, or with a None
    url, gets an empty first column instead of aborting the whole dump
    with a TypeError.
    """
    for result in results:
        url = result.get('url', None)
        if url is None:
            # None cannot be concatenated with the tab separator
            url = ''
        yield (
            url + '\t' +
            json.dumps(result.get('result', None), ensure_ascii=False) + '\n'
        )
class Handler(BaseHandler):
    # Sample handler: the start page is crawled, every absolute http(s) link
    # found on it is crawled in turn, and each of those pages yields a
    # {"url", "title"} result.

    # Left empty in the sample.
    # NOTE(review): presumably holds options shared by every crawl of this
    # handler - confirm against BaseHandler.
    crawl_config = {
    }

    # 24 * 60 minutes is one day.
    # NOTE(review): `every` is defined outside this file; presumably it
    # re-runs on_start at that interval - confirm.
    @every(minutes=24 * 60)
    def on_start(self):
        # '__START_URL__' is a placeholder, like __DATE__ and __PROJECT_NAME__
        # in the file header.
        self.crawl('__START_URL__', callback=self.index_page)

    # 10 * 24 * 60 * 60 is the number of seconds in ten days.
    # NOTE(review): the meaning of `age` is defined by `config` - confirm.
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # the selector matches anchors whose href starts with "http"
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # the returned dict is the result of this page
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
def curl_to_arguments(curl):
    """Convert a ``curl`` command line into a dict of keyword arguments.

    The command is split with shlex. The first word is taken as the command
    name, bare words are collected as urls, and the supported options are:

    * ``--compressed``          -> ``use_gzip=True``
    * ``-H`` / ``--header``     -> an entry in ``headers`` (values without a
      ``:`` are ignored)
    * ``-d`` / ``--data``       -> ``data``
    * ``--data-binary``         -> ``data`` (one leading ``$`` is dropped)
    * ``-X`` / ``--request``    -> ``method``

    Every other word starting with ``-`` is treated as an option taking a
    value, and is rejected once that value is read.

    Returns a dict that always has ``urls`` and has ``headers`` only when at
    least one header was given.

    Raises TypeError for an unsupported option, for an option left without
    a value, and when no url was given.
    """
    kwargs = {}
    headers = {}
    command = None
    urls = []
    current_opt = None

    for part in shlex.split(curl):
        if command is None:
            # curl
            command = part
            continue
        if current_opt is None:
            if not part.startswith('-'):
                # waiting for url
                urls.append(part)
            elif part == '--compressed':
                # the only flag known to take no value
                kwargs['use_gzip'] = True
            else:
                # remember the option, its value is the next word
                current_opt = part
            continue

        # ``part`` is the value of ``current_opt``
        if current_opt in ('-H', '--header'):
            key_value = part.split(':', 1)
            if len(key_value) == 2:
                key, value = key_value
                headers[key.strip()] = value.strip()
        elif current_opt in ('-d', '--data'):
            kwargs['data'] = part
        elif current_opt == '--data-binary':
            # Compared with ==, not ``in ('--data-binary')``: that is a
            # substring test on a plain string and also accepted unrelated
            # options such as ``-b``.
            # NOTE(review): the '$' presumably comes from shell $'...'
            # quoting in copied commands - confirm.
            if part.startswith('$'):
                part = part[1:]
            kwargs['data'] = part
        elif current_opt in ('-X', '--request'):
            kwargs['method'] = part
        else:
            raise TypeError('Unknow curl option: %s' % current_opt)
        current_opt = None

    if not urls:
        raise TypeError('curl: no URL specified!')
    if current_opt:
        # the last option never received its value
        raise TypeError('Unknow curl option: %s' % current_opt)

    kwargs['urls'] = urls
    if headers:
        kwargs['headers'] = headers

    return kwargs
def run_in_thread(func, *args, **kwargs):
    """Start ``func(*args, **kwargs)`` in a daemon thread.

    The thread is already running when this returns; the Thread object is
    returned to the caller.
    """
    import threading as _threading

    worker = _threading.Thread(target=func, args=args, kwargs=kwargs)
    worker.daemon = True
    worker.start()
    return worker
def fix_full_format(days, seconds, relative, shorter, local_date, local_yesterday):
    """Choose the wording for a date that is ``days``/``seconds`` in the past.

    Returns a ``(final, value)`` pair. When ``final`` is True, ``value`` is
    the finished text ("5 minutes ago"); this only happens for relative
    output within the same day. Otherwise ``value`` is a %-template using
    the keys ``time``, ``weekday``, ``month`` and ``day``, or None when no
    short form applies (``days`` of 334 or more).
    """
    if relative and days == 0:
        if seconds < 50:
            if seconds <= 1:
                return True, "1 second ago"
            return True, "%(seconds)d seconds ago" % {"seconds": seconds}

        if seconds < 50 * 60:
            minutes = round(seconds / 60.0)
            if minutes <= 1:
                return True, "1 minute ago"
            return True, "%(minutes)d minutes ago" % {"minutes": minutes}

        hours = round(seconds / (60.0 * 60))
        if hours <= 1:
            return True, "1 hour ago"
        return True, "%(hours)d hours ago" % {"hours": hours}

    if days == 0:
        return False, "%(time)s"

    if days == 1 and local_date.day == local_yesterday.day and relative:
        if shorter:
            return False, "yesterday"
        return False, "yesterday at %(time)s"

    if days < 5:
        if shorter:
            return False, "%(weekday)s"
        return False, "%(weekday)s at %(time)s"

    if days < 334:  # 11mo, since confusing for same month last year
        if shorter:
            return False, "%(month)s-%(day)s"
        return False, "%(month)s-%(day)s at %(time)s"

    return False, None
def unicode_string(string):
    """
    Make sure string is unicode, try to decode with utf8, or base64 if failed.

    Bytes that are not valid utf8 are returned as their base64 text wrapped
    in ``[BASE64-DATA]`` ... ``[/BASE64-DATA]`` markers.

    can been decode by `decode_unicode_string`
    """
    if isinstance(string, six.text_type):
        return string
    try:
        return string.decode("utf8")
    except UnicodeDecodeError:
        # b64encode() returns bytes; decode it before joining it with the
        # text markers, otherwise python3 raises TypeError (str + bytes).
        encoded = base64.b64encode(string).decode('ascii')
        return '[BASE64-DATA]' + encoded + '[/BASE64-DATA]'
def load_object(name):
    """Import and return the object named by a dotted ``module.object`` path.

    Everything before the last dot is imported as a module; the part after
    it is looked up as an attribute of that module. A name without any dot
    raises Exception.
    """
    dot = name.rfind('.')
    if dot < 0:
        raise Exception('load object need module.object')
    module_name = name[:dot]
    object_name = name[dot + 1:]

    if six.PY2:
        # python2: the fromlist entry is passed as utf8 bytes, with level -1
        fromlist = [utf8(object_name)]
        extra = (-1,)
    else:
        fromlist = [object_name]
        extra = ()
    module = __import__(module_name, globals(), locals(), fromlist, *extra)
    return getattr(module, object_name)
class WSGIXMLRPCApplication(object):
    """WSGI application that serves XML-RPC calls via SimpleXMLRPCDispatcher.

    POST requests are dispatched as XML-RPC calls; every other request
    method is answered with "400 Bad request".
    """

    def __init__(self, instance=None, methods=None):
        """Create the dispatcher and register what should be exposed.

        instance: optional object registered with `register_instance`
        methods:  optional iterable of callables, each registered with
                  `register_function` under its default name
        """
        if methods is None:
            methods = []
        try:
            self.dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None)
        except TypeError:
            # python 2.4
            self.dispatcher = SimpleXMLRPCDispatcher()
        if instance is not None:
            self.dispatcher.register_instance(instance)
        for method in methods:
            self.dispatcher.register_function(method)
        self.dispatcher.register_introspection_functions()

    def register_instance(self, instance):
        """Forward to the dispatcher's `register_instance`."""
        return self.dispatcher.register_instance(instance)

    def register_function(self, function, name=None):
        """Forward to the dispatcher's `register_function`."""
        return self.dispatcher.register_function(function, name)

    def handler(self, environ, start_response):
        """WSGI entry point: dispatch POST, reject everything else."""
        if environ['REQUEST_METHOD'] == 'POST':
            return self.handle_POST(environ, start_response)

        start_response("400 Bad request", [('Content-Type', 'text/plain')])
        # WSGI response bodies must be bytes; a text '' breaks Python 3 servers.
        return [b'']

    def handle_POST(self, environ, start_response):
        """Handles the HTTP POST request.

        Attempts to interpret all HTTP POST requests as XML-RPC calls,
        which are forwarded to the dispatcher's _dispatch method for handling.

        Any exception (including a missing or invalid CONTENT_LENGTH) is
        logged and answered with "500 Server error" and an empty body.
        """
        try:
            # The whole request body is read in one call, sized by
            # CONTENT_LENGTH.
            length = int(environ['CONTENT_LENGTH'])
            data = environ['wsgi.input'].read(length)

            # Pass the dispatcher's own _dispatch (if it has one) so that a
            # dispatcher overriding _dispatch keeps working.
            response = self.dispatcher._marshaled_dispatch(
                data, getattr(self.dispatcher, '_dispatch', None)
            )
            response += b'\n'
        except Exception as e:
            # This should only happen if the module is buggy;
            # internal error, report as HTTP server error.
            logger.exception(e)
            start_response("500 Server error", [('Content-Type', 'text/plain')])
            return []
        else:
            # got a valid XML RPC response
            start_response("200 OK", [('Content-Type', 'text/xml'),
                                      ('Content-Length', str(len(response)),)])
            return [response]

    def __call__(self, environ, start_response):
        return self.handler(environ, start_response)
level=DEBUG handlers=screen qualname=bench propagate=0 [logger_werkzeug] level=INFO handlers=screen qualname=werkzeug propagate=0 [handlers] keys=screen [handler_screen] class=logging.StreamHandler formatter=pretty level=DEBUG args=(sys.stderr, ) [formatters] keys=pretty [formatter_pretty] class=pyspider.libs.log.LogFormatter ================================================ FILE: pyspider/message_queue/__init__.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-04-30 21:47:08 import logging try: from urllib import parse as urlparse except ImportError: import urlparse def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True): """ create connection to message queue name: name of message queue rabbitmq: amqp://username:password@host:5672/%2F see https://www.rabbitmq.com/uri-spec.html redis: redis://host:6379/db redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode) kombu: kombu+transport://userid:password@hostname:port/virtual_host see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls builtin: None """ if not url: from pyspider.libs.multiprocessing_queue import Queue return Queue(maxsize=maxsize) parsed = urlparse.urlparse(url) if parsed.scheme == 'amqp': from .rabbitmq import Queue return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) elif parsed.scheme == 'redis': from .redis_queue import Queue if ',' in parsed.netloc: """ redis in cluster mode (there is no concept of 'db' in cluster mode) ex. 
class KombuQueue(object):
    """
    kombu is a high-level interface for multiple message queue backends.

    KombuQueue is built on top of kombu API.
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, url="amqp://", maxsize=0, lazy_limit=True):
        """
        Constructor for KombuQueue

        url:        http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        lazy_limit: when true (and maxsize is set) the real queue size is only
                    re-checked after int(maxsize * 0.1) puts instead of before
                    every put.
        """
        self.name = name
        self.conn = Connection(url)
        self.queue = self.conn.SimpleQueue(self.name, no_ack=True, serializer='umsgpack')

        self.maxsize = maxsize
        self.lazy_limit = lazy_limit
        if self.lazy_limit and self.maxsize:
            self.qsize_diff_limit = int(self.maxsize * 0.1)
        else:
            self.qsize_diff_limit = 0
        # number of puts since the queue size was last checked
        self.qsize_diff = 0

    def qsize(self):
        """Return the backend's queue size, or 0 on a ChannelError."""
        try:
            return self.queue.qsize()
        except ChannelError:
            return 0

    def empty(self):
        return self.qsize() == 0

    def full(self):
        return bool(self.maxsize and self.qsize() >= self.maxsize)

    def put(self, obj, block=True, timeout=None):
        """
        Put `obj`, retrying while the queue is full.

        With block=False this is put_nowait().  Otherwise it retries every
        `max_timeout` seconds; when `timeout` is set, Full is re-raised once
        that many seconds have passed.
        """
        if not block:
            return self.put_nowait(obj)

        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except BaseQueue.Full:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    def put_nowait(self, obj):
        """Put `obj` immediately; raise Full when the size check fails."""
        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:
            pass
        elif self.full():
            raise BaseQueue.Full
        else:
            self.qsize_diff = 0
        # Count this put so the lazy limit eventually triggers a real size
        # check; without it the counter stays at 0 and full() is never asked.
        self.qsize_diff += 1
        return self.queue.put(obj)

    def get(self, block=True, timeout=None):
        """Return the next payload; raise Empty when nothing arrives."""
        try:
            ret = self.queue.get(block, timeout)
            return ret.payload
        except self.queue.Empty:
            raise BaseQueue.Empty

    def get_nowait(self):
        """Return the next payload without waiting; raise Empty if none."""
        try:
            ret = self.queue.get_nowait()
            return ret.payload
        except self.queue.Empty:
            raise BaseQueue.Empty

    def delete(self):
        self.queue.queue.delete()

    def __del__(self):
        # __init__ may have failed before self.queue was assigned.
        queue = getattr(self, 'queue', None)
        if queue is not None:
            queue.close()
class PikaQueue(object):
    """
    A Queue like rabbitmq connector
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
                 maxsize=0, lazy_limit=True):
        """
        Constructor for a PikaQueue.

        Not works with python 3. Default for python 2.

        amqp_url:   https://www.rabbitmq.com/uri-spec.html
        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        lazy_limit: as rabbitmq is shared between multiple instances, a strict
                    limit on the number of items in the queue requires
                    PikaQueue to fetch the current queue size before every
                    put operation. When `lazy_limit` is enabled, PikaQueue
                    checks the queue size only every maxsize / 10 put
                    operations for better performance.
        """
        self.name = name
        self.amqp_url = amqp_url
        self.maxsize = maxsize
        self.lock = threading.RLock()

        self.lazy_limit = lazy_limit
        if self.lazy_limit and self.maxsize:
            self.qsize_diff_limit = int(self.maxsize * 0.1)
        else:
            self.qsize_diff_limit = 0
        # number of puts since the queue size was last checked
        self.qsize_diff = 0

        self.reconnect()

    def reconnect(self):
        """Reconnect to rabbitmq server"""
        import pika
        import pika.exceptions

        self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))
        self.channel = self.connection.channel()
        try:
            self.channel.queue_declare(self.name)
        except pika.exceptions.ChannelClosed:
            # the declare closed the channel; open a fresh connection/channel
            self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))
            self.channel = self.connection.channel()

    @catch_error
    def qsize(self):
        """Return the message count reported by a passive queue_declare."""
        with self.lock:
            ret = self.channel.queue_declare(self.name, passive=True)
        return ret.method.message_count

    def empty(self):
        return self.qsize() == 0

    def full(self):
        return bool(self.maxsize and self.qsize() >= self.maxsize)

    @catch_error
    def put(self, obj, block=True, timeout=None):
        """
        Put `obj`, retrying while the queue is full.

        With block=False this is put_nowait(obj).  Otherwise it retries every
        `max_timeout` seconds; when `timeout` is set, Full is re-raised once
        that many seconds have passed.
        """
        if not block:
            # obj must be forwarded; calling put_nowait() bare raised TypeError
            return self.put_nowait(obj)

        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except BaseQueue.Full:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    @catch_error
    def put_nowait(self, obj):
        """Publish `obj` immediately; raise Full when the size check fails."""
        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:
            pass
        elif self.full():
            raise BaseQueue.Full
        else:
            self.qsize_diff = 0
        with self.lock:
            self.qsize_diff += 1
            return self.channel.basic_publish("", self.name, umsgpack.packb(obj))

    @catch_error
    def get(self, block=True, timeout=None, ack=False):
        """
        Get one message, retrying while the queue is empty.

        With block=False this is get_nowait(ack).  Otherwise it retries every
        `max_timeout` seconds; when `timeout` is set, Empty is re-raised once
        that many seconds have passed.
        """
        if not block:
            # keep the caller's ack choice on the non-blocking path too
            return self.get_nowait(ack)

        start_time = time.time()
        while True:
            try:
                return self.get_nowait(ack)
            except BaseQueue.Empty:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    @catch_error
    def get_nowait(self, ack=False):
        """Fetch one message with basic_get; raise Empty when there is none."""
        with self.lock:
            method_frame, header_frame, body = self.channel.basic_get(self.name, not ack)
            if method_frame is None:
                raise BaseQueue.Empty
            if ack:
                self.channel.basic_ack(method_frame.delivery_tag)
        return umsgpack.unpackb(body)

    @catch_error
    def delete(self):
        with self.lock:
            return self.channel.queue_delete(queue=self.name)
class RedisQueue(object):
    """
    Queue-like message channel stored in a redis list.
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, host='localhost', port=6379, db=0,
                 maxsize=0, lazy_limit=True, password=None, cluster_nodes=None):
        """
        Constructor for RedisQueue

        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        lazy_limit: redis queue is shared via instance, a lazy size limit is used
                    for better performance.
        """
        self.name = name
        if cluster_nodes is None:
            self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
        else:
            from rediscluster import StrictRedisCluster
            self.redis = StrictRedisCluster(startup_nodes=cluster_nodes)
        self.maxsize = maxsize
        self.lazy_limit = lazy_limit
        self.last_qsize = 0

    def qsize(self):
        """Ask redis for the list length and remember it in last_qsize."""
        current = self.redis.llen(self.name)
        self.last_qsize = current
        return self.last_qsize

    def empty(self):
        return self.qsize() == 0

    def full(self):
        return bool(self.maxsize and self.qsize() >= self.maxsize)

    def put_nowait(self, obj):
        """Push `obj` now; raise Full when the size check fails."""
        trust_cached_size = self.lazy_limit and self.last_qsize < self.maxsize
        if not trust_cached_size and self.full():
            raise self.Full
        self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj))
        return True

    def put(self, obj, block=True, timeout=None):
        """Put `obj`, retrying every max_timeout seconds while Full."""
        if not block:
            return self.put_nowait(obj)

        began = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except self.Full:
                if not timeout:
                    time.sleep(self.max_timeout)
                    continue
                elapsed = time.time() - began
                if timeout <= elapsed:
                    raise
                time.sleep(min(self.max_timeout, timeout - elapsed))

    def get_nowait(self):
        """Pop and unpack the head of the list; raise Empty if there is none."""
        packed = self.redis.lpop(self.name)
        if packed is None:
            raise self.Empty
        return umsgpack.unpackb(packed)

    def get(self, block=True, timeout=None):
        """Get one item, retrying every max_timeout seconds while Empty."""
        if not block:
            return self.get_nowait()

        began = time.time()
        while True:
            try:
                return self.get_nowait()
            except self.Empty:
                if not timeout:
                    time.sleep(self.max_timeout)
                    continue
                elapsed = time.time() - began
                if timeout <= elapsed:
                    raise
                time.sleep(min(self.max_timeout, timeout - elapsed))
class ProcessorResult(object):
    """Container for what one callback run produced: result, follows,
    messages, logs, exception, extinfo and save."""

    def __init__(self, result=None, follows=(), messages=(),
                 logs=(), exception=None, extinfo=None, save=None):
        self.result = result
        self.follows = follows
        self.messages = messages
        self.logs = logs
        self.exception = exception
        # a fresh dict per instance when none is supplied
        self.extinfo = {} if extinfo is None else extinfo
        self.save = save

    def rethrow(self):
        """Raise the stored exception, if there is one."""
        if self.exception:
            raise self.exception

    def logstr(self):
        """Render self.logs (strings or log records) as one unicode string."""
        formatter = LogFormatter(color=False)
        pieces = []
        for entry in self.logs:
            if isinstance(entry, six.string_types):
                pieces.append(pretty_unicode(entry))
                continue
            if entry.exc_info:
                # pass the traceback through hide_me before formatting
                exc_type, exc_value, trace = entry.exc_info
                entry.exc_info = exc_type, exc_value, hide_me(trace, globals())
            pieces.append(pretty_unicode(formatter.format(entry)))
            pieces.append(u'\n')
        return u''.join(pieces)
    def on_task(self, task, response):
        '''
        Process one fetched task and publish the outcome.

        Runs the project's handler for `task`, puts a status pack on
        `status_queue` (unless the result's extinfo sets 'not_send_status'),
        puts follow-up tasks on `newtask_queue` in batches of 1000, and
        delivers each message by calling on_task again with a synthetic
        '_on_message' task.

        task:     task dict; 'taskid' and 'project' are read unconditionally
        response: raw response, converted with rebuild_response() first
        returns:  True
        '''
        start_time = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('project_updatetime', None)
            md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(project, updatetime, md5sum)
            assert project_data, "no such project!"
            if project_data.get('exception'):
                # the project failed to load: report that error as the result
                ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                      exception=project_data['exception'])
            else:
                ret = project_data['instance'].run_task(
                    project_data['module'], task, response)
        except Exception as e:
            # any failure above becomes a result carrying the traceback text
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr, ), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                # on failure keep every response header in the track record
                track_headers = dict(response.headers)
            else:
                # on success keep only 'etag' and 'last-modified'
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'redirect_url': response.url if response.url != response.orig_url else None,
                        'time': response.time,
                        'error': response.error,
                        'status_code': response.status_code,
                        'encoding': getattr(response, '_encoding', None),
                        'headers': track_headers,
                        # first 500 characters of the body, only on failure
                        'content': response.text[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        # result text truncated to RESULT_RESULT_LIMIT characters
                        'result': (
                            None if ret.result is None
                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                        ),
                        # only the last RESULT_LOGS_LIMIT characters of the logs
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                    'save': ret.save,
                },
            }
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should be used in the scheduler before storing
            # to the database; it is used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should be used in the scheduler before storing
        # to the database; it is used here for performance.
        if ret.follows:
            # one queue item per slice of at most 1000 follow-up tasks
            for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
                self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])

        for project, msg, url in ret.messages:
            try:
                # deliver the message as a synthetic task handled by the
                # target project's '_on_message' callback
                self.on_task({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                    }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                })
            except Exception as e:
                # a failed delivery is logged and the next message is tried
                logger.exception('Sending message error.')
                continue

        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
= 60 * 60 @staticmethod def build_module(project, env=None): '''Build project script as module''' from pyspider.libs import base_handler assert 'name' in project, 'need name of project' assert 'script' in project, 'need script of project' if env is None: env = {} # fix for old non-package version scripts pyspider_path = os.path.join(os.path.dirname(__file__), "..") if pyspider_path not in sys.path: sys.path.insert(1, pyspider_path) env = dict(env) env.update({ 'debug': project.get('status', 'DEBUG') == 'DEBUG', }) loader = ProjectLoader(project) module = loader.load_module(project['name']) # logger inject module.log_buffer = [] module.logging = module.logger = logging.Logger(project['name']) if env.get('enable_stdout_capture', True): handler = SaveLogHandler(module.log_buffer) handler.setFormatter(LogFormatter(color=False)) else: handler = logging.StreamHandler() handler.setFormatter(LogFormatter(color=True)) module.logger.addHandler(handler) if '__handler_cls__' not in module.__dict__: BaseHandler = module.__dict__.get('BaseHandler', base_handler.BaseHandler) for each in list(six.itervalues(module.__dict__)): if inspect.isclass(each) and each is not BaseHandler \ and issubclass(each, BaseHandler): module.__dict__['__handler_cls__'] = each _class = module.__dict__.get('__handler_cls__') assert _class is not None, "need BaseHandler in project module" instance = _class() instance.__env__ = env instance.project_name = project['name'] instance.project = project return { 'loader': loader, 'module': module, 'class': _class, 'instance': instance, 'exception': None, 'exception_log': '', 'info': project, 'load_time': time.time(), } def __init__(self, projectdb, env): self.projectdb = projectdb self.env = env self.projects = {} self.last_check_projects = time.time() def _need_update(self, project_name, updatetime=None, md5sum=None): '''Check if project_name need update''' if project_name not in self.projects: return True elif md5sum and md5sum != 
self.projects[project_name]['info'].get('md5sum'): return True elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0): return True elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL: return True return False def _check_projects(self): '''Check projects by last update time''' for project in self.projectdb.check_update(self.last_check_projects, ['name', 'updatetime']): if project['name'] not in self.projects: continue if project['updatetime'] > self.projects[project['name']]['info'].get('updatetime', 0): self._update_project(project['name']) self.last_check_projects = time.time() def _update_project(self, project_name): '''Update one project from database''' project = self.projectdb.get(project_name) if not project: return None return self._load_project(project) def _load_project(self, project): '''Load project into self.projects from project info dict''' try: project['md5sum'] = utils.md5string(project['script']) ret = self.build_module(project, self.env) self.projects[project['name']] = ret except Exception as e: logger.exception("load project %s error", project.get('name', None)) ret = { 'loader': None, 'module': None, 'class': None, 'instance': None, 'exception': e, 'exception_log': traceback.format_exc(), 'info': project, 'load_time': time.time(), } self.projects[project['name']] = ret return False logger.debug('project: %s updated.', project.get('name', None)) return True def get(self, project_name, updatetime=None, md5sum=None): '''get project data object, return None if not exists''' if time.time() - self.last_check_projects > self.CHECK_PROJECTS_INTERVAL: self._check_projects() if self._need_update(project_name, updatetime, md5sum): self._update_project(project_name) return self.projects.get(project_name, None) class ProjectLoader(object): '''ProjectLoader class for sys.meta_path''' def __init__(self, project, mod=None): self.project = project self.name = project['name'] self.mod = 
if six.PY2:
    class ProjectFinder(object):
        '''ProjectFinder class for sys.meta_path (Python 2 finder/loader protocol)'''

        def __init__(self, projectdb):
            # weak reference: the finder does not keep projectdb alive
            self.get_projectdb = weakref.ref(projectdb)

        @property
        def projectdb(self):
            '''The projectdb, or None once it has been garbage collected.'''
            return self.get_projectdb()

        def find_module(self, fullname, path=None):
            '''Return a loader for `projects` or `projects.<name>`, else None.'''
            if fullname == 'projects':
                return self
            parts = fullname.split('.')
            if len(parts) == 2 and parts[0] == 'projects':
                name = parts[1]
                if not self.projectdb:
                    return
                info = self.projectdb.get(name)
                if info:
                    return ProjectLoader(info)

        def load_module(self, fullname):
            '''Create the empty `projects` package module.'''
            mod = imp.new_module(fullname)
            # '<...>' placeholders, in the style of ProjectLoader's '<%s>'
            # file name; an empty __path__ entry would make path-based
            # finders search the current directory for submodules.
            mod.__file__ = '<projects>'
            mod.__loader__ = self
            mod.__path__ = ['<projects>']
            mod.__package__ = 'projects'
            return mod

        def is_package(self, fullname):
            return True
else:
    import importlib.abc
    # find_spec uses importlib.util; importing importlib.abc alone does not
    # guarantee that submodule is loaded.
    import importlib.util

    class ProjectFinder(importlib.abc.MetaPathFinder):
        '''ProjectFinder class for sys.meta_path'''

        def __init__(self, projectdb):
            # weak reference: the finder does not keep projectdb alive
            self.get_projectdb = weakref.ref(projectdb)

        @property
        def projectdb(self):
            '''The projectdb, or None once it has been garbage collected.'''
            return self.get_projectdb()

        def find_spec(self, fullname, path, target=None):
            '''Return a module spec for names this finder serves, else None.'''
            loader = self.find_module(fullname, path)
            if loader:
                return importlib.util.spec_from_loader(fullname, loader)

        def find_module(self, fullname, path=None):
            '''Return a loader for `projects` or `projects.<name>`, else None.'''
            if fullname == 'projects':
                return ProjectsLoader()
            parts = fullname.split('.')
            if len(parts) == 2 and parts[0] == 'projects':
                name = parts[1]
                if not self.projectdb:
                    return
                info = self.projectdb.get(name)
                if info:
                    return ProjectLoader(info)

    class ProjectsLoader(importlib.abc.InspectLoader):
        '''Loader for the empty `projects` package itself.'''

        def load_module(self, fullname):
            mod = imp.new_module(fullname)
            # '<...>' placeholders, in the style of ProjectLoader's '<%s>'
            # file name; an empty __path__ entry would make path-based
            # finders search the current directory for submodules.
            mod.__file__ = '<projects>'
            mod.__loader__ = self
            mod.__path__ = ['<projects>']
            mod.__package__ = 'projects'
            if sys.version_info[:2] == (3, 3):
                sys.modules[fullname] = mod
            return mod

        def module_repr(self, module):
            return '<Module projects>'

        def is_package(self, fullname):
            return True

        def get_source(self, path):
            return ''

        def get_code(self, fullname):
            return compile(self.get_source(fullname), '<projects>', 'exec')

    class ProjectLoader(ProjectLoader, importlib.abc.Loader):
        '''ProjectLoader adapted to the spec-based loader protocol.'''

        def create_module(self, spec):
            # load_module already executes the script, so exec_module below
            # has nothing left to do
            return self.load_module(spec.name)

        def exec_module(self, module):
            return module

        def module_repr(self, module):
            # the format string needs a %s; an empty one raises TypeError
            return '<Module projects.%s>' % self.name
class OneResultWorker(ResultWorker):
    '''Result worker that prints every result to stdout as a JSON object'''

    def on_result(self, task, result):
        '''Log one result and print it as JSON; falsy results are ignored'''
        if not result:
            return

        required = ('taskid', 'project', 'url')
        if not all(field in task for field in required):
            # The task cannot be identified: warn and drop the result.
            logger.warning('result UNKNOW -> %.30r' % result)
            return

        project, taskid, url = task['project'], task['taskid'], task['url']
        logger.info('result %s:%s %s -> %.30r' % (project, taskid, url, result))
        record = {
            'taskid': taskid,
            'project': project,
            'url': url,
            'result': result,
            'updatetime': time.time()
        }
        print(json.dumps(record))
# Root command group. It resolves the databases, message queues and renderer
# proxies once, stores them on ctx.obj for the sub-commands, and starts the
# `all` command when no sub-command was given.
@click.group(invoke_without_command=True)
@click.option('-c', '--config', callback=read_config, type=click.File('r'),
              help='a json file with default values for subcommands. {"webui": {"port":5001}}')
@click.option('--logging-config', default=os.path.join(os.path.dirname(__file__), "logging.conf"),
              help="logging config file for built-in python logging module", show_default=True)
@click.option('--debug', envvar='DEBUG', default=False, is_flag=True, help='debug mode')
@click.option('--queue-maxsize', envvar='QUEUE_MAXSIZE', default=100,
              help='maxsize of queue')
@click.option('--taskdb', envvar='TASKDB', callback=connect_db,
              help='database url for taskdb, default: sqlite')
@click.option('--projectdb', envvar='PROJECTDB', callback=connect_db,
              help='database url for projectdb, default: sqlite')
@click.option('--resultdb', envvar='RESULTDB', callback=connect_db,
              help='database url for resultdb, default: sqlite')
@click.option('--message-queue', envvar='AMQP_URL',
              help='connection url to message queue, '
              'default: builtin multiprocessing.Queue')
@click.option('--amqp-url', help='[deprecated] amqp url for rabbitmq. '
              'please use --message-queue instead.')
@click.option('--beanstalk', envvar='BEANSTALK_HOST',
              help='[deprecated] beanstalk config for beanstalk queue. '
              'please use --message-queue instead.')
@click.option('--phantomjs-proxy', envvar='PHANTOMJS_PROXY', help="phantomjs proxy ip:port")
@click.option('--puppeteer-proxy', envvar='PUPPETEER_PROXY', help="puppeteer proxy ip:port")
@click.option('--data-path', default='./data', help='data dir path')
@click.option('--add-sys-path/--not-add-sys-path', default=True, is_flag=True,
              help='add current working directory to python lib search path')
@click.version_option(version=pyspider.__version__, prog_name=pyspider.__name__)
@click.pass_context
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    # Databases not given explicitly are chosen from the environment
    # (MYSQL_NAME / MONGODB_NAME / COUCHDB_NAME), else sqlite. Each connection
    # is wrapped in utils.Get; `db=db` binds the loop variable per lambda.
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif os.environ.get('COUCHDB_NAME'):
            # Use .get() so the 'couchdb' / '5984' fallbacks also apply when
            # the variable is missing; os.environ[...] raised KeyError before
            # the `or` could take effect.
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'couchdb+%s://%s:%s/%s' % (
                    db,
                    os.environ.get('COUCHDB_PORT_5984_TCP_ADDR') or 'couchdb',
                    os.environ.get('COUCHDB_PORT_5984_TCP_PORT') or '5984',
                    db)))
        elif ctx.invoked_subcommand == 'bench':
            # The default data dir is redirected to ./data/bench and recreated;
            # the guard is false on later iterations because the path changed.
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            elif db in ('projectdb', ):
                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            # Marks this database as the built-in sqlite default.
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)

    # With a queue URL the connection is deferred through utils.Get;
    # without one the queue object is created immediately.
    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # Strip the leading 'tcp://' to keep host:port.
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    # puppeteer-proxy
    if kwargs.get('puppeteer_proxy'):
        pass
    elif os.environ.get('PUPPETEER_NAME'):
        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]

    # Everything resolved above becomes shared state for the sub-commands.
    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
@click.option('--inqueue-limit', default=0, help='size limit of task queue for each project, ' 'tasks will been ignored when overflow') @click.option('--delete-time', default=24 * 60 * 60, help='delete time before marked as delete') @click.option('--active-tasks', default=100, help='active log size') @click.option('--loop-limit', default=1000, help='maximum number of tasks due with in a loop') @click.option('--fail-pause-num', default=10, help='auto pause the project when last FAIL_PAUSE_NUM task failed, set 0 to disable') @click.option('--scheduler-cls', default='pyspider.scheduler.ThreadBaseScheduler', callback=load_cls, help='scheduler class to be used.') @click.option('--threads', default=None, help='thread number for ThreadBaseScheduler, default: 4') @click.pass_context def scheduler(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port, inqueue_limit, delete_time, active_tasks, loop_limit, fail_pause_num, scheduler_cls, threads, get_object=False): """ Run Scheduler, only one scheduler is allowed. 
""" g = ctx.obj Scheduler = load_cls(None, None, scheduler_cls) kwargs = dict(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb, newtask_queue=g.newtask_queue, status_queue=g.status_queue, out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data')) if threads: kwargs['threads'] = int(threads) scheduler = Scheduler(**kwargs) scheduler.INQUEUE_LIMIT = inqueue_limit scheduler.DELETE_TIME = delete_time scheduler.ACTIVE_TASKS = active_tasks scheduler.LOOP_LIMIT = loop_limit scheduler.FAIL_PAUSE_NUM = fail_pause_num g.instances.append(scheduler) if g.get('testing_mode') or get_object: return scheduler if not no_xmlrpc: utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host) scheduler.run() @cli.command() @click.option('--xmlrpc', is_flag=True, help="Enable xmlrpc (Default=True)") @click.option('--no-xmlrpc', is_flag=True, help="Disable xmlrpc") @click.option('--xmlrpc-host', default='0.0.0.0') @click.option('--xmlrpc-port', envvar='FETCHER_XMLRPC_PORT', default=24444) @click.option('--poolsize', default=100, help="max simultaneous fetches") @click.option('--proxy', help="proxy host:port") @click.option('--user-agent', help='user agent') @click.option('--timeout', help='default fetch timeout') @click.option('--phantomjs-endpoint', help="endpoint of phantomjs, start via pyspider phantomjs") @click.option('--puppeteer-endpoint', help="endpoint of puppeteer, start via pyspider puppeteer") @click.option('--splash-endpoint', help="execute endpoint of splash: http://splash.readthedocs.io/en/stable/api.html#execute") @click.option('--fetcher-cls', default='pyspider.fetcher.Fetcher', callback=load_cls, help='Fetcher class to be used.') @click.pass_context def fetcher(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, phantomjs_endpoint, puppeteer_endpoint, splash_endpoint, fetcher_cls, async_mode=True, get_object=False, no_input=False): """ Run Fetcher. 
@cli.command()
@click.option('--processor-cls', default='pyspider.processor.Processor',
              callback=load_cls, help='Processor class to be used.')
@click.option('--process-time-limit', default=30, help='script process time limit')
@click.pass_context
def processor(ctx, processor_cls, process_time_limit, enable_stdout_capture=True, get_object=False):
    """
    Run Processor.
    """
    g = ctx.obj
    # processor_cls may be a dotted path or an already loaded class;
    # load_cls accepts both.
    processor_class = load_cls(None, None, processor_cls)

    settings = dict(
        projectdb=g.projectdb,
        inqueue=g.fetcher2processor,
        status_queue=g.status_queue,
        newtask_queue=g.newtask_queue,
        result_queue=g.processor2result,
        enable_stdout_capture=enable_stdout_capture,
        process_time_limit=process_time_limit,
    )
    instance = processor_class(**settings)
    g.instances.append(instance)

    # In testing mode, or when the caller asked for the object, return the
    # instance without running it; otherwise run it.
    if not (g.get('testing_mode') or get_object):
        instance.run()
        return None
    return instance
""" g = ctx.obj ResultWorker = load_cls(None, None, result_cls) result_worker = ResultWorker(resultdb=g.resultdb, inqueue=g.processor2result) g.instances.append(result_worker) if g.get('testing_mode') or get_object: return result_worker result_worker.run() @cli.command() @click.option('--host', default='0.0.0.0', envvar='WEBUI_HOST', help='webui bind to host') @click.option('--port', default=5000, envvar='WEBUI_PORT', help='webui bind to host') @click.option('--cdn', default='//cdnjs.cloudflare.com/ajax/libs/', help='js/css cdn server') @click.option('--scheduler-rpc', help='xmlrpc path of scheduler') @click.option('--fetcher-rpc', help='xmlrpc path of fetcher') @click.option('--max-rate', type=float, help='max rate for each project') @click.option('--max-burst', type=float, help='max burst for each project') @click.option('--username', envvar='WEBUI_USERNAME', help='username of lock -ed projects') @click.option('--password', envvar='WEBUI_PASSWORD', help='password of lock -ed projects') @click.option('--need-auth', is_flag=True, default=False, help='need username and password') @click.option('--webui-instance', default='pyspider.webui.app.app', callback=load_cls, help='webui Flask Application instance to be used.') @click.option('--process-time-limit', default=30, help='script process time limit in debug') @click.pass_context def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst, username, password, need_auth, webui_instance, process_time_limit, get_object=False): """ Run WebUI """ app = load_cls(None, None, webui_instance) g = ctx.obj app.config['taskdb'] = g.taskdb app.config['projectdb'] = g.projectdb app.config['resultdb'] = g.resultdb app.config['cdn'] = cdn if max_rate: app.config['max_rate'] = max_rate if max_burst: app.config['max_burst'] = max_burst if username: app.config['webui_username'] = username if password: app.config['webui_password'] = password app.config['need_auth'] = need_auth app.config['process_time_limit'] = 
# Start pyspider's phantomjs_fetcher.js under the phantomjs executable and
# register a small handle (port + quit callable) in ctx.obj.instances.
@cli.command()
@click.option('--phantomjs-path', default='phantomjs', help='phantomjs path')
@click.option('--port', default=25555, help='phantomjs port')
@click.option('--auto-restart', default=False, help='auto restart phantomjs if crashed')
@click.argument('args', nargs=-1)
@click.pass_context
def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
    """
    Run phantomjs fetcher if phantomjs is installed.
    """
    # Extra phantomjs arguments: the command line wins, otherwise the 'args'
    # entry of the context's default_map (if there is a default_map).
    args = args or ctx.default_map and ctx.default_map.get('args', [])

    import subprocess
    g = ctx.obj
    # Becomes non-empty once quit() has been called; checked by the restart loop.
    _quit = []
    phantomjs_fetcher = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
    cmd = [phantomjs_path,
           # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
           #'--load-images=false',
           '--ssl-protocol=any',
           '--disk-cache=true'] + list(args or []) + [phantomjs_fetcher, str(port)]

    try:
        _phantomjs = subprocess.Popen(cmd)
    except OSError:
        # The executable could not be started: carry on without phantomjs.
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        # Reads _phantomjs from the enclosing scope at call time, so it acts on
        # whichever process the restart loop below started last.
        _quit.append(1)
        _phantomjs.kill()
        _phantomjs.wait()
        logging.info('phantomjs exited.')

    # Record the local endpoint unless a proxy address is already set.
    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = '127.0.0.1:%s' % port

    phantomjs = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        # Testing mode: hand the handle back instead of blocking on the process.
        return phantomjs

    # Block until the process exits; start it again only when auto_restart is
    # set and quit() has not been requested.
    while True:
        _phantomjs.wait()
        if _quit or not auto_restart:
            break
        _phantomjs = subprocess.Popen(cmd)
""" import subprocess g = ctx.obj _quit = [] puppeteer_fetcher = os.path.join( os.path.dirname(pyspider.__file__), 'fetcher/puppeteer_fetcher.js') cmd = ['node', puppeteer_fetcher, str(port)] try: _puppeteer = subprocess.Popen(cmd) except OSError: logging.warning('puppeteer not found, continue running without it.') return None def quit(*args, **kwargs): _quit.append(1) _puppeteer.kill() _puppeteer.wait() logging.info('puppeteer exited.') if not g.get('puppeteer_proxy'): g['puppeteer_proxy'] = '127.0.0.1:%s' % port puppeteer = utils.ObjectDict(port=port, quit=quit) g.instances.append(puppeteer) if g.get('testing_mode'): return puppeteer while True: _puppeteer.wait() if _quit or not auto_restart: break _puppeteer = subprocess.Popen(cmd) @cli.command() @click.option('--fetcher-num', default=1, help='instance num of fetcher') @click.option('--processor-num', default=1, help='instance num of processor') @click.option('--result-worker-num', default=1, help='instance num of result worker') @click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']), help='run each components in thread or subprocess. 
' 'always using thread for windows.') @click.pass_context def all(ctx, fetcher_num, processor_num, result_worker_num, run_in): """ Run all the components in subprocess or thread """ ctx.obj['debug'] = False g = ctx.obj # FIXME: py34 cannot run components with threads if run_in == 'subprocess' and os.name != 'nt': run_in = utils.run_in_subprocess else: run_in = utils.run_in_thread threads = [] try: # phantomjs if not g.get('phantomjs_proxy'): phantomjs_config = g.config.get('phantomjs', {}) phantomjs_config.setdefault('auto_restart', True) threads.append(run_in(ctx.invoke, phantomjs, **phantomjs_config)) time.sleep(2) if threads[-1].is_alive() and not g.get('phantomjs_proxy'): g['phantomjs_proxy'] = '127.0.0.1:%s' % phantomjs_config.get('port', 25555) # puppeteer if not g.get('puppeteer_proxy'): puppeteer_config = g.config.get('puppeteer', {}) puppeteer_config.setdefault('auto_restart', True) threads.append(run_in(ctx.invoke, puppeteer, **puppeteer_config)) time.sleep(2) if threads[-1].is_alive() and not g.get('puppeteer_proxy'): g['puppeteer_proxy'] = '127.0.0.1:%s' % puppeteer_config.get('port', 22222) # result worker result_worker_config = g.config.get('result_worker', {}) for i in range(result_worker_num): threads.append(run_in(ctx.invoke, result_worker, **result_worker_config)) # processor processor_config = g.config.get('processor', {}) for i in range(processor_num): threads.append(run_in(ctx.invoke, processor, **processor_config)) # fetcher fetcher_config = g.config.get('fetcher', {}) fetcher_config.setdefault('xmlrpc_host', '127.0.0.1') for i in range(fetcher_num): threads.append(run_in(ctx.invoke, fetcher, **fetcher_config)) # scheduler scheduler_config = g.config.get('scheduler', {}) scheduler_config.setdefault('xmlrpc_host', '127.0.0.1') threads.append(run_in(ctx.invoke, scheduler, **scheduler_config)) # running webui in main thread to make it exitable webui_config = g.config.get('webui', {}) webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/' 
% g.config.get('scheduler', {}).get('xmlrpc_port', 23333)) ctx.invoke(webui, **webui_config) finally: # exit components run in threading for each in g.instances: each.quit() # exit components run in subprocess for each in threads: if not each.is_alive(): continue if hasattr(each, 'terminate'): each.terminate() each.join() @cli.command() @click.option('--fetcher-num', default=1, help='instance num of fetcher') @click.option('--processor-num', default=2, help='instance num of processor') @click.option('--result-worker-num', default=1, help='instance num of result worker') @click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']), help='run each components in thread or subprocess. ' 'always using thread for windows.') @click.option('--total', default=10000, help="total url in test page") @click.option('--show', default=20, help="show how many urls in a page") @click.option('--taskdb-bench', default=False, is_flag=True, help="only run taskdb bench test") @click.option('--message-queue-bench', default=False, is_flag=True, help="only run message queue bench test") @click.option('--all-bench', default=False, is_flag=True, help="only run all bench test") @click.pass_context def bench(ctx, fetcher_num, processor_num, result_worker_num, run_in, total, show, taskdb_bench, message_queue_bench, all_bench): """ Run Benchmark test. In bench mode, in-memory sqlite database is used instead of on-disk sqlite database. 
""" from pyspider.libs import bench from pyspider.webui import bench_test # flake8: noqa ctx.obj['debug'] = False g = ctx.obj if result_worker_num == 0: g['processor2result'] = None if run_in == 'subprocess' and os.name != 'nt': run_in = utils.run_in_subprocess else: run_in = utils.run_in_thread all_test = not taskdb_bench and not message_queue_bench and not all_bench # test taskdb if all_test or taskdb_bench: bench.bench_test_taskdb(g.taskdb) # test message queue if all_test or message_queue_bench: bench.bench_test_message_queue(g.scheduler2fetcher) # test all if not all_test and not all_bench: return project_name = 'bench' def clear_project(): g.taskdb.drop(project_name) g.resultdb.drop(project_name) clear_project() # disable log logging.getLogger().setLevel(logging.ERROR) logging.getLogger('scheduler').setLevel(logging.ERROR) logging.getLogger('fetcher').setLevel(logging.ERROR) logging.getLogger('processor').setLevel(logging.ERROR) logging.getLogger('result').setLevel(logging.ERROR) logging.getLogger('webui').setLevel(logging.ERROR) logging.getLogger('werkzeug').setLevel(logging.ERROR) try: threads = [] # result worker result_worker_config = g.config.get('result_worker', {}) for i in range(result_worker_num): threads.append(run_in(ctx.invoke, result_worker, result_cls='pyspider.libs.bench.BenchResultWorker', **result_worker_config)) # processor processor_config = g.config.get('processor', {}) for i in range(processor_num): threads.append(run_in(ctx.invoke, processor, processor_cls='pyspider.libs.bench.BenchProcessor', **processor_config)) # fetcher fetcher_config = g.config.get('fetcher', {}) fetcher_config.setdefault('xmlrpc_host', '127.0.0.1') for i in range(fetcher_num): threads.append(run_in(ctx.invoke, fetcher, fetcher_cls='pyspider.libs.bench.BenchFetcher', **fetcher_config)) # webui webui_config = g.config.get('webui', {}) webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/' % g.config.get('scheduler', {}).get('xmlrpc_port', 23333)) 
# Build every component in this process (testing_mode makes each sub-command
# return its object instead of running), then let the scheduler drive them.
@cli.command()
@click.option('-i', '--interactive', default=False, is_flag=True,
              help='enable interactive mode, you can choose crawl url.')
@click.option('--phantomjs', 'enable_phantomjs', default=False, is_flag=True,
              help='enable phantomjs, will spawn a subprocess for phantomjs')
@click.option('--puppeteer', 'enable_puppeteer', default=False, is_flag=True,
              help='enable puppeteer, will spawn a subprocess for puppeteer')
@click.argument('scripts', nargs=-1)
@click.pass_context
def one(ctx, interactive, enable_phantomjs, enable_puppeteer, scripts):
    """
    One mode not only means all-in-one, it runs every thing in one process over
    tornado.ioloop, for debug purpose
    """

    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        # Script files given on the command line replace the project database.
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            # No result database: OneResultWorker is selected below.
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    if enable_puppeteer:
        puppeteer_config = g.config.get('puppeteer', {})
        puppeteer_obj = ctx.invoke(puppeteer, **puppeteer_config)
        if puppeteer_obj:
            # Read the port from the returned handle. `puppeteer` is the click
            # command and has no `port` attribute, so the old expression
            # raised AttributeError whenever puppeteer started successfully.
            g.setdefault('puppeteer_proxy', '127.0.0.1:%s' % puppeteer_obj.port)
    else:
        puppeteer_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls',
                                        'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    # The scheduler shares the fetcher's ioloop and calls the other
    # components directly.
    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        # Always stop the scheduler and any renderer subprocess we started.
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        if puppeteer_obj:
            puppeteer_obj.quit()
class Project(object):
    '''
    project for scheduler

    Holds the per-project state the scheduler works with: the task queue, a
    bounded history of recent tasks (``active_tasks``), and the automatic
    pause state derived from that history.
    '''

    def __init__(self, scheduler, project_info):
        '''
        Create the runtime state for one project.

        scheduler: the owning scheduler; its ACTIVE_TASKS, FAIL_PAUSE_NUM,
            PAUSE_TIME, UNPAUSE_CHECK_NUM and TASK_PACK attributes are read.
        project_info: mapping with the keys consumed by ``update``.
        '''
        self.scheduler = scheduler

        # Entries are iterated as (_, task) pairs by ``paused``.
        self.active_tasks = deque(maxlen=scheduler.ACTIVE_TASKS)
        self.task_queue = TaskQueue()
        self.task_loaded = False
        self._selected_tasks = False  # selected tasks after recent pause
        # wait for scheduler.FAIL_PAUSE_NUM loop steps before sending the event
        self._send_finished_event_wait = 0

        self.md5sum = None
        self._send_on_get_info = False
        self.waiting_get_info = True

        # False, True, or the string 'checking' (see ``paused``).
        self._paused = False
        self._paused_time = 0
        self._unpause_last_seen = None

        self.update(project_info)

    @property
    def paused(self):
        '''
        Advance the pause state machine and return True while paused.

        Returns False when pausing is disabled (FAIL_PAUSE_NUM <= 0) and also
        while the project is in the intermediate 'checking' state.
        '''
        if self.scheduler.FAIL_PAUSE_NUM <= 0:
            return False

        # unpaused --(last FAIL_PAUSE_NUM task failed)--> paused --(PAUSE_TIME)--> unpause_checking
        #                         unpaused <--(last UNPAUSE_CHECK_NUM task have success)--|
        #                             paused <--(last UNPAUSE_CHECK_NUM task no success)--|
        if not self._paused:
            fail_cnt = 0
            for _, task in self.active_tasks:
                # ignore select task
                if task.get('type') == self.scheduler.TASK_PACK:
                    continue
                if 'process' not in task['track']:
                    # No processing outcome recorded. Skip the task rather
                    # than fall through to the lookup below, which used to
                    # raise KeyError right after this log line.
                    logger.error('process not in task, %r', task)
                    continue
                if task['track']['process']['ok']:
                    break
                fail_cnt += 1
                if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM:
                    break
            if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM:
                self._paused = True
                self._paused_time = time.time()
        elif self._paused is True and (self._paused_time + self.scheduler.PAUSE_TIME < time.time()):
            self._paused = 'checking'
            # Remember the task at the head of the history; the 'checking'
            # pass only looks at tasks placed in front of it afterwards.
            self._unpause_last_seen = self.active_tasks[0][1] if len(self.active_tasks) else None
        elif self._paused == 'checking':
            cnt = 0
            fail_cnt = 0
            for _, task in self.active_tasks:
                if task is self._unpause_last_seen:
                    break
                # ignore select task
                if task.get('type') == self.scheduler.TASK_PACK:
                    continue
                if 'process' not in task['track']:
                    # Same guard as in the unpaused branch.
                    logger.error('process not in task, %r', task)
                    continue
                cnt += 1
                if task['track']['process']['ok']:
                    # break with enough check cnt
                    cnt = max(cnt, self.scheduler.UNPAUSE_CHECK_NUM)
                    break
                fail_cnt += 1
            if cnt >= self.scheduler.UNPAUSE_CHECK_NUM:
                if fail_cnt == cnt:
                    # Every checked task failed: pause again and restart the timer.
                    self._paused = True
                    self._paused_time = time.time()
                else:
                    self._paused = False

        return self._paused is True

    def update(self, project_info):
        '''
        Refresh this project from a project record.

        Reads 'name', 'group', 'status', 'updatetime' and 'script', plus
        'rate' and 'burst' when the project is active.
        '''
        self.project_info = project_info

        self.name = project_info['name']
        self.group = project_info['group']
        self.db_status = project_info['status']
        self.updatetime = project_info['updatetime']

        # A changed script means the cached runtime info must be fetched again.
        md5sum = utils.md5string(project_info['script'])
        if self.md5sum != md5sum:
            self.waiting_get_info = True
            self.md5sum = md5sum
        if self.waiting_get_info and self.active:
            self._send_on_get_info = True

        if self.active:
            self.task_queue.rate = project_info['rate']
            self.task_queue.burst = project_info['burst']
        else:
            # Inactive projects get a zero rate and burst.
            self.task_queue.rate = 0
            self.task_queue.burst = 0

        logger.info('project %s updated, status:%s, paused:%s, %d tasks',
                    self.name, self.db_status, self.paused, len(self.task_queue))

    def on_get_info(self, info):
        '''Store the runtime info reported for the project script.'''
        self.waiting_get_info = False
        self.min_tick = info.get('min_tick', 0)
        self.retry_delay = info.get('retry_delay', {})
        self.crawl_config = info.get('crawl_config', {})

    @property
    def active(self):
        '''True when the stored status is RUNNING or DEBUG.'''
        return self.db_status in ('RUNNING', 'DEBUG')
self._cnt['1h'].load(os.path.join(self.data_path, 'scheduler.1h')) self._cnt['1d'].load(os.path.join(self.data_path, 'scheduler.1d')) self._cnt['all'].load(os.path.join(self.data_path, 'scheduler.all')) self._last_dump_cnt = 0 def _update_projects(self): '''Check project update''' now = time.time() if ( not self._force_update_project and self._last_update_project + self.UPDATE_PROJECT_INTERVAL > now ): return for project in self.projectdb.check_update(self._last_update_project): self._update_project(project) logger.debug("project: %s updated.", project['name']) self._force_update_project = False self._last_update_project = now get_info_attributes = ['min_tick', 'retry_delay', 'crawl_config'] def _update_project(self, project): '''update one project''' if project['name'] not in self.projects: self.projects[project['name']] = Project(self, project) else: self.projects[project['name']].update(project) project = self.projects[project['name']] if project._send_on_get_info: # update project runtime info from processor by sending a _on_get_info # request, result is in status_page.track.save project._send_on_get_info = False self.on_select_task({ 'taskid': '_on_get_info', 'project': project.name, 'url': 'data:,_on_get_info', 'status': self.taskdb.SUCCESS, 'fetch': { 'save': self.get_info_attributes, }, 'process': { 'callback': '_on_get_info', }, }) # load task queue when project is running and delete task_queue when project is stoped if project.active: if not project.task_loaded: self._load_tasks(project) project.task_loaded = True else: if project.task_loaded: project.task_queue = TaskQueue() project.task_loaded = False if project not in self._cnt['all']: self._update_project_cnt(project.name) scheduler_task_fields = ['taskid', 'project', 'schedule', ] def _load_tasks(self, project): '''load tasks from database''' task_queue = project.task_queue for task in self.taskdb.load_tasks( self.taskdb.ACTIVE, project.name, self.scheduler_task_fields ): taskid = task['taskid'] 
_schedule = task.get('schedule', self.default_schedule) priority = _schedule.get('priority', self.default_schedule['priority']) exetime = _schedule.get('exetime', self.default_schedule['exetime']) task_queue.put(taskid, priority, exetime) project.task_loaded = True logger.debug('project: %s loaded %d tasks.', project.name, len(task_queue)) if project not in self._cnt['all']: self._update_project_cnt(project.name) self._cnt['all'].value((project.name, 'pending'), len(project.task_queue)) def _update_project_cnt(self, project_name): status_count = self.taskdb.status_count(project_name) self._cnt['all'].value( (project_name, 'success'), status_count.get(self.taskdb.SUCCESS, 0) ) self._cnt['all'].value( (project_name, 'failed'), status_count.get(self.taskdb.FAILED, 0) + status_count.get(self.taskdb.BAD, 0) ) self._cnt['all'].value( (project_name, 'pending'), status_count.get(self.taskdb.ACTIVE, 0) ) def task_verify(self, task): ''' return False if any of 'taskid', 'project', 'url' is not in task dict or project in not in task_queue ''' for each in ('taskid', 'project', 'url', ): if each not in task or not task[each]: logger.error('%s not in task: %.200r', each, task) return False if task['project'] not in self.projects: logger.error('unknown project: %s', task['project']) return False project = self.projects[task['project']] if not project.active: logger.error('project %s not started, please set status to RUNNING or DEBUG', task['project']) return False return True def insert_task(self, task): '''insert task into database''' return self.taskdb.insert(task['project'], task['taskid'], task) def update_task(self, task): '''update task in database''' return self.taskdb.update(task['project'], task['taskid'], task) def put_task(self, task): '''put task to task queue''' _schedule = task.get('schedule', self.default_schedule) self.projects[task['project']].task_queue.put( task['taskid'], priority=_schedule.get('priority', self.default_schedule['priority']), 
exetime=_schedule.get('exetime', self.default_schedule['exetime']) ) def send_task(self, task, force=True): ''' dispatch task to fetcher out queue may have size limit to prevent block, a send_buffer is used ''' try: self.out_queue.put_nowait(task) except Queue.Full: if force: self._send_buffer.appendleft(task) else: raise def _check_task_done(self): '''Check status queue''' cnt = 0 try: while True: task = self.status_queue.get_nowait() # check _on_get_info result here if task.get('taskid') == '_on_get_info' and 'project' in task and 'track' in task: if task['project'] not in self.projects: continue project = self.projects[task['project']] project.on_get_info(task['track'].get('save') or {}) logger.info( '%s on_get_info %r', task['project'], task['track'].get('save', {}) ) continue elif not self.task_verify(task): continue self.on_task_status(task) cnt += 1 except Queue.Empty: pass return cnt merge_task_fields = ['taskid', 'project', 'url', 'status', 'schedule', 'lastcrawltime'] def _check_request(self): '''Check new task queue''' # check _postpone_request first todo = [] for task in self._postpone_request: if task['project'] not in self.projects: continue if self.projects[task['project']].task_queue.is_processing(task['taskid']): todo.append(task) else: self.on_request(task) self._postpone_request = todo tasks = {} while len(tasks) < self.LOOP_LIMIT: try: task = self.newtask_queue.get_nowait() except Queue.Empty: break if isinstance(task, list): _tasks = task else: _tasks = (task, ) for task in _tasks: if not self.task_verify(task): continue if task['taskid'] in self.projects[task['project']].task_queue: if not task.get('schedule', {}).get('force_update', False): logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task) continue if task['taskid'] in tasks: if not task.get('schedule', {}).get('force_update', False): continue tasks[task['taskid']] = task for task in itervalues(tasks): self.on_request(task) return len(tasks) def _check_cronjob(self): 
"""Check projects cronjob tick, return True when a new tick is sended""" now = time.time() self._last_tick = int(self._last_tick) if now - self._last_tick < 1: return False self._last_tick += 1 for project in itervalues(self.projects): if not project.active: continue if project.waiting_get_info: continue if int(project.min_tick) == 0: continue if self._last_tick % int(project.min_tick) != 0: continue self.on_select_task({ 'taskid': '_on_cronjob', 'project': project.name, 'url': 'data:,_on_cronjob', 'status': self.taskdb.SUCCESS, 'fetch': { 'save': { 'tick': self._last_tick, }, }, 'process': { 'callback': '_on_cronjob', }, }) return True request_task_fields = [ 'taskid', 'project', 'url', 'status', 'schedule', 'fetch', 'process', 'track', 'lastcrawltime' ] def _check_select(self): '''Select task to fetch & process''' while self._send_buffer: _task = self._send_buffer.pop() try: # use force=False here to prevent automatic send_buffer append and get exception self.send_task(_task, False) except Queue.Full: self._send_buffer.append(_task) break if self.out_queue.full(): return {} taskids = [] cnt = 0 cnt_dict = dict() limit = self.LOOP_LIMIT # dynamic assign select limit for each project, use qsize as weight project_weights, total_weight = dict(), 0 for project in itervalues(self.projects): # type:Project if not project.active: continue # only check project pause when select new tasks, cronjob and new request still working if project.paused: continue if project.waiting_get_info: continue # task queue task_queue = project.task_queue # type:TaskQueue pro_weight = task_queue.size() total_weight += pro_weight project_weights[project.name] = pro_weight pass min_project_limit = int(limit / 10.) 
# ensure minimum select limit for each project max_project_limit = int(limit / 3.0) # ensure maximum select limit for each project for pro_name, pro_weight in iteritems(project_weights): if cnt >= limit: break project = self.projects[pro_name] # type:Project # task queue task_queue = project.task_queue task_queue.check_update() project_cnt = 0 # calculate select limit for project if total_weight < 1 or pro_weight < 1: project_limit = min_project_limit else: project_limit = int((1.0 * pro_weight / total_weight) * limit) if project_limit < min_project_limit: project_limit = min_project_limit elif project_limit > max_project_limit: project_limit = max_project_limit # check send_buffer here. when not empty, out_queue may blocked. Not sending tasks while cnt < limit and project_cnt < project_limit: taskid = task_queue.get() if not taskid: break taskids.append((project.name, taskid)) if taskid != 'on_finished': project_cnt += 1 cnt += 1 cnt_dict[project.name] = project_cnt if project_cnt: project._selected_tasks = True project._send_finished_event_wait = 0 # check and send finished event to project if not project_cnt and len(task_queue) == 0 and project._selected_tasks: # wait for self.FAIL_PAUSE_NUM steps to make sure all tasks in queue have been processed if project._send_finished_event_wait < self.FAIL_PAUSE_NUM: project._send_finished_event_wait += 1 else: project._selected_tasks = False project._send_finished_event_wait = 0 self._postpone_request.append({ 'project': project.name, 'taskid': 'on_finished', 'url': 'data:,on_finished', 'process': { 'callback': 'on_finished', }, "schedule": { "age": 0, "priority": 9, "force_update": True, }, }) for project, taskid in taskids: self._load_put_task(project, taskid) return cnt_dict def _load_put_task(self, project, taskid): try: task = self.taskdb.get_task(project, taskid, fields=self.request_task_fields) except ValueError: logger.error('bad task pack %s:%s', project, taskid) return if not task: return task = 
self.on_select_task(task) def _print_counter_log(self): # print top 5 active counters keywords = ('pending', 'success', 'retry', 'failed') total_cnt = {} project_actives = [] project_fails = [] for key in keywords: total_cnt[key] = 0 for project, subcounter in iteritems(self._cnt['5m']): actives = 0 for key in keywords: cnt = subcounter.get(key, None) if cnt: cnt = cnt.sum total_cnt[key] += cnt actives += cnt project_actives.append((actives, project)) fails = subcounter.get('failed', None) if fails: project_fails.append((fails.sum, project)) top_2_fails = sorted(project_fails, reverse=True)[:2] top_3_actives = sorted([x for x in project_actives if x[1] not in top_2_fails], reverse=True)[:5 - len(top_2_fails)] log_str = ("in 5m: new:%(pending)d,success:%(success)d," "retry:%(retry)d,failed:%(failed)d" % total_cnt) for _, project in itertools.chain(top_3_actives, top_2_fails): subcounter = self._cnt['5m'][project].to_dict(get_value='sum') log_str += " %s:%d,%d,%d,%d" % (project, subcounter.get('pending', 0), subcounter.get('success', 0), subcounter.get('retry', 0), subcounter.get('failed', 0)) logger.info(log_str) def _dump_cnt(self): '''Dump counters to file''' self._cnt['1h'].dump(os.path.join(self.data_path, 'scheduler.1h')) self._cnt['1d'].dump(os.path.join(self.data_path, 'scheduler.1d')) self._cnt['all'].dump(os.path.join(self.data_path, 'scheduler.all')) def _try_dump_cnt(self): '''Dump counters every 60 seconds''' now = time.time() if now - self._last_dump_cnt > 60: self._last_dump_cnt = now self._dump_cnt() self._print_counter_log() def _check_delete(self): '''Check project delete''' now = time.time() for project in list(itervalues(self.projects)): if project.db_status != 'STOP': continue if now - project.updatetime < self.DELETE_TIME: continue if 'delete' not in self.projectdb.split_group(project.group): continue logger.warning("deleting project: %s!", project.name) del self.projects[project.name] self.taskdb.drop(project.name) 
self.projectdb.drop(project.name) if self.resultdb: self.resultdb.drop(project.name) for each in self._cnt.values(): del each[project.name] def __len__(self): return sum(len(x.task_queue) for x in itervalues(self.projects)) def quit(self): '''Set quit signal''' self._quit = True # stop xmlrpc server if hasattr(self, 'xmlrpc_server'): self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop) self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop) def run_once(self): '''comsume queues and feed tasks to fetcher, once''' self._update_projects() self._check_task_done() self._check_request() while self._check_cronjob(): pass self._check_select() self._check_delete() self._try_dump_cnt() def run(self): '''Start scheduler loop''' logger.info("scheduler starting...") while not self._quit: try: time.sleep(self.LOOP_INTERVAL) self.run_once() self._exceptions = 0 except KeyboardInterrupt: break except Exception as e: logger.exception(e) self._exceptions += 1 if self._exceptions > self.EXCEPTION_LIMIT: break continue logger.info("scheduler exiting...") self._dump_cnt() def trigger_on_start(self, project): '''trigger an on_start callback of project''' self.newtask_queue.put({ "project": project, "taskid": "on_start", "url": "data:,on_start", "process": { "callback": "on_start", }, }) def xmlrpc_run(self, port=23333, bind='127.0.0.1', logRequests=False): '''Start xmlrpc interface''' from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication application = WSGIXMLRPCApplication() application.register_function(self.quit, '_quit') application.register_function(self.__len__, 'size') def dump_counter(_time, _type): try: return self._cnt[_time].to_dict(_type) except: logger.exception('') application.register_function(dump_counter, 'counter') def new_task(task): if self.task_verify(task): self.newtask_queue.put(task) return True return False application.register_function(new_task, 'newtask') def send_task(task): '''dispatch task to fetcher''' self.send_task(task) return True 
application.register_function(send_task, 'send_task') def update_project(): self._force_update_project = True application.register_function(update_project, 'update_project') def get_active_tasks(project=None, limit=100): allowed_keys = set(( 'type', 'taskid', 'project', 'status', 'url', 'lastcrawltime', 'updatetime', 'track', )) track_allowed_keys = set(( 'ok', 'time', 'follows', 'status_code', )) iters = [iter(x.active_tasks) for k, x in iteritems(self.projects) if x and (k == project if project else True)] tasks = [next(x, None) for x in iters] result = [] while len(result) < limit and tasks and not all(x is None for x in tasks): updatetime, task = t = max(t for t in tasks if t) i = tasks.index(t) tasks[i] = next(iters[i], None) for key in list(task): if key == 'track': for k in list(task[key].get('fetch', [])): if k not in track_allowed_keys: del task[key]['fetch'][k] for k in list(task[key].get('process', [])): if k not in track_allowed_keys: del task[key]['process'][k] if key in allowed_keys: continue del task[key] result.append(t) # fix for ":dictionary key must be string" # have no idea why return json.loads(json.dumps(result)) application.register_function(get_active_tasks, 'get_active_tasks') def get_projects_pause_status(): result = {} for project_name, project in iteritems(self.projects): result[project_name] = project.paused return result application.register_function(get_projects_pause_status, 'get_projects_pause_status') def webui_update(): return { 'pause_status': get_projects_pause_status(), 'counter': { '5m_time': dump_counter('5m_time', 'avg'), '5m': dump_counter('5m', 'sum'), '1h': dump_counter('1h', 'sum'), '1d': dump_counter('1d', 'sum'), 'all': dump_counter('all', 'sum'), }, } application.register_function(webui_update, 'webui_update') import tornado.wsgi import tornado.ioloop import tornado.httpserver container = tornado.wsgi.WSGIContainer(application) self.xmlrpc_ioloop = tornado.ioloop.IOLoop() self.xmlrpc_server = 
tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop) self.xmlrpc_server.listen(port=port, address=bind) logger.info('scheduler.xmlrpc listening on %s:%s', bind, port) self.xmlrpc_ioloop.start() def on_request(self, task): if self.INQUEUE_LIMIT and len(self.projects[task['project']].task_queue) >= self.INQUEUE_LIMIT: logger.debug('overflow task %(project)s:%(taskid)s %(url)s', task) return oldtask = self.taskdb.get_task(task['project'], task['taskid'], fields=self.merge_task_fields) if oldtask: return self.on_old_request(task, oldtask) else: return self.on_new_request(task) def on_new_request(self, task): '''Called when a new request is arrived''' task['status'] = self.taskdb.ACTIVE self.insert_task(task) self.put_task(task) project = task['project'] self._cnt['5m'].event((project, 'pending'), +1) self._cnt['1h'].event((project, 'pending'), +1) self._cnt['1d'].event((project, 'pending'), +1) self._cnt['all'].event((project, 'pending'), +1) logger.info('new task %(project)s:%(taskid)s %(url)s', task) return task def on_old_request(self, task, old_task): '''Called when a crawled task is arrived''' now = time.time() _schedule = task.get('schedule', self.default_schedule) old_schedule = old_task.get('schedule', {}) if _schedule.get('force_update') and self.projects[task['project']].task_queue.is_processing(task['taskid']): # when a task is in processing, the modify may conflict with the running task. # postpone the modify after task finished. 
logger.info('postpone modify task %(project)s:%(taskid)s %(url)s', task) self._postpone_request.append(task) return restart = False schedule_age = _schedule.get('age', self.default_schedule['age']) if _schedule.get('itag') and _schedule['itag'] != old_schedule.get('itag'): restart = True elif schedule_age >= 0 and schedule_age + (old_task.get('lastcrawltime', 0) or 0) < now: restart = True elif _schedule.get('force_update'): restart = True if not restart: logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task) return if _schedule.get('cancel'): logger.info('cancel task %(project)s:%(taskid)s %(url)s', task) task['status'] = self.taskdb.BAD self.update_task(task) self.projects[task['project']].task_queue.delete(task['taskid']) return task task['status'] = self.taskdb.ACTIVE self.update_task(task) self.put_task(task) project = task['project'] if old_task['status'] != self.taskdb.ACTIVE: self._cnt['5m'].event((project, 'pending'), +1) self._cnt['1h'].event((project, 'pending'), +1) self._cnt['1d'].event((project, 'pending'), +1) if old_task['status'] == self.taskdb.SUCCESS: self._cnt['all'].event((project, 'success'), -1).event((project, 'pending'), +1) elif old_task['status'] == self.taskdb.FAILED: self._cnt['all'].event((project, 'failed'), -1).event((project, 'pending'), +1) logger.info('restart task %(project)s:%(taskid)s %(url)s', task) return task def on_task_status(self, task): '''Called when a status pack is arrived''' try: procesok = task['track']['process']['ok'] if not self.projects[task['project']].task_queue.done(task['taskid']): logging.error('not processing pack: %(project)s:%(taskid)s %(url)s', task) return None except KeyError as e: logger.error("Bad status pack: %s", e) return None if procesok: ret = self.on_task_done(task) else: ret = self.on_task_failed(task) if task['track']['fetch'].get('time'): self._cnt['5m_time'].event((task['project'], 'fetch_time'), task['track']['fetch']['time']) if task['track']['process'].get('time'): 
self._cnt['5m_time'].event((task['project'], 'process_time'), task['track']['process'].get('time')) self.projects[task['project']].active_tasks.appendleft((time.time(), task)) return ret def on_task_done(self, task): '''Called when a task is done and success, called by `on_task_status`''' task['status'] = self.taskdb.SUCCESS task['lastcrawltime'] = time.time() if 'schedule' in task: if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']: task['status'] = self.taskdb.ACTIVE next_exetime = task['schedule'].get('age') task['schedule']['exetime'] = time.time() + next_exetime self.put_task(task) else: del task['schedule'] self.update_task(task) project = task['project'] self._cnt['5m'].event((project, 'success'), +1) self._cnt['1h'].event((project, 'success'), +1) self._cnt['1d'].event((project, 'success'), +1) self._cnt['all'].event((project, 'success'), +1).event((project, 'pending'), -1) logger.info('task done %(project)s:%(taskid)s %(url)s', task) return task def on_task_failed(self, task): '''Called when a task is failed, called by `on_task_status`''' if 'schedule' not in task: old_task = self.taskdb.get_task(task['project'], task['taskid'], fields=['schedule']) if old_task is None: logging.error('unknown status pack: %s' % task) return task['schedule'] = old_task.get('schedule', {}) retries = task['schedule'].get('retries', self.default_schedule['retries']) retried = task['schedule'].get('retried', 0) project_info = self.projects[task['project']] retry_delay = project_info.retry_delay or self.DEFAULT_RETRY_DELAY next_exetime = retry_delay.get(retried, retry_delay.get('', self.DEFAULT_RETRY_DELAY[''])) if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']: next_exetime = min(next_exetime, task['schedule'].get('age')) else: if retried >= retries: next_exetime = -1 elif 'age' in task['schedule'] and next_exetime > task['schedule'].get('age'): next_exetime = task['schedule'].get('age') if next_exetime < 0: task['status'] = 
self.taskdb.FAILED task['lastcrawltime'] = time.time() self.update_task(task) project = task['project'] self._cnt['5m'].event((project, 'failed'), +1) self._cnt['1h'].event((project, 'failed'), +1) self._cnt['1d'].event((project, 'failed'), +1) self._cnt['all'].event((project, 'failed'), +1).event((project, 'pending'), -1) logger.info('task failed %(project)s:%(taskid)s %(url)s' % task) return task else: task['schedule']['retried'] = retried + 1 task['schedule']['exetime'] = time.time() + next_exetime task['lastcrawltime'] = time.time() self.update_task(task) self.put_task(task) project = task['project'] self._cnt['5m'].event((project, 'retry'), +1) self._cnt['1h'].event((project, 'retry'), +1) self._cnt['1d'].event((project, 'retry'), +1) # self._cnt['all'].event((project, 'retry'), +1) logger.info('task retry %d/%d %%(project)s:%%(taskid)s %%(url)s' % ( retried, retries), task) return task def on_select_task(self, task): '''Called when a task is selected to fetch & process''' # inject informations about project logger.info('select %(project)s:%(taskid)s %(url)s', task) project_info = self.projects.get(task['project']) assert project_info, 'no such project' task['type'] = self.TASK_PACK task['group'] = project_info.group task['project_md5sum'] = project_info.md5sum task['project_updatetime'] = project_info.updatetime # lazy join project.crawl_config if getattr(project_info, 'crawl_config', None): task = BaseHandler.task_join_crawl_config(task, project_info.crawl_config) project_info.active_tasks.appendleft((time.time(), task)) self.send_task(task) return task from tornado import gen class OneScheduler(Scheduler): """ Scheduler Mixin class for one mode overwirted send_task method call processor.on_task(fetcher.fetch(task)) instead of consuming queue """ def _check_select(self): """ interactive mode of select tasks """ if not self.interactive: return super(OneScheduler, self)._check_select() # waiting for running tasks if self.running_task > 0: return is_crawled = 
[] def run(project=None): return crawl('on_start', project=project) def crawl(url, project=None, **kwargs): """ Crawl given url, same parameters as BaseHandler.crawl url - url or taskid, parameters will be used if in taskdb project - can be ignored if only one project exists. """ # looking up the project instance if project is None: if len(self.projects) == 1: project = list(self.projects.keys())[0] else: raise LookupError('You need specify the project: %r' % list(self.projects.keys())) project_data = self.processor.project_manager.get(project) if not project_data: raise LookupError('no such project: %s' % project) # get task package instance = project_data['instance'] instance._reset() task = instance.crawl(url, **kwargs) if isinstance(task, list): raise Exception('url list is not allowed in interactive mode') # check task in taskdb if not kwargs: dbtask = self.taskdb.get_task(task['project'], task['taskid'], fields=self.request_task_fields) if not dbtask: dbtask = self.taskdb.get_task(task['project'], task['url'], fields=self.request_task_fields) if dbtask: task = dbtask # select the task self.on_select_task(task) is_crawled.append(True) shell.ask_exit() def quit_interactive(): '''Quit interactive mode''' is_crawled.append(True) self.interactive = False shell.ask_exit() def quit_pyspider(): '''Close pyspider''' is_crawled[:] = [] shell.ask_exit() shell = utils.get_python_console() banner = ( 'pyspider shell - Select task\n' 'crawl(url, project=None, **kwargs) - same parameters as BaseHandler.crawl\n' 'quit_interactive() - Quit interactive mode\n' 'quit_pyspider() - Close pyspider' ) if hasattr(shell, 'show_banner'): shell.show_banner(banner) shell.interact() else: shell.interact(banner) if not is_crawled: self.ioloop.add_callback(self.ioloop.stop) def __getattr__(self, name): """patch for crawl(url, callback=self.index_page) API""" if self.interactive: return name raise AttributeError(name) def on_task_status(self, task): """Ignore not processing error in 
interactive mode""" if not self.interactive: super(OneScheduler, self).on_task_status(task) try: procesok = task['track']['process']['ok'] except KeyError as e: logger.error("Bad status pack: %s", e) return None if procesok: ret = self.on_task_done(task) else: ret = self.on_task_failed(task) if task['track']['fetch'].get('time'): self._cnt['5m_time'].event((task['project'], 'fetch_time'), task['track']['fetch']['time']) if task['track']['process'].get('time'): self._cnt['5m_time'].event((task['project'], 'process_time'), task['track']['process'].get('time')) self.projects[task['project']].active_tasks.appendleft((time.time(), task)) return ret def init_one(self, ioloop, fetcher, processor, result_worker=None, interactive=False): self.ioloop = ioloop self.fetcher = fetcher self.processor = processor self.result_worker = result_worker self.interactive = interactive self.running_task = 0 @gen.coroutine def do_task(self, task): self.running_task += 1 result = yield gen.Task(self.fetcher.fetch, task) type, task, response = result.args self.processor.on_task(task, response) # do with message while not self.processor.inqueue.empty(): _task, _response = self.processor.inqueue.get() self.processor.on_task(_task, _response) # do with results while not self.processor.result_queue.empty(): _task, _result = self.processor.result_queue.get() if self.result_worker: self.result_worker.on_result(_task, _result) self.running_task -= 1 def send_task(self, task, force=True): if self.fetcher.http_client.free_size() <= 0: if force: self._send_buffer.appendleft(task) else: raise self.outqueue.Full self.ioloop.add_future(self.do_task(task), lambda x: x.result()) def run(self): import tornado.ioloop tornado.ioloop.PeriodicCallback(self.run_once, 100, io_loop=self.ioloop).start() self.ioloop.start() def quit(self): self.ioloop.stop() logger.info("scheduler exiting...") import random import threading from pyspider.database.sqlite.sqlitebase import SQLiteMixin class 
ThreadBaseScheduler(Scheduler): def __init__(self, threads=4, *args, **kwargs): self.local = threading.local() super(ThreadBaseScheduler, self).__init__(*args, **kwargs) if isinstance(self.taskdb, SQLiteMixin): self.threads = 1 else: self.threads = threads self._taskdb = self.taskdb self._projectdb = self.projectdb self._resultdb = self.resultdb self.thread_objs = [] self.thread_queues = [] self._start_threads() assert len(self.thread_queues) > 0 @property def taskdb(self): if not hasattr(self.local, 'taskdb'): self.taskdb = self._taskdb.copy() return self.local.taskdb @taskdb.setter def taskdb(self, taskdb): self.local.taskdb = taskdb @property def projectdb(self): if not hasattr(self.local, 'projectdb'): self.projectdb = self._projectdb.copy() return self.local.projectdb @projectdb.setter def projectdb(self, projectdb): self.local.projectdb = projectdb @property def resultdb(self): if not hasattr(self.local, 'resultdb'): self.resultdb = self._resultdb.copy() return self.local.resultdb @resultdb.setter def resultdb(self, resultdb): self.local.resultdb = resultdb def _start_threads(self): for i in range(self.threads): queue = Queue.Queue() thread = threading.Thread(target=self._thread_worker, args=(queue, )) thread.daemon = True thread.start() self.thread_objs.append(thread) self.thread_queues.append(queue) def _thread_worker(self, queue): while True: method, args, kwargs = queue.get() try: method(*args, **kwargs) except Exception as e: logger.exception(e) def _run_in_thread(self, method, *args, **kwargs): i = kwargs.pop('_i', None) block = kwargs.pop('_block', False) if i is None: while True: for queue in self.thread_queues: if queue.empty(): break else: if block: time.sleep(0.1) continue else: queue = self.thread_queues[random.randint(0, len(self.thread_queues)-1)] break else: queue = self.thread_queues[i % len(self.thread_queues)] queue.put((method, args, kwargs)) if block: self._wait_thread() def _wait_thread(self): while True: if all(queue.empty() for queue in 
self.thread_queues): break time.sleep(0.1) def _update_project(self, project): self._run_in_thread(Scheduler._update_project, self, project) def on_task_status(self, task): i = hash(task['taskid']) self._run_in_thread(Scheduler.on_task_status, self, task, _i=i) def on_request(self, task): i = hash(task['taskid']) self._run_in_thread(Scheduler.on_request, self, task, _i=i) def _load_put_task(self, project, taskid): i = hash(taskid) self._run_in_thread(Scheduler._load_put_task, self, project, taskid, _i=i) def run_once(self): super(ThreadBaseScheduler, self).run_once() self._wait_thread() ================================================ FILE: pyspider/scheduler/task_queue.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-07 13:12:10 import heapq import logging import threading import time try: from UserDict import DictMixin except ImportError: from collections import Mapping as DictMixin from .token_bucket import Bucket from six.moves import queue as Queue logger = logging.getLogger('scheduler') try: cmp except NameError: cmp = lambda x, y: (x > y) - (x < y) class AtomInt(object): __value__ = 0 __mutex__ = threading.RLock() @classmethod def get_value(cls): cls.__mutex__.acquire() cls.__value__ = cls.__value__ + 1 value = cls.__value__ cls.__mutex__.release() return value class InQueueTask(DictMixin): __slots__ = ('taskid', 'priority', 'exetime', 'sequence') __getitem__ = lambda *x: getattr(*x) __setitem__ = lambda *x: setattr(*x) __iter__ = lambda self: iter(self.__slots__) __len__ = lambda self: len(self.__slots__) keys = lambda self: self.__slots__ def __init__(self, taskid, priority=0, exetime=0): self.taskid = taskid self.priority = priority self.exetime = exetime self.sequence = AtomInt.get_value() def __cmp__(self, other): if self.exetime == 0 and other.exetime == 0: diff = -cmp(self.priority, other.priority) 
class PriorityTaskQueue(Queue.Queue):

    '''
    TaskQueue

    Heap-ordered queue in which items sharing a taskid are merged instead of
    being queued twice.

    Queued items must expose mutable ``taskid``, ``priority`` and ``exetime``
    attributes and support ``<``.  Deleting an item does not remove it from
    the heap: its ``taskid`` is set to None and it is skipped later.
    '''

    def _init(self, maxsize):
        # heap of queued items; may also contain deleted items (taskid None)
        self.queue = []
        # taskid -> live queued item
        self.queue_dict = dict()

    def _qsize(self, len=len):
        # only live items count, deleted heap entries are ignored
        return len(self.queue_dict)

    def _put(self, item, heappush=heapq.heappush):
        '''
        Insert item, or merge it into the queued item with the same taskid.

        On merge the queued item keeps the higher priority and the earlier
        exetime of the two.
        '''
        if item.taskid not in self.queue_dict:
            heappush(self.queue, item)
            self.queue_dict[item.taskid] = item
            return

        task = self.queue_dict[item.taskid]
        before = (task.priority, task.exetime)
        task.priority = max(item.priority, task.priority)
        task.exetime = min(item.exetime, task.exetime)
        # Re-heapify whenever the stored item's sort key really changed.
        # Checking only `item < task` misses merges such as "higher priority
        # but later exetime", which still move the stored item forward and
        # would otherwise leave the heap out of order.
        if (task.priority, task.exetime) != before:
            self._resort()

    def _get(self, heappop=heapq.heappop):
        '''Pop the first live item; return None if only deleted items remain.'''
        while self.queue:
            item = heappop(self.queue)
            if item.taskid is None:
                # deleted via __delitem__
                continue
            self.queue_dict.pop(item.taskid, None)
            return item
        return None

    @property
    def top(self):
        '''The first live item without removing it, or None when empty.'''
        # drop deleted items sitting at the head of the heap
        while self.queue and self.queue[0].taskid is None:
            heapq.heappop(self.queue)

        if self.queue:
            return self.queue[0]
        return None

    def _resort(self):
        heapq.heapify(self.queue)

    def __contains__(self, taskid):
        return taskid in self.queue_dict

    def __getitem__(self, taskid):
        return self.queue_dict[taskid]

    def __setitem__(self, taskid, item):
        assert item.taskid == taskid
        self.put(item)

    def __delitem__(self, taskid):
        # mark as deleted; the heap entry is discarded lazily by _get / top
        self.queue_dict.pop(taskid).taskid = None
burst(self): return self.bucket.burst @burst.setter def burst(self, value): self.bucket.burst = value def check_update(self): ''' Check time queue and processing queue put tasks to priority queue when execute time arrived or process timeout ''' self._check_time_queue() self._check_processing() def _check_time_queue(self): now = time.time() self.mutex.acquire() while self.time_queue.qsize() and self.time_queue.top and self.time_queue.top.exetime < now: task = self.time_queue.get_nowait() # type: InQueueTask task.exetime = 0 self.priority_queue.put(task) self.mutex.release() def _check_processing(self): now = time.time() self.mutex.acquire() while self.processing.qsize() and self.processing.top and self.processing.top.exetime < now: task = self.processing.get_nowait() if task.taskid is None: continue task.exetime = 0 self.priority_queue.put(task) logger.info("processing: retry %s", task.taskid) self.mutex.release() def put(self, taskid, priority=0, exetime=0): """ Put a task into task queue when use heap sort, if we put tasks(with the same priority and exetime=0) into queue, the queue is not a strict FIFO queue, but more like a FILO stack. It is very possible that when there are continuous big flow, the speed of select is slower than request, resulting in priority-queue accumulation in short time. In this scenario, the tasks more earlier entering the priority-queue will not get processed until the request flow becomes small. Thus, we store a global atom self increasing value into task.sequence which represent the task enqueue sequence. When the comparison of exetime and priority have no difference, we compare task.sequence to ensure that the entire queue is ordered. 
def get(self):
    '''
    Get a task from queue when bucket available

    Returns the taskid of the next task in the priority queue, or None when
    the bucket holds less than one token or the priority queue is empty.
    A returned task consumes one token and is put into the processing queue
    with exetime set to now + processing_timeout.
    '''
    if self.bucket.get() < 1:
        return None
    now = time.time()
    # `with` releases the mutex on every exit path, including unexpected
    # exceptions raised by the bucket or by the processing queue.
    with self.mutex:
        try:
            task = self.priority_queue.get_nowait()
        except Queue.Empty:
            return None
        self.bucket.desc()
        task.exetime = now + self.processing_timeout
        self.processing.put(task)
        return task.taskid
class Bucket(object):

    '''
    traffic flow control with token bucket

    Tokens accumulate at `rate` per second up to `burst`.  Every read and
    write of the token count goes through `self.mutex`, so the bucket can be
    shared between threads.
    '''

    update_interval = 30

    def __init__(self, rate=1, burst=None):
        '''
        rate: tokens added per second.
        burst: bucket capacity; defaults to 10 * rate when None.

        The bucket starts full.
        '''
        self.rate = float(rate)
        if burst is None:
            self.burst = float(rate) * 10
        else:
            self.burst = float(burst)
        self.mutex = _threading.Lock()
        self.bucket = self.burst
        self.last_update = time.time()

    def get(self):
        '''Get the number of tokens in bucket'''
        now = time.time()
        # The refill is computed and applied under the lock, so two
        # concurrent callers cannot both credit the same elapsed interval.
        with self.mutex:
            if self.bucket >= self.burst:
                self.last_update = now
                return self.bucket
            bucket = self.rate * (now - self.last_update)
            # refill only once more than one whole token has accrued;
            # last_update is left untouched otherwise so time keeps adding up
            if bucket > 1:
                self.bucket += bucket
                if self.bucket > self.burst:
                    self.bucket = self.burst
                self.last_update = now
            return self.bucket

    def set(self, value):
        '''Set number of tokens in bucket'''
        with self.mutex:
            self.bucket = value

    def desc(self, value=1):
        '''Use value tokens'''
        with self.mutex:
            self.bucket -= value
class QuitableFlask(Flask):
    """Add quit() method to Flask object"""

    @property
    def logger(self):
        # use the module-level "webui" logger
        return logger

    def run(self, host=None, port=None, debug=None, **options):
        """Serve the app on a tornado HTTP server and block on its IOLoop.

        host defaults to 127.0.0.1.  port defaults to the port part of the
        SERVER_NAME config value, or 5000 when that has none.  When debug is
        not None it overrides self.debug, which in turn enables the werkzeug
        debugger and tornado autoreload.  **options is accepted but unused.
        """
        import tornado.wsgi
        import tornado.ioloop
        import tornado.httpserver
        import tornado.web
        if host is None:
            host = '127.0.0.1'
        if port is None:
            server_name = self.config['SERVER_NAME']
            if server_name and ':' in server_name:
                port = int(server_name.rsplit(':', 1)[1])
            else:
                port = 5000
        if debug is not None:
            self.debug = bool(debug)

        hostname = host
        application = self
        use_reloader = self.debug
        use_debugger = self.debug

        if use_debugger:
            from werkzeug.debug import DebuggedApplication
            application = DebuggedApplication(application, True)

        try:
            from .webdav import dav_app
        except ImportError as e:
            logger.warning('WebDav interface not enabled: %r', e)
            dav_app = None
        if dav_app:
            # DispatcherMiddleware moved to werkzeug.middleware.dispatcher;
            # the werkzeug.wsgi location only exists in older releases.
            try:
                from werkzeug.middleware.dispatcher import DispatcherMiddleware
            except ImportError:
                from werkzeug.wsgi import DispatcherMiddleware
            application = DispatcherMiddleware(application, {
                '/dav': dav_app
            })

        container = tornado.wsgi.WSGIContainer(application)
        self.http_server = tornado.httpserver.HTTPServer(container)
        self.http_server.listen(port, hostname)
        if use_reloader:
            from tornado import autoreload
            autoreload.start()

        self.logger.info('webui running on %s:%s', hostname, port)
        self.ioloop = tornado.ioloop.IOLoop.current()
        self.ioloop.start()

    def quit(self):
        """Ask the running server and IOLoop to stop.

        Does nothing but log when run() has not set self.ioloop yet.
        """
        if hasattr(self, 'ioloop'):
            # scheduled on the loop rather than called directly
            self.ioloop.add_callback(self.http_server.stop)
            self.ioloop.add_callback(self.ioloop.stop)
        self.logger.info('webui exiting...')
@app.route('/debug/<project>', methods=['GET', 'POST'])
def debug(project):
    """Render the debug page for a project.

    The script comes from projectdb, or from the sample handler template
    (with date, project name and start URL filled in) when the project does
    not exist yet.  The task is loaded from taskdb when a `taskid` query
    argument is given; otherwise a default on_start task is used.
    """
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    if info:
        script = info['script']
    else:
        script = (default_script
                  .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                  .replace('__PROJECT_NAME__', project)
                  .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__'))

    taskid = request.args.get('taskid')
    if taskid:
        taskdb = app.config['taskdb']
        task = taskdb.get_task(
            project, taskid, ['taskid', 'project', 'url', 'fetch', 'process'])
    else:
        # Copy instead of writing into the module-level default_task, which
        # is shared by every request.
        task = dict(default_task, project=project)
    return render_template("debug.html", task=task, script=script, project_name=project)
utils.decode_unicode_obj(json.loads(request.form['task'])) except Exception: result = { 'fetch_result': "", 'logs': u'task json error', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info = { 'name': project, 'status': 'DEBUG', 'script': request.form['script'], } if request.form.get('webdav_mode') == 'true': projectdb = app.config['projectdb'] info = projectdb.get(project, fields=['name', 'script']) if not info: result = { 'fetch_result': "", 'logs': u' in wevdav mode, cannot load script', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info['script'] = info['script'] fetch_result = {} try: module = ProjectManager.build_module(project_info, { 'debugger': True, 'process_time_limit': app.config['process_time_limit'], }) # The code below is to mock the behavior that crawl_config been joined when selected by scheduler. 
# but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True` # crawl_config = module['instance'].crawl_config # task = module['instance'].task_join_crawl_config(task, crawl_config) fetch_result = app.config['fetch'](task) response = rebuild_response(fetch_result) ret = module['instance'].run_task(module['module'], task, response) except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': fetch_result, 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } else: result = { 'fetch_result': fetch_result, 'logs': ret.logstr(), 'follows': ret.follows, 'messages': ret.messages, 'result': ret.result, 'time': time.time() - start_time, } result['fetch_result']['content'] = response.text if (response.headers.get('content-type', '').startswith('image')): result['fetch_result']['dataurl'] = dataurl.encode( response.content, response.headers['content-type']) try: # binary data can't encode to JSON, encode result as unicode obj # before send it to frontend return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': "", 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} @app.route('/debug//save', methods=['POST', ]) def save(project): projectdb = app.config['projectdb'] if not projectdb.verify_project_name(project): return 'project name is not allowed!', 400 script = request.form['script'] project_info = projectdb.get(project, fields=['name', 'status', 'group']) if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \ and not 
@app.route('/')
def index():
    """Render the project list: grouped projects first, then by group and name."""
    def sort_key(info):
        group = info['group']
        return (0 if group else 1, group or '', info['name'])

    projectdb = app.config['projectdb']
    projects = list(projectdb.get_all(fields=index_fields))
    projects.sort(key=sort_key)
    return render_template("index.html", projects=projects)
@app.route('/update', methods=['POST', ])
def project_update():
    """Update one field (group, status or rate) of a project.

    Form fields: `pk` is the project name, `name` the field, `value` the new
    value.  For `rate` the value has the form "rate/burst"; both numbers are
    capped by the max_rate / max_burst config values when those are set.

    Returns 404 for an unknown project, the login response for a locked
    project when the user is not active, 400 for an unknown field or a
    malformed rate, 500 when projectdb.update() reports failure.
    """
    projectdb = app.config['projectdb']
    project = request.form['pk']
    name = request.form['name']
    value = request.form['value']

    project_info = projectdb.get(project, fields=('name', 'group'))
    if not project_info:
        return "no such project.", 404
    if 'lock' in projectdb.split_group(project_info.get('group')) \
            and not login.current_user.is_active():
        return app.login_response

    if name not in ('group', 'status', 'rate'):
        return 'unknown field: %s' % name, 400
    if name == 'rate':
        value = value.split('/')
        if len(value) != 2:
            return 'format error: rate/burst', 400
        try:
            rate = float(value[0])
            burst = float(value[1])
        except ValueError:
            # non-numeric input is a client error, not a server error
            return 'format error: rate/burst', 400
        update = {
            'rate': min(rate, app.config.get('max_rate', rate)),
            'burst': min(burst, app.config.get('max_burst', burst)),
        }
    else:
        update = {
            name: value
        }

    ret = projectdb.update(project, update)
    if not ret:
        app.logger.warning("[webui index] projectdb.update() error - res: {}".format(ret))
        return 'update error', 500

    rpc = app.config['scheduler_rpc']
    if rpc is not None:
        try:
            rpc.update_project()
        except socket.error as e:
            app.logger.warning('connect to scheduler rpc error: %r', e)
            return 'rpc error', 200
    return 'ok', 200
methods=['POST', ]) def runtask(): rpc = app.config['scheduler_rpc'] if rpc is None: return json.dumps({}) projectdb = app.config['projectdb'] project = request.form['project'] project_info = projectdb.get(project, fields=('name', 'group')) if not project_info: return "no such project.", 404 if 'lock' in projectdb.split_group(project_info.get('group')) \ and not login.current_user.is_active(): return app.login_response newtask = { "project": project, "taskid": "on_start", "url": "data:,on_start", "process": { "callback": "on_start", }, "schedule": { "age": 0, "priority": 9, "force_update": True, }, } try: ret = rpc.newtask(newtask) except socket.error as e: app.logger.warning('connect to scheduler rpc error: %r', e) return json.dumps({"result": False}), 200, {'Content-Type': 'application/json'} return json.dumps({"result": ret}), 200, {'Content-Type': 'application/json'} @app.route('/robots.txt') def robots(): return """User-agent: * Disallow: / Allow: /$ Allow: /debug Disallow: /debug/*?taskid=* """, 200, {'Content-Type': 'text/plain'} ================================================ FILE: pyspider/webui/login.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-10 20:36:27 import base64 from flask import Response try: import flask_login as login except ImportError: from flask.ext import login from .app import app login_manager = login.LoginManager() login_manager.init_app(app) class AnonymousUser(login.AnonymousUserMixin): def is_anonymous(self): return True def is_active(self): return False def is_authenticated(self): return False def get_id(self): return class User(login.UserMixin): def __init__(self, id, password): self.id = id self.password = password def is_authenticated(self): if not app.config.get('webui_username'): return True if self.id == app.config.get('webui_username') \ and self.password == 
@login_manager.request_loader
def load_user_from_request(request):
    """Build a User from an HTTP Basic `Authorization` header.

    Returns None when the header is missing, uses another scheme, or cannot
    be decoded into a "username:password" pair.
    """
    header = request.headers.get('Authorization')
    if not header:
        return None

    # Only accept the Basic scheme instead of blindly cutting off the first
    # six characters of whatever was sent.
    scheme, _, payload = header.partition(' ')
    if scheme.lower() != 'basic' or not payload.strip():
        return None

    try:
        credentials = base64.b64decode(payload.strip()).decode('utf8')
        username, password = credentials.split(":", 1)
    except (TypeError, ValueError) as e:
        # Log the failure only: the header content may hold a real password
        # and must not end up in the log.
        app.logger.error('wrong api key: %r', e)
        return None
    return User(username, password)
int(request.args.get('limit', 0)) or None results = resultdb.select(project, offset=offset, limit=limit) if _format == 'json': valid = request.args.get('style', 'rows') == 'full' return Response(result_dump.dump_as_json(results, valid), mimetype='application/json') elif _format == 'txt': return Response(result_dump.dump_as_txt(results), mimetype='text/plain') elif _format == 'csv': return Response(result_dump.dump_as_csv(results), mimetype='text/csv') ================================================ FILE: pyspider/webui/static/.babelrc ================================================ { "presets": ["es2015"] } ================================================ FILE: pyspider/webui/static/package.json ================================================ { "name": "pyspider-webui", "version": "0.3.9", "description": "webui of pyspider", "scripts": { "build": "webpack --progress --colors --optimize-minimize", "dev": "webpack --progress --colors --optimize-minimize --watch" }, "keywords": [ "pyspider" ], "author": "binux", "license": "MIT", "devDependencies": { "babel-core": "^6.14.0", "babel-loader": "^6.2.5", "babel-preset-es2015": "^6.14.0", "css-loader": "^0.25.0", "extract-text-webpack-plugin": "^1.0.1", "less": "^2.7.1", "less-loader": "^2.2.3", "style-loader": "^0.13.1", "webpack": "^1.13.2" } } ================================================ FILE: pyspider/webui/static/src/css_selector_helper.js ================================================ // vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2013-11-11 18:50:58 import EventEmitter from 'events' function arrayEquals(a, b) { if (!a || !b) return false; if (a.length != b.length) return false; for (var i = 0, l = a.length; i < l; i++) { if (a[i] !== b[i]) return false; } return true; } function getOffset(elem) { var top = 0; var left = 0; do { if ( !isNaN( elem.offsetLeft) ) left += elem.offsetLeft; if ( !isNaN( elem.offsetTop) ) top += elem.offsetTop; } while( elem = 
elem.offsetParent ) return {top: top, left: left}; } function merge_name(features) { var element_name = ''; features.forEach(function(f) { if (f.selected) element_name += f.name; }) return element_name; } function merge_pattern(path, end) { var pattern = ''; var prev = null; path.forEach(function(p, i) { if (end >= 0 && i > end) { return; } if (p.invalid) { prev = null; } else if (p.selected) { if (prev) { pattern += ' >'; } var element_pattern = ''; p.features.forEach(function(f) { if (f.selected) { element_pattern += f.pattern; } }); if (element_pattern === '') { element_pattern = '*'; } pattern += ' '+element_pattern; prev = p; } else { prev = null; } }) if (pattern === '') { pattern = '*'; } return pattern; } function path_info(doc, element) { var path = []; do { var features = []; // tagName features.push({ name: element.tagName.toLowerCase(), pattern: element.tagName.toLowerCase(), selected: true, }); // id if (element.getAttribute('id')) { features.push({ name: '#'+element.getAttribute('id'), pattern: '#'+element.getAttribute('id'), selected: true, }); } // class if (element.classList.length > 0) { for (var i=0; i 1 && i < siblings.length; i++) { var sibling = siblings[i]; if (sibling === element) { xpath += '['+(ix+1)+']'; break; } else if (sibling.tagName == element.tagName) { ix++; } } // pack it up path.push({ tag: element.tagName.toLowerCase(), name: merge_name(features), xpath: xpath, selected: true, invalid: element.tagName.toLowerCase() === 'tbody', features: features, }); } while (element = element.parentElement); path.reverse(); // select elements var selected_elements = doc.querySelectorAll(merge_pattern(path)); path.forEach(function(p, i) { if (p.invalid) return; // select features var feature_selected_elements = doc.querySelectorAll(merge_pattern(path, i)); p.features.forEach(function(f, fi) { f.selected = false; if (arrayEquals(feature_selected_elements, doc.querySelectorAll(merge_pattern(path, i)))) { return; } f.selected = true; }); if 
// Runs against the inspected page's window: highlights elements under the
// mouse and emits 'selector_helper_click' with the clicked element's path info.
export default class CSSSelectorHelperServer extends EventEmitter {
  constructor(window) {
    super();

    this.window = window;
    this.document = window.document;

    this.document.addEventListener("mouseover", (ev) => {
      this.overlay(ev.target);
    });

    this.document.addEventListener("click", (ev) => {
      ev.preventDefault();
      ev.stopPropagation();

      this.emit('selector_helper_click', path_info(this.document, ev.target));
    });
  }

  // Accepts a selector string, a single Element, a list of elements, or
  // null/undefined (e.g. an XPath lookup that found nothing) and returns a
  // plain array of elements.
  _toElementList(elements) {
    if (typeof elements === 'string') {
      elements = this.document.querySelectorAll(elements);
    }
    if (elements === null || elements === undefined) {
      return [];
    }
    if (elements instanceof this.window.Element) {
      return [elements];
    }
    return [...elements];
  }

  // Replaces all existing boxes of `className` with one absolutely
  // positioned box per element; `inset` shifts the box up/left (border width).
  _drawBoxes(elements, className, baseStyle, inset) {
    const targets = this._toElementList(elements);

    [...this.document.querySelectorAll('.' + className)].forEach((elem) => {
      elem.remove();
    });

    targets.forEach((elem) => {
      const offset = getOffset(elem);
      const div = this.document.createElement("div");
      div.className = className;
      div.setAttribute('style', baseStyle
                       +'top: '+(offset.top-inset)+'px;'
                       +'left:'+(offset.left-inset)+'px;'
                       +'width: '+elem.offsetWidth+'px;'
                       +'height: '+elem.offsetHeight+'px;');
      this.document.body.appendChild(div);
    });
  }

  // Translucent box over the hovered element(s).
  overlay(elements) {
    this._drawBoxes(elements, 'pyspider_overlay',
      'z-index: 999999;background-color: rgba(255, 165, 0, 0.3);position: absolute;pointer-events: none;',
      0);
  }

  // Red outline around the element(s) matched by the current selector.
  // The misspelled name is kept because it is the method's public name.
  heightlight(elements) {
    this._drawBoxes(elements, 'pyspider_highlight',
      'z-index: 888888;border: 2px solid #c00;position: absolute;pointer-events: none;',
      2);
  }

  // First node matching the XPath expression, or null when none matches.
  getElementByXpath(path) {
    return this.document.evaluate(path, this.document, null, this.window.XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
  }
}
    '); $.each(p.features, function(i, f) { var li = $('
  • ').text(f.name).data('feature', f); if (f.selected) li.addClass('selected'); li.appendTo(ul); // feature on click li.on('click', function(ev) { ev.stopPropagation(); var $this = $(this); var f = $this.data('feature'); if (f.selected) { f.selected = false; $this.removeClass('selected'); } else { f.selected = true; $this.addClass('selected'); } var element = $this.parents('.element'); if (!p.selected) { p.selected = true; element.addClass('selected'); } element.find('.element-name').text(merge_name(p)); selector_changed(path); }); }); ul.appendTo(span); span.on('mouseover', (ev) => { var xpath = []; $.each(path, function(i, _p) { xpath.push(_p.xpath); if (_p === p) { return false; } }); server.overlay(server.getElementByXpath('/' + xpath.join('/'))); }) // path on click span.on('click', function(ev) { ev.stopPropagation(); var $this = $(this); var p = $this.data('info'); if (p.selected) { p.selected = false; $this.removeClass('selected'); } else { p.selected = true; $this.addClass('selected'); } $this.find('.element-name').text(merge_name($this.data('info'))); selector_changed(path); }); elements.push(span); }); helper.prepend(elements); adjustHelper(); selector_changed(path); } function adjustHelper() { while (helper[0].scrollWidth > helper.width()) { var e = helper.find('.element:visible:first'); if (e.length == 0) { return; } e.addClass('invalid').data('info')['invalid'] = true; } } var tab_web = $('#tab-web'); return { init: function() { var _this = this; _this.clear(); $("#J-enable-css-selector-helper").on('click', ev => { this.clear(); server = new CSSSelectorHelperServer($("#tab-web iframe")[0].contentWindow); server.on('selector_helper_click', path => { render_selector_helper(path); }) this.enable(); }); $("#task-panel").on("scroll", function(ev) { if (!helper.is(':visible')) { return; } if ($("#debug-tabs").position().top < 0) { helper.addClass('fixed'); tab_web.addClass('fixed'); } else { helper.removeClass('fixed'); tab_web.removeClass('fixed'); } }); 
// copy button var input = helper.find('.copy-selector-input'); input.on('focus', function(ev) { $(this).select(); }); helper.find('.copy-selector').on('click', function(ev) { if (!current_path) { return; } if (input.is(':visible')) { input.hide(); helper.find('.element').show(); } else { helper.find('.element').hide(); input.val(merge_pattern(current_path)).show(); } }); // add button helper.find('.add-to-editor').on('click', function(ev) { Debugger.python_editor_replace_selection(merge_pattern(current_path)); }); }, clear: function() { current_path = null; helper.hide(); helper.removeClass('fixed'); tab_web.removeClass('fixed'); helper.find('.element').remove(); }, enable: function() { helper.show(); helper.find('.copy-selector-input').hide(); if ($("#debug-tabs").position().top < 0) { helper.addClass('fixed'); tab_web.addClass('fixed'); } else { helper.removeClass('fixed'); tab_web.removeClass('fixed'); } }, } })(); window.Debugger = (function() { var tmp_div = $('
    '); function escape(text) { return tmp_div.text(text).html(); } return { init: function() { //init resizer this.splitter = $(".debug-panel:not(:first)").splitter().data('splitter') .trigger('init') .on('resize-start', function() { $('#left-area .overlay').show(); }) .on('resize-end', function() { $('#left-area .overlay').hide(); }); //codemirror CodeMirror.keyMap.basic.Tab = 'indentMore'; this.init_python_editor($("#python-editor")); this.init_task_editor($("#task-editor")); this.bind_debug_tabs(); this.bind_run(); this.bind_save(); this.bind_others(); // css selector helper SelectorHelper.init(); }, not_saved: false, init_python_editor: function($el) { var _this = this; this.python_editor_elem = $el; var cm = this.python_editor = CodeMirror($el[0], { value: script_content, mode: "python", lineNumbers: true, indentUnit: 4, lineWrapping: true, styleActiveLine: true, autofocus: true }); cm.on('focus', function() { $el.addClass("focus"); }); cm.on('blur', function() { $el.removeClass("focus"); }); cm.on('change', function() { _this.not_saved = true; }); window.addEventListener('beforeunload', function(e) { if (_this.not_saved) { var returnValue = "You have not saved changes."; (e || window.event).returnValue = returnValue; return returnValue; } }); }, python_editor_replace_selection: function(content) { this.python_editor.getDoc().replaceSelection(content); }, auto_format: function(cm) { var pos = cm.getCursor(true); CodeMirror.commands.selectAll(cm); cm.autoFormatRange(cm.getCursor(true), cm.getCursor(false)); cm.setCursor(pos); }, format_string: function(value, mode) { var div = document.createElement('div'); var cm = CodeMirror(div, { value: value, mode: mode }); this.auto_format(cm); return cm.getDoc().getValue(); }, init_task_editor: function($el) { var cm = this.task_editor = CodeMirror($el[0], { value: task_content, mode: "application/json", indentUnit: 2, lineWrapping: true, styleActiveLine: true, lint: true }); this.auto_format(cm); 
cm.getDoc().clearHistory(); cm.on('focus', function() { $el.addClass("focus"); }); cm.on('blur', function() { $el.removeClass("focus"); }); }, bind_debug_tabs: function() { var _this = this; $('#tab-control > li[data-id]').on('click', function() { $('#tab-control > li[data-id]').removeClass('active'); var name = $(this).addClass('active').data('id'); $('#debug-tabs .tab').hide(); $('#debug-tabs #'+name).show(); }); $("#tab-control li[data-id=tab-html]").on('click', function() { if (!!!$("#tab-html").data("format")) { var html_styled = ""; CodeMirror.runMode(_this.format_string($("#tab-html pre").text(), 'text/html'), 'text/html', function(text, classname) { if (classname) html_styled += ''+escape(text)+''; else html_styled += escape(text); }); $("#tab-html pre").html(html_styled); $("#tab-html").data("format", true); } }); }, bind_run: function() { var _this = this; $('#run-task-btn').on('click', function() { _this.run(); }); $('#undo-btn').on('click', function(ev) { _this.task_editor.execCommand('undo'); }); $('#redo-btn').on('click', function(ev) { _this.task_editor.execCommand('redo'); }); }, bind_save: function() { var _this = this; $('#save-task-btn').on('click', function() { var script = _this.python_editor.getDoc().getValue(); $('#right-area .overlay').show(); $.ajax({ type: "POST", url: location.pathname+'/save', data: { script: script }, success: function(data) { console.log(data); _this.python_log(''); _this.python_log("saved!"); _this.not_saved = false; $('#right-area .overlay').hide(); }, error: function(xhr, textStatus, errorThrown) { console.log(xhr, textStatus, errorThrown); _this.python_log("save error!\n"+xhr.responseText); $('#right-area .overlay').hide(); } }); }); }, bind_follows: function() { var _this = this; $('.newtask').on('click', function() { if ($(this).next().hasClass("task-show")) { $(this).next().remove(); return; } var task = $(this).after('
    ').data("task"); task = JSON.stringify(window.newtasks[task], null, ' '); CodeMirror.runMode(task, 'application/json', $(this).next().find('pre')[0]); }); $('.newtask .task-run').on('click', function(event) { event.preventDefault(); event.stopPropagation(); let task_id = $(this).parents('.newtask').data("task"); let task = window.newtasks[task_id]; _this.task_editor.setValue(JSON.stringify(task, null, ' ')); _this.task_updated(task); _this.run(); }); }, task_updated: function task_updated(task) { $('#history-wrap').hide(); if (task.project && task.taskid) { $.ajax({ url: `/task/${task.project}:${task.taskid}.json`, success: (data) => { if (!data.code && !data.error) { $('#history-link').attr('href', `/task/${task.project}:${task.taskid}`).text(`status: ${data.status_string}`); $('#history-wrap').show(); } } }) } }, bind_others: function() { var _this = this; $('#python-log-show').on('click', function() { if ($('#python-log pre').is(":visible")) { $('#python-log pre').hide(); $(this).height(8); } else { $('#python-log pre').show(); $(this).height(0); } }); $('.webdav-btn').on('click', function() { _this.toggle_webdav_mode(this); }) }, render_html: function(html, base_url, block_script=true, block_iframe=true) { if (html === undefined) { html = ''; } let dom = (new DOMParser()).parseFromString(html, "text/html"); $(dom).find('base').remove(); $(dom).find('head').prepend(''); $(dom).find('base').attr('href', base_url); if (block_script) { $(dom).find('script').attr('type', 'text/plain'); } if (block_iframe) { $(dom).find('iframe[src]').each((i, e) => { e = $(e); e.attr('__src', e.attr('src')) e.attr('src', encodeURI('data:text/html;,

    iframe blocked

    ')); }); } return dom.documentElement.innerHTML; }, run: function() { var script = this.python_editor.getDoc().getValue(); var task = this.task_editor.getDoc().getValue(); var _this = this; // reset SelectorHelper.clear(); $("#tab-web .iframe-box").html(''); $("#tab-html pre").html(''); $('#tab-follows').html(''); $("#tab-control li[data-id=tab-follows] .num").hide(); $('#python-log').hide(); $('#left-area .overlay').show(); $.ajax({ type: "POST", url: location.pathname+'/run', data: { webdav_mode: _this.webdav_mode, script: _this.webdav_mode ? '' : script, task: task }, success: function(data) { console.log(data); $('#left-area .overlay').hide(); //web $("#tab-web .iframe-box").html(''); const iframe = $("#tab-web iframe")[0]; const content_type = data.fetch_result.headers && data.fetch_result.headers['Content-Type'] && data.fetch_result.headers['Content-Type'] || "text/plain"; //html $("#tab-html pre").text(data.fetch_result.content); $("#tab-html").data("format", true); let iframe_content = null; if (content_type.indexOf('application/json') == 0) { try { let content = JSON.parse(data.fetch_result.content); content = JSON.stringify(content, null, ' '); content = "
    "+content+"
    "; iframe_content = _this.render_html(content, data.fetch_result.url, true, true, false); } catch (e) { iframe_content = "data:,Content-Type:"+content_type+" parse error."; } } else if (content_type.indexOf("text/html") == 0) { $("#tab-html").data("format", false); iframe_content = _this.render_html(data.fetch_result.content, data.fetch_result.url, true, true, false); } else if (content_type.indexOf("text") == 0) { iframe_content = "data:"+content_type+","+data.fetch_result.content; } else if (data.fetch_result.dataurl) { iframe_content = data.fetch_result.dataurl } else { iframe_content = "data:,Content-Type:"+content_type; } const doc = iframe.contentDocument; doc.open("text/html", "replace"); doc.write(iframe_content) doc.close(); doc.onreadystatechange = () => { if (doc.readyState === 'complete') { $("#tab-web iframe").height(doc.body.scrollHeight + 60); } }; //follows $('#tab-follows').html(''); var elem = $("#tab-control li[data-id=tab-follows] .num"); var newtask_template = '
    __callback__ > __url__
    '; if (data.follows.length > 0) { elem.text(data.follows.length).show(); var all_content = ""; window.newtasks = {}; $.each(data.follows, function(i, task) { var callback = task.process; callback = callback && callback.callback || '__call__'; var content = newtask_template.replace('__callback__', callback); content = content.replace('__url__', task.url || 'no_url!'); all_content += content.replace('__task__', i); window.newtasks[i] = task; }); $('#tab-follows').append(all_content); _this.bind_follows(); } else { elem.hide(); } //messages $('#tab-messages pre').html(''); if (data.messages.length > 0) { $("#tab-control li[data-id=tab-messages] .num").text(data.messages.length).show(); var messages = JSON.stringify(data.messages, null, ' '); CodeMirror.runMode(messages, 'application/json', $('#tab-messages pre')[0]); $('#tab-messages')[0] } else { $("#tab-control li[data-id=tab-messages] .num").hide(); } $("#tab-control li.active").click(); // logs _this.python_log(data.logs); }, error: function(xhr, textStatus, errorThrown) { console.log(xhr, textStatus, errorThrown); _this.python_log('error: '+textStatus); $('#left-area .overlay').hide(); } }); }, python_log: function(text) { if (text) { $('#python-log pre').text(text); $('#python-log pre, #python-log').show(); $('#python-log-show').height(0); } else { $('#python-log pre, #python-log').hide(); } }, webdav_mode: false, toggle_webdav_mode: function(button) { if (!this.webdav_mode) { if (this.not_saved) { if (!confirm("You have not saved changes. 
Ignore changes and switch to WebDav mode.")) { return; } this.not_saved = false; } this.python_editor_elem.hide(); this.splitter.trigger('fullsize', 'prev'); $(button).addClass('active'); this.webdav_mode = !this.webdav_mode; } else { // leaving webdav mode, reload script var _this = this; $.ajax({ type: "GET", url: location.pathname + '/get', success: function (data) { _this.splitter.trigger('init'); _this.python_editor_elem.show(); _this.python_editor.setValue(data.script); _this.not_saved = false; $(button).removeClass('active'); _this.webdav_mode = !_this.webdav_mode; }, error: function() { alert('Loading script from database error. Script may out-of-date.'); _this.python_editor_elem.show(); _this.splitter.trigger('init'); $(button).removeClass('active'); _this.webdav_mode = !_this.webdav_mode; }, }); } }, }; })(); Debugger.init(); ================================================ FILE: pyspider/webui/static/src/debug.less ================================================ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-02-23 00:28:30 */ @import "variable"; body { margin: 0; padding: 0; height: 100%; overflow: hidden; } .warning { color: @orange; } .error { color: @red; } @control-height: 35px; #control { z-index: 9999; min-width: 760px; width: 100%; height: @control-height; position: fixed; left: 0; right: 0; background-color: @gray-lighter; box-shadow: 0px 1px 2px @gray-light; div { line-height: 35px; margin-left: 10px; margin-right: 10px; } .webdav-btn { position: relative; float: right; padding: 1px 7px 0 7px; line-height: 21px; border-radius: 5px; border: solid 1px @blue; background: white; color: @blue; cursor: pointer; margin: 6px 0 0 10px; &:hover { background: lighten(@blue, 10%); color: white; } &.active { background: @blue; color: white; } } } #editarea { width: 100%; position: fixed; top: @control-height + 2px; left: 0; right: 0; bottom: 0; //debug } .debug-panel { position: absolute; 
top: 0; left: 0; right: 0; bottom: 0; } .resize { background-color: @gray; cursor: ew-resize; &:hover + .debug-panel { border-left: dashed 1px @gray !important; } } .overlay { position: absolute; top: 0; bottom: 0; left: 0; right: 0; z-index: 9999; background: rgba(0, 0, 0, 40%); } .focus .CodeMirror-activeline-background { background: #e8f2ff !important; } .CodeMirror-activeline-background { background: transparent !important; } #task-panel { height: 100%; overflow-x: auto; } .right-top-btn(@color: @green) { z-index: 99; position: absolute; top: 0; right: 0; background: @color; border-radius: 0 0 0 5px; color: white; margin: 0; padding: 3px 7px 5px 10px; cursor: pointer; font-weight: bold; line-height: 15px; &:hover { background: darken(@color, 10%); } } #run-task-btn { .right-top-btn(@color: @green); } #undo-redo-btn-group { @color: lighten(@green, 15%); .right-top-btn(@color: @color); top: auto; bottom: 0; border-radius: 5px 0 0 0; padding: 5px 0 3px 0; /*box-shadow: 0px 0px 30px @color;*/ overflow: hidden; &:hover { background: @color; } a { color: white; text-decoration: none; padding: 5px 7px 3px 10px; &:hover { background: darken(@color, 10%); } } } #save-task-btn { .right-top-btn(@color: @blue); } #task-editor { position: relative; .CodeMirror { height: auto; padding-bottom: 3px; background: lighten(@green, 30%); } .CodeMirror-scroll { overflow-x: auto; overflow-y: hidden; } &.focus .CodeMirror-activeline-background { background: lighten(@green, 40%) !important; } } #tab-control { list-style-type: none; position: absolute; bottom: 0; right: 0; margin: 8px 20px; padding: 0; li { position: relative; float: right; padding: 1px 7px 0 7px; line-height: 21px; margin-left: 10px; border-radius: 5px; border: solid 1px @blue; background: white; color: @blue; cursor: pointer; &:hover { background: lighten(@blue, 10%); color: white; } &.active { background: @blue; color: white; } span { position: absolute; top: -5px; right: -10px; background: @red; color: white; 
font-size: 80%; font-weight: bold; padding: 2px 5px 0 5px; border-radius: 10px; } } } #debug-tabs { margin-bottom: 45px; } #tab-web { &.fixed { padding-top: 24px; } iframe { border-width: 0; width: 100%; } } #tab-html { margin: 0; padding: 7px 5px; pre { margin: 0; padding: 0; } } #tab-follows { .newtask { position: relative; height: 30px; line-height: 30px; background: lighten(@orange, 30%); border-bottom: solid 1px @orange; border-top: solid 1px @orange; margin-top: -1px; padding-left: 5px; padding-right: 70px; overflow: hidden; white-space: nowrap; text-overflow: ellipsis; cursor: pointer; &:hover { background: lighten(@orange, 20%); .task-more { background: lighten(@orange, 20%); } } .task-callback { color: darken(@orange, 10%); } .task-url { font-size: 95%; text-decoration: underline; font-weight: lighter; color: @blue; } .task-more { position: absolute; right: 33px; top: 0px; float: right; color: @orange; padding: 0 10px; background: lighten(@orange, 30%); border-radius: 10px; } .task-run { position: absolute; right: 0; top: 0; font-size: 80%; padding: 0 10px 0 30px; float: right; border-bottom: solid 1px lighten(@green, 20%); border-top: solid 1px lighten(@green, 20%); background: lighten(@green, 10%); color: white; text-shadow: 0 0 10px white; font-weight: bold; &:hover { background: @green; } } } .task-show { pre { margin: 5px 5px 10px 5px; } } } #python-editor { position: absolute; top: 0; width: 100%; bottom: 0; .CodeMirror { height: 100%; padding-bottom: 20px; } } #python-log { width: 100%; min-height: 10px; max-height: 40%; background: rgba(0, 0, 0, 60%); overflow: auto; #python-log-show { z-index: 89; width: auto; padding-top: 5px; background: @red; box-shadow: 0 2px 20px @red; cursor: pointer; } pre { margin: 0; padding: 10px 10px; color: white; } } #css-selector-helper { background-color: @gray-lighter; padding: 0; width: 100%; height: 24px; text-align: right; white-space: nowrap; &.fixed { position: absolute; top: 0; } button { line-height: 16px; 
vertical-align: 2px; } } span.element { position: relative; height: 24px; display: inline-block; padding: 0 0.2em; cursor: pointer; color: lighten(@gray, 35%); z-index: 99999; &.invalid { display: none; } &.selected { color: black; } &:hover { background-color: darken(@gray-lighter, 15%); & > ul { display: block; } } & > ul { display: none; margin: 0; padding: 0; position: absolute; top: 24px; left: 0; background-color: @gray-lighter; border: 1px solid black; border-top-width: 0; color: lighten(@gray, 35%); & > li { display: block; text-align: left; white-space: nowrap; padding: 0 4px; &.selected { color: black; } &:hover { background-color: darken(@gray-lighter, 15%); } } } } .copy-selector-input { height: 24px; padding: 0; border: 0; margin: 0; padding-right: 0.2em; font-size: 1em; text-align: right; width: 100%; margin-left: -100px; background: @gray-lighter; } ================================================ FILE: pyspider/webui/static/src/index.js ================================================ // vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2014-03-02 17:53:23 import "./index.less"; $(function() { //$("input[name=start-urls]").on('keydown', function(ev) { //if (ev.keyCode == 13) { //var value = $(this).val(); //var textarea = $('').replaceAll(this); //textarea.val(value).focus(); //} //}); function init_editable(projects_app) { $(".project-group>span").editable({ name: 'group', pk: function(e) { return $(this).parents('tr').data("name"); }, emptytext: '[group]', placement: 'right', url: "/update", success: function(response, value) { var project_name = $(this).parents('tr').data("name"); projects_app.projects[project_name].group = value; $(this).attr('style', ''); } }); $(".project-status>span").editable({ type: 'select', name: 'status', source: [ {value: 'TODO', text: 'TODO'}, {value: 'STOP', text: 'STOP'}, {value: 'CHECKING', text: 'CHECKING'}, {value: 'DEBUG', text: 'DEBUG'}, {value: 'RUNNING', text: 
'RUNNING'} ], pk: function(e) { return $(this).parents('tr').data("name"); }, emptytext: '[status]', placement: 'right', url: "/update", success: function(response, value) { var project_name = $(this).parents('tr').data("name"); projects_app.projects[project_name].status = value; $(this).removeClass('status-'+$(this).attr('data-value')).addClass('status-'+value).attr('data-value', value).attr('style', ''); } }); $(".project-rate>span").editable({ name: 'rate', pk: function(e) { return $(this).parents('tr').data("name"); }, validate: function(value) { var s = value.split('/'); if (s.length != 2) return "format error: rate/burst"; if (!$.isNumeric(s[0]) || !$.isNumeric(s[1])) return "format error: rate/burst"; }, highlight: false, emptytext: '0/0', placement: 'right', url: "/update", success: function(response, value) { var project_name = $(this).parents('tr').data("name"); var s = value.split('/'); projects_app.projects[project_name].rate = parseFloat(s[0]); projects_app.projects[project_name].burst = parseFloat(s[1]); $(this).attr('style', ''); } }); } function init_sortable() { // table sortable Sortable.getColumnType = function(table, i) { var type = $($(table).find('th').get(i)).data('type'); if (type == "num") { return Sortable.types.numeric; } else if (type == "date") { return Sortable.types.date; } return Sortable.types.alpha; }; $('table.projects').attr('data-sortable', true); Sortable.init(); } $("#create-project-modal form").on('submit', function(ev) { var $this = $(this); var project_name = $this.find('[name=project-name]').val() if (project_name.length == 0 || project_name.search(/[^\w]/) != -1) { $this.find('[name=project-name]').parents('.form-group').addClass('has-error'); $this.find('[name=project-name] ~ .help-block').show(); return false; } var mode = $this.find('[name=script-mode]:checked').val(); $this.attr('action', '/debug/'+project_name); return true; }); function update_counters() { $.get('/counter', function(data) { for (let project in data) 
{ var info = data[project]; if (projects_app.projects[project] === undefined) continue; // data inject var types = "5m,1h,1d,all".split(','); for (let type of types) { var d = info[type]; if (d === undefined) continue; var pending = d.pending || 0, success = d.success || 0, retry = d.retry || 0, failed = d.failed || 0, sum = d.task || pending + success + retry + failed; d.task = sum; d.title = ""+type+" of "+sum+" tasks:\n" +(type == "all" ? "pending("+(pending/sum*100).toFixed(1)+"%): \t"+pending+"\n" : "new("+(pending/sum*100).toFixed(1)+"%): \t\t"+pending+"\n") +"success("+(success/sum*100).toFixed(1)+"%): \t"+success+"\n" +"retry("+(retry/sum*100).toFixed(1)+"%): \t"+retry+"\n" +"failed("+(failed/sum*100).toFixed(1)+"%): \t"+failed; } projects_app.projects[project].paused = info['paused']; projects_app.projects[project].time = info['5m_time']; projects_app.projects[project].progress = info; } }); } function update_queues() { $.get('/queues', function(data) { //console.log(data); $('.queue_value').each(function(i, e) { var attr = $(e).attr('title'); if (data[attr] !== undefined) { $(e).text(data[attr]); } else { $(e).text('???'); } }); }); } // projects vue var projects_map = {}; projects.forEach(function(p) { p.paused = false; p.time = {}; p.progress = {}; projects_map[p.name] = p; }); var projects_app = new Vue({ el: '.projects', data: { projects: projects_map }, ready: function() { init_editable(this); init_sortable(this); update_counters(); window.setInterval(update_counters, 15*1000); update_queues(); window.setInterval(update_queues, 15*1000); }, methods: { project_run: function(project, event) { $("#need-set-status-alert").hide(); if (project.status != "RUNNING" && project.status != "DEBUG") { $("#need-set-status-alert").show(); } var _this = event.target; $(_this).addClass("btn-warning"); $.ajax({ type: "POST", url: '/run', data: { project: project.name }, success: function(data) { $(_this).removeClass("btn-warning"); if (!data.result) { 
$(_this).addClass("btn-danger"); } }, error: function() { $(_this).removeClass("btn-warning").addClass("btn-danger"); } }); } } }); }); ================================================ FILE: pyspider/webui/static/src/index.less ================================================ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-02-23 00:28:30 */ @import "variable"; h1 { margin-top: 5px; } header .alert { position: absolute;; width: 50rem; left: 50%; margin-left: -25rem; } .queue-info { th, td { text-align: center; border: 1px solid #ddd; } } [v-cloak] { display: none; } .projects { min-width: 850px; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd; .project-group { width: 80px; } .project-name { font-weight: bold; } .project-status { width: 100px; } .project-status-span(@color) { border: solid 1px darken(@color, 10%); padding: 1px 5px 0 5px; background: @color; color: white; } .project-status>span { .project-status-span(@gray-light); } span.status-TODO { .project-status-span(@orange); } span.status-STOP { .project-status-span(@red); } span.status-CHECKING { .project-status-span(darken(@yellow, 10%)); } span.status-DEBUG { .project-status-span(@blue); } span.status-RUNNING { .project-status-span(@green); } span.status-PAUSED { .project-status-span(@gray); } .project-rate { width: 110px; } .project-time { width: 110px; } th.project-progress { position: relative; span { position: absolute; } } td.project-progress { position: relative; min-width: 5%; &.progress-all { min-width: 10%; } .progress { position: relative; margin: 0; background-color: #aaa; .progress-text { width: 100%; text-align: center; position: absolute; font-weight: bold; color: #fff; pointer-events: none; } .progress-bar { -webkit-transition: none; transition: none; } } } .project-actions { width: 200px; } } .global-btn { margin-top: -5px; padding: 10px 10px 10px 10px; .create-btn-div { float: right; } .active-btn-div { float: left; 
} } ================================================ FILE: pyspider/webui/static/src/result.less ================================================ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-10-22 22:38:45 */ @import "variable"; .top-bar { padding: 10px 15px 2px 15px; height: 46px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; position: relative; h1 { margin: 0 0 10px 0; font-size: 18px; } .btn-group { margin: 8px 10px 0 0; position: absolute; right: 0; top: 0; a.btn { } } } .pagination-wrap { text-align: right; padding-right: 15px; } table { border-bottom: 1px solid #ddd; td { word-break: break-all; } } ================================================ FILE: pyspider/webui/static/src/splitter.js ================================================ // vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: // Author: Binux // http://binux.me // Created on 2014-02-23 01:35:35 // from: https://github.com/jsbin/jsbin $.fn.splitter = function (_type) { var $document = $(document), $blocker = $('
    '), $body = $('body'); // blockiframe = $blocker.find('iframe')[0]; var splitterSettings = JSON.parse(localStorage.getItem('splitterSettings') || '[]'); return this.each(function () { var $el = $(this), $originalContainer = $(this), guid = $.fn.splitter.guid++, $parent = $el.parent(), type = _type || 'x', $prev = type === 'x' ? $el.prevAll(':visible:first') : $el.nextAll(':visible:first'), $handle = $('
    '), dragging = false, width = $parent.width(), parentOffset = $parent.offset(), left = parentOffset.left, top = parentOffset.top, // usually zero :( props = { x: { display: 'block', currentPos: $parent.offset().left, multiplier: 1, cssProp: 'left', otherCssProp: 'right', size: $parent.width(), sizeProp: 'width', moveProp: 'pageX', init: { top: 0, bottom: 0, width: 8, 'margin-left': '-4px', height: '100%', left: 'auto', right: 'auto', opacity: 0, position: 'absolute', cursor: 'ew-resize', // 'border-top': '0', 'border-left': '1px solid rgba(218, 218, 218, 0.5)', 'z-index': 99999 } }, y: { display: 'block', currentPos: $parent.offset().top, multiplier: -1, size: $parent.height(), cssProp: 'bottom', otherCssProp: 'top', sizeProp: 'height', moveProp: 'pageY', init: { top: 'auto', cursor: 'ns-resize', bottom: 'auto', height: 8, width: '100%', left: 0, right: 0, opacity: 0, position: 'absolute', border: 0, // 'border-top': '1px solid rgba(218, 218, 218, 0.5)', 'z-index': 99999 } } }, refreshTimer = null, settings = splitterSettings[guid] || {}; var tracker = { down: { x: null, y: null }, delta: { x: null, y: null }, track: false, timer: null }; $handle.bind('mousedown', function (event) { tracker.down.x = event.pageX; tracker.down.y = event.pageY; tracker.delta = { x: null, y: null }; tracker.target = $handle[type == 'x' ? 'height' : 'width']() * 0.25; }); $document.bind('mousemove', function (event) { if (dragging) { tracker.delta.x = tracker.down.x - event.pageX; tracker.delta.y = tracker.down.y - event.pageY; clearTimeout(tracker.timer); tracker.timer = setTimeout(function () { tracker.down.x = event.pageX; tracker.down.y = event.pageY; }, 250); //disable change to y //var targetType = type == 'x' ? 
'y' : 'x'; //if (Math.abs(tracker.delta[targetType]) > tracker.target) { //$handle.trigger('change', targetType, event[props[targetType].moveProp]); //tracker.down.x = event.pageX; //tracker.down.y = event.pageY; //} } }); function moveSplitter(pos) { if (type === 'y') { pos -= top; } var v = pos - props[type].currentPos, split = 100 / props[type].size * v, delta = (pos - settings[type]) * props[type].multiplier, prevSize = $prev[props[type].sizeProp](), elSize = $el[props[type].sizeProp](); if (type === 'y') { split = 100 - split; } // if prev panel is too small and delta is negative, block if (prevSize < 100 && delta < 0) { // ignore } else if (elSize < 100 && delta > 0) { // ignore } else { // allow sizing to happen $el.css(props[type].cssProp, split + '%'); $prev.css(props[type].otherCssProp, (100 - split) + '%'); var css = {}; css[props[type].cssProp] = split + '%'; $handle.css(css); settings[type] = pos; splitterSettings[guid] = settings; localStorage.setItem('splitterSettings', JSON.stringify(splitterSettings)); // wait until animations have completed! if (moveSplitter.timer) clearTimeout(moveSplitter.timer); moveSplitter.timer = setTimeout(function () { $document.trigger('sizeeditors'); }, 120); } } function resetPrev() { $prev = type === 'x' ? 
$handle.prevAll(':visible:first') : $handle.nextAll(':visible:first'); } $document.bind('mouseup touchend', function () { if (dragging) { dragging = false; $handle.trigger('resize-end'); $blocker.remove(); // $handle.css( 'opacity', '0'); $body.removeClass('dragging'); } }).bind('mousemove touchmove', function (event) { if (dragging) { moveSplitter(event[props[type].moveProp] || event.originalEvent.touches[0][props[type].moveProp]); } }); $blocker.bind('mousemove touchmove', function (event) { if (dragging) { moveSplitter(event[props[type].moveProp] || event.originalEvent.touches[0][props[type].moveProp]); } }); $handle.bind('mousedown touchstart', function (e) { dragging = true; $handle.trigger('resize-start'); $body.append($blocker).addClass('dragging'); props[type].size = $parent[props[type].sizeProp](); props[type].currentPos = 0; // is this really required then? resetPrev(); e.preventDefault(); }); /* .hover(function () { $handle.css('opacity', '1'); }, function () { if (!dragging) { $handle.css('opacity', '0'); } }) */ $handle.bind('fullsize', function(event, panel) { if (panel === undefined) { panel = 'prev'; } var split = 0; if (panel === 'prev') { split = 100; } $el.css(props[type].cssProp, split + '%'); $prev.css(props[type].otherCssProp, (100 - split) + '%'); $handle.hide(); }); $handle.bind('init', function (event, x) { $handle.css(props[type].init); props[type].size = $parent[props[type].sizeProp](); resetPrev(); // can only be read at init top = $parent.offset().top; $blocker.css('cursor', type == 'x' ? 
'ew-resize' : 'ns-resize'); if (type == 'y') { $el.css('border-right', 0); $prev.css('border-left', 0); $prev.css('border-top', '2px solid #ccc'); } else { // $el.css('border-right', '1px solid #ccc'); $el.css('border-top', 0); // $prev.css('border-right', '2px solid #ccc'); } if ($el.is(':hidden')) { $handle.hide(); } else { if ($prev.length) { $el.css('border-' + props[type].cssProp, '1px solid #ccc'); } else { $el.css('border-' + props[type].cssProp, '0'); } moveSplitter(x !== undefined ? x : settings[type] || $el.offset()[props[type].cssProp]); } }); //.trigger('init', settings.x || $el.offset().left); $handle.bind('change', function (event, toType, value) { $el.css(props[type].cssProp, '0'); $prev.css(props[type].otherCssProp, '0'); $el.css('border-' + props[type].cssProp, '0'); if (toType === 'y') { // 1. drop inside of a new div that encompases the elements $el = $el.find('> *'); $handle.appendTo($prev); $el.appendTo($prev); $prev.css('height', '100%'); $originalContainer.hide(); $handle.css('margin-left', 0); $handle.css('margin-top', 5); $handle.addClass('vertical'); delete settings.x; $originalContainer.nextAll(':visible:first').trigger('init'); // 2. 
change splitter to the right to point to new block div } else { $el = $prev; $prev = $tmp; $el.appendTo($originalContainer); $handle.insertBefore($originalContainer); $handle.removeClass('vertical'); $el.css('border-top', 0); $el = $originalContainer; $originalContainer.show(); $handle.css('margin-top', 0); $handle.css('margin-left', -4); delete settings.y; setTimeout(function() { $originalContainer.nextAll(':visible:first').trigger('init'); }, 0); } resetPrev(); type = toType; // if (type == 'y') { // FIXME $prev should check visible var $tmp = $el; $el = $prev; $prev = $tmp; // } else { // } $el.css(props[type].otherCssProp, '0'); $prev.css(props[type].cssProp, '0'); // TODO // reset top/bottom positions // reset left/right positions if ($el.is(':visible')) { // find all other handles and recalc their height if (type === 'y') { var otherhandles = $el.find('.resize'); otherhandles.each(function (i) { // find the top of the var $h = $(this); if (this === $handle[0]) { // ignore } else { // TODO change to real px :( $h.trigger('init', 100 / (otherhandles - i - 1)); } }); } $handle.trigger('init', value || $el.offset()[props[type].cssProp] || props[type].size / 2); } }); $prev.css('width', 'auto'); $prev.css('height', 'auto'); $el.data('splitter', $handle); $el.before($handle); // if (settings.y) { // $handle.trigger('change', 'y'); // } }); }; $.fn.splitter.guid = 0; ================================================ FILE: pyspider/webui/static/src/task.less ================================================ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-16 19:20:30 */ @import "variable"; .base-info { padding: 10px 15px 2px 15px; background-color: #f5f5f5; border-bottom: 1px solid #ddd; } .more-info { padding: 10px 15px; } .more-info dd { display: block; font-family: monospace; white-space: pre; word-break: break-all; word-wrap: break-word; margin: 1em 0px; } .status_mix(@color: lighten(black, 50%)) { 
border: solid 1px darken(@color, 10%); padding: 1px 5px 0 5px; background: @color; color: white; } .status { &-1 { .status_mix(@blue); } &-2 { .status_mix(@green); } &-3 { .status_mix(@red); } &-4 { .status_mix; } } .url { font-size: 120%; text-decoration: underline; } .callback { color: @orange; font-weight: bold; &:hover, &:focus { color: darken(@orange, 10%); } } dt .glyphicon-ok { color: @green; } dt .glyphicon-remove { color: @red; } ================================================ FILE: pyspider/webui/static/src/tasks.less ================================================ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-18 23:20:46 */ @import "variable"; @import "task"; .tasks { margin: 0; padding: 0; list-style-type: none; li { .base-info; &:nth-child(even) { background-color: white; } } .url { display: inline-block; vertical-align: bottom; max-width: 40em; overflow: hidden; white-space: nowrap; text-overflow: ellipsis; } .update-time { font-weight: bold; } } ================================================ FILE: pyspider/webui/static/src/variable.less ================================================ /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ /* Author: Binux */ /* http://binux.me */ /* Created on 2014-07-16 19:18:30 */ // colors @gray-darker: lighten(#000, 13.5%); // #222 @gray-dark: lighten(#000, 20%); // #333 @gray: lighten(#000, 33.5%); // #555 @gray-light: lighten(#000, 60%); // #999 @gray-lighter: lighten(#000, 93.5%); // #eee @blue: #428bca; @green: #5cb85c; @blue-light: #5bc0de; @orange: #f0ad4e; @yellow: #ffe543; @red: #d9534f; ================================================ FILE: pyspider/webui/static/webpack.config.js ================================================ var webpack = require("webpack"); var ExtractTextPlugin = require("extract-text-webpack-plugin"); module.exports = { entry: { index: "./src/index", debug: "./src/debug", result: "./src/result.less", task: 
"./src/task.less", tasks: "./src/tasks.less", }, output: { //path: "./dist", filename: "[name].min.js" }, module: { loaders: [ { test: /\.js$/, loader: "babel-loader" }, { test: /\.less$/, loader: ExtractTextPlugin.extract("style-loader", "css-loader?sourceMap!less-loader?sourceMap") } ] }, devtool: 'source-map', plugins: [ new ExtractTextPlugin("[name].min.css"), new webpack.optimize.UglifyJsPlugin({ compress: { warnings: false } }), ] } ================================================ FILE: pyspider/webui/task.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-07-16 15:30:57 import socket from flask import abort, render_template, request, json from pyspider.libs import utils from .app import app @app.route('/task/') def task(taskid): if ':' not in taskid: abort(400) project, taskid = taskid.split(':', 1) taskdb = app.config['taskdb'] task = taskdb.get_task(project, taskid) if not task: abort(404) resultdb = app.config['resultdb'] result = {} if resultdb: result = resultdb.get(project, taskid) return render_template("task.html", task=task, json=json, result=result, status_to_string=app.config['taskdb'].status_to_string) @app.route('/task/.json') def task_in_json(taskid): if ':' not in taskid: return json.jsonify({'code': 400, 'error': 'bad project:task_id format'}) project, taskid = taskid.split(':', 1) taskdb = app.config['taskdb'] task = taskdb.get_task(project, taskid) if not task: return json.jsonify({'code': 404, 'error': 'not found'}) task['status_string'] = app.config['taskdb'].status_to_string(task['status']) return json.jsonify(task) @app.route('/tasks') def tasks(): rpc = app.config['scheduler_rpc'] taskdb = app.config['taskdb'] project = request.args.get('project', "") limit = int(request.args.get('limit', 100)) try: updatetime_tasks = rpc.get_active_tasks(project, limit) except socket.error as e: 
app.logger.warning('connect to scheduler rpc error: %r', e) return 'connect to scheduler error', 502 tasks = {} result = [] for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]): key = '%(project)s:%(taskid)s' % task task['updatetime'] = updatetime if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE: result.append(tasks[key]) tasks[key] = task result.extend(tasks.values()) return render_template( "tasks.html", tasks=result, status_to_string=taskdb.status_to_string ) @app.route('/active_tasks') def active_tasks(): rpc = app.config['scheduler_rpc'] taskdb = app.config['taskdb'] project = request.args.get('project', "") limit = int(request.args.get('limit', 100)) try: tasks = rpc.get_active_tasks(project, limit) except socket.error as e: app.logger.warning('connect to scheduler rpc error: %r', e) return '{}', 502, {'Content-Type': 'application/json'} result = [] for updatetime, task in tasks: task['updatetime'] = updatetime task['updatetime_text'] = utils.format_date(updatetime) if 'status' in task: task['status_text'] = taskdb.status_to_string(task['status']) result.append(task) return json.dumps(result), 200, {'Content-Type': 'application/json'} app.template_filter('format_date')(utils.format_date) ================================================ FILE: pyspider/webui/templates/debug.html ================================================ {{ project_name }} - Debugger - pyspider
    pyspider > {{ project_name }}
    Documentation WebDAV Mode
    {#
    __callback__ > __url__
    #}
    • messages
    • follows
    • html
    • web
    • enable css selector helper
    save
    ================================================ FILE: pyspider/webui/templates/index.html ================================================ Dashboard - pyspider

    pyspider dashboard

    scheduler ??? fetcher ??? processor ??? result_worker
    ??? + ???
    {% if config.scheduler_rpc is not none %} Recent Active Tasks {% endif %}
    {% raw %} {% endraw %} {% raw %}
    group project name status rate/burst avg time  progress               actions
    {{ project.group }} {{* project.name }} {{ project.paused ? 'PAUSED' : project.status }} {{ project.rate }}/{{ project.burst }} {{ (project.time.fetch_time * 1000).toFixed(1) }}+{{ (project.time.process_time * 1000).toFixed(2) }}
    {{* type }}: {{ project.progress[type].task }}
    {% endraw %} # if config.scheduler_rpc is not none: {% raw %} Active Tasks {% endraw %} # endif # if config.resultdb: {% raw %} Results {% endraw %} # endif
    ================================================ FILE: pyspider/webui/templates/result.html ================================================ Results - {{ project }} - pyspider

    {{ project }} - Results

    # set common_fields, results = result_formater(results) # for field in common_fields|sort # endfor # for result in results # for field in common_fields|sort # endfor # endfor
    url {{ field }} ...
    {{ result.url }} {{ json.dumps(result.result_formated[field], ensure_ascii=False) | truncate(100, True) }} {{ json.dumps(result.others, ensure_ascii=False) | truncate(100, True) }}
      # set current_page = int(offset/limit) + (1 if offset%limit else 0) # set count = count if count is not none else 0 # set total_page = int(count/limit) + (1 if count%limit else 0)
    • «
    • # set prev = 0 # for i in range(0, total_page): # if abs(i-0) < 2 or abs(i-total_page) < 3 or -2 < i-current_page < 5: # set prev = i
    • {{ i + 1 }}
    • # elif prev == i-1:
    • # endif # endfor
    • = total_page else "" }}"> »
    ================================================ FILE: pyspider/webui/templates/task.html ================================================ Task - {{ task.project }}:{{ task.taskid }} - pyspider

    {{ status_to_string(task.status) }} {{ task.project }}.{{ task.process.callback }} > {{ task.url }} {% if task.status in (2, 3, 4) %} ({{ task.lastcrawltime | format_date }} crawled ) {% else %} ({{ task.updatetime | format_date }} updated ) {% endif %}

    taskid
    {{ task.taskid }}
    lastcrawltime
    {{ task.lastcrawltime }} ({{ task.lastcrawltime | format_date }})
    updatetime
    {{ task.updatetime }} ({{ task.updatetime | format_date }})
    # if task.schedule and task.schedule.exetime
    exetime
    {{ task.schedule.exetime }} ({{ task.schedule.exetime | format_date }})
    # endif # if task.track and task.track.fetch
    track.fetch {{ (task.track.fetch.time * 1000) | round(2) }}ms
    {{ json.dumps(task.track.fetch, indent=2, ensure_ascii=False) }}
    # endif # if task.track and task.track.process
    track.process {{ (task.track.process.time * 1000) | round(2) }}ms # if task.track.process.follows +{{ task.track.process.follows | int }} # endif
    #- if task.track.process.exception {{- task.track.process.exception or '' }} # endif #- if task.track.process.logs {{- task.track.process.logs or '' }} # endif {{- json.dumps(task.track.process, indent=2, ensure_ascii=False) -}}
    # endif
    #- set not_shown_keys = ('status', 'url', 'project', 'taskid', 'lastcrawltime', 'updatetime', 'track', ) #- for key, value in task.items() if key not in not_shown_keys
    {{ key }}
    {{ json.dumps(value, indent=2, ensure_ascii=False) if value is mapping else value }}
    #- endfor
    # if result and result.get('result'):
    result
    {{ json.dumps(result['result'], indent=2, ensure_ascii=False) }}
    # endif
    ================================================ FILE: pyspider/webui/templates/tasks.html ================================================ Tasks - pyspider
      {% for task in tasks | sort(reverse=True, attribute='updatetime') %}
    1. {% if task.status %} {{ status_to_string(task.status) }} {% elif task.track %} {% set fetchok = task.track.fetch and task.track.fetch.ok %} {% set processok = task.track.process and task.track.process.ok %} {%- if not fetchok -%} FETCH_ERROR {%- elif not processok -%} PROCESS_ERROR {%- endif -%} {% else %} ERROR {% endif %} {{ task.project }} > {{ task.url }} {{ task.updatetime | format_date }} {% if task.track and task.track.fetch %} {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms {% endif %} {% if task.track and task.track.process %} +{{ task.track.process.follows | int }} {% endif %}
    2. {% endfor %}
================================================
FILE: pyspider/webui/webdav.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux
# http://binux.me
# Created on 2015-6-3 11:29

import os
import time
import base64
import six
from six import BytesIO
from wsgidav.wsgidav_app import DEFAULT_CONFIG, WsgiDAVApp
from wsgidav.dav_provider import DAVProvider, DAVCollection, DAVNonCollection
from wsgidav.dav_error import DAVError, HTTP_FORBIDDEN
from pyspider.libs.utils import utf8, text
from .app import app


def check_user(environ):
    """Return True when the request's credentials match the webui account.

    Reads ``HTTP_AUTHORIZATION`` from the WSGI ``environ``, drops the first
    ``len("Basic ")`` characters, base64-decodes the rest and splits it into
    ``username:password``.  Any decoding/splitting failure is logged and
    treated as "not authenticated".
    """
    authheader = environ.get("HTTP_AUTHORIZATION")
    if not authheader:
        return False
    # NOTE(review): the prefix is sliced off by length only; it is not checked
    # to actually be "Basic ".
    authheader = authheader[len("Basic "):]
    try:
        username, password = text(base64.b64decode(authheader)).split(':', 1)
    except Exception as e:
        app.logger.error('wrong api key: %r, %r', authheader, e)
        return False

    if username == app.config['webui_username'] \
            and password == app.config['webui_password']:
        return True
    else:
        return False


class ContentIO(BytesIO):
    """BytesIO that keeps its data in ``self.content`` once closed."""

    def close(self):
        # Snapshot the buffer first: getvalue() is not usable after close().
        self.content = self.getvalue()
        BytesIO.close(self)  #old class


class ScriptResource(DAVNonCollection):
    """WebDAV file resource backed by one project's script in projectdb."""

    def __init__(self, path, environ, app, project=None):
        super(ScriptResource, self).__init__(path, environ)

        self.app = app
        self.new_project = False
        self._project = project
        self.project_name = text(self.name)
        self.writebuffer = None
        # The project name is the resource name without its ".py" suffix.
        if self.project_name.endswith('.py'):
            self.project_name = self.project_name[:-len('.py')]

    @property
    def project(self):
        """Project dict for this resource, loaded lazily and cached.

        When projectdb has no such project, a fresh ``TODO`` project dict is
        prepared (and ``new_project`` is set) provided the name passes
        ``verify_project_name`` and the resource name ends with ``.py``;
        otherwise ``DAVError(HTTP_FORBIDDEN)`` is raised.
        """
        if self._project:
            return self._project
        projectdb = self.app.config['projectdb']
        if projectdb:
            self._project = projectdb.get(self.project_name)
        if not self._project:
            # NOTE(review): projectdb is dereferenced here even when it was
            # falsy above - confirm it is always configured on this path.
            if projectdb.verify_project_name(self.project_name) and text(self.name).endswith('.py'):
                self.new_project = True
                self._project = {
                    'name': self.project_name,
                    'script': '',
                    'status': 'TODO',
                    'rate': self.app.config.get('max_rate', 1),
                    'burst': self.app.config.get('max_burst', 3),
                    'updatetime': time.time(),
                }
            else:
                raise DAVError(HTTP_FORBIDDEN)
        return self._project

    @property
    def readonly(self):
        """True when the script must not be written through WebDAV.

        Read-only without a projectdb.  When the project's group contains
        ``lock`` and a webui username/password is configured, writing needs
        valid credentials (see ``check_user``).  Otherwise writable.
        """
        projectdb = self.app.config['projectdb']
        if not projectdb:
            return True
        if 'lock' in projectdb.split_group(self.project.get('group')) \
                and self.app.config.get('webui_username') \
                and self.app.config.get('webui_password'):
            return not check_user(self.environ)
        return False

    def getContentLength(self):
        """Length in bytes of the utf8-encoded script."""
        return len(utf8(self.project['script']))

    def getContentType(self):
        """Scripts are always served as ``text/plain``."""
        return 'text/plain'

    def getLastModified(self):
        """The project's ``updatetime`` value."""
        return self.project['updatetime']

    def getContent(self):
        """Return the utf8-encoded script wrapped in a BytesIO."""
        return BytesIO(utf8(self.project['script']))

    def beginWrite(self, contentType=None):
        """Start an upload; returns the buffer the new script is written to."""
        if self.readonly:
            self.app.logger.error('webdav.beginWrite readonly')
            # Delegate to the base class for the read-only case.
            return super(ScriptResource, self).beginWrite(contentType)
        self.writebuffer = ContentIO()
        return self.writebuffer

    def endWrite(self, withErrors):
        """Finish an upload and store the written script in projectdb.

        New projects are inserted; existing ones are updated.  A project whose
        status is ``DEBUG`` or ``RUNNING`` is switched to ``CHECKING``.
        """
        if withErrors:
            self.app.logger.error('webdav.endWrite error: %r', withErrors)
            return super(ScriptResource, self).endWrite(withErrors)
        if not self.writebuffer:
            return
        projectdb = self.app.config['projectdb']
        if not projectdb:
            return

        info = {
            # ``content`` is set by ContentIO.close(); fall back to '' if the
            # buffer was never closed.
            'script': text(getattr(self.writebuffer, 'content', ''))
        }
        if self.project.get('status') in ('DEBUG', 'RUNNING'):
            info['status'] = 'CHECKING'

        if self.new_project:
            self.project.update(info)
            self.new_project = False
            return projectdb.insert(self.project_name, self.project)
        else:
            return projectdb.update(self.project_name, info)


class RootCollection(DAVCollection):
    """WebDAV root folder listing every project as a ``<name>.py`` file."""

    def __init__(self, path, environ, app):
        super(RootCollection, self).__init__(path, environ)
        self.app = app
        self.projectdb = self.app.config['projectdb']

    def getMemberList(self):
        """Return a ScriptResource for every project in projectdb."""
        members = []
        for project in self.projectdb.get_all():
            project_name = project['name']
            if not project_name.endswith('.py'):
                project_name += '.py'
            native_path = os.path.join(self.path, project_name)
            # Path is coerced to text on Python 3 and to utf8 bytes otherwise.
            native_path = text(native_path) if six.PY3 else utf8(native_path)
            members.append(ScriptResource(
                native_path,
                self.environ,
                self.app,
                project
            ))
        return members

    def getMemberNames(self):
        """Return the utf8-encoded ``<name>.py`` file name of every project."""
        members = []
        for project in self.projectdb.get_all(fields=['name', ]):
            project_name = project['name']
            if not project_name.endswith('.py'):
                project_name += '.py'
            members.append(utf8(project_name))
        return members


class ScriptProvider(DAVProvider):
    """DAV provider mapping request paths to the root folder or a script."""

    def __init__(self, app):
        super(ScriptProvider, self).__init__()
        self.app = app

    def __repr__(self):
        return "pyspiderScriptProvider"

    def getResourceInst(self, path, environ):
        """Return RootCollection for the root path, else a ScriptResource."""
        # Normalise the path and force forward slashes.
        path = os.path.normpath(path).replace('\\', '/')
        if path in ('/', '.', ''):
            path = '/'
            return RootCollection(path, environ, self.app)
        else:
            return ScriptResource(path, environ, self.app)


class NeedAuthController(object):
    """Domain controller checking credentials against the webui account."""

    def __init__(self, app):
        self.app = app

    def getDomainRealm(self, inputRelativeURL, environ):
        """Every URL belongs to the single realm ``'need auth'``."""
        return 'need auth'

    def requireAuthentication(self, realmname, environ):
        """Authentication is required only when ``need_auth`` is configured."""
        return self.app.config.get('need_auth', False)

    def isRealmUser(self, realmname, username, environ):
        """True when ``username`` equals the configured webui username."""
        return username == self.app.config.get('webui_username')

    def getRealmUserPassword(self, realmname, username, environ):
        """Return the configured webui password."""
        return self.app.config.get('webui_password')

    def authDomainUser(self, realmname, username, password, environ):
        """True when both username and password match the webui account."""
        return username == self.app.config.get('webui_username') \
            and password == self.app.config.get('webui_password')


# WsgiDAV application configuration, built on a copy of the library defaults.
config = DEFAULT_CONFIG.copy()
config.update({
    'mount_path': '/dav',
    'provider_mapping': {
        '/': ScriptProvider(app)
    },
    'domaincontroller': NeedAuthController(app),
    'verbose': 1 if app.debug else 0,
    'dir_browser': {'davmount': False,
                    'enable': True,
                    'msmount': False,
                    'response_trailer': ''},
})
dav_app = WsgiDAVApp(config)

================================================
FILE: requirements.txt
================================================
Flask==0.10
Jinja2==2.7
chardet==3.0.4
cssselect==0.9
lxml==4.3.3
pycurl==7.43.0.3
pyquery==1.4.0
requests==2.24.0
tornado==4.5.3
mysql-connector-python==8.0.16
pika==1.1.0
pymongo==3.9.0
Flask-Login==0.2.11
u-msgpack-python==1.6
click==6.6
SQLAlchemy==1.3.10 six==1.10.0 amqp==2.4.0 redis==2.10.6 redis-py-cluster==1.3.6 kombu==4.4.0 psycopg2==2.8.2 elasticsearch==2.3.0 tblib==1.4.0 ================================================ FILE: run.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-24 23:11:49 from pyspider.run import main if __name__ == '__main__': main() ================================================ FILE: setup.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-24 22:27:45 import sys from setuptools import setup, find_packages from codecs import open from os import path here = path.abspath(path.dirname(__file__)) with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() import pyspider install_requires = [ 'Flask==0.10', 'Jinja2==2.7', 'chardet==3.0.4', 'cssselect==0.9', "lxml==4.3.3", 'pycurl==7.43.0.3', 'requests==2.24.0', 'Flask-Login==0.2.11', 'u-msgpack-python==1.6', 'click==3.3', 'six==1.10.0', 'tblib==1.4.0', 'wsgidav==2.3.0', 'tornado>=3.2,<=4.5.3', 'pyquery', ] extras_require_all = [ 'mysql-connector-python==8.0.16', 'pymongo==3.9.0', 'redis==2.10.6', 'redis-py-cluster==1.3.6', 'psycopg2==2.8.2', 'elasticsearch==2.3.0', 'kombu==4.4.0', 'amqp==2.4.0', 'SQLAlchemy==1.3.10', 'pika==1.1.0' ] setup( name='pyspider', version=pyspider.__version__, description='A Powerful Spider System in Python', long_description=long_description, url='https://github.com/binux/pyspider', author='Roy Binux', author_email='roy@binux.me', license='Apache License, Version 2.0', classifiers=[ 'Development Status :: 4 - Beta', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'License :: OSI Approved :: 
Apache Software License', 'Intended Audience :: Developers', 'Operating System :: OS Independent', 'Environment :: Web Environment', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Software Development :: Libraries :: Application Frameworks', 'Topic :: Software Development :: Libraries :: Python Modules', ], keywords='scrapy crawler spider webui', packages=find_packages(exclude=['data', 'tests*']), install_requires=install_requires, extras_require={ 'all': extras_require_all, 'test': [ 'coverage', 'Werkzeug==0.16.1', 'httpbin==0.7.0', 'pyproxy==0.1.6', 'easywebdav==1.2.0', ] }, package_data={ 'pyspider': [ 'logging.conf', 'fetcher/phantomjs_fetcher.js', 'fetcher/splash_fetcher.lua', 'webui/static/*.js', 'webui/static/*.css', 'webui/templates/*' ], }, entry_points={ 'console_scripts': [ 'pyspider=pyspider.run:main' ] }, test_suite='tests.all_suite', ) ================================================ FILE: tests/__init__.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-09 10:53:19 import os import unittest all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_*.py") ================================================ FILE: tests/data_fetcher_processor_handler.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-18 14:12:55 from pyspider.libs.base_handler import * class Handler(BaseHandler): @not_send_status def not_send_status(self, response): self.crawl('http://www.baidu.com/') return response.text def url_deduplicated(self, response): self.crawl('http://www.baidu.com/') self.crawl('http://www.google.com/') self.crawl('http://www.baidu.com/') self.crawl('http://www.google.com/') self.crawl('http://www.google.com/') @catch_status_code_error def 
catch_http_error(self, response): self.crawl('http://www.baidu.com/') return response.status_code def json(self, response): return response.json def html(self, response): return response.doc('h1').text() def links(self, response): self.crawl([x.attr.href for x in response.doc('a').items()], callback=self.links) def cookies(self, response): return response.cookies def get_save(self, response): return response.save def get_process_save(self, response): return self.save def set_process_save(self, response): self.save['roy'] = 'binux' class IgnoreHandler(BaseHandler): pass __handler_cls__ = Handler ================================================ FILE: tests/data_handler.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-22 14:02:21 import time from pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every class IgnoreHandler(object): pass class TestHandler(BaseHandler): retry_delay = { 1: 10, '': -1 } def hello(self): return "hello world!" 
def echo(self, response): return response.content def saved(self, response): return response.save def echo_task(self, response, task): return task['project'] @catch_status_code_error def catch_status_code(self, response): return response.status_code def raise_exception(self): print('print') logger.info("info") logger.warning("warning") logger.error("error") raise Exception('exception') def add_task(self, response): self.crawl('http://www.google.com', callback='echo', params={'wd': u'中文'}) self.send_message('some_project', {'some': 'message'}) @every def on_cronjob1(self, response): logger.info('on_cronjob1') @every(seconds=10) def on_cronjob2(self, response): logger.info('on_cronjob2') def generator(self, response): yield "a" yield "b" def sleep(self, response): time.sleep(response.save) ================================================ FILE: tests/data_sample_handler.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # Created on __DATE__ # Project: __PROJECT_NAME__ from pyspider.libs.base_handler import * class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl('http://127.0.0.1:14887/pyspider/test.html', callback=self.index_page) @config(age=10 * 24 * 60 * 60) def index_page(self, response): for each in response.doc('a[href^="http"]').items(): self.crawl(each.attr.href, callback=self.detail_page) @config(priority=2) def detail_page(self, response): return { "url": response.url, "title": response.doc('title').text(), } ================================================ FILE: tests/data_test_webpage.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-24 13:44:10 from httpbin import app @app.route('/pyspider/test.html') def test_page(): return ''' 404 0 1 2 3 4 gzip get deflate html xml robots cache stream ''' 
@app.route('/pyspider/ajax.html') def test_ajax(): return '''
    loading...
    ''' @app.route('/pyspider/ajax_click.html') def test_ajax_click(): return '''
    loading...
    load ''' ================================================ FILE: tests/test_base_handler.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2017-02-26 10:35:23 import unittest from pyspider.libs.base_handler import BaseHandler class TestBaseHandler(unittest.TestCase): sample_task_http = { 'taskid': 'taskid', 'project': 'project', 'url': '', 'fetch': { 'method': 'GET', 'headers': { 'Cookie': 'a=b', 'a': 'b' }, 'cookies': { 'c': 'd', }, 'timeout': 60, 'save': 'abc', }, 'process': { 'callback': 'callback', 'save': [1, 2, 3], }, } def test_task_join_crawl_config(self): task = dict(self.sample_task_http) crawl_config = { 'taskid': 'xxxx', # should not affect finial task 'proxy': 'username:password@hostname:port', # should add proxy 'headers': { # should merge headers 'Cookie': 'abc', # should not affect cookie 'c': 'd', # should add header c } } ret = BaseHandler.task_join_crawl_config(task, crawl_config) self.assertDictEqual(ret, { 'taskid': 'taskid', 'project': 'project', 'url': '', 'fetch': { 'method': 'GET', 'proxy': 'username:password@hostname:port', 'headers': { 'Cookie': 'a=b', 'a': 'b', 'c': 'd' }, 'cookies': { 'c': 'd', }, 'timeout': 60, 'save': 'abc', }, 'process': { 'callback': 'callback', 'save': [1, 2, 3], }, }); ================================================ FILE: tests/test_bench.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-10 01:34:09 import os import sys import time import click import shutil import inspect import unittest from pyspider import run from pyspider.libs import utils class TestBench(unittest.TestCase): @classmethod def setUpClass(self): shutil.rmtree('./data/bench', ignore_errors=True) os.makedirs('./data/bench') @classmethod def 
tearDownClass(self): shutil.rmtree('./data/bench', ignore_errors=True) def test_10_bench(self): import subprocess #cmd = [sys.executable] cmd = ['coverage', 'run'] p = subprocess.Popen(cmd+[ inspect.getsourcefile(run), '--queue-maxsize=0', 'bench', '--total=500' ], close_fds=True, stderr=subprocess.PIPE) stdout, stderr = p.communicate() stderr = utils.text(stderr) print(stderr) self.assertEqual(p.returncode, 0, stderr) self.assertIn('Crawled', stderr) self.assertIn('Fetched', stderr) self.assertIn('Processed', stderr) self.assertIn('Saved', stderr) ================================================ FILE: tests/test_counter.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-04-05 00:05:58 import sys import time import unittest from pyspider.libs import counter class TestCounter(unittest.TestCase): def test_010_TimebaseAverageEventCounter(self): c = counter.TimebaseAverageEventCounter(2, 1) for i in range(100): time.sleep(0.1) c.event(100+i) self.assertEqual(c.sum, float(180+199)*20/2) self.assertEqual(c.avg, float(180+199)/2) def test_020_TotalCounter(self): c = counter.TotalCounter() for i in range(3): c.event(i) self.assertEqual(c.avg, 3) self.assertEqual(c.sum, 3) def test_030_AverageWindowCounter(self): c = counter.AverageWindowCounter(10) self.assertTrue(c.empty()) for i in range(20): c.event(i) self.assertFalse(c.empty()) self.assertEqual(c.avg, 14.5) self.assertEqual(c.sum, 145) def test_020_delete(self): c = counter.CounterManager() c.event(('a', 'b'), 1) c.event(('a', 'c'), 1) c.event(('b', 'c'), 1) self.assertIsNotNone(c['a']) self.assertIsNotNone(c['b']) del c['a'] self.assertNotIn('a', c) self.assertIsNotNone(c['b']) ================================================ FILE: tests/test_database.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 
ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-08 22:37:13 from __future__ import unicode_literals, division import os import six import time import unittest from pyspider import database from pyspider.database.base.taskdb import TaskDB class TaskDBCase(object): sample_task = { 'taskid': 'taskid', 'project': 'project', 'url': 'www.baidu.com/', 'status': TaskDB.FAILED, 'schedule': { 'priority': 1, 'retries': 3, 'exetime': 0, 'age': 3600, 'itag': 'itag', 'recrawl': 5, }, 'fetch': { 'method': 'GET', 'headers': { 'Cookie': 'a=b', }, 'data': 'a=b&c=d', 'timeout': 60, }, 'process': { 'callback': 'callback', 'save': [1, 2, 3], }, 'track': { 'fetch': { 'ok': True, 'time': 300, 'status_code': 200, 'headers': { 'Content-Type': 'plain/html', }, 'encoding': 'utf8', # 'content': 'asdfasdfasdfasdf', }, 'process': { 'ok': False, 'time': 10, 'follows': 3, 'outputs': 5, 'exception': u"中文", }, }, 'lastcrawltime': time.time(), 'updatetime': time.time(), } @classmethod def setUpClass(self): raise NotImplementedError # this test not works for mongodb # def test_10_create_project(self): # with self.assertRaises(AssertionError): # self.taskdb._create_project('abc.abc') # self.taskdb._create_project('abc') # self.taskdb._list_project() # self.assertEqual(len(self.taskdb.projects), 1) # self.assertIn('abc', self.taskdb.projects) def test_20_insert(self): self.taskdb.insert('project', 'taskid', self.sample_task) self.taskdb.insert('project', 'taskid2', self.sample_task) def test_25_get_task(self): task = self.taskdb.get_task('project', 'taskid2') self.assertIsNotNone(task) self.assertEqual(task['taskid'], 'taskid2') self.assertEqual(task['project'], self.sample_task['project']) self.assertEqual(task['url'], self.sample_task['url']) self.assertEqual(task['status'], self.taskdb.FAILED) self.assertEqual(task['schedule'], self.sample_task['schedule']) self.assertEqual(task['fetch'], self.sample_task['fetch']) self.assertEqual(task['process'], 
self.sample_task['process']) self.assertEqual(task['track'], self.sample_task['track']) task = self.taskdb.get_task('project', 'taskid1', fields=['status']) self.assertIsNone(task) task = self.taskdb.get_task('project', 'taskid', fields=['taskid', 'track', ]) self.assertIn('track', task) self.assertNotIn('project', task) def test_30_status_count(self): status = self.taskdb.status_count('abc') self.assertEqual(status, {}) status = self.taskdb.status_count('project') self.assertEqual(status, {self.taskdb.FAILED: 2}) def test_40_update_and_status_count(self): self.taskdb.update('project', 'taskid', status=self.taskdb.ACTIVE) status = self.taskdb.status_count('project') self.assertEqual(status, {self.taskdb.ACTIVE: 1, self.taskdb.FAILED: 1}) self.taskdb.update('project', 'taskid', track={}) task = self.taskdb.get_task('project', 'taskid', fields=['taskid', 'track', ]) self.assertIn('track', task) self.assertEqual(task['track'], {}) def test_50_load_tasks(self): tasks = list(self.taskdb.load_tasks(self.taskdb.ACTIVE)) self.assertEqual(len(tasks), 1) task = tasks[0] self.assertIn('taskid', task, task) self.assertEqual(task['taskid'], 'taskid', task) self.assertEqual(task['schedule'], self.sample_task['schedule']) self.assertEqual(task['fetch'], self.sample_task['fetch']) self.assertEqual(task['process'], self.sample_task['process']) self.assertEqual(task['track'], {}) tasks = list(self.taskdb.load_tasks(self.taskdb.ACTIVE, project='project', fields=['taskid'])) self.assertEqual(len(tasks), 1) self.assertEqual(tasks[0]['taskid'], 'taskid') self.assertNotIn('project', tasks[0]) def test_60_relist_projects(self): if hasattr(self.taskdb, '_list_project'): self.taskdb._list_project() self.assertNotIn('system.indexes', self.taskdb.projects) def test_z10_drop(self): self.taskdb.insert('drop_project2', 'taskid', self.sample_task) self.taskdb.insert('drop_project3', 'taskid', self.sample_task) self.taskdb.drop('drop_project3') 
class ProjectDBCase(object):
    """Backend-independent project database tests.

    Meant to be mixed into a ``unittest.TestCase`` subclass whose
    ``setUpClass`` stores the database under test in ``projectdb``.
    The numeric prefixes matter: unittest runs methods in name order and
    later tests depend on state left by earlier ones (for example
    ``test_50_get`` expects the RUNNING status set by
    ``test_40_check_update``).
    """

    sample_project = {
        'name': 'name',
        'script': 'import time\nprint(time.time(), "!@#$%^&*()\';:<>?/|")',
        'status': 'TODO',
        'rate': 1.0,
        'burst': 10.0,
    }

    @classmethod
    def setUpClass(cls):
        """Abstract hook: concrete cases must override it and set ``projectdb``.

        Raises:
            NotImplementedError: always. ``NotImplemented`` is a comparison
                sentinel, not an exception; raising it yields a TypeError on
                Python 3.
        """
        raise NotImplementedError

    def test_10_insert(self):
        """Insert two projects (one with a non-ASCII name) and read one back."""
        self.projectdb.insert('abc', self.sample_project)
        self.projectdb.insert(u'name中文', self.sample_project)
        project = self.projectdb.get('abc')
        self.assertIsNotNone(project)

    def test_20_get_all(self):
        """get_all() returns both projects, with full and restricted fields."""
        projects = list(self.projectdb.get_all())
        self.assertEqual(len(projects), 2)
        # Locate the 'abc' row; the backends give no ordering guarantee here.
        for project in projects:
            if project['name'] == 'abc':
                break
        for key in ('name', 'group', 'status', 'script', 'comments',
                    'rate', 'burst', 'updatetime'):
            self.assertIn(key, project)

        self.assertEqual(project['name'], u'abc')
        self.assertEqual(project['status'], self.sample_project['status'])
        self.assertEqual(project['script'], self.sample_project['script'])
        self.assertEqual(project['rate'], self.sample_project['rate'])
        self.assertEqual(type(project['rate']), float)
        self.assertEqual(project['burst'], self.sample_project['burst'])
        self.assertEqual(type(project['burst']), float)

        projects = list(self.projectdb.get_all(fields=['name', 'script']))
        self.assertEqual(len(projects), 2)
        project = projects[1]
        self.assertIn('name', project)
        # NOTE(review): 'gourp' looks like a typo for 'group', which makes
        # this check vacuous. Kept unchanged until every backend is confirmed
        # to honour the fields filter.
        self.assertNotIn('gourp', project)

    def test_30_update(self):
        """Updating a missing project must not create it."""
        self.projectdb.update('not_found', status='RUNNING')
        project = self.projectdb.get('not_found')
        self.assertIsNone(project)

    def test_40_check_update(self):
        """check_update() reports only projects changed after the timestamp."""
        # Sleep on both sides of the timestamp so the update is clearly later.
        time.sleep(0.1)
        now = time.time()
        time.sleep(0.1)
        self.projectdb.update('abc', status='RUNNING')

        projects = list(self.projectdb.check_update(
            now,
            fields=['name', 'status', 'group', 'updatetime', ]
        ))
        self.assertEqual(len(projects), 1, repr(projects))
        project = projects[0]
        self.assertEqual(project['name'], 'abc')
        self.assertEqual(project['status'], 'RUNNING')

    def test_45_check_update_when_bootup(self):
        """check_update(0) returns full records when no fields are given."""
        projects = list(self.projectdb.check_update(0))
        project = projects[0]
        for key in ('name', 'group', 'status', 'script', 'comments',
                    'rate', 'burst', 'updatetime'):
            self.assertIn(key, project)

    def test_50_get(self):
        """get() handles missing names, plain names and non-ASCII names."""
        project = self.projectdb.get('not_found')
        self.assertIsNone(project)

        project = self.projectdb.get('abc')
        self.assertEqual(project['name'], 'abc')
        self.assertEqual(project['status'], 'RUNNING')

        project = self.projectdb.get(u'name中文', ['group', 'status', 'name'])
        self.assertEqual(project['name'], u'name中文')
        self.assertIn('status', project)
        # NOTE(review): 'gourp' is never a real key, so this always passes;
        # 'group' is requested above, so the intended key is unclear.
        self.assertNotIn('gourp', project)

    def test_z10_drop(self):
        """drop() removes only the named project."""
        self.projectdb.insert(u'drop_project2', self.sample_project)
        self.projectdb.insert(u'drop_project3', self.sample_project)
        self.projectdb.drop('drop_project3')
        self.assertIsNotNone(self.projectdb.get('drop_project2'))
        self.assertIsNone(self.projectdb.get('drop_project3'))
= self.resultdb.get('test_project', 'test_taskid', fields=('url', )) self.assertIsNotNone(result) self.assertIn('url', result) self.assertNotIn('result', result) result = self.resultdb.get('test_project', 'test_taskid') self.assertEqual(result['taskid'], 'test_taskid') self.assertEqual(result['url'], 'test_url_updated') self.assertEqual(result['result'], 'result_updated') self.assertIn('updatetime', result) def test_30_select(self): for i in range(5): self.resultdb.save('test_project', 'test_taskid-%d' % i, 'test_url', 'result-%d' % i) ret = list(self.resultdb.select('test_project')) self.assertEqual(len(ret), 6) ret = list(self.resultdb.select('test_project', limit=4)) self.assertEqual(len(ret), 4) for ret in self.resultdb.select('test_project', fields=('url', ), limit=1): self.assertIn('url', ret) self.assertNotIn('result', ret) def test_35_select_limit(self): ret = list(self.resultdb.select('test_project', limit=None, offset=None)) self.assertEqual(len(ret), 6) ret = list(self.resultdb.select('test_project', limit=None, offset=2)) self.assertEqual(len(ret), 4, ret) def test_40_count(self): self.assertEqual(self.resultdb.count('test_project'), 6) def test_50_select_not_finished(self): for i in self.resultdb.select('test_project'): break self.assertEqual(self.resultdb.count('test_project'), 6) def test_60_relist_projects(self): if hasattr(self.resultdb, '_list_project'): self.resultdb._list_project() self.assertNotIn('system.indexes', self.resultdb.projects) def test_z10_drop(self): self.resultdb.save('drop_project2', 'test_taskid', 'test_url', 'result') self.resultdb.save('drop_project3', 'test_taskid', 'test_url', 'result') self.resultdb.drop('drop_project3') self.assertIsNotNone(self.resultdb.get('drop_project2', 'test_taskid')) self.assertIsNone(self.resultdb.get('drop_project3', 'test_taskid')) def test_z20_update_projects(self): saved = self.resultdb.UPDATE_PROJECTS_TIME self.resultdb.UPDATE_PROJECTS_TIME = 0.1 time.sleep(0.2) self.assertIn('drop_project2', 
class TestSqliteTaskDB(TaskDBCase, unittest.TestCase):
    """Run the shared TaskDBCase tests against the sqlite task database."""

    @classmethod
    def setUpClass(cls):
        """Connect via the 'sqlite+taskdb://' URL and keep the handle on the class."""
        cls.taskdb = database.connect_database('sqlite+taskdb://')
        # assertIsNotNone is an instance method; calling it on the class with
        # the class passed as `self` ends in a TypeError instead of a clean
        # failure when the value really is None. A plain assert works here.
        assert cls.taskdb is not None, 'connect_database() returned None'

    @classmethod
    def tearDownClass(cls):
        """Release the class-level database handle."""
        del cls.taskdb
@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
class TestMongoDBTaskDB(TaskDBCase, unittest.TestCase):
    """Run the shared TaskDBCase tests against a MongoDB server on localhost.

    Skipped when IGNORE_MONGODB or IGNORE_ALL is set in the environment.
    """

    @classmethod
    def setUpClass(cls):
        """Connect to the pyspider_test_taskdb database on localhost:27017."""
        cls.taskdb = database.connect_database(
            'mongodb+taskdb://localhost:27017/pyspider_test_taskdb'
        )
        # assertIsNotNone is an instance method; on the class it would end in
        # a TypeError instead of a clean failure, so use a plain assert.
        assert cls.taskdb is not None, 'connect_database() returned None'

    @classmethod
    def tearDownClass(cls):
        """Drop the whole test database so reruns start clean."""
        cls.taskdb.conn.drop_database(cls.taskdb.database.name)

    def test_create_project(self):
        """_create_project() must register the new name in `projects`."""
        self.assertNotIn('test_create_project', self.taskdb.projects)
        self.taskdb._create_project('test_create_project')
        self.assertIn('test_create_project', self.taskdb.projects)
@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestSQLAlchemyMySQLTaskDB(TaskDBCase, unittest.TestCase):
    """Run the shared TaskDBCase tests through SQLAlchemy against MySQL.

    Skipped when IGNORE_MYSQL or IGNORE_ALL is set in the environment.
    """

    @classmethod
    def setUpClass(cls):
        """Connect as root@localhost using the mysqlconnector driver."""
        cls.taskdb = database.connect_database(
            'sqlalchemy+mysql+mysqlconnector+taskdb://root@localhost/pyspider_test_taskdb'
        )
        # assertIsNotNone is an instance method; on the class it would end in
        # a TypeError instead of a clean failure, so use a plain assert.
        assert cls.taskdb is not None, 'connect_database() returned None'

    @classmethod
    def tearDownClass(cls):
        """Drop the test database so reruns start clean."""
        cls.taskdb.engine.execute('DROP DATABASE pyspider_test_taskdb')
@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')
class TestPGTaskDB(TaskDBCase, unittest.TestCase):
    """Run the shared TaskDBCase tests through SQLAlchemy against PostgreSQL.

    Skipped when IGNORE_POSTGRESQL or IGNORE_ALL is set in the environment.
    """

    @classmethod
    def setUpClass(cls):
        """Connect to the test database and clear projects left by earlier runs."""
        cls.taskdb = database.connect_database(
            'sqlalchemy+postgresql+taskdb://postgres@127.0.0.1:5432/pyspider_test_taskdb'
        )
        # assertIsNotNone is an instance method; on the class it would end in
        # a TypeError instead of a clean failure, so use a plain assert.
        assert cls.taskdb is not None, 'connect_database() returned None'
        # The database itself is never dropped, so start from a clean slate.
        cls.tearDownClass()

    @classmethod
    def tearDownClass(cls):
        """Drop every project known to the task database."""
        # Iterate over a snapshot: drop() may well modify `projects`
        # (presumably a mutable collection - verify against the backend).
        for project in list(cls.taskdb.projects):
            cls.taskdb.drop(project)
@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')
class TestESProjectDB(ProjectDBCase, unittest.TestCase):
    """Run the shared ProjectDBCase tests against Elasticsearch on 127.0.0.1:9200.

    Skipped when IGNORE_ELASTICSEARCH or IGNORE_ALL is set in the environment.
    """

    @classmethod
    def setUpClass(cls):
        """Connect using a dedicated test index and verify the index name was applied."""
        cls.projectdb = database.connect_database(
            'elasticsearch+projectdb://127.0.0.1:9200/?index=test_pyspider_projectdb'
        )
        # assertIsNotNone is an instance method; on the class it would end in
        # a TypeError instead of a clean failure, so use a plain assert.
        assert cls.projectdb is not None, 'connect_database() returned None'
        assert cls.projectdb.index == 'test_pyspider_projectdb'

    @classmethod
    def tearDownClass(cls):
        """Delete the test index; 400/404 responses are ignored."""
        cls.projectdb.es.indices.delete(index='test_pyspider_projectdb', ignore=[400, 404])
@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')
class TestCouchDBTaskDB(TaskDBCase, unittest.TestCase):
    """Run the shared TaskDBCase tests against a CouchDB server on localhost:5984.

    Skipped when IGNORE_COUCHDB or IGNORE_ALL is set in the environment.
    """

    @classmethod
    def setUpClass(cls):
        """Connect to the CouchDB task database."""
        # The unused `import requests` and the stale "create a test admin
        # user" comment were removed: nothing here creates a user.
        cls.taskdb = database.connect_database(
            'couchdb+taskdb://localhost:5984/'
        )
        # assertIsNotNone is an instance method; on the class it would end in
        # a TypeError instead of a clean failure, so use a plain assert.
        assert cls.taskdb is not None, 'connect_database() returned None'

    @classmethod
    def tearDownClass(cls):
        """Drop the database(s) created by the tests."""
        cls.taskdb.drop_database()

    def test_create_project(self):
        """_create_project() must register the new name in `projects`."""
        self.assertNotIn('test_create_project', self.taskdb.projects)
        self.taskdb._create_project('test_create_project')
        self.assertIn('test_create_project', self.taskdb.projects)
def test_10_http_get(self):
    """Fetch <httpbin>/get synchronously and check the rebuilt response.

    Verifies the status code, that orig_url and save are carried over from
    the task, and that the JSON body reports header A=b and a Cookie header
    containing both 'c=d' and 'a=b'.
    """
    task = copy.deepcopy(self.sample_task_http)
    task['url'] = self.httpbin+'/get'
    fetched = self.fetcher.sync_fetch(task)
    response = rebuild_response(fetched)

    self.assertEqual(response.status_code, 200, fetched)
    self.assertEqual(response.orig_url, task['url'])
    self.assertEqual(response.save, task['fetch']['save'])
    self.assertIsNotNone(response.json, response.content)

    reported = response.json['headers']
    self.assertEqual(reported.get('A'), 'b', response.json)
    for pair in ('c=d', 'a=b'):
        self.assertIn(pair, reported.get('Cookie'), response.json)
def test_30_with_queue(self):
    """Push a data: URL task through inqueue and read the result from outqueue."""
    queued = copy.deepcopy(self.sample_task_http)
    queued['url'] = 'data:,hello'
    self.inqueue.put(queued)

    # The fetcher emits (task, result) pairs; only the result is checked.
    _, fetched = self.outqueue.get()
    response = rebuild_response(fetched)
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.text, 'hello')
"[BASE64-DATA]1tDOxA==[/BASE64-DATA]" self.inqueue.put(request) task, result = self.outqueue.get() response = rebuild_response(result) self.assertEqual(response.status_code, 200, response.error) self.assertIsNotNone(response.json, response.content) def test_60_timeout(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/delay/5' request['fetch']['timeout'] = 3 start_time = time.time() self.inqueue.put(request) task, result = self.outqueue.get() end_time = time.time() self.assertGreater(end_time - start_time, 1.5) self.assertLess(end_time - start_time, 4.5) response = rebuild_response(result) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) def test_65_418(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/status/418' self.inqueue.put(request) task, result = self.outqueue.get() response = rebuild_response(result) self.assertEqual(response.status_code, 418) self.assertIn('teapot', response.text) def test_69_no_phantomjs(self): phantomjs_proxy = self.fetcher.phantomjs_proxy self.fetcher.phantomjs_proxy = None if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin + '/get' request['fetch']['fetch_type'] = 'phantomjs' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 501, result) self.fetcher.phantomjs_proxy = phantomjs_proxy def test_70_phantomjs_url(self): if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin + '/get' request['fetch']['fetch_type'] = 'phantomjs' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) data = 
json.loads(response.doc('pre').text()) self.assertEqual(data['headers'].get('A'), 'b', response.content) self.assertIn('c=d', data['headers'].get('Cookie'), response.content) self.assertIn('a=b', data['headers'].get('Cookie'), response.content) def test_75_phantomjs_robots(self): if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin + '/deny' request['fetch']['fetch_type'] = 'phantomjs' request['fetch']['robots_txt'] = True result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 403, result) def test_80_phantomjs_timeout(self): if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/delay/5' request['fetch']['fetch_type'] = 'phantomjs' request['fetch']['timeout'] = 3 start_time = time.time() result = self.fetcher.sync_fetch(request) end_time = time.time() self.assertGreater(end_time - start_time, 2) self.assertLess(end_time - start_time, 5) self.assertEqual(result['status_code'], 599) self.assertIn('js_script_result', result) def test_90_phantomjs_js_script(self): if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin + '/html' request['fetch']['fetch_type'] = 'phantomjs' request['fetch']['js_script'] = 'function() { document.write("binux") }' result = self.fetcher.sync_fetch(request) self.assertEqual(result['status_code'], 200) self.assertIn('binux', result['content']) def test_a100_phantomjs_sharp_url(self): if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/pyspider/ajax.html' request['fetch']['fetch_type'] = 'phantomjs' request['fetch']['headers']['User-Agent'] = 'pyspider-test' result = self.fetcher.sync_fetch(request) self.assertEqual(result['status_code'], 
200) self.assertNotIn('loading', result['content']) self.assertIn('done', result['content']) self.assertIn('pyspider-test', result['content']) def test_a110_dns_error(self): request = copy.deepcopy(self.sample_task_http) request['url'] = 'http://www.not-exists-site-binux.com/' result = self.fetcher.sync_fetch(request) self.assertEqual(result['status_code'], 599) self.assertIn('error', result) self.assertIn('resolve', result['error']) self.inqueue.put(request) task, result = self.outqueue.get() self.assertEqual(result['status_code'], 599) self.assertIn('error', result) self.assertIn('resolve', result['error']) def test_a120_http_get_with_proxy_fail(self): self.fetcher.proxy = self.proxy request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 403, result) self.fetcher.proxy = None def test_a130_http_get_with_proxy_ok(self): self.fetcher.proxy = self.proxy request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get?username=binux&password=123456' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) self.assertIsNotNone(response.json, response.content) self.assertEqual(response.json['headers'].get('A'), 'b', response.json) self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json) self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json) self.fetcher.proxy = None def test_a140_redirect(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/redirect-to?url=/get' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) 
self.assertEqual(response.url, self.httpbin+'/get') def test_a150_too_much_redirect(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/redirect/10' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 599, result) self.assertIn('redirects followed', response.error) def test_a160_cookie(self): request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result) def test_a170_validate_cert(self): request = copy.deepcopy(self.sample_task_http) request['fetch']['validate_cert'] = False request['url'] = self.httpbin+'/get' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) def test_a180_max_redirects(self): request = copy.deepcopy(self.sample_task_http) request['fetch']['max_redirects'] = 10 request['url'] = self.httpbin+'/redirect/10' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) def test_a200_robots_txt(self): request = copy.deepcopy(self.sample_task_http) request['fetch']['robots_txt'] = False request['url'] = self.httpbin+'/deny' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) request['fetch']['robots_txt'] = True result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 403, result) def test_zzzz_issue375(self): phantomjs_proxy = self.fetcher.phantomjs_proxy self.fetcher.phantomjs_proxy = '127.0.0.1:20000' if not self.phantomjs: raise unittest.SkipTest('no phantomjs') request = copy.deepcopy(self.sample_task_http) 
request['url'] = self.httpbin + '/get' request['fetch']['fetch_type'] = 'phantomjs' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 599, result) self.fetcher.phantomjs_proxy = phantomjs_proxy @unittest.skipIf(os.environ.get('IGNORE_SPLASH') or os.environ.get('IGNORE_ALL'), 'no splash server for test.') class TestSplashFetcher(unittest.TestCase): @property def sample_task_http(self): return { 'taskid': 'taskid', 'project': 'project', 'url': '', 'fetch': { 'method': 'GET', 'headers': { 'Cookie': 'a=b', 'a': 'b' }, 'cookies': { 'c': 'd', }, 'timeout': 60, 'save': 'abc', }, 'process': { 'callback': 'callback', 'save': [1, 2, 3], }, } @classmethod def setUpClass(self): import tests.data_test_webpage import httpbin self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False) self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887' self.inqueue = Queue(10) self.outqueue = Queue(10) self.fetcher = Fetcher(self.inqueue, self.outqueue) self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute' self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444) self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444) self.thread = utils.run_in_thread(self.fetcher.run) self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--bind=0.0.0.0', '--password=123456', '--port=14830', '--debug'], close_fds=True) self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830' @classmethod def tearDownClass(self): self.rpc("close")() self.proxy_thread.terminate() self.proxy_thread.wait() self.httpbin_thread.terminate() self.httpbin_thread.join() self.rpc._quit() self.thread.join() assert not utils.check_port_open(5000) assert not utils.check_port_open(23333) assert not utils.check_port_open(24444) assert not utils.check_port_open(25555) assert not utils.check_port_open(14887) time.sleep(1) def 
test_69_no_splash(self): splash_endpoint = self.fetcher.splash_endpoint self.fetcher.splash_endpoint = None request = self.sample_task_http request['url'] = self.httpbin + '/get' request['fetch']['fetch_type'] = 'splash' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 501, result) self.fetcher.splash_endpoint = splash_endpoint def test_70_splash_url(self): request = self.sample_task_http request['url'] = self.httpbin + '/get' request['fetch']['fetch_type'] = 'splash' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) data = json.loads(response.doc('pre').text()) self.assertEqual(data['headers'].get('A'), 'b', response.content) self.assertIn('c=d', data['headers'].get('Cookie'), response.content) self.assertIn('a=b', data['headers'].get('Cookie'), response.content) def test_75_splash_robots(self): request = self.sample_task_http request['url'] = self.httpbin + '/deny' request['fetch']['fetch_type'] = 'splash' request['fetch']['robots_txt'] = True result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 403, result) def test_80_splash_timeout(self): request = self.sample_task_http request['url'] = self.httpbin+'/delay/5' request['fetch']['fetch_type'] = 'splash' request['fetch']['timeout'] = 3 start_time = time.time() result = self.fetcher.sync_fetch(request) end_time = time.time() self.assertGreater(end_time - start_time, 2) self.assertLess(end_time - start_time, 5) self.assertEqual(result['status_code'], 599) # self.assertIn('js_script_result', result) TODO: lua nil is not exists def test_90_splash_js_script(self): request = self.sample_task_http request['url'] = self.httpbin + '/html' request['fetch']['fetch_type'] = 'splash' 
request['fetch']['js_script'] = 'function() { document.write("binux") }' result = self.fetcher.sync_fetch(request) self.assertEqual(result['status_code'], 200) self.assertIn('binux', result['content']) def test_95_splash_js_script_2(self): request = self.sample_task_http request['url'] = self.httpbin + '/pyspider/ajax_click.html' request['fetch']['fetch_type'] = 'splash' request['fetch']['headers']['User-Agent'] = 'pyspider-test' request['fetch']['js_script'] = 'function() { document.querySelector("a").click(); return "abc" }' result = self.fetcher.sync_fetch(request) self.assertEqual(result['status_code'], 200) self.assertNotIn('loading', result['content']) self.assertIn('done', result['content']) self.assertIn('pyspider-test', result['content']) self.assertIn('abc', result['js_script_result']) def test_a100_splash_sharp_url(self): request = self.sample_task_http request['url'] = self.httpbin+'/pyspider/ajax.html' request['fetch']['fetch_type'] = 'splash' request['fetch']['headers']['User-Agent'] = 'pyspider-test' result = self.fetcher.sync_fetch(request) self.assertEqual(result['status_code'], 200) self.assertNotIn('loading', result['content']) self.assertIn('done', result['content']) self.assertIn('pyspider-test', result['content']) def test_a120_http_get_with_proxy_fail_1(self): self.fetcher.proxy = self.proxy request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 403, result) self.fetcher.proxy = None def test_a120_http_get_with_proxy_fail(self): self.fetcher.proxy = self.proxy request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get' request['fetch']['fetch_type'] = 'splash' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 403, result) self.fetcher.proxy = None def test_a130_http_get_with_proxy_ok_1(self): 
self.fetcher.proxy = 'http://binux:123456@%s/' % self.proxy request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) self.assertIsNotNone(response.json, response.content) self.assertEqual(response.json['headers'].get('A'), 'b', response.json) self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json) self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json) self.fetcher.proxy = None def test_a130_http_get_with_proxy_ok(self): self.fetcher.proxy = 'http://binux:123456@%s/' % self.proxy request = copy.deepcopy(self.sample_task_http) request['url'] = self.httpbin+'/get' request['fetch']['fetch_type'] = 'splash' result = self.fetcher.sync_fetch(request) response = rebuild_response(result) self.assertEqual(response.status_code, 200, result) self.assertEqual(response.orig_url, request['url']) self.assertEqual(response.save, request['fetch']['save']) data = json.loads(response.doc('pre').text()) self.assertEqual(data['headers'].get('A'), 'b', response.content) self.assertIn('c=d', data['headers'].get('Cookie'), response.content) self.assertIn('a=b', data['headers'].get('Cookie'), response.content) self.fetcher.proxy = None ================================================ FILE: tests/test_fetcher_processor.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-18 14:09:41 import os import time import httpbin import subprocess import unittest from pyspider.database.local.projectdb import ProjectDB from pyspider.fetcher import Fetcher from pyspider.processor import Processor from pyspider.libs import utils, dataurl from 
six.moves.queue import Queue from tests.data_fetcher_processor_handler import Handler class TestFetcherProcessor(Handler, unittest.TestCase): @classmethod def setUpClass(self): self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')]) self.fetcher = Fetcher(None, None, async_mode=False) self.status_queue = Queue() self.newtask_queue = Queue() self.result_queue = Queue() self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) self.httpbin = 'http://127.0.0.1:14887' self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--password=123456', '--port=14830', '--debug'], close_fds=True) self.proxy = '127.0.0.1:14830' self.processor = Processor(projectdb=self.projectdb, inqueue=None, status_queue=self.status_queue, newtask_queue=self.newtask_queue, result_queue=self.result_queue) self.project_name = 'data_fetcher_processor_handler' time.sleep(0.5) @classmethod def tearDownClass(self): self.proxy_thread.terminate() self.proxy_thread.wait() self.httpbin_thread.terminate() self.httpbin_thread.join() @classmethod def crawl(self, url=None, track=None, **kwargs): if url is None and kwargs.get('callback'): url = dataurl.encode(utils.text(kwargs.get('callback'))) project_data = self.processor.project_manager.get(self.project_name) assert project_data, "can't find project: %s" % self.project_name instance = project_data['instance'] instance._reset() task = instance.crawl(url, **kwargs) if isinstance(task, list): task = task[0] task['track'] = track result = self.fetcher.fetch(task) self.processor.on_task(task, result) status = None while not self.status_queue.empty(): status = self.status_queue.get() newtasks = [] while not self.newtask_queue.empty(): newtasks = self.newtask_queue.get() result = None while not self.result_queue.empty(): _, result = self.result_queue.get() return status, newtasks, result @classmethod def assertStatusOk(self, status): 
self.assertTrue(self.status_ok(status, 'fetch'), status.get('track', {}).get('fetch')) self.assertTrue(self.status_ok(status, 'process'), status.get('track', {}).get('process')) @classmethod def status_ok(self, status, type): if not status: return False return status.get('track', {}).get(type, {}).get('ok', False) def test_10_not_status(self): status, newtasks, result = self.crawl(callback=self.not_send_status.__name__) self.assertIsNone(status) self.assertEqual(len(newtasks), 1, newtasks) self.assertEqual(result, 'not_send_status') def test_20_url_deduplicated(self): status, newtasks, result = self.crawl(callback=self.url_deduplicated) self.assertStatusOk(status) self.assertIsNone(status['track']['fetch']['error']) self.assertIsNone(status['track']['fetch']['content']) self.assertFalse(status['track']['fetch']['headers']) self.assertFalse(status['track']['process']['logs']) self.assertEqual(len(newtasks), 2, newtasks) self.assertIsNone(result) def test_30_catch_status_code_error(self): status, newtasks, result = self.crawl(self.httpbin + '/status/418', callback=self.json) self.assertFalse(self.status_ok(status, 'fetch')) self.assertFalse(self.status_ok(status, 'process')) self.assertIn('HTTP 418', status['track']['fetch']['error']) self.assertTrue(status['track']['fetch']['content'], '') self.assertTrue(status['track']['fetch']['headers']) self.assertTrue(status['track']['process']['logs']) self.assertIn('HTTPError: HTTP 418', status['track']['process']['logs']) self.assertFalse(newtasks) status, newtasks, result = self.crawl(self.httpbin + '/status/400', callback=self.catch_http_error) self.assertFalse(self.status_ok(status, 'fetch')) self.assertTrue(self.status_ok(status, 'process')) self.assertEqual(len(newtasks), 1, newtasks) self.assertEqual(result, 400) status, newtasks, result = self.crawl(self.httpbin + '/status/500', callback=self.catch_http_error) self.assertFalse(self.status_ok(status, 'fetch')) self.assertTrue(self.status_ok(status, 'process')) 
self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        # With redirects disabled, the 302 itself reaches the callback.
        status, newtasks, result = self.crawl(self.httpbin + '/status/302', allow_redirects=False, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        # DELETE against /delete succeeds; DELETE against /get is reported as 405.
        status, newtasks, result = self.crawl(self.httpbin + '/delete', method='DELETE', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + '/get', method='DELETE', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        # `params` (including a non-ASCII key) must come back as httpbin's 'args'.
        status, newtasks, result = self.crawl(self.httpbin + '/get', params={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})

    def test_60_data(self):
        # `data` (including a non-ASCII key) must come back as httpbin's 'form'.
        status, newtasks, result = self.crawl(self.httpbin + '/post', data={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})

    def test_70_redirect(self):
        # The final URL after the redirect is recorded in the fetch track.
        status, newtasks, result = self.crawl(self.httpbin + '/redirect-to?url=/get', callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status['track']['fetch']['redirect_url'], self.httpbin + '/get')
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        # A chain of 10 redirects is expected to fail with status 599.
        status, newtasks, result = self.crawl(self.httpbin + '/redirect/10', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(status['track']['fetch']['status_code'], 599)
        self.assertIn('redirects followed',
status['track']['fetch']['error']) def test_90_files(self): status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT', files={os.path.basename(__file__): open(__file__).read()}, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) self.assertIn(os.path.basename(__file__), result['files']) def test_a100_files_with_data(self): status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT', files={os.path.basename(__file__): open(__file__).read()}, data={ 'roy': 'binux', # '中文': '.', # FIXME: not work }, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) self.assertEqual(result['form'], {'roy': 'binux'}) self.assertIn(os.path.basename(__file__), result['files']) def test_a110_headers(self): status, newtasks, result = self.crawl(self.httpbin + '/get', headers={ 'a': 'b', 'C-d': 'e-F', }, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) self.assertEqual(result['headers'].get('A'), 'b') self.assertEqual(result['headers'].get('C-D'), 'e-F') def test_a115_user_agent(self): status, newtasks, result = self.crawl(self.httpbin + '/get', user_agent='binux', callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) self.assertEqual(result['headers'].get('User-Agent'), 'binux') def test_a120_cookies(self): status, newtasks, result = self.crawl(self.httpbin + '/get', cookies={ 'a': 'b', 'C-d': 'e-F' }, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) self.assertIn('a=b', result['headers'].get('Cookie')) self.assertIn('C-d=e-F', result['headers'].get('Cookie')) def test_a130_cookies_with_headers(self): status, newtasks, result = self.crawl(self.httpbin + '/get', headers={ 'Cookie': 'g=h; I=j', }, cookies={ 'a': 'b', 'C-d': 'e-F' }, callback=self.json) self.assertStatusOk(status) self.assertFalse(newtasks) self.assertIn('g=h', result['headers'].get('Cookie')) self.assertIn('I=j', result['headers'].get('Cookie')) self.assertIn('a=b', 
result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a140_response_cookie(self):
        # Cookies set by the server must be what the `cookies` callback returns.
        status, newtasks, result = self.crawl(self.httpbin + '/cookies/set?k1=v1&k2=v2', callback=self.cookies)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})

    def test_a145_redirect_cookie(self):
        # Same URL with the `json` callback: the JSON body's 'cookies' entry
        # must hold both cookies.
        status, newtasks, result = self.crawl(self.httpbin + '/cookies/set?k1=v1&k2=v2', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['cookies'], {'k1': 'v1', 'k2': 'v2'})

    def test_a150_timeout(self):
        # A 2 second response with a 1 second timeout must fail, and the
        # recorded fetch time must truncate to 1.
        status, newtasks, result = self.crawl(self.httpbin + '/delay/2', timeout=1, callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(int(status['track']['fetch']['time']), 1)

    def test_a160_etag(self):
        # With an etag supplied, /cache is expected to yield an empty result.
        status, newtasks, result = self.crawl(self.httpbin + '/cache', etag='abc', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modified(self):
        # Same expectation with last_modified supplied.
        status, newtasks, result = self.crawl(self.httpbin + '/cache', last_modified='0', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        # `save` must be handed through to the callback unchanged.
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              save={'roy': 'binux', u'中文': 'value'})

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})

    def test_a190_taskid(self):
        # An explicit taskid must be reported back in the status.
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              taskid='binux-taskid')

        self.assertStatusOk(status)
        self.assertEqual(status['taskid'], 'binux-taskid')
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a200_no_proxy(self):
        # A fetcher-wide proxy is set, but the task passes proxy=False.
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin + '/get', params={
            'test': 'a200'
        },
proxy=False, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.fetcher.proxy = old_proxy

    def test_a210_proxy_failed(self):
        # Fetcher-wide proxy without credentials: the callback sees a 403.
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin + '/get', params={
            'test': 'a210'
        }, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 403)
        self.fetcher.proxy = old_proxy

    def test_a220_proxy_ok(self):
        # Same proxy, with username/password passed as query parameters.
        # NOTE(review): presumably the test proxy accepts credentials from the
        # query string - confirm against pyproxy.
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin + '/get', params={
            'test': 'a220',
            'username': 'binux',
            'password': '123456',
        }, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)
        self.fetcher.proxy = old_proxy

    def test_a230_proxy_parameter_fail(self):
        # Proxy given per task (not on the fetcher), without credentials: 403.
        status, newtasks, result = self.crawl(self.httpbin + '/get', params={
            'test': 'a230',
        }, proxy=self.proxy, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(result, 403)

    def test_a240_proxy_parameter_ok(self):
        # Per-task proxy, credentials sent as POST form fields.
        status, newtasks, result = self.crawl(self.httpbin + '/post', method='POST', data={
            'test': 'a240',
            'username': 'binux',
            'password': '123456',
        }, proxy=self.proxy, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a250_proxy_userpass(self):
        # Per-task proxy with credentials embedded as user:pass@host:port.
        status, newtasks, result = self.crawl(self.httpbin + '/post', method='POST', data={
            'test': 'a250',
        }, proxy='binux:123456@' + self.proxy, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a260_process_save(self):
        # The first crawl must leave {'roy': 'binux'} in status['track']['save'].
        status, newtasks, result = self.crawl(callback=self.set_process_save)

        self.assertStatusOk(status)
        self.assertIn('roy', status['track']['save'])
        self.assertEqual(status['track']['save']['roy'], 'binux')
status, newtasks, result = self.crawl(callback=self.get_process_save,
                                              track=status['track'])

        # Feeding the previous track back in must make the saved value visible
        # to the next callback.
        self.assertStatusOk(status)
        self.assertIn('roy', result)
        self.assertEqual(result['roy'], 'binux')

    def test_zzz_links(self):
        # /links/10/0 is expected to produce 9 new tasks.
        status, newtasks, result = self.crawl(self.httpbin + '/links/10/0', callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_zzz_html(self):
        status, newtasks, result = self.crawl(self.httpbin + '/html', callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, 'Herman Melville - Moby-Dick')

    def test_zzz_etag_enabled(self):
        # First fetch returns content; a second fetch carrying the first
        # fetch's track is expected to return nothing.
        status, newtasks, result = self.crawl(self.httpbin + '/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status, newtasks, result = self.crawl(self.httpbin + '/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_zzz_etag_not_working(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        # When the previous processing is marked as failed, the second fetch
        # must return content again.
        status['track']['process']['ok'] = False
        status, newtasks, result = self.crawl(self.httpbin + '/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

    def test_zzz_unexpected_crawl_argument(self):
        # `cookie` (singular) is not a known crawl argument.
        with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"):
            self.crawl(self.httpbin + '/cache', cookie={}, callback=self.json)

    def test_zzz_curl_get(self):
        # A complete curl command line is passed in place of the URL.
        status, newtasks, result = self.crawl(
            "curl '" + self.httpbin + '''/get' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Binux-Header: Binux-Value' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1;
_ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' --compressed''', callback=self.json) self.assertStatusOk(status) self.assertTrue(result) self.assertTrue(result['headers'].get('Binux-Header'), 'Binux-Value') def test_zzz_curl_post(self): status, newtasks, result = self.crawl( "curl '" + self.httpbin + '''/post' -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'Binux-Key=%E4%B8%AD%E6%96%87+value' --compressed''', callback=self.json) self.assertStatusOk(status) self.assertTrue(result) self.assertTrue(result['form'].get('Binux-Key'), '中文 value') def test_zzz_curl_put(self): status, newtasks, result = self.crawl( "curl '" + self.httpbin + '''/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryYlkgyaA7SRGOQYUG' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary $'------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="Binux-Key"\r\n\r\n%E4%B8%AD%E6%96%87+value\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="fileUpload1"; filename="1"\r\nContent-Type: application/octet-stream\r\n\r\n\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG--\r\n' --compressed''', 
callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)
        self.assertIn('fileUpload1', result['files'], result)

    def test_zzz_curl_no_url(self):
        # A curl command without any URL must be rejected with a TypeError.
        with self.assertRaisesRegexp(TypeError, 'no URL'):
            status, newtasks, result = self.crawl(
                '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''',
                callback=self.json)

    def test_zzz_curl_bad_option(self):
        # `-v` must be rejected whether it comes last ...
        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin,
                callback=self.json)

        # ... or in the middle of the command line.
        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin,
                callback=self.json)

    def test_zzz_robots_txt(self):
        # With robots_txt enabled, /deny is reported to the callback as 403.
        status, newtasks, result = self.crawl(self.httpbin + '/deny', robots_txt=True, callback=self.catch_http_error)

        self.assertEqual(result, 403)

    def test_zzz_connect_timeout(self):
        # With connect_timeout=5 the crawl of 240.0.0.1 must return after
        # between 5 and 6 seconds.
        start_time = time.time()
        status, newtasks, result = self.crawl('http://240.0.0.1/', connect_timeout=5, callback=self.catch_http_error)
        end_time = time.time()
        self.assertTrue(5 <= end_time - start_time <= 6)

================================================
FILE: tests/test_message_queue.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux
# http://binux.me
# Created on 2014-10-07 10:33:38

import os
import six
import time
import unittest

from pyspider.libs import utils
from six.moves import queue as Queue


# Shared test body for all queue backends. It is not a TestCase itself;
# concrete classes mix it in with unittest.TestCase and provide q1, q2 and q3
# in setUpClass. The numbered tests build on each other's queue contents.
class TestMessageQueue(object):

    @classmethod
    def setUpClass(self):
        raise NotImplementedError

    def test_10_put(self):
        # Both names must start empty; two items are then put through q1.
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        self.q1.put('TEST_DATA1', timeout=3)
        self.q1.put('TEST_DATA2_中文', timeout=3)
        time.sleep(0.01)
self.assertEqual(self.q1.qsize(), 2)
        # q2 must report the same size as q1.
        self.assertEqual(self.q2.qsize(), 2)

    def test_20_get(self):
        # Items come out in insertion order, through either object; afterwards
        # both the timed and the non-blocking get must raise Empty.
        self.assertEqual(self.q1.get(timeout=0.01), 'TEST_DATA1')
        self.assertEqual(self.q2.get_nowait(), 'TEST_DATA2_中文')
        with self.assertRaises(Queue.Empty):
            self.q2.get(timeout=0.01)
        with self.assertRaises(Queue.Empty):
            self.q2.get_nowait()

    def test_30_full(self):
        # 2 items through q1 plus 3 through q2 make 5 puts in total; the next
        # put must raise Full.
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        for i in range(2):
            self.q1.put_nowait('TEST_DATA%d' % i)
        for i in range(3):
            self.q2.put('TEST_DATA%d' % i)

        with self.assertRaises(Queue.Full):
            self.q1.put('TEST_DATA6', timeout=0.01)
        with self.assertRaises(Queue.Full):
            self.q1.put_nowait('TEST_DATA6')

    def test_40_multiple_threading_error(self):
        # One thread puts 100 items into q3 while this thread gets 100 items.
        def put(q):
            for i in range(100):
                q.put("DATA_%d" % i)

        def get(q):
            for i in range(100):
                q.get()

        t = utils.run_in_thread(put, self.q3)
        get(self.q3)
        t.join()


# Queues created by connect_message_queue without a URL; q1 and q2 are the
# same object.
class BuiltinQueue(TestMessageQueue, unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue
        with utils.timeout(3):
            self.q1 = self.q2 = connect_message_queue('test_queue', maxsize=5)
            self.q3 = connect_message_queue('test_queue_for_threading_test')


#@unittest.skipIf(six.PY3, 'pika not suport python 3')
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
class TestPikaRabbitMQ(TestMessageQueue, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import rabbitmq

        # NOTE(review): the extent of the `with` block was reconstructed from
        # flattened source - confirm that delete()/reconnect() sit outside it.
        with utils.timeout(3):
            self.q1 = rabbitmq.PikaQueue('test_queue', maxsize=5, lazy_limit=False)
            self.q2 = rabbitmq.PikaQueue('test_queue', amqp_url='amqp://localhost:5672/%2F', maxsize=5, lazy_limit=False)
            self.q3 = rabbitmq.PikaQueue('test_queue_for_threading_test', amqp_url='amqp://guest:guest@localhost:5672/', lazy_limit=False)
        # Delete and reconnect so the tests start from freshly created queues.
        self.q2.delete()
        self.q2.reconnect()
        self.q3.delete()
        self.q3.reconnect()

    @classmethod
    def tearDownClass(self):
        self.q2.delete()
        self.q3.delete()
        del self.q1
        del self.q2
del self.q3

    def test_30_full(self):
        # Override of the shared test: prints the queue state for debugging and
        # checks put_nowait before the timed put.
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        for i in range(2):
            self.q1.put_nowait('TEST_DATA%d' % i)
        for i in range(3):
            self.q2.put('TEST_DATA%d' % i)

        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put_nowait('TEST_DATA6')
        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put('TEST_DATA6', timeout=0.01)


# Queues obtained through connect_message_queue with amqp:// URLs; skipped on
# Python 3 and when RabbitMQ tests are disabled through the environment.
@unittest.skipIf(six.PY3, 'Python 3 now using Pika')
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
class TestAmqpRabbitMQ(TestMessageQueue, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue

        # NOTE(review): the extent of the `with` block was reconstructed from
        # flattened source - confirm that delete()/reconnect() sit outside it.
        with utils.timeout(3):
            self.q1 = connect_message_queue('test_queue', 'amqp://localhost:5672/',
                                            maxsize=5, lazy_limit=False)
            self.q2 = connect_message_queue('test_queue', 'amqp://localhost:5672/%2F',
                                            maxsize=5, lazy_limit=False)
            self.q3 = connect_message_queue('test_queue_for_threading_test',
                                            'amqp://guest:guest@localhost:5672/', lazy_limit=False)
        # Delete and reconnect so the tests start from freshly created queues.
        self.q2.delete()
        self.q2.reconnect()
        self.q3.delete()
        self.q3.reconnect()

    @classmethod
    def tearDownClass(self):
        self.q2.delete()
        self.q3.delete()
        del self.q1
        del self.q2
        del self.q3

    def test_30_full(self):
        # Override of the shared test with debugging output; here the timed put
        # is checked before put_nowait.
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        for i in range(2):
            self.q1.put_nowait('TEST_DATA%d' % i)
        for i in range(3):
            self.q2.put('TEST_DATA%d' % i)

        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put('TEST_DATA6', timeout=0.01)
        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put_nowait('TEST_DATA6')


@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')
class TestRedisQueue(TestMessageQueue, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import
connect_message_queue
        from pyspider.message_queue import redis_queue

        # NOTE(review): the extent of the `with` block was reconstructed from
        # flattened source - confirm that the drain loops belong inside it.
        with utils.timeout(3):
            self.q1 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False)
            self.q2 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False)
            self.q3 = connect_message_queue('test_queue_for_threading_test',
                                            'redis://localhost:6379/')
            # Start from empty queues: discard whatever is already stored.
            while not self.q1.empty():
                self.q1.get()
            while not self.q2.empty():
                self.q2.get()
            while not self.q3.empty():
                self.q3.get()

    @classmethod
    def tearDownClass(self):
        # Leave the queues empty for the next run.
        while not self.q1.empty():
            self.q1.get()
        while not self.q2.empty():
            self.q2.get()
        while not self.q3.empty():
            self.q3.get()


# Runs the shared tests against the URL in `kombu_url`; subclasses only
# override that attribute to select another kombu transport.
class TestKombuQueue(TestMessageQueue, unittest.TestCase):
    kombu_url = 'kombu+memory://'

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue

        # NOTE(review): the extent of the `with` block was reconstructed from
        # flattened source - confirm that the drain loops belong inside it.
        with utils.timeout(3):
            self.q1 = connect_message_queue('test_queue', self.kombu_url, maxsize=5, lazy_limit=False)
            self.q2 = connect_message_queue('test_queue', self.kombu_url, maxsize=5, lazy_limit=False)
            self.q3 = connect_message_queue('test_queue_for_threading_test', self.kombu_url, lazy_limit=False)
            # Start from empty queues: discard whatever is already stored.
            while not self.q1.empty():
                self.q1.get()
            while not self.q2.empty():
                self.q2.get()
            while not self.q3.empty():
                self.q3.get()

    @classmethod
    def tearDownClass(self):
        # Drain each queue, then delete it.
        while not self.q1.empty():
            self.q1.get()
        self.q1.delete()
        while not self.q2.empty():
            self.q2.get()
        self.q2.delete()
        while not self.q3.empty():
            self.q3.get()
        self.q3.delete()


# Unconditionally skipped (see the skip reason).
@unittest.skip('test cannot pass, get is buffered')
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
class TestKombuAmpqQueue(TestKombuQueue):
    kombu_url = 'kombu+amqp://'


# Unconditionally skipped (see the skip reason).
@unittest.skip('test cannot pass, put is buffered')
@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')
class TestKombuRedisQueue(TestKombuQueue):
    kombu_url = 'kombu+redis://'


@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no
mongodb server for test.') class TestKombuMongoDBQueue(TestKombuQueue): kombu_url = 'kombu+mongodb://' ================================================ FILE: tests/test_processor.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-22 14:00:05 import os import six import copy import time import unittest import logging.config logging.config.fileConfig("pyspider/logging.conf") from pyspider.libs import utils from pyspider.processor.project_module import ProjectManager class TestProjectModule(unittest.TestCase): @property def base_task(self): return { 'taskid': 'taskid', 'project': 'test.project', 'url': 'www.baidu.com/', 'schedule': { 'priority': 1, 'retries': 3, 'exetime': 0, 'age': 3600, 'itag': 'itag', 'recrawl': 5, }, 'fetch': { 'method': 'GET', 'headers': { 'Cookie': 'a=b', }, 'data': 'a=b&c=d', 'timeout': 60, 'save': [1, 2, 3], }, 'process': { 'callback': 'callback', }, } @property def fetch_result(self): return { 'status_code': 200, 'orig_url': 'www.baidu.com/', 'url': 'http://www.baidu.com/', 'headers': { 'cookie': 'abc', }, 'content': 'test data', 'cookies': { 'a': 'b', }, 'save': [1, 2, 3], } def setUp(self): self.project = "test.project" self.script = open(os.path.join(os.path.dirname(__file__), 'data_handler.py')).read() self.env = { 'test': True, } self.project_info = { 'name': self.project, 'status': 'DEBUG', } data = ProjectManager.build_module({ 'name': self.project, 'script': self.script }, {'test': True}) self.module = data['module'] self.instance = data['instance'] def test_2_hello(self): base_task = self.base_task base_task['process']['callback'] = 'hello' ret = self.instance.run_task(self.module, base_task, self.fetch_result) self.assertIsNone(ret.exception) self.assertEqual(ret.result, "hello world!") def test_3_echo(self): base_task = self.base_task base_task['process']['callback'] = 'echo' ret = 
self.instance.run_task(self.module, base_task, self.fetch_result)
        # The 'echo' callback must return the fetched content.
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, "test data")

    def test_4_saved(self):
        # The 'saved' callback must return the task's fetch.save value.
        base_task = self.base_task
        base_task['process']['callback'] = 'saved'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, base_task['fetch']['save'])

    def test_5_echo_task(self):
        # The 'echo_task' callback must return the project name.
        base_task = self.base_task
        base_task['process']['callback'] = 'echo_task'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, self.project)

    def test_6_catch_status_code(self):
        # A 403 fetch result must reach the callback without raising.
        base_task = self.base_task
        fetch_result = self.fetch_result
        fetch_result['status_code'] = 403
        base_task['process']['callback'] = 'catch_status_code'
        ret = self.instance.run_task(self.module, base_task, fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, 403)

    def test_7_raise_exception(self):
        # The exception must be captured on the result, and the captured log
        # text must contain 'info', 'warning' and 'error'.
        base_task = self.base_task
        base_task['process']['callback'] = 'raise_exception'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNotNone(ret.exception)
        logstr = ret.logstr()
        self.assertIn('info', logstr)
        self.assertIn('warning', logstr)
        self.assertIn('error', logstr)

    def test_8_add_task(self):
        # The 'add_task' callback must yield one follow-up task and one message.
        base_task = self.base_task
        base_task['process']['callback'] = 'add_task'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception, ret.logstr())
        self.assertEqual(len(ret.follows), 1)
        self.assertEqual(len(ret.messages), 1)

    def test_10_cronjob(self):
        task = {
            'taskid': '_on_cronjob',
            'project': self.project,
            'url': 'data:,_on_cronjob',
            'fetch': {
                'save': {
                    'tick': 11,
                },
            },
            'process': {
                'callback': '_on_cronjob',
            },
        }
        fetch_result = self.fetch_result
        fetch_result['save'] = {
            'tick': 11,
        }
        # tick 11: neither cron job may appear in the log.
        ret = self.instance.run_task(self.module, task, fetch_result)
        logstr = ret.logstr()
        self.assertNotIn('on_cronjob1', logstr)
self.assertNotIn('on_cronjob2', logstr)

        # tick 10: only on_cronjob2 must appear in the log.
        task['fetch']['save']['tick'] = 10
        fetch_result['save'] = task['fetch']['save']
        ret = self.instance.run_task(self.module, task, fetch_result)
        logstr = ret.logstr()
        self.assertNotIn('on_cronjob1', logstr)
        self.assertIn('on_cronjob2', logstr)

        # tick 60: both cron jobs must appear in the log.
        task['fetch']['save']['tick'] = 60
        fetch_result['save'] = task['fetch']['save']
        ret = self.instance.run_task(self.module, task, fetch_result)
        logstr = ret.logstr()
        self.assertIn('on_cronjob1', logstr)
        self.assertIn('on_cronjob2', logstr)

    def test_20_get_info(self):
        # Request two info keys through the _on_get_info callback.
        task = {
            'taskid': '_on_get_info',
            'project': self.project,
            'url': 'data:,_on_get_info',
            'fetch': {
                'save': ['min_tick', 'retry_delay'],
            },
            'process': {
                'callback': '_on_get_info',
            },
        }
        fetch_result = self.fetch_result
        fetch_result['save'] = task['fetch']['save']

        ret = self.instance.run_task(self.module, task, fetch_result)
        self.assertEqual(len(ret.save), 2, ret.logstr())
        # NOTE(review): loop body reconstructed from flattened source; all three
        # assertions use `each`, so they are taken to be inside the loop.
        for each in ret.follows:
            self.assertEqual(each['url'], 'data:,on_get_info')
            self.assertEqual(each['fetch']['save']['min_tick'], 10)
            self.assertEqual(each['fetch']['save']['retry_delay'], {})

    def test_30_generator(self):
        # The result's repr must contain 'generator object'.
        base_task = self.base_task
        base_task['process']['callback'] = 'generator'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertIn('generator object', repr(ret.result))

    def test_40_sleep(self):
        # 'sleep' callback with save=1: the run must take at least 1 second.
        base_task = self.base_task
        fetch_result = self.fetch_result
        base_task['process']['callback'] = 'sleep'
        fetch_result['save'] = 1

        start_time = time.time()
        ret = self.instance.run_task(self.module, base_task, fetch_result)
        self.assertGreaterEqual(time.time() - start_time, 1)

    def test_50_timeout(self):
        # 'sleep' callback with save=2 and process_time_limit=0.5.
        base_task = self.base_task
        fetch_result = self.fetch_result
        base_task['process']['callback'] = 'sleep'
        base_task['process']['process_time_limit'] = 0.5
        fetch_result['save'] = 2

        start_time = time.time()
        ret = self.instance.run_task(self.module, base_task, fetch_result)
class TestProcessor(unittest.TestCase):
    """Integration tests for a Processor running in a background thread.

    Tasks are pushed into ``in_queue`` as ``(task, fetch_result)`` pairs and
    the tests inspect what comes out of ``status_queue`` and
    ``newtask_queue``.  The tests are numbered because later ones rely on
    state left behind by earlier ones (for example ``test_40_index_page``
    consumes the follow-up task that ``test_30_new_task`` leaves queued).
    """

    projectdb_path = './data/tests/project.db'

    # Seconds to wait after queueing a task so the processor thread can
    # pick it up and publish its status.
    PROCESS_WAIT = 1

    @classmethod
    def setUpClass(cls):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(cls.projectdb_path)

        cls.projectdb = get_projectdb()
        cls.in_queue = Queue(10)
        cls.status_queue = Queue(10)
        cls.newtask_queue = Queue(10)
        cls.result_queue = Queue(10)

        def run_processor():
            # The processor thread gets its own ProjectDB instance rather
            # than sharing cls.projectdb (presumably so the sqlite handle is
            # not used from two threads - TODO confirm).
            cls.processor = Processor(get_projectdb(), cls.in_queue,
                                      cls.status_queue, cls.newtask_queue,
                                      cls.result_queue)
            cls.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            cls.processor.run()

        cls.process = run_in_thread(run_processor)
        # Give the thread time to create cls.processor before tests use it.
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        if cls.process.is_alive():
            cls.processor.quit()
            cls.process.join(2)
        assert not cls.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _drain(queue):
        """Remove and return every item currently waiting in *queue*."""
        items = []
        while not queue.empty():
            items.append(queue.get())
        return items

    @staticmethod
    def _project_info(name, status, script):
        """Build the project record used by these tests."""
        return {
            'name': name,
            'group': 'group',
            'status': status,
            'script': script,
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        }

    @staticmethod
    def _on_start_task(project):
        """Build an ``on_start`` task addressed to *project*."""
        return {
            "process": {
                "callback": "on_start"
            },
            "project": project,
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }

    @staticmethod
    def _on_start_fetch_result():
        """Build a successful fetch result for the ``on_start`` task."""
        return {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }

    def _submit(self, task, fetch_result):
        """Queue one (task, fetch_result) pair and wait for the processor."""
        self.in_queue.put((task, fetch_result))
        time.sleep(self.PROCESS_WAIT)

    def _collect_statuses(self):
        """Drain the status queue, requiring at least one status pack."""
        self.assertFalse(self.status_queue.empty())
        return self._drain(self.status_queue)

    def _clear_queues(self):
        """Discard anything left in the new-task and status queues."""
        self._drain(self.newtask_queue)
        self._drain(self.status_queue)

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_10_update_project(self):
        manager = self.processor.project_manager
        self.assertIsNone(manager.get('test_project'))
        self.projectdb.insert('test_project', self._project_info(
            'test_project', 'TODO', inspect.getsource(sample_handler)))
        self.assertIsNone(manager.get('not_exists'))
        self.assertIsNotNone(manager.get('test_project'))

        # A task for an unknown project must be reported as failed.
        self._submit(self._on_start_task('not_exists'), {})
        for status in self._collect_statuses():
            self.assertEqual(status['track']['process']['ok'], False)
        self.assertIsNone(manager.get('not_exists'))

    def test_20_broken_project(self):
        manager = self.processor.project_manager
        self.assertIsNone(manager.get('test_broken_project'))
        # Only the first 10 characters of the script are stored, so the
        # project cannot be loaded.
        self.projectdb.insert('test_broken_project', self._project_info(
            'test_broken_project', 'DEBUG',
            inspect.getsource(sample_handler)[:10]))
        self.assertIsNone(manager.get('not_exists'))
        self.assertIsNotNone(manager.get('test_broken_project'))
        project_data = manager.get('test_broken_project')
        self.assertIsNotNone(project_data.get('exception'))

    def test_30_new_task(self):
        self.assertTrue(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        self._submit(self._on_start_task('test_project'),
                     self._on_start_fetch_result())

        self._collect_statuses()
        # The follow-up tasks are left queued for test_40_index_page.
        self.assertFalse(self.newtask_queue.empty())

    def test_40_index_page(self):
        # Use the first task of the most recently queued batch.
        batches = self._drain(self.newtask_queue)
        task = batches[-1][0] if batches else None
        self.assertIsNotNone(task)

        fetch_result = {
            "orig_url": task['url'],
            # NOTE(review): this literal contains no markup, yet the
            # assertions below expect three followed links; the HTML tags
            # look like they were stripped from this copy of the file -
            # confirm against the upstream test before relying on it.
            "content": (
                ""
                "binux"
                "binux"
                "1"
                "2"
                ""
            ),
            "headers": {'a': 'b', 'etag': 'tag'},
            "status_code": 200,
            "url": task['url'],
            "time": 0,
        }
        self._submit(task, fetch_result)

        self.assertFalse(self.status_queue.empty())
        self.assertFalse(self.newtask_queue.empty())

        status = self.status_queue.get()
        fetch_track = status['track']['fetch']
        process_track = status['track']['process']
        self.assertEqual(fetch_track['ok'], True)
        self.assertEqual(fetch_track['time'], 0)
        self.assertEqual(fetch_track['status_code'], 200)
        self.assertEqual('tag', fetch_track['headers']['etag'])
        self.assertIsNone(fetch_track['content'])
        self.assertEqual(process_track['ok'], True)
        self.assertGreater(process_track['time'], 0)
        self.assertEqual(process_track['follows'], 3)
        self.assertIsNone(process_track['result'])
        self.assertEqual(process_track['logs'], '')
        self.assertIsNone(process_track['exception'])

        tasks = self.newtask_queue.get()
        self.assertEqual(len(tasks), 3)
        self.assertEqual(tasks[0]['url'], 'http://binux.me/')
        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'), task['url'])

    def test_50_fetch_error(self):
        self._clear_queues()

        task = {
            "process": {
                "callback": "index_page"
            },
            "project": "test_project",
            "taskid": "data:,test_fetch_error",
            "url": "data:,test_fetch_error"
        }
        fetch_result = {
            "orig_url": task['url'],
            "content": "test_fetch_error",
            "error": "test_fetch_error",
            "headers": {'a': 'b', 'last-modified': '123'},
            "status_code": 598,
            "url": task['url'],
            "time": 0,
        }
        self._submit(task, fetch_result)

        self.assertFalse(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        status = self.status_queue.get()
        fetch_track = status['track']['fetch']
        process_track = status['track']['process']
        self.assertEqual(fetch_track['ok'], False)
        self.assertEqual(fetch_track['time'], 0)
        self.assertEqual(fetch_track['status_code'], 598)
        self.assertEqual('123', fetch_track['headers']['last-modified'])
        self.assertIsNotNone(fetch_track['content'])
        self.assertEqual(process_track['ok'], False)
        self.assertGreater(process_track['time'], 0)
        self.assertEqual(process_track['follows'], 0)
        self.assertIsNone(process_track['result'])
        self.assertGreater(len(process_track['logs']), 0)
        self.assertIsNotNone(process_track['exception'])

    def test_60_call_broken_project(self):
        self._clear_queues()

        self._submit(self._on_start_task('test_broken_project'),
                     self._on_start_fetch_result())

        for status in self._collect_statuses():
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)
            self.assertGreater(len(status['track']['process']['logs']), 0)
            self.assertIsNotNone(status['track']['process']['exception'])
        self.assertTrue(self.newtask_queue.empty())

    def test_70_update_project(self):
        manager = self.processor.project_manager
        # Effectively disable the periodic project check so that only the
        # hints carried by the task itself can trigger a reload.
        manager.CHECK_PROJECTS_INTERVAL = 1000000
        try:
            manager._check_projects()
            self.assertIsNotNone(manager.get('test_broken_project'))

            self._clear_queues()

            task = self._on_start_task('test_broken_project')
            fetch_result = self._on_start_fetch_result()

            def assert_process_ok(expected):
                self._submit(task, fetch_result)
                for status in self._collect_statuses():
                    self.assertEqual(status['track']['fetch']['ok'], True)
                    self.assertEqual(status['track']['process']['ok'], expected)

            self.projectdb.update('test_broken_project', {
                'script': inspect.getsource(sample_handler),
            })

            # not update: the task carries no hint, the broken script is
            # still the loaded one
            assert_process_ok(False)

            # updated: a fresh project_updatetime makes the fixed script load
            task['project_updatetime'] = time.time()
            assert_process_ok(True)

            self.projectdb.update('test_broken_project', {
                'script': inspect.getsource(sample_handler)[:10],
            })

            # update with md5: a different md5sum makes the broken script load
            task['project_md5sum'] = 'testmd5'
            del task['project_updatetime']
            assert_process_ok(False)
        finally:
            # Restore the interval even when an assertion above fails, so the
            # remaining tests do not run with project checks disabled.
            manager.CHECK_PROJECTS_INTERVAL = 0.1

    def test_80_import_project(self):
        # Both records keep the name 'test_project' although they are stored
        # under different keys, exactly as in the original test.
        for key in ('test_project2', 'test_project3'):
            self.projectdb.insert(key, self._project_info(
                'test_project', 'TODO', inspect.getsource(sample_handler)))

        # Exercise the three import spellings of the `projects` package.
        from projects import test_project
        self.assertIsNotNone(test_project)
        self.assertIsNotNone(test_project.Handler)

        from projects.test_project2 import Handler
        self.assertIsNotNone(Handler)

        import projects.test_project3
        self.assertIsNotNone(projects.test_project3.Handler)
def test_50_ok(self):
    # 200 responses, and redirects the fetcher follows, count as ok/truthy.
    response = self.get('/status/200')
    self.assertTrue(response.ok)
    self.assertTrue(response)

    response = self.get('/status/302')
    self.assertTrue(response.ok)
    self.assertTrue(response)

    # The original called `self.raise_for_status(...)` on the TestCase, so
    # assertRaises(Exception) was satisfied by an AttributeError and the
    # redirect check never ran.  Fetch the redirect without following it and
    # call raise_for_status on the response instead.
    # NOTE(review): assumes the fetcher honours fetch={'allow_redirects':
    # False} and returns the 302 itself - confirm against the fetcher.
    redirect = self.get('/status/302', fetch={'allow_redirects': False})
    self.assertEqual(redirect.status_code, 302)
    with self.assertRaises(Exception):
        redirect.raise_for_status(allow_redirects=False)
class TestResultDump(unittest.TestCase):
    """Checks for the result_dump formatter and its json/txt/csv dumpers."""

    def test_result_formater_1(self):
        # Every result in results1 is a dict containing 'a' and 'b'.
        fields, _ = result_dump.result_formater(results1)
        self.assertEqual(fields, {'a', 'b'})

    def test_result_formater_2(self):
        # results2 adds a list result, so no field is common to all.
        fields, _ = result_dump.result_formater(results2)
        self.assertEqual(fields, set())

    def test_result_formater_error(self):
        # results_error adds records with a None or missing result.
        fields, _ = result_dump.result_formater(results_error)
        self.assertEqual(fields, set())

    def test_dump_as_json(self):
        # One JSON document per output line, in input order.
        dumped = ''.join(result_dump.dump_as_json(results2))
        lines = dumped.splitlines()
        for index, line in enumerate(lines):
            self.assertDictEqual(results2[index], json.loads(line))

    def test_dump_as_json_valid(self):
        # With the second argument set, the whole output is one JSON value.
        dumped = ''.join(result_dump.dump_as_json(results2, True))
        decoded = json.loads(dumped)
        for expected, actual in zip(results2, decoded):
            self.assertDictEqual(expected, actual)

    def test_dump_as_txt(self):
        # Each line is "<url>\t<json of result>".
        dumped = ''.join(result_dump.dump_as_txt(results2))
        for index, line in enumerate(dumped.splitlines()):
            url, json_data = line.split('\t', 2)
            self.assertEqual(results2[index]['result'], json.loads(json_data))

    def test_dump_as_csv(self):
        csv_text = ''.join(result_dump.dump_as_csv(results1))
        for row in csv.reader(StringIO(csv_text)):
            self.assertEqual(len(row), 4)

    def test_dump_as_csv_case_1(self):
        # Results that are lists of dicts produce two-column rows.
        csv_text = ''.join(result_dump.dump_as_csv(result_list_error))
        for row in csv.reader(StringIO(csv_text)):
            self.assertEqual(len(row), 2)
class TestProcessor(unittest.TestCase):
    """Tests for ResultWorker fed through an in-process queue.

    The class name is kept as in the original module so existing test
    selectors keep working.  A ResultWorker runs in a background thread and
    the tests read back what it stored through a second ResultDB handle on
    the same file.
    """

    resultdb_path = './data/tests/result.db'

    @classmethod
    def setUpClass(cls):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(cls.resultdb_path)

        cls.resultdb = get_resultdb()
        cls.inqueue = Queue(10)

        def run_result_worker():
            # The worker thread uses its own ResultDB instance.
            cls.result_worker = ResultWorker(get_resultdb(), cls.inqueue)
            cls.result_worker.run()

        cls.process = run_in_thread(run_result_worker)
        # Give the thread time to create cls.result_worker.
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        if cls.process.is_alive():
            cls.result_worker.quit()
            cls.process.join(2)
        assert not cls.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def _assert_nothing_stored(self):
        """Assert that no project and no result exist in the result db."""
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_10_bad_result(self):
        # An empty result must not be stored.
        self.inqueue.put(({'project': 'test_project'}, {}))
        # Wait for the worker to consume the item; without this the
        # assertions ran before the worker saw it and passed vacuously.
        time.sleep(0.1)
        self._assert_nothing_stored()

    def test_10_bad_result_2(self):
        # A result whose task lacks 'taskid' and 'url' must not be stored.
        self.inqueue.put(({'project': 'test_project'}, {'a': 'b'}))
        time.sleep(0.1)
        self._assert_nothing_stored()

    def test_20_insert_result(self):
        data = {
            'a': 'b'
        }
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, data))
        time.sleep(0.5)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 1)
        self.assertEqual(self.resultdb.count('test_project'), 1)

        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], data)

    def test_30_overwrite(self):
        # A second result for the same taskid replaces the first one.
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, "abc"))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], "abc")

    def test_40_insert_list(self):
        # Non-dict results (here a list) round-trip unchanged.
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id2',
            'url': 'url1'
        }, ['a', 'b']))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id2')
        self.assertEqual(result['result'], ['a', 'b'])
def test_40_cli_env(self):
    # The RESULTDB environment variable selects the result database.
    # Remember any value already present so the test restores the
    # environment instead of unconditionally deleting the variable.
    previous = os.environ.get('RESULTDB')
    os.environ['RESULTDB'] = 'sqlite+resultdb://'
    try:
        ctx = run.cli.make_context('test', [], None,
                                   obj=dict(testing_mode=True))
        ctx = run.cli.invoke(ctx)

        from pyspider.database.sqlite import resultdb
        self.assertIsInstance(ctx.obj.resultdb, resultdb.ResultDB)
    finally:
        if previous is None:
            del os.environ['RESULTDB']
        else:
            os.environ['RESULTDB'] = previous
@unittest.skipIf(
    os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'),
    'no couchdb server for test.',
)
def test_60a_docker_couchdb(self):
    # Docker-link style variables pointing at a local CouchDB.
    docker_env = (
        ('COUCHDB_NAME', 'couchdb'),
        ('COUCHDB_PORT_5984_TCP_ADDR', 'localhost'),
        ('COUCHDB_PORT_5984_TCP_PORT', '5984'),
    )
    try:
        for key, value in docker_env:
            os.environ[key] = value
        context = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
        context = run.cli.invoke(context)
        # Reading the attribute is the check: any exception it raises is
        # turned into a test failure below.
        context.obj.resultdb
    except Exception as e:
        self.assertIsNone(e)
    finally:
        # Drop the variables again so later tests see a clean environment.
        for key, _ in docker_env:
            del os.environ[key]
def test_90_docker_scheduler(self):
    # With docker-link style SCHEDULER_* variables set, the webui command
    # must build its scheduler RPC proxy for "<addr>:<port>".
    addr_key = 'SCHEDULER_PORT_23333_TCP_ADDR'
    port_key = 'SCHEDULER_PORT_23333_TCP_PORT'
    try:
        os.environ[addr_key] = 'scheduler'
        os.environ[port_key] = '23333'

        root_ctx = run.cli.make_context('test', [], None,
                                        obj=dict(testing_mode=True))
        root_ctx = run.cli.invoke(root_ctx)

        webui_cmd = run.cli.get_command(root_ctx, 'webui')
        app = webui_cmd.invoke(webui_cmd.make_context('webui', [], root_ctx))

        expected_host = ':'.join((os.environ[addr_key], os.environ[port_key]))
        scheduler_rpc = app.config['scheduler_rpc']
        self.assertEqual(scheduler_rpc._ServerProxy__host, expected_host)
    except Exception as e:
        self.assertIsNone(e)
    finally:
        del os.environ[addr_key]
        del os.environ[port_key]
def test_a110_one(self):
    # Drives `pyspider one -i <script>` through a pseudo-terminal and checks
    # the text printed by its interactive shell after each command.
    #cmd = [sys.executable]
    cmd = ['coverage', 'run']
    cmd += [
        inspect.getsourcefile(run),
        'one',
        '-i',
        inspect.getsourcefile(data_sample_handler)
    ]

    pid, fd = os.forkpty()
    if pid == 0:
        # child: replace the process image.  If exec fails, leave right away
        # instead of letting the forked copy continue running the test suite.
        try:
            os.execvp(cmd[0], cmd)
        finally:
            os._exit(127)

    # parent
    def wait_text(timeout=1):
        # Collect output until the pty stays silent for `timeout` seconds,
        # reaches EOF, or the read fails.
        import select
        text = []
        while True:
            rl, wl, xl = select.select([fd], [], [], timeout)
            if not rl:
                break
            try:
                t = os.read(fd, 1024)
            except OSError:
                break
            if not t:
                break
            t = utils.text(t)
            text.append(t)
            print(t, end='')
        return ''.join(text)

    try:
        text = wait_text(3)
        self.assertIn('new task data_sample_handler:on_start', text)
        self.assertIn('pyspider shell', text)

        os.write(fd, utils.utf8('run()\n'))
        text = wait_text()
        self.assertIn('task done data_sample_handler:on_start', text)

        os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('/robots.txt', text)

        os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin))
        text = wait_text()
        if '"title": "Links"' not in text:
            # Try the sibling links page once before asserting.
            os.write(fd, utils.utf8('crawl("%s/links/10/1")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('"title": "Links"', text)

        os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('task retry', text)

        os.write(fd, b'quit_pyspider()\n')
        text = wait_text()
        self.assertIn('scheduler exiting...', text)
    finally:
        # Always release the pty and signal the child, even when an
        # assertion above fails; previously a failure leaked both.
        try:
            os.close(fd)
        finally:
            os.kill(pid, signal.SIGINT)
def test_10_send_message(self):
    # Sends 'test_message' to 'test_project' through the send_message
    # command, then reads tasks handed to the fetcher until the on_message
    # one shows up.  A 1-second queue timeout ends the wait if it never does.
    command = run.send_message
    send_ctx = command.make_context(
        'send_message',
        ['test_project', 'test_message'],
        self.ctx,
    )
    self.assertTrue(command.invoke(send_ctx))

    def next_task():
        return self.ctx.obj.scheduler2fetcher.get(timeout=1)

    task = next_task()
    while task['url'] != 'data:,on_message':
        task = next_task()
    self.assertEqual(task['process']['callback'], '_on_message')
class TestBucket(unittest.TestCase):
    """Timing-based check of the token Bucket."""

    def test_bucket(self):
        # Bucket(100, 1000): it starts at 1000 and stays there after a wait;
        # after desc(100) it reads 900 and then grows by roughly 10 per
        # 0.1 second of waiting (checked within a tolerance of 2).
        tokens = Bucket(100, 1000)
        self.assertEqual(tokens.get(), 1000)

        time.sleep(0.1)
        self.assertEqual(tokens.get(), 1000)

        tokens.desc(100)
        self.assertEqual(tokens.get(), 900)

        for expected in (910, 920):
            time.sleep(0.1)
            self.assertAlmostEqual(tokens.get(), expected, delta=2)
@classmethod
def tearDownClass(cls):
    # Ask the scheduler to quit over XML-RPC if its thread is still running,
    # wait for both threads, then remove the test data and verify that none
    # of the checked ports is still open.
    scheduler_thread = cls.process
    if scheduler_thread.is_alive():
        cls.rpc._quit()
        scheduler_thread.join(5)
    cls.xmlrpc_thread.join()
    assert not cls.process.is_alive()

    shutil.rmtree('./data/tests', ignore_errors=True)
    time.sleep(1)

    for port in (5000, cls.scheduler_xmlrpc_port, 24444, 25555):
        assert not utils.check_port_open(port)
def test_20_new_project(self):
    """Insert ``test_project`` into the project database with status TODO."""
    project_row = {
        'name': 'test_project',
        'group': 'group',
        'status': 'TODO',
        'script': 'import time\nprint(time.time())',
        'comments': 'test project',
        'rate': 1.0,
        'burst': 10,
    }
    self.projectdb.insert('test_project', project_row)


def test_30_update_project(self):
    """After switching the project to DEBUG an ``_on_get_info`` task is sent."""
    from six.moves import queue as queue_mod

    # Before the status change nothing arrives on the output queue.
    with self.assertRaises(queue_mod.Empty):
        self.scheduler2fetcher.get(timeout=1)

    self.projectdb.update('test_project', status="DEBUG")
    time.sleep(0.1)
    self.rpc.update_project()

    dispatched = self.scheduler2fetcher.get(timeout=10)
    self.assertIsNotNone(dispatched)
    # select test_project:_on_get_info data:,_on_get_info
    self.assertEqual(dispatched['taskid'], '_on_get_info')


def test_32_get_info(self):
    """Report the ``_on_get_info`` task back with an empty ``save`` track."""
    info_status = {
        'taskid': '_on_get_info',
        'project': 'test_project',
        'track': {'save': {}},
    }
    self.status_queue.put(info_status)
    # test_project on_get_info {}


def test_34_new_not_used_project(self):
    """Inserting a RUNNING project also yields an ``_on_get_info`` task."""
    project_row = {
        'name': 'test_project_not_started',
        'group': 'group',
        'status': 'RUNNING',
        'script': 'import time\nprint(time.time())',
        'comments': 'test project',
        'rate': 1.0,
        'burst': 10,
    }
    self.projectdb.insert('test_project_not_started', project_row)
    dispatched = self.scheduler2fetcher.get(timeout=5)
    # select test_project_not_started:_on_get_info data:,_on_get_info
    self.assertEqual(dispatched['taskid'], '_on_get_info')


def test_35_new_task(self):
    """A new task for ``test_project`` is dispatched with all its sections."""
    time.sleep(0.2)
    new_task = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {'age': 0},
    }
    self.newtask_queue.put(new_task)  # new task test_project:taskid url
    # task_queue = [ test_project:taskid ]

    time.sleep(0.5)
    dispatched = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid
    self.assertGreater(len(self.rpc.get_active_tasks()), 0)
    self.assertIsNotNone(dispatched)
    self.assertEqual(dispatched['taskid'], 'taskid')
    self.assertEqual(dispatched['project'], 'test_project')
    for section in ('schedule', 'fetch', 'process', 'track'):
        self.assertIn(section, dispatched)
    self.assertEqual(dispatched['fetch']['data'], 'abc')
def test_37_force_update_processing_task(self):
    """Send a ``force_update`` new-task for the task that is being processed.

    processing = [ test_project:taskid ]
    """
    forced_task = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url_force_update',
        'schedule': {
            'age': 10,
            'force_update': True,
        },
    }
    self.newtask_queue.put(forced_task)  # restart task test_project:taskid url_force_update
    time.sleep(0.2)
    # it should not block next


def test_40_taskdone_error_no_project(self):
    """A status pack for an unknown project leaves the queue size at 1.

    processing = [ test_project:taskid ]
    """
    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'no_project',
        'url': 'url',
    })  # unknown project: no_project
    time.sleep(0.1)
    self.assertEqual(self.rpc.size(), 1)


def test_50_taskdone_error_no_track(self):
    """Status packs with a missing or empty ``track`` leave the size at 1.

    processing = [ test_project:taskid ]
    """
    base_pack = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
    }

    self.status_queue.put(dict(base_pack))  # Bad status pack: 'track'
    time.sleep(0.1)
    self.assertEqual(self.rpc.size(), 1)

    self.status_queue.put(dict(base_pack, track={}))  # Bad status pack: 'process'
    time.sleep(0.1)
    self.assertEqual(self.rpc.size(), 1)


def test_60_taskdone_failed_retry(self):
    """A failed ``process`` step gets the task dispatched again within 5s.

    processing = [ test_project:taskid ]
    """
    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': False},
        },
    })  # task retry 0/3 test_project:taskid url
    retried = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
    self.assertIsNotNone(retried)


def test_70_taskdone_ok(self):
    """A fully successful status pack empties the task queue.

    processing = [ test_project:taskid ]
    """
    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': True},
        },
    })  # task done test_project:taskid url
    time.sleep(0.2)
    self.assertEqual(self.rpc.size(), 0)
def test_75_on_finished_msg(self):
    """Receive the ``on_finished`` task and acknowledge it as successful."""
    finished = self.scheduler2fetcher.get(timeout=5)
    # select test_project:on_finished data:,on_finished
    self.assertEqual(finished['taskid'], 'on_finished')

    ack = {
        'taskid': 'on_finished',
        'project': 'test_project',
        'url': 'url',
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': True},
        },
    }
    self.status_queue.put(ack)  # task done test_project:on_finished url
    time.sleep(0.2)
    self.assertEqual(self.rpc.size(), 0)


def test_80_newtask_age_ignore(self):
    """Re-submitting the task with ``age`` 30 through the queue queues nothing.

    processing = [ ]
    """
    repeat_task = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {'age': 30},
    }
    self.newtask_queue.put(repeat_task)
    time.sleep(0.1)
    self.assertEqual(self.rpc.size(), 0)


def test_82_newtask_via_rpc(self):
    """Re-submitting the task with ``age`` 30 through XML-RPC queues nothing.

    processing = [ ]
    """
    repeat_task = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {'age': 30},
    }
    self.rpc.newtask(repeat_task)
    time.sleep(0.1)
    self.assertEqual(self.rpc.size(), 0)


def test_90_newtask_with_itag(self):
    """A task carrying an ``itag`` is dispatched, then finished as in 70/75.

    task_queue = [ ]
    processing = [ ]
    """
    time.sleep(0.1)
    tagged_task = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {
            'itag': "abc",
            'retries': 1
        },
    }
    self.newtask_queue.put(tagged_task)  # restart task test_project:taskid url

    dispatched = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
    self.assertIsNotNone(dispatched)
    self.assertEqual(dispatched['taskid'], 'taskid')

    # Reuse the earlier steps to complete the task and its on_finished message.
    self.test_70_taskdone_ok()  # task done test_project:taskid url
    self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished


def test_a10_newtask_restart_by_age(self):
    """Submitting the task with ``age`` 0 gets it dispatched again."""
    restart_task = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {
            'age': 0,
            'retries': 1
        },
    }
    self.newtask_queue.put(restart_task)  # restart task test_project:taskid url
    dispatched = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
    self.assertIsNotNone(dispatched)
    self.assertEqual(dispatched['taskid'], 'taskid')
def test_a20_failed_retry(self):
    """One retry is dispatched; a second failure ends with ``on_finished``.

    processing: [ test_project:taskid ]
    """
    from six.moves import queue as queue_mod

    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': False},
        },
    })  # task retry 0/1 test_project:taskid url
    retried = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
    self.assertIsNotNone(retried)
    self.assertEqual(retried['taskid'], 'taskid')

    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'track': {
            'fetch': {'ok': False},
            'process': {'ok': False},
        },
    })  # task failed test_project:taskid url

    self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

    # After on_finished nothing else may be dispatched.
    with self.assertRaises(queue_mod.Empty):
        self.scheduler2fetcher.get(timeout=5)


def test_a30_task_verify(self):
    """``rpc.newtask`` is falsy for malformed tasks and truthy for a valid one."""
    rejected_payloads = (
        # taskid not in task: {'project': 'test_project', 'url': 'url'}
        {'project': 'test_project', 'url': 'url'},
        # project not in task: {'url': 'url', 'taskid': 'taskid#'}
        {'taskid': 'taskid#', 'url': 'url'},
        # url not in task: {'project': 'test_project', 'taskid': 'taskid#'}
        {'taskid': 'taskid#', 'project': 'test_project'},
        # unknown project: not_exist_project
        {'taskid': 'taskid#', 'project': 'not_exist_project', 'url': 'url'},
    )
    for payload in rejected_payloads:
        self.assertFalse(self.rpc.newtask(payload))

    accepted = {
        'taskid': 'taskid#',
        'project': 'test_project',
        'url': 'url',
    }
    self.assertTrue(self.rpc.newtask(accepted))  # new task test_project:taskid# url


def test_a40_success_recrawl(self):
    """With ``auto_recrawl`` a successfully finished task is dispatched again.

    task_queue = [ test_project:taskid# ]
    """
    recrawl_schedule = {
        'age': 0,
        'retries': 1,
        'auto_recrawl': True,
    }
    self.newtask_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': dict(recrawl_schedule),
    })  # restart task test_project:taskid url

    first = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid# url
    second = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
    self.assertIsNotNone(first)
    self.assertIsNotNone(second)
    # The two tasks may arrive in either order.
    self.assertTrue(first['taskid'] == 'taskid#' or second['taskid'] == 'taskid#')

    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': dict(recrawl_schedule),
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': True},
        },
    })  # task done test_project:taskid url
    recrawled = self.scheduler2fetcher.get(timeout=10)
    self.assertIsNotNone(recrawled)
def test_a50_failed_recrawl(self):
    """With ``auto_recrawl`` each failed status is followed by a new dispatch.

    time_queue = [ test_project:taskid ]
    scheduler2fetcher = [ test_project:taskid# ]
    processing = [ test_project:taskid# ]
    """
    failed_status = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': {
            'age': 0,
            'retries': 1,
            'auto_recrawl': True,
        },
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': False},
        },
    }
    for _ in range(3):
        self.status_queue.put(failed_status)
        # not processing pack: test_project:taskid url
        # select test_project:taskid url
        # task retry 0/1 test_project:taskid url
        # select test_project:taskid url
        # task retry 0/1 test_project:taskid url
        # select test_project:taskid url
        dispatched = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(dispatched)
        self.assertEqual(dispatched['taskid'], 'taskid')


def test_a60_disable_recrawl(self):
    """A success status without ``auto_recrawl`` causes no further dispatch.

    time_queue = [ test_project:taskid ]
    scheduler2fetcher = [ test_project:taskid# ]
    processing = [ test_project:taskid# ]
    """
    from six.moves import queue as queue_mod

    self.status_queue.put({
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': {
            'age': 0,
            'retries': 1,
        },
        'track': {
            'fetch': {'ok': True},
            'process': {'ok': True},
        },
    })  # task done test_project:taskid url

    with self.assertRaises(queue_mod.Empty):
        self.scheduler2fetcher.get(timeout=5)
def test_38_cancel_task(self):
    """A delayed task raises the queue size by one; cancelling restores it."""
    size_before = self.rpc.size()
    delayed_task = {
        'taskid': 'taskid_to_cancel',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {
            'age': 0,
            'exetime': time.time() + 30
        },
    }
    self.newtask_queue.put(delayed_task)  # new task test_project:taskid_to_cancel url
    # task_queue = [ test_project:taskid_to_cancel ]

    time.sleep(0.2)
    self.assertEqual(self.rpc.size(), size_before + 1)

    cancel_task = {
        'taskid': 'taskid_to_cancel',
        'project': 'test_project',
        'url': 'url',
        'fetch': {'data': 'abc'},
        'process': {'data': 'abc'},
        'schedule': {
            'force_update': True,
            'age': 0,
            'cancel': True
        },
    }
    self.newtask_queue.put(cancel_task)  # new cancel test_project:taskid_to_cancel url
    # task_queue = [ ]

    time.sleep(0.2)
    self.assertEqual(self.rpc.size(), size_before)


def test_x10_inqueue_limit(self):
    """Of 20 tasks submitted to a rate-0 project only 10 end up queued."""
    self.projectdb.insert('test_inqueue_project', {
        'name': 'test_inqueue_project',
        'group': 'group',
        'status': 'DEBUG',
        'script': 'import time\nprint(time.time())',
        'comments': 'test project',
        'rate': 0,
        'burst': 0,
    })
    time.sleep(0.1)
    size_before = self.rpc.size()
    for index in range(20):
        self.newtask_queue.put({
            'taskid': 'taskid%d' % index,
            'project': 'test_inqueue_project',
            'url': 'url',
            'schedule': {
                'age': 3000,
                'force_update': True,
            },
        })
    time.sleep(1)
    self.assertEqual(self.rpc.size() - size_before, 10)


def test_x20_delete_project(self):
    """Status STOP plus group ``lock,delete`` removes the project and tasks."""
    name = 'test_inqueue_project'
    self.assertIsNotNone(self.projectdb.get(name))
    self.projectdb.update(name, status="STOP", group="lock,delete")
    time.sleep(1)
    self.assertIsNone(self.projectdb.get(name))
    self.taskdb._list_project()
    self.assertIsNone(self.taskdb.get_task(name, 'taskid1'))
    self.assertNotIn(name, self.rpc.counter('5m', 'sum'))


def test_z10_startup(self):
    """The scheduler thread is still alive after all previous tests."""
    self.assertTrue(self.process.is_alive())


def test_z20_quit(self):
    """``_quit`` over RPC stops the thread; the stored task status is SUCCESS."""
    self.rpc._quit()
    time.sleep(0.2)
    self.assertFalse(self.process.is_alive())
    stored = self.taskdb.get_task('test_project', 'taskid')
    self.assertEqual(stored['status'], self.taskdb.SUCCESS)
class TestProject(unittest.TestCase):
    """Exercise ``Project.paused`` by feeding packs into ``active_tasks``.

    The test methods share one ``Project`` instance created in ``setUpClass``
    and rely on running in name order.
    """

    # A newly scheduled task (no ``track`` section).
    task_pack = {
        'type': Scheduler.TASK_PACK,
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {
            'data': 'abc',
        },
        'process': {
            'data': 'abc',
        },
        'schedule': {
            'age': 0,
        },
    }
    # A status report whose fetch and process steps both succeeded.
    status_ok_pack = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': {
            'age': 0,
            'retries': 1,
        },
        'track': {
            'fetch': {
                'ok': True
            },
            'process': {
                'ok': True
            },
        }
    }
    # A status report whose fetch and process steps both failed.
    status_fail_pack = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': {
            'age': 0,
            'retries': 1,
        },
        'track': {
            'fetch': {
                'ok': False
            },
            'process': {
                'ok': False
            },
        }
    }

    @classmethod
    def setUpClass(self):
        """Build a Scheduler without databases or queues and one Project."""
        self.scheduler = Scheduler(taskdb=None, projectdb=None, newtask_queue=None,
                                   status_queue=None, out_queue=None)
        self.scheduler.PAUSE_TIME = 2
        self.project = Project(self.scheduler, {
            'name': 'test_project_not_started',
            'group': 'group',
            'status': 'RUNNING',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
            'updatetime': time.time(),
        })

    def _push(self, pack, count=1):
        """Prepend ``count`` shallow copies of ``pack`` to ``active_tasks``."""
        for _ in range(count):
            self.project.active_tasks.appendleft((time.time(), dict(pack)))

    def test_pause_10_unpaused(self):
        """A fresh project is not paused."""
        self.assertFalse(self.project.paused)

    def test_pause_20_no_enough_fail_tasks(self):
        """Failures interrupted by a success do not pause the project."""
        self._push(self.task_pack, 3)
        self.assertFalse(self.project.paused)

        self._push(self.status_ok_pack)
        self._push(self.status_fail_pack, self.scheduler.FAIL_PAUSE_NUM - 5)
        self.assertFalse(self.project.paused)

        self._push(self.status_fail_pack, 5)
        self._push(self.status_ok_pack)
        self.assertFalse(self.project.paused)

        self._push(self.task_pack, self.scheduler.FAIL_PAUSE_NUM)
        self.assertFalse(self.project.paused)

    def test_pause_30_paused(self):
        """``FAIL_PAUSE_NUM`` failures followed by new tasks pause the project."""
        self._push(self.status_fail_pack, self.scheduler.FAIL_PAUSE_NUM)
        self._push(self.task_pack, self.scheduler.FAIL_PAUSE_NUM)
        self.assertTrue(self.project.paused)

    def test_pause_40_unpause_checking(self):
        """After sleeping longer than ``PAUSE_TIME`` (2s) ``paused`` is False."""
        time.sleep(3)
        self.assertFalse(self.project.paused)

    def test_pause_50_paused_again(self):
        """``UNPAUSE_CHECK_NUM`` further failures pause the project again."""
        self._push(self.status_fail_pack, self.scheduler.UNPAUSE_CHECK_NUM)
        self.assertTrue(self.project.paused)

    def test_pause_60_unpause_checking(self):
        """After sleeping longer than ``PAUSE_TIME`` (2s) ``paused`` is False."""
        time.sleep(3)
        self.assertFalse(self.project.paused)

    def test_pause_70_unpaused(self):
        """With a success ahead of the failures the project stays unpaused."""
        self._push(self.status_ok_pack)
        self._push(self.status_fail_pack, self.scheduler.UNPAUSE_CHECK_NUM)
        self._push(self.task_pack, self.scheduler.FAIL_PAUSE_NUM)
        self.assertFalse(self.project.paused)
        self.assertFalse(self.project._paused)

    def test_pause_x_disable_auto_pause(self):
        """``FAIL_PAUSE_NUM = 0`` means 100 failures do not pause the project."""
        fail_pause_num = self.scheduler.FAIL_PAUSE_NUM
        self.scheduler.FAIL_PAUSE_NUM = 0
        try:
            self._push(self.status_fail_pack, 100)
            self.assertFalse(self.project.paused)
        finally:
            # Restore the shared scheduler setting even if the assertion fails.
            self.scheduler.FAIL_PAUSE_NUM = fail_pause_num
class TestTaskQueue(unittest.TestCase):
    """
    TestTaskQueue
    """

    def test_task_queue_in_time_order(self):
        """Tasks come out highest priority first, FIFO within one priority."""
        tq = TaskQueue(rate=300, burst=1000)
        expected_by_priority = {}
        task_by_id = {}

        # 100 tasks, ten per priority level 0..9, all immediately runnable.
        for index in range(100):
            queued = InQueueTask(str(index), priority=int(index // 10), exetime=0)
            tq.put(queued.taskid, queued.priority, queued.exetime)
            if queued.priority not in expected_by_priority:
                expected_by_priority[queued.priority] = Queue.Queue()
            expected_by_priority[queued.priority].put(queued)
            task_by_id[queued.taskid] = queued

        for index in range(100):
            got_id = tq.get()
            got_task = task_by_id[got_id]
            expected = expected_by_priority[got_task.priority].get()
            self.assertEqual(got_id, expected.taskid)
            # Priority 9 is drained first, then 8, and so on down to 0.
            self.assertEqual(got_task.priority, int(9 - index // 10))

        self.assertEqual(tq.size(), 100)
        self.assertEqual(tq.priority_queue.qsize(), 0)
        self.assertEqual(tq.processing.qsize(), 100)
        for leftover in six.itervalues(expected_by_priority):
            self.assertEqual(leftover.qsize(), 0)
class TestTimeQueue(unittest.TestCase):
    """Checks how ``TaskQueue`` handles tasks whose ``exetime`` is in the future."""

    def _assert_sizes(self, tq, priority, processing, timed):
        """Compare the sizes of the three internal queues of ``tq``."""
        self.assertEqual(tq.priority_queue.qsize(), priority)
        self.assertEqual(tq.processing.qsize(), processing)
        self.assertEqual(tq.time_queue.qsize(), timed)

    def test_time_queue(self):
        """Future tasks wait in ``time_queue`` and move over on ``check_update``."""
        tq = TaskQueue(rate=300, burst=1000)
        interval = 5.0 / 1000

        # Part 1: the time queue orders by exetime only, ignoring priority.
        insertion_order = Queue.Queue()
        for index in range(20):
            queued = InQueueTask(str(index), priority=int(index // 10),
                                 exetime=time.time() + (index + 1) * interval)
            tq.put(queued.taskid, queued.priority, queued.exetime)
            insertion_order.put(queued)
        self._assert_sizes(tq, 0, 0, 20)

        for _ in range(20):
            expected = insertion_order.get()
            actual = tq.time_queue.get()
            self.assertEqual(expected.taskid, actual.taskid)
        self._assert_sizes(tq, 0, 0, 0)

        # Part 2: once due, tasks are handed to the priority queue.
        expected_by_priority = {}
        task_by_id = {}
        for index in range(20):
            priority = int(index // 10)
            queued = InQueueTask(str(index), priority=priority,
                                 exetime=time.time() + (index + 1) * interval)
            tq.put(queued.taskid, queued.priority, queued.exetime)
            task_by_id[queued.taskid] = queued
            if priority not in expected_by_priority:
                expected_by_priority[priority] = Queue.Queue()
            expected_by_priority[priority].put(queued)
        self._assert_sizes(tq, 0, 0, 20)

        time.sleep(20 * interval)  # wait until the last exetime has passed
        tq.check_update()
        self._assert_sizes(tq, 20, 0, 0)

        for _ in range(20):
            got_id = tq.get()
            got_task = task_by_id[got_id]
            expected = expected_by_priority[got_task.priority].get()
            self.assertEqual(got_task.taskid, expected.taskid)
        self._assert_sizes(tq, 0, 20, 0)
class TestFetcher(unittest.TestCase):
    """Tests for helpers in ``pyspider.libs.utils``."""

    def test_readonlydict(self):
        """Assigning a new key on a ``ReadOnlyDict`` raises."""
        source = dict(a='a', b=123)
        source['c'] = self
        frozen = utils.ReadOnlyDict(source)
        with self.assertRaises(Exception):
            frozen['d'] = 9

    def test_getitem(self):
        """``getitem`` returns the default (None if omitted) on a failed lookup."""
        l = [1, 2]
        list_cases = (
            ((0,), 1),
            ((1,), 2),
            ((3,), None),
            ((3, 9), 9),
            (('key',), None),
            (('key', 8), 8),
        )
        for args, expected in list_cases:
            self.assertEqual(utils.getitem(l, *args), expected)

        data = dict(a='a', b=123)
        dict_cases = (
            (('a',), 'a'),
            (('b',), 123),
            (('c',), None),
            (('c', 9), 9),
        )
        for args, expected in dict_cases:
            self.assertEqual(utils.getitem(data, *args), expected)

    def test_format_data(self):
        """``format_date`` output for offsets from seconds up to ~11 months."""
        now = time.time()
        minute, hour, day = 60, 60 * 60, 24 * 60 * 60

        exact_cases = (
            (now - 30, '30 seconds ago'),
            (now - minute, '1 minute ago'),
            (now - 2 * minute, '2 minutes ago'),
            (now - 30 * minute, '30 minutes ago'),
            (now - hour, '1 hour ago'),
            (1963475336, 'Mar 21, 2032 at 9:48'),
            (now - 12 * hour, '12 hours ago'),
        )
        for timestamp, expected in exact_cases:
            self.assertEqual(utils.format_date(timestamp), expected)

        pattern_cases = (
            (1, r'^yesterday at \d{1,2}:\d{2}$'),
            (2, r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$'),
            (3, r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$'),
            (4, r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$'),
            (5, r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$'),
            (333, r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$'),
            (334, r'^[A-Z][a-z]+ \d{1,2}, \d{4} at \d{1,2}:\d{2}$'),
        )
        for days_ago, pattern in pattern_cases:
            self.assertRegex(utils.format_date(now - days_ago * day), pattern)
@unittest.skipIf(sys.version_info >= (3, 6), "easywebdav doesn't support python 3.6")
class TestWebDav(unittest.TestCase):
    """WebDAV tests against a webui started with credentials but no --need-auth.

    ``webdav`` is an anonymous client, ``webdav_up`` logs in as binux/4321.
    The test methods rely on running in name order.
    """

    @classmethod
    def setUpClass(self):
        """Start the webui in a thread and create both easywebdav clients."""
        import easywebdav

        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=utils.ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.webui.make_context('webui', [
            '--username', 'binux',
            '--password', '4321',
        ], self.ctx)
        self.app = run.webui.invoke(ctx)
        self.app_thread = utils.run_in_thread(self.app.run)
        time.sleep(5)

        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')
        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',
                                            username='binux', password='4321')

    @classmethod
    def tearDownClass(self):
        """Quit every started instance and check that no port is left open."""
        for each in self.ctx.obj.instances:
            each.quit()
        self.app_thread.join()
        time.sleep(1)

        for port in (5000, 23333, 24444, 25555, 14887):
            assert not utils.check_port_open(port)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def _download(self, client, name):
        """Download ``name`` through ``client`` and return the raw content.

        The in-memory buffer is closed even when the download raises.
        """
        buf = BytesIO()
        try:
            client.download(name, buf)
            return buf.getvalue()
        finally:
            buf.close()

    def test_10_ls(self):
        """The listing has a single entry before anything is uploaded."""
        self.assertEqual(len(self.webdav.ls()), 1)

    def test_20_create_error(self):
        """Uploading to 'bad_file_name' or 'bad.file.name' is refused."""
        import easywebdav

        source = inspect.getsourcefile(data_sample_handler)
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.upload(source, 'bad_file_name')
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.upload(inspect.getsourcefile(data_sample_handler),
                               'bad.file.name')

    def test_30_create_ok(self):
        """Two ``.py`` uploads succeed and the listing grows to three entries."""
        self.webdav.upload(inspect.getsourcefile(data_handler), 'handler.py')
        self.webdav.upload(inspect.getsourcefile(data_sample_handler),
                           'sample_handler.py')
        self.assertEqual(len(self.webdav.ls()), 3)

    def test_40_get_404(self):
        """Downloading a name that was never uploaded fails."""
        import easywebdav

        with self.assertRaises(easywebdav.OperationFailed):
            self._download(self.webdav, 'not_exitst')

    def test_50_get(self):
        """Both uploaded files download with their original source text."""
        content = self._download(self.webdav, 'handler.py')
        self.assertEqual(utils.text(inspect.getsource(data_handler)),
                         utils.text(content))

        content = self._download(self.webdav, 'sample_handler.py')
        self.assertEqual(utils.text(inspect.getsource(data_sample_handler)),
                         utils.text(content))

    def test_60_edit(self):
        """Overwrite ``sample_handler.py`` with the source of ``data_handler``."""
        self.webdav.upload(inspect.getsourcefile(data_handler), 'sample_handler.py')

    def test_70_get(self):
        """The overwritten file now has the content of ``data_handler``."""
        content = self._download(self.webdav, 'sample_handler.py')
        self.assertEqual(utils.text(inspect.getsource(data_handler)),
                         utils.text(content))

    def test_80_password(self):
        """Once the group is 'lock', only the logged-in client can upload."""
        import requests
        import easywebdav

        rv = requests.post('http://localhost:5000/update', data={
            'name': 'group',
            'value': 'lock',
            'pk': 'sample_handler',
        })
        self.assertEqual(rv.status_code, 200)

        source = inspect.getsourcefile(data_sample_handler)
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.upload(source, 'sample_handler.py')
        self.webdav_up.upload(inspect.getsourcefile(data_sample_handler),
                              'sample_handler.py')
@unittest.skipIf(sys.version_info >= (3, 6), "easywebdav doesn't support python 3.6")
class TestWebDavNeedAuth(unittest.TestCase):
    """WebDAV tests against a webui started with ``--need-auth``.

    ``webdav`` is an anonymous client, ``webdav_up`` logs in as binux/4321.
    The test methods rely on running in name order.
    """

    @classmethod
    def setUpClass(self):
        """Start the webui with --need-auth and create both easywebdav clients."""
        import easywebdav

        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=utils.ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.webui.make_context('webui', [
            '--username', 'binux',
            '--password', '4321',
            '--need-auth',
        ], self.ctx)
        self.app = run.webui.invoke(ctx)
        self.app_thread = utils.run_in_thread(self.app.run)
        time.sleep(5)

        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')
        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',
                                            username='binux', password='4321')

    @classmethod
    def tearDownClass(self):
        """Quit every started instance and check that no port is left open."""
        for each in self.ctx.obj.instances:
            each.quit()
        self.app_thread.join()
        time.sleep(1)

        for port in (5000, 23333, 24444, 25555, 14887):
            assert not utils.check_port_open(port)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def _download(self, client, name):
        """Download ``name`` through ``client`` and return the raw content.

        The in-memory buffer is closed even when the download raises.
        """
        buf = BytesIO()
        try:
            client.download(name, buf)
            return buf.getvalue()
        finally:
            buf.close()

    def test_10_ls(self):
        """Anonymous listing fails; the logged-in client sees one entry."""
        import easywebdav

        with self.assertRaises(easywebdav.OperationFailed):
            self.assertEqual(len(self.webdav.ls()), 1)
        self.assertEqual(len(self.webdav_up.ls()), 1)

    def test_30_create_ok(self):
        """The logged-in client can upload; the listing grows to two entries."""
        self.webdav_up.upload(inspect.getsourcefile(data_handler), 'handler.py')
        self.assertEqual(len(self.webdav_up.ls()), 2)

    def test_50_get(self):
        """Anonymous download fails; the logged-in download matches the source."""
        import easywebdav

        with self.assertRaises(easywebdav.OperationFailed):
            self._download(self.webdav, 'handler.py')

        content = self._download(self.webdav_up, 'handler.py')
        self.assertEqual(utils.text(inspect.getsource(data_handler)),
                         utils.text(content))
@classmethod
def setUpClass(self):
    """Start httpbin, every pyspider component and a Flask test client.

    Worker threads are collected in ``self.threads`` so ``tearDownClass`` can
    join them.
    """
    shutil.rmtree('./data/tests', ignore_errors=True)
    os.makedirs('./data/tests')

    import tests.data_test_webpage
    import httpbin
    from pyspider.webui import bench_test  # flake8: noqa

    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887,
                                                  passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'

    database_args = [
        '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db',
        '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db',
        '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db',
    ]
    cli_ctx = run.cli.make_context('test', database_args, None,
                                   obj=ObjectDict(testing_mode=True))
    self.ctx = run.cli.invoke(cli_ctx)

    self.threads = []

    sub_ctx = run.scheduler.make_context('scheduler', [], self.ctx)
    self.scheduler = scheduler = run.scheduler.invoke(sub_ctx)
    self.threads.append(run_in_thread(scheduler.xmlrpc_run))
    self.threads.append(run_in_thread(scheduler.run))

    sub_ctx = run.fetcher.make_context('fetcher', [
        '--xmlrpc-port', '24444',
    ], self.ctx)
    fetcher = run.fetcher.invoke(sub_ctx)
    self.threads.append(run_in_thread(fetcher.xmlrpc_run))
    self.threads.append(run_in_thread(fetcher.run))

    sub_ctx = run.processor.make_context('processor', [], self.ctx)
    processor = run.processor.invoke(sub_ctx)
    self.threads.append(run_in_thread(processor.run))

    sub_ctx = run.result_worker.make_context('result_worker', [], self.ctx)
    result_worker = run.result_worker.invoke(sub_ctx)
    self.threads.append(run_in_thread(result_worker.run))

    sub_ctx = run.webui.make_context('webui', [
        '--scheduler-rpc', 'http://localhost:23333/'
    ], self.ctx)
    app = run.webui.invoke(sub_ctx)
    app.debug = True
    self.app = app.test_client()
    self.rpc = app.config['scheduler_rpc']

    time.sleep(1)
@classmethod
def tearDownClass(self):
    """Quit all components, join the threads and check no port stays open."""
    for instance in self.ctx.obj.instances:
        instance.quit()
    time.sleep(1)

    for worker in self.threads:
        worker.join()

    self.httpbin_thread.terminate()
    self.httpbin_thread.join()

    for port in (5000, 23333, 24444, 25555, 14887):
        assert not utils.check_port_open(port)

    shutil.rmtree('./data/tests', ignore_errors=True)


def test_10_index_page(self):
    """The index page returns 200 and contains 'dashboard'."""
    rv = self.app.get('/')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'dashboard', rv.data)


def test_20_debug(self):
    """The debug page embeds the task and script as JSON in JS variables."""
    rv = self.app.get('/debug/test_project')
    self.assertEqual(rv.status_code, 200)
    for marker in (b'debugger', b'var task_content = ', b'var script_content = '):
        self.assertIn(marker, rv.data)

    task_match = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(task_match)
    self.assertIn('test_project', json.loads(task_match.group(1)))

    script_match = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(script_match)
    self.assertIn('__START_URL__', json.loads(script_match.group(1)))
def test_25_debug_post(self):
    """POST to the debug page and keep the task and script for later tests."""
    form = {
        'project-name': 'other_project',
        'start-urls': 'http://127.0.0.1:14887/pyspider/test.html',
        'script-mode': 'script',
    }
    rv = self.app.post('/debug/test_project', data=form)
    self.assertEqual(rv.status_code, 200)
    for marker in (b'debugger', b'var task_content = ', b'var script_content = '):
        self.assertIn(marker, rv.data)

    task_match = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(task_match)
    self.assertIn('test_project', task_match.group(1))
    # Stored on the class so the following tests can reuse it.
    self.__class__.task_content = json.loads(task_match.group(1))

    script_match = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(script_match)
    self.assertIn('127.0.0.1:14887', script_match.group(1))
    self.__class__.script_content = json.loads(script_match.group(1))


def test_30_run(self):
    """Running the stored task yields follow-up tasks; the first is kept."""
    form = {
        'script': self.script_content,
        'task': self.task_content
    }
    rv = self.app.post('/debug/test_project/run', data=form)
    self.assertEqual(rv.status_code, 200)
    result = json.loads(utils.text(rv.data))
    self.assertIn(b'follows', rv.data)
    self.assertGreater(len(result['follows']), 0)
    self.__class__.task_content2 = result['follows'][0]


def test_32_run_bad_task(self):
    """A corrupted task produces logs and no follow-up tasks."""
    form = {
        'script': self.script_content,
        'task': self.task_content+'asdfasdf312!@#'
    }
    rv = self.app.post('/debug/test_project/run', data=form)
    self.assertEqual(rv.status_code, 200)
    result = json.loads(utils.text(rv.data))
    self.assertGreater(len(result['logs']), 0)
    self.assertEqual(len(result['follows']), 0)


def test_33_run_bad_script(self):
    """A corrupted script produces logs and no follow-up tasks."""
    form = {
        'script': self.script_content+'adfasfasdf',
        'task': self.task_content
    }
    rv = self.app.post('/debug/test_project/run', data=form)
    self.assertEqual(rv.status_code, 200)
    result = json.loads(utils.text(rv.data))
    self.assertGreater(len(result['logs']), 0)
    self.assertEqual(len(result['follows']), 0)


def test_35_run_http_task(self):
    """The follow-up task saved by test_30 can be run as well."""
    form = {
        'script': self.script_content,
        'task': json.dumps(self.task_content2)
    }
    rv = self.app.post('/debug/test_project/run', data=form)
    self.assertEqual(rv.status_code, 200)
    result = json.loads(utils.text(rv.data))
    self.assertIn('follows', result)
def test_40_save(self):
    """Saving the script answers 200 with 'ok' in the body."""
    rv = self.app.post('/debug/test_project/save', data={
        'script': self.script_content,
    })
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'ok', rv.data)


def test_42_get(self):
    """The saved script is returned unchanged by the ``get`` endpoint."""
    rv = self.app.get('/debug/test_project/get')
    self.assertEqual(rv.status_code, 200)
    payload = json.loads(utils.text(rv.data))
    self.assertIn('script', payload)
    self.assertEqual(payload['script'], self.script_content)


def test_45_run_with_saved_script(self):
    """With ``webdav_mode`` and an empty script the run still yields follows."""
    form = {
        'webdav_mode': 'true',
        'script': '',
        'task': self.task_content
    }
    rv = self.app.post('/debug/test_project/run', data=form)
    self.assertEqual(rv.status_code, 200)
    result = json.loads(utils.text(rv.data))
    self.assertIn(b'follows', rv.data)
    self.assertGreater(len(result['follows']), 0)
    self.__class__.task_content2 = result['follows'][0]


def test_50_index_page_list(self):
    """The index page now lists ``test_project``."""
    rv = self.app.get('/')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'"test_project"', rv.data)


def test_52_change_status(self):
    """Updating the status to RUNNING answers 'ok'."""
    form = {
        'name': 'status',
        'value': 'RUNNING',
        'pk': 'test_project'
    }
    rv = self.app.post('/update', data=form)
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'ok', rv.data)


def test_55_reopen(self):
    """The debug page can be opened again for the saved project."""
    rv = self.app.get('/debug/test_project')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'debugger', rv.data)


def test_57_resave(self):
    """Saving the script a second time answers 'ok'."""
    rv = self.app.post('/debug/test_project/save', data={
        'script': self.script_content,
    })
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'ok', rv.data)


def test_58_index_page_list(self):
    """After the re-save the index page shows 'CHECKING'."""
    rv = self.app.get('/')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'CHECKING', rv.data)


def test_60_change_rate(self):
    """Updating the rate to '1/4' answers 'ok'."""
    form = {
        'name': 'rate',
        'value': '1/4',
        'pk': 'test_project'
    }
    rv = self.app.post('/update', data=form)
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'ok', rv.data)


def test_70_change_status(self):
    """Updating the status to RUNNING again answers 'ok'."""
    form = {
        'name': 'status',
        'value': 'RUNNING',
        'pk': 'test_project'
    }
    rv = self.app.post('/update', data=form)
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'ok', rv.data)
self.assertEqual(rv.status_code, 200) self.assertIn(b'ok', rv.data) def test_80_change_group(self): rv = self.app.post('/update', data={ 'name': 'group', 'value': 'test_binux', 'pk': 'test_project' }) self.assertEqual(rv.status_code, 200) self.assertIn(b'ok', rv.data) rv = self.app.get('/') self.assertEqual(rv.status_code, 200) self.assertIn(b'test_binux', rv.data) def test_90_run(self): time.sleep(0.5) rv = self.app.post('/run', data={ 'project': 'test_project', }) self.assertEqual(rv.status_code, 200) self.assertEqual(json.loads(utils.text(rv.data))['result'], True) def test_a10_counter(self): for i in range(30): time.sleep(1) if self.rpc.counter('5m', 'sum')\ .get('test_project', {}).get('success', 0) > 5: break rv = self.app.get('/counter') self.assertEqual(rv.status_code, 200) data = json.loads(utils.text(rv.data)) self.assertGreater(len(data), 0) self.assertGreater(data['test_project']['5m']['success'], 3) self.assertGreater(data['test_project']['1h']['success'], 3) self.assertGreater(data['test_project']['1d']['success'], 3) self.assertGreater(data['test_project']['all']['success'], 3) def test_a15_queues(self): rv = self.app.get('/queues') self.assertEqual(rv.status_code, 200) data = json.loads(utils.text(rv.data)) self.assertGreater(len(data), 0) self.assertIn('scheduler2fetcher', data) self.assertIn('fetcher2processor', data) self.assertIn('processor2result', data) self.assertIn('newtask_queue', data) self.assertIn('status_queue', data) def test_a20_tasks(self): rv = self.app.get('/tasks') self.assertEqual(rv.status_code, 200, rv.data) self.assertIn(b'SUCCESS', rv.data) self.assertNotIn(b'>ERROR', rv.data) m = re.search(r'/task/test_project:[^"]+', utils.text(rv.data)) self.assertIsNotNone(m) self.__class__.task_url = m.group(0) self.assertIsNotNone(self.task_url) m = re.search(r'/debug/test_project[^"]+', utils.text(rv.data)) self.assertIsNotNone(m) self.__class__.debug_task_url = m.group(0) self.assertIsNotNone(self.debug_task_url) rv = 
self.app.get('/tasks?project=test_project') self.assertEqual(rv.status_code, 200) self.assertIn(b'SUCCESS', rv.data) self.assertNotIn(b'>ERROR', rv.data) def test_a22_active_tasks(self): rv = self.app.get('/active_tasks') data = json.loads(utils.text(rv.data)) track = False self.assertGreater(len(data), 0) for task in data: for k in ('taskid', 'project', 'url', 'updatetime'): self.assertIn(k, task) if task.get('track'): track = True self.assertIn('fetch', task['track']) self.assertIn('ok', task['track']['fetch']) self.assertIn('time', task['track']['fetch']) self.assertIn('process', task['track']) self.assertIn('ok', task['track']['process']) self.assertIn('time', task['track']['process']) self.assertTrue(track) def test_a24_task(self): rv = self.app.get(self.task_url) self.assertEqual(rv.status_code, 200) self.assertIn(b'lastcrawltime', rv.data) def test_a25_task_json(self): rv = self.app.get(self.task_url + '.json') self.assertEqual(rv.status_code, 200) self.assertIn('status_string', json.loads(utils.text(rv.data))) def test_a26_debug_task(self): rv = self.app.get(self.debug_task_url) self.assertEqual(rv.status_code, 200) def test_a30_results(self): rv = self.app.get('/results?project=test_project') self.assertEqual(rv.status_code, 200) self.assertIn(b'url', rv.data) self.assertIn(b'open-url', rv.data) def test_a30_export_json(self): rv = self.app.get('/results/dump/test_project.json') self.assertEqual(rv.status_code, 200) self.assertIn(b'"taskid":', rv.data) def test_a32_export_json_style_full(self): rv = self.app.get('/results/dump/test_project.json?style=full') self.assertEqual(rv.status_code, 200) data = json.loads(rv.data.decode('utf8')) self.assertGreater(len(data), 1) def test_a34_export_json_style_full_limit_1(self): rv = self.app.get('/results/dump/test_project.json?style=full&limit=1&offset=1') self.assertEqual(rv.status_code, 200) data = json.loads(rv.data.decode('utf8')) self.assertEqual(len(data), 1) def test_a40_export_url_json(self): rv = 
self.app.get('/results/dump/test_project.txt') self.assertEqual(rv.status_code, 200) self.assertIn(b'"url":', rv.data) def test_a50_export_csv(self): rv = self.app.get('/results/dump/test_project.csv') self.assertEqual(rv.status_code, 200) self.assertIn(b'url,title,url', rv.data) def test_a60_fetch_via_cannot_connect_fetcher(self): ctx = run.webui.make_context('webui', [ '--fetcher-rpc', 'http://localhost:20000/', ], self.ctx) app = run.webui.invoke(ctx) app = app.test_client() rv = app.post('/debug/test_project/run', data={ 'script': self.script_content, 'task': self.task_content }) self.assertEqual(rv.status_code, 200) data = json.loads(utils.text(rv.data)) self.assertGreater(len(data['logs']), 0) self.assertEqual(len(data['follows']), 0) def test_a70_fetch_via_fetcher(self): ctx = run.webui.make_context('webui', [ '--fetcher-rpc', 'http://localhost:24444/', ], self.ctx) app = run.webui.invoke(ctx) app = app.test_client() rv = app.post('/debug/test_project/run', data={ 'script': self.script_content, 'task': self.task_content }) self.assertEqual(rv.status_code, 200) data = json.loads(utils.text(rv.data)) self.assertEqual(len(data['logs']), 0, data['logs']) self.assertIn(b'follows', rv.data) self.assertGreater(len(data['follows']), 0) def test_h000_auth(self): ctx = run.webui.make_context('webui', [ '--scheduler-rpc', 'http://localhost:23333/', '--username', 'binux', '--password', '4321', ], self.ctx) app = run.webui.invoke(ctx) self.__class__.app = app.test_client() self.__class__.rpc = app.config['scheduler_rpc'] def test_h005_no_such_project(self): rv = self.app.post('/update', data={ 'name': 'group', 'value': 'lock', 'pk': 'not_exist_project' }) self.assertEqual(rv.status_code, 404) def test_h005_unknown_field(self): rv = self.app.post('/update', data={ 'name': 'unknown_field', 'value': 'lock', 'pk': 'test_project' }) self.assertEqual(rv.status_code, 400) def test_h005_rate_wrong_format(self): rv = self.app.post('/update', data={ 'name': 'rate', 'value': 'xxx', 
'pk': 'test_project' }) self.assertEqual(rv.status_code, 400) def test_h010_change_group(self): rv = self.app.post('/update', data={ 'name': 'group', 'value': 'lock', 'pk': 'test_project' }) self.assertEqual(rv.status_code, 200) self.assertIn(b'ok', rv.data) rv = self.app.get('/') self.assertEqual(rv.status_code, 200) self.assertIn(b'lock', rv.data) def test_h020_change_group_lock_failed(self): rv = self.app.post('/update', data={ 'name': 'group', 'value': '', 'pk': 'test_project' }) self.assertEqual(rv.status_code, 401) def test_h020_change_group_lock_ok(self): rv = self.app.post('/update', data={ 'name': 'group', 'value': 'test_binux', 'pk': 'test_project' }, headers={ 'Authorization': 'Basic YmludXg6NDMyMQ==' }) self.assertEqual(rv.status_code, 200) def test_h030_need_auth(self): ctx = run.webui.make_context('webui', [ '--scheduler-rpc', 'http://localhost:23333/', '--username', 'binux', '--password', '4321', '--need-auth', ], self.ctx) app = run.webui.invoke(ctx) self.__class__.app = app.test_client() self.__class__.rpc = app.config['scheduler_rpc'] def test_h040_auth_fail(self): rv = self.app.get('/') self.assertEqual(rv.status_code, 401) def test_h050_auth_fail2(self): rv = self.app.get('/', headers={ 'Authorization': 'Basic Ymlasdfsd' }) self.assertEqual(rv.status_code, 401) def test_h060_auth_fail3(self): rv = self.app.get('/', headers={ 'Authorization': 'Basic YmludXg6MQ==' }) self.assertEqual(rv.status_code, 401) def test_h070_auth_ok(self): rv = self.app.get('/', headers={ 'Authorization': 'Basic YmludXg6NDMyMQ==' }) self.assertEqual(rv.status_code, 200) def test_x0_disconnected_scheduler(self): ctx = run.webui.make_context('webui', [ '--scheduler-rpc', 'http://localhost:23458/' ], self.ctx) app = run.webui.invoke(ctx) self.__class__.app = app.test_client() self.__class__.rpc = app.config['scheduler_rpc'] def test_x10_project_update(self): rv = self.app.post('/update', data={ 'name': 'status', 'value': 'RUNNING', 'pk': 'test_project' }) 
self.assertEqual(rv.status_code, 200) self.assertNotIn(b'ok', rv.data) def test_x20_counter(self): rv = self.app.get('/counter?time=5m&type=sum') self.assertEqual(rv.status_code, 200) self.assertEqual(json.loads(utils.text(rv.data)), {}) def test_x30_run_not_exists_project(self): rv = self.app.post('/run', data={ 'project': 'not_exist_project', }) self.assertEqual(rv.status_code, 404) def test_x30_run(self): rv = self.app.post('/run', data={ 'project': 'test_project', }) self.assertEqual(rv.status_code, 200) self.assertEqual(json.loads(utils.text(rv.data))['result'], False) def test_x40_debug_save(self): rv = self.app.post('/debug/test_project/save', data={ 'script': self.script_content, }) self.assertEqual(rv.status_code, 200) self.assertNotIn(b'ok', rv.data) def test_x50_tasks(self): rv = self.app.get('/tasks') self.assertEqual(rv.status_code, 502) def test_x60_robots(self): rv = self.app.get('/robots.txt') self.assertEqual(rv.status_code, 200) self.assertIn(b'ser-agent', rv.data) def test_x70_bench(self): rv = self.app.get('/bench?total=10&show=5') self.assertEqual(rv.status_code, 200) ================================================ FILE: tests/test_xmlrpc.py ================================================ # Copyright (c) 2006-2007 Open Source Applications Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # Origin: https://code.google.com/p/wsgi-xmlrpc/ import unittest import tornado.wsgi import tornado.ioloop import tornado.httpserver from pyspider.libs import utils class TestXMLRPCServer(unittest.TestCase): @classmethod def setUpClass(self): from pyspider.libs import wsgi_xmlrpc def test_1(): return 'test_1' class Test2(object): def test_3(self, obj): return obj test = Test2() application = wsgi_xmlrpc.WSGIXMLRPCApplication() application.register_instance(Test2()) application.register_function(test_1) container = tornado.wsgi.WSGIContainer(application) self.io_loop = tornado.ioloop.IOLoop.current() http_server = tornado.httpserver.HTTPServer(container, io_loop=self.io_loop) http_server.listen(3423) self.thread = utils.run_in_thread(self.io_loop.start) @classmethod def tearDownClass(self): self.io_loop.add_callback(self.io_loop.stop) self.thread.join() def test_xmlrpc_server(self, uri='http://127.0.0.1:3423'): from six.moves.xmlrpc_client import ServerProxy client = ServerProxy(uri) assert client.test_1() == 'test_1' assert client.test_3({'asdf':4}) == {'asdf':4} ================================================ FILE: tools/migrate.py ================================================ #!/usr/bin/env python # -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-09-30 23:22:46 import click import logging from pyspider.database.base.projectdb import ProjectDB from pyspider.database.base.taskdb import TaskDB from pyspider.database.base.resultdb import ResultDB from pyspider.database import connect_database from pyspider.libs.utils import unicode_obj from multiprocessing.pool import ThreadPool as Pool logging.getLogger().setLevel(logging.INFO) def taskdb_migrating(project, from_connection, to_connection): logging.info("taskdb: %s", project) f = connect_database(from_connection) t = connect_database(to_connection) t.drop(project) for status in range(1, 5): for task in f.load_tasks(status, 
project=project): t.insert(project, task['taskid'], task) def resultdb_migrating(project, from_connection, to_connection): logging.info("resultdb: %s", project) f = connect_database(from_connection) t = connect_database(to_connection) t.drop(project) for result in f.select(project): t.save(project, result['taskid'], result['url'], result['result']) @click.command() @click.option('--pool', default=10, help='cocurrent worker size.') @click.argument('from_connection', required=1) @click.argument('to_connection', required=1) def migrate(pool, from_connection, to_connection): """ Migrate tool for pyspider """ f = connect_database(from_connection) t = connect_database(to_connection) if isinstance(f, ProjectDB): for each in f.get_all(): each = unicode_obj(each) logging.info("projectdb: %s", each['name']) t.drop(each['name']) t.insert(each['name'], each) elif isinstance(f, TaskDB): pool = Pool(pool) pool.map( lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t), f.projects) elif isinstance(f, ResultDB): pool = Pool(pool) pool.map( lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t), f.projects) if __name__ == '__main__': migrate() ================================================ FILE: tox.ini ================================================ [tox] envlist = py35,py36,py37,py38 [testenv] install_command = pip install --allow-all-external 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1' {opts} -e .[all,test] {packages} commands = python setup.py test []