Repository: binux/pyspider
Branch: master
Commit: 897891cafb21
Files: 165
Total size: 775.9 KB

Directory structure:
gitextract_jmd7ykkk/

├── .coveragerc
├── .github/
│   └── ISSUE_TEMPLATE.md
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── config_example.json
├── docker-compose.yaml
├── docs/
│   ├── About-Projects.md
│   ├── About-Tasks.md
│   ├── Architecture.md
│   ├── Command-Line.md
│   ├── Deployment-demo.pyspider.org.md
│   ├── Deployment.md
│   ├── Frequently-Asked-Questions.md
│   ├── Quickstart.md
│   ├── Running-pyspider-with-Docker.md
│   ├── Script-Environment.md
│   ├── Working-with-Results.md
│   ├── apis/
│   │   ├── @catch_status_code_error.md
│   │   ├── @every.md
│   │   ├── Response.md
│   │   ├── index.md
│   │   ├── self.crawl.md
│   │   └── self.send_message.md
│   ├── conf.py
│   ├── index.md
│   └── tutorial/
│       ├── AJAX-and-more-HTTP.md
│       ├── HTML-and-CSS-Selector.md
│       ├── Render-with-PhantomJS.md
│       └── index.md
├── mkdocs.yml
├── pyspider/
│   ├── __init__.py
│   ├── database/
│   │   ├── __init__.py
│   │   ├── base/
│   │   │   ├── __init__.py
│   │   │   ├── projectdb.py
│   │   │   ├── resultdb.py
│   │   │   └── taskdb.py
│   │   ├── basedb.py
│   │   ├── couchdb/
│   │   │   ├── __init__.py
│   │   │   ├── couchdbbase.py
│   │   │   ├── projectdb.py
│   │   │   ├── resultdb.py
│   │   │   └── taskdb.py
│   │   ├── elasticsearch/
│   │   │   ├── __init__.py
│   │   │   ├── projectdb.py
│   │   │   ├── resultdb.py
│   │   │   └── taskdb.py
│   │   ├── local/
│   │   │   ├── __init__.py
│   │   │   └── projectdb.py
│   │   ├── mongodb/
│   │   │   ├── __init__.py
│   │   │   ├── mongodbbase.py
│   │   │   ├── projectdb.py
│   │   │   ├── resultdb.py
│   │   │   └── taskdb.py
│   │   ├── mysql/
│   │   │   ├── __init__.py
│   │   │   ├── mysqlbase.py
│   │   │   ├── projectdb.py
│   │   │   ├── resultdb.py
│   │   │   └── taskdb.py
│   │   ├── redis/
│   │   │   ├── __init__.py
│   │   │   └── taskdb.py
│   │   ├── sqlalchemy/
│   │   │   ├── __init__.py
│   │   │   ├── projectdb.py
│   │   │   ├── resultdb.py
│   │   │   ├── sqlalchemybase.py
│   │   │   └── taskdb.py
│   │   └── sqlite/
│   │       ├── __init__.py
│   │       ├── projectdb.py
│   │       ├── resultdb.py
│   │       ├── sqlitebase.py
│   │       └── taskdb.py
│   ├── fetcher/
│   │   ├── __init__.py
│   │   ├── cookie_utils.py
│   │   ├── phantomjs_fetcher.js
│   │   ├── puppeteer_fetcher.js
│   │   ├── splash_fetcher.lua
│   │   └── tornado_fetcher.py
│   ├── libs/
│   │   ├── ListIO.py
│   │   ├── __init__.py
│   │   ├── base_handler.py
│   │   ├── bench.py
│   │   ├── counter.py
│   │   ├── dataurl.py
│   │   ├── log.py
│   │   ├── multiprocessing_queue.py
│   │   ├── pprint.py
│   │   ├── response.py
│   │   ├── result_dump.py
│   │   ├── sample_handler.py
│   │   ├── url.py
│   │   ├── utils.py
│   │   └── wsgi_xmlrpc.py
│   ├── logging.conf
│   ├── message_queue/
│   │   ├── __init__.py
│   │   ├── kombu_queue.py
│   │   ├── rabbitmq.py
│   │   └── redis_queue.py
│   ├── processor/
│   │   ├── __init__.py
│   │   ├── processor.py
│   │   └── project_module.py
│   ├── result/
│   │   ├── __init__.py
│   │   └── result_worker.py
│   ├── run.py
│   ├── scheduler/
│   │   ├── __init__.py
│   │   ├── scheduler.py
│   │   ├── task_queue.py
│   │   └── token_bucket.py
│   └── webui/
│       ├── __init__.py
│       ├── app.py
│       ├── bench_test.py
│       ├── debug.py
│       ├── index.py
│       ├── login.py
│       ├── result.py
│       ├── static/
│       │   ├── .babelrc
│       │   ├── package.json
│       │   ├── src/
│       │   │   ├── css_selector_helper.js
│       │   │   ├── debug.js
│       │   │   ├── debug.less
│       │   │   ├── index.js
│       │   │   ├── index.less
│       │   │   ├── result.less
│       │   │   ├── splitter.js
│       │   │   ├── task.less
│       │   │   ├── tasks.less
│       │   │   └── variable.less
│       │   └── webpack.config.js
│       ├── task.py
│       ├── templates/
│       │   ├── debug.html
│       │   ├── index.html
│       │   ├── result.html
│       │   ├── task.html
│       │   └── tasks.html
│       └── webdav.py
├── requirements.txt
├── run.py
├── setup.py
├── tests/
│   ├── __init__.py
│   ├── data_fetcher_processor_handler.py
│   ├── data_handler.py
│   ├── data_sample_handler.py
│   ├── data_test_webpage.py
│   ├── test_base_handler.py
│   ├── test_bench.py
│   ├── test_counter.py
│   ├── test_database.py
│   ├── test_fetcher.py
│   ├── test_fetcher_processor.py
│   ├── test_message_queue.py
│   ├── test_processor.py
│   ├── test_response.py
│   ├── test_result_dump.py
│   ├── test_result_worker.py
│   ├── test_run.py
│   ├── test_scheduler.py
│   ├── test_task_queue.py
│   ├── test_utils.py
│   ├── test_webdav.py
│   ├── test_webui.py
│   └── test_xmlrpc.py
├── tools/
│   └── migrate.py
└── tox.ini

================================================
FILE CONTENTS
================================================

================================================
FILE: .coveragerc
================================================
[run]
source =
    pyspider
parallel = True

[report]
omit =
    pyspider/libs/sample_handler.py
    pyspider/libs/pprint.py

exclude_lines =
    pragma: no cover
    def __repr__
    if self.debug:
    if settings.DEBUG
    raise AssertionError
    raise NotImplementedError
    if 0:
    if __name__ == .__main__.:
    except ImportError:
    pass


================================================
FILE: .github/ISSUE_TEMPLATE.md
================================================
<!--
Thanks for using pyspider!

如果你需要使用中文提问，请将问题提交到 https://segmentfault.com/t/pyspider
-->

* pyspider version:
* Operating system:
* Start up command:

### Expected behavior

<!-- What do you think should happen? -->

### Actual behavior

<!-- What actually happens? -->

### How to reproduce

<!-- 

The best chance of getting help is providing enough information that can be reproduce the issue you have.

If it's related to API or extraction behavior, please paste the script of your project.
If it's related to scheduling of whole project, please paste the screenshot of queue status on the top in dashboard.

-->


================================================
FILE: .gitignore
================================================
*.py[cod]
data/*
.venv
.idea
# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
.idea


================================================
FILE: .travis.yml
================================================
language: python
cache: pip
python:
  - 3.5
  - 3.6
  - 3.7
  #- 3.8
services:
    - docker
    - mongodb
    - rabbitmq
    - redis
    - mysql
    # - elasticsearch
    - postgresql
addons:
  postgresql: "9.4"
  apt:
    packages:
    - rabbitmq-server
env:
    - IGNORE_COUCHDB=1

before_install:
    - sudo apt-get update -qq
    - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
    - npm install express puppeteer
    - sudo docker pull scrapinghub/splash
    - sudo docker run -d --net=host scrapinghub/splash
before_script:
    - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
    - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
    - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
    - sleep 10
install:
    - pip install https://github.com/marcus67/easywebdav/archive/master.zip
    - sudo apt-get install libgnutls28-dev
    - pip install -e .[all,test]
    - pip install coveralls
script:
    - coverage run setup.py test
after_success:
    - coverage combine
    - coveralls


================================================
FILE: Dockerfile
================================================
FROM python:3.6
MAINTAINER binux <roy@binux.me>

# install phantomjs
RUN mkdir -p /opt/phantomjs \
        && cd /opt/phantomjs \
        && wget -O phantomjs.tar.bz2 https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \
        && tar xavf phantomjs.tar.bz2 --strip-components 1 \
        && ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \
        && rm phantomjs.tar.bz2
# Fix Error: libssl_conf.so: cannot open shared object file: No such file or directory
ENV OPENSSL_CONF=/etc/ssl/

# install nodejs
ENV NODEJS_VERSION=8.15.0 \
    PATH=$PATH:/opt/node/bin
WORKDIR "/opt/node"
RUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \
    curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \
    rm -rf /var/lib/apt/lists/*
RUN npm install puppeteer express

# install requirements
COPY requirements.txt /opt/pyspider/requirements.txt
RUN pip install -r /opt/pyspider/requirements.txt

# add all repo
ADD ./ /opt/pyspider

# run test
WORKDIR /opt/pyspider
RUN pip install -e .[all]

# Create a symbolic link to node_modules
RUN ln -s /opt/node/node_modules ./node_modules

#VOLUME ["/opt/pyspider"]
ENTRYPOINT ["pyspider"]

EXPOSE 5000 23333 24444 25555 22222


================================================
FILE: LICENSE
================================================
Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2014 Binux

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MANIFEST.in
================================================
include README.md
include requirements.txt
include Dockerfile
include LICENSE
include pyspider/logging.conf
include pyspider/webui/static/*
include pyspider/webui/templates/*


================================================
FILE: README.md
================================================
pyspider [![Build Status]][Travis CI] [![Coverage Status]][Coverage]
========

A Powerful Spider(Web Crawler) System in Python.

- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...

Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/)  
Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/)  
Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases)  

Sample Code 
-----------

```python
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://scrapy.org/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
```


Installation
------------

* `pip install pyspider`
* run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/)

**WARNING:** WebUI is open to the public by default, it can be used to execute any command which may harm your system. Please use it in an internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config).

Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/)

Contribute
----------

* Use It
* Open [Issue], send PR
* [User Group]
* [中文问答](http://segmentfault.com/t/pyspider)


TODO
----

### v0.4.0

- [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia)


License
-------
Licensed under the Apache License, Version 2.0


[Build Status]:         https://img.shields.io/travis/binux/pyspider/master.svg?style=flat
[Travis CI]:            https://travis-ci.org/binux/pyspider
[Coverage Status]:      https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat
[Coverage]:             https://coveralls.io/r/binux/pyspider
[Try]:                  https://img.shields.io/badge/try-pyspider-blue.svg?style=flat
[Issue]:                https://github.com/binux/pyspider/issues
[User Group]:           https://groups.google.com/group/pyspider-users


================================================
FILE: config_example.json
================================================
{
  "taskdb": "couchdb+taskdb://user:password@couchdb:5984",
  "projectdb": "couchdb+projectdb://user:password@couchdb:5984",
  "resultdb": "couchdb+resultdb://user:password@couchdb:5984",
  "message_queue": "amqp://rabbitmq:5672/%2F",
  "webui": {
    "username": "username",
    "password": "password",
    "need-auth": true,
    "scheduler-rpc": "http://scheduler:23333",
    "fetcher-rpc": "http://fetcher:24444"
  }
}


================================================
FILE: docker-compose.yaml
================================================
version: "3.7"

# replace /path/to/dir/ to point to config.json

# The RabbitMQ and CouchDB services can take some time to startup.
# During this time most of the pyspider services will exit and restart.
# Once RabbitMQ and CouchDB are fully up and running everything should run as normal.

services:
  rabbitmq:
    image: rabbitmq:alpine
    container_name: rabbitmq
    networks:
      - pyspider
    command: rabbitmq-server
  mysql:
    image: mysql:latest
    container_name: mysql
    volumes:
      - /tmp:/var/lib/mysql
    environment:
      - MYSQL_ALLOW_EMPTY_PASSWORD=yes
    networks:
      - pyspider
  phantomjs:
    image: pyspider:latest
    container_name: phantomjs
    networks:
      - pyspider
    volumes:
      - ./config_example.json:/opt/pyspider/config.json
    command: -c config.json phantomjs
    depends_on:
      - couchdb
      - rabbitmq
    restart: unless-stopped
  result:
    image: pyspider:latest
    container_name: result
    networks:
      - pyspider
    volumes:
      - ./config_example.json:/opt/pyspider/config.json
    command: -c config.json result_worker
    depends_on:
      - couchdb
      - rabbitmq
    restart: unless-stopped # Sometimes we'll get a connection refused error because couchdb has yet to fully start
  processor:
    container_name: processor
    image: pyspider:latest
    networks:
      - pyspider
    volumes:
      - ./config_example.json:/opt/pyspider/config.json
    command: -c config.json processor
    depends_on:
      - couchdb
      - rabbitmq
    restart: unless-stopped
  fetcher:
    image: pyspider:latest
    container_name: fetcher
    networks:
      - pyspider
    volumes:
      - ./config_example.json:/opt/pyspider/config.json
    command : -c config.json fetcher
    depends_on:
      - couchdb
      - rabbitmq
    restart: unless-stopped
  scheduler:
    image: pyspider:latest
    container_name: scheduler
    networks:
      - pyspider
    volumes:
      - ./config_example.json:/opt/pyspider/config.json
    command: -c config.json scheduler
    depends_on:
      - couchdb
      - rabbitmq
    restart: unless-stopped
  webui:
    image: pyspider:latest
    container_name: webui
    ports:
      - "5050:5000"
    networks:
      - pyspider
    volumes:
      - ./config_example.json:/opt/pyspider/config.json
    command: -c config.json webui
    depends_on:
      - couchdb
      - rabbitmq
    restart: unless-stopped

networks:
  pyspider:
    external:
      name: pyspider
  default:
    driver: bridge


================================================
FILE: docs/About-Projects.md
================================================
About Projects
==============

In most cases, a project is one script you write for one website.

* Projects are independent, but you can import another project as a module with `from projects import other_project`
* A project has 5 status: `TODO`, `STOP`, `CHECKING`, `DEBUG` and `RUNNING`
    - `TODO` - a script is just created to be written
    - `STOP` - you can mark a project as `STOP` if you want it to STOP (= =).
    - `CHECKING` - when a running project is modified, to prevent incomplete modification, project status will be set as `CHECKING` automatically.
    - `DEBUG`/`RUNNING` - these two status have no difference to spider. But it's good to mark it as `DEBUG` when it's running the first time then change it to `RUNNING` after being checked.
* The crawl rate is controlled by `rate` and `burst` with [token-bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm.
    - `rate` - how many requests in one second
    - `burst` - consider this situation, `rate/burst = 0.1/3`, it means that the spider scrawls 1 page every 10 seconds. All tasks are finished, project is checking last updated items every minute. Assume that 3 new items are found, pyspider will "burst" and crawl 3 tasks without waiting 3*10 seconds. However, the fourth task needs wait 10 seconds.
* To delete a project, set `group` to `delete` and status to `STOP`, wait 24 hours.


`on_finished` callback
--------------------
You can override `on_finished` method in the project, the method would be triggered when the task_queue goes to 0.

Example 1: When you start a project to crawl a website with 100 pages, the `on_finished` callback will be fired when 100 pages are successfully crawled or failed after retries.

Example 2: A project with `auto_recrawl` tasks will **NEVER** trigger the `on_finished` callback, because time queue will never become 0 when there are auto_recrawl tasks in it.

Example 3: A project with `@every` decorated method will trigger the `on_finished` callback every time when the newly submitted tasks are finished.


================================================
FILE: docs/About-Tasks.md
================================================
About Tasks
===========

Tasks are the basic unit to be scheduled.

Basis
-----

* A task is differentiated by its `taskid`. (Default: `md5(url)`, can be changed by overriding the `def get_taskid(self, task)` method)
* Tasks are isolated between different projects.
* A Task has 4 status:
    - active
    - failed
    - success
    - bad - not used
* Only tasks in active status will be scheduled.
* Tasks are served in order of `priority`.

Schedule
--------

#### new task

When a new task (never seen before) comes in:

* If `exetime` is set but not arrived, it will be put into a time-based queue to wait.
* Otherwise it will be accepted.

When the task is already in the queue:

* Ignored unless `force_update`

When a completed task comes out:

* If `age` is set, `last_crawl_time + age < now` it will be accepted. Otherwise discarded.
* If `itag` is set and not equal to it's previous value, it will be accepted. Otherwise discarded.


#### task retry

When a fetch error or script error happens, the task will retry 3 times by default.

The first retry will execute every time after 30 seconds, 1 hour, 6 hours, 12 hours and any more retries will postpone 24 hours.

If `age` is specified, the retry delay will not larger then `age`.

You can config the retry delay by adding a variable named `retry_delay` to handler. `retry_delay` is a dict to specify retry intervals. The items in the dict are {retried: seconds}, and a special key: '' (empty string) is used to specify the default retry delay if not specified.

e.g. the default `retry_delay` declares like:


```
class MyHandler(BaseHandler):
    retry_delay = {
        0: 30,
        1: 1*60*60,
        2: 6*60*60,
        3: 12*60*60,
        '': 24*60*60
    }
```


================================================
FILE: docs/Architecture.md
================================================
Architecture
============

This document describes the reason why I made pyspider and the architecture.

Why
---
Two years ago, I was working on a vertical search engine. We are facing following needs on crawling:

1. collect 100-200 websites, they may on/offline or change their templates at any time
> We need a really powerful monitor to find out which website is changing. And a good tool to help us write script/template for each website.

2. data should be collected in 5min when website updated
> We solve this problem by check index page frequently, and use something like 'last update time' or 'last reply time' to determine which page is changed. In addition to this, we recheck pages after X days in case to prevent the omission.  
> **pyspider will never stop as WWW is changing all the time**

Furthermore, we have some APIs from our cooperators, the API may need POST, proxy, request signature etc. Full control from script is more convenient than some global parameters of components.

Overview
--------
The following diagram shows an overview of the pyspider architecture with its components and an outline of the data flow that takes place inside the system.

![pyspider](imgs/pyspider-arch.png)

Components are connected by message queue. Every component, including message queue, is running in their own process/thread, and replaceable. That means, when process is slow, you can have many instances of processor and make full use of multiple CPUs, or deploy to multiple machines. This architecture makes pyspider really fast. [benchmarking](https://gist.github.com/binux/67b276c51e988f8e2c31#comment-1339242).

Components
----------

### Scheduler
The Scheduler receives tasks from newtask_queue from processor. Decide whether the task is new or requires re-crawl. Sort tasks according to priority and feeding them to fetcher with traffic control ([token bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm). Take care of periodic tasks, lost tasks and failed tasks and retry later.

All of above can be set via `self.crawl` [API](apis/). 

Note that in current implement of scheduler, only one scheduler is allowed.

### Fetcher
The Fetcher is responsible for fetching web pages then send results to processor. For flexible, fetcher support [Data URI](http://en.wikipedia.org/wiki/Data_URI_scheme) and pages that rendered by JavaScript (via [phantomjs](http://phantomjs.org/)). Fetch method, headers, cookies, proxy, etag etc can be controlled by script via [API](apis/self.crawl/#fetch).

### Phantomjs Fetcher
Phantomjs Fetcher works like a proxy. It's connected to general Fetcher, fetch and render pages with JavaScript enabled, output a general HTML back to Fetcher:

```
scheduler -> fetcher -> processor
                |
            phantomjs
                |
             internet
```

### Processor
The Processor is responsible for running the script written by users to parse and extract information. Your script is running in an unlimited environment. Although we have various tools(like [PyQuery](https://pythonhosted.org/pyquery/)) for you to extract information and links, you can use anything you want to deal with the response. You may refer to [Script Environment](Script-Environment) and [API Reference](apis/) to get more information about script.

Processor will capture the exceptions and logs, send status(task track) and new tasks to `scheduler`, send results to `Result Worker`.

### Result Worker (optional)
Result worker receives results from `Processor`. Pyspider has a built-in result worker to save result to `resultdb`. Overwrite it to deal with result by your needs.

### WebUI
WebUI is a web frontend for everything. It contains:

* script editor, debugger
* project manager
* task monitor
* result viewer, exporter

Maybe webui is the most attractive part of pyspider. With this powerful UI, you can debug your scripts step by step just as pyspider do. Starting or stop a project. Finding which project is going wrong and what request is failed and try it again with debugger.

Data flow
---------
The data flow in pyspider is just as your seen in diagram above:

1. Each script has a callback named `on_start`, when you press the `Run` button on WebUI. A new task of `on_start` is submitted to Scheduler as the entries of project.
2. Scheduler dispatches this `on_start` task with a Data URI as a normal task to Fetcher.
3. Fetcher makes a request and a response to it (for Data URI, it's a fake request and response, but has no difference with other normal tasks), then feeds to Processor.
4. Processor calls the `on_start` method and generated some new URL to crawl. Processor send a message to Scheduler that this task is finished and new tasks via message queue to Scheduler (here is no results for `on_start` in most case. If has results, Processor send them to `result_queue`).
5. Scheduler receives the new tasks, looking up in the database, determine whether the task is new or requires re-crawl, if so, put them into task queue. Dispatch tasks in order.
6. The process repeats (from step 3) and wouldn't stop till WWW is dead ;-). Scheduler will check periodic tasks to crawl latest data.


================================================
FILE: docs/Command-Line.md
================================================
Command Line
============

Global Config
-------------

You can get command help via `pyspider --help` and `pyspider all --help` for subcommand help.

global options work for all subcommands.

```
Usage: pyspider [OPTIONS] COMMAND [ARGS]...

  A powerful spider system in python.

Options:
  -c, --config FILENAME    a json file with default values for subcommands.
                           {“webui”: {“port”:5001}}
  --logging-config TEXT    logging config file for built-in python logging
                           module  [default: pyspider/pyspider/logging.conf]
  --debug                  debug mode
  --queue-maxsize INTEGER  maxsize of queue
  --taskdb TEXT            database url for taskdb, default: sqlite
  --projectdb TEXT         database url for projectdb, default: sqlite
  --resultdb TEXT          database url for resultdb, default: sqlite
  --message-queue TEXT     connection url to message queue, default: builtin
                           multiprocessing.Queue
  --amqp-url TEXT          [deprecated] amqp url for rabbitmq. please use
                           --message-queue instead.
  --beanstalk TEXT         [deprecated] beanstalk config for beanstalk queue.
                           please use --message-queue instead.
  --phantomjs-proxy TEXT   phantomjs proxy ip:port
  --data-path TEXT         data dir path
  --version                Show the version and exit.
  --help                   Show this message and exit.
```

#### --config

Config file is a JSON file with config values for global options or subcommands (a sub-dict named after subcommand). [example](/Deployment/#configjson)

``` json
{
  "taskdb": "mysql+taskdb://username:password@host:port/taskdb",
  "projectdb": "mysql+projectdb://username:password@host:port/projectdb",
  "resultdb": "mysql+resultdb://username:password@host:port/resultdb",
  "message_queue": "amqp://username:password@host:port/%2F",
  "webui": {
    "username": "some_name",
    "password": "some_passwd",
    "need-auth": true
  }
}
```

#### --queue-maxsize

Queue size limit, 0 for not limit

#### --taskdb, --projectdb, --resultdb

```
mysql:
    mysql+type://user:passwd@host:port/database
sqlite:
    # relative path
    sqlite+type:///path/to/database.db
    # absolute path
    sqlite+type:////path/to/database.db
    # memory database
    sqlite+type://
mongodb:
    mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
    more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
    couchdb+type://[username:password@]host[:port]
sqlalchemy:
    sqlalchemy+postgresql+type://user:passwd@host:port/database
    sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
    more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html
local:
    local+projectdb://filepath,filepath
    
type:
    should be one of `taskdb`, `projectdb`, `resultdb`.
```


#### --message-queue

```
rabbitmq:
    amqp://username:password@host:5672/%2F
    see https://www.rabbitmq.com/uri-spec.html
redis:
    redis://host:6379/db
    redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
kombu:
    kombu+transport://userid:password@hostname:port/virtual_host
    see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
builtin:
    None
```

#### --phantomjs-proxy

The phantomjs proxy address, you need a phantomjs installed and running phantomjs proxy with command: [`pyspider phantomjs`](#phantomjs).

#### --data-path

SQLite database and counter dump files saved path


all
---

```
Usage: pyspider all [OPTIONS]

  Run all the components in subprocess or thread

Options:
  --fetcher-num INTEGER         instance num of fetcher
  --processor-num INTEGER       instance num of processor
  --result-worker-num INTEGER   instance num of result worker
  --run-in [subprocess|thread]  run each components in thread or subprocess.
                                always using thread for windows.
  --help                        Show this message and exit.
```


one
---

```
Usage: pyspider one [OPTIONS] [SCRIPTS]...

  One mode not only means all-in-one, it runs every thing in one process
  over tornado.ioloop, for debug purpose

Options:
  -i, --interactive  enable interactive mode, you can choose crawl url.
  --phantomjs        enable phantomjs, will spawn a subprocess for phantomjs
  --help             Show this message and exit.
```

**NOTE: WebUI is not running in one mode.**

In `one` mode, results will be written to stdout by default. You can capture them via `pyspider one > result.txt`.

#### [SCRIPTS]

The script file path of projects. Project status is RUNNING, `rate` and `burst` can be set via script comments:

```
# rate: 1.0
# burst: 3
```

When SCRIPTS is set, `taskdb` and `resultdb` will use a in-memory sqlite db by default (can be overridden by global config `--taskdb`, `--resultdb`). on_start callback will be triggered on start.

#### -i, --interactive

With interactive mode, pyspider will start an interactive console asking what to do in next loop of process. In the console, you can use:

``` python
crawl(url, project=None, **kwargs)
    Crawl given url, same parameters as BaseHandler.crawl

    url - url or taskid, parameters will be used if in taskdb
    project - can be omitted if only one project exists.
    
quit_interactive()
    Quit interactive mode
    
quit_pyspider()
    Close pyspider
```

You can use `pyspider.libs.utils.python_console()` to open an interactive console in your script.

bench
-----

```
Usage: pyspider bench [OPTIONS]

  Run Benchmark test. In bench mode, in-memory sqlite database is used
  instead of on-disk sqlite database.

Options:
  --fetcher-num INTEGER         instance num of fetcher
  --processor-num INTEGER       instance num of processor
  --result-worker-num INTEGER   instance num of result worker
  --run-in [subprocess|thread]  run each components in thread or subprocess.
                                always using thread for windows.
  --total INTEGER               total url in test page
  --show INTEGER                show how many urls in a page
  --help                        Show this message and exit.
```


scheduler
---------

```
Usage: pyspider scheduler [OPTIONS]

  Run Scheduler, only one scheduler is allowed.

Options:
  --xmlrpc / --no-xmlrpc
  --xmlrpc-host TEXT
  --xmlrpc-port INTEGER
  --inqueue-limit INTEGER  size limit of task queue for each project, tasks
                           will been ignored when overflow
  --delete-time INTEGER    delete time before marked as delete
  --active-tasks INTEGER   active log size
  --loop-limit INTEGER     maximum number of tasks due with in a loop
  --scheduler-cls TEXT     scheduler class to be used.
  --help                   Show this message and exit.
```

#### --scheduler-cls

set this option to use customized Scheduler class

phantomjs
---------

```
Usage: run.py phantomjs [OPTIONS] [ARGS]...

  Run phantomjs fetcher if phantomjs is installed.

Options:
  --phantomjs-path TEXT  phantomjs path
  --port INTEGER         phantomjs port
  --auto-restart TEXT    auto restart phantomjs if crashed
  --help                 Show this message and exit.
```

#### ARGS

Addition args pass to phantomjs command line.

fetcher
-------

```
Usage: pyspider fetcher [OPTIONS]

  Run Fetcher.

Options:
  --xmlrpc / --no-xmlrpc
  --xmlrpc-host TEXT
  --xmlrpc-port INTEGER
  --poolsize INTEGER      max simultaneous fetches
  --proxy TEXT            proxy host:port
  --user-agent TEXT       user agent
  --timeout TEXT          default fetch timeout
  --fetcher-cls TEXT      Fetcher class to be used.
  --help                  Show this message and exit.
```

#### --proxy

Default proxy used by fetcher, can been override by `self.crawl` option. [DOC](apis/self.crawl/#fetch)


processor
---------

```
Usage: pyspider processor [OPTIONS]

  Run Processor.

Options:
  --processor-cls TEXT  Processor class to be used.
  --help                Show this message and exit.
```

result_worker
-------------

```
Usage: pyspider result_worker [OPTIONS]

  Run result worker.

Options:
  --result-cls TEXT  ResultWorker class to be used.
  --help             Show this message and exit.
```


webui
-----

```
Usage: pyspider webui [OPTIONS]

  Run WebUI

Options:
  --host TEXT            webui bind to host
  --port INTEGER         webui bind to host
  --cdn TEXT             js/css cdn server
  --scheduler-rpc TEXT   xmlrpc path of scheduler
  --fetcher-rpc TEXT     xmlrpc path of fetcher
  --max-rate FLOAT       max rate for each project
  --max-burst FLOAT      max burst for each project
  --username TEXT        username of lock -ed projects
  --password TEXT        password of lock -ed projects
  --need-auth            need username and password
  --webui-instance TEXT  webui Flask Application instance to be used.
  --help                 Show this message and exit.
```

#### --cdn

JS/CSS libs CDN service, URL must compatible with [cdnjs](https://cdnjs.com/)

#### --fetcher-rpc

XML-RPC path URI for fetcher XMLRPC server. If not set, use a Fetcher instance.

#### --need-auth

If true, all pages require username and password specified via `--username` and `--password`.


================================================
FILE: docs/Deployment-demo.pyspider.org.md
================================================
Deployment of demo.pyspider.org
===============================

[demo.pyspider.org](http://demo.pyspider.org/) is running on three VPSs connected together with private network using [tinc](http://www.tinc-vpn.org/).

1vCore 4GB RAM | 1vCore 2GB RAM * 2
---------------|----------------
database<br>message queue<br>scheduler | phantomjs * 2<br>phantomjs-lb * 1<br>fetcher * 1<br>fetcher-lb * 1<br>processor * 2<br>result-worker * 1<br>webui * 4<br>webui-lb * 1<br>nginx * 1<br>

All components are running inside docker containers.

database / message queue / scheduler
------------------------------------

The database is postgresql and the message queue is redis.

Scheduler may have a lot of database operations, it's better to put it close to the database.

```bash
docker run --name postgres -v /data/postgres/:/var/lib/postgresql/data -d -p $LOCAL_IP:5432:5432 -e POSTGRES_PASSWORD="" postgres
docker run --name redis -d -p  $LOCAL_IP:6379:6379 redis
docker run --name scheduler -d -p $LOCAL_IP:23333:23333 --restart=always binux/pyspider \
 --taskdb "sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb" \
 --resultdb "sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb" \
 --projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" \
 --message-queue "redis://10.21.0.7:6379/1" \
 scheduler --inqueue-limit 5000 --delete-time 43200
```

other components
----------------

fetcher, processor, result_worker are running on two boxes with same configuration managed with [docker-compose](https://docs.docker.com/compose/).

```yaml
phantomjs:
  image: 'binux/pyspider:latest'
  command: phantomjs
  cpu_shares: 512
  environment:
    - 'EXCLUDE_PORTS=5000,23333,24444'
  expose:
    - '25555'
  mem_limit: 512m
  restart: always
phantomjs-lb:
  image: 'dockercloud/haproxy:latest'
  links:
    - phantomjs
  restart: always
  
fetcher:
  image: 'binux/pyspider:latest'
  command: '--message-queue "redis://10.21.0.7:6379/1" --phantomjs-proxy "phantomjs:80" fetcher --xmlrpc'
  cpu_shares: 512
  environment:
    - 'EXCLUDE_PORTS=5000,25555,23333'
  links:
    - 'phantomjs-lb:phantomjs'
  mem_limit: 128m
  restart: always
fetcher-lb:
  image: 'dockercloud/haproxy:latest'
  links:
    - fetcher
  restart: always
  
processor:
  image: 'binux/pyspider:latest'
  command: '--projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" --message-queue "redis://10.21.0.7:6379/1" processor'
  cpu_shares: 512
  mem_limit: 256m
  restart: always
  
result-worker:
  image: 'binux/pyspider:latest'
  command: '--taskdb "sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb"  --projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" --resultdb "sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb" --message-queue "redis://10.21.0.7:6379/1" result_worker'
  cpu_shares: 512
  mem_limit: 256m
  restart: always
  
webui:
  image: 'binux/pyspider:latest'
  command: '--taskdb "sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb"  --projectdb "sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb" --resultdb "sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb" --message-queue "redis://10.21.0.7:6379/1" webui --max-rate 0.2 --max-burst 3 --scheduler-rpc "http://o4.i.binux.me:23333/" --fetcher-rpc "http://fetcher/"'

  cpu_shares: 512
  environment:
    - 'EXCLUDE_PORTS=24444,25555,23333'
  links:
    - 'fetcher-lb:fetcher'
  mem_limit: 256m
  restart: always
webui-lb:
  image: 'dockercloud/haproxy:latest'
  links:
    - webui
  restart: always
  
nginx:
  image: 'nginx'
  links:
    - 'webui-lb:HAPROXY'
  ports:
    - '0.0.0.0:80:80'
  volumes:
    - /home/binux/nfs/profile/nginx/nginx.conf:/etc/nginx/nginx.conf
    - /home/binux/nfs/profile/nginx/conf.d/:/etc/nginx/conf.d/
  restart: always
```

With the config, you can change the scale by `docker-compose scale phantomjs=2 processor=2 webui=4` when you need. 

#### load balance

phantomjs-lb, fetcher-lb, webui-lb are automaticlly configed haproxy, allow any number of upstreams.

#### phantomjs

phantomjs have memory leak issue, memory limit applied, and it's recommended to restart it every hour.

#### fetcher

fetcher is implemented with aync IO, it supportes 100 concurrent connections. If the upstream queue are not choked, one fetcher should be enough.

#### processor

processor is CPU bound component, recommended number of instance is number of CPU cores + 1~2 or CPU cores * 10%~15% when you have more then 20 cores.

#### result-worker

If you didn't override result-worker, it only write results into database, and should be very fast.


================================================
FILE: docs/Deployment.md
================================================
Deployment
===========

Since pyspider has various components, you can just run `pyspider` to start a standalone and third service free instance. Or using MySQL or MongoDB and RabbitMQ to deploy a distributed crawl cluster.

To deploy pyspider in product environment, running component in each process and store data in database service is more reliable and flexible.

Installation
------------

To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.

And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue.

`pip install --allow-all-external pyspider[all]`

> Even if you had install pyspider using `pip` before. Install with `pyspider[all]` is necessary to install the requirements for MySQL/MongoDB/RabbitMQ.

if you are using Ubuntu, try:
```
apt-get install python python-dev python-distribute python-pip libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml
```
to install binary packages.

Deployment
----------

**This document is based on MySQL + RabbitMQ**

### config.json

Although you can use command-line to specify the parameters. A config file is a better choice.

```
{
  "taskdb": "mysql+taskdb://username:password@host:port/taskdb",
  "projectdb": "mysql+projectdb://username:password@host:port/projectdb",
  "resultdb": "mysql+resultdb://username:password@host:port/resultdb",
  "message_queue": "amqp://username:password@host:port/%2F",
  "webui": {
    "username": "some_name",
    "password": "some_passwd",
    "need-auth": true
  }
}
```

you can get complete options by running `pyspider --help` and `pyspider webui --help` for subcommands. `"webui"` in JSON  is configs for subcommands. You can add parameters for other components similar to this one.

#### Database Connection URI
`"taskdb"`, `"projectdb”`, `"resultdb"` is using database connection URI with format below:

```
mysql:
    mysql+type://user:passwd@host:port/database
sqlite:
    # relative path
    sqlite+type:///path/to/database.db
    # absolute path
    sqlite+type:////path/to/database.db
    # memory database
    sqlite+type://
mongodb:
    mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
    more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
    couchdb+type://[username:password@]host[:port][?options]]
sqlalchemy:
    sqlalchemy+postgresql+type://user:passwd@host:port/database
    sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
    more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html
local:
    local+projectdb://filepath,filepath
    
type:
    should be one of `taskdb`, `projectdb`, `resultdb`.
```

#### Message Queue URL
You can use connection URL to specify the message queue:

```
rabbitmq:
    amqp://username:password@host:5672/%2F
    Refer: https://www.rabbitmq.com/uri-spec.html
redis:
    redis://host:6379/db
    redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
builtin:
    None
```

> Hint for postgresql: you need to create database with encoding utf8 by your own. pyspider will not create database for you.

running
-------

You should run components alone with subcommands. You may add `&` after command to make it running in background and use [screen](http://linux.die.net/man/1/screen) or [nohup](http://linux.die.net/man/1/nohup) to prevent exit after your ssh session ends. **It's recommended to manage components with [Supervisor](http://supervisord.org/).**

```
# start **only one** scheduler instance
pyspider -c config.json scheduler

# phantomjs
pyspider -c config.json phantomjs

# start fetcher / processor / result_worker instances as many as your needs
pyspider -c config.json --phantomjs-proxy="localhost:25555" fetcher
pyspider -c config.json processor
pyspider -c config.json result_worker

# start webui, set `--scheduler-rpc` if scheduler is not running on the same host as webui
pyspider -c config.json webui
```

Running with Docker
-------------------
[Running pyspider with Docker](Running-pyspider-with-Docker)


Deployment of demo.pyspider.org
-------------------------------
[Deployment of demo.pyspider.org](Deployment-demo.pyspider.org)


================================================
FILE: docs/Frequently-Asked-Questions.md
================================================
Frequently Asked Questions
==========================

Does pyspider Work with Windows?
--------------------------------
Yes, it should, some users have made it work on Windows. But as I don't have windows development environment, I cannot test. Only some tips for users who want to use pyspider on Windows:

- Some package needs binary libs (e.g. pycurl, lxml), that maybe you cannot install it from pip, Windowns binaries packages could be found in [http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/).
- Make a clean environment with [virtualenv](https://virtualenv.readthedocs.org/en/latest/)
- Try 32bit version of Python, especially your are facing crash issue.
- Avoid using Python 3.4.1 ([#194](https://github.com/binux/pyspider/issues/194), [#217](https://github.com/binux/pyspider/issues/217))

Unreadable Code (乱码) Returned from Phantomjs
---------------------------------------------

Phantomjs doesn't support gzip, don't set `Accept-Encoding` header with `gzip`.


How to Delete a Project?
------------------------

set `group` to `delete` and `status` to `STOP` then wait 24 hours. You can change the time before a project deleted via `scheduler.DELETE_TIME`.

How to Restart a Project?
-------------------------
#### Why
It happens after you modified a script, and wants to crawl everything again with new strategy. But as the [age](/apis/self.crawl/#age) of urls are not expired. Scheduler will discard all of the new requests.

#### Solution
1. Create a new project.
2. Using a [itag](/apis/self.crawl/#itag) within `Handler.crawl_config` to specify the version of your script.

How to Use WebDAV Mode?
-----------------------
Mount `http://hostname/dav/` to your filesystem, edit or create scripts with your favourite editor.

> OSX: `mount_webdav http://hostname/dav/ /Volumes/dav`  
> Linux: Install davfs2, `mount.davfs http://hostname/dav/ /mnt/dav`  
> VIM: `vim http://hostname/dav/script_name.py`

When you are editing script without WebUI, you need to change it to `WebDAV Mode` while debugging. After you saved script in editor, WebUI can load and use latest script to debug your code.

What does the progress bar mean on the dashboard?
-------------------------------------------------
When mouse move onto the progress bar, you can see the explaintions.

For 5m, 1h, 1d the number are the events triggered in 5m, 1h, 1d. For all progress bar, they are the number of total tasks in correspond status.

Only the tasks in DEBUG/RUNNING status will show the progress.

How many scheduler/fetcher/processor/result_worker do I need? or pyspider stop working
--------------------------------------------------------------------------------------
You can have only have one scheduler, and multiple fetcher/processor/result_worker depends on the bottleneck. You can use the queue status on dashboard to view the bottleneck of the system:

![run one step](imgs/queue_status.png)

For example, the number between scheduler and fetcher indicate the queue size of scheduler to fetchers, when it's hitting 100 (default maximum queue size), fetcher might crashed, or you should considered adding more fetchers.

The number `0+0` below fetcher indicate the queue size of new tasks and status packs between processors and schduler. You can put your mouse over the numbers to see the tips.

================================================
FILE: docs/Quickstart.md
================================================
Quickstart
==========

Installation
------------

* `pip install pyspider`
* run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/)

if you are using ubuntu, try:
```
apt-get install python python-dev python-distribute python-pip \
libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml \
libssl-dev zlib1g-dev
```
to install binary packages first.


please install PhantomJS if needed: http://phantomjs.org/build.html

note that PhantomJS will be enabled only if it is excutable in the `PATH` or in the System Environment

**Note:** `pyspider` command is running pyspider in `all` mode, which running components in threads or subprocesses. For production environment, please refer to [Deployment](Deployment).

**WARNING:** WebUI is opened to public by default, it can be used to execute any command which may harm to you system. Please use it in internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config).

Your First Script
-----------------

```python
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://scrapy.org/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
```

> * `def on_start(self)` is the entry point of the script. It will be called when you click the `run` button on dashboard.
> * [`self.crawl(url, callback=self.index_page)`*](/apis/self.crawl) is the most important API here. It will add a new task to be crawled. Most of the options will be spicified via `self.crawl` arguments.
> * `def index_page(self, response)` get a [`Response`*](/apis/Response) object. [`response.doc`*](/apis/Response/#responsedoc) is a [pyquery](https://pythonhosted.org/pyquery/) object which has jQuery-like API to select elements to be extracted.
> * `def detail_page(self, response)` return a `dict` object as result. The result will be captured into `resultdb` by default. You can override `on_result(self, result)` method to manage the result yourself.


More things you may want to know:

> * [`@every(minutes=24*60, seconds=0)`*](/apis/@every/) is a helper to tell the scheduler that `on_start` method should be called everyday.
> * [`@config(age=10 * 24 * 60 * 60)`*](/apis/self.crawl/#configkwargs) specified the default `age` parameter of `self.crawl` with page type `index_page` (when `callback=self.index_page`). The parameter [`age`*](/apis/self.crawl/#age) can be specified via `self.crawl(url, age=10*24*60*60)` (highest priority) and `crawl_config` (lowest priority).
> * [`age=10 * 24 * 60 * 60`*](/apis/self.crawl/#age) tell scheduler discard the request if it have been crawled in 10 days. pyspider will not crawl a same URL twice by default (discard forever), even you had modified the code, it's very common for beginners that runs the project the first time and modified it and run it the second time, it will not crawl again (read [`itag`](/apis/self.crawl/#itag) for solution)
> * [`@config(priority=2)`*](/apis/self.crawl/#schedule) mark that detail pages should be crawled first.

You can test your script step by step by click the green `run` button. Switch to `follows` panel, click the play button to move on.

![run one step](imgs/run_one_step.png)

Start Running
-------------

1. Save your script.
2. Back to dashboard find your project.
3. Changing the `status` to `DEBUG` or `RUNNING`.
4. Click the `run` button.

![index demo](imgs/index_page.png)

Your script is running now!


================================================
FILE: docs/Running-pyspider-with-Docker.md
================================================
```shell
# mysql
docker run --name mysql -d -v /data/mysql:/var/lib/mysql -e MYSQL_ALLOW_EMPTY_PASSWORD=yes mysql:latest
# rabbitmq
docker run --name rabbitmq -d rabbitmq:latest

# phantomjs
docker run --name phantomjs -d binux/pyspider:latest phantomjs

# result worker
docker run --name result_worker -m 128m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest result_worker
# processor, run multiple instance if needed.
docker run --name processor -m 256m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest processor
# fetcher, run multiple instance if needed.
docker run --name fetcher -m 256m -d --link phantomjs:phantomjs --link rabbitmq:rabbitmq binux/pyspider:latest fetcher --no-xmlrpc
# scheduler
docker run --name scheduler -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest scheduler
# webui
docker run --name webui -m 256m -d -p 5000:5000 --link mysql:mysql --link rabbitmq:rabbitmq --link scheduler:scheduler --link phantomjs:phantomjs binux/pyspider:latest webui
```

or running with [Docker Compose](https://docs.docker.com/compose/) with `docker-compose.yml`:

NOTE: It's recommended to run mysql and rabbitmq outside compose as they may not been restarted with pyspider. You can find commands to start mysql and rabbitmq service above.

```
phantomjs:
  image: binux/pyspider:latest
  command: phantomjs
result:
  image: binux/pyspider:latest
  external_links:
    - mysql
    - rabbitmq
  command: result_worker
processor:
  image: binux/pyspider:latest
  external_links:
    - mysql
    - rabbitmq
  command: processor
fetcher:
  image: binux/pyspider:latest
  external_links:
    - rabbitmq
  links:
    - phantomjs
  command : fetcher
scheduler:
  image: binux/pyspider:latest
  external_links:
    - mysql
    - rabbitmq
  command: scheduler
webui:
  image: binux/pyspider:latest
  external_links:
    - mysql
    - rabbitmq
  links:
    - scheduler
    - phantomjs
  command: webui
  ports:
    - "5000:5000"
```

`docker-compose up`


================================================
FILE: docs/Script-Environment.md
================================================
Script Environment
==================

Variables
---------
* `self.project_name`
* `self.project` information about current project
* `self.response`
* `self.task`

About Script
------------
* The name of `Handler` is not matters, but you need at least one class inherit from `BaseHandler`
* A third parameter can be set to get task object: `def callback(self, response, task)`
* Non-200 response will not submit to callback by default. Use `@catch_status_code_error` 

About Environment
-----------------
* `logging`, `print` and exceptions will be captured.
* You can import other projects as module with `from projects import some_project`

### Web view

* view the page as a browser would render (approximately)

### HTML view

* view the HTML of the current callback (index_page, detail_page, etc.)

### Follows view

* view the callbacks that can be made from the current callback
* index_page follows view will show the detail_page callbacks that can be executed.

### Messages view

* shows the messages send by [`self.send_message`](apis/self.send_message) API.

### Enable CSS Selector Helper

* Enable a CSS Selector Helper of the Web view. It gets the CSS Selector of the element you clicked then add it to your script.


================================================
FILE: docs/Working-with-Results.md
================================================
Working with Results
====================
Downloading and viewing your data from WebUI is convenient, but may not suitable for computer.

Working with ResultDB
---------------------
Although resultdb is only designed for result preview, not suitable for large scale storage. But if you want to grab data from resultdb, there are some simple snippets using database API that can help you to connect and select the data.

```
from pyspider.database import connect_database
resultdb = connect_database("<your resutldb connection url>")
for project in resultdb.projects:
    for result in resultdb.select(project):
        assert result['taskid']
        assert result['url']
        assert result['result']
```

The `result['result']` is the object submitted by `return` statement from your script.

Working with ResultWorker
-------------------------
In product environment, you may want to connect pyspider to your system / post-processing pipeline, rather than store it into resultdb. It's highly recommended to override ResultWorker.

```
from pyspider.result import ResultWorker

class MyResultWorker(ResultWorker):
    def on_result(self, task, result):
        assert task['taskid']
        assert task['project']
        assert task['url']
        assert result
        # your processing code goes here
```

`result` is the object submitted by `return` statement from your script.

You can put this script (e.g., `my_result_worker.py`) at the folder where you launch pyspider. Add argument for `result_worker` subcommand:

`pyspider result_worker --result-cls=my_result_worker.MyResultWorker`

Or

```
{
  ...
  "result_worker": {
    "result_cls": "my_result_worker.MyResultWorker"
  }
  ...
}
```

if you are using config file. [Please refer to Deployment](/Deployment)

Design Your Own Database Schema
-------------------------------
The results stored in database is encoded as JSON for compatibility. It's highly recommended to design your own database, and override the ResultWorker described above.

TIPS about Results
-------------------
#### Want to return more than one result in callback?
As resultdb de-duplicate results by taskid(url), the latest will overwrite previous results.

One workaround is using `send_message` API to make a `fake` taskid for each result.

```
def detail_page(self, response):
    for li in response.doc('li').items():
        self.send_message(self.project_name, {
            ...
        }, url=response.url+"#"+li('a.product-sku').text())

def on_message(self, project, msg):
    return msg
```

See Also: [apis/self.send_message](/apis/self.send_message)


================================================
FILE: docs/apis/@catch_status_code_error.md
================================================
@catch_status_code_error
========================

non-200 response will been regarded as fetch failed and will not pass to callback. use this decorator to override this feature.

```python
def on_start(self):
    self.crawl('http://httpbin.org/status/404', self.callback)

@catch_status_code_error  
def callback(self, response):
    ...
```

>  The `callback` would not be executed as the request is failed (with status code 404). With the `@catch_status_code_error` decorater, the `callback` would be executed even if the request failed.


================================================
FILE: docs/apis/@every.md
================================================
@every(minutes=0, seconds=0)
============================

method will been called every `minutes` or `seconds`


```python
@every(minutes=24 * 60)
def on_start(self):
    for url in urllist:
        self.crawl(url, callback=self.index_page)
```

The urls would be restarted every 24 hours. Note that, if `age` is also used and the period is longer then `@every`, the crawl request would be discarded as it's regarded as not changed:

```python
@every(minutes=24 * 60)
def on_start(self):
    self.crawl('http://www.example.org/', callback=self.index_page)

@config(age=10 * 24 * 60 * 60)
def index_page(self):
    ...
```

> Even though the crawl request triggered every day, but it's discard and only restarted every 10 days.


================================================
FILE: docs/apis/Response.md
================================================
Response
========

The attributes of Response object.

### Response.url

final URL.

### Response.text

Content of response, in unicode.

if `Response.encoding` is None and `chardet` module is available, encoding of content will be guessed.

### Response.content

Content of response, in bytes.

### Response.doc

A [PyQuery](https://pythonhosted.org/pyquery/) object of the response's content. Links have made as absolute by default.

Refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/)

It's important that I will repeat, refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/)

### Response.etree

A [lxml](http://lxml.de/) object of the response's content.

### Response.json

The JSON-encoded content of the response, if any.

### Response.status_code

### Response.orig_url

If there is any redirection during the request, here is the url you just submit via `self.crawl`.

### Response.headers

A case insensitive dict holds the headers of response.

### Response.cookies

### Response.error

Messages when fetch error

### Response.time

Time used during fetching.

### Response.ok

True if `status_code` is 200 and no error.

### Response.encoding

Encoding of Response.content.

If Response.encoding is None, encoding will be guessed by header or content or `chardet`(if available).

Set encoding of content manually will overwrite the guessed encoding.

### Response.save

The object saved by [`self.crawl`](/apis/self.crawl/#save) API

### Response.js_script_result

content returned by JS script

### Response.raise_for_status()

Raise HTTPError if status code is not 200 or `Response.error` exists.


================================================
FILE: docs/apis/index.md
================================================
API Reference
=============
    
- [self.crawl](self.crawl)
- [Response](Response)
- [self.send_message](self.send_message)
- [@every](@every)
- [@catch_status_code_error](@catch_status_code_error)


================================================
FILE: docs/apis/self.crawl.md
================================================
self.crawl
===========

self.crawl(url, **kwargs)
-------------------------

`self.crawl` is the main interface to tell pyspider which url(s) should be crawled.

### Parameters:

##### url
the url or url list to be crawled.

##### callback
the method to parse the response. _default: `__call__` _


```python
def on_start(self):
    self.crawl('http://scrapy.org/', callback=self.index_page)
```

the following parameters are optional

##### age

the period of validity of the task. The page would be regarded as not modified during the period. _default: -1(never recrawl)_ 

```python
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    ...
```
> Every pages parsed by the callback `index_page` would be regarded not changed within 10 days. If you submit the task within 10 days since last crawled it would be discarded.

##### priority

the priority of task to be scheduled, higher the better. _default: 0_ 

```python
def index_page(self):
    self.crawl('http://www.example.org/page2.html', callback=self.index_page)
    self.crawl('http://www.example.org/233.html', callback=self.detail_page,
               priority=1)
```
> The page `233.html` would be crawled before `page2.html`. Use this parameter can do a [BFS](http://en.wikipedia.org/wiki/Breadth-first_search) and reduce the number of tasks in queue(which may cost more memory resources).

##### exetime

the executed time of task in unix timestamp. _default: 0(immediately)_ 

```python
import time
def on_start(self):
    self.crawl('http://www.example.org/', callback=self.callback,
               exetime=time.time()+30*60)
```
> The page would be crawled 30 minutes later.

##### retries

retry times while failed. _default: 3_ 

##### itag

a marker from frontier page to reveal the potential modification of the task. It will be compared to its last value, recrawl when it's changed. _default: None_ 

```python
def index_page(self, response):
    for item in response.doc('.item').items():
        self.crawl(item.find('a').attr.url, callback=self.detail_page,
                   itag=item.find('.update-time').text())
```
> In the sample, `.update-time` is used as itag. If it's not changed, the request would be discarded.

Or you can use `itag` with `Handler.crawl_config` to specify the script version if you want to restart all of the tasks.

```python
class Handler(BaseHandler):
    crawl_config = {
        'itag': 'v223'
    }
```
> Change the value of itag after you modified the script and click run button again. It doesn't matter if not set before. 

##### auto_recrawl

when enabled, task would be recrawled every `age` time. _default: False_ 

```python
def on_start(self):
    self.crawl('http://www.example.org/', callback=self.callback,
               age=5*60*60, auto_recrawl=True)
```
> The page would be restarted every `age` 5 hours.

##### method
    
HTTP method to use. _default: GET_ 

##### params

dictionary of URL parameters to append to the URL. 

```python
def on_start(self):
    self.crawl('http://httpbin.org/get', callback=self.callback,
               params={'a': 123, 'b': 'c'})
    self.crawl('http://httpbin.org/get?a=123&b=c', callback=self.callback)
```
> The two requests are the same.

##### data

the body to attach to the request. If a dictionary is provided, form-encoding will take place. 

```python
def on_start(self):
    self.crawl('http://httpbin.org/post', callback=self.callback,
               method='POST', data={'a': 123, 'b': 'c'})
```

##### files

dictionary of `{field: {filename: 'content'}}` files to multipart upload.` 

##### user_agent

the User-Agent of the request

##### headers

dictionary of headers to send. 

##### cookies

dictionary of cookies to attach to this request. 

##### connect_timeout

timeout for initial connection in seconds. _default: 20_

##### timeout

maximum time in seconds to fetch the page. _default: 120_ 

##### allow_redirects

follow `30x` redirect _default: True_ 

##### validate_cert

For HTTPS requests, validate the server’s certificate? _default: True_ 

##### proxy

proxy server of `username:password@hostname:port` to use, only http proxy is supported currently. 

```python
class Handler(BaseHandler):
    crawl_config = {
        'proxy': 'localhost:8080'
    }
```
> `Handler.crawl_config` can be used with `proxy` to set a proxy for whole project.

##### etag 

use HTTP Etag mechanism to pass the process if the content of the page is not changed. _default: True_ 

###### last_modified

use HTTP Last-Modified header mechanism to pass the process if the content of the page is not changed. _default: True_ 

##### fetch_type

set to `js` to enable JavaScript fetcher. _default: None_ 

##### js_script

JavaScript run before or after page loaded, should been wrapped by a function like `function() { document.write("binux"); }`. 


```python
def on_start(self):
    self.crawl('http://www.example.org/', callback=self.callback,
               fetch_type='js', js_script='''
               function() {
                   window.scrollTo(0,document.body.scrollHeight);
                   return 123;
               }
               ''')
```
> The script would scroll the page to bottom. The value returned in function could be captured via `Response.js_script_result`.

##### js_run_at

run JavaScript specified via `js_script` at `document-start` or `document-end`. _default: `document-end`_ 

##### js_viewport_width/js_viewport_height

set the size of the viewport for the JavaScript fetcher of the layout process. 

##### load_images

load images when JavaScript fetcher enabled. _default: False_ 

##### save

a object pass to the callback method, can be visit via `response.save`. 


```python
def on_start(self):
    self.crawl('http://www.example.org/', callback=self.callback,
               save={'a': 123})

def callback(self, response):
    return response.save['a']
```
> `123` would be returned in `callback`

##### taskid
    
unique id to identify the task, default is the MD5 check code of the URL, can be overridden by method `def get_taskid(self, task)` 

```python
import json
from pyspider.libs.utils import md5string
def get_taskid(self, task):
    return md5string(task['url']+json.dumps(task['fetch'].get('data', '')))
```
> Only url is md5 -ed as taskid by default, the code above add `data` of POST request as part of taskid.

##### force_update
    
force update task params even if the task is in `ACTIVE` status. 

##### cancel

cancel a task, should be used with `force_update` to cancel a active task. To cancel an `auto_recrawl` task, you should set `auto_recrawl=False` as well.

cURL command
------------

`self.crawl(curl_command)`

cURL is a command line tool to make a HTTP request. It can easily get form Chrome Devtools > Network panel,  right click the request and "Copy as cURL".

You can use cURL command as the first argument of `self.crawl`. It will parse the command and make the HTTP request just like curl do.

@config(**kwargs)
-----------------
default parameters of `self.crawl` when use the decorated method as callback. For example:

```python
@config(age=15*60)
def index_page(self, response):
    self.crawl('http://www.example.org/list-1.html', callback=self.index_page)
    self.crawl('http://www.example.org/product-233', callback=self.detail_page)
    
@config(age=10*24*60*60)
def detail_page(self, response):
    return {...}
```

`age` of `list-1.html` is 15min while the `age` of `product-233.html` is 10days. Because the callback of `product-233.html` is `detail_page`, means it's a `detail_page` so it shares the config of `detail_page`.

Handler.crawl_config = {}
-------------------------
default parameters of `self.crawl` for the whole project. The parameters in `crawl_config` for scheduler (priority, retries, exetime, age, itag, force_update, auto_recrawl, cancel) will be joined when the task created, the parameters for fetcher and processor will be joined when executed. You can use this mechanism to change the fetch config (e.g. cookies) afterwards.

```python
class Handler(BaseHandler):
    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
        }
    }
    
    ...
```
> crawl_config set a project level user-agent.


================================================
FILE: docs/apis/self.send_message.md
================================================
self.send_message
=================

self.send_message(project, msg, [url])
--------------------------------------
send messages to other project. can been received by `def on_message(self, project, message)` callback.

- `project` - other project name
- `msg` - any json-able object
- `url` - result will been overwrite if have same `taskid`. `send_message` share a same `taskid` by default. Change this to return multiple result by one response.

```python
def detail_page(self, response):
    for i, each in enumerate(response.json['products']):
        self.send_message(self.project_name, {
                "name": each['name'],
                'price': each['prices'],
             }, url="%s#%s" % (response.url, i))

def on_message(self, project, msg):
    return msg
``` 

pyspider send_message [OPTIONS] PROJECT MESSAGE
-----------------------------------------------

You can also send message from command line.

```
Usage: pyspider send_message [OPTIONS] PROJECT MESSAGE

  Send Message to project from command line

Options:
  --scheduler-rpc TEXT  xmlrpc path of scheduler
  --help                Show this message and exit.
```

def on_message(self, project, message)
--------------------------------------
receive message from other project


================================================
FILE: docs/conf.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-11-10 01:31:54

import sys
from unittest.mock import MagicMock
from recommonmark.parser import CommonMarkParser

class Mock(MagicMock):
    @classmethod
    def __getattr__(cls, name):
            return Mock()

MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

source_parsers = {
        '.md': CommonMarkParser,
}

source_suffix = ['.rst', '.md']


================================================
FILE: docs/index.md
================================================
pyspider [![Build Status][Build Status]][Travis CI] [![Coverage Status][Coverage Status]][Coverage] [![Try][Try]][Demo]
========

A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**

- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...

Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/)  
Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/)  
Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases)  

Sample Code 
-----------

```python
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://scrapy.org/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
```

[![Demo][Demo Img]][Demo]


Installation
------------

* `pip install pyspider`
* run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/)

Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/)

Contribute
----------

* Use It
* Open [Issue], send PR
* [User Group]
* [中文问答](http://segmentfault.com/t/pyspider)


TODO
----

### v0.4.0

- [x] local mode, load script from file.
- [x] works as a framework (all components running in one process, no threads)
- [x] redis
- [x] shell mode like `scrapy shell` 
- [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia)


### more

- [x] edit script with vim via [WebDAV](http://en.wikipedia.org/wiki/WebDAV)


License
-------
Licensed under the Apache License, Version 2.0


[Build Status]:         https://img.shields.io/travis/binux/pyspider/master.svg?style=flat
[Travis CI]:            https://travis-ci.org/binux/pyspider
[Coverage Status]:      https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat
[Coverage]:             https://coveralls.io/r/binux/pyspider
[Try]:                  https://img.shields.io/badge/try-pyspider-blue.svg?style=flat
[Demo]:                 http://demo.pyspider.org/
[Demo Img]:             imgs/demo.png
[Issue]:                https://github.com/binux/pyspider/issues
[User Group]:           https://groups.google.com/group/pyspider-users


================================================
FILE: docs/tutorial/AJAX-and-more-HTTP.md
================================================
Level 2: AJAX and More HTTP
===========================

In the last article, we discussed how to extract links and information from HTML documents. However, web contents are becoming more complicated using some technology like AJAX. You may find that page looks different with it in browser, the information you want to extract is not in the HTML of the page.

In this article, we will not write complete scrape scripts, but some snippets of web page cases using the technology like AJAX or needs some HTTP parameters besides URL.

AJAX
----

[AJAX] is short for asynchronous JavaScript + XML. AJAX is using existing standards to update parts of a web page without loading the whole page. A common usage of AJAX is loading [JSON] data and render to HTML on the client side.

You may find elements missing in HTML fetched by pyspider or [wget](https://www.gnu.org/software/wget/). When you open it in browser some elements appear after page loaded with(maybe not) a 'loading' animation or words. For example, we want to scrape all channels of Dota 2 from [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202)

![twitch](../imgs/twitch.png)

But you may find nothing in the page. 

### Finding the request

As [AJAX] data is transferred in [HTTP], we can find the real request with the help of [Chrome Developer Tools](https://developer.chrome.com/devtools).

0. Open a new tab.
1. Use `Ctrl`+`Shift`+`I` (or `Cmd`+`Opt`+`I` on Mac) to open the DevTools.
2. Switch to Network panel.
3. Open the URL [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) in this tab.

While resources are been loaded, you may find a table of requested resources.

![developer tools network](../imgs/developer-tools-network.png)

AJAX is using [XMLHttpRequest](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest) object to send and retrieve data which is generally shorted as "XHR". Use Filter (funnel icon) to filter out the XHR requests. Glance over each requests using preview:

![find request](../imgs/search-for-request.png)

To determine which one is the key request, you can use a filter to reduce the number of requests, guess the usage of the request by this path and parameters, then view the response contents for confirmation. Here we found the request: [http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1](http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1)

Now, open the URL in a new tab, you would see a [JSON] data containing channel list. You can use a extension [JSONView](https://chrome.google.com/webstore/detail/jsonview/chklaanhfefbnpoihckbnefhakgolnmc) ([for Firfox](http://jsonview.com/)) to have a pretty printed view of JSON. A sample code is trying extract the name, current title and viewers of each channel.

```
class Handler(BaseHandler):
    @every(minutes=10)
    def on_start(self):
        self.crawl('http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1', callback=self.index_page)

    @config(age=10*60)
    def index_page(self, response):
        return [{
                "name": x['channel']['display_name'],
                "viewers": x['viewers'],
                "status": x['channel'].get('status'),
             } for x in response.json['streams']]
```

> * You can use `response.json` to convert content to a python `dict` object.
> * As channel list is changing frequently, we update it every 10 minutes and use [`@config(age=10*60)`](/apis/self.crawl/#configkwargs) to set the age. Otherwise, it will be ignored as scheduler thinks it's new enough and refuse to update the content.

Here is an online demo for twitch as well as a measure using [PhantomJS] which will be discussed in the next level: [http://demo.pyspider.org/debug/tutorial_twitch](http://demo.pyspider.org/debug/tutorial_twitch)

HTTP
----

[HTTP] is the protocol to exchange or transfer hypertext. We had used it in last article, we used `self.crawl` and a URL to fetch HTML content which is transferred by [HTTP].

When you got `403 Forbidden` or needed login. You need right parameters of HTTP request.

A typical HTTP request message to [http://example.com/](http://example.com/) looks like:

```
GET / HTTP/1.1
Host: example.com
Connection: keep-alive
Cache-Control: max-age=0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.45 Safari/537.36
Referer: http://en.wikipedia.org/wiki/Example.com
Accept-Encoding: gzip, deflate, sdch
Accept-Language: zh-CN,zh;q=0.8
If-None-Match: "359670651"
If-Modified-Since: Fri, 09 Aug 2013 23:54:35 GMT
```

> * the first line contains [HTTP method](http://www.w3schools.com/tags/ref_httpmethods.asp), path and HTTP version
> * several lines of request header fields in `key: value` format.
> * if has message body(say POST request), an empty line and message body would be appended to end of request message.

You can get this with [Chrome Developer Tools](https://developer.chrome.com/devtools) - Network panel we used in above section:

![request header](../imgs/request-headers.png)

In most case, the last thing you need is to copy right URL + method + headers + body from Network panel.

cURL command
------------

`self.crawl` supports `cURL` command as argument to make the HTTP request. It will parse the arguments in the command and use it as fetch parameters.

With `Copy as cURL` of a request, you can get a `cURL` command and paste to `self.crawl(command)` to make crawling easy.

HTTP Method
-----------

[HTTP] defines methods to indicate the desired action to be performed on the identified resource. Two commonly used methods are: GET and POST. GET is when you open a URL, requests the content of a specified resource. POST is used to submit data to server.

TODO: need example here.

HTTP Headers
------------

[HTTP Headers](http://en.wikipedia.org/wiki/List_of_HTTP_header_fields) is a list of parameters of a request. Some headers you need to attention while scraping:

### User-Agent

A [user agent string](http://en.wikipedia.org/wiki/User_agent_string) tell server the application type, operating system or software revision who send the HTTP request.

pyspider's default user agent string is: `pyspider/VERSION (+http://pyspider.org/)`

### Referer

[Referer](http://en.wikipedia.org/wiki/HTTP_referer) is the address of the previous webpage from which a link to the currently requested page was followed. Some website uses this in image resources to prevent deep linking.

TODO: need example here.

HTTP Cookie
-----------

[HTTP Cookie](http://en.wikipedia.org/wiki/HTTP_cookie) is a field in HTTP headers used for tracking which user is making the request. Generally used for user login and prevent unauthorized requests.

You can use [`self.crawl(cookies={"key": value})`](/apis/self.crawl/#fetch) to set cookie via a dict like API.

TODO: need example here.

[PhantomJS]:           http://phantomjs.org/
[AJAX]:          http://en.wikipedia.org/wiki/Ajax_%28programming%29
[JSON]:          http://en.wikipedia.org/wiki/JSON
[HTTP]:          http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol


================================================
FILE: docs/tutorial/HTML-and-CSS-Selector.md
================================================
Level 1: HTML and CSS Selector
==============================

In this tutorial, we will scrape information of movies and TV from [IMDb].

An online demo with completed code is: [http://demo.pyspider.org/debug/tutorial_imdb](http://demo.pyspider.org/debug/tutorial_imdb) .


Before Start
------------

You should have pyspider installed. You can refer to the documentation [QuickStart](Quickstart). Or test your code on [demo.pyspider.org](http://demo.pyspider.org).

Some basic knowledges you should know before scraping:

* [Web][WWW] is a system of interlinked hypertext pages.
* Pages is identified on the Web via uniform resource locator ([URL]).
* Pages transferred via the Hypertext Transfer Protocol ([HTTP]).
* Web Pages structured using HyperText Markup Language ([HTML]).

To scrape information from a web is

1. Finding URLs of the pages contain the information we want.
2. Fetching the pages via HTTP.
3. Extracting the information from HTML.
4. Finding more URL contains what we want, go back to 2.


Pick a start URL
----------------

As we want to get all of the movies on [IMDb], the first thing is finding a list.  A good list page may:

* containing links to the [movies](http://www.imdb.com/title/tt0167260/) as many as possible.
* by following next page, you can traverse all of the movies. 
* list sorted by last updated time would be a great help to get latest movies.

By looking around at the index page of [IMDb], I found this:

![IMDb front page](../imgs/tutorial_imdb_front.png)

[http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1](http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1)

### Creating a project

You can find "Create" on the bottom right of baseboard. Click and name a project.

![Creating a project](../imgs/creating_a_project.png)

Changing the crawl URL in `on_start` callback:

```
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1', callback=self.index_page)
```

> * `self.crawl` would fetch the page and call the `callback` method to parse the response.  
> * The [`@every` decorator](http://docs.pyspider.org/en/latest/apis/@every/) represents `on_start` would execute every day, to make sure not missing any new movies.

Click the green `run` button, you should find a red 1 above follows, switch to follows panel, click the green play button:

![Run one step](../imgs/run_one_step.png)

Index Page
----------

From [index page](http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1), we need extract two things:

* links of the movies like `http://www.imdb.com/title/tt0167260/`
* links of [Next](http://www.imdb.com/search/title?count=100&ref_=nv_ch_mm_1&start=101&title_type=feature,tv_series,tv_movie) page

### Find Movies

As you can see, the sample handler had already extracted 1900+ links from the page. A measure of extracting movie pages is filtering links with regular expression:

```
import re
...

    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            if re.match("http://www.imdb.com/title/tt\d+/$", each.attr.href):
                self.crawl(each.attr.href, callback=self.detail_page)
```

> * `callback` is `self.detail_page` here to use another callback method to parse.

Remember you can always use the power of python or anything you are familiar with to extract information. But using tools like CSS selector is recommended.

### Next page

#### CSS Selectors

CSS selectors are patterns used by [CSS] to select HTML elements which are wanted to style. As elements containing information may have different style in document, It's appropriate to use CSS Selector to select elements we want. More information about CSS selectors could be found in above links:

* [CSS Selectors](http://www.w3schools.com/css/css_selectors.asp)
* [CSS Selector Reference](http://www.w3schools.com/cssref/css_selectors.asp)

You can use CSS Selector with built-in `response.doc` object, which is provided by [PyQuery], you may find the full reference there.

#### CSS Selector Helper

pyspider provide a tool called `CSS selector helper` to make it easier to generate a selector pattern to element you clicked. Enable CSS selector helper by click the button and switch to `web` panel.

![CSS Selector helper](../imgs/css_selector_helper.png)

The element will be highlighted in yellow while mouse over. When you click it, a pre-selected CSS Selector pattern is shown on the bar above. You can edit the features to locate the element and add it to your source code.

click "Next »" in the page and add selector pattern to your code:

```
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            if re.match("http://www.imdb.com/title/tt\d+/$", each.attr.href):
                self.crawl(each.attr.href, callback=self.detail_page)
        self.crawl(response.doc('#right a').attr.href, callback=self.index_page)
```

Click `run` again and move to the next page, we found that "« Prev" has the same selector pattern as "Next »". When using above code you may find pyspider selected the link of "« Prev", not "Next »". A solution for this is select both of them:

```
        self.crawl([x.attr.href for x in response.doc('#right a').items()], callback=self.index_page)
```

Extracting Information
----------------------

Click `run` again and follow to detail page.

Add keys you need to result dict and collect value using `CSS selector helper` repeatedly:

```
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('.header > [itemprop="name"]').text(),
            "rating": response.doc('.star-box-giga-star').text(),
            "director": [x.text() for x in response.doc('[itemprop="director"] span').items()],
        }
```

Note that, `CSS Selector helper` may not always work. You could write selector pattern manually with tools like [Chrome Dev Tools](https://developer.chrome.com/devtools):

![inspect element](../imgs/inspect_element.png)

You doesn't need to write every ancestral element in selector pattern, only the elements which can differentiate with not needed elements, is enough. However, it needs experience on scraping or Web developing to know which attribute is important, can be used as locator. You can also test CSS Selector in the JavaScript Console by using `$$` like `$$('[itemprop="director"] span')`

Running
-------

1. After tested you code, don't forget to save it.
2. Back to dashboard find your project.
3. Changing the `status` to `DEBUG` or `RUNNING`.
4. Press the `run` button. 

![index demo](../imgs/index_page.png)

Notes
-----

The script is just a simple, you may found more issues when scraping IMDb:

* ref in list page url is for tracing user, it's better remove it.
* IMDb does not serve more than 100000 results for any query, you need find more lists with lesser results, like [this](http://www.imdb.com/search/title?genres=action&title_type=feature&sort=moviemeter,asc)
* You may need a list sorted by last updated time and update it with a shorter interval.
* Some attribute is hard to extract, you may need write selector pattern on hand or using [XPATH](http://www.w3schools.com/xpath/xpath_syntax.asp) and/or some python code to extract information.

[IMDb]:          http://www.imdb.com/
[WWW]:           http://en.wikipedia.org/wiki/World_Wide_Web
[HTTP]:          http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol
[HTML]:          http://en.wikipedia.org/wiki/HTML
[URL]:           http://en.wikipedia.org/wiki/Uniform_resource_locator
[CSS]:           https://developer.mozilla.org/en-US/docs/Web/Guide/CSS/Getting_Started/What_is_CSS
[PyQuery]:       https://pythonhosted.org/pyquery/


================================================
FILE: docs/tutorial/Render-with-PhantomJS.md
================================================
Level 3: Render with PhantomJS
==============================

Sometimes web page is too complex to find out the API request. It's time to meet the power of [PhantomJS].

To use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if excutable in the `PATH`.

Make sure phantomjs is working by running
```
$ pyspider phantomjs
```

Continue with the rest of the tutorial if the output is
```
Web server running on port 25555
```

Use PhantomJS
-------------

When pyspider with PhantomJS connected, you can enable this feature by adding a parameter `fetch_type='js'` to `self.crawl`. We use PhantomJS to scrape channel list of  [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) which is loaded with AJAX we discussed in [Level 2](tutorial/AJAX-and-more-HTTP#ajax):

```
class Handler(BaseHandler):
    def on_start(self):
        self.crawl('http://www.twitch.tv/directory/game/Dota%202',
                   fetch_type='js', callback=self.index_page)
             
    def index_page(self, response):
        return {
            "url": response.url,
            "channels": [{
                "title": x('.title').text(),
                "viewers": x('.info').contents()[2],
                "name": x('.info a').text(),
            } for x in response.doc('.stream.item').items()]
        }
```
> I used some API to handle the list of streams. You can find complete API reference from [PyQuery complete API](https://pythonhosted.org/pyquery/api.html)

Running JavaScript on Page
--------------------------

We will try to scrape images from [http://www.pinterest.com/categories/popular/](http://www.pinterest.com/categories/popular/) in this section. Only 25 images is shown at the beginning, more images would be loaded when you scroll to the bottom of the page.

To scrape images as many as posible we can use a [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function wrapped JavaScript codes to simulate the scroll action: 

```
class Handler(BaseHandler):
    def on_start(self):
        self.crawl('http://www.pinterest.com/categories/popular/',
                   fetch_type='js', js_script="""
                   function() {
                       window.scrollTo(0,document.body.scrollHeight);
                   }
                   """, callback=self.index_page)

    def index_page(self, response):
        return {
            "url": response.url,
            "images": [{
                "title": x('.richPinGridTitle').text(),
                "img": x('.pinImg').attr('src'),
                "author": x('.creditName').text(),
            } for x in response.doc('.item').items() if x('.pinImg')]
        }
```

> * Script would been executed after page loaded(can been changed via [`js_run_at` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher))
> * We scroll once after page loaded, you can scroll multiple times using [`setTimeout`](https://developer.mozilla.org/en-US/docs/Web/API/WindowTimers.setTimeout). PhantomJS will fetch as many items as possible before timeout arrived.

Online demo: [http://demo.pyspider.org/debug/tutorial_pinterest](http://demo.pyspider.org/debug/tutorial_pinterest)


[PhantomJS]:           http://phantomjs.org/


================================================
FILE: docs/tutorial/index.md
================================================
pyspider Tutorial
=================

> The best way to learn how to scrap is learning how to make it.

* [Level 1: HTML and CSS Selector](HTML-and-CSS-Selector)
* [Level 2: AJAX and More HTTP](AJAX-and-more-HTTP)
* [Level 3: Render with PhantomJS](Render-with-PhantomJS)

If you have problem using pyspider, [user group](https://groups.google.com/group/pyspider-users) is a place for discussing.


================================================
FILE: mkdocs.yml
================================================
site_name: pyspider
site_description: A Powerful Spider(Web Crawler) System in Python.
site_author: binux
repo_url: https://github.com/binux/pyspider
pages:
- Introduction: index.md
- Quickstart: Quickstart.md
- Command Line: Command-Line.md
- Tutorial:
  - Index: tutorial/index.md
  - 'Level 1: HTML and CSS Selector': tutorial/HTML-and-CSS-Selector.md
  - 'Level 2: AJAX and More HTTP': tutorial/AJAX-and-more-HTTP.md
  - 'Level 3: Render with PhantomJS': tutorial/Render-with-PhantomJS.md
- About pyspider:
  - Architecture: Architecture.md
  - About Tasks: About-Tasks.md
  - About Projects: About-Projects.md
  - Script Environment: Script-Environment.md
  - Working with Results: Working-with-Results.md
- API Reference:
  - Index: apis/index.md
  - self.crawl: apis/self.crawl.md
  - Response: apis/Response.md
  - self.send_message: apis/self.send_message.md
  - '@catch_status_code_error': apis/@catch_status_code_error.md
  - '@every': apis/@every.md
- Deployment: Deployment.md
- Running pyspider with Docker: Running-pyspider-with-Docker.md
- Deployment of demo.pyspider.org: Deployment-demo.pyspider.org.md
- Frequently Asked Questions: Frequently-Asked-Questions.md

theme: readthedocs
markdown_extensions: ['toc(permalink=true)', ]


================================================
FILE: pyspider/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-11-17 19:17:12

__version__ = '0.4.0'


================================================
FILE: pyspider/database/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-08 15:04:08

import os, requests, json
from six.moves.urllib.parse import urlparse, parse_qs


def connect_database(url):
    """
    create database object by url

    mysql:
        mysql+type://user:passwd@host:port/database
    sqlite:
        # relative path
        sqlite+type:///path/to/database.db
        # absolute path
        sqlite+type:////path/to/database.db
        # memory database
        sqlite+type://
    mongodb:
        mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
        more: http://docs.mongodb.org/manual/reference/connection-string/
    sqlalchemy:
        sqlalchemy+postgresql+type://user:passwd@host:port/database
        sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database
        more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html
    redis:
        redis+taskdb://host:port/db
    elasticsearch:
        elasticsearch+type://host:port/?index=pyspider
    couchdb:
        couchdb+type://[username:password@]host[:port]
    local:
        local+projectdb://filepath,filepath

    type:
        taskdb
        projectdb
        resultdb

    """
    db = _connect_database(url)
    db.copy = lambda: _connect_database(url)
    return db


def _connect_database(url):  # NOQA
    parsed = urlparse(url)

    scheme = parsed.scheme.split('+')
    if len(scheme) == 1:
        raise Exception('wrong scheme format: %s' % parsed.scheme)
    else:
        engine, dbtype = scheme[0], scheme[-1]
        other_scheme = "+".join(scheme[1:-1])

    if dbtype not in ('taskdb', 'projectdb', 'resultdb'):
        raise LookupError('unknown database type: %s, '
                          'type should be one of ["taskdb", "projectdb", "resultdb"]', dbtype)

    if engine == 'mysql':
        return _connect_mysql(parsed,dbtype)

    elif engine == 'sqlite':
        return _connect_sqlite(parsed,dbtype)
    elif engine == 'mongodb':
        return _connect_mongodb(parsed,dbtype,url)

    elif engine == 'sqlalchemy':
        return _connect_sqlalchemy(parsed, dbtype, url, other_scheme)


    elif engine == 'redis':
        if dbtype == 'taskdb':
            from .redis.taskdb import TaskDB
            return TaskDB(parsed.hostname, parsed.port,
                          int(parsed.path.strip('/') or 0))
        else:
            raise LookupError('not supported dbtype: %s', dbtype)
    elif engine == 'local':
        scripts = url.split('//', 1)[1].split(',')
        if dbtype == 'projectdb':
            from .local.projectdb import ProjectDB
            return ProjectDB(scripts)
        else:
            raise LookupError('not supported dbtype: %s', dbtype)
    elif engine == 'elasticsearch' or engine == 'es':
        return _connect_elasticsearch(parsed, dbtype)

    elif engine == 'couchdb':
        return _connect_couchdb(parsed, dbtype, url)

    else:
        raise Exception('unknown engine: %s' % engine)


def _connect_mysql(parsed,dbtype):
    parames = {}
    if parsed.username:
        parames['user'] = parsed.username
    if parsed.password:
        parames['passwd'] = parsed.password
    if parsed.hostname:
        parames['host'] = parsed.hostname
    if parsed.port:
        parames['port'] = parsed.port
    if parsed.path.strip('/'):
        parames['database'] = parsed.path.strip('/')

    if dbtype == 'taskdb':
        from .mysql.taskdb import TaskDB
        return TaskDB(**parames)
    elif dbtype == 'projectdb':
        from .mysql.projectdb import ProjectDB
        return ProjectDB(**parames)
    elif dbtype == 'resultdb':
        from .mysql.resultdb import ResultDB
        return ResultDB(**parames)
    else:
        raise LookupError


def _connect_sqlite(parsed,dbtype):
    if parsed.path.startswith('//'):
        path = '/' + parsed.path.strip('/')
    elif parsed.path.startswith('/'):
        path = './' + parsed.path.strip('/')
    elif not parsed.path:
        path = ':memory:'
    else:
        raise Exception('error path: %s' % parsed.path)

    if dbtype == 'taskdb':
        from .sqlite.taskdb import TaskDB
        return TaskDB(path)
    elif dbtype == 'projectdb':
        from .sqlite.projectdb import ProjectDB
        return ProjectDB(path)
    elif dbtype == 'resultdb':
        from .sqlite.resultdb import ResultDB
        return ResultDB(path)
    else:
        raise LookupError


def _connect_mongodb(parsed,dbtype,url):
    url = url.replace(parsed.scheme, 'mongodb')
    parames = {}
    if parsed.path.strip('/'):
        parames['database'] = parsed.path.strip('/')

    if dbtype == 'taskdb':
        from .mongodb.taskdb import TaskDB
        return TaskDB(url, **parames)
    elif dbtype == 'projectdb':
        from .mongodb.projectdb import ProjectDB
        return ProjectDB(url, **parames)
    elif dbtype == 'resultdb':
        from .mongodb.resultdb import ResultDB
        return ResultDB(url, **parames)
    else:
        raise LookupError


def _connect_sqlalchemy(parsed, dbtype,url, other_scheme):
    if not other_scheme:
        raise Exception('wrong scheme format: %s' % parsed.scheme)
    url = url.replace(parsed.scheme, other_scheme)
    if dbtype == 'taskdb':
        from .sqlalchemy.taskdb import TaskDB
        return TaskDB(url)
    elif dbtype == 'projectdb':
        from .sqlalchemy.projectdb import ProjectDB
        return ProjectDB(url)
    elif dbtype == 'resultdb':
        from .sqlalchemy.resultdb import ResultDB
        return ResultDB(url)
    else:
        raise LookupError


def _connect_elasticsearch(parsed, dbtype):
    # in python 2.6 url like "http://host/?query", query will not been splitted
    if parsed.path.startswith('/?'):
        index = parse_qs(parsed.path[2:])
    else:
        index = parse_qs(parsed.query)
    if 'index' in index and index['index']:
        index = index['index'][0]
    else:
        index = 'pyspider'

    if dbtype == 'projectdb':
        from .elasticsearch.projectdb import ProjectDB
        return ProjectDB([parsed.netloc], index=index)
    elif dbtype == 'resultdb':
        from .elasticsearch.resultdb import ResultDB
        return ResultDB([parsed.netloc], index=index)
    elif dbtype == 'taskdb':
        from .elasticsearch.taskdb import TaskDB
        return TaskDB([parsed.netloc], index=index)


def _connect_couchdb(parsed, dbtype, url):
    if os.environ.get('COUCHDB_HTTPS'):
        url = "https://" + parsed.netloc + "/"
    else:
        url = "http://" + parsed.netloc + "/"
    params = {}

    # default to env, then url, then hard coded
    params['username'] = os.environ.get('COUCHDB_USER') or parsed.username
    params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password

    if dbtype == 'taskdb':
        from .couchdb.taskdb import TaskDB
        return TaskDB(url, **params)
    elif dbtype == 'projectdb':
        from .couchdb.projectdb import ProjectDB
        return ProjectDB(url, **params)
    elif dbtype == 'resultdb':
        from .couchdb.resultdb import ResultDB
        return ResultDB(url, **params)
    else:
        raise LookupError


================================================
FILE: pyspider/database/base/__init__.py
================================================


================================================
FILE: pyspider/database/base/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-09 11:28:52

import re

# NOTE: When get/get_all/check_update from database with default fields,
#       all following fields should be included in output dict.
{
    'project': {
        'name': str,
        'group': str,
        'status': str,
        'script': str,
        # 'config': str,
        'comments': str,
        # 'priority': int,
        'rate': int,
        'burst': int,
        'updatetime': int,
    }
}


class ProjectDB(object):
    status_str = [
        'TODO',
        'STOP',
        'CHECKING',
        'DEBUG',
        'RUNNING',
    ]

    def insert(self, name, obj={}):
        raise NotImplementedError

    def update(self, name, obj={}, **kwargs):
        raise NotImplementedError

    def get_all(self, fields=None):
        raise NotImplementedError

    def get(self, name, fields):
        raise NotImplementedError

    def drop(self, name):
        raise NotImplementedError

    def check_update(self, timestamp, fields=None):
        raise NotImplementedError

    def split_group(self, group, lower=True):
        if lower:
            return re.split("\W+", (group or '').lower())
        else:
            return re.split("\W+", group or '')

    def verify_project_name(self, name):
        if len(name) > 64:
            return False
        if re.search(r"[^\w]", name):
            return False
        return True

    def copy(self):
        '''
        database should be able to copy itself to create new connection

        it's implemented automatically by pyspider.database.connect_database
        if you are not create database connection via connect_database method,
        you should implement this
        '''
        raise NotImplementedError


================================================
FILE: pyspider/database/base/resultdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-11 18:40:03

# result schema
{
    'result': {
        'taskid': str,  # new, not changeable
        'project': str,  # new, not changeable
        'url': str,  # new, not changeable
        'result': str,  # json string
        'updatetime': int,
    }
}


class ResultDB(object):
    """
    database for result
    """
    projects = set()  # projects in resultdb

    def save(self, project, taskid, url, result):
        raise NotImplementedError

    def select(self, project, fields=None, offset=0, limit=None):
        raise NotImplementedError

    def count(self, project):
        raise NotImplementedError

    def get(self, project, taskid, fields=None):
        raise NotImplementedError

    def drop(self, project):
        raise NotImplementedError

    def copy(self):
        '''
        database should be able to copy itself to create new connection

        it's implemented automatically by pyspider.database.connect_database
        if you are not create database connection via connect_database method,
        you should implement this
        '''
        raise NotImplementedError


================================================
FILE: pyspider/database/base/taskdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-08 10:28:48

# task schema
{
    'task': {
        'taskid': str,  # new, not change
        'project': str,  # new, not change
        'url': str,  # new, not change
        'status': int,  # change
        'schedule': {
            'priority': int,
            'retries': int,
            'retried': int,
            'exetime': int,
            'age': int,
            'itag': str,
            # 'recrawl': int
        },  # new and restart
        'fetch': {
            'method': str,
            'headers': dict,
            'data': str,
            'timeout': int,
            'save': dict,
        },  # new and restart
        'process': {
            'callback': str,
        },  # new and restart
        'track': {
            'fetch': {
                'ok': bool,
                'time': int,
                'status_code': int,
                'headers': dict,
                'encoding': str,
                'content': str,
            },
            'process': {
                'ok': bool,
                'time': int,
                'follows': int,
                'outputs': int,
                'logs': str,
                'exception': str,
            },
            'save': object,  # jsonable object saved by processor
        },  # finish
        'lastcrawltime': int,  # keep between request
        'updatetime': int,  # keep between request
    }
}


class TaskDB(object):
    ACTIVE = 1
    SUCCESS = 2
    FAILED = 3
    BAD = 4

    projects = set()  # projects in taskdb

    def load_tasks(self, status, project=None, fields=None):
        raise NotImplementedError

    def get_task(self, project, taskid, fields=None):
        raise NotImplementedError

    def status_count(self, project):
        '''
        return a dict
        '''
        raise NotImplementedError

    def insert(self, project, taskid, obj={}):
        raise NotImplementedError

    def update(self, project, taskid, obj={}, **kwargs):
        raise NotImplementedError

    def drop(self, project):
        raise NotImplementedError

    @staticmethod
    def status_to_string(status):
        return {
            1: 'ACTIVE',
            2: 'SUCCESS',
            3: 'FAILED',
            4: 'BAD',
        }.get(status, 'UNKNOWN')

    @staticmethod
    def status_to_int(status):
        return {
            'ACTIVE': 1,
            'SUCCESS': 2,
            'FAILED': 3,
            'BAD': 4,
        }.get(status, 4)

    def copy(self):
        '''
        database should be able to copy itself to create new connection

        it's implemented automatically by pyspider.database.connect_database
        if you are not create database connection via connect_database method,
        you should implement this
        '''
        raise NotImplementedError


================================================
FILE: pyspider/database/basedb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.com>
#         http://binux.me
# Created on 2012-08-30 17:43:49

from __future__ import unicode_literals, division, absolute_import

import logging
logger = logging.getLogger('database.basedb')

from six import itervalues
from pyspider.libs import utils


class BaseDB:

    '''
    BaseDB

    dbcur should be overwirte
    '''
    __tablename__ = None
    placeholder = '%s'
    maxlimit = -1

    @staticmethod
    def escape(string):
        return '`%s`' % string

    @property
    def dbcur(self):
        raise NotImplementedError

    def _execute(self, sql_query, values=[]):
        dbcur = self.dbcur
        dbcur.execute(sql_query, values)
        return dbcur

    def _select(self, tablename=None, what="*", where="", where_values=[], offset=0, limit=None):
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        elif offset:
            sql_query += " LIMIT %d, %d" % (offset, self.maxlimit)
        logger.debug("<sql: %s>", sql_query)

        for row in self._execute(sql_query, where_values):
            yield row

    def _select2dic(self, tablename=None, what="*", where="", where_values=[],
                    order=None, offset=0, limit=None):
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if order:
            sql_query += ' ORDER BY %s' % order
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        elif offset:
            sql_query += " LIMIT %d, %d" % (offset, self.maxlimit)
        logger.debug("<sql: %s>", sql_query)

        dbcur = self._execute(sql_query, where_values)

        # f[0] may return bytes type
        # https://github.com/mysql/mysql-connector-python/pull/37
        fields = [utils.text(f[0]) for f in dbcur.description]

        for row in dbcur:
            yield dict(zip(fields, row))

    def _replace(self, tablename=None, **values):
        tablename = self.escape(tablename or self.__tablename__)
        if values:
            _keys = ", ".join(self.escape(k) for k in values)
            _values = ", ".join([self.placeholder, ] * len(values))
            sql_query = "REPLACE INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        else:
            sql_query = "REPLACE INTO %s DEFAULT VALUES" % tablename
        logger.debug("<sql: %s>", sql_query)

        if values:
            dbcur = self._execute(sql_query, list(itervalues(values)))
        else:
            dbcur = self._execute(sql_query)
        return dbcur.lastrowid

    def _insert(self, tablename=None, **values):
        tablename = self.escape(tablename or self.__tablename__)
        if values:
            _keys = ", ".join((self.escape(k) for k in values))
            _values = ", ".join([self.placeholder, ] * len(values))
            sql_query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, _keys, _values)
        else:
            sql_query = "INSERT INTO %s DEFAULT VALUES" % tablename
        logger.debug("<sql: %s>", sql_query)

        if values:
            dbcur = self._execute(sql_query, list(itervalues(values)))
        else:
            dbcur = self._execute(sql_query)
        return dbcur.lastrowid

    def _update(self, tablename=None, where="1=0", where_values=[], **values):
        tablename = self.escape(tablename or self.__tablename__)
        _key_values = ", ".join([
            "%s = %s" % (self.escape(k), self.placeholder) for k in values
        ])
        sql_query = "UPDATE %s SET %s WHERE %s" % (tablename, _key_values, where)
        logger.debug("<sql: %s>", sql_query)

        return self._execute(sql_query, list(itervalues(values)) + list(where_values))

    def _delete(self, tablename=None, where="1=0", where_values=[]):
        tablename = self.escape(tablename or self.__tablename__)
        sql_query = "DELETE FROM %s" % tablename
        if where:
            sql_query += " WHERE %s" % where
        logger.debug("<sql: %s>", sql_query)

        return self._execute(sql_query, where_values)

if __name__ == "__main__":
    import sqlite3

    class DB(BaseDB):
        __tablename__ = "test"
        placeholder = "?"

        def __init__(self):
            self.conn = sqlite3.connect(":memory:")
            cursor = self.conn.cursor()
            cursor.execute(
                '''CREATE TABLE `%s` (id INTEGER PRIMARY KEY AUTOINCREMENT, name, age)'''
                % self.__tablename__
            )

        @property
        def dbcur(self):
            return self.conn.cursor()

    db = DB()
    assert db._insert(db.__tablename__, name="binux", age=23) == 1
    assert db._select(db.__tablename__, "name, age").next() == ("binux", 23)
    assert db._select2dic(db.__tablename__, "name, age").next()["name"] == "binux"
    assert db._select2dic(db.__tablename__, "name, age").next()["age"] == 23
    db._replace(db.__tablename__, id=1, age=24)
    assert db._select(db.__tablename__, "name, age").next() == (None, 24)
    db._update(db.__tablename__, "id = 1", age=16)
    assert db._select(db.__tablename__, "name, age").next() == (None, 16)
    db._delete(db.__tablename__, "id = 1")
    assert [row for row in db._select(db.__tablename__)] == []


================================================
FILE: pyspider/database/couchdb/__init__.py
================================================


================================================
FILE: pyspider/database/couchdb/couchdbbase.py
================================================
import time, requests, json
from requests.auth import HTTPBasicAuth

class SplitTableMixin(object):
    UPDATE_PROJECTS_TIME = 10 * 60

    def __init__(self):
        self.session = requests.session()
        if self.username:
            self.session.auth = HTTPBasicAuth(self.username, self.password)
        self.session.headers.update({'Content-Type': 'application/json'})

    def _collection_name(self, project):
        if self.collection_prefix:
            return "%s_%s" % (self.collection_prefix, project)
        else:
            return project


    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects


    @projects.setter
    def projects(self, value):
        self._projects = value


    def _list_project(self):
        self._last_update_projects = time.time()
        self.projects = set()
        if self.collection_prefix:
            prefix = "%s." % self.collection_prefix
        else:
            prefix = ''

        url = self.base_url + "_all_dbs"
        res = self.session.get(url, json={}).json()
        for each in res:
            if each.startswith('_'):
                continue
            if each.startswith(self.database):
                self.projects.add(each[len(self.database)+1+len(prefix):])


    def create_database(self, name):
        url = self.base_url + name
        res = self.session.put(url).json()
        if 'error' in res and res['error'] == 'unauthorized':
            raise Exception("Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password))
        return res


    def get_doc(self, db_name, doc_id):
        url = self.base_url + db_name + "/" + doc_id
        res = self.session.get(url).json()
        if "error" in res and res["error"] == "not_found":
            return None
        return res


    def get_docs(self, db_name, selector):
        url = self.base_url + db_name + "/_find"
        selector['use_index'] = self.index
        res = self.session.post(url, json=selector).json()
        if 'error' in res and res['error'] == 'not_found':
            return []
        return res['docs']


    def get_all_docs(self, db_name):
        return self.get_docs(db_name, {"selector": {}})


    def insert_doc(self, db_name, doc_id, doc):
        url = self.base_url + db_name + "/" + doc_id
        return self.session.put(url, json=doc).json()


    def update_doc(self, db_name, doc_id, new_doc):
        doc = self.get_doc(db_name, doc_id)
        if doc is None:
            return self.insert_doc(db_name, doc_id, new_doc)
        for key in new_doc:
            doc[key] = new_doc[key]
        url = self.base_url + db_name + "/" + doc_id
        return self.session.put(url, json=doc).json()


    def delete(self, url):
        return self.session.delete(url).json()


================================================
FILE: pyspider/database/couchdb/projectdb.py
================================================
import time, requests, json
from requests.auth import HTTPBasicAuth
from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB


class ProjectDB(BaseProjectDB):
    __collection_name__ = 'projectdb'

    def __init__(self, url, database='projectdb', username=None, password=None):
        self.username = username
        self.password = password
        self.url = url + self.__collection_name__ + "_" + database + "/"
        self.database = database

        self.session = requests.session()
        if username:
            self.session.auth = HTTPBasicAuth(self.username, self.password)
        self.session.headers.update({'Content-Type': 'application/json'})

        # Create the db
        res = self.session.put(self.url).json()
        if 'error' in res and res['error'] == 'unauthorized':
            raise Exception(
                "Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'],
                                                                                                  self.username,
                                                                                                  self.password))
        # create index
        payload = {
            'index': {
                'fields': ['name']
            },
            'name': self.__collection_name__ + "_" + database
        }
        res = self.session.post(self.url + "_index", json=payload).json()
        self.index = res['id']

    def _default_fields(self, each):
        if each is None:
            return each
        each.setdefault('group', None)
        each.setdefault('status', 'TODO')
        each.setdefault('script', '')
        each.setdefault('comments', None)
        each.setdefault('rate', 0)
        each.setdefault('burst', 0)
        each.setdefault('updatetime', 0)
        return each

    def insert(self, name, obj={}):
        url = self.url + name
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        res = self.session.put(url, json=obj).json()
        return res

    def update(self, name, obj={}, **kwargs):
        # object contains the fields to update and their new values
        update = self.get(name) # update will contain _rev
        if update is None:
            return None
        obj = dict(obj)
        obj['updatetime'] = time.time()
        obj.update(kwargs)
        for key in obj:
            update[key] = obj[key]
        return self.insert(name, update)

    def get_all(self, fields=None):
        if fields is None:
            fields = []
        payload = {
            "selector": {},
            "fields": fields,
            "use_index": self.index
        }
        url = self.url + "_find"
        res = self.session.post(url, json=payload).json()
        for doc in res['docs']:
            yield self._default_fields(doc)

    def get(self, name, fields=None):
        if fields is None:
            fields = []
        payload = {
            "selector": {"name": name},
            "fields": fields,
            "limit": 1,
            "use_index": self.index
        }
        url = self.url + "_find"
        res = self.session.post(url, json=payload).json()
        if len(res['docs']) == 0:
            return None
        return self._default_fields(res['docs'][0])

    def check_update(self, timestamp, fields=None):
        if fields is None:
            fields = []
        for project in self.get_all(fields=('updatetime', 'name')):
            if project['updatetime'] > timestamp:
                project = self.get(project['name'], fields)
                yield self._default_fields(project)

    def drop(self, name):
        doc = self.get(name)
        payload = {"rev": doc["_rev"]}
        url = self.url + name
        return self.session.delete(url, params=payload).json()

    def drop_database(self):
        return self.session.delete(self.url).json()


================================================
FILE: pyspider/database/couchdb/resultdb.py
================================================
import time, json
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from .couchdbbase import SplitTableMixin


class ResultDB(SplitTableMixin, BaseResultDB):
    collection_prefix = ''

    def __init__(self, url, database='resultdb', username=None, password=None):
        self.username = username
        self.password = password
        self.base_url = url
        self.url = url + database + "/"
        self.database = database

        super().__init__()
        self.create_database(database)
        self.index = None

    def _get_collection_name(self, project):
        return self.database + "_" + self._collection_name(project)

    def _create_project(self, project):
        collection_name = self._get_collection_name(project)
        self.create_database(collection_name)
        # create index
        payload = {
            'index': {
                'fields': ['taskid']
            },
            'name': collection_name
        }

        res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
        self.index = res['id']
        self._list_project()

    def save(self, project, taskid, url, result):
        if project not in self.projects:
            self._create_project(project)
        collection_name = self._get_collection_name(project)
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.update_doc(collection_name, taskid, obj)

    def select(self, project, fields=None, offset=0, limit=0):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        offset = offset or 0
        limit = limit or 0
        collection_name = self._get_collection_name(project)
        if fields is None:
            fields = []
        if limit == 0:
            sel = {
                'selector': {},
                'fields': fields,
                'skip': offset
            }
        else:
            sel = {
              'selector': {},
              'fields': fields,
              'skip': offset,
              'limit': limit
            }
        for result in self.get_docs(collection_name, sel):
            yield result

    def count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._get_collection_name(project)
        return len(self.get_all_docs(collection_name))

    def get(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._get_collection_name(project)
        if fields is None:
            fields = []
        sel = {
            'selector': {'taskid': taskid},
            'fields': fields
        }
        ret = self.get_docs(collection_name, sel)
        if len(ret) == 0:
            return None
        return ret[0]

    def drop_database(self):
        return self.delete(self.url)

    def drop(self, project):
        # drop the project
        collection_name = self._get_collection_name(project)
        url = self.base_url + collection_name
        return self.delete(url)

================================================
FILE: pyspider/database/couchdb/taskdb.py
================================================
import json, time
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from .couchdbbase import SplitTableMixin


class TaskDB(SplitTableMixin, BaseTaskDB):
    collection_prefix = ''

    def __init__(self, url, database='taskdb', username=None, password=None):
        self.username = username
        self.password = password
        self.base_url = url
        self.url = url + database + "/"
        self.database = database
        self.index = None

        super().__init__()

        self.create_database(database)
        self.projects = set()
        self._list_project()

    def _get_collection_name(self, project):
        return self.database + "_" + self._collection_name(project)

    def _create_project(self, project):
        collection_name = self._get_collection_name(project)
        self.create_database(collection_name)
        # create index
        payload = {
            'index': {
                'fields': ['status', 'taskid']
            },
            'name': collection_name
        }
        res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
        self.index = res['id']
        self._list_project()

    def load_tasks(self, status, project=None, fields=None):
        if not project:
            self._list_project()
        if fields is None:
            fields = []
        if project:
            projects = [project, ]
        else:
            projects = self.projects
        for project in projects:
            collection_name = self._get_collection_name(project)
            for task in self.get_docs(collection_name, {"selector": {"status": status}, "fields": fields}):
                yield task

    def get_task(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        if fields is None:
            fields = []
        collection_name = self._get_collection_name(project)
        ret = self.get_docs(collection_name, {"selector": {"taskid": taskid}, "fields": fields})
        if len(ret) == 0:
            return None
        return ret[0]

    def status_count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return {}
        collection_name = self._get_collection_name(project)

        def _count_for_status(collection_name, status):
            total = len(self.get_docs(collection_name, {"selector": {'status': status}}))
            return {'total': total, "_id": status} if total else None

        c = collection_name
        ret = filter(lambda x: x,map(lambda s: _count_for_status(c, s), [self.ACTIVE, self.SUCCESS, self.FAILED]))

        result = {}
        if isinstance(ret, dict):
            ret = ret.get('result', [])
        for each in ret:
            result[each['_id']] = each['total']
        return result

    def insert(self, project, taskid, obj={}):
        if project not in self.projects:
            self._create_project(project)
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        return self.update(project, taskid, obj=obj)

    def update(self, project, taskid, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        collection_name = self._get_collection_name(project)
        return self.update_doc(collection_name, taskid, obj)

    def drop_database(self):
        return self.delete(self.url)

    def drop(self, project):
        collection_name = self._get_collection_name(project)
        url = self.base_url + collection_name
        return self.delete(url)

================================================
FILE: pyspider/database/elasticsearch/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2016-01-17 18:31:58


================================================
FILE: pyspider/database/elasticsearch/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2016-01-17 18:32:33

import time

import elasticsearch.helpers
from elasticsearch import Elasticsearch
from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB


class ProjectDB(BaseProjectDB):
    __type__ = 'project'

    def __init__(self, hosts, index='pyspider'):
        self.index = index
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    "updatetime": {"type": "double"}
                }
            })

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()

        obj.setdefault('group', '')
        obj.setdefault('status', 'TODO')
        obj.setdefault('script', '')
        obj.setdefault('comments', '')
        obj.setdefault('rate', 0)
        obj.setdefault('burst', 0)

        return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name,
                             refresh=True)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.es.update(index=self.index, doc_type=self.__type__,
                              body={'doc': obj}, id=name, refresh=True, ignore=404)

    def get_all(self, fields=None):
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {"match_all": {}}},
                                                 _source_include=fields or []):
            yield record['_source']

    def get(self, name, fields=None):
        ret = self.es.get(index=self.index, doc_type=self.__type__, id=name,
                          _source_include=fields or [], ignore=404)
        return ret.get('_source', None)

    def check_update(self, timestamp, fields=None):
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {"range": {
                                                     "updatetime": {"gte": timestamp}
                                                 }}}, _source_include=fields or []):
            yield record['_source']

    def drop(self, name):
        return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True)


================================================
FILE: pyspider/database/elasticsearch/resultdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2016-01-18 19:41:24


import time

import elasticsearch.helpers
from elasticsearch import Elasticsearch
from pyspider.database.base.resultdb import ResultDB as BaseResultDB


class ResultDB(BaseResultDB):
    __type__ = 'result'

    def __init__(self, hosts, index='pyspider'):
        self.index = index
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": True},
                "properties": {
                    "taskid": {"enabled": False},
                    "project": {"type": "string", "index": "not_analyzed"},
                    "url": {"enabled": False},
                }
            })

    @property
    def projects(self):
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"aggs": {"projects": {
                                 "terms": {"field": "project"}
                             }}}, _source=False)
        return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])]

    def save(self, project, taskid, url, result):
        obj = {
            'taskid': taskid,
            'project': project,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.es.index(index=self.index, doc_type=self.__type__,
                             body=obj, id='%s:%s' % (project, taskid))

    def select(self, project, fields=None, offset=0, limit=0):
        offset = offset or 0
        limit = limit or 0
        if not limit:
            for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                     query={'query': {'term': {'project': project}}},
                                                     _source_include=fields or [], from_=offset,
                                                     sort="updatetime:desc"):
                yield record['_source']
        else:
            for record in self.es.search(index=self.index, doc_type=self.__type__,
                                         body={'query': {'term': {'project': project}}},
                                         _source_include=fields or [], from_=offset, size=limit,
                                         sort="updatetime:desc"
                                         ).get('hits', {}).get('hits', []):
                yield record['_source']

    def count(self, project):
        return self.es.count(index=self.index, doc_type=self.__type__,
                             body={'query': {'term': {'project': project}}}
                             ).get('count', 0)

    def get(self, project, taskid, fields=None):
        ret = self.es.get(index=self.index, doc_type=self.__type__, id="%s:%s" % (project, taskid),
                          _source_include=fields or [], ignore=404)
        return ret.get('_source', None)

    def drop(self, project):
        self.refresh()
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {'term': {'project': project}}},
                                                 _source=False):
            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])

    def refresh(self):
        """
        Explicitly refresh one or more index, making all operations
        performed since the last refresh available for search.
        """
        self.es.indices.refresh(index=self.index)


================================================
FILE: pyspider/database/elasticsearch/taskdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2016-01-20 20:20:55


import time
import json

import elasticsearch.helpers
from elasticsearch import Elasticsearch
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB


class TaskDB(BaseTaskDB):
    __type__ = 'task'

    def __init__(self, hosts, index='pyspider'):
        self.index = index
        self._changed = False
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    "project": {"type": "string", "index": "not_analyzed"},
                    "status": {"type": "byte"},
                }
            })

    def _parse(self, data):
        if not data:
            return data
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    @property
    def projects(self):
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"aggs": {"projects": {
                                 "terms": {"field": "project"}
                             }}}, _source=False)
        return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])]

    def load_tasks(self, status, project=None, fields=None):
        self.refresh()
        if project is None:
            for project in self.projects:
                for each in self.load_tasks(status, project, fields):
                    yield each
        else:
            for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                     query={'query': {'bool': {
                                                         'must': {'term': {'project': project}},
                                                         'should': [{'term': {'status': status}}],
                                                         'minimum_should_match': 1,
                                                     }}}, _source_include=fields or []):
                yield self._parse(record['_source'])

    def get_task(self, project, taskid, fields=None):
        if self._changed:
            self.refresh()
        ret = self.es.get(index=self.index, doc_type=self.__type__, id="%s:%s" % (project, taskid),
                          _source_include=fields or [], ignore=404)
        return self._parse(ret.get('_source', None))

    def status_count(self, project):
        self.refresh()
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"query": {'term': {'project': project}},
                                   "aggs": {"status": {
                                       "terms": {"field": "status"}
                                   }}}, _source=False)
        result = {}
        for each in ret['aggregations']['status'].get('buckets', []):
            result[each['key']] = each['doc_count']
        return result

    def insert(self, project, taskid, obj={}):
        self._changed = True
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        return self.es.index(index=self.index, doc_type=self.__type__,
                             body=self._stringify(obj), id='%s:%s' % (project, taskid))

    def update(self, project, taskid, obj={}, **kwargs):
        self._changed = True
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.es.update(index=self.index, doc_type=self.__type__, id='%s:%s' % (project, taskid),
                              body={"doc": self._stringify(obj)}, ignore=404)

    def drop(self, project):
        self.refresh()
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {'term': {'project': project}}},
                                                 _source=False):
            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])
        self.refresh()

    def refresh(self):
        """
        Explicitly refresh one or more index, making all operations
        performed since the last refresh available for search.
        """
        self._changed = False
        self.es.indices.refresh(index=self.index)


================================================
FILE: pyspider/database/local/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-17 20:56:50


================================================
FILE: pyspider/database/local/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-17 12:32:17

import os
import re
import six
import glob
import logging

from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB


class ProjectDB(BaseProjectDB):
    """ProjectDB loading scripts from local file."""

    def __init__(self, files):
        self.files = files
        self.projects = {}
        self.load_scripts()

    def load_scripts(self):
        project_names = set(self.projects.keys())
        for path in self.files:
            for filename in glob.glob(path):
                name = os.path.splitext(os.path.basename(filename))[0]
                if name in project_names:
                    project_names.remove(name)
                updatetime = os.path.getmtime(filename)
                if name not in self.projects or updatetime > self.projects[name]['updatetime']:
                    project = self._build_project(filename)
                    if not project:
                        continue
                    self.projects[project['name']] = project

        for name in project_names:
            del self.projects[name]

    rate_re = re.compile(r'^\s*#\s*rate.*?(\d+(\.\d+)?)', re.I | re.M)
    burst_re = re.compile(r'^\s*#\s*burst.*?(\d+(\.\d+)?)', re.I | re.M)

    def _build_project(self, filename):
        try:
            with open(filename) as fp:
                script = fp.read()
            m = self.rate_re.search(script)
            if m:
                rate = float(m.group(1))
            else:
                rate = 1

            m = self.burst_re.search(script)
            if m:
                burst = float(m.group(1))
            else:
                burst = 3

            return {
                'name': os.path.splitext(os.path.basename(filename))[0],
                'group': None,
                'status': 'RUNNING',
                'script': script,
                'comments': None,
                'rate': rate,
                'burst': burst,
                'updatetime': os.path.getmtime(filename),
            }
        except OSError as e:
            logging.error('loading project script error: %s', e)
            return None

    def get_all(self, fields=None):
        for projectname in self.projects:
            yield self.get(projectname, fields)

    def get(self, name, fields=None):
        if name not in self.projects:
            return None
        project = self.projects[name]
        result = {}
        for f in fields or project:
            if f in project:
                result[f] = project[f]
            else:
                result[f] = None
        return result

    def check_update(self, timestamp, fields=None):
        self.load_scripts()
        for projectname, project in six.iteritems(self.projects):
            if project['updatetime'] > timestamp:
                yield self.get(projectname, fields)


================================================
FILE: pyspider/database/mongodb/__init__.py
================================================


================================================
FILE: pyspider/database/mongodb/mongodbbase.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-11-22 20:42:01

import time


class SplitTableMixin(object):
    UPDATE_PROJECTS_TIME = 10 * 60

    def _collection_name(self, project):
        if self.collection_prefix:
            return "%s.%s" % (self.collection_prefix, project)
        else:
            return project

    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        self._last_update_projects = time.time()
        self.projects = set()
        if self.collection_prefix:
            prefix = "%s." % self.collection_prefix
        else:
            prefix = ''
        for each in self.database.collection_names():
            if each.startswith('system.'):
                continue
            if each.startswith(prefix):
                self.projects.add(each[len(prefix):])

    def drop(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._collection_name(project)
        self.database[collection_name].drop()
        self._list_project()


================================================
FILE: pyspider/database/mongodb/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-12 12:22:42

import time
from pymongo import MongoClient

from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB


class ProjectDB(BaseProjectDB):
    __collection_name__ = 'projectdb'

    def __init__(self, url, database='projectdb'):
        self.conn = MongoClient(url)
        self.conn.admin.command("ismaster")
        self.database = self.conn[database]
        self.collection = self.database[self.__collection_name__]

        self.collection.ensure_index('name', unique=True)

    def _default_fields(self, each):
        if each is None:
            return each
        each.setdefault('group', None)
        each.setdefault('status', 'TODO')
        each.setdefault('script', '')
        each.setdefault('comments', None)
        each.setdefault('rate', 0)
        each.setdefault('burst', 0)
        each.setdefault('updatetime', 0)
        return each

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        return self.collection.update({'name': name}, {'$set': obj}, upsert=True)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.collection.update({'name': name}, {'$set': obj})

    def get_all(self, fields=None):
        for each in self.collection.find({}, fields):
            if each and '_id' in each:
                del each['_id']
            yield self._default_fields(each)

    def get(self, name, fields=None):
        each = self.collection.find_one({'name': name}, fields)
        if each and '_id' in each:
            del each['_id']
        return self._default_fields(each)

    def check_update(self, timestamp, fields=None):
        for project in self.get_all(fields=('updatetime', 'name')):
            if project['updatetime'] > timestamp:
                project = self.get(project['name'], fields)
                yield self._default_fields(project)

    def drop(self, name):
        return self.collection.remove({'name': name})


================================================
FILE: pyspider/database/mongodb/resultdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-13 22:18:36

import json
import time

from pymongo import MongoClient

from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from .mongodbbase import SplitTableMixin


class ResultDB(SplitTableMixin, BaseResultDB):
    collection_prefix = ''

    def __init__(self, url, database='resultdb'):
        self.conn = MongoClient(url)
        self.conn.admin.command("ismaster")
        self.database = self.conn[database]
        self.projects = set()

        self._list_project()
        # we suggest manually build index in advance, instead of indexing
        #  in the startup process,
        # for project in self.projects:
        #     collection_name = self._collection_name(project)
        #     self.database[collection_name].ensure_index('taskid')
        pass

    def _create_project(self, project):
        collection_name = self._collection_name(project)
        self.database[collection_name].ensure_index('taskid')
        self._list_project()

    def _parse(self, data):
        data['_id'] = str(data['_id'])
        if 'result' in data:
            data['result'] = json.loads(data['result'])
        return data

    def _stringify(self, data):
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        if project not in self.projects:
            self._create_project(project)
        collection_name = self._collection_name(project)
        obj = {
            'taskid'    : taskid,
            'url'       : url,
            'result'    : result,
            'updatetime': time.time(),
        }
        return self.database[collection_name].update(
            {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True
        )

    def select(self, project, fields=None, offset=0, limit=0):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        offset = offset or 0
        limit = limit or 0
        collection_name = self._collection_name(project)
        for result in self.database[collection_name].find({}, fields, skip=offset, limit=limit):
            yield self._parse(result)

    def count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._collection_name(project)
        return self.database[collection_name].count()

    def get(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._collection_name(project)
        ret = self.database[collection_name].find_one({'taskid': taskid}, fields)
        if not ret:
            return ret
        return self._parse(ret)


================================================
FILE: pyspider/database/mongodb/taskdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-11 23:54:50

import json
import time

from pymongo import MongoClient

from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from .mongodbbase import SplitTableMixin


class TaskDB(SplitTableMixin, BaseTaskDB):
    collection_prefix = ''

    def __init__(self, url, database='taskdb'):
        self.conn = MongoClient(url)
        self.conn.admin.command("ismaster")
        self.database = self.conn[database]
        self.projects = set()

        self._list_project()
        # we suggest manually build index in advance, instead of indexing
        #  in the startup process,
        # for project in self.projects:
        #     collection_name = self._collection_name(project)
        #     self.database[collection_name].ensure_index('status')
        #     self.database[collection_name].ensure_index('taskid')

    def _create_project(self, project):
        collection_name = self._collection_name(project)
        self.database[collection_name].ensure_index('status')
        self.database[collection_name].ensure_index('taskid')
        self._list_project()

    def _parse(self, data):
        if '_id' in data:
            del data['_id']
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    if isinstance(data[each], bytearray):
                        data[each] = str(data[each])
                    data[each] = json.loads(data[each], encoding='utf8')
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    def load_tasks(self, status, project=None, fields=None):
        if not project:
            self._list_project()

        if project:
            projects = [project, ]
        else:
            projects = self.projects

        for project in projects:
            collection_name = self._collection_name(project)
            for task in self.database[collection_name].find({'status': status}, fields):
                yield self._parse(task)

    def get_task(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        collection_name = self._collection_name(project)
        ret = self.database[collection_name].find_one({'taskid': taskid}, fields)
        if not ret:
            return ret
        return self._parse(ret)

    def status_count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return {}
        collection_name = self._collection_name(project)

        # when there are too many data in task collection , aggregate operation will take a very long time,
        #  and this will cause scheduler module startup to be particularly slow

        # ret = self.database[collection_name].aggregate([
        #     {'$group': {
        #         '_id'  : '$status',
        #         'total': {
        #             '$sum': 1
        #         }
        #     }
        #     }])

        # Instead of aggregate, use find-count on status(with index) field.
        def _count_for_status(collection, status):
            total = collection.find({'status': status}).count()
            return {'total': total, "_id": status} if total else None

        c = self.database[collection_name]
        ret = filter(
            lambda x: x,
            map(
                lambda s: _count_for_status(c, s), [self.ACTIVE, self.SUCCESS, self.FAILED]
            )
        )

        result = {}
        if isinstance(ret, dict):
            ret = ret.get('result', [])
        for each in ret:
            result[each['_id']] = each['total']
        return result

    def insert(self, project, taskid, obj={}):
        if project not in self.projects:
            self._create_project(project)
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        return self.update(project, taskid, obj=obj)

    def update(self, project, taskid, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        collection_name = self._collection_name(project)
        return self.database[collection_name].update(
            {'taskid': taskid},
            {"$set": self._stringify(obj)},
            upsert=True
        )


================================================
FILE: pyspider/database/mysql/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-07-17 20:12:54


================================================
FILE: pyspider/database/mysql/mysqlbase.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-11-05 10:42:24

import time
import mysql.connector


class MySQLMixin(object):
    maxlimit = 18446744073709551615

    @property
    def dbcur(self):
        try:
            if self.conn.unread_result:
                self.conn.get_rows()
                if hasattr(self.conn, 'free_result'):
                    self.conn.free_result()
            return self.conn.cursor()
        except (mysql.connector.OperationalError, mysql.connector.InterfaceError):
            self.conn.ping(reconnect=True)
            self.conn.database = self.database_name
            return self.conn.cursor()


class SplitTableMixin(object):
    UPDATE_PROJECTS_TIME = 10 * 60

    def _tablename(self, project):
        if self.__tablename__:
            return '%s_%s' % (self.__tablename__, project)
        else:
            return project

    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        self._last_update_projects = time.time()
        self.projects = set()
        if self.__tablename__:
            prefix = '%s_' % self.__tablename__
        else:
            prefix = ''
        for project, in self._execute('show tables;'):
            if project.startswith(prefix):
                project = project[len(prefix):]
                self.projects.add(project)

    def drop(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        self._execute("DROP TABLE %s" % self.escape(tablename))
        self._list_project()


================================================
FILE: pyspider/database/mysql/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-07-17 21:06:43

import time
import mysql.connector

from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB
from pyspider.database.basedb import BaseDB
from .mysqlbase import MySQLMixin


class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB):
    __tablename__ = 'projectdb'

    def __init__(self, host='localhost', port=3306, database='projectdb',
                 user='root', passwd=None):
        self.database_name = database
        self.conn = mysql.connector.connect(user=user, password=passwd,
                                            host=host, port=port, autocommit=True)
        if database not in [x[0] for x in self._execute('show databases')]:
            self._execute('CREATE DATABASE %s' % self.escape(database))
        self.conn.database = database

        self._execute('''CREATE TABLE IF NOT EXISTS %s (
            `name` varchar(64) PRIMARY KEY,
            `group` varchar(64),
            `status` varchar(16),
            `script` TEXT,
            `comments` varchar(1024),
            `rate` float(11, 4),
            `burst` float(11, 4),
            `updatetime` double(16, 4)
            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__))

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        return self._insert(**obj)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj)
        return ret.rowcount

    def get_all(self, fields=None):
        return self._select2dic(what=fields)

    def get(self, name, fields=None):
        where = "`name` = %s" % self.placeholder
        for each in self._select2dic(what=fields, where=where, where_values=(name, )):
            return each
        return None

    def drop(self, name):
        where = "`name` = %s" % self.placeholder
        return self._delete(where=where, where_values=(name, ))

    def check_update(self, timestamp, fields=None):
        where = "`updatetime` >= %f" % timestamp
        return self._select2dic(what=fields, where=where)


================================================
FILE: pyspider/database/mysql/resultdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-13 22:02:57

import re
import six
import time
import json
import mysql.connector

from pyspider.libs import utils
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from pyspider.database.basedb import BaseDB
from .mysqlbase import MySQLMixin, SplitTableMixin


class ResultDB(MySQLMixin, SplitTableMixin, BaseResultDB, BaseDB):
    __tablename__ = ''

    def __init__(self, host='localhost', port=3306, database='resultdb',
                 user='root', passwd=None):
        self.database_name = database
        self.conn = mysql.connector.connect(user=user, password=passwd,
                                            host=host, port=port, autocommit=True)
        if database not in [x[0] for x in self._execute('show databases')]:
            self._execute('CREATE DATABASE %s' % self.escape(database))
        self.conn.database = database
        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        tablename = self._tablename(project)
        if tablename in [x[0] for x in self._execute('show tables')]:
            return
        self._execute('''CREATE TABLE %s (
            `taskid` varchar(64) PRIMARY KEY,
            `url` varchar(1024),
            `result` MEDIUMBLOB,
            `updatetime` double(16, 4)
            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename))

    def _parse(self, data):
        for key, value in list(six.iteritems(data)):
            if isinstance(value, (bytearray, six.binary_type)):
                data[key] = utils.text(value)
        if 'result' in data:
            data['result'] = json.loads(data['result'])
        return data

    def _stringify(self, data):
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        tablename = self._tablename(project)
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self._replace(tablename, **self._stringify(obj))

    def select(self, project, fields=None, offset=0, limit=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)

        for task in self._select2dic(tablename, what=fields, order='updatetime DESC',
                                     offset=offset, limit=limit):
            yield self._parse(task)

    def count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return 0
        tablename = self._tablename(project)
        for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)):
            return count

    def get(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        where = "`taskid` = %s" % self.placeholder
        for task in self._select2dic(tablename, what=fields,
                                     where=where, where_values=(taskid, )):
            return self._parse(task)


================================================
FILE: pyspider/database/mysql/taskdb.py
================================================
#!/usr/bin/envutils
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-07-17 18:53:01


import re
import six
import time
import json
import mysql.connector

from pyspider.libs import utils
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from pyspider.database.basedb import BaseDB
from .mysqlbase import MySQLMixin, SplitTableMixin


class TaskDB(MySQLMixin, SplitTableMixin, BaseTaskDB, BaseDB):
    __tablename__ = ''

    def __init__(self, host='localhost', port=3306, database='taskdb',
                 user='root', passwd=None):
        self.database_name = database
        self.conn = mysql.connector.connect(user=user, password=passwd,
                                            host=host, port=port, autocommit=True)
        if database not in [x[0] for x in self._execute('show databases')]:
            self._execute('CREATE DATABASE %s' % self.escape(database))
        self.conn.database = database
        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        tablename = self._tablename(project)
        if tablename in [x[0] for x in self._execute('show tables')]:
            return
        self._execute('''CREATE TABLE IF NOT EXISTS %s (
            `taskid` varchar(64) PRIMARY KEY,
            `project` varchar(64),
            `url` varchar(1024),
            `status` int(1),
            `schedule` BLOB,
            `fetch` BLOB,
            `process` BLOB,
            `track` BLOB,
            `lastcrawltime` double(16, 4),
            `updatetime` double(16, 4),
            INDEX `status_index` (`status`)
            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename))

    def _parse(self, data):
        for key, value in list(six.iteritems(data)):
            if isinstance(value, (bytearray, six.binary_type)):
                data[key] = utils.text(value)
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    def load_tasks(self, status, project=None, fields=None):
        if project and project not in self.projects:
            return
        where = "`status` = %s" % self.placeholder

        if project:
            projects = [project, ]
        else:
            projects = self.projects

        for project in projects:
            tablename = self._tablename(project)
            for each in self._select2dic(
                tablename, what=fields, where=where, where_values=(status, )
            ):
                yield self._parse(each)

    def get_task(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return None
        where = "`taskid` = %s" % self.placeholder
        tablename = self._tablename(project)
        for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )):
            return self._parse(each)
        return None

    def status_count(self, project):
        result = dict()
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return result
        tablename = self._tablename(project)
        for status, count in self._execute("SELECT `status`, count(1) FROM %s GROUP BY `status`" %
                                           self.escape(tablename)):
            result[status] = count
        return result

    def insert(self, project, taskid, obj={}):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        tablename = self._tablename(project)
        return self._insert(tablename, **self._stringify(obj))

    def update(self, project, taskid, obj={}, **kwargs):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            raise LookupError
        tablename = self._tablename(project)
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self._update(
            tablename,
            where="`taskid` = %s" % self.placeholder,
            where_values=(taskid, ),
            **self._stringify(obj)
        )


================================================
FILE: pyspider/database/redis/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-05-17 01:34:21


================================================
FILE: pyspider/database/redis/taskdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-05-16 21:01:52

import six
import time
import json
import redis
import logging
import itertools

from pyspider.libs import utils
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB


class TaskDB(BaseTaskDB):
    UPDATE_PROJECTS_TIME = 10 * 60
    __prefix__ = 'taskdb_'

    def __init__(self, host='localhost', port=6379, db=0):
        self.redis = redis.StrictRedis(host=host, port=port, db=db)
        try:
            self.redis.scan(count=1)
            self.scan_available = True
        except Exception as e:
            logging.debug("redis_scan disabled: %r", e)
            self.scan_available = False

    def _gen_key(self, project, taskid):
        return "%s%s_%s" % (self.__prefix__, project, taskid)

    def _gen_status_key(self, project, status):
        return '%s%s_status_%d' % (self.__prefix__, project, status)

    def _parse(self, data):
        if six.PY3:
            result = {}
            for key, value in data.items():
                if isinstance(value, bytes):
                    value = utils.text(value)
                result[utils.text(key)] = value
            data = result

        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        if 'status' in data:
            data['status'] = int(data['status'])
        if 'lastcrawltime' in data:
            data['lastcrawltime'] = float(data['lastcrawltime'] or 0)
        if 'updatetime' in data:
            data['updatetime'] = float(data['updatetime'] or 0)
        return data

    def _stringify(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._projects = set(utils.text(x) for x in self.redis.smembers(
                self.__prefix__ + 'projects'))
        return self._projects

    def load_tasks(self, status, project=None, fields=None):
        if project is None:
            project = self.projects
        elif not isinstance(project, list):
            project = [project, ]

        if self.scan_available:
            scan_method = self.redis.sscan_iter
        else:
            scan_method = self.redis.smembers

        if fields:
            def get_method(key):
                obj = self.redis.hmget(key, fields)
                if all(x is None for x in obj):
                    return None
                return dict(zip(fields, obj))
        else:
            get_method = self.redis.hgetall

        for p in project:
            status_key = self._gen_status_key(p, status)
            for taskid in scan_method(status_key):
                obj = get_method(self._gen_key(p, utils.text(taskid)))
                if not obj:
                    #self.redis.srem(status_key, taskid)
                    continue
                else:
                    yield self._parse(obj)

    def get_task(self, project, taskid, fields=None):
        if fields:
            obj = self.redis.hmget(self._gen_key(project, taskid), fields)
            if all(x is None for x in obj):
                return None
            obj = dict(zip(fields, obj))
        else:
            obj = self.redis.hgetall(self._gen_key(project, taskid))

        if not obj:
            return None
        return self._parse(obj)

    def status_count(self, project):
        '''
        return a dict
        '''
        pipe = self.redis.pipeline(transaction=False)
        for status in range(1, 5):
            pipe.scard(self._gen_status_key(project, status))
        ret = pipe.execute()

        result = {}
        for status, count in enumerate(ret):
            if count > 0:
                result[status + 1] = count
        return result

    def insert(self, project, taskid, obj={}):
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        obj.setdefault('status', self.ACTIVE)

        task_key = self._gen_key(project, taskid)

        pipe = self.redis.pipeline(transaction=False)
        if project not in self.projects:
            pipe.sadd(self.__prefix__ + 'projects', project)
        pipe.hmset(task_key, self._stringify(obj))
        pipe.sadd(self._gen_status_key(project, obj['status']), taskid)
        pipe.execute()

    def update(self, project, taskid, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()

        pipe = self.redis.pipeline(transaction=False)
        pipe.hmset(self._gen_key(project, taskid), self._stringify(obj))
        if 'status' in obj:
            for status in range(1, 5):
                if status == obj['status']:
                    pipe.sadd(self._gen_status_key(project, status), taskid)
                else:
                    pipe.srem(self._gen_status_key(project, status), taskid)
        pipe.execute()

    def drop(self, project):
        self.redis.srem(self.__prefix__ + 'projects', project)

        if self.scan_available:
            scan_method = self.redis.scan_iter
        else:
            scan_method = self.redis.keys

        for each in itertools.tee(scan_method("%s%s_*" % (self.__prefix__, project)), 100):
            each = list(each)
            if each:
                self.redis.delete(*each)


================================================
FILE: pyspider/database/sqlalchemy/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-04 20:11:04


================================================
FILE: pyspider/database/sqlalchemy/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-04 23:25:10

import six
import time
import sqlalchemy.exc

from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text
from sqlalchemy.engine.url import make_url
from pyspider.libs import utils
from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB
from .sqlalchemybase import result2dict


class ProjectDB(BaseProjectDB):
    __tablename__ = 'projectdb'

    def __init__(self, url):
        self.table = Table(self.__tablename__, MetaData(),
                           Column('name', String(64), primary_key=True),
                           Column('group', String(64)),
                           Column('status', String(16)),
                           Column('script', Text),
                           Column('comments', String(1024)),
                           Column('rate', Float(11)),
                           Column('burst', Float(11)),
                           Column('updatetime', Float(32)),
                           mysql_engine='InnoDB',
                           mysql_charset='utf8'
                           )

        self.url = make_url(url)
        if self.url.database:
            database = self.url.database
            self.url.database = None
            try:
                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)
                conn = engine.connect()
                conn.execute("commit")
                conn.execute("CREATE DATABASE %s" % database)
            except sqlalchemy.exc.SQLAlchemyError:
                pass
            self.url.database = database
        self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600)
        self.table.create(self.engine, checkfirst=True)

    @staticmethod
    def _parse(data):
        return data

    @staticmethod
    def _stringify(data):
        return data

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        return self.engine.execute(self.table.insert()
                                   .values(**self._stringify(obj)))

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.engine.execute(self.table.update()
                                   .where(self.table.c.name == name)
                                   .values(**self._stringify(obj)))

    def get_all(self, fields=None):
        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns)):
            yield self._parse(result2dict(columns, task))

    def get(self, name, fields=None):
        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .where(self.table.c.name == name)
                                        .limit(1)
                                        .with_only_columns(columns)):
            return self._parse(result2dict(columns, task))

    def drop(self, name):
        return self.engine.execute(self.table.delete()
                                   .where(self.table.c.name == name))

    def check_update(self, timestamp, fields=None):
        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns)
                                        .where(self.table.c.updatetime >= timestamp)):
            yield self._parse(result2dict(columns, task))


================================================
FILE: pyspider/database/sqlalchemy/resultdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-04 18:48:15

import re
import six
import time
import json
import sqlalchemy.exc

from sqlalchemy import (create_engine, MetaData, Table, Column,
                        String, Float, Text)
from sqlalchemy.engine.url import make_url
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from pyspider.libs import utils
from .sqlalchemybase import SplitTableMixin, result2dict


class ResultDB(SplitTableMixin, BaseResultDB):
    __tablename__ = ''

    def __init__(self, url):
        self.table = Table('__tablename__', MetaData(),
                           Column('taskid', String(64), primary_key=True, nullable=False),
                           Column('url', String(1024)),
                           Column('result', Text()),
                           Column('updatetime', Float(32)),
                           mysql_engine='InnoDB',
                           mysql_charset='utf8'
                           )

        self.url = make_url(url)
        if self.url.database:
            database = self.url.database
            self.url.database = None
            try:
                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)
                conn = engine.connect()
                conn.execute("commit")
                conn.execute("CREATE DATABASE %s" % database)
            except sqlalchemy.exc.SQLAlchemyError:
                pass
            self.url.database = database
        self.engine = create_engine(url, convert_unicode=True,
                                    pool_recycle=3600)

        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        if project in self.projects:
            return
        self.table.name = self._tablename(project)
        self.table.create(self.engine)

    @staticmethod
    def _parse(data):
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.binary_type):
                data[key] = utils.text(value)
        if 'result' in data:
            if data['result']:
                data['result'] = json.loads(data['result'])
            else:
                data['result'] = {}
        return data

    @staticmethod
    def _stringify(data):
        if 'result' in data:
            if data['result']:
                data['result'] = json.dumps(data['result'])
            else:
                data['result'] = json.dumps({})
        return data

    def save(self, project, taskid, url, result):
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        self.table.name = self._tablename(project)
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        if self.get(project, taskid, ('taskid', )):
            del obj['taskid']
            return self.engine.execute(self.table.update()
                                       .where(self.table.c.taskid == taskid)
                                       .values(**self._stringify(obj)))
        else:
            return self.engine.execute(self.table.insert()
                                       .values(**self._stringify(obj)))

    def select(self, project, fields=None, offset=0, limit=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        self.table.name = self._tablename(project)

        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns=columns)
                                        .order_by(self.table.c.updatetime.desc())
                                        .offset(offset).limit(limit)
                                        .execution_options(autocommit=True)):
            yield self._parse(result2dict(columns, task))

    def count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return 0
        self.table.name = self._tablename(project)

        for count, in self.engine.execute(self.table.count()):
            return count

    def get(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        self.table.name = self._tablename(project)

        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for task in self.engine.execute(self.table.select()
                                        .with_only_columns(columns=columns)
                                        .where(self.table.c.taskid == taskid)
                                        .limit(1)):
            return self._parse(result2dict(columns, task))


================================================
FILE: pyspider/database/sqlalchemy/sqlalchemybase.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-04 18:48:47

import time


def result2dict(columns, task):
    return dict(task)


class SplitTableMixin(object):
    UPDATE_PROJECTS_TIME = 10 * 60

    def _tablename(self, project):
        if self.__tablename__:
            return '%s_%s' % (self.__tablename__, project)
        else:
            return project

    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        self._last_update_projects = time.time()
        self.projects = set()
        if self.__tablename__:
            prefix = '%s_' % self.__tablename__
        else:
            prefix = ''

        for project in self.engine.table_names():
            if project.startswith(prefix):
                project = project[len(prefix):]
                self.projects.add(project)

    def drop(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        self.table.name = self._tablename(project)
        self.table.drop(self.engine)
        self._list_project()


================================================
FILE: pyspider/database/sqlalchemy/taskdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-04 22:33:43

import re
import six
import time
import json
import sqlalchemy.exc

from sqlalchemy import (create_engine, MetaData, Table, Column, Index,
                        Integer, String, Float, Text, func)
from sqlalchemy.engine.url import make_url
from pyspider.libs import utils
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from .sqlalchemybase import SplitTableMixin, result2dict


class TaskDB(SplitTableMixin, BaseTaskDB):
    __tablename__ = ''

    def __init__(self, url):
        self.table = Table('__tablename__', MetaData(),
                           Column('taskid', String(64), primary_key=True, nullable=False),
                           Column('project', String(64)),
                           Column('url', String(1024)),
                           Column('status', Integer),
                           Column('schedule', Text()),
                           Column('fetch', Text()),
                           Column('process', Text()),
                           Column('track', Text()),
                           Column('lastcrawltime', Float(32)),
                           Column('updatetime', Float(32)),
                           mysql_engine='InnoDB',
                           mysql_charset='utf8'
                           )

        self.url = make_url(url)
        if self.url.database:
            database = self.url.database
            self.url.database = None
            try:
                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)
                conn = engine.connect()
                conn.execute("commit")
                conn.execute("CREATE DATABASE %s" % database)
            except sqlalchemy.exc.SQLAlchemyError:
                pass
            self.url.database = database
        self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600)

        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        if project in self.projects:
            return
        self.table.name = self._tablename(project)
        Index('status_%s_index' % self.table.name, self.table.c.status)
        self.table.create(self.engine, checkfirst=True)
        self.table.indexes.clear()

    @staticmethod
    def _parse(data):
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.binary_type):
                data[key] = utils.text(value)
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    @staticmethod
    def _stringify(data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.dumps(data[each])
                else:
                    data[each] = json.dumps({})
        return data

    def load_tasks(self, status, project=None, fields=None):
        if project and project not in self.projects:
            return

        if project:
            projects = [project, ]
        else:
            projects = self.projects

        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for project in projects:
            self.table.name = self._tablename(project)
            for task in self.engine.execute(self.table.select()
                                            .with_only_columns(columns)
                                            .where(self.table.c.status == status)):
                yield self._parse(result2dict(columns, task))

    def get_task(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return None

        self.table.name = self._tablename(project)
        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c
        for each in self.engine.execute(self.table.select()
                                        .with_only_columns(columns)
                                        .limit(1)
                                        .where(self.table.c.taskid == taskid)):
            return self._parse(result2dict(columns, each))

    def status_count(self, project):
        result = dict()
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return result

        self.table.name = self._tablename(project)
        for status, count in self.engine.execute(
                self.table.select()
                .with_only_columns((self.table.c.status, func.count(1)))
                .group_by(self.table.c.status)):
            result[status] = count
        return result

    def insert(self, project, taskid, obj={}):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        self.table.name = self._tablename(project)
        return self.engine.execute(self.table.insert()
                                   .values(**self._stringify(obj)))

    def update(self, project, taskid, obj={}, **kwargs):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            raise LookupError
        self.table.name = self._tablename(project)
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.engine.execute(self.table.update()
                                   .where(self.table.c.taskid == taskid)
                                   .values(**self._stringify(obj)))


================================================
FILE: pyspider/database/sqlite/__init__.py
================================================


================================================
FILE: pyspider/database/sqlite/projectdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-09 12:05:52

import time

from .sqlitebase import SQLiteMixin
from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB
from pyspider.database.basedb import BaseDB


class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB):
    __tablename__ = 'projectdb'
    placeholder = '?'

    def __init__(self, path):
        self.path = path
        self.last_pid = 0
        self.conn = None
        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (
                name PRIMARY KEY,
                `group`,
                status, script, comments,
                rate, burst, updatetime
                )''' % self.__tablename__)

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        return self._insert(**obj)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj)
        return ret.rowcount

    def get_all(self, fields=None):
        return self._select2dic(what=fields)

    def get(self, name, fields=None):
        where = "`name` = %s" % self.placeholder
        for each in self._select2dic(what=fields, where=where, where_values=(name, )):
            return each
        return None

    def check_update(self, timestamp, fields=None):
        where = "`updatetime` >= %f" % timestamp
        return self._select2dic(what=fields, where=where)

    def drop(self, name):
        where = "`name` = %s" % self.placeholder
        return self._delete(where=where, where_values=(name, ))


================================================
FILE: pyspider/database/sqlite/resultdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-13 17:08:43

import re
import time
import json

from .sqlitebase import SQLiteMixin, SplitTableMixin
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
from pyspider.database.basedb import BaseDB


class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB):
    __tablename__ = 'resultdb'
    placeholder = '?'

    def __init__(self, path):
        self.path = path
        self.last_pid = 0
        self.conn = None
        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        tablename = self._tablename(project)
        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (
                taskid PRIMARY KEY,
                url,
                result,
                updatetime
                )''' % tablename)

    def _parse(self, data):
        if 'result' in data:
            data['result'] = json.loads(data['result'])
        return data

    def _stringify(self, data):
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        tablename = self._tablename(project)
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self._replace(tablename, **self._stringify(obj))

    def select(self, project, fields=None, offset=0, limit=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)

        for task in self._select2dic(tablename, what=fields, order='updatetime DESC',
                                     offset=offset, limit=limit):
            yield self._parse(task)

    def count(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return 0
        tablename = self._tablename(project)
        for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)):
            return count

    def get(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        where = "`taskid` = %s" % self.placeholder
        for task in self._select2dic(tablename, what=fields,
                                     where=where, where_values=(taskid, )):
            return self._parse(task)


================================================
FILE: pyspider/database/sqlite/sqlitebase.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-11-22 20:30:44

import os
import time
import sqlite3
import threading


class SQLiteMixin(object):

    @property
    def dbcur(self):
        pid = (os.getpid(), threading.current_thread().ident)
        if not (self.conn and pid == self.last_pid):
            self.last_pid = pid
            self.conn = sqlite3.connect(self.path, isolation_level=None)
        return self.conn.cursor()


class SplitTableMixin(object):
    UPDATE_PROJECTS_TIME = 10 * 60

    def _tablename(self, project):
        if self.__tablename__:
            return '%s_%s' % (self.__tablename__, project)
        else:
            return project

    @property
    def projects(self):
        if time.time() - getattr(self, '_last_update_projects', 0) \
                > self.UPDATE_PROJECTS_TIME:
            self._list_project()
        return self._projects

    @projects.setter
    def projects(self, value):
        self._projects = value

    def _list_project(self):
        self._last_update_projects = time.time()
        self.projects = set()
        if self.__tablename__:
            prefix = '%s_' % self.__tablename__
        else:
            prefix = ''
        for project, in self._select('sqlite_master', what='name',
                                     where='type = "table"'):
            if project.startswith(prefix):
                project = project[len(prefix):]
                self.projects.add(project)

    def drop(self, project):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return
        tablename = self._tablename(project)
        self._execute("DROP TABLE %s" % self.escape(tablename))
        self._list_project()


================================================
FILE: pyspider/database/sqlite/taskdb.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-08 10:25:34

import re
import time
import json

from .sqlitebase import SQLiteMixin, SplitTableMixin
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
from pyspider.database.basedb import BaseDB


class TaskDB(SQLiteMixin, SplitTableMixin, BaseTaskDB, BaseDB):
    __tablename__ = 'taskdb'
    placeholder = '?'

    def __init__(self, path):
        self.path = path
        self.last_pid = 0
        self.conn = None
        self._list_project()

    def _create_project(self, project):
        assert re.match(r'^\w+$', project) is not None
        tablename = self._tablename(project)
        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (
                taskid PRIMARY KEY,
                project,
                url, status,
                schedule, fetch, process, track,
                lastcrawltime, updatetime
                )''' % tablename)
        self._execute(
            '''CREATE INDEX `status_%s_index` ON %s (status)'''
            % (tablename, self.escape(tablename))
        )

    def _parse(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    def load_tasks(self, status, project=None, fields=None):
        if project and project not in self.projects:
            return
        where = "status = %d" % status

        if project:
            projects = [project, ]
        else:
            projects = self.projects

        for project in projects:
            tablename = self._tablename(project)
            for each in self._select2dic(tablename, what=fields, where=where):
                yield self._parse(each)

    def get_task(self, project, taskid, fields=None):
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return None
        where = "`taskid` = %s" % self.placeholder
        if project not in self.projects:
            return None
        tablename = self._tablename(project)
        for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )):
            return self._parse(each)
        return None

    def status_count(self, project):
        '''
        return a dict
        '''
        result = dict()
        if project not in self.projects:
            self._list_project()
        if project not in self.projects:
            return result
        tablename = self._tablename(project)
        for status, count in self._execute("SELECT `status`, count(1) FROM %s GROUP BY `status`" %
                                           self.escape(tablename)):
            result[status] = count
        return result

    def insert(self, project, taskid, obj={}):
        if project not in self.projects:
            self._create_project(project)
            self._list_project()
        obj = dict(obj)
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        tablename = self._tablename(project)
        return self._insert(tablename, **self._stringify(obj))

    def update(self, project, taskid, obj={}, **kwargs):
        if project not in self.projects:
            raise LookupError
        tablename = self._tablename(project)
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self._update(
            tablename, where="`taskid` = %s" % self.placeholder, where_values=(taskid, ),
            **self._stringify(obj)
        )


================================================
FILE: pyspider/fetcher/__init__.py
================================================
from .tornado_fetcher import Fetcher


================================================
FILE: pyspider/fetcher/cookie_utils.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-14 09:07:11

from requests.cookies import MockRequest


class MockResponse(object):

    def __init__(self, headers):
        self._headers = headers

    def info(self):
        return self

    def getheaders(self, name):
        """make cookie python 2 version use this method to get cookie list"""
        return self._headers.get_list(name)

    def get_all(self, name, default=None):
        """make cookie python 3 version use this instead of getheaders"""
        if default is None:
            default = []
        return self._headers.get_list(name) or default


def extract_cookies_to_jar(jar, request, response):
    req = MockRequest(request)
    res = MockResponse(response)
    jar.extract_cookies(res, req)


================================================
FILE: pyspider/fetcher/phantomjs_fetcher.js
================================================
// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:
// Author: Binux<i@binux.me>
//         http://binux.me
// Created on 2014-10-29 22:12:14

var port, server, service,
  wait_before_end = 1000,
  system = require('system'),
  webpage = require('webpage');

if (system.args.length !== 2) {
  console.log('Usage: simpleserver.js <portnumber>');
  phantom.exit(1);
} else {
  port = system.args[1];
  server = require('webserver').create();
  console.debug = function(){};

  service = server.listen(port, {
    'keepAlive': false
  }, function (request, response) {
    phantom.clearCookies();

    //console.debug(JSON.stringify(request, null, 4));
    // check method
    if (request.method == 'GET') {
      body = "method not allowed!";
      response.statusCode = 403;
      response.headers = {
        'Cache': 'no-cache',
        'Content-Length': body.length
      };
      response.write(body);
      response.closeGracefully();
      return;
    }
    
    var first_response = null,
        finished = false,
        page_loaded = false,
        start_time = Date.now(),
        end_time = null,
        script_executed = false,
        script_result = null;

    var fetch = JSON.parse(request.postRaw);
    console.debug(JSON.stringify(fetch, null, 2));

    // create and set page
    var page = webpage.create();
    if (fetch.proxy) {
      if (fetch.proxy.indexOf('://') == -1){
        fetch.proxy = 'http://' + fetch.proxy
      }
      page.setProxy(fetch.proxy);
    }
    page.onConsoleMessage = function(msg) {
        console.log('console: ' + msg);
    };
    page.viewportSize = {
      width: fetch.js_viewport_width || 1024,
      height: fetch.js_viewport_height || 768*3
    }
    if (fetch.headers) {
      fetch.headers['Accept-Encoding'] = undefined;
      fetch.headers['Connection'] = undefined;
      fetch.headers['Content-Length'] = undefined;
    }
    if (fetch.headers && fetch.headers['User-Agent']) {
      page.settings.userAgent = fetch.headers['User-Agent'];
    }
    // this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
    page.settings.loadImages = fetch.load_images === undefined ? true : fetch.load_images;
    page.settings.resourceTimeout = fetch.timeout ? fetch.timeout * 1000 : 20*1000;
    if (fetch.headers) {
      page.customHeaders = fetch.headers;
    }

    // add callbacks
    page.onInitialized = function() {
      if (!script_executed && fetch.js_script && fetch.js_run_at === "document-start") {
        script_executed = true;
        console.log('running document-start script.');
        script_result = page.evaluateJavaScript(fetch.js_script);
      }
    };
    page.onLoadFinished = function(status) {
      page_loaded = true;
      if (!script_executed && fetch.js_script && fetch.js_run_at !== "document-start") {
        script_executed = true;
        console.log('running document-end script.');
        script_result = page.evaluateJavaScript(fetch.js_script);
      }
      console.debug("waiting "+wait_before_end+"ms before finished.");
      end_time = Date.now() + wait_before_end;
      setTimeout(make_result, wait_before_end+10, page);
    };
    page.onResourceRequested = function(request) {
      console.debug("Starting request: #"+request.id+" ["+request.method+"]"+request.url);
      end_time = null;
    };
    page.onResourceReceived = function(response) {
      console.debug("Request finished: #"+response.id+" ["+response.status+"]"+response.url);
      if (first_response === null && response.status != 301 && response.status != 302) {
        first_response = response;
      }
      if (page_loaded) {
        console.debug("waiting "+wait_before_end+"ms before finished.");
        end_time = Date.now() + wait_before_end;
        setTimeout(make_result, wait_before_end+10, page);
      }
    }
    page.onResourceError = page.onResourceTimeout=function(response) {
      console.info("Request error: #"+response.id+" ["+response.errorCode+"="+response.errorString+"]"+response.url);
      if (first_response === null) {
        first_response = response;
      }
      if (page_loaded) {
        console.debug("waiting "+wait_before_end+"ms before finished.");
        end_time = Date.now() + wait_before_end;
        setTimeout(make_result, wait_before_end+10, page);
      }
    }

    // make sure request will finished
    setTimeout(make_result, page.settings.resourceTimeout + 100, page);

    // send request
    page.open(fetch.url, {
      operation: fetch.method,
      data: fetch.data,
    });

    // make response
    function make_result(page) {
      if (finished) {
        return;
      }
      if (Date.now() - start_time < page.settings.resourceTimeout) {
        if (!!!end_time) {
          return;
        }
        if (end_time > Date.now()) {
          setTimeout(make_result, Math.min(Date.now() - end_time, 100), page);
          return;
        }
      }

      var result = {};
      try {
        result = _make_result(page);
        page.close();
        finished = true;
        console.log("["+result.status_code+"] "+result.orig_url+" "+result.time)
      } catch (e) {
        result = {
          orig_url: fetch.url,
          status_code: 599,
          error: e.toString(),
          content: page.content || "",
          headers: {},
          url: page.url || fetch.url,
          cookies: {},
          time: (Date.now() - start_time) / 1000,
          js_script_result: null,
          save: fetch.save
        }
      }

      var body = JSON.stringify(result, null, 2);
      response.writeHead(200, {
        'Cache': 'no-cache',
        'Content-Type': 'application/json',
      });
      response.write(body);
      response.closeGracefully();
    }

    function _make_result(page) {
      if (first_response === null) {
        throw "Timeout before first response.";
      }

      var cookies = {};
      page.cookies.forEach(function(e) {
        cookies[e.name] = e.value;
      });

      var headers = {};
      if (first_response.headers) {
        first_response.headers.forEach(function(e) {
          headers[e.name] = e.value;
        });
      }

      return {
        orig_url: fetch.url,
        status_code: first_response.status || 599,
        error: first_response.errorString,
        content:  page.content,
        headers: headers,
        url: page.url,
        cookies: cookies,
        time: (Date.now() - start_time) / 1000,
        js_script_result: script_result,
        save: fetch.save
      }
    }
  });

  if (service) {
    console.log('phantomjs fetcher running on port ' + port);
  } else {
    console.log('Error: Could not create web server listening on port ' + port);
    phantom.exit();
  }
}


================================================
FILE: pyspider/fetcher/puppeteer_fetcher.js
================================================
const express = require("express");
const puppeteer = require('puppeteer');
const bodyParser = require('body-parser');

const app = express();

app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false}));

let init_browser = true;
let browser_settings = {};

app.use(async (req, res, next) => {
    if (init_browser) {
        var options = req.body;
        if (options.proxy) {
            if (options.proxy.indexOf("://") == -1) {
                options.proxy = "http://" + options.proxy;
            }
            browser_settings["args"] = ['--no-sandbox', "--disable-setuid-sandbox", "--proxy-server="+options.proxy];
        } else {
          browser_settings["args"] = ['--no-sandbox', "--disable-setuid-sandbox"];
        }
        browser_settings["headless"] = options.headless === "false"? false:true
        browser = await puppeteer.launch(browser_settings);
        init_browser=false;
        console.log("init browser success!");
        next();
    } else {
        next();
    };
});


async function fetch(options) {
    var page = await browser.newPage();
    options.start_time = Date.now();
    try {
        await _fetch(page, options);
        var result = await make_result(page, options);
        await page.close();
        return result
    } catch (error) {
        console.log('catch error ', error);
        var result = await make_result(page, options, error);
        await page.close();
        return result
    }
}

async function _fetch(page, options) {

    width = options.js_viewport_width || 1024;
    height = options.js_viewport_height || 768 * 3;
    await page.setViewport({
        "width": width,
        "height": height
    });

    if (options.headers) {
        await page.setExtraHTTPHeaders(options.headers);
    }

    if (options.headers && options.headers["User-Agent"]) {
        page.setUserAgent(options.headers["User-Agent"]);
    }

    page.on("console", msg => {
        console.log('console: ' + msg.args());
    });

    // Http post method
    let first_request = true;
    let request_reseted = false;
    await page.setRequestInterception(true);
    if (options.method && options.method.toLowerCase() === "post") {
        page.on("request", interceptedRequest => {
            request_reseted = false;
            end_time = null;
            if (first_request) {
                first_request = false;
                var data = {
                    "method": "POST",
                    "postData": options.data
                };
                console.log(data);
                interceptedRequest.continue(data);
                request_reseted = true
            }
        })
    } else {
        page.on("request", interceptedRequest => {
            request_reseted = false;
            end_time = null;
        })
    }

    // load images or not
    if (options.load_images && options.load_images.toLowerCase() === "false") {
        page.on("request", request => {
            if (!!!request_reseted) {
                if (request.resourceType() === 'image')
                    request.abort();
                else
                    request.continue();
            }
        })
    } else {
        page.on("request", request => {
            if (!!!request_reseted)
                request.continue()
        })
    }

    let error_message = null;
    page.on("error", e => {
        error_message = e
    });

    let page_settings = {};
    var page_timeout = options.timeout ? options.timeout * 1000 : 20 * 1000;
    page_settings["timeout"] = page_timeout
    page_settings["waitUntil"] = ["domcontentloaded", "networkidle0"];

    console.log('goto ', options.url)
    var response = await page.goto(options.url, page_settings);

    if (error_message) {
        throw error_message
    }

    if (options.js_script) {
        console.log('running document-end script.');
        script_result = await page.evaluate(options.js_script);
        console.log("end script_result is: ", script_result);
        options.script_result = script_result
    }

    if (options.screenshot_path) {
        await page.screenshot({path: options.screenshot_path});
    }

    options.response = response
}

async function make_result(page, options, error) {
    response = options.response;

    var cookies = {};
    var tmp_cookies = await page.cookies();
    tmp_cookies.forEach(function (e) {
        cookies[e.name] = e.value;
    });

    let status_code = null;
    let headers = null;
    let page_content = null;

    if (!!!error) {
        response = options.response;
        status_code = response.status();
        headers = response.headers();
        page_content = await page.content();
    }

    return {
        orig_url: options.url,
        status_code: status_code || 599,
        error: error,
        content: page_content,
        headers: headers,
        url: page.url(),
        cookies: cookies,
        time: (Date.now() - options.start_time) / 1000,
        js_script_result: options.script_result,
        save: options.save
    }
}

app.get("/", function (request, response) {
    body = "method not allowed!";
    response.status(403);
    response.set({
        "cache": "no-cache",
        "Content-Length": body.length
    });
    response.send(body);
});


let max_open_pages = 5;
let opened_page_nums = 0;

app.post("/", async (request, response) => {
    console.log("opened pages: " + opened_page_nums);
    if (opened_page_nums >= max_open_pages){
        body = "browser pages is too many, open new browser process!";
        response.status(403);
        response.set({
            "cache": "no-cache",
            "Content-Length": body.length
        });
        response.send(body);
    } else {
        opened_page_nums += 1;
        let options = request.body;
        result = await fetch(options);
        opened_page_nums -= 1;
        response.send(result)
    }
});


let port = 22222;

if (process.argv.length === 3) {
    port = parseInt(process.argv[2])
}

app.listen(port, function () {
    console.log("puppeteer fetcher running on port " + port);
});


================================================
FILE: pyspider/fetcher/splash_fetcher.lua
================================================
--#! /usr/bin/env lua
--
-- splash_fetcher.lua
-- Copyright (C) 2016 Binux <roy@binux.me>
--
-- Distributed under terms of the Apache license, version 2.0.
--

json = require("json")

function render(splash, fetch)
    local debug = true
    local function log_message(message, level)
        if debug or level ~= nil then
            print(message)
        end
    end
    if not splash.with_timeout then
        function with_timeout(self, func, timeout)
            return true, func()
        end
        splash.with_timeout = with_timeout
    end

    log_message(json.encode(fetch))

    -- create and set page
    local start_time = os.time()

    splash:clear_cookies()
    splash:autoload_reset()
    splash:on_request_reset()
    splash:on_response_reset()

    splash:set_viewport_size(fetch.js_viewport_width or 1024, fetch.js_viewport_height or 768 * 3)
    if fetch.headers and fetch.headers["User-Agent"] ~= nil then
        splash:set_user_agent(fetch.headers["User-Agent"])
    end
    if fetch.headers then
        fetch.headers['Accept-Encoding'] = nil
        fetch.headers['Connection'] = nil
        fetch.headers['Content-Length'] = nil
        splash:set_custom_headers(fetch.headers)
    end
    splash.images_enabled = (fetch.load_images == true)
    splash.resource_timeout = math.min((fetch.timeout or 20), 58)
    fetch.timeout = splash.resource_timeout

    local wait_before_end = 1.0;
    local end_time = start_time + fetch.timeout - 0.1
    

    -- callbacks
    splash:on_request(function(request)
        -- wait for new request
        end_time = start_time + fetch.timeout - 0.1
        log_message("Starting request: [" .. tostring(request.method) .. "]" .. tostring(request.url))

        if fetch.proxy_host and fetch.proxy_port then
            request:set_proxy({
                host = fetch.proxy_host,
                port = tonumber(fetch.proxy_port),
                username = fetch.proxy_username,
                password = fetch.proxy_password,
                type = 'HTTP'
            })
        end
    end)

    local first_response = nil
    splash:on_response(function(response)
        if first_response == nil then
            first_response = response
        end
        -- wait for some other respond and render
        end_time = math.min(os.time() + wait_before_end + 0.1, start_time + fetch.timeout - 0.1)
        log_message("Request finished: [" .. tostring(response.status) .. "]" .. tostring(response.url))
    end)

    -- send request
    local js_script_result = nil
    local timeout_ok, ok, reason = splash:with_timeout(function()
        local js_script = nil
        if fetch.js_script then
            ok, js_script = pcall(function()
                return splash:jsfunc(fetch.js_script)
            end)
            if not ok then
                log_message("js_script error: " .. tostring(js_script), 1)
                js_script = nil
            end
        end

        if js_script and fetch.js_run_at == "document-start" then
            log_message("running document-start script.");
            ok, js_script_result = pcall(js_script)
            if not ok then
                log_message("running document-start script error: " .. tostring(js_script_result), 1)
            end
        end

        local ok, reason = splash:go{url=fetch.url, http_method=fetch.method, body=fetch.data}
        end_time = math.min(os.time() + wait_before_end + 0.1, start_time + fetch.timeout - 0.1)

        if js_script and fetch.js_run_at ~= "document-start" then
            splash:wait(0.5)
            log_message("running document-end script.");
            ok, js_script_result = pcall(js_script)
            if not ok then
                log_message("running document-end script error: " .. tostring(js_script_result), 1)
            end
        end

        -- wait for all requests finished
        local now = os.time()
        while now <= end_time do
            splash:wait(math.min(end_time - now, 0.1))
            now = os.time()
        end

        return ok, reason
    end, fetch.timeout + 0.1)

    -- make response
    local cookies = {}
    for i, c in ipairs(splash:get_cookies()) do
        cookies[c.name] = c.value
    end
    if (not timeout_ok and first_response.ok) or (timeok and ok) then
        return {
            orig_url = fetch.url,
            status_code = first_response.status == 0 and 599 or first_response.status,
            error = nil,
            content = splash:html(),
            headers = first_response.headers,
            url = splash:url(),
            cookies = cookies,
            time = os.time() - start_time,
            js_script_result = js_script_result and tostring(js_script_result),
            save = fetch.save
        }
    else
        if first_response then
            return {
                orig_url = fetch.url,
                status_code = first_response.status == 0 and 599 or first_response.status,
                error = reason,
                content = splash:html(),
                headers = first_response.headers,
                url = splash:url(),
                cookies = cookies,
                time = os.time() - start_time,
                js_script_result = js_script_result and tostring(js_script_result),
                save = fetch.save
            }
        else
            return {
                orig_url = fetch.url,
                status_code = 599,
                error = reason,
                content = splash:html(),
                headers = {},
                url = splash:url(),
                cookies = cookies,
                time = os.time() - start_time,
                js_script_result = js_script_result and tostring(js_script_result),
                save = fetch.save
            }
        end
    end

end

function main(splash)
    local fetch = splash.args
    local start_time = os.time()

    ok, result = pcall(function()
        return render(splash, fetch)
    end)

    if ok then
        return result
    else
        return {
            orig_url = fetch.url,
            status_code = 599,
            error = result,
            content = splash:html(),
            headers = {},
            url = splash:url(),
            cookies = {},
            time = os.time() - start_time,
            js_script_result = nil,
            save = fetch.save
        }
    end
end


================================================
FILE: pyspider/fetcher/tornado_fetcher.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-12-17 11:07:19

from __future__ import unicode_literals

import os
import sys
import six
import copy
import time
import json
import logging
import traceback
import functools
import threading
import tornado.ioloop
import tornado.httputil
import tornado.httpclient
import pyspider

from six.moves import queue, http_cookies
from six.moves.urllib.robotparser import RobotFileParser
from requests import cookies
from six.moves.urllib.parse import urljoin, urlsplit
from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.simple_httpclient import SimpleAsyncHTTPClient

from pyspider.libs import utils, dataurl, counter
from pyspider.libs.url import quote_chinese
from .cookie_utils import extract_cookies_to_jar
logger = logging.getLogger('fetcher')


class MyCurlAsyncHTTPClient(CurlAsyncHTTPClient):

    def free_size(self):
        return len(self._free_list)

    def size(self):
        return len(self._curls) - self.free_size()


class MySimpleAsyncHTTPClient(SimpleAsyncHTTPClient):

    def free_size(self):
        return self.max_clients - self.size()

    def size(self):
        return len(self.active)

fetcher_output = {
    "status_code": int,
    "orig_url": str,
    "url": str,
    "headers": dict,
    "content": str,
    "cookies": dict,
}


class Fetcher(object):
    user_agent = "pyspider/%s (+http://pyspider.org/)" % pyspider.__version__
    default_options = {
        'method': 'GET',
        'headers': {
        },
        'use_gzip': True,
        'timeout': 120,
        'connect_timeout': 20,
    }
    phantomjs_proxy = None
    splash_endpoint = None
    splash_lua_source = open(os.path.join(os.path.dirname(__file__), "splash_fetcher.lua")).read()
    robot_txt_age = 60*60  # 1h

    def __init__(self, inqueue, outqueue, poolsize=100, proxy=None, async_mode=True):
        self.inqueue = inqueue
        self.outqueue = outqueue

        self.poolsize = poolsize
        self._running = False
        self._quit = False
        self.proxy = proxy
        self.async_mode = async_mode
        self.ioloop = tornado.ioloop.IOLoop()

        self.robots_txt_cache = {}

        # binding io_loop to http_client here
        if self.async_mode:
            self.http_client = MyCurlAsyncHTTPClient(max_clients=self.poolsize,
                                                     io_loop=self.ioloop)
        else:
            self.http_client = tornado.httpclient.HTTPClient(MyCurlAsyncHTTPClient, max_clients=self.poolsize)

        self._cnt = {
            '5m': counter.CounterManager(
                lambda: counter.TimebaseAverageWindowCounter(30, 10)),
            '1h': counter.CounterManager(
                lambda: counter.TimebaseAverageWindowCounter(60, 60)),
        }

    def send_result(self, type, task, result):
        '''Send fetch result to processor'''
        if self.outqueue:
            try:
                self.outqueue.put((task, result))
            except Exception as e:
                logger.exception(e)

    def fetch(self, task, callback=None):
        if self.async_mode:
            return self.async_fetch(task, callback)
        else:
            return self.async_fetch(task, callback).result()

    @gen.coroutine
    def async_fetch(self, task, callback=None):
        '''Do one fetch'''
        url = task.get('url', 'data:,')
        if callback is None:
            callback = self.send_result

        type = 'None'
        start_time = time.time()
        try:
            if url.startswith('data:'):
                type = 'data'
                result = yield gen.maybe_future(self.data_fetch(url, task))
            elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'):
                type = 'phantomjs'
                result = yield self.phantomjs_fetch(url, task)
            elif task.get('fetch', {}).get('fetch_type') in ('splash', ):
                type = 'splash'
                result = yield self.splash_fetch(url, task)
            elif task.get('fetch', {}).get('fetch_type') in ('puppeteer', ):
                type = 'puppeteer'
                result = yield self.puppeteer_fetch(url, task)
            else:
                type = 'http'
                result = yield self.http_fetch(url, task)
        except Exception as e:
            logger.exception(e)
            result = self.handle_error(type, url, task, start_time, e)

        callback(type, task, result)
        self.on_result(type, task, result)
        raise gen.Return(result)

    def sync_fetch(self, task):
        '''Synchronization fetch, usually used in xmlrpc thread'''
        if not self._running:
            return self.ioloop.run_sync(functools.partial(self.async_fetch, task, lambda t, _, r: True))

        wait_result = threading.Condition()
        _result = {}

        def callback(type, task, result):
            wait_result.acquire()
            _result['type'] = type
            _result['task'] = task
            _result['result'] = result
            wait_result.notify()
            wait_result.release()

        wait_result.acquire()
        self.ioloop.add_callback(self.fetch, task, callback)
        while 'result' not in _result:
            wait_result.wait()
        wait_result.release()
        return _result['result']

    def data_fetch(self, url, task):
        '''A fake fetcher for dataurl'''
        self.on_fetch('data', task)
        result = {}
        result['orig_url'] = url
        result['content'] = dataurl.decode(url)
        result['headers'] = {}
        result['status_code'] = 200
        result['url'] = url
        result['cookies'] = {}
        result['time'] = 0
        result['save'] = task.get('fetch', {}).get('save')
        if len(result['content']) < 70:
            logger.info("[200] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
        else:
            logger.info(
                "[200] %s:%s data:,%s...[content:%d] 0s",
                task.get('project'), task.get('taskid'),
                result['content'][:70],
                len(result['content'])
            )

        return result

    def handle_error(self, type, url, task, start_time, error):
        result = {
            'status_code': getattr(error, 'code', 599),
            'error': utils.text(error),
            'traceback': traceback.format_exc() if sys.exc_info()[0] else None,
            'content': "",
            'time': time.time() - start_time,
            'orig_url': url,
            'url': url,
            "save": task.get('fetch', {}).get('save')
        }
        logger.error("[%d] %s:%s %s, %r %.2fs",
                     result['status_code'], task.get('project'), task.get('taskid'),
                     url, error, result['time'])
        return result

    allowed_options = ['method', 'data', 'connect_timeout', 'timeout', 'cookies', 'use_gzip', 'validate_cert']

    def pack_tornado_request_parameters(self, url, task):
        fetch = copy.deepcopy(self.default_options)
        fetch['url'] = url
        fetch['headers'] = tornado.httputil.HTTPHeaders(fetch['headers'])
        fetch['headers']['User-Agent'] = self.user_agent
        task_fetch = task.get('fetch', {})
        for each in self.allowed_options:
            if each in task_fetch:
                fetch[each] = task_fetch[each]
        fetch['headers'].update(task_fetch.get('headers', {}))

        if task.get('track'):
            track_headers = tornado.httputil.HTTPHeaders(
                task.get('track', {}).get('fetch', {}).get('headers') or {})
            track_ok = task.get('track', {}).get('process', {}).get('ok', False)
        else:
            track_headers = {}
            track_ok = False
        # proxy
        proxy_string = None
        if isinstance(task_fetch.get('proxy'), six.string_types):
            proxy_string = task_fetch['proxy']
        elif self.proxy and task_fetch.get('proxy', True):
            proxy_string = self.proxy
        if proxy_string:
            if '://' not in proxy_string:
                proxy_string = 'http://' + proxy_string
            proxy_splited = urlsplit(proxy_string)
            fetch['proxy_host'] = proxy_splited.hostname
            if proxy_splited.username:
                fetch['proxy_username'] = proxy_splited.username
            if proxy_splited.password:
                fetch['proxy_password'] = proxy_splited.password
            if six.PY2:
                for key in ('proxy_host', 'proxy_username', 'proxy_password'):
                    if key in fetch:
                        fetch[key] = fetch[key].encode('utf8')
            fetch['proxy_port'] = proxy_splited.port or 8080

        # etag
        if task_fetch.get('etag', True):
            _t = None
            if isinstance(task_fetch.get('etag'), six.string_types):
                _t = task_fetch.get('etag')
            elif track_ok:
                _t = track_headers.get('etag')
            if _t and 'If-None-Match' not in fetch['headers']:
                fetch['headers']['If-None-Match'] = _t
        # last modifed
        if task_fetch.get('last_modified', task_fetch.get('last_modifed', True)):
            last_modified = task_fetch.get('last_modified', task_fetch.get('last_modifed', True))
            _t = None
            if isinstance(last_modified, six.string_types):
                _t = last_modified
            elif track_ok:
                _t = track_headers.get('last-modified')
            if _t and 'If-Modified-Since' not in fetch['headers']:
                fetch['headers']['If-Modified-Since'] = _t
        # timeout
        if 'timeout' in fetch:
            fetch['request_timeout'] = fetch['timeout']
            del fetch['timeout']
        # data rename to body
        if 'data' in fetch:
            fetch['body'] = fetch['data']
            del fetch['data']

        return fetch

    @gen.coroutine
    def can_fetch(self, user_agent, url):
        parsed = urlsplit(url)
        domain = parsed.netloc
        if domain in self.robots_txt_cache:
            robot_txt = self.robots_txt_cache[domain]
            if time.time() - robot_txt.mtime() > self.robot_txt_age:
                robot_txt = None
        else:
            robot_txt = None

        if robot_txt is None:
            robot_txt = RobotFileParser()
            try:
                response = yield gen.maybe_future(self.http_client.fetch(
                    urljoin(url, '/robots.txt'), connect_timeout=10, request_timeout=30))
                content = response.body
            except tornado.httpclient.HTTPError as e:
                logger.error('load robots.txt from %s error: %r', domain, e)
                content = ''

            try:
                content = content.decode('utf8', 'ignore')
            except UnicodeDecodeError:
                content = ''

            robot_txt.parse(content.splitlines())
            self.robots_txt_cache[domain] = robot_txt

        raise gen.Return(robot_txt.can_fetch(user_agent, url))

    def clear_robot_txt_cache(self):
        now = time.time()
        for domain, robot_txt in self.robots_txt_cache.items():
            if now - robot_txt.mtime() > self.robot_txt_age:
                del self.robots_txt_cache[domain]

    @gen.coroutine
    def http_fetch(self, url, task):
        '''HTTP fetcher'''
        start_time = time.time()
        self.on_fetch('http', task)
        handle_error = lambda x: self.handle_error('http', url, task, start_time, x)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})

        session = cookies.RequestsCookieJar()
        # fix for tornado request obj
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        max_redirects = task_fetch.get('max_redirects', 5)
        # we will handle redirects by hand to capture cookies
        fetch['follow_redirects'] = False

        # making requests
        while True:
            # robots.txt
            if task_fetch.get('robots_txt', False):
                can_fetch = yield self.can_fetch(fetch['headers']['User-Agent'], fetch['url'])
                if not can_fetch:
                    error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                    raise gen.Return(handle_error(error))

            try:
                request = tornado.httpclient.HTTPRequest(**fetch)
                # if cookie already in header, get_cookie_header wouldn't work
                old_cookie_header = request.headers.get('Cookie')
                if old_cookie_header:
                    del request.headers['Cookie']
                cookie_header = cookies.get_cookie_header(session, request)
                if cookie_header:
                    request.headers['Cookie'] = cookie_header
                elif old_cookie_header:
                    request.headers['Cookie'] = old_cookie_header
            except Exception as e:
                logger.exception(fetch)
                raise gen.Return(handle_error(e))

            try:
                response = yield gen.maybe_future(self.http_client.fetch(request))
            except tornado.httpclient.HTTPError as e:
                if e.response:
                    response = e.response
                else:
                    raise gen.Return(handle_error(e))

            extract_cookies_to_jar(session, response.request, response.headers)
            if (response.code in (301, 302, 303, 307)
                    and response.headers.get('Location')
                    and task_fetch.get('allow_redirects', True)):
                if max_redirects <= 0:
                    error = tornado.httpclient.HTTPError(
                        599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5),
                        response)
                    raise gen.Return(handle_error(error))
                if response.code in (302, 303):
                    fetch['method'] = 'GET'
                    if 'body' in fetch:
                        del fetch['body']
                fetch['url'] = quote_chinese(urljoin(fetch['url'], response.headers['Location']))
                fetch['request_timeout'] -= time.time() - start_time
                if fetch['request_timeout'] < 0:
                    fetch['request_timeout'] = 0.1
                max_redirects -= 1
                continue

            result = {}
            result['orig_url'] = url
            result['content'] = response.body or ''
            result['headers'] = dict(response.headers)
            result['status_code'] = response.code
            result['url'] = response.effective_url or url
            result['time'] = time.time() - start_time
            result['cookies'] = session.get_dict()
            result['save'] = task_fetch.get('save')
            if response.error:
                result['error'] = utils.text(response.error)
            if 200 <= response.code < 300:
                logger.info("[%d] %s:%s %s %.2fs", response.code,
                            task.get('project'), task.get('taskid'),
                            url, result['time'])
            else:
                logger.warning("[%d] %s:%s %s %.2fs", response.code,
                               task.get('project'), task.get('taskid'),
                               url, result['time'])

            raise gen.Return(result)

    @gen.coroutine
    def phantomjs_fetch(self, url, task):
        '''Fetch with phantomjs proxy'''
        start_time = time.time()
        self.on_fetch('phantomjs', task)
        handle_error = lambda x: self.handle_error('phantomjs', url, task, start_time, x)

        # check phantomjs proxy is enabled
        if not self.phantomjs_proxy:
            result = {
                "orig_url": url,
                "content": "phantomjs is not enabled.",
                "headers": {},
                "status_code": 501,
                "url": url,
                "time": time.time() - start_time,
                "cookies": {},
                "save": task.get('fetch', {}).get('save')
            }
            logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
            raise gen.Return(result)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})
        for each in task_fetch:
            if each not in fetch:
                fetch[each] = task_fetch[each]

        # robots.txt
        if task_fetch.get('robots_txt', False):
            user_agent = fetch['headers']['User-Agent']
            can_fetch = yield self.can_fetch(user_agent, url)
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        request_conf = {
            'follow_redirects': False
        }
        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

        session = cookies.RequestsCookieJar()
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        request = tornado.httpclient.HTTPRequest(url=fetch['url'])
        cookie_header = cookies.get_cookie_header(session, request)
        if cookie_header:
            fetch['headers']['Cookie'] = cookie_header

        # making requests
        fetch['headers'] = dict(fetch['headers'])
        try:
            request = tornado.httpclient.HTTPRequest(
                url=self.phantomjs_proxy, method="POST",
                body=json.dumps(fetch), **request_conf)
        except Exception as e:
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        if not response.body:
            raise gen.Return(handle_error(Exception('no response from phantomjs: %r' % response)))

        result = {}
        try:
            result = json.loads(utils.text(response.body))
            assert 'status_code' in result, result
        except Exception as e:
            if response.error:
                result['error'] = utils.text(response.error)
            raise gen.Return(handle_error(e))

        if result.get('status_code', 200):
            logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                        task.get('project'), task.get('taskid'), url, result['time'])
        else:
            logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                         task.get('project'), task.get('taskid'),
                         url, result['content'], result['time'])

        raise gen.Return(result)

    @gen.coroutine
    def splash_fetch(self, url, task):
        '''Fetch with splash'''
        start_time = time.time()
        self.on_fetch('splash', task)
        handle_error = lambda x: self.handle_error('splash', url, task, start_time, x)

        # check phantomjs proxy is enabled
        if not self.splash_endpoint:
            result = {
                "orig_url": url,
                "content": "splash is not enabled.",
                "headers": {},
                "status_code": 501,
                "url": url,
                "time": time.time() - start_time,
                "cookies": {},
                "save": task.get('fetch', {}).get('save')
            }
            logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
            raise gen.Return(result)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})
        for each in task_fetch:
            if each not in fetch:
                fetch[each] = task_fetch[each]

        # robots.txt
        if task_fetch.get('robots_txt', False):
            user_agent = fetch['headers']['User-Agent']
            can_fetch = yield self.can_fetch(user_agent, url)
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        request_conf = {
            'follow_redirects': False,
            'headers': {
                'Content-Type': 'application/json',
            }
        }
        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

        session = cookies.RequestsCookieJar()
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        request = tornado.httpclient.HTTPRequest(url=fetch['url'])
        cookie_header = cookies.get_cookie_header(session, request)
        if cookie_header:
            fetch['headers']['Cookie'] = cookie_header

        # making requests
        fetch['lua_source'] = self.splash_lua_source
        fetch['headers'] = dict(fetch['headers'])
        try:
            request = tornado.httpclient.HTTPRequest(
                url=self.splash_endpoint, method="POST",
                body=json.dumps(fetch), **request_conf)
        except Exception as e:
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        if not response.body:
            raise gen.Return(handle_error(Exception('no response from phantomjs')))

        result = {}
        try:
            result = json.loads(utils.text(response.body))
            assert 'status_code' in result, result
        except ValueError as e:
            logger.error("result is not json: %r", response.body[:500])
            raise gen.Return(handle_error(e))
        except Exception as e:
            if response.error:
                result['error'] = utils.text(response.error)
            raise gen.Return(handle_error(e))

        if result.get('status_code', 200):
            logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                        task.get('project'), task.get('taskid'), url, result['time'])
        else:
            logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                         task.get('project'), task.get('taskid'),
                         url, result['content'], result['time'])

        raise gen.Return(result)

    @gen.coroutine
    def puppeteer_fetch(self, url, task):
        '''Fetch with puppeteer proxy'''
        start_time = time.time()
        self.on_fetch('puppeteer', task)
        handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)

        # check puppeteer proxy is enabled
        if not self.puppeteer_proxy:
            result = {
                "orig_url": url,
                "content": "puppeteer is not enabled.",
                "headers": {},
                "status_code": 501,
                "url": url,
                "time": time.time() - start_time,
                "cookies": {},
                "save": task.get('fetch', {}).get('save')
            }
            logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
            raise gen.Return(result)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})
        for each in task_fetch:
            if each not in fetch:
                fetch[each] = task_fetch[each]

        # robots.txt
        if task_fetch.get('robots_txt', False):
            user_agent = fetch['headers']['User-Agent']
            can_fetch = yield self.can_fetch(user_agent, url)
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        request_conf = {
            'follow_redirects': False
        }
        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

        session = cookies.RequestsCookieJar()
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        request = tornado.httpclient.HTTPRequest(url=fetch['url'])
        cookie_header = cookies.get_cookie_header(session, request)
        if cookie_header:
            fetch['headers']['Cookie'] = cookie_header

        logger.info("%s", self.puppeteer_proxy)
        # making requests
        fetch['headers'] = dict(fetch['headers'])
        headers = {}
        headers['Content-Type'] = 'application/json; charset=UTF-8'
        try:
            request = tornado.httpclient.HTTPRequest(
                url=self.puppeteer_proxy, method="POST", headers=headers,
                body=json.dumps(fetch), **request_conf)
        except Exception as e:
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        if not response.body:
            raise gen.Return(handle_error(Exception('no response from puppeteer: %r' % response)))

        result = {}
        try:
            result = json.loads(utils.text(response.body))
            assert 'status_code' in result, result
        except Exception as e:
            if response.error:
                result['error'] = utils.text(response.error)
            raise gen.Return(handle_error(e))

        if result.get('status_code', 200):
            logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                        task.get('project'), task.get('taskid'), url, result['time'])
        else:
            logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                         task.get('project'), task.get('taskid'),
                         url, result['content'], result['time'])

        raise gen.Return(result)

    def run(self):
        '''Run loop'''
        logger.info("fetcher starting...")

        def queue_loop():
            if not self.outqueue or not self.inqueue:
                return
            while not self._quit:
                try:
                    if self.outqueue.full():
                        break
                    if self.http_client.free_size() <= 0:
                        break
                    task = self.inqueue.get_nowait()
                    # FIXME: decode unicode_obj should used after data selete from
                    # database, it's used here for performance
                    task = utils.decode_unicode_obj(task)
                    self.fetch(task)
                except queue.Empty:
                    break
                except KeyboardInterrupt:
                    break
                except Exception as e:
                    logger.exception(e)
                    break

        tornado.ioloop.PeriodicCallback(queue_loop, 100, io_loop=self.ioloop).start()
        tornado.ioloop.PeriodicCallback(self.clear_robot_txt_cache, 10000, io_loop=self.ioloop).start()
        self._running = True

        try:
            self.ioloop.start()
        except KeyboardInterrupt:
            pass

        logger.info("fetcher exiting...")

    def quit(self):
        '''Quit fetcher'''
        self._running = False
        self._quit = True
        self.ioloop.add_callback(self.ioloop.stop)
        if hasattr(self, 'xmlrpc_server'):
            self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop)
            self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop)

    def size(self):
        return self.http_client.size()

    def xmlrpc_run(self, port=24444, bind='127.0.0.1', logRequests=False):
        '''Run xmlrpc server'''
        import umsgpack
        from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication
        try:
            from xmlrpc.client import Binary
        except ImportError:
            from xmlrpclib import Binary

        application = WSGIXMLRPCApplication()

        application.register_function(self.quit, '_quit')
        application.register_function(self.size)

        def sync_fetch(task):
            result = self.sync_fetch(task)
            result = Binary(umsgpack.packb(result))
            return result
        application.register_function(sync_fetch, 'fetch')

        def dump_counter(_time, _type):
            return self._cnt[_time].to_dict(_type)
        application.register_function(dump_counter, 'counter')

        import tornado.wsgi
        import tornado.ioloop
        import tornado.httpserver

        container = tornado.wsgi.WSGIContainer(application)
        self.xmlrpc_ioloop = tornado.ioloop.IOLoop()
        self.xmlrpc_server = tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop)
        self.xmlrpc_server.listen(port=port, address=bind)
        logger.info('fetcher.xmlrpc listening on %s:%s', bind, port)
        self.xmlrpc_ioloop.start()

    def on_fetch(self, type, task):
        '''Called before task fetch'''
        logger.info('on fetch %s:%s', type, task)

    def on_result(self, type, task, result):
        '''Called after task fetched'''
        status_code = result.get('status_code', 599)
        if status_code != 599:
            status_code = (int(status_code) / 100 * 100)
        self._cnt['5m'].event((task.get('project'), status_code), +1)
        self._cnt['1h'].event((task.get('project'), status_code), +1)

        if type in ('http', 'phantomjs') and result.get('time'):
            content_len = len(result.get('content', ''))
            self._cnt['5m'].event((task.get('project'), 'speed'),
                                  float(content_len) / result.get('time'))
            self._cnt['1h'].event((task.get('project'), 'speed'),
                                  float(content_len) / result.get('time'))
            self._cnt['5m'].event((task.get('project'), 'time'), result.get('time'))
            self._cnt['1h'].event((task.get('project'), 'time'), result.get('time'))


================================================
FILE: pyspider/libs/ListIO.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-26 23:41:51


class ListO(object):

    """A StringO write to list."""

    def __init__(self, buffer=None):
        self._buffer = buffer
        if self._buffer is None:
            self._buffer = []

    def isatty(self):
        return False

    def close(self):
        pass

    def flush(self):
        pass

    def seek(self, n, mode=0):
        pass

    def readline(self):
        pass

    def reset(self):
        pass

    def write(self, x):
        self._buffer.append(x)

    def writelines(self, x):
        self._buffer.extend(x)


================================================
FILE: pyspider/libs/__init__.py
================================================


================================================
FILE: pyspider/libs/base_handler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-16 23:12:48

import sys
import inspect
import functools
import fractions

import six
from six import add_metaclass, iteritems

from pyspider.libs.url import (
    quote_chinese, _build_url, _encode_params,
    _encode_multipart_formdata, curl_to_arguments)
from pyspider.libs.utils import md5string, timeout
from pyspider.libs.ListIO import ListO
from pyspider.libs.response import rebuild_response
from pyspider.libs.pprint import pprint
from pyspider.processor import ProcessorResult


def catch_status_code_error(func):
    """
    Non-200 response will been regarded as fetch failed and will not pass to callback.
    Use this decorator to override this feature.
    """
    func._catch_status_code_error = True
    return func


def not_send_status(func):
    """
    Do not send process status package back to scheduler.

    It's used by callbacks like on_message, on_result etc...
    """
    @functools.wraps(func)
    def wrapper(self, response, task):
        self._extinfo['not_send_status'] = True
        function = func.__get__(self, self.__class__)
        return self._run_func(function, response, task)
    return wrapper


def config(_config=None, **kwargs):
    """
    A decorator for setting the default kwargs of `BaseHandler.crawl`.
    Any self.crawl with this callback will use this config.
    """
    if _config is None:
        _config = {}
    _config.update(kwargs)

    def wrapper(func):
        func._config = _config
        return func
    return wrapper


class NOTSET(object):
    pass


def every(minutes=NOTSET, seconds=NOTSET):
    """
    method will been called every minutes or seconds
    """
    def wrapper(func):
        # mark the function with variable 'is_cronjob=True', the function would be
        # collected into the list Handler._cron_jobs by meta class
        func.is_cronjob = True

        # collect interval and unify to seconds, it's used in meta class. See the
        # comments in meta class.
        func.tick = minutes * 60 + seconds
        return func

    if inspect.isfunction(minutes):
        func = minutes
        minutes = 1
        seconds = 0
        return wrapper(func)

    if minutes is NOTSET:
        if seconds is NOTSET:
            minutes = 1
            seconds = 0
        else:
            minutes = 0
    if seconds is NOTSET:
        seconds = 0

    return wrapper


class BaseHandlerMeta(type):

    def __new__(cls, name, bases, attrs):
        # A list of all functions which is marked as 'is_cronjob=True'
        cron_jobs = []

        # The min_tick is the greatest common divisor(GCD) of the interval of cronjobs
        # this value would be queried by scheduler when the project initial loaded.
        # Scheudler may only send _on_cronjob task every min_tick seconds. It can reduce
        # the number of tasks sent from scheduler.
        min_tick = 0

        for each in attrs.values():
            if inspect.isfunction(each) and getattr(each, 'is_cronjob', False):
                cron_jobs.append(each)
                min_tick = fractions.gcd(min_tick, each.tick)
        newcls = type.__new__(cls, name, bases, attrs)
        newcls._cron_jobs = cron_jobs
        newcls._min_tick = min_tick
        return newcls


@add_metaclass(BaseHandlerMeta)
class BaseHandler(object):
    """
    BaseHandler for all scripts.

    `BaseHandler.run` is the main method to handler the task.
    """
    crawl_config = {}
    project_name = None
    _cron_jobs = []
    _min_tick = 0
    __env__ = {'not_inited': True}
    retry_delay = {}

    def _reset(self):
        """
        reset before each task
        """
        self._extinfo = {}
        self._messages = []
        self._follows = []
        self._follows_keys = set()

    def _run_func(self, function, *arguments):
        """
        Running callback function with requested number of arguments
        """
        args, varargs, keywords, defaults = inspect.getargspec(function)
        task = arguments[-1]
        process_time_limit = task['process'].get('process_time_limit',
                                                 self.__env__.get('process_time_limit', 0))
        if process_time_limit > 0:
            with timeout(process_time_limit, 'process timeout'):
                ret = function(*arguments[:len(args) - 1])
        else:
            ret = function(*arguments[:len(args) - 1])
        return ret

    def _run_task(self, task, response):
        """
        Finding callback specified by `task['callback']`
        raising status error for it if needed.
        """
        process = task.get('process', {})
        callback = process.get('callback', '__call__')
        if not hasattr(self, callback):
            raise NotImplementedError("self.%s() not implemented!" % callback)

        function = getattr(self, callback)
        # do not run_func when 304
        if response.status_code == 304 and not getattr(function, '_catch_status_code_error', False):
            return None
        if not getattr(function, '_catch_status_code_error', False):
            response.raise_for_status()
        return self._run_func(function, response, task)

    def run_task(self, module, task, response):
        """
        Processing the task, catching exceptions and logs, return a `ProcessorResult` object
        """
        self.logger = logger = module.logger
        result = None
        exception = None
        stdout = sys.stdout
        self.task = task
        if isinstance(response, dict):
            response = rebuild_response(response)
        self.response = response
        self.save = (task.get('track') or {}).get('save', {})

        try:
            if self.__env__.get('enable_stdout_capture', True):
                sys.stdout = ListO(module.log_buffer)
            self._reset()
            result = self._run_task(task, response)
            if inspect.isgenerator(result):
                for r in result:
                    self._run_func(self.on_result, r, response, task)
            else:
                self._run_func(self.on_result, result, response, task)
        except Exception as e:
            logger.exception(e)
            exception = e
        finally:
            follows = self._follows
            messages = self._messages
            logs = list(module.log_buffer)
            extinfo = self._extinfo
            save = self.save

            sys.stdout = stdout
            self.task = None
            self.response = None
            self.save = None

        module.log_buffer[:] = []
        return ProcessorResult(result, follows, messages, logs, exception, extinfo, save)

    schedule_fields = ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update', 'auto_recrawl', 'cancel')
    fetch_fields = ('method', 'headers', 'user_agent', 'data', 'connect_timeout', 'timeout', 'allow_redirects', 'cookies',
                    'proxy', 'etag', 'last_modifed', 'last_modified', 'save', 'js_run_at', 'js_script',
                    'js_viewport_width', 'js_viewport_height', 'load_images', 'fetch_type', 'use_gzip', 'validate_cert',
                    'max_redirects', 'robots_txt')
    process_fields = ('callback', 'process_time_limit')

    @staticmethod
    def task_join_crawl_config(task, crawl_config):
        task_fetch = task.get('fetch', {})
        for k in BaseHandler.fetch_fields:
            if k in crawl_config:
                v = crawl_config[k]
                if isinstance(v, dict) and isinstance(task_fetch.get(k), dict):
                    v = dict(v)
                    v.update(task_fetch[k])
                    task_fetch[k] = v
                else:
                    task_fetch.setdefault(k, v)
        if task_fetch:
            task['fetch'] = task_fetch

        task_process = task.get('process', {})
        for k in BaseHandler.process_fields:
            if k in crawl_config:
                v = crawl_config[k]
                if isinstance(v, dict) and isinstance(task_process.get(k), dict):
                    task_process[k].update(v)
                else:
                    task_process.setdefault(k, v)
        if task_process:
            task['process'] = task_process

        return task

    def _crawl(self, url, **kwargs):
        """
        real crawl API

        checking kwargs, and repack them to each sub-dict
        """
        task = {}

        assert len(url) < 1024, "Maximum (1024) URL length error."

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(self, callback):
                func = getattr(self, callback)
            elif six.callable(callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            elif six.callable(callback) and hasattr(self, callback.__name__):
                func = getattr(self, callback.__name__)
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                        kwargs[k].update(v)
                    else:
                        kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                            kwargs.pop('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        if kwargs.get('user_agent'):
            kwargs.setdefault('headers', {})
            kwargs['headers']['User-Agent'] = kwargs.get('user_agent')

        schedule = {}
        for key in self.schedule_fields:
            if key in kwargs:
                schedule[key] = kwargs.pop(key)
            elif key in self.crawl_config:
                schedule[key] = self.crawl_config[key]

        task['schedule'] = schedule

        fetch = {}
        for key in self.fetch_fields:
            if key in kwargs:
                fetch[key] = kwargs.pop(key)
        task['fetch'] = fetch

        process = {}
        for key in self.process_fields:
            if key in kwargs:
                process[key] = kwargs.pop(key)
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        if 'taskid' in kwargs:
            task['taskid'] = kwargs.pop('taskid')
        else:
            task['taskid'] = self.get_taskid(task)

        if kwargs:
            raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

        if self.is_debugger():
            task = self.task_join_crawl_config(task, self.crawl_config)

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task

    def get_taskid(self, task):
        '''Generate taskid by information of task md5(url) by default, override me'''
        return md5string(task['url'])

    # apis
    def crawl(self, url, **kwargs):
        '''
        available params:
          url
          callback

          method
          params
          data
          files
          headers
          timeout
          allow_redirects
          cookies
          proxy
          etag
          last_modified
          auto_recrawl

          fetch_type
          js_run_at
          js_script
          js_viewport_width
          js_viewport_height
          load_images

          priority
          retries
          exetime
          age
          itag
          cancel

          save
          taskid

          full documents: http://pyspider.readthedocs.org/en/latest/apis/self.crawl/
        '''

        if isinstance(url, six.string_types) and url.startswith('curl '):
            curl_kwargs = curl_to_arguments(url)
            url = curl_kwargs.pop('urls')
            for k, v in iteritems(curl_kwargs):
                kwargs.setdefault(k, v)

        if isinstance(url, six.string_types):
            return self._crawl(url, **kwargs)
        elif hasattr(url, "__iter__"):
            result = []
            for each in url:
                result.append(self._crawl(each, **kwargs))
            return result

    def is_debugger(self):
        """Return true if running in debugger"""
        return self.__env__.get('debugger')

    def send_message(self, project, msg, url='data:,on_message'):
        """Send messages to other project."""
        self._messages.append((project, msg, url))

    def on_message(self, project, msg):
        """Receive message from other project, override me."""
        pass

    def on_result(self, result):
        """Receiving returns from other callback, override me."""
        if not result:
            return
        assert self.task, "on_result can't outside a callback."
        if self.is_debugger():
            pprint(result)
        if self.__env__.get('result_queue'):
            self.__env__['result_queue'].put((self.task, result))

    def on_finished(self, response, task):
        """
        Triggered when all tasks in task queue finished.
        http://docs.pyspider.org/en/latest/About-Projects/#on_finished-callback
        """
        pass

    @not_send_status
    def _on_message(self, response):
        project, msg = response.save
        return self.on_message(project, msg)

    @not_send_status
    def _on_cronjob(self, response, task):
        if (not response.save
                or not isinstance(response.save, dict)
                or 'tick' not in response.save):
            return

        # When triggered, a '_on_cronjob' task is sent from scheudler with 'tick' in
        # Response.save. Scheduler may at least send the trigger task every GCD of the
        # inverval of the cronjobs. The method should check the tick for each cronjob
        # function to confirm the execute interval.
        for cronjob in self._cron_jobs:
            if response.save['tick'] % cronjob.tick != 0:
                continue
            function = cronjob.__get__(self, self.__class__)
            self._run_func(function, response, task)

    def _on_get_info(self, response, task):
        """Sending runtime infomation about this script."""
        for each in response.save or []:
            if each == 'min_tick':
                self.save[each] = self._min_tick
            elif each == 'retry_delay':
                if not isinstance(self.retry_delay, dict):
                    self.retry_delay = {'': self.retry_delay}
                self.save[each] = self.retry_delay
            elif each == 'crawl_config':
                self.save[each] = self.crawl_config


================================================
FILE: pyspider/libs/bench.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-08 22:23:10
# rate: 10000000000
# burst: 10000000000

import time
import logging
logger = logging.getLogger('bench')

from six.moves import queue as Queue
from pyspider.scheduler import ThreadBaseScheduler as Scheduler
from pyspider.fetcher.tornado_fetcher import Fetcher
from pyspider.processor import Processor
from pyspider.result import ResultWorker
from pyspider.libs.utils import md5string


def bench_test_taskdb(taskdb):
    project_name = '__bench_test__'
    task = {
        "fetch": {
            "fetch_type": "js",
            "headers": {
                "User-Agent": "BaiDuSpider"
            }
        },
        "process": {
            "callback": "detail_page"
        },
        "project": project_name,
        "taskid": "553300d2582154413b4982c00c34a2d5",
        "url": "http://www.sciencedirect.com/science/article/pii/S1674200109000704"
    }

    track = {
        "fetch": {
            "content": None,
            "encoding": "unicode",
            "error": None,
            "headers": {
                "last-modified": "Wed, 04 Mar 2015 09:24:33 GMT"
            },
            "ok": True,
            "redirect_url": None,
            "status_code": 200,
            "time": 5.543
        },
        "process": {
            "exception": None,
            "follows": 4,
            "logs": "",
            "ok": True,
            "result": "{'url': u'",
            "time": 0.07105398178100586
        }
    }

    def test_insert(n, start=0):
        logger.info("taskdb insert %d", n)
        start_time = time.time()
        for i in range(n):
            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
            task['taskid'] = md5string(task['url'])
            task['track'] = {}
            taskdb.insert(task['project'], task['taskid'], task)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    def test_update(n, start=0):
        logger.info("taskdb update %d" % n)
        start_time = time.time()
        for i in range(n):
            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
            task['taskid'] = md5string(task['url'])
            task['track'] = track
            taskdb.update(task['project'], task['taskid'], task)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    request_task_fields = [
        'taskid',
        'project',
        'url',
        'status',
        'fetch',
        'process',
        'track',
        'lastcrawltime'
    ]

    def test_get(n, start=0, random=True, fields=request_task_fields):
        logger.info("taskdb get %d %s" % (n, "randomly" if random else ""))
        range_n = list(range(n))
        if random:
            from random import shuffle
            shuffle(range_n)
        start_time = time.time()
        for i in range_n:
            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
            task['taskid'] = md5string(task['url'])
            task['track'] = track
            taskdb.get_task(task['project'], task['taskid'], fields=fields)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    try:
        test_insert(1000)
        test_update(1000)
        test_get(1000)
        test_insert(10000, 1000)
        test_update(10000, 1000)
        test_get(10000, 1000)
    except Exception as e:
        logger.exception(e)
    finally:
        taskdb.drop(project_name)


def bench_test_message_queue(queue):
    task = {
        "fetch": {
            "fetch_type": "js",
            "headers": {
                "User-Agent": "BaiDuSpider"
            }
        },
        "process": {
            "callback": "detail_page"
        },
        "project": "__bench_test__",
        "taskid": "553300d2582154413b4982c00c34a2d5",
        "url": "http://www.sciencedirect.com/science/article/pii/S1674200109000704"
    }

    def test_put(n):
        logger.info("message queue put %d", n)
        start_time = time.time()
        for i in range(n):
            task['url'] = 'http://bench.pyspider.org/?l=%d' % i
            task['taskid'] = md5string(task['url'])
            queue.put(task, block=True, timeout=1)
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    def test_get(n):
        logger.info("message queue get %d", n)
        start_time = time.time()
        for i in range(n):
            try:
                queue.get(True, 1)
            except Queue.Empty:
                logger.error('message queue empty while get %d', i)
                raise
        end_time = time.time()
        cost_time = end_time - start_time
        logger.info("cost %.2fs, %.2f/s %.2fms",
                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

    try:
        test_put(1000)
        test_get(1000)
        test_put(10000)
        test_get(10000)
    except Exception as e:
        logger.exception(e)
    finally:
        if hasattr(queue, 'channel'):
            queue.channel.queue_purge(queue.name)

        # clear message queue
        try:
            while queue.get(False):
                continue
        except Queue.Empty:
            pass


class BenchMixin(object):
    """Report to logger for bench test"""
    def _bench_init(self):
        self.done_cnt = 0
        self.start_time = time.time()
        self.last_cnt = 0
        self.last_report = 0

    def _bench_report(self, name, prefix=0, rjust=0):
        self.done_cnt += 1
        now = time.time()
        if now - self.last_report >= 1:
            rps = float(self.done_cnt - self.last_cnt) / (now - self.last_report)
            output = ''
            if prefix:
                output += " " * prefix
            output += ("%s %s pages (at %d pages/min)" % (
                name, self.done_cnt, rps * 60.0)).rjust(rjust)
            logger.info(output)
            self.last_cnt = self.done_cnt
            self.last_report = now


class BenchScheduler(Scheduler, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchScheduler, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_task_status(self, task):
        self._bench_report('Crawled')
        return super(BenchScheduler, self).on_task_status(task)


class BenchFetcher(Fetcher, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchFetcher, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_result(self, type, task, result):
        self._bench_report("Fetched", 0, 75)
        return super(BenchFetcher, self).on_result(type, task, result)


class BenchProcessor(Processor, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchProcessor, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_task(self, task, response):
        self._bench_report("Processed", 75)
        return super(BenchProcessor, self).on_task(task, response)


class BenchResultWorker(ResultWorker, BenchMixin):
    def __init__(self, *args, **kwargs):
        super(BenchResultWorker, self).__init__(*args, **kwargs)
        self._bench_init()

    def on_result(self, task, result):
        self._bench_report("Saved", 0, 150)
        super(BenchResultWorker, self).on_result(task, result)


from pyspider.libs.base_handler import BaseHandler


class Handler(BaseHandler):
    def on_start(self, response):
        self.crawl('http://127.0.0.1:5000/bench',
                   params={'total': response.save.get('total', 10000), 'show': response.save.get('show', 20)},
                   callback=self.index_page)

    def index_page(self, response):
        for each in response.doc('a[href^="http://"]').items():
            self.crawl(each.attr.href, callback=self.index_page)
        return response.url


================================================
FILE: pyspider/libs/counter.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-11-14 17:09:50

from __future__ import unicode_literals, division, absolute_import

import time
import logging
from collections import deque
try:
    from UserDict import DictMixin
except ImportError:
    from collections import Mapping as DictMixin

import six
from six import iteritems
from six.moves import cPickle


class BaseCounter(object):

    def __init__(self):
        pass

    def event(self, value=1):
        """Fire a event."""
        raise NotImplementedError

    def value(self, value):
        """Set counter value."""
        raise NotImplementedError

    @property
    def avg(self):
        """Get average value"""
        raise NotImplementedError

    @property
    def sum(self):
        """Get sum of counter"""
        raise NotImplementedError

    def empty(self):
        """Clear counter"""
        raise NotImplementedError


class TotalCounter(BaseCounter):
    """Total counter"""

    def __init__(self):
        super(TotalCounter, self).__init__()
        self.cnt = 0

    def event(self, value=1):
        self.cnt += value

    def value(self, value):
        self.cnt = value

    @property
    def avg(self):
        return self.cnt

    @property
    def sum(self):
        return self.cnt

    def empty(self):
        return self.cnt == 0


class AverageWindowCounter(BaseCounter):
    """
    Record last N(window) value
    """

    def __init__(self, window_size=300):
        super(AverageWindowCounter, self).__init__()
        self.window_size = window_size
        self.values = deque(maxlen=window_size)

    def event(self, value=1):
        self.values.append(value)

    value = event

    @property
    def avg(self):
        return self.sum / len(self.values)

    @property
    def sum(self):
        return sum(self.values)

    def empty(self):
        if not self.values:
            return True


class TimebaseAverageEventCounter(BaseCounter):
    """
    Record last window_size * window_interval seconds event.

    records will trim ever window_interval seconds
    """

    def __init__(self, window_size=30, window_interval=10):
        super(TimebaseAverageEventCounter, self).__init__()
        self.max_window_size = window_size
        self.window_size = 0
        self.window_interval = window_interval
        self.values = deque(maxlen=window_size)
        self.events = deque(maxlen=window_size)
        self.times = deque(maxlen=window_size)

        self.cache_value = 0
        self.cache_event = 0
        self.cache_start = None
        self._first_data_time = None

    def event(self, value=1):
        now = time.time()
        if self._first_data_time is None:
            self._first_data_time = now

        if self.cache_start is None:
            self.cache_value = value
            self.cache_event = 1
            self.cache_start = now
        elif now - self.cache_start > self.window_interval:
            self.values.append(self.cache_value)
            self.events.append(self.cache_event)
            self.times.append(self.cache_start)
            self.on_append(self.cache_value, self.cache_start)
            self.cache_value = value
            self.cache_event = 1
            self.cache_start = now
        else:
            self.cache_value += value
            self.cache_event += 1
        return self

    def value(self, value):
        self.cache_value = value

    def _trim_window(self):
        now = time.time()
        if self.cache_start and now - self.cache_start > self.window_interval:
            self.values.append(self.cache_value)
            self.events.append(self.cache_event)
            self.times.append(self.cache_start)
            self.on_append(self.cache_value, self.cache_start)
            self.cache_value = 0
            self.cache_start = None

        if self.window_size != self.max_window_size and self._first_data_time is not None:
            time_passed = now - self._first_data_time
            self.window_size = min(self.max_window_size, time_passed / self.window_interval)
        window_limit = now - self.window_size * self.window_interval
        while self.times and self.times[0] < window_limit:
            self.times.popleft()
            self.events.popleft()
            self.values.popleft()

    @property
    def avg(self):
        events = (sum(self.events) + self.cache_event)
        if not events:
            return 0
        return float(self.sum) / events

    @property
    def sum(self):
        self._trim_window()
        return sum(self.values) + self.cache_value

    def empty(self):
        self._trim_window()
        if not self.values and not self.cache_start:
            return True

    def on_append(self, value, time):
        pass


class TimebaseAverageWindowCounter(BaseCounter):
    """
    Record last window_size * window_interval seconds values.

    records will trim ever window_interval seconds
    """

    def __init__(self, window_size=30, window_interval=10):
        super(TimebaseAverageWindowCounter, self).__init__()
        self.max_window_size = window_size
        self.window_size = 0
        self.window_interval = window_interval
        self.values = deque(maxlen=window_size)
        self.times = deque(maxlen=window_size)

        self.cache_value = 0
        self.cache_start = None
        self._first_data_time = None

    def event(self, value=1):
        now = time.time()
        if self._first_data_time is None:
            self._first_data_time = now

        if self.cache_start is None:
            self.cache_value = value
            self.cache_start = now
        elif now - self.cache_start > self.window_interval:
            self.values.append(self.cache_value)
            self.times.append(self.cache_start)
            self.on_append(self.cache_value, self.cache_start)
            self.cache_value = value
            self.cache_start = now
        else:
            self.cache_value += value
        return self

    def value(self, value):
        self.cache_value = value

    def _trim_window(self):
        now = time.time()
        if self.cache_start and now - self.cache_start > self.window_interval:
            self.values.append(self.cache_value)
            self.times.append(self.cache_start)
            self.on_append(self.cache_value, self.cache_start)
            self.cache_value = 0
            self.cache_start = None

        if self.window_size != self.max_window_size and self._first_data_time is not None:
            time_passed = now - self._first_data_time
            self.window_size = min(self.max_window_size, time_passed / self.window_interval)
        window_limit = now - self.window_size * self.window_interval
        while self.times and self.times[0] < window_limit:
            self.times.popleft()
            self.values.popleft()

    @property
    def avg(self):
        sum = float(self.sum)
        if not self.window_size:
            return 0
        return sum / self.window_size / self.window_interval

    @property
    def sum(self):
        self._trim_window()
        return sum(self.values) + self.cache_value

    def empty(self):
        self._trim_window()
        if not self.values and not self.cache_start:
            return True

    def on_append(self, value, time):
        pass


class CounterValue(DictMixin):
    """
    A dict like value item for CounterManager.
    """

    def __init__(self, manager, keys):
        self.manager = manager
        self._keys = keys

    def __getitem__(self, key):
        if key == '__value__':
            key = self._keys
            return self.manager.counters[key]
        else:
            key = self._keys + (key, )

        available_keys = []
        for _key in list(self.manager.counters.keys()):
            if _key[:len(key)] == key:
                available_keys.append(_key)

        if len(available_keys) == 0:
            raise KeyError
        elif len(available_keys) == 1:
            if available_keys[0] == key:
                return self.manager.counters.get(key)
            else:
                return CounterValue(self.manager, key)
        else:
            return CounterValue(self.manager, key)

    def __len__(self):
        return len(self.keys())

    def __iter__(self):
        return iter(self.keys())

    def __contains__(self, key):
        return key in self.keys()

    def keys(self):
        result = set()
        for key in list(self.manager.counters.keys()):
            if key[:len(self._keys)] == self._keys:
                key = key[len(self._keys):]
                result.add(key[0] if key else '__value__')
        return result

    def to_dict(self, get_value=None):
        """Dump counters as a dict"""
        result = {}
        for key, value in iteritems(self):
            if isinstance(value, BaseCounter):
                if get_value is not None:
                    value = getattr(value, get_value)
                result[key] = value
            else:
                result[key] = value.to_dict(get_value)
        return result


class CounterManager(DictMixin):
    """
    A dict like counter manager.

    When using a tuple as event key, say: ('foo', 'bar'), You can visite counter
    with manager['foo']['bar'].  Or get all counters which first element is 'foo'
    by manager['foo'].

    It's useful for a group of counters.
    """

    def __init__(self, cls=TimebaseAverageWindowCounter):
        """init manager with Counter cls"""
        self.cls = cls
        self.counters = {}

    def event(self, key, value=1):
        """Fire a event of a counter by counter key"""
        if isinstance(key, six.string_types):
            key = (key, )
        assert isinstance(key, tuple), "event key type error"
        if key not in self.counters:
            self.counters[key] = self.cls()
        self.counters[key].event(value)
        return self

    def value(self, key, value=1):
        """Set value of a counter by counter key"""
        if isinstance(key, six.string_types):
            key = (key, )
        # assert all(isinstance(k, six.string_types) for k in key)
        assert isinstance(key, tuple), "event key type error"
        if key not in self.counters:
            self.counters[key] = self.cls()
        self.counters[key].value(value)
        return self

    def trim(self):
        """Clear not used counters"""
        for key, value in list(iteritems(self.counters)):
            if value.empty():
                del self.counters[key]

    def __getitem__(self, key):
        key = (key, )
        available_keys = []
        for _key in list(self.counters.keys()):
            if _key[:len(key)] == key:
                available_keys.append(_key)

        if len(available_keys) == 0:
            raise KeyError
        elif len(available_keys) == 1:
            if available_keys[0] == key:
                return self.counters.get(key)
            else:
                return CounterValue(self, key)
        else:
            return CounterValue(self, key)

    def __delitem__(self, key):
        key = (key, )
        available_keys = []
        for _key in list(self.counters.keys()):
            if _key[:len(key)] == key:
                available_keys.append(_key)
        for _key in available_keys:
            del self.counters[_key]

    def __iter__(self):
        return iter(self.keys())

    def __len__(self):
        return len(self.keys())

    def keys(self):
        result = set()
        for key in self.counters.keys():
            result.add(key[0] if key else ())
        return result

    def to_dict(self, get_value=None):
        """Dump counters as a dict"""
        self.trim()
        result = {}
        for key, value in iteritems(self.counters):
            if get_value is not None:
                value = getattr(value, get_value)
            r = result
            for _key in key[:-1]:
                r = r.setdefault(_key, {})
            r[key[-1]] = value
        return result

    def dump(self, filename):
        """Dump counters to file"""
        try:
            with open(filename, 'wb') as fp:
                cPickle.dump(self.counters, fp)
        except Exception as e:
            logging.warning("can't dump counter to file %s: %s", filename, e)
            return False
        return True

    def load(self, filename):
        """Load counters to file"""
        try:
            with open(filename, 'rb') as fp:
                self.counters = cPickle.load(fp)
        except:
            logging.debug("can't load counter from file: %s", filename)
            return False
        return True


================================================
FILE: pyspider/libs/dataurl.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-11-16 10:33:20

import six
from base64 import b64encode, b64decode
from . import utils
from six.moves.urllib.parse import quote, unquote


def encode(data, mime_type='', charset='utf-8', base64=True):
    """
    Encode data to DataURL
    """
    if isinstance(data, six.text_type):
        data = data.encode(charset)
    else:
        charset = None
    if base64:
        data = utils.text(b64encode(data))
    else:
        data = utils.text(quote(data))

    result = ['data:', ]
    if mime_type:
        result.append(mime_type)
    if charset:
        result.append(';charset=')
        result.append(charset)
    if base64:
        result.append(';base64')
    result.append(',')
    result.append(data)

    return ''.join(result)


def decode(data_url):
    """
    Decode DataURL data
    """
    metadata, data = data_url.rsplit(',', 1)
    _, metadata = metadata.split('data:', 1)
    parts = metadata.split(';')
    if parts[-1] == 'base64':
        data = b64decode(data)
    else:
        data = unquote(data)

    for part in parts:
        if part.startswith("charset="):
            data = data.decode(part[8:])
    return data


================================================
FILE: pyspider/libs/log.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-10-24 16:08:17

import logging

try:
    import curses
except ImportError:
    curses = None

from tornado.log import LogFormatter as _LogFormatter


class LogFormatter(_LogFormatter, object):
    """Init tornado.log.LogFormatter from logging.config.fileConfig"""
    def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs):
        if fmt is None:
            fmt = _LogFormatter.DEFAULT_FORMAT
        super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs)


class SaveLogHandler(logging.Handler):
    """LogHandler that save records to a list"""

    def __init__(self, saveto=None, *args, **kwargs):
        self.saveto = saveto
        logging.Handler.__init__(self, *args, **kwargs)

    def emit(self, record):
        if self.saveto is not None:
            self.saveto.append(record)

    handle = emit


def enable_pretty_logging(logger=logging.getLogger()):
    channel = logging.StreamHandler()
    channel.setFormatter(LogFormatter())
    logger.addHandler(channel)


================================================
FILE: pyspider/libs/multiprocessing_queue.py
================================================
import six
import platform
import multiprocessing
from multiprocessing.queues import Queue as BaseQueue


# The SharedCounter and Queue classes come from:
# https://github.com/vterron/lemon/commit/9ca6b4b

class SharedCounter(object):
    """ A synchronized shared counter.
    The locking done by multiprocessing.Value ensures that only a single
    process or thread may read or write the in-memory ctypes object. However,
    in order to do n += 1, Python performs a read followed by a write, so a
    second process may read the old value before the new one is written by the
    first process. The solution is to use a multiprocessing.Lock to guarantee
    the atomicity of the modifications to Value.
    This class comes almost entirely from Eli Bendersky's blog:
    http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/
    """

    def __init__(self, n=0):
        self.count = multiprocessing.Value('i', n)

    def increment(self, n=1):
        """ Increment the counter by n (default = 1) """
        with self.count.get_lock():
            self.count.value += n

    @property
    def value(self):
        """ Return the value of the counter """
        return self.count.value


class MultiProcessingQueue(BaseQueue):
    """ A portable implementation of multiprocessing.Queue.
    Because of multithreading / multiprocessing semantics, Queue.qsize() may
    raise the NotImplementedError exception on Unix platforms like Mac OS X
    where sem_getvalue() is not implemented. This subclass addresses this
    problem by using a synchronized shared counter (initialized to zero) and
    increasing / decreasing its value every time the put() and get() methods
    are called, respectively. This not only prevents NotImplementedError from
    being raised, but also allows us to implement a reliable version of both
    qsize() and empty().
    """
    def __init__(self, *args, **kwargs):
        super(MultiProcessingQueue, self).__init__(*args, **kwargs)
        self.size = SharedCounter(0)

    def put(self, *args, **kwargs):
        self.size.increment(1)
        super(MultiProcessingQueue, self).put(*args, **kwargs)

    def get(self, *args, **kwargs):
        v = super(MultiProcessingQueue, self).get(*args, **kwargs)
        self.size.increment(-1)
        return v

    def qsize(self):
        """ Reliable implementation of multiprocessing.Queue.qsize() """
        return self.size.value


if platform.system() == 'Darwin':
    if hasattr(multiprocessing, 'get_context'):  # for py34
        def Queue(maxsize=0):
            return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context())
    else:
        def Queue(maxsize=0):
            return MultiProcessingQueue(maxsize)
else:
    from multiprocessing import Queue  # flake8: noqa


================================================
FILE: pyspider/libs/pprint.py
================================================
#  Author:      Fred L. Drake, Jr.
#               fdrake@...
#
#  This is a simple little module I wrote to make life easier.  I didn't
#  see anything quite like it in the library, though I may have overlooked
#  something.  I wrote this when I was trying to read some heavily nested
#  tuples with fairly non-descriptive content.  This is modeled very much
#  after Lisp/Scheme - style pretty-printing of lists.  If you find it
#  useful, thank small children who sleep at night.

"""Support to pretty-print lists, tuples, & dictionaries recursively.

Very simple, but useful, especially in debugging data structures.

Classes
-------

PrettyPrinter()
    Handle pretty-printing operations onto a stream using a configured
    set of formatting parameters.

Functions
---------

pformat()
    Format a Python object into a pretty-printed representation.

pprint()
    Pretty-print a Python object to a stream [default is sys.stdout].

saferepr()
    Generate a 'standard' repr()-like value, but protect against recursive
    data structures.

"""

from __future__ import print_function

import six
import sys as _sys

from io import BytesIO, StringIO

__all__ = ["pprint", "pformat", "isreadable", "isrecursive", "saferepr",
           "PrettyPrinter"]

# cache these for faster access:
_commajoin = ", ".join
_id = id
_len = len
_type = type


def pprint(object, stream=None, indent=1, width=80, depth=None):
    """Pretty-print a Python object to a stream [default is sys.stdout]."""
    printer = PrettyPrinter(
        stream=stream, indent=indent, width=width, depth=depth)
    printer.pprint(object)


def pformat(object, indent=1, width=80, depth=None):
    """Format a Python object into a pretty-printed representation."""
    return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)


def saferepr(object):
    """Version of repr() which can handle recursive data structures."""
    return _safe_repr(object, {}, None, 0)[0]


def isreadable(object):
    """Determine if saferepr(object) is readable by eval()."""
    return _safe_repr(object, {}, None, 0)[1]


def isrecursive(object):
    """Determine if object requires a recursive representation."""
    return _safe_repr(object, {}, None, 0)[2]


def _sorted(iterable):
    return sorted(iterable)


class PrettyPrinter:

    def __init__(self, indent=1, width=80, depth=None, stream=None):
        """Handle pretty printing operations onto a stream using a set of
        configured parameters.

        indent
            Number of spaces to indent for each level of nesting.

        width
            Attempted maximum number of columns in the output.

        depth
            The maximum depth to print out nested structures.

        stream
            The desired output stream.  If omitted (or false), the standard
            output stream available at construction will be used.

        """
        indent = int(indent)
        width = int(width)
        assert indent >= 0, "indent must be >= 0"
        assert depth is None or depth > 0, "depth must be > 0"
        assert width, "width must be != 0"
        self._depth = depth
        self._indent_per_level = indent
        self._width = width
        if stream is not None:
            self._stream = stream
        else:
            self._stream = _sys.stdout

    def pprint(self, object):
        self._format(object, self._stream, 0, 0, {}, 0)
        self._stream.write("\n")

    def pformat(self, object):
        sio = BytesIO()
        self._format(object, sio, 0, 0, {}, 0)
        return sio.getvalue()

    def isrecursive(self, object):
        return self.format(object, {}, 0, 0)[2]

    def isreadable(self, object):
        s, readable, recursive = self.format(object, {}, 0, 0)
        return readable and not recursive

    def _format(self, object, stream, indent, allowance, context, level):
        level = level + 1
        objid = _id(object)
        if objid in context:
            stream.write(_recursion(object))
            self._recursive = True
            self._readable = False
            return
        rep = self._repr(object, context, level - 1)
        typ = _type(object)
        sepLines = _len(rep) > (self._width - 1 - indent - allowance)
        write = stream.write

        if self._depth and level > self._depth:
            write(rep)
            return

        r = getattr(typ, "__repr__", None)
        if issubclass(typ, dict) and r is dict.__repr__:
            write('{')
            if self._indent_per_level > 1:
                write((self._indent_per_level - 1) * ' ')
            length = _len(object)
            if length:
                context[objid] = 1
                indent = indent + self._indent_per_level
                items = _sorted(object.items())
                key, ent = items[0]
                rep = self._repr(key, context, level)
                write(rep)
                write(': ')
                self._format(ent, stream, indent + _len(rep) + 2,
                             allowance + 1, context, level)
                if length > 1:
                    for key, ent in items[1:]:
                        rep = self._repr(key, context, level)
                        if sepLines:
                            write(',\n%s%s: ' % (' ' * indent, rep))
                        else:
                            write(', %s: ' % rep)
                        self._format(ent, stream, indent + _len(rep) + 2,
                                     allowance + 1, context, level)
                indent = indent - self._indent_per_level
                del context[objid]
            write('}')
            return

        if (
                (issubclass(typ, list) and r is list.__repr__) or
                (issubclass(typ, tuple) and r is tuple.__repr__) or
                (issubclass(typ, set) and r is set.__repr__) or
                (issubclass(typ, frozenset) and r is frozenset.__repr__)
        ):
            length = _len(object)
            if issubclass(typ, list):
                write('[')
                endchar = ']'
            elif issubclass(typ, set):
                if not length:
                    write('set()')
                    return
                write('set([')
                endchar = '])'
                object = _sorted(object)
                indent += 4
            elif issubclass(typ, frozenset):
                if not length:
                    write('frozenset()')
                    return
                write('frozenset([')
                endchar = '])'
                object = _sorted(object)
                indent += 10
            else:
                write('(')
                endchar = ')'
            if self._indent_per_level > 1 and sepLines:
                write((self._indent_per_level - 1) * ' ')
            if length:
                context[objid] = 1
                indent = indent + self._indent_per_level
                self._format(object[0], stream, indent, allowance + 1,
                             context, level)
                if length > 1:
                    for ent in object[1:]:
                        if sepLines:
                            write(',\n' + ' ' * indent)
                        else:
                            write(', ')
                        self._format(ent, stream, indent,
                                     allowance + 1, context, level)
                indent = indent - self._indent_per_level
                del context[objid]
            if issubclass(typ, tuple) and length == 1:
                write(',')
            write(endchar)
            return

        write(rep)

    def _repr(self, object, context, level):
        repr, readable, recursive = self.format(object, context.copy(),
                                                self._depth, level)
        if not readable:
            self._readable = False
        if recursive:
            self._recursive = True
        return repr

    def format(self, object, context, maxlevels, level):
        """Format object for a specific context, returning a string
        and flags indicating whether the representation is 'readable'
        and whether the object represents a recursive construct.
        """
        return _safe_repr(object, context, maxlevels, level)


# Return triple (repr_string, isreadable, isrecursive).

def _safe_repr(object, context, maxlevels, level):
    typ = _type(object)
    if typ is str:
        string = object
        string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        if 'locale' not in _sys.modules:
            return repr(object), True, False
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
            string = string.replace('"', '\\"')
        else:
            closure = "'"
            quotes = {"'": "\\'"}
            string = string.replace("'", "\\'")
        try:
            string.decode('utf8').encode('gbk', 'replace')
            return ("%s%s%s" % (closure, string, closure)), True, False
        except:
            pass
        qget = quotes.get
        sio = StringIO()
        write = sio.write
        for char in object:
            if char.isalpha():
                write(char)
            else:
                write(qget(char, repr(char)[1:-1]))
        return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False

    if typ is six.text_type:
        string = object.encode("utf8", 'replace')
        string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        if "'" in object and '"' not in object:
            closure = '"'
            quotes = {'"': '\\"'}
            string = string.replace('"', '\\"')
        else:
            closure = "'"
            quotes = {"'": "\\'"}
            string = string.replace("'", "\\'")
        return ("u%s%s%s" % (closure, string, closure)), True, False

    r = getattr(typ, "__repr__", None)
    if issubclass(typ, dict) and r is dict.__repr__:
        if not object:
            return "{}", True, False
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return "{...}", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        saferepr = _safe_repr
        for k, v in _sorted(object.items()):
            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
            append("%s: %s" % (krepr, vrepr))
            readable = readable and kreadable and vreadable
            if krecur or vrecur:
                recursive = True
        del context[objid]
        return "{%s}" % _commajoin(components), readable, recursive

    if (issubclass(typ, list) and r is list.__repr__) or \
            (issubclass(typ, tuple) and r is tuple.__repr__):
        if issubclass(typ, list):
            if not object:
                return "[]", True, False
            format = "[%s]"
        elif _len(object) == 1:
            format = "(%s,)"
        else:
            if not object:
                return "()", True, False
            format = "(%s)"
        objid = _id(object)
        if maxlevels and level >= maxlevels:
            return format % "...", False, objid in context
        if objid in context:
            return _recursion(object), False, True
        context[objid] = 1
        readable = True
        recursive = False
        components = []
        append = components.append
        level += 1
        for o in object:
            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)
            append(orepr)
            if not oreadable:
                readable = False
            if orecur:
                recursive = True
        del context[objid]
        return format % _commajoin(components), readable, recursive

    rep = repr(object)
    return rep, (rep and not rep.startswith('<')), False


def _recursion(object):
    return ("<Recursion on %s with id=%s>"
            % (_type(object).__name__, _id(object)))


def _perfcheck(object=None):
    import time
    if object is None:
        object = [("string", (1, 2), [3, 4], {5: 6, 7: 8})] * 100000
    p = PrettyPrinter()
    t1 = time.time()
    _safe_repr(object, {}, None, 0)
    t2 = time.time()
    p.pformat(object)
    t3 = time.time()
    print("_safe_repr:", t2 - t1)
    print("pformat:", t3 - t2)

if __name__ == "__main__":
    _perfcheck()


================================================
FILE: pyspider/libs/response.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-11-02 11:16:02

import cgi
import re
import six
import json
import chardet
import lxml.html
import lxml.etree
from tblib import Traceback
from pyquery import PyQuery
from requests.structures import CaseInsensitiveDict
from requests import HTTPError
from pyspider.libs import utils


class Response(object):

    def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsensitiveDict(),
                 content='', cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0):
        if cookies is None:
            cookies = {}
        self.status_code = status_code
        self.url = url
        self.orig_url = orig_url
        self.headers = headers
        self.content = content
        self.cookies = cookies
        self.error = error
        self.traceback = traceback
        self.save = save
        self.js_script_result = js_script_result
        self.time = time

    def __repr__(self):
        return u'<Response [%d]>' % self.status_code

    def __bool__(self):
        """Returns true if `status_code` is 200 and no error"""
        return self.ok

    def __nonzero__(self):
        """Returns true if `status_code` is 200 and no error."""
        return self.ok

    @property
    def ok(self):
        """Return true if `status_code` is 200 and no error."""
        try:
            self.raise_for_status()
        except:
            return False
        return True

    @property
    def encoding(self):
        """
        encoding of Response.content.

        if Response.encoding is None, encoding will be guessed
        by header or content or chardet if available.
        """
        if hasattr(self, '_encoding'):
            return self._encoding

        # content is unicode
        if isinstance(self.content, six.text_type):
            return 'unicode'

        # Try charset from content-type or content
        encoding = get_encoding(self.headers, self.content)

        # Fallback to auto-detected encoding.
        if not encoding and chardet is not None:
            encoding = chardet.detect(self.content[:600])['encoding']

        if encoding and encoding.lower() == 'gb2312':
            encoding = 'gb18030'

        self._encoding = encoding or 'utf-8'
        return self._encoding

    @encoding.setter
    def encoding(self, value):
        """
        set encoding of content manually
        it will overwrite the guessed encoding
        """
        self._encoding = value
        self._text = None

    @property
    def text(self):
        """
        Content of the response, in unicode.

        if Response.encoding is None and chardet module is available, encoding
        will be guessed.
        """
        if hasattr(self, '_text') and self._text:
            return self._text
        if not self.content:
            return u''
        if isinstance(self.content, six.text_type):
            return self.content

        content = None
        encoding = self.encoding

        # Decode unicode from given encoding.
        try:
            content = self.content.decode(encoding, 'replace')
        except LookupError:
            # A LookupError is raised if the encoding was not found which could
            # indicate a misspelling or similar mistake.
            #
            # So we try blindly encoding.
            content = self.content.decode('utf-8', 'replace')

        self._text = content
        return content

    @property
    def json(self):
        """Returns the json-encoded content of the response, if any."""
        if hasattr(self, '_json'):
            return self._json
        try:
            self._json = json.loads(self.text or self.content)
        except ValueError:
            self._json = None
        return self._json

    @property
    def doc(self):
        """Returns a PyQuery object of the response's content"""
        if hasattr(self, '_doc'):
            return self._doc
        elements = self.etree
        doc = self._doc = PyQuery(elements)
        doc.make_links_absolute(utils.text(self.url))
        return doc

    @property
    def etree(self):
        """Returns a lxml object of the response's content that can be selected by xpath"""
        if not hasattr(self, '_elements'):
            try:
                parser = lxml.html.HTMLParser(encoding=self.encoding)
                self._elements = lxml.html.fromstring(self.content, parser=parser)
            except LookupError:
                # lxml would raise LookupError when encoding not supported
                # try fromstring without encoding instead.
                # on windows, unicode is not availabe as encoding for lxml
                self._elements = lxml.html.fromstring(self.content)
        if isinstance(self._elements, lxml.etree._ElementTree):
            self._elements = self._elements.getroot()
        return self._elements

    def raise_for_status(self, allow_redirects=True):
        """Raises stored :class:`HTTPError` or :class:`URLError`, if one occurred."""

        if self.status_code == 304:
            return
        elif self.error:
            if self.traceback:
                six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback())
            http_error = HTTPError(self.error)
        elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:
            http_error = HTTPError('%s Redirection' % (self.status_code))
        elif (self.status_code >= 400) and (self.status_code < 500):
            http_error = HTTPError('%s Client Error' % (self.status_code))
        elif (self.status_code >= 500) and (self.status_code < 600):
            http_error = HTTPError('%s Server Error' % (self.status_code))
        else:
            return

        http_error.response = self
        raise http_error

    def isok(self):
        try:
            self.raise_for_status()
            return True
        except:
            return False


def rebuild_response(r):
    response = Response(
        status_code=r.get('status_code', 599),
        url=r.get('url', ''),
        headers=CaseInsensitiveDict(r.get('headers', {})),
        content=r.get('content', ''),
        cookies=r.get('cookies', {}),
        error=r.get('error'),
        traceback=r.get('traceback'),
        time=r.get('time', 0),
        orig_url=r.get('orig_url', r.get('url', '')),
        js_script_result=r.get('js_script_result'),
        save=r.get('save'),
    )
    return response


def get_encoding(headers, content):
    """Get encoding from request headers or page head."""
    encoding = None

    content_type = headers.get('content-type')
    if content_type:
        _, params = cgi.parse_header(content_type)
        if 'charset' in params:
            encoding = params['charset'].strip("'\"")

    if not encoding:
        content = utils.pretty_unicode(content[:1000]) if six.PY3 else content

        charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]',
                                flags=re.I)
        pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]',
                               flags=re.I)
        xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
        encoding = (charset_re.findall(content) +
                    pragma_re.findall(content) +
                    xml_re.findall(content))
        encoding = encoding and encoding[0] or None

    return encoding


================================================
FILE: pyspider/libs/result_dump.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-03-27 20:12:11

import six
import csv
import json
import itertools
from io import StringIO, BytesIO
from six import iteritems


def result_formater(results):
    common_fields = None
    for result in results:
        result.setdefault('result', None)
        if isinstance(result['result'], dict):
            if common_fields is None:
                common_fields = set(result['result'].keys())
            else:
                common_fields &= set(result['result'].keys())
        else:
            common_fields = set()
    for result in results:
        result['result_formated'] = {}
        if not common_fields:
            result['others'] = result['result']
        elif not isinstance(result['result'], dict):
            result['others'] = result['result']
        else:
            result_formated = {}
            others = {}
            for key, value in iteritems(result['result']):
                if key in common_fields:
                    result_formated[key] = value
                else:
                    others[key] = value
            result['result_formated'] = result_formated
            result['others'] = others
    return common_fields or set(), results


def dump_as_json(results, valid=False):
    first = True
    if valid:
        yield '['

    for result in results:
        if valid:
            if first:
                first = False
            else:
                yield ', '

        yield json.dumps(result, ensure_ascii=False) + '\n'

    if valid:
        yield ']'


def dump_as_txt(results):
    for result in results:
        yield (
            result.get('url', None) + '\t' +
            json.dumps(result.get('result', None), ensure_ascii=False) + '\n'
        )


def dump_as_csv(results):
    def toString(obj):
        if isinstance(obj, six.binary_type):
            if six.PY2:
                return obj
            else:
                return obj.decode('utf8')
        elif isinstance(obj, six.text_type):
            if six.PY2:
                return obj.encode('utf8')
            else:
                return obj
        else:
            if six.PY2:
                return json.dumps(obj, ensure_ascii=False).encode('utf8')
            else:
                return json.dumps(obj, ensure_ascii=False)

    # python2 needs byes when python3 needs unicode
    if six.PY2:
        stringio = BytesIO()
    else:
        stringio = StringIO()
    csv_writer = csv.writer(stringio)

    it = iter(results)
    first_30 = []
    for result in it:
        first_30.append(result)
        if len(first_30) >= 30:
            break
    common_fields, _ = result_formater(first_30)
    common_fields_l = sorted(common_fields)

    csv_writer.writerow([toString('url')]
                        + [toString(x) for x in common_fields_l]
                        + [toString('...')])
    for result in itertools.chain(first_30, it):
        result['result_formated'] = {}
        if not common_fields:
            result['others'] = result['result']
        elif not isinstance(result['result'], dict):
            result['others'] = result['result']
        else:
            result_formated = {}
            others = {}
            for key, value in iteritems(result['result']):
                if key in common_fields:
                    result_formated[key] = value
                else:
                    others[key] = value
            result['result_formated'] = result_formated
            result['others'] = others
        csv_writer.writerow(
            [toString(result['url'])]
            + [toString(result['result_formated'].get(k, '')) for k in common_fields_l]
            + [toString(result['others'])]
        )
        yield stringio.getvalue()
        stringio.truncate(0)
        stringio.seek(0)


================================================
FILE: pyspider/libs/sample_handler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on __DATE__
# Project: __PROJECT_NAME__

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('__START_URL__', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }


================================================
FILE: pyspider/libs/url.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-11-09 14:39:57

import mimetypes

import six
import shlex
from six.moves.urllib.parse import urlparse, urlunparse
from requests.models import RequestEncodingMixin


def get_content_type(filename):
    """Guessing file type by filename"""
    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'


_encode_params = RequestEncodingMixin._encode_params


def _encode_multipart_formdata(fields, files):
    body, content_type = RequestEncodingMixin._encode_files(files, fields)
    return content_type, body


def _build_url(url, _params):
    """Build the actual URL to use."""

    # Support for unicode domain names and paths.
    scheme, netloc, path, params, query, fragment = urlparse(url)
    netloc = netloc.encode('idna').decode('utf-8')
    if not path:
        path = '/'

    if six.PY2:
        if isinstance(scheme, six.text_type):
            scheme = scheme.encode('utf-8')
        if isinstance(netloc, six.text_type):
            netloc = netloc.encode('utf-8')
        if isinstance(path, six.text_type):
            path = path.encode('utf-8')
        if isinstance(params, six.text_type):
            params = params.encode('utf-8')
        if isinstance(query, six.text_type):
            query = query.encode('utf-8')
        if isinstance(fragment, six.text_type):
            fragment = fragment.encode('utf-8')

    enc_params = _encode_params(_params)
    if enc_params:
        if query:
            query = '%s&%s' % (query, enc_params)
        else:
            query = enc_params
    url = (urlunparse([scheme, netloc, path, params, query, fragment]))
    return url


def quote_chinese(url, encodeing="utf-8"):
    """Quote non-ascii characters"""
    if isinstance(url, six.text_type):
        return quote_chinese(url.encode(encodeing))
    if six.PY3:
        res = [six.int2byte(b).decode('latin-1') if b < 128 else '%%%02X' % b for b in url]
    else:
        res = [b if ord(b) < 128 else '%%%02X' % ord(b) for b in url]
    return "".join(res)


def curl_to_arguments(curl):
    kwargs = {}
    headers = {}
    command = None
    urls = []
    current_opt = None

    for part in shlex.split(curl):
        if command is None:
            # curl
            command = part
        elif not part.startswith('-') and not current_opt:
            # waiting for url
            urls.append(part)
        elif current_opt is None and part.startswith('-'):
            # flags
            if part == '--compressed':
                kwargs['use_gzip'] = True
            else:
                current_opt = part
        else:
            # option
            if current_opt is None:
                raise TypeError('Unknow curl argument: %s' % part)
            elif current_opt in ('-H', '--header'):
                key_value = part.split(':', 1)
                if len(key_value) == 2:
                    key, value = key_value
                    headers[key.strip()] = value.strip()
            elif current_opt in ('-d', '--data'):
                kwargs['data'] = part
            elif current_opt in ('--data-binary'):
                if part[0] == '$':
                    part = part[1:]
                kwargs['data'] = part
            elif current_opt in ('-X', '--request'):
                kwargs['method'] = part
            else:
                raise TypeError('Unknow curl option: %s' % current_opt)
            current_opt = None

    if not urls:
        raise TypeError('curl: no URL specified!')
    if current_opt:
        raise TypeError('Unknow curl option: %s' % current_opt)

    kwargs['urls'] = urls
    if headers:
        kwargs['headers'] = headers

    return kwargs


================================================
FILE: pyspider/libs/utils.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2012-11-06 11:50:13

import math
import logging
import hashlib
import datetime
import socket
import base64
import warnings
import threading

import six
from six import iteritems

md5string = lambda x: hashlib.md5(utf8(x)).hexdigest()


class ReadOnlyDict(dict):
    """A Read Only Dict"""

    def __setitem__(self, key, value):
        raise Exception("dict is read-only")


def getitem(obj, key=0, default=None):
    """Get first element of list or return default"""
    try:
        return obj[key]
    except:
        return default


def hide_me(tb, g=globals()):
    """Hide stack traceback of given stack"""
    base_tb = tb
    try:
        while tb and tb.tb_frame.f_globals is not g:
            tb = tb.tb_next
        while tb and tb.tb_frame.f_globals is g:
            tb = tb.tb_next
    except Exception as e:
        logging.exception(e)
        tb = base_tb
    if not tb:
        tb = base_tb
    return tb


def run_in_thread(func, *args, **kwargs):
    """Run function in thread, return a Thread object"""
    from threading import Thread
    thread = Thread(target=func, args=args, kwargs=kwargs)
    thread.daemon = True
    thread.start()
    return thread


def run_in_subprocess(func, *args, **kwargs):
    """Run function in subprocess, return a Process object"""
    from multiprocessing import Process
    thread = Process(target=func, args=args, kwargs=kwargs)
    thread.daemon = True
    thread.start()
    return thread


def format_date(date, gmt_offset=0, relative=True, shorter=False, full_format=False):
    """Formats the given date (which should be GMT).

    By default, we return a relative time (e.g., "2 minutes ago"). You
    can return an absolute date string with ``relative=False``.

    You can force a full format date ("July 10, 1980") with
    ``full_format=True``.

    This method is primarily intended for dates in the past.
    For dates in the future, we fall back to full format.

    From tornado
    """

    if not date:
        return '-'
    if isinstance(date, float) or isinstance(date, int):
        date = datetime.datetime.utcfromtimestamp(date)
    now = datetime.datetime.utcnow()
    if date > now:
        if relative and (date - now).seconds < 60:
            # Due to click skew, things are some things slightly
            # in the future. Round timestamps in the immediate
            # future down to now in relative mode.
            date = now
        else:
            # Otherwise, future dates always use the full format.
            full_format = True
    local_date = date - datetime.timedelta(minutes=gmt_offset)
    local_now = now - datetime.timedelta(minutes=gmt_offset)
    local_yesterday = local_now - datetime.timedelta(hours=24)
    difference = now - date
    seconds = difference.seconds
    days = difference.days

    format = None
    if not full_format:
        ret_, fff_format = fix_full_format(days, seconds, relative, shorter, local_date, local_yesterday)
        format = fff_format
        if ret_:
            return format
        else:
            format = format

    if format is None:
        format = "%(month_name)s %(day)s, %(year)s" if shorter else \
            "%(month_name)s %(day)s, %(year)s at %(time)s"

    str_time = "%d:%02d" % (local_date.hour, local_date.minute)

    return format % {
        "month_name": local_date.strftime('%b'),
        "weekday": local_date.strftime('%A'),
        "day": str(local_date.day),
        "year": str(local_date.year),
        "month": local_date.month,
        "time": str_time
    }


def fix_full_format(days, seconds, relative, shorter, local_date, local_yesterday):
    if relative and days == 0:
        if seconds < 50:
            return True, (("1 second ago" if seconds <= 1 else
                    "%(seconds)d seconds ago") % {"seconds": seconds})

        if seconds < 50 * 60:
            minutes = round(seconds / 60.0)
            return True, (("1 minute ago" if minutes <= 1 else
                    "%(minutes)d minutes ago") % {"minutes": minutes})

        hours = round(seconds / (60.0 * 60))
        return True, (("1 hour ago" if hours <= 1 else
                "%(hours)d hours ago") % {"hours": hours})
    format = None
    if days == 0:
        format = "%(time)s"
    elif days == 1 and local_date.day == local_yesterday.day and \
            relative:
        format = "yesterday" if shorter else "yesterday at %(time)s"
    elif days < 5:
        format = "%(weekday)s" if shorter else "%(weekday)s at %(time)s"
    elif days < 334:  # 11mo, since confusing for same month last year
        format = "%(month)s-%(day)s" if shorter else \
            "%(month)s-%(day)s at %(time)s"
    return False, format

class TimeoutError(Exception):
    pass

try:
    import signal
    if not hasattr(signal, 'SIGALRM'):
        raise ImportError('signal')

    class timeout:
        """
        Time limit of command

        with timeout(3):
            time.sleep(10)
        """

        def __init__(self, seconds=1, error_message='Timeout'):
            self.seconds = seconds
            self.error_message = error_message

        def handle_timeout(self, signum, frame):
            raise TimeoutError(self.error_message)

        def __enter__(self):
            if not isinstance(threading.current_thread(), threading._MainThread):
                logging.warning("timeout only works on main thread, are you running pyspider in threads?")
                self.seconds = 0
            if self.seconds:
                signal.signal(signal.SIGALRM, self.handle_timeout)
                signal.alarm(int(math.ceil(self.seconds)))

        def __exit__(self, type, value, traceback):
            if self.seconds:
                signal.alarm(0)

except ImportError as e:
    warnings.warn("timeout is not supported on your platform.", FutureWarning)

    class timeout:
        """
        Time limit of command (for windows)
        """

        def __init__(self, seconds=1, error_message='Timeout'):
            pass

        def __enter__(self):
            pass

        def __exit__(self, type, value, traceback):
            pass


def utf8(string):
    """
    Make sure string is utf8 encoded bytes.

    If parameter is a object, object.__str__ will been called before encode as bytes
    """
    if isinstance(string, six.text_type):
        return string.encode('utf8')
    elif isinstance(string, six.binary_type):
        return string
    else:
        return six.text_type(string).encode('utf8')


def text(string, encoding='utf8'):
    """
    Make sure string is unicode type, decode with given encoding if it's not.

    If parameter is a object, object.__str__ will been called
    """
    if isinstance(string, six.text_type):
        return string
    elif isinstance(string, six.binary_type):
        return string.decode(encoding)
    else:
        return six.text_type(string)


def pretty_unicode(string):
    """
    Make sure string is unicode, try to decode with utf8, or unicode escaped string if failed.
    """
    if isinstance(string, six.text_type):
        return string
    try:
        return string.decode("utf8")
    except UnicodeDecodeError:
        return string.decode('Latin-1').encode('unicode_escape').decode("utf8")


def unicode_string(string):
    """
    Make sure string is unicode, try to default with utf8, or base64 if failed.

    can been decode by `decode_unicode_string`
    """
    if isinstance(string, six.text_type):
        return string
    try:
        return string.decode("utf8")
    except UnicodeDecodeError:
        return '[BASE64-DATA]' + base64.b64encode(string) + '[/BASE64-DATA]'


def unicode_dict(_dict):
    """
    Make sure keys and values of dict is unicode.
    """
    r = {}
    for k, v in iteritems(_dict):
        r[unicode_obj(k)] = unicode_obj(v)
    return r


def unicode_list(_list):
    """
    Make sure every element in list is unicode. bytes will encode in base64
    """
    return [unicode_obj(x) for x in _list]


def unicode_obj(obj):
    """
    Make sure keys and values of dict/list/tuple is unicode. bytes will encode in base64.

    Can been decode by `decode_unicode_obj`
    """
    if isinstance(obj, dict):
        return unicode_dict(obj)
    elif isinstance(obj, (list, tuple)):
        return unicode_list(obj)
    elif isinstance(obj, six.string_types):
        return unicode_string(obj)
    elif isinstance(obj, (int, float)):
        return obj
    elif obj is None:
        return obj
    else:
        try:
            return text(obj)
        except:
            return text(repr(obj))


def decode_unicode_string(string):
    """
    Decode string encoded by `unicode_string`
    """
    if string.startswith('[BASE64-DATA]') and string.endswith('[/BASE64-DATA]'):
        return base64.b64decode(string[len('[BASE64-DATA]'):-len('[/BASE64-DATA]')])
    return string


def decode_unicode_obj(obj):
    """
    Decode unicoded dict/list/tuple encoded by `unicode_obj`
    """
    if isinstance(obj, dict):
        r = {}
        for k, v in iteritems(obj):
            r[decode_unicode_string(k)] = decode_unicode_obj(v)
        return r
    elif isinstance(obj, six.string_types):
        return decode_unicode_string(obj)
    elif isinstance(obj, (list, tuple)):
        return [decode_unicode_obj(x) for x in obj]
    else:
        return obj


class Get(object):
    """
    Lazy value calculate for object
    """

    def __init__(self, getter):
        self.getter = getter

    def __get__(self, instance, owner):
        return self.getter()


class ObjectDict(dict):
    """
    Object like dict, every dict[key] can visite by dict.key

    If dict[key] is `Get`, calculate it's value.
    """

    def __getattr__(self, name):
        ret = self.__getitem__(name)
        if hasattr(ret, '__get__'):
            return ret.__get__(self, ObjectDict)
        return ret


def load_object(name):
    """Load object from module"""

    if "." not in name:
        raise Exception('load object need module.object')

    module_name, object_name = name.rsplit('.', 1)
    if six.PY2:
        module = __import__(module_name, globals(), locals(), [utf8(object_name)], -1)
    else:
        module = __import__(module_name, globals(), locals(), [object_name])
    return getattr(module, object_name)


def get_python_console(namespace=None):
    """
    Return a interactive python console instance with caller's stack
    """

    if namespace is None:
        import inspect
        frame = inspect.currentframe()
        caller = frame.f_back
        if not caller:
            logging.error("can't find caller who start this console.")
            caller = frame
        namespace = dict(caller.f_globals)
        namespace.update(caller.f_locals)

    try:
        from IPython.terminal.interactiveshell import TerminalInteractiveShell
        shell = TerminalInteractiveShell(user_ns=namespace)
    except ImportError:
        try:
            import readline
            import rlcompleter
            readline.set_completer(rlcompleter.Completer(namespace).complete)
            readline.parse_and_bind("tab: complete")
        except ImportError:
            pass
        import code
        shell = code.InteractiveConsole(namespace)
        shell._quit = False

        def exit():
            shell._quit = True

        def readfunc(prompt=""):
            if shell._quit:
                raise EOFError
            return six.moves.input(prompt)

        # inject exit method
        shell.ask_exit = exit
        shell.raw_input = readfunc

    return shell


def python_console(namespace=None):
    """Start a interactive python console with caller's stack"""

    if namespace is None:
        import inspect
        frame = inspect.currentframe()
        caller = frame.f_back
        if not caller:
            logging.error("can't find caller who start this console.")
            caller = frame
        namespace = dict(caller.f_globals)
        namespace.update(caller.f_locals)

    return get_python_console(namespace=namespace).interact()


def check_port_open(port, addr='127.0.0.1'):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        result = sock.connect_ex((addr, port))
        if result == 0:
            return True
        else:
            return False


================================================
FILE: pyspider/libs/wsgi_xmlrpc.py
================================================
#   Copyright (c) 2006-2007 Open Source Applications Foundation
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#
#   Origin: https://code.google.com/p/wsgi-xmlrpc/


from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher
import logging

logger = logging.getLogger(__name__)


class WSGIXMLRPCApplication(object):
    """Application to handle requests to the XMLRPC service"""

    def __init__(self, instance=None, methods=None):
        """Create windmill xmlrpc dispatcher"""
        if methods is None:
            methods = []
        try:
            self.dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None)
        except TypeError:
            # python 2.4
            self.dispatcher = SimpleXMLRPCDispatcher()
        if instance is not None:
            self.dispatcher.register_instance(instance)
        for method in methods:
            self.dispatcher.register_function(method)
        self.dispatcher.register_introspection_functions()

    def register_instance(self, instance):
        return self.dispatcher.register_instance(instance)

    def register_function(self, function, name=None):
        return self.dispatcher.register_function(function, name)

    def handler(self, environ, start_response):
        """XMLRPC service for windmill browser core to communicate with"""

        if environ['REQUEST_METHOD'] == 'POST':
            return self.handle_POST(environ, start_response)
        else:
            start_response("400 Bad request", [('Content-Type', 'text/plain')])
            return ['']

    def handle_POST(self, environ, start_response):
        """Handles the HTTP POST request.

        Attempts to interpret all HTTP POST requests as XML-RPC calls,
        which are forwarded to the server's _dispatch method for handling.

        Most code taken from SimpleXMLRPCServer with modifications for wsgi and my custom dispatcher.
        """

        try:
            # Get arguments by reading body of request.
            # We read this in chunks to avoid straining
            # socket.read(); around the 10 or 15Mb mark, some platforms
            # begin to have problems (bug #792570).

            length = int(environ['CONTENT_LENGTH'])
            data = environ['wsgi.input'].read(length)

            # In previous versions of SimpleXMLRPCServer, _dispatch
            # could be overridden in this class, instead of in
            # SimpleXMLRPCDispatcher. To maintain backwards compatibility,
            # check to see if a subclass implements _dispatch and
            # using that method if present.
            response = self.dispatcher._marshaled_dispatch(
                data, getattr(self.dispatcher, '_dispatch', None)
            )
            response += b'\n'
        except Exception as e:  # This should only happen if the module is buggy
            # internal error, report as HTTP server error
            logger.exception(e)
            start_response("500 Server error", [('Content-Type', 'text/plain')])
            return []
        else:
            # got a valid XML RPC response
            start_response("200 OK", [('Content-Type', 'text/xml'), ('Content-Length', str(len(response)),)])
            return [response]

    def __call__(self, environ, start_response):
        return self.handler(environ, start_response)


================================================
FILE: pyspider/logging.conf
================================================
[loggers]
keys=root,scheduler,fetcher,processor,webui,bench,werkzeug

[logger_root]
level=INFO
handlers=screen

[logger_scheduler]
level=INFO
handlers=screen
qualname=scheduler
propagate=0

[logger_fetcher]
level=DEBUG
handlers=screen
qualname=fetcher
propagate=0

[logger_processor]
level=DEBUG
handlers=screen
qualname=processor
propagate=0

[logger_webui]
level=DEBUG
handlers=screen
qualname=webui
propagate=0

[logger_bench]
level=DEBUG
handlers=screen
qualname=bench
propagate=0

[logger_werkzeug]
level=INFO
handlers=screen
qualname=werkzeug
propagate=0

[handlers]
keys=screen

[handler_screen]
class=logging.StreamHandler
formatter=pretty
level=DEBUG
args=(sys.stderr, )

[formatters]
keys=pretty

[formatter_pretty]
class=pyspider.libs.log.LogFormatter


================================================
FILE: pyspider/message_queue/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-04-30 21:47:08

import logging

try:
    from urllib import parse as urlparse
except ImportError:
    import urlparse


def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    redis:
        redis://host:6379/db
        redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None
    """

    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        if ',' in parsed.netloc:
            """
            redis in cluster mode (there is no concept of 'db' in cluster mode)
            ex. redis://host1:port1,host2:port2,...,hostn:portn
            """
            cluster_nodes = []
            for netloc in parsed.netloc.split(','):
                cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])})

            return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes)

        else:
            db = parsed.path.lstrip('/').split('/')
            try:
                db = int(db[0])
            except:
                logging.warning('redis DB must zero-based numeric index, using 0 instead')
                db = 0

            password = parsed.password or None

            return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        raise Exception('unknown connection url: %s', url)


================================================
FILE: pyspider/message_queue/kombu_queue.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-05-22 20:54:01

import time
import umsgpack
from kombu import Connection, enable_insecure_serializers
from kombu.serialization import register
from kombu.exceptions import ChannelError
from six.moves import queue as BaseQueue


register('umsgpack', umsgpack.packb, umsgpack.unpackb, 'application/x-msgpack')
enable_insecure_serializers(['umsgpack'])


class KombuQueue(object):
    """
    kombu is a high-level interface for multiple message queue backends.

    KombuQueue is built on top of kombu API.
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, url="amqp://", maxsize=0, lazy_limit=True):
        """
        Constructor for KombuQueue

        url:        http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        """
        self.name = name
        self.conn = Connection(url)
        self.queue = self.conn.SimpleQueue(self.name, no_ack=True, serializer='umsgpack')

        self.maxsize = maxsize
        self.lazy_limit = lazy_limit
        if self.lazy_limit and self.maxsize:
            self.qsize_diff_limit = int(self.maxsize * 0.1)
        else:
            self.qsize_diff_limit = 0
        self.qsize_diff = 0

    def qsize(self):
        try:
            return self.queue.qsize()
        except ChannelError:
            return 0

    def empty(self):
        if self.qsize() == 0:
            return True
        else:
            return False

    def full(self):
        if self.maxsize and self.qsize() >= self.maxsize:
            return True
        else:
            return False

    def put(self, obj, block=True, timeout=None):
        if not block:
            return self.put_nowait(obj)

        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except BaseQueue.Full:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    def put_nowait(self, obj):
        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:
            pass
        elif self.full():
            raise BaseQueue.Full
        else:
            self.qsize_diff = 0
        return self.queue.put(obj)

    def get(self, block=True, timeout=None):
        try:
            ret = self.queue.get(block, timeout)
            return ret.payload
        except self.queue.Empty:
            raise BaseQueue.Empty

    def get_nowait(self):
        try:
            ret = self.queue.get_nowait()
            return ret.payload
        except self.queue.Empty:
            raise BaseQueue.Empty

    def delete(self):
        self.queue.queue.delete()

    def __del__(self):
        self.queue.close()


Queue = KombuQueue


================================================
FILE: pyspider/message_queue/rabbitmq.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<17175297.hk@gmail.com>
#         http://binux.me
# Created on 2012-11-15 17:27:54

import time
import socket
import select
import logging
import umsgpack
import threading

import amqp
from six.moves.urllib.parse import unquote
try:
    from urllib import parse as urlparse
except ImportError:
    import urlparse
from six.moves import queue as BaseQueue


def catch_error(func):
    """Catch errors of rabbitmq then reconnect"""
    import amqp
    try:
        import pika.exceptions
        connect_exceptions = (
            pika.exceptions.ConnectionClosed,
            pika.exceptions.AMQPConnectionError,
        )
    except ImportError:
        connect_exceptions = ()

    connect_exceptions += (
        select.error,
        socket.error,
        amqp.ConnectionError
    )

    def wrap(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except connect_exceptions as e:
            logging.error('RabbitMQ error: %r, reconnect.', e)
            self.reconnect()
            return func(self, *args, **kwargs)
    return wrap


class PikaQueue(object):
    """
    A Queue like rabbitmq connector
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
                 maxsize=0, lazy_limit=True):
        """
        Constructor for a PikaQueue.

        Not works with python 3. Default for python 2.

        amqp_url:   https://www.rabbitmq.com/uri-spec.html
        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        lazy_limit: as rabbitmq is shared between multipul instance, for a strict
                    limit on the number of items in the queue. PikaQueue have to
                    update current queue size before every put operation. When
                    `lazy_limit` is enabled, PikaQueue will check queue size every
                    max_size / 10 put operation for better performace.
        """
        self.name = name
        self.amqp_url = amqp_url
        self.maxsize = maxsize
        self.lock = threading.RLock()

        self.lazy_limit = lazy_limit
        if self.lazy_limit and self.maxsize:
            self.qsize_diff_limit = int(self.maxsize * 0.1)
        else:
            self.qsize_diff_limit = 0
        self.qsize_diff = 0

        self.reconnect()

    def reconnect(self):
        """Reconnect to rabbitmq server"""
        import pika
        import pika.exceptions

        self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))
        self.channel = self.connection.channel()
        try:
            self.channel.queue_declare(self.name)
        except pika.exceptions.ChannelClosed:
            self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))
            self.channel = self.connection.channel()
        #self.channel.queue_purge(self.name)

    @catch_error
    def qsize(self):
        with self.lock:
            ret = self.channel.queue_declare(self.name, passive=True)
        return ret.method.message_count

    def empty(self):
        if self.qsize() == 0:
            return True
        else:
            return False

    def full(self):
        if self.maxsize and self.qsize() >= self.maxsize:
            return True
        else:
            return False

    @catch_error
    def put(self, obj, block=True, timeout=None):
        if not block:
            return self.put_nowait()

        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except BaseQueue.Full:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    @catch_error
    def put_nowait(self, obj):
        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:
            pass
        elif self.full():
            raise BaseQueue.Full
        else:
            self.qsize_diff = 0
        with self.lock:
            self.qsize_diff += 1
            return self.channel.basic_publish("", self.name, umsgpack.packb(obj))

    @catch_error
    def get(self, block=True, timeout=None, ack=False):
        if not block:
            return self.get_nowait()

        start_time = time.time()
        while True:
            try:
                return self.get_nowait(ack)
            except BaseQueue.Empty:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    @catch_error
    def get_nowait(self, ack=False):
        with self.lock:
            method_frame, header_frame, body = self.channel.basic_get(self.name, not ack)
            if method_frame is None:
                raise BaseQueue.Empty
            if ack:
                self.channel.basic_ack(method_frame.delivery_tag)
        return umsgpack.unpackb(body)

    @catch_error
    def delete(self):
        with self.lock:
            return self.channel.queue_delete(queue=self.name)


class AmqpQueue(PikaQueue):
    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
                 maxsize=0, lazy_limit=True):
        """
        Constructor for a AmqpQueue.

        Default for python 3.

        amqp_url:   https://www.rabbitmq.com/uri-spec.html
        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        lazy_limit: as rabbitmq is shared between multipul instance, for a strict
                    limit on the number of items in the queue. PikaQueue have to
                    update current queue size before every put operation. When
                    `lazy_limit` is enabled, PikaQueue will check queue size every
                    max_size / 10 put operation for better performace.
        """
        self.name = name
        self.amqp_url = amqp_url
        self.maxsize = maxsize
        self.lock = threading.RLock()

        self.lazy_limit = lazy_limit
        if self.lazy_limit and self.maxsize:
            self.qsize_diff_limit = int(self.maxsize * 0.1)
        else:
            self.qsize_diff_limit = 0
        self.qsize_diff = 0

        self.reconnect()

    def reconnect(self):
        """Reconnect to rabbitmq server"""
        parsed = urlparse.urlparse(self.amqp_url)
        port = parsed.port or 5672
        self.connection = amqp.Connection(host="%s:%s" % (parsed.hostname, port),
                                          userid=parsed.username or 'guest',
                                          password=parsed.password or 'guest',
                                          virtual_host=unquote(
                                              parsed.path.lstrip('/') or '%2F')).connect()
        self.channel = self.connection.channel()
        try:
            self.channel.queue_declare(self.name)
        except amqp.exceptions.PreconditionFailed:
            pass
        #self.channel.queue_purge(self.name)

    @catch_error
    def qsize(self):
        with self.lock:
            name, message_count, consumer_count = self.channel.queue_declare(
                self.name, passive=True)
        return message_count

    @catch_error
    def put_nowait(self, obj):
        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:
            pass
        elif self.full():
            raise BaseQueue.Full
        else:
            self.qsize_diff = 0
        with self.lock:
            self.qsize_diff += 1
            msg = amqp.Message(umsgpack.packb(obj))
            return self.channel.basic_publish(msg, exchange="", routing_key=self.name)

    @catch_error
    def get_nowait(self, ack=False):
        with self.lock:
            message = self.channel.basic_get(self.name, not ack)
            if message is None:
                raise BaseQueue.Empty
            if ack:
                self.channel.basic_ack(message.delivery_tag)
        return umsgpack.unpackb(message.body)

Queue = PikaQueue


================================================
FILE: pyspider/message_queue/redis_queue.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-04-27 22:48:04

import time
import redis
import umsgpack
from six.moves import queue as BaseQueue


class RedisQueue(object):
    """
    A Queue like message built over redis
    """

    Empty = BaseQueue.Empty
    Full = BaseQueue.Full
    max_timeout = 0.3

    def __init__(self, name, host='localhost', port=6379, db=0,
                 maxsize=0, lazy_limit=True, password=None, cluster_nodes=None):
        """
        Constructor for RedisQueue

        maxsize:    an integer that sets the upperbound limit on the number of
                    items that can be placed in the queue.
        lazy_limit: redis queue is shared via instance, a lazy size limit is used
                    for better performance.
        """
        self.name = name
        if(cluster_nodes is not None):
            from rediscluster import StrictRedisCluster
            self.redis = StrictRedisCluster(startup_nodes=cluster_nodes)
        else:
            self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
        self.maxsize = maxsize
        self.lazy_limit = lazy_limit
        self.last_qsize = 0

    def qsize(self):
        self.last_qsize = self.redis.llen(self.name)
        return self.last_qsize

    def empty(self):
        if self.qsize() == 0:
            return True
        else:
            return False

    def full(self):
        if self.maxsize and self.qsize() >= self.maxsize:
            return True
        else:
            return False

    def put_nowait(self, obj):
        if self.lazy_limit and self.last_qsize < self.maxsize:
            pass
        elif self.full():
            raise self.Full
        self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj))
        return True

    def put(self, obj, block=True, timeout=None):
        if not block:
            return self.put_nowait(obj)

        start_time = time.time()
        while True:
            try:
                return self.put_nowait(obj)
            except self.Full:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

    def get_nowait(self):
        ret = self.redis.lpop(self.name)
        if ret is None:
            raise self.Empty
        return umsgpack.unpackb(ret)

    def get(self, block=True, timeout=None):
        if not block:
            return self.get_nowait()

        start_time = time.time()
        while True:
            try:
                return self.get_nowait()
            except self.Empty:
                if timeout:
                    lasted = time.time() - start_time
                    if timeout > lasted:
                        time.sleep(min(self.max_timeout, timeout - lasted))
                    else:
                        raise
                else:
                    time.sleep(self.max_timeout)

Queue = RedisQueue


================================================
FILE: pyspider/processor/__init__.py
================================================
from .processor import ProcessorResult, Processor


================================================
FILE: pyspider/processor/processor.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-16 22:59:56

import sys
import six
import time
import logging
import traceback
logger = logging.getLogger("processor")

from six.moves import queue as Queue
from pyspider.libs import utils
from pyspider.libs.log import LogFormatter
from pyspider.libs.utils import pretty_unicode, hide_me
from pyspider.libs.response import rebuild_response
from .project_module import ProjectManager, ProjectFinder


class ProcessorResult(object):
    """The result and logs producted by a callback"""

    def __init__(self, result=None, follows=(), messages=(),
                 logs=(), exception=None, extinfo=None, save=None):
        if extinfo is None:
            extinfo = {}
        self.result = result
        self.follows = follows
        self.messages = messages
        self.logs = logs
        self.exception = exception
        self.extinfo = extinfo
        self.save = save

    def rethrow(self):
        """rethrow the exception"""

        if self.exception:
            raise self.exception

    def logstr(self):
        """handler the log records to formatted string"""

        result = []
        formater = LogFormatter(color=False)
        for record in self.logs:
            if isinstance(record, six.string_types):
                result.append(pretty_unicode(record))
            else:
                if record.exc_info:
                    a, b, tb = record.exc_info
                    tb = hide_me(tb, globals())
                    record.exc_info = a, b, tb
                result.append(pretty_unicode(formater.format(record)))
                result.append(u'\n')
        return u''.join(result)


class Processor(object):
    PROCESS_TIME_LIMIT = 30
    EXCEPTION_LIMIT = 3

    RESULT_LOGS_LIMIT = 1000
    RESULT_RESULT_LIMIT = 10

    def __init__(self, projectdb, inqueue, status_queue, newtask_queue, result_queue,
                 enable_stdout_capture=True,
                 enable_projects_import=True,
                 process_time_limit=PROCESS_TIME_LIMIT):
        self.inqueue = inqueue
        self.status_queue = status_queue
        self.newtask_queue = newtask_queue
        self.result_queue = result_queue
        self.projectdb = projectdb
        self.enable_stdout_capture = enable_stdout_capture

        self._quit = False
        self._exceptions = 10
        self.project_manager = ProjectManager(projectdb, dict(
            result_queue=self.result_queue,
            enable_stdout_capture=self.enable_stdout_capture,
            process_time_limit=process_time_limit,
        ))

        if enable_projects_import:
            self.enable_projects_import()

    def enable_projects_import(self):
        '''
        Enable import other project as module

        `from project import project_name`
        '''
        sys.meta_path.append(ProjectFinder(self.projectdb))

    def __del__(self):
        pass

    def on_task(self, task, response):
        '''Deal one task'''
        start_time = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('project_updatetime', None)
            md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(project, updatetime, md5sum)
            assert project_data, "no such project!"
            if project_data.get('exception'):
                ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                      exception=project_data['exception'])
            else:
                ret = project_data['instance'].run_task(
                    project_data['module'], task, response)
        except Exception as e:
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr, ), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'redirect_url': response.url if response.url != response.orig_url else None,
                        'time': response.time,
                        'error': response.error,
                        'status_code': response.status_code,
                        'encoding': getattr(response, '_encoding', None),
                        'headers': track_headers,
                        'content': response.text[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result': (
                            None if ret.result is None
                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                        ),
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                    'save': ret.save,
                },
            }
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
                self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])

        for project, msg, url in ret.messages:
            try:
                self.on_task({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                    }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                })
            except Exception as e:
                logger.exception('Sending message error.')
                continue

        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True

    def quit(self):
        '''Set quit signal'''
        self._quit = True

    def run(self):
        '''Run loop'''
        logger.info("processor starting...")

        while not self._quit:
            try:
                task, response = self.inqueue.get(timeout=1)
                self.on_task(task, response)
                self._exceptions = 0
            except Queue.Empty as e:
                continue
            except KeyboardInterrupt:
                break
            except Exception as e:
                logger.exception(e)
                self._exceptions += 1
                if self._exceptions > self.EXCEPTION_LIMIT:
                    break
                continue

        logger.info("processor exiting...")


================================================
FILE: pyspider/processor/project_module.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-16 22:24:20

import os
import six
import sys
import imp
import time
import weakref
import logging
import inspect
import traceback
import linecache
from pyspider.libs import utils
from pyspider.libs.log import SaveLogHandler, LogFormatter
logger = logging.getLogger("processor")


class ProjectManager(object):
    """
    load projects from projectdb, update project
    """

    CHECK_PROJECTS_INTERVAL = 5 * 60
    RELOAD_PROJECT_INTERVAL = 60 * 60

    @staticmethod
    def build_module(project, env=None):
        '''Build project script as module'''
        from pyspider.libs import base_handler
        assert 'name' in project, 'need name of project'
        assert 'script' in project, 'need script of project'

        if env is None:
            env = {}
        # fix for old non-package version scripts
        pyspider_path = os.path.join(os.path.dirname(__file__), "..")
        if pyspider_path not in sys.path:
            sys.path.insert(1, pyspider_path)

        env = dict(env)
        env.update({
            'debug': project.get('status', 'DEBUG') == 'DEBUG',
        })

        loader = ProjectLoader(project)
        module = loader.load_module(project['name'])

        # logger inject
        module.log_buffer = []
        module.logging = module.logger = logging.Logger(project['name'])
        if env.get('enable_stdout_capture', True):
            handler = SaveLogHandler(module.log_buffer)
            handler.setFormatter(LogFormatter(color=False))
        else:
            handler = logging.StreamHandler()
            handler.setFormatter(LogFormatter(color=True))
        module.logger.addHandler(handler)

        if '__handler_cls__' not in module.__dict__:
            BaseHandler = module.__dict__.get('BaseHandler', base_handler.BaseHandler)
            for each in list(six.itervalues(module.__dict__)):
                if inspect.isclass(each) and each is not BaseHandler \
                        and issubclass(each, BaseHandler):
                    module.__dict__['__handler_cls__'] = each
        _class = module.__dict__.get('__handler_cls__')
        assert _class is not None, "need BaseHandler in project module"

        instance = _class()
        instance.__env__ = env
        instance.project_name = project['name']
        instance.project = project

        return {
            'loader': loader,
            'module': module,
            'class': _class,
            'instance': instance,
            'exception': None,
            'exception_log': '',
            'info': project,
            'load_time': time.time(),
        }

    def __init__(self, projectdb, env):
        self.projectdb = projectdb
        self.env = env

        self.projects = {}
        self.last_check_projects = time.time()

    def _need_update(self, project_name, updatetime=None, md5sum=None):
        '''Check if project_name need update'''
        if project_name not in self.projects:
            return True
        elif md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'):
            return True
        elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0):
            return True
        elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL:
            return True
        return False

    def _check_projects(self):
        '''Check projects by last update time'''
        for project in self.projectdb.check_update(self.last_check_projects,
                                                   ['name', 'updatetime']):
            if project['name'] not in self.projects:
                continue
            if project['updatetime'] > self.projects[project['name']]['info'].get('updatetime', 0):
                self._update_project(project['name'])
        self.last_check_projects = time.time()

    def _update_project(self, project_name):
        '''Update one project from database'''
        project = self.projectdb.get(project_name)
        if not project:
            return None
        return self._load_project(project)

    def _load_project(self, project):
        '''Load project into self.projects from project info dict'''
        try:
            project['md5sum'] = utils.md5string(project['script'])
            ret = self.build_module(project, self.env)
            self.projects[project['name']] = ret
        except Exception as e:
            logger.exception("load project %s error", project.get('name', None))
            ret = {
                'loader': None,
                'module': None,
                'class': None,
                'instance': None,
                'exception': e,
                'exception_log': traceback.format_exc(),
                'info': project,
                'load_time': time.time(),
            }
            self.projects[project['name']] = ret
            return False
        logger.debug('project: %s updated.', project.get('name', None))
        return True

    def get(self, project_name, updatetime=None, md5sum=None):
        '''get project data object, return None if not exists'''
        if time.time() - self.last_check_projects > self.CHECK_PROJECTS_INTERVAL:
            self._check_projects()
        if self._need_update(project_name, updatetime, md5sum):
            self._update_project(project_name)
        return self.projects.get(project_name, None)


class ProjectLoader(object):
    '''ProjectLoader class for sys.meta_path'''

    def __init__(self, project, mod=None):
        self.project = project
        self.name = project['name']
        self.mod = mod
        pass

    def load_module(self, fullname):
        if self.mod is None:
            self.mod = mod = imp.new_module(fullname)
        else:
            mod = self.mod
        mod.__file__ = '<%s>' % self.name
        mod.__loader__ = self
        mod.__project__ = self.project
        mod.__package__ = ''
        code = self.get_code(fullname)
        six.exec_(code, mod.__dict__)
        linecache.clearcache()
        if sys.version_info[:2] == (3, 3):
            sys.modules[fullname] = mod
        return mod

    def is_package(self, fullname):
        return False

    def get_code(self, fullname):
        return compile(self.get_source(fullname), '<%s>' % self.name, 'exec')

    def get_source(self, fullname):
        script = self.project['script']
        if isinstance(script, six.text_type):
            return script.encode('utf8')
        return script


if six.PY2:
    class ProjectFinder(object):
        '''ProjectFinder class for sys.meta_path'''

        def __init__(self, projectdb):
            self.get_projectdb = weakref.ref(projectdb)

        @property
        def projectdb(self):
            return self.get_projectdb()

        def find_module(self, fullname, path=None):
            if fullname == 'projects':
                return self
            parts = fullname.split('.')
            if len(parts) == 2 and parts[0] == 'projects':
                name = parts[1]
                if not self.projectdb:
                    return
                info = self.projectdb.get(name)
                if info:
                    return ProjectLoader(info)

        def load_module(self, fullname):
            mod = imp.new_module(fullname)
            mod.__file__ = '<projects>'
            mod.__loader__ = self
            mod.__path__ = ['<projects>']
            mod.__package__ = 'projects'
            return mod

        def is_package(self, fullname):
            return True
else:
    import importlib.abc

    class ProjectFinder(importlib.abc.MetaPathFinder):
        '''ProjectFinder class for sys.meta_path'''

        def __init__(self, projectdb):
            self.get_projectdb = weakref.ref(projectdb)

        @property
        def projectdb(self):
            return self.get_projectdb()

        def find_spec(self, fullname, path, target=None):
            loader = self.find_module(fullname, path)
            if loader:
                return importlib.util.spec_from_loader(fullname, loader)

        def find_module(self, fullname, path):
            if fullname == 'projects':
                return ProjectsLoader()
            parts = fullname.split('.')
            if len(parts) == 2 and parts[0] == 'projects':
                name = parts[1]
                if not self.projectdb:
                    return
                info = self.projectdb.get(name)
                if info:
                    return ProjectLoader(info)

    class ProjectsLoader(importlib.abc.InspectLoader):
        def load_module(self, fullname):
            mod = imp.new_module(fullname)
            mod.__file__ = '<projects>'
            mod.__loader__ = self
            mod.__path__ = ['<projects>']
            mod.__package__ = 'projects'
            if sys.version_info[:2] == (3, 3):
                sys.modules[fullname] = mod
            return mod

        def module_repr(self, module):
            return '<Module projects>'

        def is_package(self, fullname):
            return True

        def get_source(self, path):
            return ''

        def get_code(self, fullname):
            return compile(self.get_source(fullname), '<projects>', 'exec')

    class ProjectLoader(ProjectLoader, importlib.abc.Loader):
        def create_module(self, spec):
            return self.load_module(spec.name)

        def exec_module(self, module):
            return module

        def module_repr(self, module):
            return '<Module projects.%s>' % self.name


================================================
FILE: pyspider/result/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-19 16:10:19

from .result_worker import ResultWorker, OneResultWorker


================================================
FILE: pyspider/result/result_worker.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-19 15:37:46

import time
import json
import logging
from six.moves import queue as Queue
logger = logging.getLogger("result")


class ResultWorker(object):

    """
    do with result
    override this if needed.
    """

    def __init__(self, resultdb, inqueue):
        self.resultdb = resultdb
        self.inqueue = inqueue
        self._quit = False

    def on_result(self, task, result):
        '''Called every result'''
        if not result:
            return
        if 'taskid' in task and 'project' in task and 'url' in task:
            logger.info('result %s:%s %s -> %.30r' % (
                task['project'], task['taskid'], task['url'], result))
            return self.resultdb.save(
                project=task['project'],
                taskid=task['taskid'],
                url=task['url'],
                result=result
            )
        else:
            logger.warning('result UNKNOW -> %.30r' % result)
            return

    def quit(self):
        self._quit = True

    def run(self):
        '''Run loop'''
        logger.info("result_worker starting...")

        while not self._quit:
            try:
                task, result = self.inqueue.get(timeout=1)
                self.on_result(task, result)
            except Queue.Empty as e:
                continue
            except KeyboardInterrupt:
                break
            except AssertionError as e:
                logger.error(e)
                continue
            except Exception as e:
                logger.exception(e)
                continue

        logger.info("result_worker exiting...")


class OneResultWorker(ResultWorker):
    '''Result Worker for one mode, write results to stdout'''
    def on_result(self, task, result):
        '''Called every result'''
        if not result:
            return
        if 'taskid' in task and 'project' in task and 'url' in task:
            logger.info('result %s:%s %s -> %.30r' % (
                task['project'], task['taskid'], task['url'], result))
            print(json.dumps({
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task['url'],
                'result': result,
                'updatetime': time.time()
            }))
        else:
            logger.warning('result UNKNOW -> %.30r' % result)
            return


================================================
FILE: pyspider/run.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-03-05 00:11:49


import os
import sys
import six
import copy
import time
import shutil
import logging
import logging.config

import click
import pyspider
from pyspider.message_queue import connect_message_queue
from pyspider.database import connect_database
from pyspider.libs import utils


def read_config(ctx, param, value):
    if not value:
        return {}
    import json

    def underline_dict(d):
        if not isinstance(d, dict):
            return d
        return dict((k.replace('-', '_'), underline_dict(v)) for k, v in six.iteritems(d))

    config = underline_dict(json.load(value))
    ctx.default_map = config
    return config


def connect_db(ctx, param, value):
    if not value:
        return
    return utils.Get(lambda: connect_database(value))


def load_cls(ctx, param, value):
    if isinstance(value, six.string_types):
        return utils.load_object(value)
    return value


def connect_rpc(ctx, param, value):
    if not value:
        return
    try:
        from six.moves import xmlrpc_client
    except ImportError:
        import xmlrpclib as xmlrpc_client
    return xmlrpc_client.ServerProxy(value, allow_none=True)


@click.group(invoke_without_command=True)
@click.option('-c', '--config', callback=read_config, type=click.File('r'),
              help='a json file with default values for subcommands. {"webui": {"port":5001}}')
@click.option('--logging-config', default=os.path.join(os.path.dirname(__file__), "logging.conf"),
              help="logging config file for built-in python logging module", show_default=True)
@click.option('--debug', envvar='DEBUG', default=False, is_flag=True, help='debug mode')
@click.option('--queue-maxsize', envvar='QUEUE_MAXSIZE', default=100,
              help='maxsize of queue')
@click.option('--taskdb', envvar='TASKDB', callback=connect_db,
              help='database url for taskdb, default: sqlite')
@click.option('--projectdb', envvar='PROJECTDB', callback=connect_db,
              help='database url for projectdb, default: sqlite')
@click.option('--resultdb', envvar='RESULTDB', callback=connect_db,
              help='database url for resultdb, default: sqlite')
@click.option('--message-queue', envvar='AMQP_URL',
              help='connection url to message queue, '
              'default: builtin multiprocessing.Queue')
@click.option('--amqp-url', help='[deprecated] amqp url for rabbitmq. '
              'please use --message-queue instead.')
@click.option('--beanstalk', envvar='BEANSTALK_HOST',
              help='[deprecated] beanstalk config for beanstalk queue. '
              'please use --message-queue instead.')
@click.option('--phantomjs-proxy', envvar='PHANTOMJS_PROXY', help="phantomjs proxy ip:port")
@click.option('--puppeteer-proxy', envvar='PUPPETEER_PROXY', help="puppeteer proxy ip:port")
@click.option('--data-path', default='./data', help='data dir path')
@click.option('--add-sys-path/--not-add-sys-path', default=True, is_flag=True,
              help='add current working directory to python lib search path')
@click.version_option(version=pyspider.__version__, prog_name=pyspider.__name__)
@click.pass_context
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    if kwargs['add_sys_path']:
        sys.path.append(os.getcwd())

    logging.config.fileConfig(kwargs['logging_config'])

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'sqlalchemy+mysql+%s://%s:%s/%s' % (
                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'mongodb+%s://%s:%s/%s' % (
                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif os.environ.get('COUCHDB_NAME'):
            kwargs[db] = utils.Get(lambda db=db: connect_database(
                'couchdb+%s://%s:%s/%s' % (
                    db,
                    os.environ['COUCHDB_PORT_5984_TCP_ADDR'] or 'couchdb',
                    os.environ['COUCHDB_PORT_5984_TCP_PORT'] or '5984',
                    db)))
        elif ctx.invoked_subcommand == 'bench':
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            elif db in ('projectdb', ):
                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
        else:
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))
            kwargs['is_%s_default' % db] = True

    # create folder for counter.dump
    if not os.path.exists(kwargs['data_path']):
        os.mkdir(kwargs['data_path'])

    # message queue, compatible with old version
    if kwargs.get('message_queue'):
        pass
    elif kwargs.get('amqp_url'):
        kwargs['message_queue'] = kwargs['amqp_url']
    elif os.environ.get('RABBITMQ_NAME'):
        kwargs['message_queue'] = ("amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                                   ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)

    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        if kwargs.get('message_queue'):
            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(
                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))
        else:
            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),
                                                 kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]

    # puppeteer-proxy
    if kwargs.get('puppeteer_proxy'):
        pass
    elif os.environ.get('PUPPETEER_NAME'):
        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]

    ctx.obj = utils.ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx


@cli.command()
@click.option('--xmlrpc', is_flag=True, help="Enable xmlrpc (Default=True)")
@click.option('--no-xmlrpc', is_flag=True, help="Disable xmlrpc")
@click.option('--xmlrpc-host', default='0.0.0.0')
@click.option('--xmlrpc-port', envvar='SCHEDULER_XMLRPC_PORT', default=23333)
@click.option('--inqueue-limit', default=0,
              help='size limit of task queue for each project, '
              'tasks will been ignored when overflow')
@click.option('--delete-time', default=24 * 60 * 60,
              help='delete time before marked as delete')
@click.option('--active-tasks', default=100, help='active log size')
@click.option('--loop-limit', default=1000, help='maximum number of tasks due with in a loop')
@click.option('--fail-pause-num', default=10, help='auto pause the project when last FAIL_PAUSE_NUM task failed, set 0 to disable')
@click.option('--scheduler-cls', default='pyspider.scheduler.ThreadBaseScheduler', callback=load_cls,
              help='scheduler class to be used.')
@click.option('--threads', default=None, help='thread number for ThreadBaseScheduler, default: 4')
@click.pass_context
def scheduler(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port,
              inqueue_limit, delete_time, active_tasks, loop_limit, fail_pause_num,
              scheduler_cls, threads, get_object=False):
    """
    Run Scheduler, only one scheduler is allowed.
    """
    g = ctx.obj
    Scheduler = load_cls(None, None, scheduler_cls)

    kwargs = dict(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
                  newtask_queue=g.newtask_queue, status_queue=g.status_queue,
                  out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data'))
    if threads:
        kwargs['threads'] = int(threads)

    scheduler = Scheduler(**kwargs)
    scheduler.INQUEUE_LIMIT = inqueue_limit
    scheduler.DELETE_TIME = delete_time
    scheduler.ACTIVE_TASKS = active_tasks
    scheduler.LOOP_LIMIT = loop_limit
    scheduler.FAIL_PAUSE_NUM = fail_pause_num

    g.instances.append(scheduler)
    if g.get('testing_mode') or get_object:
        return scheduler

    if not no_xmlrpc:
        utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)

    scheduler.run()


@cli.command()
@click.option('--xmlrpc', is_flag=True, help="Enable xmlrpc (Default=True)")
@click.option('--no-xmlrpc', is_flag=True, help="Disable xmlrpc")
@click.option('--xmlrpc-host', default='0.0.0.0')
@click.option('--xmlrpc-port', envvar='FETCHER_XMLRPC_PORT', default=24444)
@click.option('--poolsize', default=100, help="max simultaneous fetches")
@click.option('--proxy', help="proxy host:port")
@click.option('--user-agent', help='user agent')
@click.option('--timeout', help='default fetch timeout')
@click.option('--phantomjs-endpoint', help="endpoint of phantomjs, start via pyspider phantomjs")
@click.option('--puppeteer-endpoint', help="endpoint of puppeteer, start via pyspider puppeteer")
@click.option('--splash-endpoint', help="execute endpoint of splash: http://splash.readthedocs.io/en/stable/api.html#execute")
@click.option('--fetcher-cls', default='pyspider.fetcher.Fetcher', callback=load_cls,
              help='Fetcher class to be used.')
@click.pass_context
def fetcher(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent,
            timeout, phantomjs_endpoint, puppeteer_endpoint, splash_endpoint, fetcher_cls,
            async_mode=True, get_object=False, no_input=False):
    """
    Run Fetcher.
    """
    g = ctx.obj
    Fetcher = load_cls(None, None, fetcher_cls)

    if no_input:
        inqueue = None
        outqueue = None
    else:
        inqueue = g.scheduler2fetcher
        outqueue = g.fetcher2processor
    fetcher = Fetcher(inqueue=inqueue, outqueue=outqueue,
                      poolsize=poolsize, proxy=proxy, async_mode=async_mode)
    fetcher.phantomjs_proxy = phantomjs_endpoint or g.phantomjs_proxy
    fetcher.puppeteer_proxy = puppeteer_endpoint or g.puppeteer_proxy
    fetcher.splash_endpoint = splash_endpoint
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = copy.deepcopy(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode') or get_object:
        return fetcher

    if not no_xmlrpc:
        utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)

    fetcher.run()


@cli.command()
@click.option('--processor-cls', default='pyspider.processor.Processor',
              callback=load_cls, help='Processor class to be used.')
@click.option('--process-time-limit', default=30, help='script process time limit')
@click.pass_context
def processor(ctx, processor_cls, process_time_limit, enable_stdout_capture=True, get_object=False):
    """
    Run Processor.
    """
    g = ctx.obj
    Processor = load_cls(None, None, processor_cls)

    processor = Processor(projectdb=g.projectdb,
                          inqueue=g.fetcher2processor, status_queue=g.status_queue,
                          newtask_queue=g.newtask_queue, result_queue=g.processor2result,
                          enable_stdout_capture=enable_stdout_capture,
                          process_time_limit=process_time_limit)

    g.instances.append(processor)
    if g.get('testing_mode') or get_object:
        return processor

    processor.run()


@cli.command()
@click.option('--result-cls', default='pyspider.result.ResultWorker', callback=load_cls,
              help='ResultWorker class to be used.')
@click.pass_context
def result_worker(ctx, result_cls, get_object=False):
    """
    Run result worker.
    """
    g = ctx.obj
    ResultWorker = load_cls(None, None, result_cls)

    result_worker = ResultWorker(resultdb=g.resultdb, inqueue=g.processor2result)

    g.instances.append(result_worker)
    if g.get('testing_mode') or get_object:
        return result_worker

    result_worker.run()


@cli.command()
@click.option('--host', default='0.0.0.0', envvar='WEBUI_HOST',
              help='webui bind to host')
@click.option('--port', default=5000, envvar='WEBUI_PORT',
              help='webui bind to host')
@click.option('--cdn', default='//cdnjs.cloudflare.com/ajax/libs/',
              help='js/css cdn server')
@click.option('--scheduler-rpc', help='xmlrpc path of scheduler')
@click.option('--fetcher-rpc', help='xmlrpc path of fetcher')
@click.option('--max-rate', type=float, help='max rate for each project')
@click.option('--max-burst', type=float, help='max burst for each project')
@click.option('--username', envvar='WEBUI_USERNAME',
              help='username of lock -ed projects')
@click.option('--password', envvar='WEBUI_PASSWORD',
              help='password of lock -ed projects')
@click.option('--need-auth', is_flag=True, default=False, help='need username and password')
@click.option('--webui-instance', default='pyspider.webui.app.app', callback=load_cls,
              help='webui Flask Application instance to be used.')
@click.option('--process-time-limit', default=30, help='script process time limit in debug')
@click.pass_context
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst,
          username, password, need_auth, webui_instance, process_time_limit, get_object=False):
    """
    Run WebUI
    """
    app = load_cls(None, None, webui_instance)

    g = ctx.obj
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password
    app.config['need_auth'] = need_auth
    app.config['process_time_limit'] = process_time_limit

    # inject queues for webui
    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        app.config['queues'][name] = getattr(g, name, None)

    # fetcher rpc
    if isinstance(fetcher_rpc, six.string_types):
        import umsgpack
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)
    else:
        # get fetcher instance for webui
        fetcher_config = g.config.get('fetcher', {})
        webui_fetcher = ctx.invoke(fetcher, async_mode=False, get_object=True, no_input=True, **fetcher_config)

        app.config['fetch'] = lambda x: webui_fetcher.fetch(x)

    # scheduler rpc
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'):
        app.config['scheduler_rpc'] = connect_rpc(ctx, None,
                                                  'http://{}:{}/'.format(os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'),
                                                                         os.environ.get('SCHEDULER_PORT_23333_TCP_PORT') or 23333))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc


    app.debug = g.debug
    g.instances.append(app)
    if g.get('testing_mode') or get_object:
        return app

    app.run(host=host, port=port)


@cli.command()
@click.option('--phantomjs-path', default='phantomjs', help='phantomjs path')
@click.option('--port', default=25555, help='phantomjs port')
@click.option('--auto-restart', default=False, help='auto restart phantomjs if crashed')
@click.argument('args', nargs=-1)
@click.pass_context
def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
    """
    Run phantomjs fetcher if phantomjs is installed.
    """
    args = args or ctx.default_map and ctx.default_map.get('args', [])

    import subprocess
    g = ctx.obj
    _quit = []
    phantomjs_fetcher = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
    cmd = [phantomjs_path,
           # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
           #'--load-images=false',
           '--ssl-protocol=any',
           '--disk-cache=true'] + list(args or []) + [phantomjs_fetcher, str(port)]

    try:
        _phantomjs = subprocess.Popen(cmd)
    except OSError:
        logging.warning('phantomjs not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        _quit.append(1)
        _phantomjs.kill()
        _phantomjs.wait()
        logging.info('phantomjs exited.')

    if not g.get('phantomjs_proxy'):
        g['phantomjs_proxy'] = '127.0.0.1:%s' % port

    phantomjs = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(phantomjs)
    if g.get('testing_mode'):
        return phantomjs

    while True:
        _phantomjs.wait()
        if _quit or not auto_restart:
            break
        _phantomjs = subprocess.Popen(cmd)

@cli.command()
@click.option('--port', default=22222, help='puppeteer port')
@click.option('--auto-restart', default=False, help='auto restart puppeteer if crashed')
@click.argument('args', nargs=-1)
@click.pass_context
def puppeteer(ctx, port, auto_restart, args):
    """
    Run puppeteer fetcher if puppeteer is installed.
    """

    import subprocess
    g = ctx.obj
    _quit = []
    puppeteer_fetcher = os.path.join(
        os.path.dirname(pyspider.__file__), 'fetcher/puppeteer_fetcher.js')

    cmd = ['node', puppeteer_fetcher, str(port)]
    try:
        _puppeteer = subprocess.Popen(cmd)
    except OSError:
        logging.warning('puppeteer not found, continue running without it.')
        return None

    def quit(*args, **kwargs):
        _quit.append(1)
        _puppeteer.kill()
        _puppeteer.wait()
        logging.info('puppeteer exited.')

    if not g.get('puppeteer_proxy'):
        g['puppeteer_proxy'] = '127.0.0.1:%s' % port

    puppeteer = utils.ObjectDict(port=port, quit=quit)
    g.instances.append(puppeteer)
    if g.get('testing_mode'):
        return puppeteer

    while True:
        _puppeteer.wait()
        if _quit or not auto_restart:
            break
        _puppeteer = subprocess.Popen(cmd)


@cli.command()
@click.option('--fetcher-num', default=1, help='instance num of fetcher')
@click.option('--processor-num', default=1, help='instance num of processor')
@click.option('--result-worker-num', default=1,
              help='instance num of result worker')
@click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),
              help='run each components in thread or subprocess. '
              'always using thread for windows.')
@click.pass_context
def all(ctx, fetcher_num, processor_num, result_worker_num, run_in):
    """
    Run all the components in subprocess or thread
    """

    ctx.obj['debug'] = False
    g = ctx.obj

    # FIXME: py34 cannot run components with threads
    if run_in == 'subprocess' and os.name != 'nt':
        run_in = utils.run_in_subprocess
    else:
        run_in = utils.run_in_thread

    threads = []

    try:
        # phantomjs
        if not g.get('phantomjs_proxy'):
            phantomjs_config = g.config.get('phantomjs', {})
            phantomjs_config.setdefault('auto_restart', True)
            threads.append(run_in(ctx.invoke, phantomjs, **phantomjs_config))
            time.sleep(2)
            if threads[-1].is_alive() and not g.get('phantomjs_proxy'):
                g['phantomjs_proxy'] = '127.0.0.1:%s' % phantomjs_config.get('port', 25555)

        # puppeteer
        if not g.get('puppeteer_proxy'):
            puppeteer_config = g.config.get('puppeteer', {})
            puppeteer_config.setdefault('auto_restart', True)
            threads.append(run_in(ctx.invoke, puppeteer, **puppeteer_config))
            time.sleep(2)
            if threads[-1].is_alive() and not g.get('puppeteer_proxy'):
                g['puppeteer_proxy'] = '127.0.0.1:%s' % puppeteer_config.get('port', 22222)

        # result worker
        result_worker_config = g.config.get('result_worker', {})
        for i in range(result_worker_num):
            threads.append(run_in(ctx.invoke, result_worker, **result_worker_config))

        # processor
        processor_config = g.config.get('processor', {})
        for i in range(processor_num):
            threads.append(run_in(ctx.invoke, processor, **processor_config))

        # fetcher
        fetcher_config = g.config.get('fetcher', {})
        fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')
        for i in range(fetcher_num):
            threads.append(run_in(ctx.invoke, fetcher, **fetcher_config))

        # scheduler
        scheduler_config = g.config.get('scheduler', {})
        scheduler_config.setdefault('xmlrpc_host', '127.0.0.1')
        threads.append(run_in(ctx.invoke, scheduler, **scheduler_config))

        # running webui in main thread to make it exitable
        webui_config = g.config.get('webui', {})
        webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/'
                                % g.config.get('scheduler', {}).get('xmlrpc_port', 23333))
        ctx.invoke(webui, **webui_config)
    finally:
        # exit components run in threading
        for each in g.instances:
            each.quit()

        # exit components run in subprocess
        for each in threads:
            if not each.is_alive():
                continue
            if hasattr(each, 'terminate'):
                each.terminate()
            each.join()


@cli.command()
@click.option('--fetcher-num', default=1, help='instance num of fetcher')
@click.option('--processor-num', default=2, help='instance num of processor')
@click.option('--result-worker-num', default=1, help='instance num of result worker')
@click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),
              help='run each components in thread or subprocess. '
              'always using thread for windows.')
@click.option('--total', default=10000, help="total url in test page")
@click.option('--show', default=20, help="show how many urls in a page")
@click.option('--taskdb-bench', default=False, is_flag=True,
              help="only run taskdb bench test")
@click.option('--message-queue-bench', default=False, is_flag=True,
              help="only run message queue bench test")
@click.option('--all-bench', default=False, is_flag=True,
              help="only run all bench test")
@click.pass_context
def bench(ctx, fetcher_num, processor_num, result_worker_num, run_in, total, show,
          taskdb_bench, message_queue_bench, all_bench):
    """
    Run Benchmark test.
    In bench mode, in-memory sqlite database is used instead of on-disk sqlite database.
    """
    from pyspider.libs import bench
    from pyspider.webui import bench_test  # flake8: noqa

    ctx.obj['debug'] = False
    g = ctx.obj
    if result_worker_num == 0:
        g['processor2result'] = None

    if run_in == 'subprocess' and os.name != 'nt':
        run_in = utils.run_in_subprocess
    else:
        run_in = utils.run_in_thread

    all_test = not taskdb_bench and not message_queue_bench and not all_bench

    # test taskdb
    if all_test or taskdb_bench:
        bench.bench_test_taskdb(g.taskdb)
    # test message queue
    if all_test or message_queue_bench:
        bench.bench_test_message_queue(g.scheduler2fetcher)
    # test all
    if not all_test and not all_bench:
        return

    project_name = 'bench'

    def clear_project():
        g.taskdb.drop(project_name)
        g.resultdb.drop(project_name)

    clear_project()

    # disable log
    logging.getLogger().setLevel(logging.ERROR)
    logging.getLogger('scheduler').setLevel(logging.ERROR)
    logging.getLogger('fetcher').setLevel(logging.ERROR)
    logging.getLogger('processor').setLevel(logging.ERROR)
    logging.getLogger('result').setLevel(logging.ERROR)
    logging.getLogger('webui').setLevel(logging.ERROR)
    logging.getLogger('werkzeug').setLevel(logging.ERROR)

    try:
        threads = []

        # result worker
        result_worker_config = g.config.get('result_worker', {})
        for i in range(result_worker_num):
            threads.append(run_in(ctx.invoke, result_worker,
                                  result_cls='pyspider.libs.bench.BenchResultWorker',
                                  **result_worker_config))

        # processor
        processor_config = g.config.get('processor', {})
        for i in range(processor_num):
            threads.append(run_in(ctx.invoke, processor,
                                  processor_cls='pyspider.libs.bench.BenchProcessor',
                                  **processor_config))

        # fetcher
        fetcher_config = g.config.get('fetcher', {})
        fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')
        for i in range(fetcher_num):
            threads.append(run_in(ctx.invoke, fetcher,
                                  fetcher_cls='pyspider.libs.bench.BenchFetcher',
                                  **fetcher_config))

        # webui
        webui_config = g.config.get('webui', {})
        webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/'
                                % g.config.get('scheduler', {}).get('xmlrpc_port', 23333))
        threads.append(run_in(ctx.invoke, webui, **webui_config))

        # scheduler
        scheduler_config = g.config.get('scheduler', {})
        scheduler_config.setdefault('xmlrpc_host', '127.0.0.1')
        scheduler_config.setdefault('xmlrpc_port', 23333)
        threads.append(run_in(ctx.invoke, scheduler,
                              scheduler_cls='pyspider.libs.bench.BenchScheduler',
                              **scheduler_config))
        scheduler_rpc = connect_rpc(ctx, None,
                                    'http://%(xmlrpc_host)s:%(xmlrpc_port)s/' % scheduler_config)

        for _ in range(20):
            if utils.check_port_open(23333):
                break
            time.sleep(1)

        scheduler_rpc.newtask({
            "project": project_name,
            "taskid": "on_start",
            "url": "data:,on_start",
            "fetch": {
                "save": {"total": total, "show": show}
            },
            "process": {
                "callback": "on_start",
            },
        })

        # wait bench test finished
        while True:
            time.sleep(1)
            if scheduler_rpc.size() == 0:
                break
    finally:
        # exit components run in threading
        for each in g.instances:
            each.quit()

        # exit components run in subprocess
        for each in threads:
            if hasattr(each, 'terminate'):
                each.terminate()
            each.join(1)

        clear_project()


@cli.command()
@click.option('-i', '--interactive', default=False, is_flag=True,
              help='enable interactive mode, you can choose crawl url.')
@click.option('--phantomjs', 'enable_phantomjs', default=False, is_flag=True,
              help='enable phantomjs, will spawn a subprocess for phantomjs')
@click.option('--puppeteer', 'enable_puppeteer', default=False, is_flag=True,
              help='enable puppeteer, will spawn a subprocess for puppeteer')
@click.argument('scripts', nargs=-1)
@click.pass_context
def one(ctx, interactive, enable_phantomjs, enable_puppeteer, scripts):
    """
    One mode not only means all-in-one, it runs every thing in one process over
    tornado.ioloop, for debug purpose
    """

    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    if enable_puppeteer:
        puppeteer_config = g.config.get('puppeteer', {})
        puppeteer_obj = ctx.invoke(puppeteer, **puppeteer_config)
        if puppeteer_obj:
            g.setdefault('puppeteer_proxy', '127.0.0.1:%s' % puppeteer.port)
    else:
        puppeteer_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        result_worker_config.setdefault('result_cls',
                                        'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        if puppeteer_obj:
            puppeteer_obj.quit()


@cli.command()
@click.option('--scheduler-rpc', callback=connect_rpc, help='xmlrpc path of scheduler')
@click.argument('project', nargs=1)
@click.argument('message', nargs=1)
@click.pass_context
def send_message(ctx, scheduler_rpc, project, message):
    """
    Send Message to project from command line
    """
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'):
        scheduler_rpc = connect_rpc(ctx, None, 'http://%s:%s/' % (os.environ['SCHEDULER_PORT_23333_TCP_ADDR'],
                                                                  os.environ['SCHEDULER_PORT_23333_TCP_PORT'] or 23333))
    if scheduler_rpc is None:
        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')

    return scheduler_rpc.send_task({
        'taskid': utils.md5string('data:,on_message'),
        'project': project,
        'url': 'data:,on_message',
        'fetch': {
            'save': ('__command__', message),
        },
        'process': {
            'callback': '_on_message',
        }
    })


def main():
    cli()

if __name__ == '__main__':
    main()


================================================
FILE: pyspider/scheduler/__init__.py
================================================
from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler  # NOQA


================================================
FILE: pyspider/scheduler/scheduler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-07 17:05:11


import itertools
import json
import logging
import os
import time
from collections import deque

from six import iteritems, itervalues
from six.moves import queue as Queue

from pyspider.libs import counter, utils
from pyspider.libs.base_handler import BaseHandler
from .task_queue import TaskQueue

logger = logging.getLogger('scheduler')


class Project(object):
    '''
    project for scheduler
    '''
    def __init__(self, scheduler, project_info):
        '''
        '''
        self.scheduler = scheduler

        self.active_tasks = deque(maxlen=scheduler.ACTIVE_TASKS)
        self.task_queue = TaskQueue()
        self.task_loaded = False
        self._selected_tasks = False  # selected tasks after recent pause
        self._send_finished_event_wait = 0  # wait for scheduler.FAIL_PAUSE_NUM loop steps before sending the event

        self.md5sum = None
        self._send_on_get_info = False
        self.waiting_get_info = True

        self._paused = False
        self._paused_time = 0
        self._unpause_last_seen = None

        self.update(project_info)

    @property
    def paused(self):
        if self.scheduler.FAIL_PAUSE_NUM <= 0:
            return False

        # unpaused --(last FAIL_PAUSE_NUM task failed)--> paused --(PAUSE_TIME)--> unpause_checking
        #                         unpaused <--(last UNPAUSE_CHECK_NUM task have success)--|
        #                             paused <--(last UNPAUSE_CHECK_NUM task no success)--|
        if not self._paused:
            fail_cnt = 0
            for _, task in self.active_tasks:
                # ignore select task
                if task.get('type') == self.scheduler.TASK_PACK:
                    continue
                if 'process' not in task['track']:
                    logger.error('process not in task, %r', task)
                if task['track']['process']['ok']:
                    break
                else:
                    fail_cnt += 1
                if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM:
                    break
            if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM:
                self._paused = True
                self._paused_time = time.time()
        elif self._paused is True and (self._paused_time + self.scheduler.PAUSE_TIME < time.time()):
            self._paused = 'checking'
            self._unpause_last_seen = self.active_tasks[0][1] if len(self.active_tasks) else None
        elif self._paused == 'checking':
            cnt = 0
            fail_cnt = 0
            for _, task in self.active_tasks:
                if task is self._unpause_last_seen:
                    break
                # ignore select task
                if task.get('type') == self.scheduler.TASK_PACK:
                    continue
                cnt += 1
                if task['track']['process']['ok']:
                    # break with enough check cnt
                    cnt = max(cnt, self.scheduler.UNPAUSE_CHECK_NUM)
                    break
                else:
                    fail_cnt += 1
            if cnt >= self.scheduler.UNPAUSE_CHECK_NUM:
                if fail_cnt == cnt:
                    self._paused = True
                    self._paused_time = time.time()
                else:
                    self._paused = False

        return self._paused is True

    def update(self, project_info):
        self.project_info = project_info

        self.name = project_info['name']
        self.group = project_info['group']
        self.db_status = project_info['status']
        self.updatetime = project_info['updatetime']

        md5sum = utils.md5string(project_info['script'])
        if self.md5sum != md5sum:
            self.waiting_get_info = True
            self.md5sum = md5sum
        if self.waiting_get_info and self.active:
            self._send_on_get_info = True

        if self.active:
            self.task_queue.rate = project_info['rate']
            self.task_queue.burst = project_info['burst']
        else:
            self.task_queue.rate = 0
            self.task_queue.burst = 0

        logger.info('project %s updated, status:%s, paused:%s, %d tasks',
                    self.name, self.db_status, self.paused, len(self.task_queue))

    def on_get_info(self, info):
        self.waiting_get_info = False
        self.min_tick = info.get('min_tick', 0)
        self.retry_delay = info.get('retry_delay', {})
        self.crawl_config = info.get('crawl_config', {})

    @property
    def active(self):
        return self.db_status in ('RUNNING', 'DEBUG')


class Scheduler(object):
    UPDATE_PROJECT_INTERVAL = 5 * 60
    default_schedule = {
        'priority': 0,
        'retries': 3,
        'exetime': 0,
        'age': -1,
        'itag': None,
    }
    LOOP_LIMIT = 1000
    LOOP_INTERVAL = 0.1
    ACTIVE_TASKS = 100
    INQUEUE_LIMIT = 0
    EXCEPTION_LIMIT = 3
    DELETE_TIME = 24 * 60 * 60
    DEFAULT_RETRY_DELAY = {
        0: 30,
        1: 1*60*60,
        2: 6*60*60,
        3: 12*60*60,
        '': 24*60*60
    }
    FAIL_PAUSE_NUM = 10
    PAUSE_TIME = 5*60
    UNPAUSE_CHECK_NUM = 3

    TASK_PACK = 1
    STATUS_PACK = 2  # current not used
    REQUEST_PACK = 3  # current not used

    def __init__(self, taskdb, projectdb, newtask_queue, status_queue,
                 out_queue, data_path='./data', resultdb=None):
        self.taskdb = taskdb
        self.projectdb = projectdb
        self.resultdb = resultdb
        self.newtask_queue = newtask_queue
        self.status_queue = status_queue
        self.out_queue = out_queue
        self.data_path = data_path

        self._send_buffer = deque()
        self._quit = False
        self._exceptions = 0
        self.projects = dict()
        self._force_update_project = False
        self._last_update_project = 0
        self._last_tick = int(time.time())
        self._postpone_request = []

        self._cnt = {
            "5m_time": counter.CounterManager(
                lambda: counter.TimebaseAverageEventCounter(30, 10)),
            "5m": counter.CounterManager(
                lambda: counter.TimebaseAverageWindowCounter(30, 10)),
            "1h": counter.CounterManager(
                lambda: counter.TimebaseAverageWindowCounter(60, 60)),
            "1d": counter.CounterManager(
                lambda: counter.TimebaseAverageWindowCounter(10 * 60, 24 * 6)),
            "all": counter.CounterManager(
                lambda: counter.TotalCounter()),
        }
        self._cnt['1h'].load(os.path.join(self.data_path, 'scheduler.1h'))
        self._cnt['1d'].load(os.path.join(self.data_path, 'scheduler.1d'))
        self._cnt['all'].load(os.path.join(self.data_path, 'scheduler.all'))
        self._last_dump_cnt = 0

    def _update_projects(self):
        '''Check project update'''
        now = time.time()
        if (
                not self._force_update_project
                and self._last_update_project + self.UPDATE_PROJECT_INTERVAL > now
        ):
            return
        for project in self.projectdb.check_update(self._last_update_project):
            self._update_project(project)
            logger.debug("project: %s updated.", project['name'])
        self._force_update_project = False
        self._last_update_project = now

    get_info_attributes = ['min_tick', 'retry_delay', 'crawl_config']

    def _update_project(self, project):
        '''update one project'''
        if project['name'] not in self.projects:
            self.projects[project['name']] = Project(self, project)
        else:
            self.projects[project['name']].update(project)

        project = self.projects[project['name']]

        if project._send_on_get_info:
            # update project runtime info from processor by sending a _on_get_info
            # request, result is in status_page.track.save
            project._send_on_get_info = False
            self.on_select_task({
                'taskid': '_on_get_info',
                'project': project.name,
                'url': 'data:,_on_get_info',
                'status': self.taskdb.SUCCESS,
                'fetch': {
                    'save': self.get_info_attributes,
                },
                'process': {
                    'callback': '_on_get_info',
                },
            })

        # load task queue when project is running and delete task_queue when project is stoped
        if project.active:
            if not project.task_loaded:
                self._load_tasks(project)
                project.task_loaded = True
        else:
            if project.task_loaded:
                project.task_queue = TaskQueue()
                project.task_loaded = False

            if project not in self._cnt['all']:
                self._update_project_cnt(project.name)

    scheduler_task_fields = ['taskid', 'project', 'schedule', ]

    def _load_tasks(self, project):
        '''load tasks from database'''
        task_queue = project.task_queue

        for task in self.taskdb.load_tasks(
                self.taskdb.ACTIVE, project.name, self.scheduler_task_fields
        ):
            taskid = task['taskid']
            _schedule = task.get('schedule', self.default_schedule)
            priority = _schedule.get('priority', self.default_schedule['priority'])
            exetime = _schedule.get('exetime', self.default_schedule['exetime'])
            task_queue.put(taskid, priority, exetime)
        project.task_loaded = True
        logger.debug('project: %s loaded %d tasks.', project.name, len(task_queue))

        if project not in self._cnt['all']:
            self._update_project_cnt(project.name)
        self._cnt['all'].value((project.name, 'pending'), len(project.task_queue))

    def _update_project_cnt(self, project_name):
        status_count = self.taskdb.status_count(project_name)
        self._cnt['all'].value(
            (project_name, 'success'),
            status_count.get(self.taskdb.SUCCESS, 0)
        )
        self._cnt['all'].value(
            (project_name, 'failed'),
            status_count.get(self.taskdb.FAILED, 0) + status_count.get(self.taskdb.BAD, 0)
        )
        self._cnt['all'].value(
            (project_name, 'pending'),
            status_count.get(self.taskdb.ACTIVE, 0)
        )

    def task_verify(self, task):
        '''
        return False if any of 'taskid', 'project', 'url' is not in task dict
                        or project in not in task_queue
        '''
        for each in ('taskid', 'project', 'url', ):
            if each not in task or not task[each]:
                logger.error('%s not in task: %.200r', each, task)
                return False
        if task['project'] not in self.projects:
            logger.error('unknown project: %s', task['project'])
            return False

        project = self.projects[task['project']]
        if not project.active:
            logger.error('project %s not started, please set status to RUNNING or DEBUG',
                         task['project'])
            return False
        return True

    def insert_task(self, task):
        '''insert task into database'''
        return self.taskdb.insert(task['project'], task['taskid'], task)

    def update_task(self, task):
        '''update task in database'''
        return self.taskdb.update(task['project'], task['taskid'], task)

    def put_task(self, task):
        '''put task to task queue'''
        _schedule = task.get('schedule', self.default_schedule)
        self.projects[task['project']].task_queue.put(
            task['taskid'],
            priority=_schedule.get('priority', self.default_schedule['priority']),
            exetime=_schedule.get('exetime', self.default_schedule['exetime'])
        )

    def send_task(self, task, force=True):
        '''
        dispatch task to fetcher

        out queue may have size limit to prevent block, a send_buffer is used
        '''
        try:
            self.out_queue.put_nowait(task)
        except Queue.Full:
            if force:
                self._send_buffer.appendleft(task)
            else:
                raise

    def _check_task_done(self):
        '''Check status queue'''
        cnt = 0
        try:
            while True:
                task = self.status_queue.get_nowait()
                # check _on_get_info result here
                if task.get('taskid') == '_on_get_info' and 'project' in task and 'track' in task:
                    if task['project'] not in self.projects:
                        continue
                    project = self.projects[task['project']]
                    project.on_get_info(task['track'].get('save') or {})
                    logger.info(
                        '%s on_get_info %r', task['project'], task['track'].get('save', {})
                    )
                    continue
                elif not self.task_verify(task):
                    continue
                self.on_task_status(task)
                cnt += 1
        except Queue.Empty:
            pass
        return cnt

    merge_task_fields = ['taskid', 'project', 'url', 'status', 'schedule', 'lastcrawltime']

    def _check_request(self):
        '''Check new task queue'''
        # check _postpone_request first
        todo = []
        for task in self._postpone_request:
            if task['project'] not in self.projects:
                continue
            if self.projects[task['project']].task_queue.is_processing(task['taskid']):
                todo.append(task)
            else:
                self.on_request(task)
        self._postpone_request = todo

        tasks = {}
        while len(tasks) < self.LOOP_LIMIT:
            try:
                task = self.newtask_queue.get_nowait()
            except Queue.Empty:
                break

            if isinstance(task, list):
                _tasks = task
            else:
                _tasks = (task, )

            for task in _tasks:
                if not self.task_verify(task):
                    continue

                if task['taskid'] in self.projects[task['project']].task_queue:
                    if not task.get('schedule', {}).get('force_update', False):
                        logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task)
                        continue

                if task['taskid'] in tasks:
                    if not task.get('schedule', {}).get('force_update', False):
                        continue

                tasks[task['taskid']] = task

        for task in itervalues(tasks):
            self.on_request(task)

        return len(tasks)

    def _check_cronjob(self):
        """Check projects cronjob tick, return True when a new tick is sended"""
        now = time.time()
        self._last_tick = int(self._last_tick)
        if now - self._last_tick < 1:
            return False
        self._last_tick += 1
        for project in itervalues(self.projects):
            if not project.active:
                continue
            if project.waiting_get_info:
                continue
            if int(project.min_tick) == 0:
                continue
            if self._last_tick % int(project.min_tick) != 0:
                continue
            self.on_select_task({
                'taskid': '_on_cronjob',
                'project': project.name,
                'url': 'data:,_on_cronjob',
                'status': self.taskdb.SUCCESS,
                'fetch': {
                    'save': {
                        'tick': self._last_tick,
                    },
                },
                'process': {
                    'callback': '_on_cronjob',
                },
            })
        return True

    request_task_fields = [
        'taskid',
        'project',
        'url',
        'status',
        'schedule',
        'fetch',
        'process',
        'track',
        'lastcrawltime'
    ]

    def _check_select(self):
        '''Select task to fetch & process'''
        while self._send_buffer:
            _task = self._send_buffer.pop()
            try:
                # use force=False here to prevent automatic send_buffer append and get exception
                self.send_task(_task, False)
            except Queue.Full:
                self._send_buffer.append(_task)
                break

        if self.out_queue.full():
            return {}

        taskids = []
        cnt = 0
        cnt_dict = dict()
        limit = self.LOOP_LIMIT

        # dynamic assign select limit for each project, use qsize as weight
        project_weights, total_weight = dict(), 0
        for project in itervalues(self.projects):  # type:Project
            if not project.active:
                continue
            # only check project pause when select new tasks, cronjob and new request still working
            if project.paused:
                continue
            if project.waiting_get_info:
                continue

            # task queue
            task_queue = project.task_queue  # type:TaskQueue
            pro_weight = task_queue.size()
            total_weight += pro_weight
            project_weights[project.name] = pro_weight
            pass

        min_project_limit = int(limit / 10.)  # ensure minimum select limit for each project
        max_project_limit = int(limit / 3.0)  # ensure maximum select limit for each project

        for pro_name, pro_weight in iteritems(project_weights):
            if cnt >= limit:
                break

            project = self.projects[pro_name]  # type:Project

            # task queue
            task_queue = project.task_queue
            task_queue.check_update()
            project_cnt = 0

            # calculate select limit for project
            if total_weight < 1 or pro_weight < 1:
                project_limit = min_project_limit
            else:
                project_limit = int((1.0 * pro_weight / total_weight) * limit)
                if project_limit < min_project_limit:
                    project_limit = min_project_limit
                elif project_limit > max_project_limit:
                    project_limit = max_project_limit

            # check send_buffer here. when not empty, out_queue may blocked. Not sending tasks
            while cnt < limit and project_cnt < project_limit:
                taskid = task_queue.get()
                if not taskid:
                    break

                taskids.append((project.name, taskid))
                if taskid != 'on_finished':
                    project_cnt += 1
                cnt += 1

            cnt_dict[project.name] = project_cnt
            if project_cnt:
                project._selected_tasks = True
                project._send_finished_event_wait = 0

            # check and send finished event to project
            if not project_cnt and len(task_queue) == 0 and project._selected_tasks:
                # wait for self.FAIL_PAUSE_NUM steps to make sure all tasks in queue have been processed
                if project._send_finished_event_wait < self.FAIL_PAUSE_NUM:
                    project._send_finished_event_wait += 1
                else:
                    project._selected_tasks = False
                    project._send_finished_event_wait = 0

                    self._postpone_request.append({
                        'project': project.name,
                        'taskid': 'on_finished',
                        'url': 'data:,on_finished',
                        'process': {
                            'callback': 'on_finished',
                        },
                        "schedule": {
                            "age": 0,
                            "priority": 9,
                            "force_update": True,
                        },
                    })

        for project, taskid in taskids:
            self._load_put_task(project, taskid)

        return cnt_dict

    def _load_put_task(self, project, taskid):
        try:
            task = self.taskdb.get_task(project, taskid, fields=self.request_task_fields)
        except ValueError:
            logger.error('bad task pack %s:%s', project, taskid)
            return
        if not task:
            return
        task = self.on_select_task(task)

    def _print_counter_log(self):
        # print top 5 active counters
        keywords = ('pending', 'success', 'retry', 'failed')
        total_cnt = {}
        project_actives = []
        project_fails = []
        for key in keywords:
            total_cnt[key] = 0
        for project, subcounter in iteritems(self._cnt['5m']):
            actives = 0
            for key in keywords:
                cnt = subcounter.get(key, None)
                if cnt:
                    cnt = cnt.sum
                    total_cnt[key] += cnt
                    actives += cnt

            project_actives.append((actives, project))

            fails = subcounter.get('failed', None)
            if fails:
                project_fails.append((fails.sum, project))

        top_2_fails = sorted(project_fails, reverse=True)[:2]
        top_3_actives = sorted([x for x in project_actives if x[1] not in top_2_fails],
                               reverse=True)[:5 - len(top_2_fails)]

        log_str = ("in 5m: new:%(pending)d,success:%(success)d,"
                   "retry:%(retry)d,failed:%(failed)d" % total_cnt)
        for _, project in itertools.chain(top_3_actives, top_2_fails):
            subcounter = self._cnt['5m'][project].to_dict(get_value='sum')
            log_str += " %s:%d,%d,%d,%d" % (project,
                                            subcounter.get('pending', 0),
                                            subcounter.get('success', 0),
                                            subcounter.get('retry', 0),
                                            subcounter.get('failed', 0))
        logger.info(log_str)

    def _dump_cnt(self):
        '''Dump counters to file'''
        self._cnt['1h'].dump(os.path.join(self.data_path, 'scheduler.1h'))
        self._cnt['1d'].dump(os.path.join(self.data_path, 'scheduler.1d'))
        self._cnt['all'].dump(os.path.join(self.data_path, 'scheduler.all'))

    def _try_dump_cnt(self):
        '''Dump counters every 60 seconds'''
        now = time.time()
        if now - self._last_dump_cnt > 60:
            self._last_dump_cnt = now
            self._dump_cnt()
            self._print_counter_log()

    def _check_delete(self):
        '''Check project delete'''
        now = time.time()
        for project in list(itervalues(self.projects)):
            if project.db_status != 'STOP':
                continue
            if now - project.updatetime < self.DELETE_TIME:
                continue
            if 'delete' not in self.projectdb.split_group(project.group):
                continue

            logger.warning("deleting project: %s!", project.name)
            del self.projects[project.name]
            self.taskdb.drop(project.name)
            self.projectdb.drop(project.name)
            if self.resultdb:
                self.resultdb.drop(project.name)
            for each in self._cnt.values():
                del each[project.name]

    def __len__(self):
        return sum(len(x.task_queue) for x in itervalues(self.projects))

    def quit(self):
        '''Set quit signal'''
        self._quit = True
        # stop xmlrpc server
        if hasattr(self, 'xmlrpc_server'):
            self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop)
            self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop)

    def run_once(self):
        '''comsume queues and feed tasks to fetcher, once'''

        self._update_projects()
        self._check_task_done()
        self._check_request()
        while self._check_cronjob():
            pass
        self._check_select()
        self._check_delete()
        self._try_dump_cnt()

    def run(self):
        '''Start scheduler loop'''
        logger.info("scheduler starting...")

        while not self._quit:
            try:
                time.sleep(self.LOOP_INTERVAL)
                self.run_once()
                self._exceptions = 0
            except KeyboardInterrupt:
                break
            except Exception as e:
                logger.exception(e)
                self._exceptions += 1
                if self._exceptions > self.EXCEPTION_LIMIT:
                    break
                continue

        logger.info("scheduler exiting...")
        self._dump_cnt()

    def trigger_on_start(self, project):
        '''trigger an on_start callback of project'''
        self.newtask_queue.put({
            "project": project,
            "taskid": "on_start",
            "url": "data:,on_start",
            "process": {
                "callback": "on_start",
            },
        })

    def xmlrpc_run(self, port=23333, bind='127.0.0.1', logRequests=False):
        '''Start xmlrpc interface'''
        from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication

        application = WSGIXMLRPCApplication()

        application.register_function(self.quit, '_quit')
        application.register_function(self.__len__, 'size')

        def dump_counter(_time, _type):
            try:
                return self._cnt[_time].to_dict(_type)
            except:
                logger.exception('')
        application.register_function(dump_counter, 'counter')

        def new_task(task):
            if self.task_verify(task):
                self.newtask_queue.put(task)
                return True
            return False
        application.register_function(new_task, 'newtask')

        def send_task(task):
            '''dispatch task to fetcher'''
            self.send_task(task)
            return True
        application.register_function(send_task, 'send_task')

        def update_project():
            self._force_update_project = True
        application.register_function(update_project, 'update_project')

        def get_active_tasks(project=None, limit=100):
            allowed_keys = set((
                'type',
                'taskid',
                'project',
                'status',
                'url',
                'lastcrawltime',
                'updatetime',
                'track',
            ))
            track_allowed_keys = set((
                'ok',
                'time',
                'follows',
                'status_code',
            ))

            iters = [iter(x.active_tasks) for k, x in iteritems(self.projects)
                     if x and (k == project if project else True)]
            tasks = [next(x, None) for x in iters]
            result = []

            while len(result) < limit and tasks and not all(x is None for x in tasks):
                updatetime, task = t = max(t for t in tasks if t)
                i = tasks.index(t)
                tasks[i] = next(iters[i], None)
                for key in list(task):
                    if key == 'track':
                        for k in list(task[key].get('fetch', [])):
                            if k not in track_allowed_keys:
                                del task[key]['fetch'][k]
                        for k in list(task[key].get('process', [])):
                            if k not in track_allowed_keys:
                                del task[key]['process'][k]
                    if key in allowed_keys:
                        continue
                    del task[key]
                result.append(t)
            # fix for "<type 'exceptions.TypeError'>:dictionary key must be string"
            # have no idea why
            return json.loads(json.dumps(result))
        application.register_function(get_active_tasks, 'get_active_tasks')

        def get_projects_pause_status():
            result = {}
            for project_name, project in iteritems(self.projects):
                result[project_name] = project.paused
            return result
        application.register_function(get_projects_pause_status, 'get_projects_pause_status')

        def webui_update():
            return {
                'pause_status': get_projects_pause_status(),
                'counter': {
                    '5m_time': dump_counter('5m_time', 'avg'),
                    '5m': dump_counter('5m', 'sum'),
                    '1h': dump_counter('1h', 'sum'),
                    '1d': dump_counter('1d', 'sum'),
                    'all': dump_counter('all', 'sum'),
                },
            }
        application.register_function(webui_update, 'webui_update')

        import tornado.wsgi
        import tornado.ioloop
        import tornado.httpserver

        container = tornado.wsgi.WSGIContainer(application)
        self.xmlrpc_ioloop = tornado.ioloop.IOLoop()
        self.xmlrpc_server = tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop)
        self.xmlrpc_server.listen(port=port, address=bind)
        logger.info('scheduler.xmlrpc listening on %s:%s', bind, port)
        self.xmlrpc_ioloop.start()

    def on_request(self, task):
        if self.INQUEUE_LIMIT and len(self.projects[task['project']].task_queue) >= self.INQUEUE_LIMIT:
            logger.debug('overflow task %(project)s:%(taskid)s %(url)s', task)
            return

        oldtask = self.taskdb.get_task(task['project'], task['taskid'],
                                       fields=self.merge_task_fields)
        if oldtask:
            return self.on_old_request(task, oldtask)
        else:
            return self.on_new_request(task)

    def on_new_request(self, task):
        '''Called when a new request is arrived'''
        task['status'] = self.taskdb.ACTIVE
        self.insert_task(task)
        self.put_task(task)

        project = task['project']
        self._cnt['5m'].event((project, 'pending'), +1)
        self._cnt['1h'].event((project, 'pending'), +1)
        self._cnt['1d'].event((project, 'pending'), +1)
        self._cnt['all'].event((project, 'pending'), +1)
        logger.info('new task %(project)s:%(taskid)s %(url)s', task)
        return task

    def on_old_request(self, task, old_task):
        '''Called when a crawled task is arrived'''
        now = time.time()

        _schedule = task.get('schedule', self.default_schedule)
        old_schedule = old_task.get('schedule', {})

        if _schedule.get('force_update') and self.projects[task['project']].task_queue.is_processing(task['taskid']):
            # when a task is in processing, the modify may conflict with the running task.
            # postpone the modify after task finished.
            logger.info('postpone modify task %(project)s:%(taskid)s %(url)s', task)
            self._postpone_request.append(task)
            return

        restart = False
        schedule_age = _schedule.get('age', self.default_schedule['age'])
        if _schedule.get('itag') and _schedule['itag'] != old_schedule.get('itag'):
            restart = True
        elif schedule_age >= 0 and schedule_age + (old_task.get('lastcrawltime', 0) or 0) < now:
            restart = True
        elif _schedule.get('force_update'):
            restart = True

        if not restart:
            logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task)
            return

        if _schedule.get('cancel'):
            logger.info('cancel task %(project)s:%(taskid)s %(url)s', task)
            task['status'] = self.taskdb.BAD
            self.update_task(task)
            self.projects[task['project']].task_queue.delete(task['taskid'])
            return task

        task['status'] = self.taskdb.ACTIVE
        self.update_task(task)
        self.put_task(task)

        project = task['project']
        if old_task['status'] != self.taskdb.ACTIVE:
            self._cnt['5m'].event((project, 'pending'), +1)
            self._cnt['1h'].event((project, 'pending'), +1)
            self._cnt['1d'].event((project, 'pending'), +1)
        if old_task['status'] == self.taskdb.SUCCESS:
            self._cnt['all'].event((project, 'success'), -1).event((project, 'pending'), +1)
        elif old_task['status'] == self.taskdb.FAILED:
            self._cnt['all'].event((project, 'failed'), -1).event((project, 'pending'), +1)
        logger.info('restart task %(project)s:%(taskid)s %(url)s', task)
        return task

    def on_task_status(self, task):
        '''Called when a status pack is arrived'''
        try:
            procesok = task['track']['process']['ok']
            if not self.projects[task['project']].task_queue.done(task['taskid']):
                logging.error('not processing pack: %(project)s:%(taskid)s %(url)s', task)
                return None
        except KeyError as e:
            logger.error("Bad status pack: %s", e)
            return None

        if procesok:
            ret = self.on_task_done(task)
        else:
            ret = self.on_task_failed(task)

        if task['track']['fetch'].get('time'):
            self._cnt['5m_time'].event((task['project'], 'fetch_time'),
                                       task['track']['fetch']['time'])
        if task['track']['process'].get('time'):
            self._cnt['5m_time'].event((task['project'], 'process_time'),
                                       task['track']['process'].get('time'))
        self.projects[task['project']].active_tasks.appendleft((time.time(), task))
        return ret

    def on_task_done(self, task):
        '''Called when a task is done and success, called by `on_task_status`'''
        task['status'] = self.taskdb.SUCCESS
        task['lastcrawltime'] = time.time()

        if 'schedule' in task:
            if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']:
                task['status'] = self.taskdb.ACTIVE
                next_exetime = task['schedule'].get('age')
                task['schedule']['exetime'] = time.time() + next_exetime
                self.put_task(task)
            else:
                del task['schedule']
        self.update_task(task)

        project = task['project']
        self._cnt['5m'].event((project, 'success'), +1)
        self._cnt['1h'].event((project, 'success'), +1)
        self._cnt['1d'].event((project, 'success'), +1)
        self._cnt['all'].event((project, 'success'), +1).event((project, 'pending'), -1)
        logger.info('task done %(project)s:%(taskid)s %(url)s', task)
        return task

    def on_task_failed(self, task):
        '''Called when a task is failed, called by `on_task_status`'''

        if 'schedule' not in task:
            old_task = self.taskdb.get_task(task['project'], task['taskid'], fields=['schedule'])
            if old_task is None:
                logging.error('unknown status pack: %s' % task)
                return
            task['schedule'] = old_task.get('schedule', {})

        retries = task['schedule'].get('retries', self.default_schedule['retries'])
        retried = task['schedule'].get('retried', 0)

        project_info = self.projects[task['project']]
        retry_delay = project_info.retry_delay or self.DEFAULT_RETRY_DELAY
        next_exetime = retry_delay.get(retried, retry_delay.get('', self.DEFAULT_RETRY_DELAY['']))

        if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']:
            next_exetime = min(next_exetime, task['schedule'].get('age'))
        else:
            if retried >= retries:
                next_exetime = -1
            elif 'age' in task['schedule'] and next_exetime > task['schedule'].get('age'):
                next_exetime = task['schedule'].get('age')

        if next_exetime < 0:
            task['status'] = self.taskdb.FAILED
            task['lastcrawltime'] = time.time()
            self.update_task(task)

            project = task['project']
            self._cnt['5m'].event((project, 'failed'), +1)
            self._cnt['1h'].event((project, 'failed'), +1)
            self._cnt['1d'].event((project, 'failed'), +1)
            self._cnt['all'].event((project, 'failed'), +1).event((project, 'pending'), -1)
            logger.info('task failed %(project)s:%(taskid)s %(url)s' % task)
            return task
        else:
            task['schedule']['retried'] = retried + 1
            task['schedule']['exetime'] = time.time() + next_exetime
            task['lastcrawltime'] = time.time()
            self.update_task(task)
            self.put_task(task)

            project = task['project']
            self._cnt['5m'].event((project, 'retry'), +1)
            self._cnt['1h'].event((project, 'retry'), +1)
            self._cnt['1d'].event((project, 'retry'), +1)
            # self._cnt['all'].event((project, 'retry'), +1)
            logger.info('task retry %d/%d %%(project)s:%%(taskid)s %%(url)s' % (
                retried, retries), task)
            return task

    def on_select_task(self, task):
        '''Called when a task is selected to fetch & process'''
        # inject informations about project
        logger.info('select %(project)s:%(taskid)s %(url)s', task)

        project_info = self.projects.get(task['project'])
        assert project_info, 'no such project'
        task['type'] = self.TASK_PACK
        task['group'] = project_info.group
        task['project_md5sum'] = project_info.md5sum
        task['project_updatetime'] = project_info.updatetime

        # lazy join project.crawl_config
        if getattr(project_info, 'crawl_config', None):
            task = BaseHandler.task_join_crawl_config(task, project_info.crawl_config)

        project_info.active_tasks.appendleft((time.time(), task))
        self.send_task(task)
        return task


from tornado import gen


class OneScheduler(Scheduler):
    """
    Scheduler Mixin class for one mode

    overwirted send_task method
    call processor.on_task(fetcher.fetch(task)) instead of consuming queue
    """

    def _check_select(self):
        """
        interactive mode of select tasks
        """
        if not self.interactive:
            return super(OneScheduler, self)._check_select()

        # waiting for running tasks
        if self.running_task > 0:
            return

        is_crawled = []

        def run(project=None):
            return crawl('on_start', project=project)

        def crawl(url, project=None, **kwargs):
            """
            Crawl given url, same parameters as BaseHandler.crawl

            url - url or taskid, parameters will be used if in taskdb
            project - can be ignored if only one project exists.
            """

            # looking up the project instance
            if project is None:
                if len(self.projects) == 1:
                    project = list(self.projects.keys())[0]
                else:
                    raise LookupError('You need specify the project: %r'
                                      % list(self.projects.keys()))
            project_data = self.processor.project_manager.get(project)
            if not project_data:
                raise LookupError('no such project: %s' % project)

            # get task package
            instance = project_data['instance']
            instance._reset()
            task = instance.crawl(url, **kwargs)
            if isinstance(task, list):
                raise Exception('url list is not allowed in interactive mode')

            # check task in taskdb
            if not kwargs:
                dbtask = self.taskdb.get_task(task['project'], task['taskid'],
                                              fields=self.request_task_fields)
                if not dbtask:
                    dbtask = self.taskdb.get_task(task['project'], task['url'],
                                                  fields=self.request_task_fields)
                if dbtask:
                    task = dbtask

            # select the task
            self.on_select_task(task)
            is_crawled.append(True)

            shell.ask_exit()

        def quit_interactive():
            '''Quit interactive mode'''
            is_crawled.append(True)
            self.interactive = False
            shell.ask_exit()

        def quit_pyspider():
            '''Close pyspider'''
            is_crawled[:] = []
            shell.ask_exit()

        shell = utils.get_python_console()
        banner = (
            'pyspider shell - Select task\n'
            'crawl(url, project=None, **kwargs) - same parameters as BaseHandler.crawl\n'
            'quit_interactive() - Quit interactive mode\n'
            'quit_pyspider() - Close pyspider'
        )
        if hasattr(shell, 'show_banner'):
            shell.show_banner(banner)
            shell.interact()
        else:
            shell.interact(banner)
        if not is_crawled:
            self.ioloop.add_callback(self.ioloop.stop)

    def __getattr__(self, name):
        """patch for crawl(url, callback=self.index_page) API"""
        if self.interactive:
            return name
        raise AttributeError(name)

    def on_task_status(self, task):
        """Ignore not processing error in interactive mode"""
        if not self.interactive:
            super(OneScheduler, self).on_task_status(task)

        try:
            procesok = task['track']['process']['ok']
        except KeyError as e:
            logger.error("Bad status pack: %s", e)
            return None

        if procesok:
            ret = self.on_task_done(task)
        else:
            ret = self.on_task_failed(task)
        if task['track']['fetch'].get('time'):
            self._cnt['5m_time'].event((task['project'], 'fetch_time'),
                                       task['track']['fetch']['time'])
        if task['track']['process'].get('time'):
            self._cnt['5m_time'].event((task['project'], 'process_time'),
                                       task['track']['process'].get('time'))
        self.projects[task['project']].active_tasks.appendleft((time.time(), task))
        return ret

    def init_one(self, ioloop, fetcher, processor,
                 result_worker=None, interactive=False):
        self.ioloop = ioloop
        self.fetcher = fetcher
        self.processor = processor
        self.result_worker = result_worker
        self.interactive = interactive
        self.running_task = 0

    @gen.coroutine
    def do_task(self, task):
        self.running_task += 1
        result = yield gen.Task(self.fetcher.fetch, task)
        type, task, response = result.args
        self.processor.on_task(task, response)
        # do with message
        while not self.processor.inqueue.empty():
            _task, _response = self.processor.inqueue.get()
            self.processor.on_task(_task, _response)
        # do with results
        while not self.processor.result_queue.empty():
            _task, _result = self.processor.result_queue.get()
            if self.result_worker:
                self.result_worker.on_result(_task, _result)
        self.running_task -= 1

    def send_task(self, task, force=True):
        if self.fetcher.http_client.free_size() <= 0:
            if force:
                self._send_buffer.appendleft(task)
            else:
                raise self.outqueue.Full
        self.ioloop.add_future(self.do_task(task), lambda x: x.result())

    def run(self):
        import tornado.ioloop
        tornado.ioloop.PeriodicCallback(self.run_once, 100,
                                        io_loop=self.ioloop).start()
        self.ioloop.start()

    def quit(self):
        self.ioloop.stop()
        logger.info("scheduler exiting...")


import random
import threading
from pyspider.database.sqlite.sqlitebase import SQLiteMixin


class ThreadBaseScheduler(Scheduler):
    def __init__(self, threads=4, *args, **kwargs):
        self.local = threading.local()

        super(ThreadBaseScheduler, self).__init__(*args, **kwargs)

        if isinstance(self.taskdb, SQLiteMixin):
            self.threads = 1
        else:
            self.threads = threads

        self._taskdb = self.taskdb
        self._projectdb = self.projectdb
        self._resultdb = self.resultdb

        self.thread_objs = []
        self.thread_queues = []
        self._start_threads()
        assert len(self.thread_queues) > 0

    @property
    def taskdb(self):
        if not hasattr(self.local, 'taskdb'):
            self.taskdb = self._taskdb.copy()
        return self.local.taskdb

    @taskdb.setter
    def taskdb(self, taskdb):
        self.local.taskdb = taskdb

    @property
    def projectdb(self):
        if not hasattr(self.local, 'projectdb'):
            self.projectdb = self._projectdb.copy()
        return self.local.projectdb

    @projectdb.setter
    def projectdb(self, projectdb):
        self.local.projectdb = projectdb

    @property
    def resultdb(self):
        if not hasattr(self.local, 'resultdb'):
            self.resultdb = self._resultdb.copy()
        return self.local.resultdb

    @resultdb.setter
    def resultdb(self, resultdb):
        self.local.resultdb = resultdb

    def _start_threads(self):
        for i in range(self.threads):
            queue = Queue.Queue()
            thread = threading.Thread(target=self._thread_worker, args=(queue, ))
            thread.daemon = True
            thread.start()
            self.thread_objs.append(thread)
            self.thread_queues.append(queue)

    def _thread_worker(self, queue):
        while True:
            method, args, kwargs = queue.get()
            try:
                method(*args, **kwargs)
            except Exception as e:
                logger.exception(e)

    def _run_in_thread(self, method, *args, **kwargs):
        i = kwargs.pop('_i', None)
        block = kwargs.pop('_block', False)

        if i is None:
            while True:
                for queue in self.thread_queues:
                    if queue.empty():
                        break
                else:
                    if block:
                        time.sleep(0.1)
                        continue
                    else:
                        queue = self.thread_queues[random.randint(0, len(self.thread_queues)-1)]
                break
        else:
            queue = self.thread_queues[i % len(self.thread_queues)]

        queue.put((method, args, kwargs))

        if block:
            self._wait_thread()

    def _wait_thread(self):
        while True:
            if all(queue.empty() for queue in self.thread_queues):
                break
            time.sleep(0.1)

    def _update_project(self, project):
        self._run_in_thread(Scheduler._update_project, self, project)

    def on_task_status(self, task):
        i = hash(task['taskid'])
        self._run_in_thread(Scheduler.on_task_status, self, task, _i=i)

    def on_request(self, task):
        i = hash(task['taskid'])
        self._run_in_thread(Scheduler.on_request, self, task, _i=i)

    def _load_put_task(self, project, taskid):
        i = hash(taskid)
        self._run_in_thread(Scheduler._load_put_task, self, project, taskid, _i=i)

    def run_once(self):
        super(ThreadBaseScheduler, self).run_once()
        self._wait_thread()


================================================
FILE: pyspider/scheduler/task_queue.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-07 13:12:10

import heapq
import logging
import threading
import time

try:
    from UserDict import DictMixin
except ImportError:
    from collections import Mapping as DictMixin
from .token_bucket import Bucket
from six.moves import queue as Queue

logger = logging.getLogger('scheduler')

try:
    cmp
except NameError:
    cmp = lambda x, y: (x > y) - (x < y)


class AtomInt(object):
    __value__ = 0
    __mutex__ = threading.RLock()

    @classmethod
    def get_value(cls):
        cls.__mutex__.acquire()
        cls.__value__ = cls.__value__ + 1
        value = cls.__value__
        cls.__mutex__.release()
        return value


class InQueueTask(DictMixin):
    __slots__ = ('taskid', 'priority', 'exetime', 'sequence')
    __getitem__ = lambda *x: getattr(*x)
    __setitem__ = lambda *x: setattr(*x)
    __iter__ = lambda self: iter(self.__slots__)
    __len__ = lambda self: len(self.__slots__)
    keys = lambda self: self.__slots__

    def __init__(self, taskid, priority=0, exetime=0):
        self.taskid = taskid
        self.priority = priority
        self.exetime = exetime
        self.sequence = AtomInt.get_value()

    def __cmp__(self, other):
        if self.exetime == 0 and other.exetime == 0:
            diff = -cmp(self.priority, other.priority)
        else:
            diff = cmp(self.exetime, other.exetime)

        # compare in-queue sequence number finally if two element has the same
        # priority or exetime
        return diff if diff != 0 else cmp(self.sequence, other.sequence)

    def __lt__(self, other):
        return self.__cmp__(other) < 0


class PriorityTaskQueue(Queue.Queue):
    '''
    TaskQueue

    Same taskid items will been merged
    '''

    def _init(self, maxsize):
        self.queue = []
        self.queue_dict = dict()

    def _qsize(self, len=len):
        return len(self.queue_dict)

    def _put(self, item, heappush=heapq.heappush):
        if item.taskid in self.queue_dict:
            task = self.queue_dict[item.taskid]
            changed = False
            if item < task:
                changed = True
            task.priority = max(item.priority, task.priority)
            task.exetime = min(item.exetime, task.exetime)
            if changed:
                self._resort()
        else:
            heappush(self.queue, item)
            self.queue_dict[item.taskid] = item

    def _get(self, heappop=heapq.heappop):
        while self.queue:
            item = heappop(self.queue)
            if item.taskid is None:
                continue
            self.queue_dict.pop(item.taskid, None)
            return item
        return None

    @property
    def top(self):
        while self.queue and self.queue[0].taskid is None:
            heapq.heappop(self.queue)
        if self.queue:
            return self.queue[0]
        return None

    def _resort(self):
        heapq.heapify(self.queue)

    def __contains__(self, taskid):
        return taskid in self.queue_dict

    def __getitem__(self, taskid):
        return self.queue_dict[taskid]

    def __setitem__(self, taskid, item):
        assert item.taskid == taskid
        self.put(item)

    def __delitem__(self, taskid):
        self.queue_dict.pop(taskid).taskid = None


class TaskQueue(object):
    '''
    task queue for scheduler, have a priority queue and a time queue for delayed tasks
    '''
    processing_timeout = 10 * 60

    def __init__(self, rate=0, burst=0):
        self.mutex = threading.RLock()
        self.priority_queue = PriorityTaskQueue()
        self.time_queue = PriorityTaskQueue()
        self.processing = PriorityTaskQueue()
        self.bucket = Bucket(rate=rate, burst=burst)

    @property
    def rate(self):
        return self.bucket.rate

    @rate.setter
    def rate(self, value):
        self.bucket.rate = value

    @property
    def burst(self):
        return self.bucket.burst

    @burst.setter
    def burst(self, value):
        self.bucket.burst = value

    def check_update(self):
        '''
        Check time queue and processing queue

        put tasks to priority queue when execute time arrived or process timeout
        '''
        self._check_time_queue()
        self._check_processing()

    def _check_time_queue(self):
        now = time.time()
        self.mutex.acquire()
        while self.time_queue.qsize() and self.time_queue.top and self.time_queue.top.exetime < now:
            task = self.time_queue.get_nowait()  # type: InQueueTask
            task.exetime = 0
            self.priority_queue.put(task)
        self.mutex.release()

    def _check_processing(self):
        now = time.time()
        self.mutex.acquire()
        while self.processing.qsize() and self.processing.top and self.processing.top.exetime < now:
            task = self.processing.get_nowait()
            if task.taskid is None:
                continue
            task.exetime = 0
            self.priority_queue.put(task)
            logger.info("processing: retry %s", task.taskid)
        self.mutex.release()

    def put(self, taskid, priority=0, exetime=0):
        """
        Put a task into task queue
        
        when use heap sort, if we put tasks(with the same priority and exetime=0) into queue,
        the queue is not a strict FIFO queue, but more like a FILO stack.
        It is very possible that when there are continuous big flow, the speed of select is 
        slower than request, resulting in priority-queue accumulation in short time.
        In this scenario, the tasks more earlier entering the priority-queue will not get 
        processed until the request flow becomes small. 
        
        Thus, we store a global atom self increasing value into task.sequence which represent 
        the task enqueue sequence. When the comparison of exetime and priority have no 
        difference, we compare task.sequence to ensure that the entire queue is ordered.
        """
        now = time.time()

        task = InQueueTask(taskid, priority, exetime)

        self.mutex.acquire()
        if taskid in self.priority_queue:
            self.priority_queue.put(task)
        elif taskid in self.time_queue:
            self.time_queue.put(task)
        elif taskid in self.processing and self.processing[taskid].taskid:
            # force update a processing task is not allowed as there are so many
            # problems may happen
            pass
        else:
            if exetime and exetime > now:
                self.time_queue.put(task)
            else:
                task.exetime = 0
                self.priority_queue.put(task)

        self.mutex.release()

    def get(self):
        '''Get a task from queue when bucket available'''
        if self.bucket.get() < 1:
            return None
        now = time.time()
        self.mutex.acquire()
        try:
            task = self.priority_queue.get_nowait()
            self.bucket.desc()
        except Queue.Empty:
            self.mutex.release()
            return None
        task.exetime = now + self.processing_timeout
        self.processing.put(task)
        self.mutex.release()
        return task.taskid

    def done(self, taskid):
        '''Mark task done'''
        if taskid in self.processing:
            self.mutex.acquire()
            if taskid in self.processing:
                del self.processing[taskid]
            self.mutex.release()
            return True
        return False

    def delete(self, taskid):
        if taskid not in self:
            return False
        if taskid in self.priority_queue:
            self.mutex.acquire()
            del self.priority_queue[taskid]
            self.mutex.release()
        elif taskid in self.time_queue:
            self.mutex.acquire()
            del self.time_queue[taskid]
            self.mutex.release()
        elif taskid in self.processing:
            self.done(taskid)
        return True

    def size(self):
        return self.priority_queue.qsize() + self.time_queue.qsize() + self.processing.qsize()

    def is_processing(self, taskid):
        '''
        return True if taskid is in processing
        '''
        return taskid in self.processing and self.processing[taskid].taskid

    def __len__(self):
        return self.size()

    def __contains__(self, taskid):
        if taskid in self.priority_queue or taskid in self.time_queue:
            return True
        if taskid in self.processing and self.processing[taskid].taskid:
            return True
        return False


if __name__ == '__main__':
    task_queue = TaskQueue()
    task_queue.processing_timeout = 0.1
    task_queue.put('a3', 3, time.time() + 0.1)
    task_queue.put('a1', 1)
    task_queue.put('a2', 2)
    assert task_queue.get() == 'a2'
    time.sleep(0.1)
    task_queue._check_time_queue()
    assert task_queue.get() == 'a3'
    assert task_queue.get() == 'a1'
    task_queue._check_processing()
    assert task_queue.get() == 'a2'
    assert len(task_queue) == 0


================================================
FILE: pyspider/scheduler/token_bucket.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-07 16:53:08

import time
try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading


class Bucket(object):

    '''
    traffic flow control with token bucket
    '''

    update_interval = 30

    def __init__(self, rate=1, burst=None):
        self.rate = float(rate)
        if burst is None:
            self.burst = float(rate) * 10
        else:
            self.burst = float(burst)
        self.mutex = _threading.Lock()
        self.bucket = self.burst
        self.last_update = time.time()

    def get(self):
        '''Get the number of tokens in bucket'''
        now = time.time()
        if self.bucket >= self.burst:
            self.last_update = now
            return self.bucket
        bucket = self.rate * (now - self.last_update)
        self.mutex.acquire()
        if bucket > 1:
            self.bucket += bucket
            if self.bucket > self.burst:
                self.bucket = self.burst
            self.last_update = now
        self.mutex.release()
        return self.bucket

    def set(self, value):
        '''Set number of tokens in bucket'''
        self.bucket = value

    def desc(self, value=1):
        '''Use value tokens'''
        self.bucket -= value


================================================
FILE: pyspider/webui/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-22 23:20:40

from . import app, index, debug, task, result, login


================================================
FILE: pyspider/webui/app.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-22 23:17:13

import os
import sys
import logging
logger = logging.getLogger("webui")

from six import reraise
from six.moves import builtins
from six.moves.urllib.parse import urljoin
from flask import Flask
from pyspider.fetcher import tornado_fetcher

if os.name == 'nt':
    import mimetypes
    mimetypes.add_type("text/css", ".css", True)


class QuitableFlask(Flask):
    """Add quit() method to Flask object"""

    @property
    def logger(self):
        return logger

    def run(self, host=None, port=None, debug=None, **options):
        import tornado.wsgi
        import tornado.ioloop
        import tornado.httpserver
        import tornado.web

        if host is None:
            host = '127.0.0.1'
        if port is None:
            server_name = self.config['SERVER_NAME']
            if server_name and ':' in server_name:
                port = int(server_name.rsplit(':', 1)[1])
            else:
                port = 5000
        if debug is not None:
            self.debug = bool(debug)

        hostname = host
        port = port
        application = self
        use_reloader = self.debug
        use_debugger = self.debug

        if use_debugger:
            from werkzeug.debug import DebuggedApplication
            application = DebuggedApplication(application, True)

        try:
            from .webdav import dav_app
        except ImportError as e:
            logger.warning('WebDav interface not enabled: %r', e)
            dav_app = None
        if dav_app:
            from werkzeug.wsgi import DispatcherMiddleware
            application = DispatcherMiddleware(application, {
                '/dav': dav_app
            })

        container = tornado.wsgi.WSGIContainer(application)
        self.http_server = tornado.httpserver.HTTPServer(container)
        self.http_server.listen(port, hostname)
        if use_reloader:
            from tornado import autoreload
            autoreload.start()

        self.logger.info('webui running on %s:%s', hostname, port)
        self.ioloop = tornado.ioloop.IOLoop.current()
        self.ioloop.start()

    def quit(self):
        if hasattr(self, 'ioloop'):
            self.ioloop.add_callback(self.http_server.stop)
            self.ioloop.add_callback(self.ioloop.stop)
        self.logger.info('webui exiting...')


app = QuitableFlask('webui',
                    static_folder=os.path.join(os.path.dirname(__file__), 'static'),
                    template_folder=os.path.join(os.path.dirname(__file__), 'templates'))
app.secret_key = os.urandom(24)
app.jinja_env.line_statement_prefix = '#'
app.jinja_env.globals.update(builtins.__dict__)

app.config.update({
    'fetch': lambda x: tornado_fetcher.Fetcher(None, None, async_mode=False).fetch(x),
    'taskdb': None,
    'projectdb': None,
    'scheduler_rpc': None,
    'queues': dict(),
    'process_time_limit': 30,
})


def cdn_url_handler(error, endpoint, kwargs):
    if endpoint == 'cdn':
        path = kwargs.pop('path')
        # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/')
        # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/')
        cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/')
        return urljoin(cdn, path)
    else:
        exc_type, exc_value, tb = sys.exc_info()
        if exc_value is error:
            reraise(exc_type, exc_value, tb)
        else:
            raise error
app.handle_url_build_error = cdn_url_handler


================================================
FILE: pyspider/webui/bench_test.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-08 22:31:17

import random
try:
    from urllib import urlencode
except ImportError:
    from urllib.parse import urlencode

from flask import request
from .app import app


@app.route('/bench')
def bench_test():
    total = int(request.args.get('total', 10000))
    show = int(request.args.get('show', 20))
    nlist = [random.randint(1, total) for _ in range(show)]
    result = []
    result.append("<html><head></head><body>")
    args = dict(request.args)
    for nl in nlist:
        args['n'] = nl
        argstr = urlencode(sorted(args.items()), doseq=True)
        result.append("<a href='/bench?{0}'>follow {1}</a><br>".format(argstr, nl))
    result.append("</body></html>")
    return "".join(result)


================================================
FILE: pyspider/webui/debug.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-23 00:19:06


import sys
import time
import socket
import inspect
import datetime
import traceback
from flask import render_template, request, json

try:
    import flask_login as login
except ImportError:
    from flask.ext import login

from pyspider.libs import utils, sample_handler, dataurl
from pyspider.libs.response import rebuild_response
from pyspider.processor.project_module import ProjectManager, ProjectFinder
from .app import app

default_task = {
    'taskid': 'data:,on_start',
    'project': '',
    'url': 'data:,on_start',
    'process': {
        'callback': 'on_start',
    },
}
default_script = inspect.getsource(sample_handler)


@app.route('/debug/<project>', methods=['GET', 'POST'])
def debug(project):
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    if info:
        script = info['script']
    else:
        script = (default_script
                  .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                  .replace('__PROJECT_NAME__', project)
                  .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__'))

    taskid = request.args.get('taskid')
    if taskid:
        taskdb = app.config['taskdb']
        task = taskdb.get_task(
            project, taskid, ['taskid', 'project', 'url', 'fetch', 'process'])
    else:
        task = default_task

    default_task['project'] = project
    return render_template("debug.html", task=task, script=script, project_name=project)


@app.before_first_request
def enable_projects_import():
    sys.meta_path.append(ProjectFinder(app.config['projectdb']))


@app.route('/debug/<project>/run', methods=['POST', ])
def run(project):
    start_time = time.time()
    try:
        task = utils.decode_unicode_obj(json.loads(request.form['task']))
    except Exception:
        result = {
            'fetch_result': "",
            'logs': u'task json error',
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), \
            200, {'Content-Type': 'application/json'}

    project_info = {
        'name': project,
        'status': 'DEBUG',
        'script': request.form['script'],
    }

    if request.form.get('webdav_mode') == 'true':
        projectdb = app.config['projectdb']
        info = projectdb.get(project, fields=['name', 'script'])
        if not info:
            result = {
                'fetch_result': "",
                'logs': u' in wevdav mode, cannot load script',
                'follows': [],
                'messages': [],
                'result': None,
                'time': time.time() - start_time,
            }
            return json.dumps(utils.unicode_obj(result)), \
                200, {'Content-Type': 'application/json'}
        project_info['script'] = info['script']

    fetch_result = {}
    try:
        module = ProjectManager.build_module(project_info, {
            'debugger': True,
            'process_time_limit': app.config['process_time_limit'],
        })

        # The code below is to mock the behavior that crawl_config been joined when selected by scheduler.
        # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True`
        # crawl_config = module['instance'].crawl_config
        # task = module['instance'].task_join_crawl_config(task, crawl_config)

        fetch_result = app.config['fetch'](task)
        response = rebuild_response(fetch_result)

        ret = module['instance'].run_task(module['module'], task, response)
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': fetch_result,
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
    else:
        result = {
            'fetch_result': fetch_result,
            'logs': ret.logstr(),
            'follows': ret.follows,
            'messages': ret.messages,
            'result': ret.result,
            'time': time.time() - start_time,
        }
        result['fetch_result']['content'] = response.text
        if (response.headers.get('content-type', '').startswith('image')):
            result['fetch_result']['dataurl'] = dataurl.encode(
                response.content, response.headers['content-type'])

    try:
        # binary data can't encode to JSON, encode result as unicode obj
        # before send it to frontend
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
    except Exception:
        type, value, tb = sys.exc_info()
        tb = utils.hide_me(tb, globals())
        logs = ''.join(traceback.format_exception(type, value, tb))
        result = {
            'fetch_result': "",
            'logs': logs,
            'follows': [],
            'messages': [],
            'result': None,
            'time': time.time() - start_time,
        }
        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}


@app.route('/debug/<project>/save', methods=['POST', ])
def save(project):
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    script = request.form['script']
    project_info = projectdb.get(project, fields=['name', 'status', 'group'])
    if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \
            and not login.current_user.is_active():
        return app.login_response

    if project_info:
        info = {
            'script': script,
        }
        if project_info.get('status') in ('DEBUG', 'RUNNING', ):
            info['status'] = 'CHECKING'
        projectdb.update(project, info)
    else:
        info = {
            'name': project,
            'script': script,
            'status': 'TODO',
            'rate': app.config.get('max_rate', 1),
            'burst': app.config.get('max_burst', 3),
        }
        projectdb.insert(project, info)

    rpc = app.config['scheduler_rpc']
    if rpc is not None:
        try:
            rpc.update_project()
        except socket.error as e:
            app.logger.warning('connect to scheduler rpc error: %r', e)
            return 'rpc error', 200

    return 'ok', 200


@app.route('/debug/<project>/get')
def get_script(project):
    projectdb = app.config['projectdb']
    if not projectdb.verify_project_name(project):
        return 'project name is not allowed!', 400
    info = projectdb.get(project, fields=['name', 'script'])
    return json.dumps(utils.unicode_obj(info)), \
        200, {'Content-Type': 'application/json'}


@app.route('/blank.html')
def blank_html():
    return ""


================================================
FILE: pyspider/webui/index.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-22 23:20:39

import socket

from six import iteritems, itervalues
from flask import render_template, request, json

try:
    import flask_login as login
except ImportError:
    from flask.ext import login

from .app import app

index_fields = ['name', 'group', 'status', 'comments', 'rate', 'burst', 'updatetime']


@app.route('/')
def index():
    projectdb = app.config['projectdb']
    projects = sorted(projectdb.get_all(fields=index_fields),
                      key=lambda k: (0 if k['group'] else 1, k['group'] or '', k['name']))
    return render_template("index.html", projects=projects)


@app.route('/queues')
def get_queues():
    def try_get_qsize(queue):
        if queue is None:
            return 'None'
        try:
            return queue.qsize()
        except Exception as e:
            return "%r" % e

    result = {}
    queues = app.config.get('queues', {})
    for key in queues:
        result[key] = try_get_qsize(queues[key])
    return json.dumps(result), 200, {'Content-Type': 'application/json'}


@app.route('/update', methods=['POST', ])
def project_update():
    projectdb = app.config['projectdb']
    project = request.form['pk']
    name = request.form['name']
    value = request.form['value']

    project_info = projectdb.get(project, fields=('name', 'group'))
    if not project_info:
        return "no such project.", 404
    if 'lock' in projectdb.split_group(project_info.get('group')) \
            and not login.current_user.is_active():
        return app.login_response

    if name not in ('group', 'status', 'rate'):
        return 'unknown field: %s' % name, 400
    if name == 'rate':
        value = value.split('/')
        if len(value) != 2:
            return 'format error: rate/burst', 400
        rate = float(value[0])
        burst = float(value[1])
        update = {
            'rate': min(rate, app.config.get('max_rate', rate)),
            'burst': min(burst, app.config.get('max_burst', burst)),
        }
    else:
        update = {
            name: value
        }

    ret = projectdb.update(project, update)
    if ret:
        rpc = app.config['scheduler_rpc']
        if rpc is not None:
            try:
                rpc.update_project()
            except socket.error as e:
                app.logger.warning('connect to scheduler rpc error: %r', e)
                return 'rpc error', 200
        return 'ok', 200
    else:
        app.logger.warning("[webui index] projectdb.update() error - res: {}".format(ret))
        return 'update error', 500


@app.route('/counter')
def counter():
    rpc = app.config['scheduler_rpc']
    if rpc is None:
        return json.dumps({})

    result = {}
    try:
        data = rpc.webui_update()
        for type, counters in iteritems(data['counter']):
            for project, counter in iteritems(counters):
                result.setdefault(project, {})[type] = counter
        for project, paused in iteritems(data['pause_status']):
            result.setdefault(project, {})['paused'] = paused
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return json.dumps({}), 200, {'Content-Type': 'application/json'}

    return json.dumps(result), 200, {'Content-Type': 'application/json'}


@app.route('/run', methods=['POST', ])
def runtask():
    rpc = app.config['scheduler_rpc']
    if rpc is None:
        return json.dumps({})

    projectdb = app.config['projectdb']
    project = request.form['project']
    project_info = projectdb.get(project, fields=('name', 'group'))
    if not project_info:
        return "no such project.", 404
    if 'lock' in projectdb.split_group(project_info.get('group')) \
            and not login.current_user.is_active():
        return app.login_response

    newtask = {
        "project": project,
        "taskid": "on_start",
        "url": "data:,on_start",
        "process": {
            "callback": "on_start",
        },
        "schedule": {
            "age": 0,
            "priority": 9,
            "force_update": True,
        },
    }

    try:
        ret = rpc.newtask(newtask)
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return json.dumps({"result": False}), 200, {'Content-Type': 'application/json'}
    return json.dumps({"result": ret}), 200, {'Content-Type': 'application/json'}


@app.route('/robots.txt')
def robots():
    return """User-agent: *
Disallow: /
Allow: /$
Allow: /debug
Disallow: /debug/*?taskid=*
""", 200, {'Content-Type': 'text/plain'}


================================================
FILE: pyspider/webui/login.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-10 20:36:27

import base64
from flask import Response
try:
    import flask_login as login
except ImportError:
    from flask.ext import login
from .app import app

login_manager = login.LoginManager()
login_manager.init_app(app)


class AnonymousUser(login.AnonymousUserMixin):

    def is_anonymous(self):
        return True

    def is_active(self):
        return False

    def is_authenticated(self):
        return False

    def get_id(self):
        return


class User(login.UserMixin):

    def __init__(self, id, password):
        self.id = id
        self.password = password

    def is_authenticated(self):
        if not app.config.get('webui_username'):
            return True
        if self.id == app.config.get('webui_username') \
                and self.password == app.config.get('webui_password'):
            return True
        return False

    def is_active(self):
        return self.is_authenticated()


login_manager.anonymous_user = AnonymousUser


@login_manager.request_loader
def load_user_from_request(request):
    api_key = request.headers.get('Authorization')
    if api_key:
        api_key = api_key[len("Basic "):]
        try:
            api_key = base64.b64decode(api_key).decode('utf8')
            return User(*api_key.split(":", 1))
        except Exception as e:
            app.logger.error('wrong api key: %r, %r', api_key, e)
            return None
    return None
app.login_response = Response(
    "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'}
)


@app.before_request
def before_request():
    if app.config.get('need_auth', False):
        if not login.current_user.is_active():
            return app.login_response


================================================
FILE: pyspider/webui/result.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-19 16:23:55

from __future__ import unicode_literals

from flask import render_template, request, json
from flask import Response
from .app import app
from pyspider.libs import result_dump


@app.route('/results')
def result():
    resultdb = app.config['resultdb']
    project = request.args.get('project')
    offset = int(request.args.get('offset', 0))
    limit = int(request.args.get('limit', 20))

    count = resultdb.count(project)
    results = list(resultdb.select(project, offset=offset, limit=limit))

    return render_template(
        "result.html", count=count, results=results,
        result_formater=result_dump.result_formater,
        project=project, offset=offset, limit=limit, json=json
    )


@app.route('/results/dump/<project>.<_format>')
def dump_result(project, _format):
    resultdb = app.config['resultdb']
    # force update project list
    resultdb.get(project, 'any')
    if project not in resultdb.projects:
        return "no such project.", 404

    offset = int(request.args.get('offset', 0)) or None
    limit = int(request.args.get('limit', 0)) or None
    results = resultdb.select(project, offset=offset, limit=limit)

    if _format == 'json':
        valid = request.args.get('style', 'rows') == 'full'
        return Response(result_dump.dump_as_json(results, valid),
                        mimetype='application/json')
    elif _format == 'txt':
        return Response(result_dump.dump_as_txt(results),
                        mimetype='text/plain')
    elif _format == 'csv':
        return Response(result_dump.dump_as_csv(results),
                        mimetype='text/csv')


================================================
FILE: pyspider/webui/static/.babelrc
================================================
{
  "presets": ["es2015"]
}


================================================
FILE: pyspider/webui/static/package.json
================================================
{
  "name": "pyspider-webui",
  "version": "0.3.9",
  "description": "webui of pyspider",
  "scripts": {
    "build": "webpack --progress --colors --optimize-minimize",
    "dev": "webpack --progress --colors --optimize-minimize --watch"
  },
  "keywords": [
    "pyspider"
  ],
  "author": "binux",
  "license": "MIT",
  "devDependencies": {
    "babel-core": "^6.14.0",
    "babel-loader": "^6.2.5",
    "babel-preset-es2015": "^6.14.0",
    "css-loader": "^0.25.0",
    "extract-text-webpack-plugin": "^1.0.1",
    "less": "^2.7.1",
    "less-loader": "^2.2.3",
    "style-loader": "^0.13.1",
    "webpack": "^1.13.2"
  }
}


================================================
FILE: pyspider/webui/static/src/css_selector_helper.js
================================================
// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:
// Author: Binux<i@binux.me>
//         http://binux.me
// Created on 2013-11-11 18:50:58

import EventEmitter from 'events'

function arrayEquals(a, b) {
  if (!a || !b)
    return false;
  if (a.length != b.length)
    return false;

  for (var i = 0, l = a.length; i < l; i++) {
    if (a[i] !== b[i])
      return false;
  }
  return true;
}

function getOffset(elem) {
  var top = 0;
  var left = 0;
  do {
    if ( !isNaN( elem.offsetLeft) ) left += elem.offsetLeft;
    if ( !isNaN( elem.offsetTop) ) top += elem.offsetTop;
  } while( elem = elem.offsetParent )
  return {top: top, left: left};
}

function merge_name(features) {
  var element_name = '';
  features.forEach(function(f) {
    if (f.selected)
      element_name += f.name;
  })
  return element_name;
}

function merge_pattern(path, end) {
  var pattern = '';
  var prev = null;
  path.forEach(function(p, i) {
    if (end >= 0 && i > end) {
      return;
    }
    if (p.invalid) {
      prev = null;
    } else if (p.selected) {
      if (prev) {
        pattern += ' >';
      }
      var element_pattern = '';
      p.features.forEach(function(f) {
        if (f.selected) {
          element_pattern += f.pattern;
        }
      });
      if (element_pattern === '') {
        element_pattern = '*';
      }
      pattern += ' '+element_pattern;
      prev = p;
    } else {
      prev = null;
    }
  })
  if (pattern === '') {
    pattern = '*';
  }
  return pattern;
}


function path_info(doc, element) {
  var path = [];
  do {
    var features = [];
    // tagName
    features.push({
      name: element.tagName.toLowerCase(),
      pattern: element.tagName.toLowerCase(),
      selected: true,
    });
    // id
    if (element.getAttribute('id')) {
      features.push({
        name: '#'+element.getAttribute('id'),
        pattern: '#'+element.getAttribute('id'),
        selected: true,
      });
    }
    // class
    if (element.classList.length > 0) {
      for (var i=0; i<element.classList.length; i++) {
        var class_name = element.classList[i];
        features.push({
          name: '.'+class_name,
          pattern: '.'+class_name,
          selected: true,
        });
      }
    }
    // rel, property
    var allowed_attr_names = ('rel', 'property', 'itemprop');
    for (var i=0, attrs = element.attributes; i < attrs.length; i++) {
      if (allowed_attr_names.indexOf(attrs[i].nodeName) == -1) {
        continue
      }
      features.push({
        name: '['+attrs[i].nodeName+'='+JSON.stringify(attrs[i].nodeValue)+']',
        pattern: '['+attrs[i].nodeName+'='+JSON.stringify(attrs[i].nodeValue)+']',
        selected: true,
      });
    }

    // get xpath
    var siblings = element.parentNode.childNodes;
    var xpath = element.tagName.toLowerCase();
    for (var i=0, ix=0; siblings.length > 1 && i < siblings.length; i++) {
      var sibling = siblings[i];
      if (sibling === element) {
        xpath += '['+(ix+1)+']';
        break;
      } else if (sibling.tagName == element.tagName) {
        ix++;
      }
    }

    // pack it up
    path.push({
      tag: element.tagName.toLowerCase(),
      name: merge_name(features),
      xpath: xpath,
      selected: true,
      invalid: element.tagName.toLowerCase() === 'tbody',
      features: features,
    });
  } while (element = element.parentElement);

  path.reverse();

  // select elements
  var selected_elements = doc.querySelectorAll(merge_pattern(path));
  path.forEach(function(p, i) {
    if (p.invalid)
      return;
    // select features
    var feature_selected_elements = doc.querySelectorAll(merge_pattern(path, i));
    p.features.forEach(function(f, fi) {
      f.selected = false;
      if (arrayEquals(feature_selected_elements,
        doc.querySelectorAll(merge_pattern(path, i)))) {
          return;
        }
      f.selected = true;
    });
    if (p.features.every(function(f) {
      return !f.selected;
    })) {
      p.features[0].selected = true;
    }
    p.name = merge_name(p.features);
  });

  path.forEach(function(p, i) {
    p.selected = false;
    if (arrayEquals(selected_elements,
      doc.querySelectorAll(merge_pattern(path)))) {
        p.name = p.tag;
        return;
      }
    p.selected = true;
  });

  return path;
}

export default class CSSSelectorHelperServer extends EventEmitter {
  constructor(window) {
    super();

    this.window = window;
    this.document = window.document;

    this.document.addEventListener("mouseover", (ev) => {
      this.overlay(ev.target);
    });

    this.document.addEventListener("click", (ev) => {
      ev.preventDefault();
      ev.stopPropagation();

      this.emit('selector_helper_click', path_info(this.document, ev.target));
    });
  }

  overlay(elements) {
    if (typeof elements === 'string') {
      elements = this.document.querySelectorAll(elements);
    }
    if (elements instanceof this.window.Element) {
      elements = [elements];
    }
    [...this.document.querySelectorAll('.pyspider_overlay')].forEach((elem) => {
      elem.remove();
    });
    [...elements].forEach((elem) => {
      const offset = getOffset(elem);
      const div = this.document.createElement("div");
      div.className = "pyspider_overlay";
      div.setAttribute('style', 'z-index: 999999;background-color: rgba(255, 165, 0, 0.3);position: absolute;pointer-events: none;'
        +'top: '+offset.top+'px;'
          +'left:'+offset.left+'px;'
          +'width: '+elem.offsetWidth+'px;'
          +'height: '+elem.offsetHeight+'px;');
      this.document.body.appendChild(div);
    });
  }

  heightlight(elements) {
    if (typeof elements === 'string') {
      elements = this.document.querySelectorAll(elements);
    }
    console.log(elements);
    if (elements instanceof this.window.Element) {
      elements = [elements];
    }
    [...this.document.querySelectorAll('.pyspider_highlight')].forEach((elem) => {
      elem.remove();
    });
    [...elements].forEach((elem) => {
      const offset = getOffset(elem);
      const div = this.document.createElement("div");
      div.className = "pyspider_highlight";
      div.setAttribute('style', 'z-index: 888888;border: 2px solid #c00;position: absolute;pointer-events: none;'
        +'top: '+(offset.top-2)+'px;'
          +'left:'+(offset.left-2)+'px;'
          +'width: '+elem.offsetWidth+'px;'
          +'height: '+elem.offsetHeight+'px;');
      this.document.body.appendChild(div);
    });
  }

  getElementByXpath(path) {
    return this.document.evaluate(path, this.document, null, this.window.XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
  }
}


================================================
FILE: pyspider/webui/static/src/debug.js
================================================
// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:
// Author: Binux<i@binux.me>
//         http://binux.me
// Created on 2014-02-23 15:19:19

import "./debug.less"
import "./splitter"
import CSSSelectorHelperServer from "./css_selector_helper"

window.SelectorHelper = (function() {
  var helper = $('#css-selector-helper');
  var server = null;

  function merge_name(p) {
    var features = p.features;
    var element_name = '';
    features.forEach(function(f) {
      if (f.selected)
        element_name += f.name;
    });
    if (element_name === '') {
      return p.tag;
    }
    return element_name;
  }

  function merge_pattern(path, end) {
    var pattern = '';
    var prev = null;
    path.forEach(function(p, i) {
      if (end >= 0 && i > end) {
        return;
      }
      if (p.invalid) {
        prev = null;
      } else if (p.selected) {
        if (prev) {
          pattern += ' >';
        }
        var element_pattern = '';
        p.features.forEach(function(f) {
          if (f.selected) {
            element_pattern += f.pattern;
          }
        });
        if (element_pattern === '') {
          element_pattern = '*';
        }
        pattern += ' '+element_pattern;
        prev = p;
      } else {
        prev = null;
      }
    })
    if (pattern === '') {
      pattern = '*';
    }
    return pattern.trim();
  }

  var current_path = null;
  function selector_changed(path) {
    current_path = path;
    server.heightlight(merge_pattern(path));
  }
  
  function render_selector_helper(path) {
    helper.find('.element').remove();
    var elements = [];
    $.each(path, function(i, p) {
      var span = $('<span>').addClass('element').data('info', p);
      $('<span class="element-name">').text(p.name).appendTo(span);
      if (p.selected) span.addClass('selected');
      if (p.invalid) span.addClass('invalid');

      var ul = $('<ul>');
      $.each(p.features, function(i, f) {
        var li = $('<li>').text(f.name).data('feature', f);
        if (f.selected) li.addClass('selected');
        li.appendTo(ul);
        // feature on click
        li.on('click', function(ev) {
          ev.stopPropagation();
          var $this = $(this);
          var f = $this.data('feature');
          if (f.selected) {
            f.selected = false;
            $this.removeClass('selected');
          } else {
            f.selected = true;
            $this.addClass('selected');
          }
          var element = $this.parents('.element');
          if (!p.selected) {
            p.selected = true;
            element.addClass('selected');
          }
          element.find('.element-name').text(merge_name(p));
          selector_changed(path);
        });
      });
      ul.appendTo(span);

      span.on('mouseover', (ev) => {
        var xpath = [];
        $.each(path, function(i, _p) {
          xpath.push(_p.xpath);
          if (_p === p) {
            return false;
          }
        });
        server.overlay(server.getElementByXpath('/' + xpath.join('/')));
      })
      // path on click
      span.on('click', function(ev) {
        ev.stopPropagation();
        var $this = $(this);
        var p = $this.data('info');
        if (p.selected) {
          p.selected = false;
          $this.removeClass('selected');
        } else {
          p.selected = true;
          $this.addClass('selected');
        }
        $this.find('.element-name').text(merge_name($this.data('info')));
        selector_changed(path);
      });
      elements.push(span);
    });
    helper.prepend(elements);

    adjustHelper();
    selector_changed(path);
  }

  function adjustHelper() {
    while (helper[0].scrollWidth > helper.width()) {
      var e = helper.find('.element:visible:first');
      if (e.length == 0) {
        return;
      }
      e.addClass('invalid').data('info')['invalid'] = true;
    }
  }

  var tab_web = $('#tab-web');
  return {
    init: function() {
      var _this = this;
      _this.clear();

      $("#J-enable-css-selector-helper").on('click', ev => {
        this.clear();
        server = new CSSSelectorHelperServer($("#tab-web iframe")[0].contentWindow);
        server.on('selector_helper_click', path => {
          render_selector_helper(path);
        })
        this.enable();
      });

      $("#task-panel").on("scroll", function(ev) {
        if (!helper.is(':visible')) {
          return;
        }
        if ($("#debug-tabs").position().top < 0) {
          helper.addClass('fixed');
          tab_web.addClass('fixed');
        } else {
          helper.removeClass('fixed');
          tab_web.removeClass('fixed');
        }
      });

      // copy button
      var input = helper.find('.copy-selector-input');
      input.on('focus', function(ev) {
        $(this).select();
      });
      helper.find('.copy-selector').on('click', function(ev) {
        if (!current_path) {
          return;
        }
        if (input.is(':visible')) {
          input.hide();
          helper.find('.element').show();
        } else {
          helper.find('.element').hide();
          input.val(merge_pattern(current_path)).show();
        }
      });
 
      // add button
      helper.find('.add-to-editor').on('click', function(ev) {
        Debugger.python_editor_replace_selection(merge_pattern(current_path));
      });
    },
    clear: function() {
      current_path = null;
      helper.hide();
      helper.removeClass('fixed');
      tab_web.removeClass('fixed');
      helper.find('.element').remove();
    },
    enable: function() {
      helper.show();
      helper.find('.copy-selector-input').hide();
      if ($("#debug-tabs").position().top < 0) {
        helper.addClass('fixed');
        tab_web.addClass('fixed');
      } else {
        helper.removeClass('fixed');
        tab_web.removeClass('fixed');
      }
    },
  }
})();

window.Debugger = (function() {
  var tmp_div = $('<div>');
  function escape(text) {
    return tmp_div.text(text).html();
  }

  return {
    init: function() {
      //init resizer
      this.splitter = $(".debug-panel:not(:first)").splitter().data('splitter')
          .trigger('init')
          .on('resize-start', function() {
            $('#left-area .overlay').show();
          })
          .on('resize-end', function() {
            $('#left-area .overlay').hide();
          });

      //codemirror
      CodeMirror.keyMap.basic.Tab = 'indentMore';
      this.init_python_editor($("#python-editor"));
      this.init_task_editor($("#task-editor"));
      this.bind_debug_tabs();
      this.bind_run();
      this.bind_save();
      this.bind_others();

      // css selector helper
      SelectorHelper.init();
    },

    not_saved: false,
    init_python_editor: function($el) {
      var _this = this;
      this.python_editor_elem = $el;
      var cm = this.python_editor = CodeMirror($el[0], {
        value: script_content,
        mode: "python",
        lineNumbers: true,
        indentUnit: 4,
        lineWrapping: true,
        styleActiveLine: true,
        autofocus: true
      });
      cm.on('focus', function() {
        $el.addClass("focus");
      });
      cm.on('blur', function() {
        $el.removeClass("focus");
      });
      cm.on('change', function() {
        _this.not_saved = true;
      });
      window.addEventListener('beforeunload', function(e) {
        if (_this.not_saved) {
          var returnValue = "You have not saved changes.";
          (e || window.event).returnValue = returnValue;
          return returnValue;
        }
      });
    },

    python_editor_replace_selection: function(content) {
      this.python_editor.getDoc().replaceSelection(content);
    },

    auto_format: function(cm) {
      var pos = cm.getCursor(true);
      CodeMirror.commands.selectAll(cm);
      cm.autoFormatRange(cm.getCursor(true), cm.getCursor(false));
      cm.setCursor(pos);
    },

    format_string: function(value, mode) {
      var div = document.createElement('div');
      var cm = CodeMirror(div, {
        value: value,
        mode: mode
      });
      this.auto_format(cm);
      return cm.getDoc().getValue();
    },

    init_task_editor: function($el) {
      var cm = this.task_editor = CodeMirror($el[0], {
        value: task_content,
        mode: "application/json",
        indentUnit: 2,
        lineWrapping: true,
        styleActiveLine: true,
        lint: true
      });
      this.auto_format(cm);
      cm.getDoc().clearHistory();
      cm.on('focus', function() {
        $el.addClass("focus");
      });
      cm.on('blur', function() {
        $el.removeClass("focus");
      });
    },

    bind_debug_tabs: function() {
      var _this = this;
      $('#tab-control > li[data-id]').on('click', function() {
        $('#tab-control > li[data-id]').removeClass('active');
        var name = $(this).addClass('active').data('id');
        $('#debug-tabs .tab').hide();
        $('#debug-tabs #'+name).show();
      });
      $("#tab-control li[data-id=tab-html]").on('click', function() {
        if (!!!$("#tab-html").data("format")) {
          var html_styled = "";
          CodeMirror.runMode(_this.format_string($("#tab-html pre").text(), 'text/html'), 'text/html',
                             function(text, classname) {
                               if (classname)
                                 html_styled += '<span class="cm-'+classname+'">'+escape(text)+'</span>';
                               else
                                 html_styled += escape(text);
                             });
          $("#tab-html pre").html(html_styled);
          $("#tab-html").data("format", true);
        }
      });
    },

    bind_run: function() {
      var _this = this;
      $('#run-task-btn').on('click', function() {
        _this.run();
      });
      $('#undo-btn').on('click', function(ev) {
        _this.task_editor.execCommand('undo');
      });
      $('#redo-btn').on('click', function(ev) {
        _this.task_editor.execCommand('redo');
      });
    },

    bind_save: function() {
      var _this = this;
      $('#save-task-btn').on('click', function() {
        var script = _this.python_editor.getDoc().getValue();
        $('#right-area .overlay').show();
        $.ajax({
          type: "POST",
          url: location.pathname+'/save',
          data: {
            script: script
          },
          success: function(data) {
            console.log(data);
            _this.python_log('');
            _this.python_log("saved!");
            _this.not_saved = false;
            $('#right-area .overlay').hide();
          },
          error: function(xhr, textStatus, errorThrown) {
            console.log(xhr, textStatus, errorThrown);
            _this.python_log("save error!\n"+xhr.responseText);
            $('#right-area .overlay').hide();
          }
        });
      });
    },

    bind_follows: function() {
      var _this = this;
      $('.newtask').on('click', function() {
        if ($(this).next().hasClass("task-show")) {
          $(this).next().remove();
          return;
        }
        var task = $(this).after('<div class="task-show"><pre class="cm-s-default"></pre></div>').data("task");
        task = JSON.stringify(window.newtasks[task], null, '  ');
        CodeMirror.runMode(task, 'application/json', $(this).next().find('pre')[0]);
      });
      
      $('.newtask .task-run').on('click', function(event) {
        event.preventDefault();
        event.stopPropagation();
        let task_id = $(this).parents('.newtask').data("task");
        let task = window.newtasks[task_id];
        _this.task_editor.setValue(JSON.stringify(task, null, '  '));
        _this.task_updated(task);
        _this.run();
      });
    },

    task_updated: function task_updated(task) {
      $('#history-wrap').hide();
      if (task.project && task.taskid) {
        $.ajax({
          url: `/task/${task.project}:${task.taskid}.json`,
          success: (data) => {
            if (!data.code && !data.error) {
              $('#history-link').attr('href', `/task/${task.project}:${task.taskid}`).text(`status: ${data.status_string}`);
              $('#history-wrap').show();
            }
          }
        })
      }
    },

    bind_others: function() {
      var _this = this;
      $('#python-log-show').on('click', function() {
        if ($('#python-log pre').is(":visible")) {
          $('#python-log pre').hide();
          $(this).height(8);
        } else {
          $('#python-log pre').show();
          $(this).height(0);
        }
      });
      $('.webdav-btn').on('click', function() {
        _this.toggle_webdav_mode(this);
      })
    },

    render_html: function(html, base_url, block_script=true, block_iframe=true) {
      if (html === undefined) {
        html = '';
      }
      let dom = (new DOMParser()).parseFromString(html, "text/html");

      $(dom).find('base').remove();
      $(dom).find('head').prepend('<base>');
      $(dom).find('base').attr('href', base_url);

      if (block_script) {
        $(dom).find('script').attr('type', 'text/plain');
      }
      if (block_iframe) {
        $(dom).find('iframe[src]').each((i, e) => {
          e = $(e);
          e.attr('__src', e.attr('src'))
          e.attr('src', encodeURI('data:text/html;,<h1>iframe blocked</h1>'));
        });
      }

      return dom.documentElement.innerHTML;
    },

    run: function() {
      var script = this.python_editor.getDoc().getValue();
      var task = this.task_editor.getDoc().getValue();
      var _this = this;

      // reset
      SelectorHelper.clear();
      $("#tab-web .iframe-box").html('');
      $("#tab-html pre").html('');
      $('#tab-follows').html('');
      $("#tab-control li[data-id=tab-follows] .num").hide();
      $('#python-log').hide();
      $('#left-area .overlay').show();

      $.ajax({
        type: "POST",
        url: location.pathname+'/run',
        data: {
          webdav_mode: _this.webdav_mode,
          script: _this.webdav_mode ? '' : script,
          task: task
        },
        success: function(data) {
          console.log(data);
          $('#left-area .overlay').hide();

          //web
          $("#tab-web .iframe-box").html('<iframe src="/blank.html" sandbox="allow-same-origin allow-scripts" height="50%"></iframe>');
          const iframe = $("#tab-web iframe")[0];
          const content_type = data.fetch_result.headers && data.fetch_result.headers['Content-Type'] && data.fetch_result.headers['Content-Type'] || "text/plain";

          //html
          $("#tab-html pre").text(data.fetch_result.content);
          $("#tab-html").data("format", true);

          let iframe_content = null;
          if (content_type.indexOf('application/json') == 0) {
            try {
              let content = JSON.parse(data.fetch_result.content);
              content = JSON.stringify(content, null, '  ');
              content = "<html><pre>"+content+"</pre></html>";
              iframe_content = _this.render_html(content, data.fetch_result.url, true, true, false);
            } catch (e) {
              iframe_content = "data:,Content-Type:"+content_type+" parse error.";
            }
          } else if (content_type.indexOf("text/html") == 0) {
            $("#tab-html").data("format", false);
            iframe_content = _this.render_html(data.fetch_result.content, data.fetch_result.url, true, true, false);
          } else if (content_type.indexOf("text") == 0) {
            iframe_content = "data:"+content_type+","+data.fetch_result.content;
          } else if (data.fetch_result.dataurl) {
            iframe_content = data.fetch_result.dataurl
          } else {
            iframe_content = "data:,Content-Type:"+content_type;
          }

          const doc = iframe.contentDocument;
          doc.open("text/html", "replace");
          doc.write(iframe_content)
          doc.close();
          doc.onreadystatechange = () => {
            if (doc.readyState === 'complete') {
              $("#tab-web iframe").height(doc.body.scrollHeight + 60);
            }
          };

          //follows
          $('#tab-follows').html('');
          var elem = $("#tab-control li[data-id=tab-follows] .num");

          var newtask_template = '<div class="newtask" data-task="__task__"><span class="task-callback">__callback__</span> &gt; <span class="task-url">__url__</span><div class="task-run"><i class="fa fa-play"></i></div><div class="task-more"> <i class="fa fa-ellipsis-h"></i> </div></div>';
          if (data.follows.length > 0) {
            elem.text(data.follows.length).show();
            var all_content = "";
            window.newtasks = {};
            $.each(data.follows, function(i, task) {
              var callback = task.process;
              callback = callback && callback.callback || '__call__';
              var content = newtask_template.replace('__callback__', callback);
              content = content.replace('__url__', task.url || '<span class="error">no_url!</span>');
              all_content += content.replace('__task__', i);
              window.newtasks[i] = task;
            });
            $('#tab-follows').append(all_content);
            _this.bind_follows();
          } else {
            elem.hide();
          }

          //messages
          $('#tab-messages pre').html('');
          if (data.messages.length > 0) {
            $("#tab-control li[data-id=tab-messages] .num").text(data.messages.length).show();
            var messages = JSON.stringify(data.messages, null, '  ');
            CodeMirror.runMode(messages, 'application/json', $('#tab-messages pre')[0]);
            $('#tab-messages')[0]
          } else {
            $("#tab-control li[data-id=tab-messages] .num").hide();
          }

          $("#tab-control li.active").click();

          // logs
          _this.python_log(data.logs);
        },
        error: function(xhr, textStatus, errorThrown) {
          console.log(xhr, textStatus, errorThrown);
          _this.python_log('error: '+textStatus);
          $('#left-area .overlay').hide();
        }
      });
    },

    python_log: function(text) {
      if (text) {
        $('#python-log pre').text(text);
        $('#python-log pre, #python-log').show();
        $('#python-log-show').height(0);
      } else {
        $('#python-log pre, #python-log').hide();
      }
    },

    webdav_mode: false,
    toggle_webdav_mode: function(button) {
      if (!this.webdav_mode) {
        if (this.not_saved) {
            if (!confirm("You have not saved changes. Ignore changes and switch to WebDav mode.")) {
            return;
          }
          this.not_saved = false;
        }
        this.python_editor_elem.hide();
        this.splitter.trigger('fullsize', 'prev');
        $(button).addClass('active');
        this.webdav_mode = !this.webdav_mode;
      } else {
        // leaving webdav mode, reload script
        var _this = this;
        $.ajax({
          type: "GET",
          url: location.pathname + '/get',
          success: function (data) {
            _this.splitter.trigger('init');
            _this.python_editor_elem.show();
            _this.python_editor.setValue(data.script);
            _this.not_saved = false;
            $(button).removeClass('active');
            _this.webdav_mode = !_this.webdav_mode;
          },
          error: function() {
            alert('Loading script from database error. Script may out-of-date.');
            _this.python_editor_elem.show();
            _this.splitter.trigger('init');
            $(button).removeClass('active');
            _this.webdav_mode = !_this.webdav_mode;
          },
        });
      }
    },
  };
})();

Debugger.init();


================================================
FILE: pyspider/webui/static/src/debug.less
================================================
/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */
/* Author: Binux<i@binux.me> */
/*         http://binux.me */
/* Created on 2014-02-23 00:28:30 */

@import "variable";

body {
  margin: 0;
  padding: 0;
  height: 100%;
  overflow: hidden;
}

.warning {
  color: @orange;
}
.error {
  color: @red;
}

@control-height: 35px;
#control {
  z-index: 9999;
  min-width: 760px;
  width: 100%;
  height: @control-height;
  position: fixed;
  left: 0;
  right: 0;
  background-color: @gray-lighter;
  box-shadow: 0px 1px 2px @gray-light;

  div {
    line-height: 35px;
    margin-left: 10px;
    margin-right: 10px;
  }

  .webdav-btn {
    position: relative;
    float: right;
    padding: 1px 7px 0 7px;
    line-height: 21px;
    border-radius: 5px;
    border: solid 1px @blue;
    background: white;
    color: @blue;
    cursor: pointer;
    margin: 6px 0 0 10px;

    &:hover {
      background: lighten(@blue, 10%);
      color: white;
    }
    &.active {
      background: @blue;
      color: white;
    }
  }
}

#editarea {
  width: 100%;
  position: fixed;
  top: @control-height + 2px;
  left: 0;
  right: 0;
  bottom: 0;
  //debug
}

.debug-panel {
  position: absolute;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
}

.resize {
  background-color: @gray;
  cursor: ew-resize;
  &:hover + .debug-panel {
    border-left: dashed 1px @gray !important;
 }
}

.overlay {
  position: absolute;
  top: 0;
  bottom: 0;
  left: 0;
  right: 0;
  z-index: 9999;
  background: rgba(0, 0, 0, 40%);
}

.focus .CodeMirror-activeline-background {
  background: #e8f2ff !important;
}
.CodeMirror-activeline-background {
  background: transparent !important;
}

#task-panel {
  height: 100%;
  overflow-x: auto;
}

.right-top-btn(@color: @green) {
  z-index: 99;
  position: absolute;
  top: 0;
  right: 0;
  background: @color;
  border-radius: 0 0 0 5px;
  color: white;
  margin: 0;
  padding: 3px 7px 5px 10px;
  cursor: pointer;
  font-weight: bold;
  line-height: 15px;

  &:hover {
    background: darken(@color, 10%);
  }
}

#run-task-btn {
  .right-top-btn(@color: @green);
}
#undo-redo-btn-group {
  @color: lighten(@green, 15%);
  .right-top-btn(@color: @color);

  top: auto;
  bottom: 0;
  border-radius: 5px 0 0 0;
  padding: 5px 0 3px 0;
  /*box-shadow: 0px 0px 30px @color;*/
  overflow: hidden;

  &:hover {
    background: @color;
  }

  a {
    color: white;
    text-decoration: none;
    padding: 5px 7px 3px 10px;
    &:hover {
      background: darken(@color, 10%);
    }
  }
}
#save-task-btn {
  .right-top-btn(@color: @blue);
}

#task-editor {
  position: relative;

  .CodeMirror {
    height: auto;
    padding-bottom: 3px;
    background: lighten(@green, 30%);
  }
  .CodeMirror-scroll {
    overflow-x: auto;
    overflow-y: hidden;
  }
  &.focus .CodeMirror-activeline-background {
    background: lighten(@green, 40%) !important;
  }
}

#tab-control {
  list-style-type: none;
  position: absolute;
  bottom: 0;
  right: 0;
  margin: 8px 20px;
  padding: 0;

  li {
    position: relative;
    float: right;
    padding: 1px 7px 0 7px;
    line-height: 21px;
    margin-left: 10px;
    border-radius: 5px;
    border: solid 1px @blue;
    background: white;
    color: @blue;
    cursor: pointer;

    &:hover {
      background: lighten(@blue, 10%);
      color: white;
    }
    &.active {
      background: @blue;
      color: white;
    }

    span {
      position: absolute;
      top: -5px;
      right: -10px;
      background: @red;
      color: white;
      font-size: 80%;
      font-weight: bold;
      padding: 2px 5px 0 5px;
      border-radius: 10px;
    }
  }
}

#debug-tabs {
  margin-bottom: 45px;
}

#tab-web {
  &.fixed {
    padding-top: 24px;
  }

  iframe {
    border-width: 0;
    width: 100%;
  }
}

#tab-html {
  margin: 0;
  padding: 7px 5px;

  pre {
    margin: 0;
    padding: 0;
  }
}

#tab-follows {
  .newtask {
    position: relative;
    height: 30px;
    line-height: 30px;
    background: lighten(@orange, 30%);
    border-bottom: solid 1px @orange;
    border-top: solid 1px @orange;
    margin-top: -1px;
    padding-left: 5px;
    padding-right: 70px;
    overflow: hidden;
    white-space: nowrap;
    text-overflow: ellipsis;
    cursor: pointer;

    &:hover {
      background: lighten(@orange, 20%);
      .task-more {
        background: lighten(@orange, 20%);
      }
    }
    .task-callback {
      color: darken(@orange, 10%);
    }
    .task-url {
      font-size: 95%;
      text-decoration: underline;
      font-weight: lighter;
      color: @blue;
    }
    .task-more {
      position: absolute;
      right: 33px;
      top: 0px;
      float: right;
      color: @orange;
      padding: 0 10px;
      background: lighten(@orange, 30%);
      border-radius: 10px;
    }
    .task-run {
      position: absolute;
      right: 0;
      top: 0;
      font-size: 80%;
      padding: 0 10px 0 30px;
      float: right;
      border-bottom: solid 1px lighten(@green, 20%);
      border-top: solid 1px lighten(@green, 20%);
      background: lighten(@green, 10%);
      color: white;
      text-shadow: 0 0 10px white;
      font-weight: bold;

      &:hover {
        background: @green;
      }
    }
  }
  .task-show {
    pre {
      margin: 5px 5px 10px 5px;
    }
  }
}

#python-editor {
  position: absolute;
  top: 0;
  width: 100%;
  bottom: 0;

  .CodeMirror {
    height: 100%;
    padding-bottom: 20px;
  }
}

#python-log {
  width: 100%;
  min-height: 10px;
  max-height: 40%;
  background: rgba(0, 0, 0, 60%);
  overflow: auto;

  #python-log-show {
    z-index: 89;
    width: auto;
    padding-top: 5px;
    background: @red;
    box-shadow: 0 2px 20px @red;
    cursor: pointer;
  }
  pre {
    margin: 0;
    padding: 10px 10px;
    color: white;
  }
}

#css-selector-helper {
  background-color: @gray-lighter;
  padding: 0;
  width: 100%;
  height: 24px;
  text-align: right;
  white-space: nowrap;

  &.fixed {
    position: absolute;
    top: 0;

  }

  button {
    line-height: 16px;
    vertical-align: 2px;
  }
}

span.element {
  position: relative;
  height: 24px;
  display: inline-block;
  padding: 0 0.2em;
  cursor: pointer;
  color: lighten(@gray, 35%);
  z-index: 99999;

  &.invalid {
    display: none;
  }
  &.selected {
    color: black;
  }
  &:hover {
    background-color: darken(@gray-lighter, 15%);

    & > ul {
      display: block;
    }
  }

  & > ul {
    display: none;
    margin: 0;
    padding: 0;
    position: absolute;
    top: 24px;
    left: 0;
    background-color: @gray-lighter;
    border: 1px solid black;
    border-top-width: 0;
    color: lighten(@gray, 35%);

    & > li {
      display: block;
      text-align: left;
      white-space: nowrap;
      padding: 0 4px;

      &.selected {
        color: black;
      }
      &:hover {
        background-color: darken(@gray-lighter, 15%);
      }
    }
  }
}

.copy-selector-input {
  height: 24px;
  padding: 0;
  border: 0;
  margin: 0;
  padding-right: 0.2em;
  font-size: 1em;
  text-align: right;
  width: 100%;
  margin-left: -100px;
  background: @gray-lighter;
}


================================================
FILE: pyspider/webui/static/src/index.js
================================================
// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:
// Author: Binux<i@binux.me>
//         http://binux.me
// Created on 2014-03-02 17:53:23

import "./index.less";

$(function() {
  //$("input[name=start-urls]").on('keydown', function(ev) {
    //if (ev.keyCode == 13) {
      //var value = $(this).val();
      //var textarea = $('<textarea class="form-control" rows=3 name="start-urls"></textarea>').replaceAll(this);
      //textarea.val(value).focus();
    //}
  //});

  function init_editable(projects_app) {
    $(".project-group>span").editable({
      name: 'group',
      pk: function(e) {
        return $(this).parents('tr').data("name");
      },
      emptytext: '[group]',
      placement: 'right',
      url: "/update",
      success: function(response, value) {
        var project_name = $(this).parents('tr').data("name");
        projects_app.projects[project_name].group = value;
        $(this).attr('style', '');
      }
    });

    $(".project-status>span").editable({
      type: 'select',
      name: 'status',
      source: [
        {value: 'TODO', text: 'TODO'},
        {value: 'STOP', text: 'STOP'},
        {value: 'CHECKING', text: 'CHECKING'},
        {value: 'DEBUG', text: 'DEBUG'},
        {value: 'RUNNING', text: 'RUNNING'}
      ],
      pk: function(e) {
        return $(this).parents('tr').data("name");
      },
      emptytext: '[status]',
      placement: 'right',
      url: "/update",
      success: function(response, value) {
        var project_name = $(this).parents('tr').data("name");
        projects_app.projects[project_name].status = value;
        $(this).removeClass('status-'+$(this).attr('data-value')).addClass('status-'+value).attr('data-value', value).attr('style', '');
      }
    });

    $(".project-rate>span").editable({
      name: 'rate',
      pk: function(e) {
        return $(this).parents('tr').data("name");
      },
      validate: function(value) {
        var s = value.split('/');
        if (s.length != 2)
          return "format error: rate/burst";
        if (!$.isNumeric(s[0]) || !$.isNumeric(s[1]))
          return "format error: rate/burst";
      },
      highlight: false,
      emptytext: '0/0',
      placement: 'right',
      url: "/update",
      success: function(response, value) {
        var project_name = $(this).parents('tr').data("name");
        var s = value.split('/');
        projects_app.projects[project_name].rate = parseFloat(s[0]);
        projects_app.projects[project_name].burst = parseFloat(s[1]);
        $(this).attr('style', '');
      }
    });
  }

  function init_sortable() {
    // table sortable
    Sortable.getColumnType = function(table, i) {
      var type = $($(table).find('th').get(i)).data('type');
      if (type == "num") {
        return Sortable.types.numeric;
      } else if (type == "date") {
        return Sortable.types.date;
      }
      return Sortable.types.alpha;
    };
    $('table.projects').attr('data-sortable', true);
    Sortable.init();
  }

  $("#create-project-modal form").on('submit', function(ev) {
    var $this = $(this);
    var project_name = $this.find('[name=project-name]').val()
    if (project_name.length == 0 || project_name.search(/[^\w]/) != -1) {
      $this.find('[name=project-name]').parents('.form-group').addClass('has-error');
      $this.find('[name=project-name] ~ .help-block').show();
      return false;
    }
    var mode = $this.find('[name=script-mode]:checked').val();
    $this.attr('action', '/debug/'+project_name);
    return true;
  });

  function update_counters() {
    $.get('/counter', function(data) {
      for (let project in data) {
        var info = data[project];
        if (projects_app.projects[project] === undefined)
          continue;

        // data inject
        var types = "5m,1h,1d,all".split(',');
        for (let type of types) {
          var d = info[type];
          if (d === undefined)
            continue;
          var pending = d.pending || 0,
            success = d.success || 0,
            retry = d.retry || 0,
            failed = d.failed || 0,
            sum = d.task || pending + success + retry + failed;
          d.task = sum;
          d.title = ""+type+" of "+sum+" tasks:\n"
            +(type == "all"
              ? "pending("+(pending/sum*100).toFixed(1)+"%): \t"+pending+"\n"
              : "new("+(pending/sum*100).toFixed(1)+"%): \t\t"+pending+"\n")
            +"success("+(success/sum*100).toFixed(1)+"%): \t"+success+"\n"
            +"retry("+(retry/sum*100).toFixed(1)+"%): \t"+retry+"\n"
            +"failed("+(failed/sum*100).toFixed(1)+"%): \t"+failed;
        }

        projects_app.projects[project].paused = info['paused'];
        projects_app.projects[project].time = info['5m_time'];
        projects_app.projects[project].progress = info;
      }
    });
  }

  function update_queues() {
    $.get('/queues', function(data) {
      //console.log(data);
      $('.queue_value').each(function(i, e) {
        var attr = $(e).attr('title');
        if (data[attr] !== undefined) {
          $(e).text(data[attr]);
        } else {
          $(e).text('???');
        }
      });
    });
  }

  // projects vue
  var projects_map = {};
  projects.forEach(function(p) {
    p.paused = false;
    p.time = {};
    p.progress = {};
    projects_map[p.name] = p;
  });
  var projects_app = new Vue({
    el: '.projects',
    data: {
      projects: projects_map
    },
    ready: function() {
      init_editable(this);
      init_sortable(this);
      update_counters();
      window.setInterval(update_counters, 15*1000);
      update_queues();
      window.setInterval(update_queues, 15*1000);
    },
    methods: {
      project_run: function(project, event) {
        $("#need-set-status-alert").hide();
        if (project.status != "RUNNING" && project.status != "DEBUG") {
          $("#need-set-status-alert").show();
        }
        
        var _this = event.target;
        $(_this).addClass("btn-warning");
        $.ajax({
          type: "POST",
          url: '/run',
          data: {
            project: project.name
          },
          success: function(data) {
            $(_this).removeClass("btn-warning");
            if (!data.result) {
              $(_this).addClass("btn-danger");
            }
          },
          error: function() {
            $(_this).removeClass("btn-warning").addClass("btn-danger");
          }
        });
      }
    }
  });
});


================================================
FILE: pyspider/webui/static/src/index.less
================================================
/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */
/* Author: Binux<i@binux.me> */
/*         http://binux.me */
/* Created on 2014-02-23 00:28:30 */

@import "variable";

h1 {
  margin-top: 5px;
}

header .alert {
  position: absolute;;
  width: 50rem;
  left: 50%;
  margin-left: -25rem;
}

.queue-info {
  th, td {
    text-align: center;
    border: 1px solid #ddd;
  }
}

[v-cloak] {
  display: none;
}

.projects {
  min-width: 850px;
  border-top: 1px solid #ddd;
  border-bottom: 1px solid #ddd;

  .project-group {
    width: 80px;
  }

  .project-name {
    font-weight: bold;
  }

  .project-status {
    width: 100px;
  }
  .project-status-span(@color) {
    border: solid 1px darken(@color, 10%);
    padding: 1px 5px 0 5px;
    background: @color;
    color: white;
  }
  .project-status>span {
    .project-status-span(@gray-light);
  }
  span.status-TODO {
    .project-status-span(@orange);
  }
  span.status-STOP {
    .project-status-span(@red);
  }
  span.status-CHECKING {
    .project-status-span(darken(@yellow, 10%));
  }
  span.status-DEBUG {
    .project-status-span(@blue);
  }
  span.status-RUNNING {
    .project-status-span(@green);
  }
  span.status-PAUSED {
    .project-status-span(@gray);
  }

  .project-rate {
    width: 110px;
  }

  .project-time {
    width: 110px;
  }
  
  th.project-progress {
    position: relative;
    span {
      position: absolute;
    }
  }

  td.project-progress {
    position: relative;
    min-width: 5%;
    &.progress-all {
      min-width: 10%;
    }

    .progress {
      position: relative;
      margin: 0;
      background-color: #aaa;
      .progress-text {
        width: 100%;
        text-align: center;
        position: absolute;
        font-weight: bold;
        color: #fff;
        pointer-events: none;
      }
      .progress-bar {
        -webkit-transition: none;
        transition: none;
      }
    }
  }

  .project-actions {
    width: 200px;
  }
}

.global-btn {
  margin-top: -5px;
  padding: 10px 10px 10px 10px;

  .create-btn-div {
    float: right;
  }

  .active-btn-div {
    float: left;
  }
}


================================================
FILE: pyspider/webui/static/src/result.less
================================================
/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */
/* Author: Binux<i@binux.me> */
/*         http://binux.me */
/* Created on 2014-10-22 22:38:45 */

@import "variable";

.top-bar {
  padding: 10px 15px 2px 15px;
  height: 46px;
  background-color: #f5f5f5;
  border-bottom: 1px solid #ddd;
  position: relative;
  
  h1 {
    margin: 0 0 10px 0;
    font-size: 18px;
  }

  .btn-group {
    margin: 8px 10px 0 0;
    position: absolute;
    right: 0;
    top: 0;

    a.btn {
    }
  }
}

.pagination-wrap {
  text-align: right;
  padding-right: 15px;
}

table {
  border-bottom: 1px solid #ddd;

  td {
    word-break: break-all;
  }
}


================================================
FILE: pyspider/webui/static/src/splitter.js
================================================
// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:
// Author: Binux<i@binux.me>
//         http://binux.me
// Created on 2014-02-23 01:35:35
// from: https://github.com/jsbin/jsbin

$.fn.splitter = function (_type) {
  var $document = $(document),
  $blocker = $('<div class="block"></div>'),
  $body = $('body');
  // blockiframe = $blocker.find('iframe')[0];

  var splitterSettings = JSON.parse(localStorage.getItem('splitterSettings') || '[]');
  return this.each(function () {
    var $el = $(this),
    $originalContainer = $(this),
    guid = $.fn.splitter.guid++,
    $parent = $el.parent(),
    type = _type || 'x',
    $prev = type === 'x' ? $el.prevAll(':visible:first') : $el.nextAll(':visible:first'),
    $handle = $('<div class="resize"></div>'),
    dragging = false,
    width = $parent.width(),
    parentOffset = $parent.offset(),
    left = parentOffset.left,
    top = parentOffset.top, // usually zero :(
    props = {
      x: {
        display: 'block',
        currentPos: $parent.offset().left,
        multiplier: 1,
        cssProp: 'left',
        otherCssProp: 'right',
        size: $parent.width(),
        sizeProp: 'width',
        moveProp: 'pageX',
        init: {
          top: 0,
          bottom: 0,
          width: 8,
          'margin-left': '-4px',
          height: '100%',
          left: 'auto',
          right: 'auto',
          opacity: 0,
          position: 'absolute',
          cursor: 'ew-resize',
          // 'border-top': '0',
          'border-left': '1px solid rgba(218, 218, 218, 0.5)',
          'z-index': 99999
        }
      },
      y: {
        display: 'block',
        currentPos: $parent.offset().top,
        multiplier: -1,
        size: $parent.height(),
        cssProp: 'bottom',
        otherCssProp: 'top',
        sizeProp: 'height',
        moveProp: 'pageY',
        init: {
          top: 'auto',
          cursor: 'ns-resize',
          bottom: 'auto',
          height: 8,
          width: '100%',
          left: 0,
          right: 0,
          opacity: 0,
          position: 'absolute',
          border: 0,
          // 'border-top': '1px solid rgba(218, 218, 218, 0.5)',
          'z-index': 99999
        }
      }
    },
    refreshTimer = null,
    settings = splitterSettings[guid] || {};

    var tracker = {
      down: { x: null, y: null },
      delta: { x: null, y: null },
      track: false,
      timer: null
    };
    $handle.bind('mousedown', function (event) {
      tracker.down.x = event.pageX;
      tracker.down.y = event.pageY;
      tracker.delta = { x: null, y: null };
      tracker.target = $handle[type == 'x' ? 'height' : 'width']() * 0.25;
    });

    $document.bind('mousemove', function (event) {
      if (dragging) {
        tracker.delta.x = tracker.down.x - event.pageX;
        tracker.delta.y = tracker.down.y - event.pageY;
        clearTimeout(tracker.timer);
        tracker.timer = setTimeout(function () {
          tracker.down.x = event.pageX;
          tracker.down.y = event.pageY;
        }, 250);
        //disable change to y
        //var targetType = type == 'x' ? 'y' : 'x';
        //if (Math.abs(tracker.delta[targetType]) > tracker.target) {
          //$handle.trigger('change', targetType, event[props[targetType].moveProp]);
          //tracker.down.x = event.pageX;
          //tracker.down.y = event.pageY;
        //}
      }
    });

    function moveSplitter(pos) {
      if (type === 'y') {
        pos -= top;
      }
      var v = pos - props[type].currentPos,
      split = 100 / props[type].size * v,
      delta = (pos - settings[type]) * props[type].multiplier,
      prevSize = $prev[props[type].sizeProp](),
      elSize = $el[props[type].sizeProp]();

      if (type === 'y') {
        split = 100 - split;
      }

      // if prev panel is too small and delta is negative, block
      if (prevSize < 100 && delta < 0) {
        // ignore
      } else if (elSize < 100 && delta > 0) {
        // ignore
      } else {
        // allow sizing to happen
        $el.css(props[type].cssProp, split + '%');
        $prev.css(props[type].otherCssProp, (100 - split) + '%');
        var css = {};
        css[props[type].cssProp] = split + '%';
        $handle.css(css);
        settings[type] = pos;
        splitterSettings[guid] = settings;
        localStorage.setItem('splitterSettings', JSON.stringify(splitterSettings));

        // wait until animations have completed!
        if (moveSplitter.timer) clearTimeout(moveSplitter.timer);
        moveSplitter.timer = setTimeout(function () {
          $document.trigger('sizeeditors');
        }, 120);
      }
    }

    function resetPrev() {
      $prev = type === 'x' ? $handle.prevAll(':visible:first') : $handle.nextAll(':visible:first');
    }

    $document.bind('mouseup touchend', function () {
      if (dragging) {
        dragging = false;
        $handle.trigger('resize-end');
        $blocker.remove();
        // $handle.css( 'opacity', '0');
        $body.removeClass('dragging');
      }
    }).bind('mousemove touchmove', function (event) {
      if (dragging) {
        moveSplitter(event[props[type].moveProp] || event.originalEvent.touches[0][props[type].moveProp]);
      }
    });

    $blocker.bind('mousemove touchmove', function (event) {
      if (dragging) {
        moveSplitter(event[props[type].moveProp] || event.originalEvent.touches[0][props[type].moveProp]);
      }
    });

    $handle.bind('mousedown touchstart', function (e) {
      dragging = true;
      $handle.trigger('resize-start');
      $body.append($blocker).addClass('dragging');
      props[type].size = $parent[props[type].sizeProp]();
      props[type].currentPos = 0; // is this really required then?

      resetPrev();
      e.preventDefault();
    });

    /*
       .hover(function () {
       $handle.css('opacity', '1');
       }, function () {
       if (!dragging) {
       $handle.css('opacity', '0');
       }
       })
       */

    $handle.bind('fullsize', function(event, panel) {
      if (panel === undefined) {
        panel = 'prev';
      }
      var split = 0;
      if (panel === 'prev') {
        split = 100;
      }
      $el.css(props[type].cssProp, split + '%');
      $prev.css(props[type].otherCssProp, (100 - split) + '%');
      $handle.hide();
    });

    $handle.bind('init', function (event, x) {
      $handle.css(props[type].init);
      props[type].size = $parent[props[type].sizeProp]();
      resetPrev();

      // can only be read at init
      top = $parent.offset().top;

      $blocker.css('cursor', type == 'x' ? 'ew-resize' : 'ns-resize');

      if (type == 'y') {
        $el.css('border-right', 0);
        $prev.css('border-left', 0);
        $prev.css('border-top', '2px solid #ccc');
      } else {
        // $el.css('border-right', '1px solid #ccc');
        $el.css('border-top', 0);
        // $prev.css('border-right', '2px solid #ccc');
      }

      if ($el.is(':hidden')) {
        $handle.hide();
      } else {
        if ($prev.length) {
          $el.css('border-' + props[type].cssProp, '1px solid #ccc');
        } else {
          $el.css('border-' + props[type].cssProp, '0');
        }
        moveSplitter(x !== undefined ? x : settings[type] || $el.offset()[props[type].cssProp]);
      }
    }); //.trigger('init', settings.x || $el.offset().left);

    $handle.bind('change', function (event, toType, value) {
      $el.css(props[type].cssProp, '0');
      $prev.css(props[type].otherCssProp, '0');
      $el.css('border-' + props[type].cssProp, '0');

      if (toType === 'y') {
        // 1. drop inside of a new div that encompases the elements
        $el = $el.find('> *');
        $handle.appendTo($prev);
        $el.appendTo($prev);
        $prev.css('height', '100%');
        $originalContainer.hide();
        $handle.css('margin-left', 0);
        $handle.css('margin-top', 5);

        $handle.addClass('vertical');

        delete settings.x;

        $originalContainer.nextAll(':visible:first').trigger('init');
        // 2. change splitter to the right to point to new block div
      } else {
        $el = $prev;
        $prev = $tmp;

        $el.appendTo($originalContainer);
        $handle.insertBefore($originalContainer);
        $handle.removeClass('vertical');
        $el.css('border-top', 0);
        $el = $originalContainer;
        $originalContainer.show();
        $handle.css('margin-top', 0);
        $handle.css('margin-left', -4);
        delete settings.y;

        setTimeout(function() {
          $originalContainer.nextAll(':visible:first').trigger('init');
        }, 0);
      }

      resetPrev();

      type = toType;

      // if (type == 'y') {
      // FIXME $prev should check visible
      var $tmp = $el;
      $el = $prev;
      $prev = $tmp;
      // } else {

      // }

      $el.css(props[type].otherCssProp, '0');
      $prev.css(props[type].cssProp, '0');
      // TODO
      // reset top/bottom positions
      // reset left/right positions

      if ($el.is(':visible')) {
        // find all other handles and recalc their height
        if (type === 'y') {
          var otherhandles = $el.find('.resize');

          otherhandles.each(function (i) {
            // find the top of the
            var $h = $(this);
            if (this === $handle[0]) {
              // ignore
            } else {
              // TODO change to real px :(
              $h.trigger('init', 100 / (otherhandles - i - 1));
            }
          });
        }
        $handle.trigger('init', value || $el.offset()[props[type].cssProp] || props[type].size / 2);
      }
    });


    $prev.css('width', 'auto');
    $prev.css('height', 'auto');
    $el.data('splitter', $handle);
    $el.before($handle);

    // if (settings.y) {
    //   $handle.trigger('change', 'y');
    // }
  });
};

$.fn.splitter.guid = 0;


================================================
FILE: pyspider/webui/static/src/task.less
================================================
/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */
/* Author: Binux<i@binux.me> */
/*         http://binux.me */
/* Created on 2014-07-16 19:20:30 */

@import "variable";

.base-info {
  padding: 10px 15px 2px 15px;
  background-color: #f5f5f5;
  border-bottom: 1px solid #ddd;
}

.more-info {
  padding: 10px 15px;
}

.more-info dd {
  display: block;
  font-family: monospace;
  white-space: pre;
  word-break: break-all;
  word-wrap: break-word;
  margin: 1em 0px;
}

.status_mix(@color: lighten(black, 50%)) {
  border: solid 1px darken(@color, 10%);
  padding: 1px 5px 0 5px;
  background: @color;
  color: white;
}
.status {
  &-1 {
    .status_mix(@blue);
  }
  &-2 {
    .status_mix(@green);
  }
  &-3 {
    .status_mix(@red);
  }
  &-4 {
    .status_mix;
  }
}

.url {
  font-size: 120%;
  text-decoration: underline;
}

.callback {
  color: @orange;
  font-weight: bold;

  &:hover, &:focus {
    color: darken(@orange, 10%);
  }
}

dt .glyphicon-ok {
  color: @green;
}
dt .glyphicon-remove {
  color: @red;
}


================================================
FILE: pyspider/webui/static/src/tasks.less
================================================
/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */
/* Author: Binux<i@binux.me> */
/*         http://binux.me */
/* Created on 2014-07-18 23:20:46 */

@import "variable";
@import "task";

.tasks {
  margin: 0;
  padding: 0;
  list-style-type: none;

  li {
    .base-info;

    &:nth-child(even) {
      background-color: white;
    }
  }

  .url {
    display: inline-block;
    vertical-align: bottom;
    max-width: 40em;
    overflow: hidden;
    white-space: nowrap;
    text-overflow: ellipsis;
  }
  
  .update-time {
    font-weight: bold;
  }
}


================================================
FILE: pyspider/webui/static/src/variable.less
================================================
/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */
/* Author: Binux<i@binux.me> */
/*         http://binux.me */
/* Created on 2014-07-16 19:18:30 */

// colors
@gray-darker:            lighten(#000, 13.5%); // #222
@gray-dark:              lighten(#000, 20%);   // #333
@gray:                   lighten(#000, 33.5%); // #555
@gray-light:             lighten(#000, 60%);   // #999
@gray-lighter:           lighten(#000, 93.5%); // #eee

@blue: #428bca;
@green: #5cb85c;
@blue-light: #5bc0de;
@orange: #f0ad4e;
@yellow: #ffe543;
@red: #d9534f;


================================================
FILE: pyspider/webui/static/webpack.config.js
================================================
var webpack = require("webpack");
var ExtractTextPlugin = require("extract-text-webpack-plugin");

module.exports = {
  entry: {
    index: "./src/index",
    debug: "./src/debug",
    result: "./src/result.less",
    task: "./src/task.less",
    tasks: "./src/tasks.less",
  },
  output: {
    //path: "./dist",
    filename: "[name].min.js"
  },
  module: {
    loaders: [
      { test: /\.js$/, loader: "babel-loader" },
      { test: /\.less$/, loader: ExtractTextPlugin.extract("style-loader", "css-loader?sourceMap!less-loader?sourceMap") }
    ]
  },
  devtool: 'source-map',
  plugins: [
    new ExtractTextPlugin("[name].min.css"),
    new webpack.optimize.UglifyJsPlugin({ compress: { warnings: false } }),
  ]
}


================================================
FILE: pyspider/webui/task.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-07-16 15:30:57

import socket
from flask import abort, render_template, request, json

from pyspider.libs import utils
from .app import app


@app.route('/task/<taskid>')
def task(taskid):
    if ':' not in taskid:
        abort(400)
    project, taskid = taskid.split(':', 1)

    taskdb = app.config['taskdb']
    task = taskdb.get_task(project, taskid)

    if not task:
        abort(404)
    resultdb = app.config['resultdb']
    result = {}
    if resultdb:
        result = resultdb.get(project, taskid)

    return render_template("task.html", task=task, json=json, result=result,
                           status_to_string=app.config['taskdb'].status_to_string)


@app.route('/task/<taskid>.json')
def task_in_json(taskid):
    if ':' not in taskid:
        return json.jsonify({'code': 400, 'error': 'bad project:task_id format'})
    project, taskid = taskid.split(':', 1)

    taskdb = app.config['taskdb']
    task = taskdb.get_task(project, taskid)

    if not task:
        return json.jsonify({'code': 404, 'error': 'not found'})
    task['status_string'] = app.config['taskdb'].status_to_string(task['status'])
    return json.jsonify(task)


@app.route('/tasks')
def tasks():
    rpc = app.config['scheduler_rpc']
    taskdb = app.config['taskdb']
    project = request.args.get('project', "")
    limit = int(request.args.get('limit', 100))

    try:
        updatetime_tasks = rpc.get_active_tasks(project, limit)
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return 'connect to scheduler error', 502

    tasks = {}
    result = []
    for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]):
        key = '%(project)s:%(taskid)s' % task
        task['updatetime'] = updatetime
        if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE:
            result.append(tasks[key])
        tasks[key] = task
    result.extend(tasks.values())

    return render_template(
        "tasks.html",
        tasks=result,
        status_to_string=taskdb.status_to_string
    )


@app.route('/active_tasks')
def active_tasks():
    rpc = app.config['scheduler_rpc']
    taskdb = app.config['taskdb']
    project = request.args.get('project', "")
    limit = int(request.args.get('limit', 100))

    try:
        tasks = rpc.get_active_tasks(project, limit)
    except socket.error as e:
        app.logger.warning('connect to scheduler rpc error: %r', e)
        return '{}', 502, {'Content-Type': 'application/json'}

    result = []
    for updatetime, task in tasks:
        task['updatetime'] = updatetime
        task['updatetime_text'] = utils.format_date(updatetime)
        if 'status' in task:
            task['status_text'] = taskdb.status_to_string(task['status'])
        result.append(task)

    return json.dumps(result), 200, {'Content-Type': 'application/json'}

app.template_filter('format_date')(utils.format_date)


================================================
FILE: pyspider/webui/templates/debug.html
================================================
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{{ project_name }} - Debugger - pyspider</title>
    <!--[if lt IE 9]>
      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->

    <meta name="description" content="pyspider - debugger - {{ project_name }}">
    <meta name="author" content="binux">

    <link href="{{ url_for('cdn', path='codemirror/5.20.2/codemirror.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('cdn', path='font-awesome/4.0.3/css/font-awesome.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('cdn', path='codemirror/5.20.2/addon/dialog/dialog.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('cdn', path='codemirror/5.20.2/addon/lint/lint.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('static', filename='debug.min.css') }}" rel="stylesheet">

    <script src="{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='jsonlint/1.6.0/jsonlint.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/codemirror.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/mode/xml/xml.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/mode/css/css.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/mode/javascript/javascript.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/mode/htmlmixed/htmlmixed.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/mode/python/python.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/search/search.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/search/searchcursor.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/dialog/dialog.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/selection/active-line.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/runmode/runmode.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/lint/lint.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/5.20.2/addon/lint/json-lint.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='codemirror/2.36.0/formatting.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='URI.js/1.11.2/URI.min.js') }}"></script>
  </head>

  <body>
    <section id="control">
      <div class="title pull-left"><a href="/">pyspider</a> &gt; {{ project_name }}</div>
      <div class="pull-right">
        <a href="http://docs.pyspider.org/" target="_blank">Documentation</a>
        <span class="webdav-btn">WebDAV Mode</span>
      </div>
    </section>
    <section id="editarea">
      <div id="left-area" class="debug-panel" style="right: 50%">
        <div id="task-panel">
          <div id="task-editor" class="editor">
            <div id="run-task-btn">run</div>
            <div id="undo-redo-btn-group">
              <a href="javascript:;" id="undo-btn"> &lt; </a>|<a href="javascript:;" id="redo-btn">&gt; </a>
              <span id="history-wrap" style="display: none;">|<a target=_blank id="history-link">history</a></span>
            </div>
          </div>
          <div id="python-log" style="display: none;">
            <pre style="display: none;"></pre>
            <div id="python-log-show"></div>
          </div>
          <div id="debug-tabs">
            <div id="tab-web" class="tab" style="display: none;">
              <div id="css-selector-helper">
                <input class="copy-selector-input" />
                <button class="btn copy-selector"><i class="fa fa-clipboard" title="copy css selector"></i></button>
                <button class="btn add-to-editor"><i class="fa fa-arrow-right" title="add to editor"></i></button>
              </div>
              <div class="iframe-box"></div>
            </div>
            <div id="tab-html" class="tab" style="display: none;"><pre class="cm-s-default"></pre></div>
            <div id="tab-follows" class="tab">
              {# <div class="newtask">
                <span class="task-callback">__callback__</span> &gt; <span class="task-url">__url__</span>
                <div class="task-run"><i class="fa fa-play"></i></div>
                <div class="task-more"> <i class="fa fa-ellipsis-h"></i> </div>
              </div> #}
            </div>
            <div id="tab-messages" class="tab" style="display: none;">
              <pre class="cm-s-default"></pre>
            </div>
          </div>
        </div>
        <ul id="tab-control">
          <li data-id="tab-messages">messages<span class="num" style="display: none;"></span></li>
          <li data-id="tab-follows">follows<span class="num" style="display: none;"></span></li>
          <li data-id="tab-html">html</li>
          <li data-id="tab-web" class="active">web</li>
          <li id="J-enable-css-selector-helper">enable css selector helper</li>
        </ul>
        <div class="overlay" style="display: none;"></div>
      </div>

      <div id="right-area" class="debug-panel" style="left: 50%">
        <div id="python-editor" class="editor focus">
          <div id="save-task-btn">save</div>
        </div>
        <div class="overlay" style="display: none;"></div>
      </div>
    </section>

    <script>
      var task_content = {{ task | tojson | tojson | safe }};
      var script_content = {{ script | tojson | safe }};
    </script>
    <script src="{{ url_for('static', filename='debug.min.js') }}"></script>
  </body>
</html>
<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8 syntax=htmldjango: -->


================================================
FILE: pyspider/webui/templates/index.html
================================================
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Dashboard - pyspider</title>
    <!--[if lt IE 9]>
      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->

    <meta name="description" content="pyspider dashboard">
    <meta name="author" content="binux">
    <link href="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('cdn', path='x-editable/1.5.0/bootstrap3-editable/css/bootstrap-editable.css') }}" rel="stylesheet">
    <link href="{{ url_for('cdn', path='sortable/0.6.0/css/sortable-theme-bootstrap.css') }}" rel="stylesheet">
    <link href="{{ url_for('static', filename='index.min.css') }}" rel="stylesheet">

    <script src="{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}"></script>
  </head>

  <body>
    <header>
      <div id="need-set-status-alert" class="alert alert-danger alert-dismissible" style="display:none;" role="alert">
        <button type="button" class="close" data-dismiss="alert" aria-label="Close"><span aria-hidden="true">&times;</span></button>
        Project is not started, please set status to RUNNING or DEBUG.
      </div>
      <h1>pyspider dashboard</h1>
      <table class="table queue-info">
        <tr>
          <th>scheduler</th>
          <td class="queue_value" title="scheduler2fetcher">???</td>
          <th>fetcher</th>
          <td class="queue_value" title="fetcher2processor">???</td>
          <th>processor</th>
          <td class="queue_value" title="processor2result">???</td>
          <th>result_worker</th>
        </tr>
        <tr>
          <td style="border-width: 0px 1px;"></td>
          <td colspan=3></td>
          <td style="border-width: 0px 1px;"></td>
          <td colspan=2 style="border-width: 0px;"></td>
        </tr>
        <tr>
          <td style="border-width: 0px 0px 1px 1px"></td>
          <td colspan=3 style="border-width: 1px 0px;">
            <span class="queue_value" title="newtask_queue">???</span>
            + <span class="queue_value" title="status_queue">???</span>
          </td>
          <td style="border-width: 0px 1px 1px 0px;"></td>
          <td colspan=2 style="border-width: 0px;"></td>
        </tr>
      </table>
    </header>
    <section>
      <div class="global-btn clearfix">
        <div class="create-btn-div">
          <button class="project-create btn btn-default btn-primary" data-toggle="modal" data-target="#create-project-modal">Create</button>
        </div>

        <div class="active-btn-div">
          {% if config.scheduler_rpc is not none %}
            <a class="btn btn-default btn-info" href='/tasks' target=_blank>Recent Active Tasks</a>
          {% endif %}
        </div>

        <div class="modal fade" id="create-project-modal">
          <div class="modal-dialog">
            <div class="modal-content">
              <div class="modal-header">
                <button type="button" class="close" data-dismiss="modal" aria-label="Close"><span aria-hidden="true">&times;</span></button>
                <h4 class="modal-title">Create New Project</h4>
              </div>
              <form class="form-horizontal" method="POST">
                <div class="modal-body">
                  <div class="form-group">
                    <label class="col-sm-3 control-label" for="project-name">Project Name</label>
                    <div class="col-sm-9">
                      <input class="form-control" type="text" name="project-name" autocomplete="off">
                      <span class="help-block" style="display: none;">[a-zA-Z0-9_]+</span>
                    </div>
                  </div>
                  <div class="form-group">
                    <label class="col-sm-3 control-label" for="start-urls">Start URL(s)</label>
                    <div class="col-sm-9">
                      <input class="form-control" type="text" name="start-urls">
                    </div>
                  </div>
                  <div class="form-group">
                    <label class="col-sm-3 control-label" for="script-mode">Mode</label>
                    <div class="col-sm-9">
                      <div class="btn-group" data-toggle="buttons">
                        <label class="btn btn-default active">
                          <input type="radio" name="script-mode" id="mode-script" autocomplete="off" value="script" checked> Script
                        </label>
                        <label class="btn btn-default">
                          <input type="radio" name="script-mode" id="mode-slime" autocomplete="off" value="slime"> Slime
                        </label>
                      </div>
                    </div>
                  </div>
                </div>
                <div class="modal-footer">
                  <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                  <button type="submit" class="btn btn-primary">Create</button>
                </div>
              </form>
            </div>
          </div>
        </div>
      </div>
      <table class="table sortable-theme-bootstrap projects">
        <thead>
          <tr>
            <th>group</th>
            <th>project name</th>
            <th>status</th>
            <th data-type="num">rate/burst</th>
            <th data-type="num">avg time</th>
            <th class="project-progress" data-type="num">&nbsp;<span>progress</span></th>
            <th data-type="num">&nbsp;</th>
            <th data-type="num">&nbsp;</th>
            <th data-type="num">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</th>
            <th data-type="num">actions</th>
          </tr>
        </thead>
        <tbody>
        {% raw %}
          <tr v-cloak v-for="project in projects" data-name="{{* project.name }}">
            <td class="project-group"><span>{{ project.group }}</span></td>
            <td class="project-name"><a href="/debug/{{* project.name }}">{{* project.name }}</a></td>
            <td class="project-status">
              <span class="status-{{ project.paused ? 'PAUSED' : project.status }}" :data-value="project.paused ? 'PAUSED' : project.status">
                {{ project.paused ? 'PAUSED' : project.status }}
              </span>
            </td>
            <td class="project-rate" :data-value="project.rate"><span>{{ project.rate }}/{{ project.burst }}</span></td>
            <td class="project-time" :data-value="project.time.fetch_time + project.time.process_time">
              <span v-show="project.time.fetch_time">{{ (project.time.fetch_time * 1000).toFixed(1) }}+{{ (project.time.process_time * 1000).toFixed(2) }}</span>
            </td>
            <td v-for="type in '5m,1h,1d,all'.split(',')"
                class="project-progress progress-{{* type }}"
                :title="project.progress[type].title"
                :data-value="project.progress[type].task">
              <div class="progress">
                <div class="progress-text">{{* type }}<span v-show="project.progress[type].task">: {{ project.progress[type].task }}</span></div>
                <div class="progress-bar progress-pending"
                     :style="{ width: project.progress[type].pending/project.progress[type].task*100 + '%' }"></div>
                <div class="progress-bar progress-bar-success progress-success"
                     :style="{ width: project.progress[type].success/project.progress[type].task*100 + '%' }"></div>
                <div class="progress-bar progress-bar-warning progress-retry"
                     :style="{ width: project.progress[type].retry/project.progress[type].task*100 + '%' }"></div>
                <div class="progress-bar progress-bar-danger progress-failed"
                     :style="{ width: project.progress[type].failed/project.progress[type].task*100 + '%' }"
                ></div>
              </div>
            </td>
            {% endraw %}

            {% raw %}
            <td class="project-actions" data-value="{{ project.updatetime }}">
              {% endraw %}
              # if config.scheduler_rpc is not none:
              {% raw %}
              <button class="project-run btn btn-default btn-xs" @click="project_run(project, $event)">Run</button>
              <a class="btn btn-default btn-xs" href="/tasks?project={{ project.name }}" target=_blank>Active Tasks</a>
              {% endraw %}
              # endif
              # if config.resultdb:
              {% raw %}
              <a class="btn btn-default btn-xs" href="/results?project={{ project.name }}" target=_blank>Results</a>
              {% endraw %}
              # endif
            </td>
          </tr>
        </tbody>
      </table>
    </section>
    <script>
      // json projects data for vue
      var projects = {{ projects | tojson | safe }};
    </script>
    <script src="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='x-editable/1.5.0/bootstrap3-editable/js/bootstrap-editable.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='sortable/0.6.0/js/sortable.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='vue/1.0.26/vue.min.js') }}"></script>
    <script src="{{ url_for('static', filename='index.min.js') }}"></script>
  </body>
</html>
<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8 syntax=htmldjango: -->


================================================
FILE: pyspider/webui/templates/result.html
================================================
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Results - {{ project }} - pyspider</title>
    <!--[if lt IE 9]>
      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->

    <meta name="description" content="results of {{ project }}">
    <meta name="author" content="binux">
    <link href="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('static', filename='result.min.css') }}" rel="stylesheet">

    <script src="{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}"></script>
  </head>

  <body>
    <div class="top-bar">
      <h1>{{ project }} - Results</h1>
      <div class="btn-group">
        <a href="/results/dump/{{ project }}.json"
          target="_blank" class="btn btn-default btn-sm">
          <span class="glyphicon glyphicon-download-alt"></span>
          JSON</a>
        <a href="/results/dump/{{ project }}.txt"
          target="_blank" class="btn btn-default btn-sm">URL-JSON</a>
        <a href="/results/dump/{{ project }}.csv"
          target="_blank" class="btn btn-default btn-sm">CSV</a>
      </div>
    </div>
    # set common_fields, results = result_formater(results)
    <table class="table table-condensed table-striped">
      <thead>
        <th>url</th>
        <th></th>
        # for field in common_fields|sort
        <th>
          {{ field }}
        </th>
        # endfor
        <th>
          ...
        </th>
      </thead>
      <tbody>
        # for result in results
        <tr>
          <td>
            <a class=url href="/task/{{ project }}:{{ result.taskid }}" target=_blank>{{ result.url }}</a>
          </td>
          <td>
            <a class=open-url href="{{ result.url }}" target="_blank"><span class="glyphicon glyphicon-new-window"></span></a>
          </td>
          # for field in common_fields|sort
          <td>{{ json.dumps(result.result_formated[field], ensure_ascii=False) | truncate(100, True) }}</td>
          # endfor
          <td>
            {{ json.dumps(result.others, ensure_ascii=False) | truncate(100, True) }}
          </td>
        # endfor
      </tbody>
    </table>

    <div class="pagination-wrap">
      <ul class="pagination">
        # set current_page = int(offset/limit) + (1 if offset%limit else 0)
        # set count = count if count is not none else 0
        # set total_page = int(count/limit) + (1 if count%limit else 0)
        <li class="{{ "disabled" if current_page - 1 <= 0 else "" }}">
          <a href="{% if current_page>1 %}/results?project={{ project }}&offset={{ (current_page-1)*limit }}&limit={{ limit }}{% endif %}">&laquo;</a>
        </li>
        # set prev = 0
        # for i in range(0, total_page):
        # if abs(i-0) < 2 or abs(i-total_page) < 3 or -2 < i-current_page < 5:
          # set prev = i
          <li class="{% if i == current_page %}active{% endif %}">
            <a href="/results?project={{ project }}&offset={{ i*limit }}&limit={{ limit }}">{{ i + 1 }}</a>
          </li>
        # elif prev == i-1:
        <li class="disabled"><a>…</a></li>
        # endif
        # endfor
        <li class="{{ "disabled" if current_page + 1 >= total_page else "" }}">
          <a href="{% if current_page+1<total_page %}/results?project={{ project }}&offset={{ (current_page+1)*limit }}&limit={{ limit }}{% endif %}">&raquo;</a>
        </li>
      </ul>
    </div>
  </body>
</html>


================================================
FILE: pyspider/webui/templates/task.html
================================================
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Task - {{ task.project }}:{{ task.taskid }} - pyspider</title>
    <!--[if lt IE 9]>
      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->

    <meta name="description" content="pyspider taskboard of {{ task.project }}:{{task.taskid }}">
    <meta name="author" content="binux">
    <link href="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('static', filename='task.min.css') }}" rel="stylesheet">

    <script src="{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}"></script>
  </head>

  <body>
      <div class=base-info>
        <p>
          <span class="status status-{{ task.status }}">{{ status_to_string(task.status) }}</span>
          <a class=callback href="/debug/{{ task.project }}?taskid={{ task.taskid }}">{{ task.project }}.{{ task.process.callback }}</a>
          &gt;
          <a class=url href="{{ task.url }}" target=_blank>{{ task.url }}</a>
          {% if task.status in (2, 3, 4) %}
          (<span class=last-crawl>{{ task.lastcrawltime | format_date }}</span> crawled )
          {% else %}
          (<span class=update-time>{{ task.updatetime | format_date }}</span> updated )
          {% endif %}
        </p>
      </div>
      <div class=more-info>
        <dl>
          <dt>taskid</dt>
          <dd>{{ task.taskid }}</dd>
          <dt>lastcrawltime</dt>
          <dd>{{ task.lastcrawltime }} ({{ task.lastcrawltime | format_date }})</dd>
          <dt>updatetime</dt>
          <dd>{{ task.updatetime }} ({{ task.updatetime | format_date }})</dd>
          # if task.schedule and task.schedule.exetime
          <dt>exetime</dt>
          <dd>{{ task.schedule.exetime }} ({{ task.schedule.exetime | format_date }})</dd>
          # endif

          # if task.track and task.track.fetch
          <dt>
            track.fetch
            <span class="glyphicon glyphicon-{{ "ok" if task.track.fetch.ok else "remove" }}"></span>
            {{ (task.track.fetch.time * 1000) | round(2) }}ms
          </dt>
          <dd>{{ json.dumps(task.track.fetch, indent=2, ensure_ascii=False) }}</dd>
          # endif

          # if task.track and task.track.process
          <dt>
            track.process
            <span class="glyphicon glyphicon-{{ "ok" if task.track.process.ok else "remove" }}"></span>
            {{ (task.track.process.time * 1000) | round(2) }}ms
            # if task.track.process.follows
              +{{ task.track.process.follows | int }}
            # endif
          </dt>
          <dd>
            #- if task.track.process.exception
            {{- task.track.process.exception or '' }}
            # endif
            #- if task.track.process.logs
              {{- task.track.process.logs or '' }}
            # endif
            {{- json.dumps(task.track.process, indent=2, ensure_ascii=False) -}}
          </dd>
          # endif
        </dl>
        <dl>
          #- set not_shown_keys = ('status', 'url', 'project', 'taskid', 'lastcrawltime', 'updatetime', 'track', )
          #- for key, value in task.items() if key not in not_shown_keys
          <dt>{{ key }}</dt>
          <dd>{{ json.dumps(value, indent=2, ensure_ascii=False) if value is mapping else value }}</dd>
          #- endfor
        </dl>
        # if result and result.get('result'):
        <dl>
          <dt>result</dt>
          <dd>{{ json.dumps(result['result'], indent=2, ensure_ascii=False) }}</dd>
        </dl>
        # endif
      </div>
  </body>
</html>
<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: -->


================================================
FILE: pyspider/webui/templates/tasks.html
================================================
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Tasks - pyspider</title>
    <!--[if lt IE 9]>
      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->

    <meta name="description" content="last actived tasks">
    <meta name="author" content="binux">
    <link href="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}" rel="stylesheet">
    <link href="{{ url_for('static', filename='tasks.min.css') }}" rel="stylesheet">

    <script src="{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}"></script>
    <script src="{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}"></script>
  </head>

  <body>
    <ol class=tasks>
      {% for task in tasks | sort(reverse=True, attribute='updatetime') %}
      <li class=task>
        {% if task.status %}
          <span class="status status-{{ task.status }}">{{ status_to_string(task.status) }}</span>
        {% elif task.track %}
        <span class="status status-3">
          {% set fetchok = task.track.fetch and task.track.fetch.ok %}
          {% set processok = task.track.process and task.track.process.ok %}
          {%- if not fetchok -%}
          FETCH_ERROR
          {%- elif not processok -%}
          PROCESS_ERROR
          {%- endif -%}
        </span>
        {% else %}
          <span class="status status-4 }}">ERROR</span>
        {% endif %}

        <a class=callback href="/debug/{{ task.project }}?taskid={{ task.taskid }}" target=_blank>{{ task.project }}</a>
        &gt;
        <a class=url href="/task/{{ task.project }}:{{ task.taskid }}" title="{{ task.url }}" target=_blank>{{ task.url }}</a>

        <span class=update-time>{{ task.updatetime | format_date }}</span>

        {% if task.track and task.track.fetch %}
        <span span=use-time>
          {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms
        </span>
        {% endif %}

        <span span=follows>
        {% if task.track and task.track.process %}
        +{{ task.track.process.follows | int }}
        {% endif %}
        </span>
      </li>
      {% endfor %}
    </ol>
  </body>
</html>
<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: -->


================================================
FILE: pyspider/webui/webdav.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-6-3 11:29


import os
import time
import base64
import six
from six import BytesIO
from wsgidav.wsgidav_app import DEFAULT_CONFIG, WsgiDAVApp
from wsgidav.dav_provider import DAVProvider, DAVCollection, DAVNonCollection
from wsgidav.dav_error import DAVError, HTTP_FORBIDDEN
from pyspider.libs.utils import utf8, text
from .app import app


def check_user(environ):
    authheader = environ.get("HTTP_AUTHORIZATION")
    if not authheader:
        return False
    authheader = authheader[len("Basic "):]
    try:
        username, password = text(base64.b64decode(authheader)).split(':', 1)
    except Exception as e:
        app.logger.error('wrong api key: %r, %r', authheader, e)
        return False

    if username == app.config['webui_username'] \
            and password == app.config['webui_password']:
        return True
    else:
        return False


class ContentIO(BytesIO):
    def close(self):
        self.content = self.getvalue()
        BytesIO.close(self) #old class


class ScriptResource(DAVNonCollection):
    def __init__(self, path, environ, app, project=None):
        super(ScriptResource, self).__init__(path, environ)

        self.app = app
        self.new_project = False
        self._project = project
        self.project_name = text(self.name)
        self.writebuffer = None
        if self.project_name.endswith('.py'):
            self.project_name = self.project_name[:-len('.py')]

    @property
    def project(self):
        if self._project:
            return self._project
        projectdb = self.app.config['projectdb']
        if projectdb:
            self._project = projectdb.get(self.project_name)
        if not self._project:
            if projectdb.verify_project_name(self.project_name) and text(self.name).endswith('.py'):
                self.new_project = True
                self._project = {
                    'name': self.project_name,
                    'script': '',
                    'status': 'TODO',
                    'rate': self.app.config.get('max_rate', 1),
                    'burst': self.app.config.get('max_burst', 3),
                    'updatetime': time.time(),
                }
            else:
                raise DAVError(HTTP_FORBIDDEN)
        return self._project

    @property
    def readonly(self):
        projectdb = self.app.config['projectdb']
        if not projectdb:
            return True
        if 'lock' in projectdb.split_group(self.project.get('group')) \
                and self.app.config.get('webui_username') \
                and self.app.config.get('webui_password'):
            return not check_user(self.environ)
        return False

    def getContentLength(self):
        return len(utf8(self.project['script']))

    def getContentType(self):
        return 'text/plain'

    def getLastModified(self):
        return self.project['updatetime']

    def getContent(self):
        return BytesIO(utf8(self.project['script']))

    def beginWrite(self, contentType=None):
        if self.readonly:
            self.app.logger.error('webdav.beginWrite readonly')
            return super(ScriptResource, self).beginWrite(contentType)
        self.writebuffer = ContentIO()
        return self.writebuffer

    def endWrite(self, withErrors):
        if withErrors:
            self.app.logger.error('webdav.endWrite error: %r', withErrors)
            return super(ScriptResource, self).endWrite(withErrors)
        if not self.writebuffer:
            return
        projectdb = self.app.config['projectdb']
        if not projectdb:
            return

        info = {
            'script': text(getattr(self.writebuffer, 'content', ''))
        }
        if self.project.get('status') in ('DEBUG', 'RUNNING'):
            info['status'] = 'CHECKING'

        if self.new_project:
            self.project.update(info)
            self.new_project = False
            return projectdb.insert(self.project_name, self.project)
        else:
            return projectdb.update(self.project_name, info)


class RootCollection(DAVCollection):
    def __init__(self, path, environ, app):
        super(RootCollection, self).__init__(path, environ)
        self.app = app
        self.projectdb = self.app.config['projectdb']

    def getMemberList(self):
        members = []
        for project in self.projectdb.get_all():
            project_name = project['name']
            if not project_name.endswith('.py'):
                project_name += '.py'
            native_path = os.path.join(self.path, project_name)
            native_path = text(native_path) if six.PY3 else utf8(native_path)
            members.append(ScriptResource(
                native_path,
                self.environ,
                self.app,
                project
            ))
        return members

    def getMemberNames(self):
        members = []
        for project in self.projectdb.get_all(fields=['name', ]):
            project_name = project['name']
            if not project_name.endswith('.py'):
                project_name += '.py'
            members.append(utf8(project_name))
        return members


class ScriptProvider(DAVProvider):
    def __init__(self, app):
        super(ScriptProvider, self).__init__()
        self.app = app

    def __repr__(self):
        return "pyspiderScriptProvider"

    def getResourceInst(self, path, environ):
        path = os.path.normpath(path).replace('\\', '/')
        if path in ('/', '.', ''):
            path = '/'
            return RootCollection(path, environ, self.app)
        else:
            return ScriptResource(path, environ, self.app)


class NeedAuthController(object):
    def __init__(self, app):
        self.app = app

    def getDomainRealm(self, inputRelativeURL, environ):
        return 'need auth'

    def requireAuthentication(self, realmname, environ):
        return self.app.config.get('need_auth', False)

    def isRealmUser(self, realmname, username, environ):
        return username == self.app.config.get('webui_username')

    def getRealmUserPassword(self, realmname, username, environ):
        return self.app.config.get('webui_password')

    def authDomainUser(self, realmname, username, password, environ):
        return username == self.app.config.get('webui_username') \
            and password == self.app.config.get('webui_password')


config = DEFAULT_CONFIG.copy()
config.update({
    'mount_path': '/dav',
    'provider_mapping': {
        '/': ScriptProvider(app)
    },
    'domaincontroller': NeedAuthController(app),
    'verbose': 1 if app.debug else 0,
    'dir_browser': {'davmount': False,
                    'enable': True,
                    'msmount': False,
                    'response_trailer': ''},
})
dav_app = WsgiDAVApp(config)


================================================
FILE: requirements.txt
================================================
Flask==0.10
Jinja2==2.7
chardet==3.0.4
cssselect==0.9
lxml==4.3.3
pycurl==7.43.0.3
pyquery==1.4.0
requests==2.24.0
tornado==4.5.3
mysql-connector-python==8.0.16
pika==1.1.0
pymongo==3.9.0
Flask-Login==0.2.11
u-msgpack-python==1.6
click==6.6
SQLAlchemy==1.3.10
six==1.10.0
amqp==2.4.0
redis==2.10.6
redis-py-cluster==1.3.6
kombu==4.4.0
psycopg2==2.8.2
elasticsearch==2.3.0
tblib==1.4.0


================================================
FILE: run.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-11-24 23:11:49

from pyspider.run import main

if __name__ == '__main__':
    main()


================================================
FILE: setup.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-11-24 22:27:45


import sys
from setuptools import setup, find_packages
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

import pyspider

install_requires = [
    'Flask==0.10',
    'Jinja2==2.7',
    'chardet==3.0.4',
    'cssselect==0.9',
    "lxml==4.3.3",
    'pycurl==7.43.0.3',
    'requests==2.24.0',
    'Flask-Login==0.2.11',
    'u-msgpack-python==1.6',
    'click==3.3',
    'six==1.10.0',
    'tblib==1.4.0',
    'wsgidav==2.3.0',
    'tornado>=3.2,<=4.5.3',
    'pyquery',
]

extras_require_all = [
    'mysql-connector-python==8.0.16',
    'pymongo==3.9.0',
    'redis==2.10.6',
    'redis-py-cluster==1.3.6',
    'psycopg2==2.8.2',
    'elasticsearch==2.3.0',
    'kombu==4.4.0',
    'amqp==2.4.0',
    'SQLAlchemy==1.3.10',
    'pika==1.1.0'
]

setup(
    name='pyspider',
    version=pyspider.__version__,

    description='A Powerful Spider System in Python',
    long_description=long_description,

    url='https://github.com/binux/pyspider',

    author='Roy Binux',
    author_email='roy@binux.me',

    license='Apache License, Version 2.0',

    classifiers=[
        'Development Status :: 4 - Beta',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',

        'License :: OSI Approved :: Apache Software License',

        'Intended Audience :: Developers',
        'Operating System :: OS Independent',
        'Environment :: Web Environment',

        'Topic :: Internet :: WWW/HTTP',
        'Topic :: Software Development :: Libraries :: Application Frameworks',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],

    keywords='scrapy crawler spider webui',

    packages=find_packages(exclude=['data', 'tests*']),

    install_requires=install_requires,

    extras_require={
        'all': extras_require_all,
        'test': [
            'coverage',
            'Werkzeug==0.16.1',
            'httpbin==0.7.0',
            'pyproxy==0.1.6',
            'easywebdav==1.2.0',
        ]
    },

    package_data={
        'pyspider': [
            'logging.conf',
            'fetcher/phantomjs_fetcher.js',
            'fetcher/splash_fetcher.lua',
            'webui/static/*.js',
            'webui/static/*.css',
            'webui/templates/*'
        ],
    },

    entry_points={
        'console_scripts': [
            'pyspider=pyspider.run:main'
        ]
    },

    test_suite='tests.all_suite',
)


================================================
FILE: tests/__init__.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-09 10:53:19

import os
import unittest

all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_*.py")


================================================
FILE: tests/data_fetcher_processor_handler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-18 14:12:55

from pyspider.libs.base_handler import *

class Handler(BaseHandler):

    @not_send_status
    def not_send_status(self, response):
        self.crawl('http://www.baidu.com/')
        return response.text

    def url_deduplicated(self, response):
        self.crawl('http://www.baidu.com/')
        self.crawl('http://www.google.com/')
        self.crawl('http://www.baidu.com/')
        self.crawl('http://www.google.com/')
        self.crawl('http://www.google.com/')

    @catch_status_code_error
    def catch_http_error(self, response):
        self.crawl('http://www.baidu.com/')
        return response.status_code

    def json(self, response):
        return response.json

    def html(self, response):
        return response.doc('h1').text()

    def links(self, response):
        self.crawl([x.attr.href for x in response.doc('a').items()], callback=self.links)

    def cookies(self, response):
        return response.cookies

    def get_save(self, response):
        return response.save

    def get_process_save(self, response):
        return self.save

    def set_process_save(self, response):
        self.save['roy'] = 'binux'

class IgnoreHandler(BaseHandler):
    pass

__handler_cls__ = Handler


================================================
FILE: tests/data_handler.py
================================================

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-22 14:02:21

import time
from pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every

class IgnoreHandler(object):
    pass

class TestHandler(BaseHandler):
    retry_delay = {
        1: 10,
        '': -1
    }

    def hello(self):
        return "hello world!"

    def echo(self, response):
        return response.content

    def saved(self, response):
        return response.save

    def echo_task(self, response, task):
        return task['project']

    @catch_status_code_error
    def catch_status_code(self, response):
        return response.status_code

    def raise_exception(self):
        print('print')
        logger.info("info")
        logger.warning("warning")
        logger.error("error")
        raise Exception('exception')

    def add_task(self, response):
        self.crawl('http://www.google.com', callback='echo', params={'wd': u'中文'})
        self.send_message('some_project', {'some': 'message'})

    @every
    def on_cronjob1(self, response):
        logger.info('on_cronjob1')

    @every(seconds=10)
    def on_cronjob2(self, response):
        logger.info('on_cronjob2')

    def generator(self, response):
        yield "a"
        yield "b"

    def sleep(self, response):
        time.sleep(response.save)


================================================
FILE: tests/data_sample_handler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on __DATE__
# Project: __PROJECT_NAME__

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://127.0.0.1:14887/pyspider/test.html', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }


================================================
FILE: tests/data_test_webpage.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-24 13:44:10

from httpbin import app

@app.route('/pyspider/test.html')
def test_page():
    return '''
<a href="/404">404
<a href="/links/10/0">0
<a href="/links/10/1">1
<a href="/links/10/2">2
<a href="/links/10/3">3
<a href="/links/10/4">4
<a href="/gzip">gzip
<a href="/get">get
<a href="/deflate">deflate
<a href="/html">html
<a href="/xml">xml
<a href="/robots.txt">robots
<a href="/cache">cache
<a href="/stream/20">stream
'''

@app.route('/pyspider/ajax.html')
def test_ajax():
    return '''
<div class=status>loading...</div>
<div class=ua></div>
<div class=ip></div>
<script>
var xhr = new XMLHttpRequest();
xhr.onload = function() {
  var data = JSON.parse(xhr.responseText);
  document.querySelector('.status').innerHTML = 'done';
  document.querySelector('.ua').innerHTML = data.headers['User-Agent'];
  document.querySelector('.ip').innerHTML = data.origin;
}
xhr.open("get", "/get", true);
xhr.send();
</script>
'''

@app.route('/pyspider/ajax_click.html')
def test_ajax_click():
    return '''
<div class=status>loading...</div>
<div class=ua></div>
<div class=ip></div>
<a href="javascript:void(0)" onclick="load()">load</a>
<script>
function load() {
    var xhr = new XMLHttpRequest();
    xhr.onload = function() {
      var data = JSON.parse(xhr.responseText);
      document.querySelector('.status').innerHTML = 'done';
      document.querySelector('.ua').innerHTML = data.headers['User-Agent'];
      document.querySelector('.ip').innerHTML = data.origin;
    }
    xhr.open("get", "/get", true);
    xhr.send();
}
</script>
'''


================================================
FILE: tests/test_base_handler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2017-02-26 10:35:23

import unittest

from pyspider.libs.base_handler import BaseHandler


class TestBaseHandler(unittest.TestCase):
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    def test_task_join_crawl_config(self):
        task = dict(self.sample_task_http)
        crawl_config = {
            'taskid': 'xxxx',       # should not affect finial task
            'proxy': 'username:password@hostname:port',  # should add proxy
            'headers': {            # should merge headers
                'Cookie': 'abc',    # should not affect cookie
                'c': 'd',           # should add header c
            }
        }
        
        ret = BaseHandler.task_join_crawl_config(task, crawl_config)
        self.assertDictEqual(ret, {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'proxy': 'username:password@hostname:port',
                'headers': {
                    'Cookie': 'a=b',
                    'a': 'b',
                    'c': 'd'
                },
                'cookies': {
                    'c': 'd',
                },
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        });


================================================
FILE: tests/test_bench.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-12-10 01:34:09

import os
import sys
import time
import click
import shutil
import inspect
import unittest

from pyspider import run
from pyspider.libs import utils

class TestBench(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/bench', ignore_errors=True)
        os.makedirs('./data/bench')

    @classmethod
    def tearDownClass(self):
        shutil.rmtree('./data/bench', ignore_errors=True)

    def test_10_bench(self):
        import subprocess
        #cmd = [sys.executable]
        cmd = ['coverage', 'run']
        p = subprocess.Popen(cmd+[
            inspect.getsourcefile(run),
            '--queue-maxsize=0',
            'bench',
            '--total=500'
        ], close_fds=True, stderr=subprocess.PIPE)

        stdout, stderr = p.communicate()
        stderr = utils.text(stderr)
        print(stderr)

        self.assertEqual(p.returncode, 0, stderr)
        self.assertIn('Crawled', stderr)
        self.assertIn('Fetched', stderr)
        self.assertIn('Processed', stderr)
        self.assertIn('Saved', stderr)


================================================
FILE: tests/test_counter.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-04-05 00:05:58

import sys
import time
import unittest

from pyspider.libs import counter

class TestCounter(unittest.TestCase):
    def test_010_TimebaseAverageEventCounter(self):
        c = counter.TimebaseAverageEventCounter(2, 1)
        for i in range(100):
            time.sleep(0.1)
            c.event(100+i)

        self.assertEqual(c.sum, float(180+199)*20/2)
        self.assertEqual(c.avg, float(180+199)/2)

    def test_020_TotalCounter(self):
        c = counter.TotalCounter()
        for i in range(3):
            c.event(i)
        self.assertEqual(c.avg, 3)
        self.assertEqual(c.sum, 3)

    def test_030_AverageWindowCounter(self):
        c = counter.AverageWindowCounter(10)
        self.assertTrue(c.empty())

        for i in range(20):
            c.event(i)

        self.assertFalse(c.empty())
        self.assertEqual(c.avg, 14.5)
        self.assertEqual(c.sum, 145)

    def test_020_delete(self):
        c = counter.CounterManager()
        c.event(('a', 'b'), 1)
        c.event(('a', 'c'), 1)
        c.event(('b', 'c'), 1)
        
        self.assertIsNotNone(c['a'])
        self.assertIsNotNone(c['b'])

        del c['a']

        self.assertNotIn('a', c)
        self.assertIsNotNone(c['b'])


================================================
FILE: tests/test_database.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-08 22:37:13

from __future__ import unicode_literals, division

import os
import six
import time
import unittest

from pyspider import database
from pyspider.database.base.taskdb import TaskDB


class TaskDBCase(object):
    sample_task = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'www.baidu.com/',
        'status': TaskDB.FAILED,
        'schedule': {
            'priority': 1,
            'retries': 3,
            'exetime': 0,
            'age': 3600,
            'itag': 'itag',
            'recrawl': 5,
        },
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
            },
            'data': 'a=b&c=d',
            'timeout': 60,
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
        'track': {
            'fetch': {
                'ok': True,
                'time': 300,
                'status_code': 200,
                'headers': {
                    'Content-Type': 'plain/html',
                },
                'encoding': 'utf8',
                # 'content': 'asdfasdfasdfasdf',
            },
            'process': {
                'ok': False,
                'time': 10,
                'follows': 3,
                'outputs': 5,
                'exception': u"中文",
            },
        },
        'lastcrawltime': time.time(),
        'updatetime': time.time(),
    }

    @classmethod
    def setUpClass(self):
        raise NotImplementedError

    # this test not works for mongodb
    # def test_10_create_project(self):
        # with self.assertRaises(AssertionError):
        # self.taskdb._create_project('abc.abc')
        # self.taskdb._create_project('abc')
        # self.taskdb._list_project()
        # self.assertEqual(len(self.taskdb.projects), 1)
        # self.assertIn('abc', self.taskdb.projects)

    def test_20_insert(self):
        self.taskdb.insert('project', 'taskid', self.sample_task)
        self.taskdb.insert('project', 'taskid2', self.sample_task)

    def test_25_get_task(self):
        task = self.taskdb.get_task('project', 'taskid2')
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid2')
        self.assertEqual(task['project'], self.sample_task['project'])
        self.assertEqual(task['url'], self.sample_task['url'])
        self.assertEqual(task['status'], self.taskdb.FAILED)
        self.assertEqual(task['schedule'], self.sample_task['schedule'])
        self.assertEqual(task['fetch'], self.sample_task['fetch'])
        self.assertEqual(task['process'], self.sample_task['process'])
        self.assertEqual(task['track'], self.sample_task['track'])

        task = self.taskdb.get_task('project', 'taskid1', fields=['status'])
        self.assertIsNone(task)

        task = self.taskdb.get_task('project', 'taskid', fields=['taskid', 'track', ])
        self.assertIn('track', task)
        self.assertNotIn('project', task)

    def test_30_status_count(self):
        status = self.taskdb.status_count('abc')
        self.assertEqual(status, {})
        status = self.taskdb.status_count('project')
        self.assertEqual(status, {self.taskdb.FAILED: 2})

    def test_40_update_and_status_count(self):
        self.taskdb.update('project', 'taskid', status=self.taskdb.ACTIVE)
        status = self.taskdb.status_count('project')
        self.assertEqual(status, {self.taskdb.ACTIVE: 1, self.taskdb.FAILED: 1})

        self.taskdb.update('project', 'taskid', track={})
        task = self.taskdb.get_task('project', 'taskid', fields=['taskid', 'track', ])
        self.assertIn('track', task)
        self.assertEqual(task['track'], {})

    def test_50_load_tasks(self):
        tasks = list(self.taskdb.load_tasks(self.taskdb.ACTIVE))
        self.assertEqual(len(tasks), 1)
        task = tasks[0]
        self.assertIn('taskid', task, task)
        self.assertEqual(task['taskid'], 'taskid', task)
        self.assertEqual(task['schedule'], self.sample_task['schedule'])
        self.assertEqual(task['fetch'], self.sample_task['fetch'])
        self.assertEqual(task['process'], self.sample_task['process'])
        self.assertEqual(task['track'], {})

        tasks = list(self.taskdb.load_tasks(self.taskdb.ACTIVE, project='project',
                                            fields=['taskid']))
        self.assertEqual(len(tasks), 1)
        self.assertEqual(tasks[0]['taskid'], 'taskid')
        self.assertNotIn('project', tasks[0])

    def test_60_relist_projects(self):
        if hasattr(self.taskdb, '_list_project'):
            self.taskdb._list_project()
            self.assertNotIn('system.indexes', self.taskdb.projects)

    def test_z10_drop(self):
        self.taskdb.insert('drop_project2', 'taskid', self.sample_task)
        self.taskdb.insert('drop_project3', 'taskid', self.sample_task)
        self.taskdb.drop('drop_project3')
        self.assertIsNotNone(self.taskdb.get_task('drop_project2', 'taskid'), None)
        self.assertIsNone(self.taskdb.get_task('drop_project3', 'taskid'), None)

    def test_z20_update_projects(self):
        saved = getattr(self.taskdb, 'UPDATE_PROJECTS_TIME', None)
        self.taskdb.UPDATE_PROJECTS_TIME = 0.1
        time.sleep(0.2)
        self.assertIn('drop_project2', self.taskdb.projects)
        self.assertNotIn('drop_project3', self.taskdb.projects)
        self.taskdb.UPDATE_PROJECTS_TIME = saved


class ProjectDBCase(object):
    sample_project = {
        'name': 'name',
        'script': 'import time\nprint(time.time(), "!@#$%^&*()\';:<>?/|")',
        'status': 'TODO',
        'rate': 1.0,
        'burst': 10.0,
    }

    @classmethod
    def setUpClass(self):
        raise NotImplemented

    def test_10_insert(self):
        self.projectdb.insert('abc', self.sample_project)
        self.projectdb.insert(u'name中文', self.sample_project)
        project = self.projectdb.get('abc')
        self.assertIsNotNone(project)

    def test_20_get_all(self):
        projects = list(self.projectdb.get_all())
        self.assertEqual(len(projects), 2)
        for project in projects:
            if project['name'] == 'abc':
                break
        for key in ('name', 'group', 'status', 'script', 'comments', 'rate', 'burst', 'updatetime'):
            self.assertIn(key, project)

        self.assertEqual(project['name'], u'abc')
        self.assertEqual(project['status'], self.sample_project['status'])
        self.assertEqual(project['script'], self.sample_project['script'])
        self.assertEqual(project['rate'], self.sample_project['rate'])
        self.assertEqual(type(project['rate']), float)
        self.assertEqual(project['burst'], self.sample_project['burst'])
        self.assertEqual(type(project['burst']), float)


        projects = list(self.projectdb.get_all(fields=['name', 'script']))
        self.assertEqual(len(projects), 2)
        project = projects[1]
        self.assertIn('name', project)
        self.assertNotIn('gourp', project)

    def test_30_update(self):
        self.projectdb.update('not_found', status='RUNNING')
        project = self.projectdb.get('not_found')
        self.assertIsNone(project)

    def test_40_check_update(self):
        time.sleep(0.1)
        now = time.time()
        time.sleep(0.1)
        self.projectdb.update('abc', status='RUNNING')

        projects = list(self.projectdb.check_update(
            now,
            fields=['name', 'status', 'group', 'updatetime', ]
        ))
        self.assertEqual(len(projects), 1, repr(projects))
        project = projects[0]
        self.assertEqual(project['name'], 'abc')
        self.assertEqual(project['status'], 'RUNNING')

    def test_45_check_update_when_bootup(self):
        projects = list(self.projectdb.check_update(0))
        project = projects[0]
        for key in ('name', 'group', 'status', 'script', 'comments', 'rate', 'burst', 'updatetime'):
            self.assertIn(key, project)

    def test_50_get(self):
        project = self.projectdb.get('not_found')
        self.assertIsNone(project)

        project = self.projectdb.get('abc')
        self.assertEqual(project['name'], 'abc')
        self.assertEqual(project['status'], 'RUNNING')

        project = self.projectdb.get(u'name中文', ['group', 'status', 'name'])
        self.assertEqual(project['name'], u'name中文')
        self.assertIn('status', project)
        self.assertNotIn('gourp', project)

    def test_z10_drop(self):
        self.projectdb.insert(u'drop_project2', self.sample_project)
        self.projectdb.insert(u'drop_project3', self.sample_project)
        self.projectdb.drop('drop_project3')
        self.assertIsNotNone(self.projectdb.get('drop_project2'))
        self.assertIsNone(self.projectdb.get('drop_project3'))


class ResultDBCase(object):

    @classmethod
    def setUpClass(self):
        raise NotImplemented

    def test_10_save(self):
        self.resultdb.save('test_project', 'test_taskid', 'test_url', 'result')
        result = self.resultdb.get('test_project', 'test_taskid')
        self.assertIsNotNone(result)
        self.assertEqual(result['result'], 'result')

        self.resultdb.save('test_project', 'test_taskid', 'test_url_updated', 'result_updated')
        result = self.resultdb.get('test_project', 'test_taskid')
        self.assertEqual(result['result'], 'result_updated')
        self.assertEqual(result['url'], 'test_url_updated')

    def test_20_get(self):
        result = self.resultdb.get('test_project', 'not_exists')
        self.assertIsNone(result)

        result = self.resultdb.get('not_exists', 'test_taskid')
        self.assertIsNone(result)

        result = self.resultdb.get('test_project', 'test_taskid', fields=('url', ))
        self.assertIsNotNone(result)
        self.assertIn('url', result)
        self.assertNotIn('result', result)

        result = self.resultdb.get('test_project', 'test_taskid')
        self.assertEqual(result['taskid'], 'test_taskid')
        self.assertEqual(result['url'], 'test_url_updated')
        self.assertEqual(result['result'], 'result_updated')
        self.assertIn('updatetime', result)

    def test_30_select(self):
        for i in range(5):
            self.resultdb.save('test_project', 'test_taskid-%d' % i,
                               'test_url', 'result-%d' % i)
        ret = list(self.resultdb.select('test_project'))
        self.assertEqual(len(ret), 6)

        ret = list(self.resultdb.select('test_project', limit=4))
        self.assertEqual(len(ret), 4)

        for ret in self.resultdb.select('test_project', fields=('url', ), limit=1):
            self.assertIn('url', ret)
            self.assertNotIn('result', ret)

    def test_35_select_limit(self):
        ret = list(self.resultdb.select('test_project', limit=None, offset=None))
        self.assertEqual(len(ret), 6)

        ret = list(self.resultdb.select('test_project', limit=None, offset=2))
        self.assertEqual(len(ret), 4, ret)

    def test_40_count(self):
        self.assertEqual(self.resultdb.count('test_project'), 6)

    def test_50_select_not_finished(self):
        for i in self.resultdb.select('test_project'):
            break
        self.assertEqual(self.resultdb.count('test_project'), 6)

    def test_60_relist_projects(self):
        if hasattr(self.resultdb, '_list_project'):
            self.resultdb._list_project()
            self.assertNotIn('system.indexes', self.resultdb.projects)

    def test_z10_drop(self):
        self.resultdb.save('drop_project2', 'test_taskid', 'test_url', 'result')
        self.resultdb.save('drop_project3', 'test_taskid', 'test_url', 'result')
        self.resultdb.drop('drop_project3')
        self.assertIsNotNone(self.resultdb.get('drop_project2', 'test_taskid'))
        self.assertIsNone(self.resultdb.get('drop_project3', 'test_taskid'))

    def test_z20_update_projects(self):
        saved = self.resultdb.UPDATE_PROJECTS_TIME
        self.resultdb.UPDATE_PROJECTS_TIME = 0.1
        time.sleep(0.2)
        self.assertIn('drop_project2', self.resultdb.projects)
        self.assertNotIn('drop_project3', self.resultdb.projects)
        self.resultdb.UPDATE_PROJECTS_TIME = saved


class TestSqliteTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database('sqlite+taskdb://')
        self.assertIsNotNone(self, self.taskdb)

    @classmethod
    def tearDownClass(self):
        del self.taskdb


class TestSqliteProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database('sqlite+projectdb://')
        self.assertIsNotNone(self, self.projectdb)

    @classmethod
    def tearDownClass(self):
        del self.projectdb


class TestSqliteResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database('sqlite+resultdb://')
        self.assertIsNotNone(self, self.resultdb)

    @classmethod
    def tearDownClass(self):
        del self.resultdb


@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestMysqlTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database('mysql+taskdb://localhost/pyspider_test_taskdb')
        self.assertIsNotNone(self, self.taskdb)

    @classmethod
    def tearDownClass(self):
        self.taskdb._execute('DROP DATABASE pyspider_test_taskdb')


@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestMysqlProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database(
            'mysql+projectdb://localhost/pyspider_test_projectdb'
        )
        self.assertIsNotNone(self, self.projectdb)

    @classmethod
    def tearDownClass(self):
        self.projectdb._execute('DROP DATABASE pyspider_test_projectdb')


@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestMysqlResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database(
            'mysql+resultdb://localhost/pyspider_test_resultdb'
        )
        self.assertIsNotNone(self, self.resultdb)

    @classmethod
    def tearDownClass(self):
        self.resultdb._execute('DROP DATABASE pyspider_test_resultdb')


@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
class TestMongoDBTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database(
            'mongodb+taskdb://localhost:27017/pyspider_test_taskdb'
        )
        self.assertIsNotNone(self, self.taskdb)

    @classmethod
    def tearDownClass(self):
        self.taskdb.conn.drop_database(self.taskdb.database.name)

    def test_create_project(self):
        self.assertNotIn('test_create_project', self.taskdb.projects)
        self.taskdb._create_project('test_create_project')
        self.assertIn('test_create_project', self.taskdb.projects)


@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
class TestMongoDBProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database(
            'mongodb+projectdb://localhost/pyspider_test_projectdb'
        )
        self.assertIsNotNone(self, self.projectdb)

    @classmethod
    def tearDownClass(self):
        self.projectdb.conn.drop_database(self.projectdb.database.name)


@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
class TestMongoDBResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database(
            'mongodb+resultdb://localhost/pyspider_test_resultdb'
        )
        self.assertIsNotNone(self, self.resultdb)

    @classmethod
    def tearDownClass(self):
        self.resultdb.conn.drop_database(self.resultdb.database.name)

    def test_create_project(self):
        self.assertNotIn('test_create_project', self.resultdb.projects)
        self.resultdb._create_project('test_create_project')
        self.assertIn('test_create_project', self.resultdb.projects)


@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestSQLAlchemyMySQLTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database(
            'sqlalchemy+mysql+mysqlconnector+taskdb://root@localhost/pyspider_test_taskdb'
        )
        self.assertIsNotNone(self, self.taskdb)

    @classmethod
    def tearDownClass(self):
        self.taskdb.engine.execute('DROP DATABASE pyspider_test_taskdb')


@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestSQLAlchemyMySQLProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database(
            'sqlalchemy+mysql+mysqlconnector+projectdb://root@localhost/pyspider_test_projectdb'
        )
        self.assertIsNotNone(self, self.projectdb)

    @classmethod
    def tearDownClass(self):
        self.projectdb.engine.execute('DROP DATABASE pyspider_test_projectdb')


@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
class TestSQLAlchemyMySQLResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database(
            'sqlalchemy+mysql+mysqlconnector+resultdb://root@localhost/pyspider_test_resultdb'
        )
        self.assertIsNotNone(self, self.resultdb)

    @classmethod
    def tearDownClass(self):
        self.resultdb.engine.execute('DROP DATABASE pyspider_test_resultdb')


class TestSQLAlchemyTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database(
            'sqlalchemy+sqlite+taskdb://'
        )
        self.assertIsNotNone(self, self.taskdb)

    @classmethod
    def tearDownClass(self):
        del self.taskdb


class TestSQLAlchemyProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database(
            'sqlalchemy+sqlite+projectdb://'
        )
        self.assertIsNotNone(self, self.projectdb)

    @classmethod
    def tearDownClass(self):
        del self.projectdb


class TestSQLAlchemyResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database(
            'sqlalchemy+sqlite+resultdb://'
        )
        self.assertIsNotNone(self, self.resultdb)

    @classmethod
    def tearDownClass(self):
        del self.resultdb


@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')
class TestPGTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database(
            'sqlalchemy+postgresql+taskdb://postgres@127.0.0.1:5432/pyspider_test_taskdb'
        )
        self.assertIsNotNone(self, self.taskdb)
        self.tearDownClass()

    @classmethod
    def tearDownClass(self):
        for project in self.taskdb.projects:
            self.taskdb.drop(project)


@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')
class TestPGProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database(
            'sqlalchemy+postgresql+projectdb://postgres@127.0.0.1:5432/pyspider_test_projectdb'
        )
        self.assertIsNotNone(self, self.projectdb)
        self.tearDownClass()

    @classmethod
    def tearDownClass(self):
        for project in self.projectdb.get_all(fields=['name']):
            self.projectdb.drop(project['name'])


@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')
class TestPGResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database(
                'sqlalchemy+postgresql+resultdb://postgres@127.0.0.1/pyspider_test_resultdb'
        )
        self.assertIsNotNone(self, self.resultdb)
        self.tearDownClass()

    @classmethod
    def tearDownClass(self):
        for project in self.resultdb.projects:
            self.resultdb.drop(project)


@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')
class TestRedisTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database('redis+taskdb://localhost:6379/15')
        self.assertIsNotNone(self, self.taskdb)
        self.taskdb.__prefix__ = 'testtaskdb_'

    @classmethod
    def tearDownClass(self):
        for project in self.taskdb.projects:
            self.taskdb.drop(project)


@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')
class TestESProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = database.connect_database(
            'elasticsearch+projectdb://127.0.0.1:9200/?index=test_pyspider_projectdb'
        )
        self.assertIsNotNone(self, self.projectdb)
        assert self.projectdb.index == 'test_pyspider_projectdb'

    @classmethod
    def tearDownClass(self):
        self.projectdb.es.indices.delete(index='test_pyspider_projectdb', ignore=[400, 404])


@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')
class TestESResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.resultdb = database.connect_database(
            'elasticsearch+resultdb://127.0.0.1:9200/?index=test_pyspider_resultdb'
        )
        self.assertIsNotNone(self, self.resultdb)
        assert self.resultdb.index == 'test_pyspider_resultdb'

    @classmethod
    def tearDownClass(self):
        self.resultdb.es.indices.delete(index='test_pyspider_resultdb', ignore=[400, 404])

    def test_15_save(self):
        self.resultdb.refresh()

    def test_30_select(self):
        for i in range(5):
            self.resultdb.save('test_project', 'test_taskid-%d' % i,
                               'test_url', 'result-%d' % i)
        self.resultdb.refresh()

        ret = list(self.resultdb.select('test_project'))
        self.assertEqual(len(ret), 6)

        ret = list(self.resultdb.select('test_project', limit=4))
        self.assertEqual(len(ret), 4)

        for ret in self.resultdb.select('test_project', fields=('url', ), limit=1):
            self.assertIn('url', ret)
            self.assertNotIn('result', ret)

    def test_35_select_limit(self):
        pass

    def test_z20_update_projects(self):
        self.resultdb.refresh()
        self.assertIn('drop_project2', self.resultdb.projects)
        self.assertNotIn('drop_project3', self.resultdb.projects)

@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')
class TestESTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.taskdb = database.connect_database(
            'elasticsearch+taskdb://127.0.0.1:9200/?index=test_pyspider_taskdb'
        )
        self.assertIsNotNone(self, self.taskdb)
        assert self.taskdb.index == 'test_pyspider_taskdb'

    @classmethod
    def tearDownClass(self):
        self.taskdb.es.indices.delete(index='test_pyspider_taskdb', ignore=[400, 404])


@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')
class TestCouchDBProjectDB(ProjectDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        # create a test admin user
        self.projectdb = database.connect_database(
            'couchdb+projectdb://localhost:5984/'
        )
        self.assertIsNotNone(self, self.projectdb)

    @classmethod
    def tearDownClass(self):
        # remove the test admin user
        self.projectdb.drop_database()


@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')
class TestCouchDBResultDB(ResultDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        # create a test admin user
        self.resultdb = database.connect_database(
            'couchdb+resultdb://localhost:5984/'
        )
        self.assertIsNotNone(self, self.resultdb)

    @classmethod
    def tearDownClass(self):
        # remove the test admin user
        self.resultdb.drop_database()

    def test_create_project(self):
        self.assertNotIn('test_create_project', self.resultdb.projects)
        self.resultdb._create_project('test_create_project')
        self.assertIn('test_create_project', self.resultdb.projects)


@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')
class TestCouchDBTaskDB(TaskDBCase, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        # create a test admin user
        import requests
        self.taskdb = database.connect_database(
            'couchdb+taskdb://localhost:5984/'
        )
        self.assertIsNotNone(self, self.taskdb)

    @classmethod
    def tearDownClass(self):
        # remove the test admin user
        import requests
        from requests.auth import HTTPBasicAuth
        self.taskdb.drop_database()

    def test_create_project(self):
        self.assertNotIn('test_create_project', self.taskdb.projects)
        self.taskdb._create_project('test_create_project')
        self.assertIn('test_create_project', self.taskdb.projects)


if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_fetcher.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-15 22:10:35

import os
import json
import copy
import time
import socket
import umsgpack
import subprocess
import unittest

import logging
import logging.config
logging.config.fileConfig("pyspider/logging.conf")

try:
    from six.moves import xmlrpc_client
except ImportError:
    import xmlrpclib as xmlrpc_client
from pyspider.libs import utils
from pyspider.libs.multiprocessing_queue import Queue
from pyspider.libs.response import rebuild_response
from pyspider.fetcher.tornado_fetcher import Fetcher


class TestFetcher(unittest.TestCase):
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen(['phantomjs',
                os.path.join(os.path.dirname(__file__),
                    '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_10_http_get(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_15_http_post(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)

        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)

    def test_20_dataurl_get(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        request= copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # utf8 encoding 中文
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/post'
        request['fetch']['method'] = 'POST'
        # gbk encoding 中文
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

        response = rebuild_response(result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

    def test_65_418(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_69_no_phantomjs(self):
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None

        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 501, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)

    def test_75_phantomjs_robots(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_80_phantomjs_timeout(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('js_script_result', result)

    def test_90_phantomjs_js_script(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site-binux.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get?username=binux&password=123456'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a140_redirect(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.url, self.httpbin+'/get')

    def test_a150_too_much_redirect(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)

    def test_a170_validate_cert(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['validate_cert'] = False
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['max_redirects'] = 10
        request['url'] = self.httpbin+'/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['robots_txt'] = False
        request['url'] = self.httpbin+'/deny'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'

        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)

        self.fetcher.phantomjs_proxy = phantomjs_proxy

@unittest.skipIf(os.environ.get('IGNORE_SPLASH') or os.environ.get('IGNORE_ALL'), 'no splash server for test.')
class TestSplashFetcher(unittest.TestCase):
    @property
    def sample_task_http(self):
        return {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'headers': {
                    'Cookie': 'a=b',
                    'a': 'b'
                },
                'cookies': {
                    'c': 'd',
                },
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        }

    @classmethod
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--bind=0.0.0.0',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830'
        
    @classmethod
    def tearDownClass(self):
        self.rpc("close")()
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        self.rpc._quit()
        self.thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_69_no_splash(self):
        splash_endpoint = self.fetcher.splash_endpoint
        self.fetcher.splash_endpoint = None

        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 501, result)

        self.fetcher.splash_endpoint = splash_endpoint

    def test_70_splash_url(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)

    def test_75_splash_robots(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_80_splash_timeout(self):
        request = self.sample_task_http
        request['url'] = self.httpbin+'/delay/5'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        # self.assertIn('js_script_result', result) TODO: lua nil is not exists

    def test_90_splash_js_script(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_95_splash_js_script_2(self):
        request = self.sample_task_http
        request['url'] = self.httpbin + '/pyspider/ajax_click.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        request['fetch']['js_script'] = 'function() { document.querySelector("a").click(); return "abc" }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])
        self.assertIn('abc', result['js_script_result'])

    def test_a100_splash_sharp_url(self):
        request = self.sample_task_http
        request['url'] = self.httpbin+'/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'splash'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a120_http_get_with_proxy_fail_1(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a120_http_get_with_proxy_fail(self):
        self.fetcher.proxy = self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok_1(self):
        self.fetcher.proxy = 'http://binux:123456@%s/' % self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        self.fetcher.proxy = 'http://binux:123456@%s/' % self.proxy
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'
        request['fetch']['fetch_type'] = 'splash'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

        data = json.loads(response.doc('pre').text())
        self.assertEqual(data['headers'].get('A'), 'b', response.content)
        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)
        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)
        self.fetcher.proxy = None


================================================
FILE: tests/test_fetcher_processor.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-18 14:09:41

import os
import time
import httpbin
import subprocess
import unittest

from pyspider.database.local.projectdb import ProjectDB
from pyspider.fetcher import Fetcher
from pyspider.processor import Processor
from pyspider.libs import utils, dataurl
from six.moves.queue import Queue
from tests.data_fetcher_processor_handler import Handler


class TestFetcherProcessor(Handler, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')])
        self.fetcher = Fetcher(None, None, async_mode=False)
        self.status_queue = Queue()
        self.newtask_queue = Queue()
        self.result_queue = Queue()
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        self.processor = Processor(projectdb=self.projectdb,
                                   inqueue=None,
                                   status_queue=self.status_queue,
                                   newtask_queue=self.newtask_queue,
                                   result_queue=self.result_queue)
        self.project_name = 'data_fetcher_processor_handler'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

    @classmethod
    def crawl(self, url=None, track=None, **kwargs):
        if url is None and kwargs.get('callback'):
            url = dataurl.encode(utils.text(kwargs.get('callback')))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data['instance']
        instance._reset()
        task = instance.crawl(url, **kwargs)
        if isinstance(task, list):
            task = task[0]
        task['track'] = track
        result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result

    @classmethod
    def assertStatusOk(self, status):
        self.assertTrue(self.status_ok(status, 'fetch'), status.get('track', {}).get('fetch'))
        self.assertTrue(self.status_ok(status, 'process'), status.get('track', {}).get('process'))

    @classmethod
    def status_ok(self, status, type):
        if not status:
            return False
        return status.get('track', {}).get(type, {}).get('ok', False)

    def test_10_not_status(self):
        status, newtasks, result = self.crawl(callback=self.not_send_status.__name__)

        self.assertIsNone(status)
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 'not_send_status')

    def test_20_url_deduplicated(self):
        status, newtasks, result = self.crawl(callback=self.url_deduplicated)

        self.assertStatusOk(status)
        self.assertIsNone(status['track']['fetch']['error'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertFalse(status['track']['fetch']['headers'])
        self.assertFalse(status['track']['process']['logs'])
        self.assertEqual(len(newtasks), 2, newtasks)
        self.assertIsNone(result)

    def test_30_catch_status_code_error(self):
        status, newtasks, result = self.crawl(self.httpbin + '/status/418', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertIn('HTTP 418', status['track']['fetch']['error'])
        self.assertTrue(status['track']['fetch']['content'], '')
        self.assertTrue(status['track']['fetch']['headers'])
        self.assertTrue(status['track']['process']['logs'])
        self.assertIn('HTTPError: HTTP 418', status['track']['process']['logs'])
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + '/status/400', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 400)

        status, newtasks, result = self.crawl(self.httpbin + '/status/500', callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        status, newtasks, result = self.crawl(self.httpbin + '/status/302',
                                              allow_redirects=False,
                                              callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        status, newtasks, result = self.crawl(self.httpbin + '/delete', method='DELETE', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + '/get', method='DELETE', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get', params={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})

    def test_60_data(self):
        status, newtasks, result = self.crawl(self.httpbin + '/post', data={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})

    def test_70_redirect(self):
        status, newtasks, result = self.crawl(self.httpbin + '/redirect-to?url=/get', callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status['track']['fetch']['redirect_url'], self.httpbin + '/get')
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        status, newtasks, result = self.crawl(self.httpbin + '/redirect/10', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(status['track']['fetch']['status_code'], 599)
        self.assertIn('redirects followed', status['track']['fetch']['error'])

    def test_90_files(self):
        status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a100_files_with_data(self):
        status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              data={
                                                  'roy': 'binux',
                                                  # '中文': '.', # FIXME: not work
                                              },
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux'})
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a110_headers(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              headers={
                                                  'a': 'b',
                                                  'C-d': 'e-F',
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('A'), 'b')
        self.assertEqual(result['headers'].get('C-D'), 'e-F')

    def test_a115_user_agent(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              user_agent='binux', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('User-Agent'), 'binux')

    def test_a120_cookies(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a130_cookies_with_headers(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              headers={
                                                  'Cookie': 'g=h; I=j',
                                              },
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('g=h', result['headers'].get('Cookie'))
        self.assertIn('I=j', result['headers'].get('Cookie'))
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a140_response_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cookies/set?k1=v1&k2=v2',
                                              callback=self.cookies)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})

    def test_a145_redirect_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cookies/set?k1=v1&k2=v2',
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['cookies'], {'k1': 'v1', 'k2': 'v2'})

    def test_a150_timeout(self):
        status, newtasks, result = self.crawl(self.httpbin + '/delay/2', timeout=1, callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(int(status['track']['fetch']['time']), 1)

    def test_a160_etag(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache', etag='abc', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modified(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache', last_modified='0', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              save={'roy': 'binux', u'中文': 'value'})

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})

    def test_a190_taskid(self):
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              taskid='binux-taskid')

        self.assertStatusOk(status)
        self.assertEqual(status['taskid'], 'binux-taskid')
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a200_no_proxy(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              params={
                                                  'test': 'a200'
                                              }, proxy=False, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.fetcher.proxy = old_proxy

    def test_a210_proxy_failed(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              params={
                                                  'test': 'a210'
                                              }, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 403)
        self.fetcher.proxy = old_proxy

    def test_a220_proxy_ok(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              params={
                                                  'test': 'a220',
                                                  'username': 'binux',
                                                  'password': '123456',
                                              }, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)
        self.fetcher.proxy = old_proxy

    def test_a230_proxy_parameter_fail(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              params={
                                                  'test': 'a230',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(result, 403)

    def test_a240_proxy_parameter_ok(self):
        status, newtasks, result = self.crawl(self.httpbin + '/post',
                                              method='POST',
                                              data={
                                                  'test': 'a240',
                                                  'username': 'binux',
                                                  'password': '123456',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a250_proxy_userpass(self):
        status, newtasks, result = self.crawl(self.httpbin + '/post',
                                              method='POST',
                                              data={
                                                  'test': 'a250',
                                              }, proxy='binux:123456@' + self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a260_process_save(self):
        status, newtasks, result = self.crawl(callback=self.set_process_save)

        self.assertStatusOk(status)
        self.assertIn('roy', status['track']['save'])
        self.assertEqual(status['track']['save']['roy'], 'binux')

        status, newtasks, result = self.crawl(callback=self.get_process_save,
                                              track=status['track'])

        self.assertStatusOk(status)
        self.assertIn('roy', result)
        self.assertEqual(result['roy'], 'binux')

    def test_zzz_links(self):
        status, newtasks, result = self.crawl(self.httpbin + '/links/10/0', callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_zzz_html(self):
        status, newtasks, result = self.crawl(self.httpbin + '/html', callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, 'Herman Melville - Moby-Dick')

    def test_zzz_etag_enabled(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status, newtasks, result = self.crawl(self.httpbin + '/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_zzz_etag_not_working(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status['track']['process']['ok'] = False
        status, newtasks, result = self.crawl(self.httpbin + '/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

    def test_zzz_unexpected_crawl_argument(self):
        with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"):
            self.crawl(self.httpbin + '/cache', cookie={}, callback=self.json)

    def test_zzz_curl_get(self):
        status, newtasks, result = self.crawl(
            "curl '" + self.httpbin + '''/get' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Binux-Header: Binux-Value' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' --compressed''',
            callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result['headers'].get('Binux-Header'), 'Binux-Value')

    def test_zzz_curl_post(self):
        status, newtasks, result = self.crawl(
            "curl '" + self.httpbin + '''/post' -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'Binux-Key=%E4%B8%AD%E6%96%87+value' --compressed''',
            callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result['form'].get('Binux-Key'), '中文 value')

    def test_zzz_curl_put(self):
        status, newtasks, result = self.crawl(
            "curl '" + self.httpbin + '''/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryYlkgyaA7SRGOQYUG' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary $'------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="Binux-Key"\r\n\r\n%E4%B8%AD%E6%96%87+value\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="fileUpload1"; filename="1"\r\nContent-Type: application/octet-stream\r\n\r\n\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG--\r\n' --compressed''',
            callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertIn('fileUpload1', result['files'], result)

    def test_zzz_curl_no_url(self):
        with self.assertRaisesRegexp(TypeError, 'no URL'):
            status, newtasks, result = self.crawl(
                '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''',
                callback=self.json)

    def test_zzz_curl_bad_option(self):
        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin,
                callback=self.json)

        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin,
                callback=self.json)

    def test_zzz_robots_txt(self):
        status, newtasks, result = self.crawl(self.httpbin + '/deny', robots_txt=True, callback=self.catch_http_error)

        self.assertEqual(result, 403)

    def test_zzz_connect_timeout(self):
        start_time = time.time()
        status, newtasks, result = self.crawl('http://240.0.0.1/', connect_timeout=5, callback=self.catch_http_error)
        end_time = time.time()
        self.assertTrue(5 <= end_time - start_time <= 6)

================================================
FILE: tests/test_message_queue.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-10-07 10:33:38

import os
import six
import time
import unittest

from pyspider.libs import utils
from six.moves import queue as Queue


class TestMessageQueue(object):

    @classmethod
    def setUpClass(self):
        raise NotImplementedError

    def test_10_put(self):
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        self.q1.put('TEST_DATA1', timeout=3)
        self.q1.put('TEST_DATA2_中文', timeout=3)
        time.sleep(0.01)
        self.assertEqual(self.q1.qsize(), 2)
        self.assertEqual(self.q2.qsize(), 2)

    def test_20_get(self):
        self.assertEqual(self.q1.get(timeout=0.01), 'TEST_DATA1')
        self.assertEqual(self.q2.get_nowait(), 'TEST_DATA2_中文')
        with self.assertRaises(Queue.Empty):
            self.q2.get(timeout=0.01)
        with self.assertRaises(Queue.Empty):
            self.q2.get_nowait()

    def test_30_full(self):
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        for i in range(2):
            self.q1.put_nowait('TEST_DATA%d' % i)
        for i in range(3):
            self.q2.put('TEST_DATA%d' % i)

        with self.assertRaises(Queue.Full):
            self.q1.put('TEST_DATA6', timeout=0.01)
        with self.assertRaises(Queue.Full):
            self.q1.put_nowait('TEST_DATA6')

    def test_40_multiple_threading_error(self):
        def put(q):
            for i in range(100):
                q.put("DATA_%d" % i)

        def get(q):
            for i in range(100):
                q.get()

        t = utils.run_in_thread(put, self.q3)
        get(self.q3)
        t.join()


class BuiltinQueue(TestMessageQueue, unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue
        with utils.timeout(3):
            self.q1 = self.q2 = connect_message_queue('test_queue', maxsize=5)
            self.q3 = connect_message_queue('test_queue_for_threading_test')


#@unittest.skipIf(six.PY3, 'pika not suport python 3')
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
class TestPikaRabbitMQ(TestMessageQueue, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import rabbitmq
        with utils.timeout(3):
            self.q1 = rabbitmq.PikaQueue('test_queue', maxsize=5, lazy_limit=False)
            self.q2 = rabbitmq.PikaQueue('test_queue', amqp_url='amqp://localhost:5672/%2F', maxsize=5, lazy_limit=False)
            self.q3 = rabbitmq.PikaQueue('test_queue_for_threading_test', amqp_url='amqp://guest:guest@localhost:5672/', lazy_limit=False)
        self.q2.delete()
        self.q2.reconnect()
        self.q3.delete()
        self.q3.reconnect()

    @classmethod
    def tearDownClass(self):
        self.q2.delete()
        self.q3.delete()
        del self.q1
        del self.q2
        del self.q3

    def test_30_full(self):
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        for i in range(2):
            self.q1.put_nowait('TEST_DATA%d' % i)
        for i in range(3):
            self.q2.put('TEST_DATA%d' % i)

        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put_nowait('TEST_DATA6')
        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put('TEST_DATA6', timeout=0.01)


@unittest.skipIf(six.PY3, 'Python 3 now using Pika')
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
class TestAmqpRabbitMQ(TestMessageQueue, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue
        with utils.timeout(3):
            self.q1 = connect_message_queue('test_queue', 'amqp://localhost:5672/',
                                            maxsize=5, lazy_limit=False)
            self.q2 = connect_message_queue('test_queue', 'amqp://localhost:5672/%2F',
                                            maxsize=5, lazy_limit=False)
            self.q3 = connect_message_queue('test_queue_for_threading_test',
                                            'amqp://guest:guest@localhost:5672/', lazy_limit=False)
        self.q2.delete()
        self.q2.reconnect()
        self.q3.delete()
        self.q3.reconnect()

    @classmethod
    def tearDownClass(self):
        self.q2.delete()
        self.q3.delete()
        del self.q1
        del self.q2
        del self.q3

    def test_30_full(self):
        self.assertEqual(self.q1.qsize(), 0)
        self.assertEqual(self.q2.qsize(), 0)
        for i in range(2):
            self.q1.put_nowait('TEST_DATA%d' % i)
        for i in range(3):
            self.q2.put('TEST_DATA%d' % i)

        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put('TEST_DATA6', timeout=0.01)
        print(self.q1.__dict__)
        print(self.q1.qsize())
        with self.assertRaises(Queue.Full):
            self.q1.put_nowait('TEST_DATA6')


@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')
class TestRedisQueue(TestMessageQueue, unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue
        from pyspider.message_queue import redis_queue
        with utils.timeout(3):
            self.q1 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False)
            self.q2 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False)
            self.q3 = connect_message_queue('test_queue_for_threading_test',
                                            'redis://localhost:6379/')
            while not self.q1.empty():
                self.q1.get()
            while not self.q2.empty():
                self.q2.get()
            while not self.q3.empty():
                self.q3.get()

    @classmethod
    def tearDownClass(self):
        while not self.q1.empty():
            self.q1.get()
        while not self.q2.empty():
            self.q2.get()
        while not self.q3.empty():
            self.q3.get()

class TestKombuQueue(TestMessageQueue, unittest.TestCase):
    kombu_url = 'kombu+memory://'

    @classmethod
    def setUpClass(self):
        from pyspider.message_queue import connect_message_queue
        with utils.timeout(3):
            self.q1 = connect_message_queue('test_queue', self.kombu_url, maxsize=5, lazy_limit=False)
            self.q2 = connect_message_queue('test_queue', self.kombu_url, maxsize=5, lazy_limit=False)
            self.q3 = connect_message_queue('test_queue_for_threading_test', self.kombu_url, lazy_limit=False)
            while not self.q1.empty():
                self.q1.get()
            while not self.q2.empty():
                self.q2.get()
            while not self.q3.empty():
                self.q3.get()

    @classmethod
    def tearDownClass(self):
        while not self.q1.empty():
            self.q1.get()
        self.q1.delete()
        while not self.q2.empty():
            self.q2.get()
        self.q2.delete()
        while not self.q3.empty():
            self.q3.get()
        self.q3.delete()

@unittest.skip('test cannot pass, get is buffered')
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
class TestKombuAmpqQueue(TestKombuQueue):
    kombu_url = 'kombu+amqp://'

@unittest.skip('test cannot pass, put is buffered')
@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')
class TestKombuRedisQueue(TestKombuQueue):
    kombu_url = 'kombu+redis://'

@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
class TestKombuMongoDBQueue(TestKombuQueue):
    kombu_url = 'kombu+mongodb://'


================================================
FILE: tests/test_processor.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-22 14:00:05

import os
import six
import copy
import time
import unittest
import logging.config
logging.config.fileConfig("pyspider/logging.conf")

from pyspider.libs import utils
from pyspider.processor.project_module import ProjectManager


class TestProjectModule(unittest.TestCase):

    @property
    def base_task(self):
        return {
            'taskid': 'taskid',
            'project': 'test.project',
            'url': 'www.baidu.com/',
            'schedule': {
                'priority': 1,
                'retries': 3,
                'exetime': 0,
                'age': 3600,
                'itag': 'itag',
                'recrawl': 5,
            },
            'fetch': {
                'method': 'GET',
                'headers': {
                    'Cookie': 'a=b',
                },
                'data': 'a=b&c=d',
                'timeout': 60,
                'save': [1, 2, 3],
            },
            'process': {
                'callback': 'callback',
            },
        }

    @property
    def fetch_result(self):
        return {
            'status_code': 200,
            'orig_url': 'www.baidu.com/',
            'url': 'http://www.baidu.com/',
            'headers': {
                'cookie': 'abc',
            },
            'content': 'test data',
            'cookies': {
                'a': 'b',
            },
            'save': [1, 2, 3],
        }

    def setUp(self):
        self.project = "test.project"
        self.script = open(os.path.join(os.path.dirname(__file__), 'data_handler.py')).read()
        self.env = {
            'test': True,
        }
        self.project_info = {
            'name': self.project,
            'status': 'DEBUG',
        }
        data = ProjectManager.build_module({
            'name': self.project,
            'script': self.script
        }, {'test': True})
        self.module = data['module']
        self.instance = data['instance']

    def test_2_hello(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'hello'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, "hello world!")

    def test_3_echo(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'echo'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, "test data")

    def test_4_saved(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'saved'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, base_task['fetch']['save'])

    def test_5_echo_task(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'echo_task'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, self.project)

    def test_6_catch_status_code(self):
        base_task = self.base_task
        fetch_result = self.fetch_result
        fetch_result['status_code'] = 403
        base_task['process']['callback'] = 'catch_status_code'
        ret = self.instance.run_task(self.module, base_task, fetch_result)
        self.assertIsNone(ret.exception)
        self.assertEqual(ret.result, 403)

    def test_7_raise_exception(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'raise_exception'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNotNone(ret.exception)
        logstr = ret.logstr()
        self.assertIn('info', logstr)
        self.assertIn('warning', logstr)
        self.assertIn('error', logstr)

    def test_8_add_task(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'add_task'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception, ret.logstr())
        self.assertEqual(len(ret.follows), 1)
        self.assertEqual(len(ret.messages), 1)

    def test_10_cronjob(self):
        task = {
            'taskid': '_on_cronjob',
            'project': self.project,
            'url': 'data:,_on_cronjob',
            'fetch': {
                'save': {
                    'tick': 11,
                },
            },
            'process': {
                'callback': '_on_cronjob',
            },
        }
        fetch_result = self.fetch_result
        fetch_result['save'] = {
            'tick': 11,
        }
        ret = self.instance.run_task(self.module, task, fetch_result)
        logstr = ret.logstr()
        self.assertNotIn('on_cronjob1', logstr)
        self.assertNotIn('on_cronjob2', logstr)

        task['fetch']['save']['tick'] = 10
        fetch_result['save'] = task['fetch']['save']
        ret = self.instance.run_task(self.module, task, fetch_result)
        logstr = ret.logstr()
        self.assertNotIn('on_cronjob1', logstr)
        self.assertIn('on_cronjob2', logstr)

        task['fetch']['save']['tick'] = 60
        fetch_result['save'] = task['fetch']['save']
        ret = self.instance.run_task(self.module, task, fetch_result)
        logstr = ret.logstr()
        self.assertIn('on_cronjob1', logstr)
        self.assertIn('on_cronjob2', logstr)

    def test_20_get_info(self):
        task = {
            'taskid': '_on_get_info',
            'project': self.project,
            'url': 'data:,_on_get_info',
            'fetch': {
                'save': ['min_tick', 'retry_delay'],
            },
            'process': {
                'callback': '_on_get_info',
            },
        }
        fetch_result = self.fetch_result
        fetch_result['save'] = task['fetch']['save']

        ret = self.instance.run_task(self.module, task, fetch_result)
        self.assertEqual(len(ret.save), 2, ret.logstr())
        for each in ret.follows:
            self.assertEqual(each['url'], 'data:,on_get_info')
            self.assertEqual(each['fetch']['save']['min_tick'], 10)
            self.assertEqual(each['fetch']['save']['retry_delay'], {})

    def test_30_generator(self):
        base_task = self.base_task
        base_task['process']['callback'] = 'generator'
        ret = self.instance.run_task(self.module, base_task, self.fetch_result)
        self.assertIsNone(ret.exception)
        self.assertIn('generator object', repr(ret.result))

    def test_40_sleep(self):
        base_task = self.base_task
        fetch_result = self.fetch_result
        base_task['process']['callback'] = 'sleep'
        fetch_result['save'] = 1

        start_time = time.time()
        ret = self.instance.run_task(self.module, base_task, fetch_result)
        self.assertGreaterEqual(time.time() - start_time, 1)

    def test_50_timeout(self):
        base_task = self.base_task
        fetch_result = self.fetch_result
        base_task['process']['callback'] = 'sleep'
        base_task['process']['process_time_limit'] = 0.5
        fetch_result['save'] = 2

        start_time = time.time()

        ret = self.instance.run_task(self.module, base_task, fetch_result)
        self.assertIsNotNone(ret.exception)
        logstr = ret.logstr()
        self.assertIn('TimeoutError: process timeout', logstr)

        self.assertGreaterEqual(time.time() - start_time, 1)
        self.assertLess(time.time() - start_time, 2)

    def test_60_timeout_in_thread(self):
        base_task = self.base_task
        fetch_result = self.fetch_result
        base_task['process']['callback'] = 'sleep'
        base_task['process']['process_time_limit'] = 0.5
        fetch_result['save'] = 2

        start_time = time.time()
        thread = utils.run_in_thread(lambda self=self: self.instance.run_task(self.module, base_task, fetch_result))
        thread.join()
        self.assertGreaterEqual(time.time() - start_time, 2)


import shutil
import inspect
from pyspider.database.sqlite import projectdb
from pyspider.processor.processor import Processor
from pyspider.libs.multiprocessing_queue import Queue
from pyspider.libs.utils import run_in_thread
from pyspider.libs import sample_handler


class TestProcessor(unittest.TestCase):
    projectdb_path = './data/tests/project.db'

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()
        self.in_queue = Queue(10)
        self.status_queue = Queue(10)
        self.newtask_queue = Queue(10)
        self.result_queue = Queue(10)

        def run_processor():
            self.processor = Processor(get_projectdb(), self.in_queue,
                                       self.status_queue, self.newtask_queue, self.result_queue)
            self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            self.processor.run()
        self.process = run_in_thread(run_processor)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        if self.process.is_alive():
            self.processor.quit()
            self.process.join(2)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_update_project(self):
        self.assertIsNone(self.processor.project_manager.get('test_project'))
        self.projectdb.insert('test_project', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(self.processor.project_manager.get('test_project'))

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "not_exists",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        self.in_queue.put((task, {}))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertIsNone(self.processor.project_manager.get('not_exists'))

    def test_20_broken_project(self):
        self.assertIsNone(self.processor.project_manager.get('test_broken_project'))
        self.projectdb.insert('test_broken_project', {
            'name': 'test_broken_project',
            'group': 'group',
            'status': 'DEBUG',
            'script': inspect.getsource(sample_handler)[:10],
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(self.processor.project_manager.get('test_broken_project'))
        project_data = self.processor.project_manager.get('test_broken_project')
        self.assertIsNotNone(project_data.get('exception'))

    def test_30_new_task(self):
        self.assertTrue(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())
        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            self.status_queue.get()
        self.assertFalse(self.newtask_queue.empty())

    def test_40_index_page(self):
        task = None
        while not self.newtask_queue.empty():
            task = self.newtask_queue.get()[0]
        self.assertIsNotNone(task)

        fetch_result = {
            "orig_url": task['url'],
            "content": (
                "<html><body>"
                "<a href='http://binux.me'>binux</a>"
                "<a href='http://binux.me/中文'>binux</a>"
                "<a href='http://binux.me/1'>1</a>"
                "<a href='http://binux.me/1'>2</a>"
                "</body></html>"
            ),
            "headers": {'a': 'b', 'etag': 'tag'},
            "status_code": 200,
            "url": task['url'],
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        self.assertFalse(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 200)
        self.assertEqual('tag', status['track']['fetch']['headers']['etag'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], True)
        self.assertGreater(status['track']['process']['time'], 0)
        self.assertEqual(status['track']['process']['follows'], 3)
        self.assertIsNone(status['track']['process']['result'])
        self.assertEqual(status['track']['process']['logs'], '')
        self.assertIsNone(status['track']['process']['exception'])

        tasks = self.newtask_queue.get()
        self.assertEqual(len(tasks), 3)
        self.assertEqual(tasks[0]['url'], 'http://binux.me/')
        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'), task['url'])

    def test_50_fetch_error(self):
        # clear new task queue
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        # clear status queue
        while not self.status_queue.empty():
            self.status_queue.get()

        task = {
            "process": {
                "callback": "index_page"
            },
            "project": "test_project",
            "taskid": "data:,test_fetch_error",
            "url": "data:,test_fetch_error"
        }

        fetch_result = {
            "orig_url": task['url'],
            "content": "test_fetch_error",
            "error": "test_fetch_error",
            "headers": {'a': 'b', 'last-modified': '123'},
            "status_code": 598,
            "url": task['url'],
            "time": 0,
        }

        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], False)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 598)
        self.assertEqual('123', status['track']['fetch']['headers']['last-modified'])
        self.assertIsNotNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertGreater(status['track']['process']['time'], 0)
        self.assertEqual(status['track']['process']['follows'], 0)
        self.assertIsNone(status['track']['process']['result'])
        self.assertGreater(len(status['track']['process']['logs']), 0)
        self.assertIsNotNone(status['track']['process']['exception'])

    def test_60_call_broken_project(self):
        # clear new task queue
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        # clear status queue
        while not self.status_queue.empty():
            self.status_queue.get()

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_broken_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start",
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertGreater(len(status['track']['process']['logs']), 0)
        self.assertIsNotNone(status['track']['process']['exception'])
        self.assertTrue(self.newtask_queue.empty())

    def test_70_update_project(self):
        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 1000000
        self.processor.project_manager._check_projects()
        self.assertIsNotNone(self.processor.project_manager.get('test_broken_project'))
        # clear new task queue
        while not self.newtask_queue.empty():
            self.newtask_queue.get()
        # clear status queue
        while not self.status_queue.empty():
            self.status_queue.get()

        task = {
            "process": {
                "callback": "on_start"
            },
            "project": "test_broken_project",
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }
        fetch_result = {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler),
        })

        # not update
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], False)

        # updated
        task['project_updatetime'] = time.time()
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], True)

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler)[:10],
        })

        # update with md5
        task['project_md5sum'] = 'testmd5'
        del task['project_updatetime']
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        while not self.status_queue.empty():
            status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], False)

        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1

    def test_80_import_project(self):
        self.projectdb.insert('test_project2', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        self.projectdb.insert('test_project3', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': inspect.getsource(sample_handler),
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })

        from projects import test_project
        self.assertIsNotNone(test_project)
        self.assertIsNotNone(test_project.Handler)

        from projects.test_project2 import Handler
        self.assertIsNotNone(Handler)

        import projects.test_project3
        self.assertIsNotNone(projects.test_project3.Handler)


================================================
FILE: tests/test_response.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-18 11:10:27


import os
import copy
import time
import httpbin
import unittest

import logging
import logging.config
logging.config.fileConfig("pyspider/logging.conf")

from pyspider.libs import utils
from pyspider.libs.response import rebuild_response
from pyspider.fetcher.tornado_fetcher import Fetcher

class TestResponse(unittest.TestCase):
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
    }

    @classmethod
    def setUpClass(self):
        self.fetcher = Fetcher(None, None, async_mode=False)
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()

    def get(self, url, **kwargs):
        if not url.startswith('http://'):
            url = self.httpbin + url
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = url
        request.update(kwargs)
        result = self.fetcher.fetch(request)
        response = rebuild_response(result)
        return response

    def test_10_html(self):
        response = self.get('/html')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('h1'))

    def test_20_xml(self):
        response = self.get('/xml')
        self.assertEqual(response.status_code, 200)
        self.assertIsNotNone(response.doc('item'))

    def test_30_gzip(self):
        response = self.get('/gzip')
        self.assertEqual(response.status_code, 200)
        self.assertIn('gzipped', response.text)

    def test_40_deflate(self):
        response = self.get('/deflate')
        self.assertEqual(response.status_code, 200)
        self.assertIn('deflated', response.text)

    def test_50_ok(self):
        response = self.get('/status/200')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        response = self.get('/status/302')
        self.assertTrue(response.ok)
        self.assertTrue(response)
        with self.assertRaises(Exception):
            self.raise_for_status(allow_redirects=False)

    def test_60_not_ok(self):
        response = self.get('/status/400')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/500')
        self.assertFalse(response.ok)
        self.assertFalse(response)
        response = self.get('/status/600')
        self.assertFalse(response.ok)
        self.assertFalse(response)

    def test_70_reraise_exception(self):
        response = self.get('file://abc')
        with self.assertRaisesRegex(Exception, 'HTTP 599'):
            response.raise_for_status()


================================================
FILE: tests/test_result_dump.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-10-12 22:17:57

from __future__ import unicode_literals, division

import six
import csv
import time
import json
import unittest
from six import StringIO

from pyspider.libs import result_dump

results1 = [
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),
     'result': {'a': 1, 'b': 2} },
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),
     'result': {'a': 1, 'b': 2, 'c': 3} },
]

results2 = results1 + [
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),
     'result': [1, 2, '中文', u'中文'] },
]

results_error = results2 + [
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),
     'result': None},
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time() },
    {'taskid': 'taskid1', 'pdatetime': time.time() },
]

result_list_error = [
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),
     'result': [{"rate": "8.2", "title": '1'}, {"rate": "8.2", "title": '1'}]},
    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),
     'result': [{"rate": "8.2", "title": '1'}, {"rate": "8.2", "title": '1'}]},
]

class TestResultDump(unittest.TestCase):
    def test_result_formater_1(self):
        common_fields, results = result_dump.result_formater(results1)
        self.assertEqual(common_fields, set(('a', 'b')))

    def test_result_formater_2(self):
        common_fields, results = result_dump.result_formater(results2)
        self.assertEqual(common_fields, set())

    def test_result_formater_error(self):
        common_fields, results = result_dump.result_formater(results_error)
        self.assertEqual(common_fields, set())

    def test_dump_as_json(self):
        for i, line in enumerate((''.join(
                result_dump.dump_as_json(results2))).splitlines()):
            self.assertDictEqual(results2[i], json.loads(line))

    def test_dump_as_json_valid(self):
        ret = json.loads(''.join(result_dump.dump_as_json(results2, True)))
        for i, j in zip(results2, ret):
            self.assertDictEqual(i, j)

    def test_dump_as_txt(self):
        for i, line in enumerate((''.join(
                result_dump.dump_as_txt(results2))).splitlines()):
            url, json_data = line.split('\t', 2)
            self.assertEqual(results2[i]['result'], json.loads(json_data))

    def test_dump_as_csv(self):
        reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(results1))))
        for row in reader:
            self.assertEqual(len(row), 4)

    def test_dump_as_csv_case_1(self):
        reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(result_list_error))))
        for row in reader:
            self.assertEqual(len(row), 2)


================================================
FILE: tests/test_result_worker.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-11-11 20:52:53

import os
import time
import unittest
import logging.config
logging.config.fileConfig("pyspider/logging.conf")

import shutil
from pyspider.database.sqlite import resultdb
from pyspider.result.result_worker import ResultWorker
from pyspider.libs.multiprocessing_queue import Queue
from pyspider.libs.utils import run_in_thread


class TestProcessor(unittest.TestCase):
    resultdb_path = './data/tests/result.db'

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()
        self.inqueue = Queue(10)

        def run_result_worker():
            self.result_worker = ResultWorker(get_resultdb(), self.inqueue)
            self.result_worker.run()
        self.process = run_in_thread(run_result_worker)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        if self.process.is_alive():
            self.result_worker.quit()
            self.process.join(2)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_bad_result(self):
        self.inqueue.put(({'project': 'test_project'}, {}))
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_10_bad_result_2(self):
        self.inqueue.put(({'project': 'test_project'}, {'a': 'b'}))
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_20_insert_result(self):
        data = {
            'a': 'b'
        }
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, data))
        time.sleep(0.5)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 1)
        self.assertEqual(self.resultdb.count('test_project'), 1)

        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], data)

    def test_30_overwrite(self):
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, "abc"))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], "abc")

    def test_40_insert_list(self):
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id2',
            'url': 'url1'
        }, ['a', 'b']))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id2')
        self.assertEqual(result['result'], ['a', 'b'])


================================================
FILE: tests/test_run.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-11-21 22:32:35

from __future__ import print_function

import os
import sys
import six
import time
import json
import signal
import shutil
import inspect
import requests
import unittest

from pyspider import run
from pyspider.libs import utils
from tests import data_sample_handler

class TestRun(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        import tests.data_test_webpage
        import httpbin
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def test_10_cli(self):
        ctx = run.cli.make_context('test', [], None, obj=dict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        self.assertEqual(ctx.obj.debug, False)
        for db in ('taskdb', 'projectdb', 'resultdb'):
            self.assertIsNotNone(getattr(ctx.obj, db))
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            self.assertIsNotNone(getattr(ctx.obj, name))
        self.assertEqual(len(ctx.obj.instances), 0)

    def test_20_cli_config(self):
        with open('./data/tests/config.json', 'w') as fp:
            json.dump({
                'debug': True,
                'taskdb': 'mysql+taskdb://localhost:23456/taskdb',
                'amqp-url': 'amqp://guest:guest@localhost:23456/%%2F'
            }, fp)
        ctx = run.cli.make_context('test',
                                   ['--config', './data/tests/config.json'],
                                   None, obj=dict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        self.assertEqual(ctx.obj.debug, True)

        import mysql.connector
        with self.assertRaises(mysql.connector.Error):
            ctx.obj.taskdb

        with self.assertRaises(Exception):
            ctx.obj.newtask_queue

    def test_30_cli_command_line(self):
        ctx = run.cli.make_context(
            'test',
            ['--projectdb', 'mongodb+projectdb://localhost:23456/projectdb'],
            None,
            obj=dict(testing_mode=True)
        )
        ctx = run.cli.invoke(ctx)

        from pymongo.errors import ConnectionFailure
        with self.assertRaises(ConnectionFailure):
            ctx.obj.projectdb

    def test_30a_cli_command_line(self):
        ctx = run.cli.make_context(
            'test',
            ['--projectdb', 'couchdb+projectdb://localhost:5984/projectdb'],
            None,
            obj=dict(testing_mode=True)
        )
        ctx = run.cli.invoke(ctx)

        with self.assertRaises(Exception):
            # TODO: MORE SPECIFIC
            ctx.obj.projectdb

    def test_40_cli_env(self):
        try:
            os.environ['RESULTDB'] = 'sqlite+resultdb://'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)

            from pyspider.database.sqlite import resultdb
            self.assertIsInstance(ctx.obj.resultdb, resultdb.ResultDB)
        finally:
            del os.environ['RESULTDB']

    @unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')
    def test_50_docker_rabbitmq(self):
        try:
            os.environ['RABBITMQ_NAME'] = 'rabbitmq'
            os.environ['RABBITMQ_PORT_5672_TCP_ADDR'] = 'localhost'
            os.environ['RABBITMQ_PORT_5672_TCP_PORT'] = '5672'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            queue = ctx.obj.newtask_queue
            queue.put('abc')
            queue.delete()
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['RABBITMQ_NAME']
            del os.environ['RABBITMQ_PORT_5672_TCP_ADDR']
            del os.environ['RABBITMQ_PORT_5672_TCP_PORT']

    @unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')
    def test_60_docker_mongodb(self):
        try:
            os.environ['MONGODB_NAME'] = 'mongodb'
            os.environ['MONGODB_PORT_27017_TCP_ADDR'] = 'localhost'
            os.environ['MONGODB_PORT_27017_TCP_PORT'] = '27017'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            ctx.obj.resultdb
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['MONGODB_NAME']
            del os.environ['MONGODB_PORT_27017_TCP_ADDR']
            del os.environ['MONGODB_PORT_27017_TCP_PORT']

    @unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')
    def test_60a_docker_couchdb(self):
        try:
            # create a test admin user
            os.environ['COUCHDB_NAME'] = 'couchdb'
            os.environ['COUCHDB_PORT_5984_TCP_ADDR'] = 'localhost'
            os.environ['COUCHDB_PORT_5984_TCP_PORT'] = '5984'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            ctx.obj.resultdb
        except Exception as e:
            self.assertIsNone(e)
        finally:
            # remove the test admin user
            del os.environ['COUCHDB_NAME']
            del os.environ['COUCHDB_PORT_5984_TCP_ADDR']
            del os.environ['COUCHDB_PORT_5984_TCP_PORT']

    @unittest.skip('only available in docker')
    @unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')
    def test_70_docker_mysql(self):
        try:
            os.environ['MYSQL_NAME'] = 'mysql'
            os.environ['MYSQL_PORT_3306_TCP_ADDR'] = 'localhost'
            os.environ['MYSQL_PORT_3306_TCP_PORT'] = '3306'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            ctx.obj.resultdb
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['MYSQL_NAME']
            del os.environ['MYSQL_PORT_3306_TCP_ADDR']
            del os.environ['MYSQL_PORT_3306_TCP_PORT']

    def test_80_docker_phantomjs(self):
        try:
            os.environ['PHANTOMJS_NAME'] = 'phantomjs'
            os.environ['PHANTOMJS_PORT_25555_TCP'] = 'tpc://binux:25678'
            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            self.assertEqual(ctx.obj.phantomjs_proxy, 'binux:25678')
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['PHANTOMJS_NAME']
            del os.environ['PHANTOMJS_PORT_25555_TCP']

    def test_90_docker_scheduler(self):
        try:
            os.environ['SCHEDULER_PORT_23333_TCP_ADDR'] = 'scheduler'
            os.environ['SCHEDULER_PORT_23333_TCP_PORT'] = '23333'

            ctx = run.cli.make_context('test', [], None,
                                       obj=dict(testing_mode=True))
            ctx = run.cli.invoke(ctx)
            webui = run.cli.get_command(ctx, 'webui')
            webui_ctx = webui.make_context('webui', [], ctx)
            app = webui.invoke(webui_ctx)
            rpc = app.config['scheduler_rpc']
            self.assertEqual(rpc._ServerProxy__host, '{}:{}'.format(os.environ['SCHEDULER_PORT_23333_TCP_ADDR'],
                                                                    os.environ['SCHEDULER_PORT_23333_TCP_PORT']))
        except Exception as e:
            self.assertIsNone(e)
        finally:
            del os.environ['SCHEDULER_PORT_23333_TCP_ADDR']
            del os.environ['SCHEDULER_PORT_23333_TCP_PORT']

    def test_a100_all(self):
        import subprocess
        #cmd = [sys.executable]
        cmd = ['coverage', 'run']
        p = subprocess.Popen(cmd+[
            inspect.getsourcefile(run),
            '--taskdb', 'sqlite+taskdb:///data/tests/all_test_task.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/all_test_result.db',
            '--projectdb', 'local+projectdb://'+inspect.getsourcefile(data_sample_handler),
            'all',
        ], close_fds=True, preexec_fn=os.setsid)

        try:
            limit = 30
            while limit >= 0:
                time.sleep(3)
                # click run
                try:
                    requests.post('http://localhost:5000/run', data={
                        'project': 'data_sample_handler',
                    })
                except requests.exceptions.ConnectionError:
                    limit -= 1
                    continue
                break

            limit = 30
            data = requests.get('http://localhost:5000/counter')
            self.assertEqual(data.status_code, 200)
            while data.json().get('data_sample_handler', {}).get('5m', {}).get('success', 0) < 5:
                time.sleep(1)
                data = requests.get('http://localhost:5000/counter')
                limit -= 1
                if limit <= 0:
                    break

            self.assertGreater(limit, 0)
            rv = requests.get('http://localhost:5000/results?project=data_sample_handler')
            self.assertIn('<th>url</th>', rv.text)
            self.assertIn('class=url', rv.text)
        except:
            raise
        finally:
            time.sleep(1)
            os.killpg(p.pid, signal.SIGTERM)
            p.wait()

    def test_a110_one(self):
        pid, fd = os.forkpty()
        #cmd = [sys.executable]
        cmd = ['coverage', 'run']
        cmd += [
            inspect.getsourcefile(run),
            'one',
            '-i',
            inspect.getsourcefile(data_sample_handler)
        ]

        if pid == 0:
            # child
            os.execvp(cmd[0], cmd)
        else:
            # parent
            def wait_text(timeout=1):
                import select
                text = []
                while True:
                    rl, wl, xl = select.select([fd], [], [], timeout)
                    if not rl:
                        break
                    try:
                        t = os.read(fd, 1024)
                    except OSError:
                        break
                    if not t:
                        break
                    t = utils.text(t)
                    text.append(t)
                    print(t, end='')
                return ''.join(text)

            text = wait_text(3)
            self.assertIn('new task data_sample_handler:on_start', text)
            self.assertIn('pyspider shell', text)

            os.write(fd, utils.utf8('run()\n'))
            text = wait_text()
            self.assertIn('task done data_sample_handler:on_start', text)

            os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('/robots.txt', text)

            os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin))
            text = wait_text()
            if '"title": "Links"' not in text:
                os.write(fd, utils.utf8('crawl("%s/links/10/1")\n' % self.httpbin))
                text = wait_text()
                self.assertIn('"title": "Links"', text)

            os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin))
            text = wait_text()
            self.assertIn('task retry', text)

            os.write(fd, b'quit_pyspider()\n')
            text = wait_text()
            self.assertIn('scheduler exiting...', text)
            os.close(fd)
            os.kill(pid, signal.SIGINT)

class TestSendMessage(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=dict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        self.xmlrpc_thread = utils.run_in_thread(scheduler.xmlrpc_run)
        self.scheduler_thread = utils.run_in_thread(scheduler.run)

        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        for each in self.ctx.obj.instances:
            each.quit()
        self.xmlrpc_thread.join()
        self.scheduler_thread.join()
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def test_10_send_message(self):
        ctx = run.send_message.make_context('send_message', [
            'test_project', 'test_message'
        ], self.ctx)
        self.assertTrue(run.send_message.invoke(ctx))
        while True:
            task = self.ctx.obj.scheduler2fetcher.get(timeout=1)
            if task['url'] == 'data:,on_message':
                break
        self.assertEqual(task['process']['callback'], '_on_message')


================================================
FILE: tests/test_scheduler.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<i@binux.me>
#         http://binux.me
# Created on 2014-02-08 22:37:13

import os
import time
import shutil
import unittest
import logging
import logging.config
logging.config.fileConfig("pyspider/logging.conf")

from pyspider.scheduler.task_queue import TaskQueue
from pyspider.libs import utils


class TestTaskQueue(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.task_queue = TaskQueue()
        self.task_queue.rate = 100000
        self.task_queue.burst = 100000
        self.task_queue.processing_timeout = 0.5

    def test_10_put(self):
        self.task_queue.put('a3', 0, time.time() + 0.5)
        self.task_queue.put('a4', 3, time.time() + 0.2)
        self.task_queue.put('a2', 0)
        self.task_queue.put('a1', 1)
        self.assertEqual(self.task_queue.size(), 4)

    def test_20_update(self):
        self.task_queue.put('a2', 4)
        self.assertEqual(self.task_queue.size(), 4)
        self.task_queue.put('a3', 2, 0)
        self.assertEqual(self.task_queue.size(), 4)

    def test_30_get_from_priority_queue(self):
        self.assertEqual(self.task_queue.get(), 'a2')
        self.assertEqual(self.task_queue.size(), 4)

    def test_40_time_queue_1(self):
        self.task_queue.check_update()
        self.assertEqual(self.task_queue.get(), 'a3')
        self.assertEqual(self.task_queue.size(), 4)

    def test_50_time_queue_2(self):
        time.sleep(0.3)
        self.task_queue.check_update()
        self.assertEqual(self.task_queue.get(), 'a4')
        self.assertEqual(self.task_queue.get(), 'a1')
        self.assertEqual(self.task_queue.size(), 4)

    def test_60_processing_queue(self):
        time.sleep(0.5)
        self.task_queue.check_update()
        self.assertEqual(self.task_queue.get(), 'a2')
        self.assertEqual(len(self.task_queue), 4)
        self.assertEqual(self.task_queue.get(), 'a4')
        self.assertEqual(self.task_queue.get(), 'a3')
        self.assertEqual(self.task_queue.get(), 'a1')
        self.assertEqual(len(self.task_queue), 4)

    def test_70_done(self):
        self.assertTrue(self.task_queue.done('a2'))
        self.assertTrue(self.task_queue.done('a1'))
        self.assertEqual(len(self.task_queue), 2)
        self.assertTrue(self.task_queue.done('a4'))
        self.assertTrue(self.task_queue.done('a3'))
        self.assertEqual(len(self.task_queue), 0)


from pyspider.scheduler.token_bucket import Bucket


class TestBucket(unittest.TestCase):

    def test_bucket(self):
        bucket = Bucket(100, 1000)
        self.assertEqual(bucket.get(), 1000)
        time.sleep(0.1)
        self.assertEqual(bucket.get(), 1000)
        bucket.desc(100)
        self.assertEqual(bucket.get(), 900)
        time.sleep(0.1)
        self.assertAlmostEqual(bucket.get(), 910, delta=2)
        time.sleep(0.1)
        self.assertAlmostEqual(bucket.get(), 920, delta=2)


try:
    from six.moves import xmlrpc_client
except ImportError:
    import xmlrpclib as xmlrpc_client
from pyspider.scheduler.scheduler import Scheduler
from pyspider.database.sqlite import taskdb, projectdb, resultdb
from pyspider.libs.multiprocessing_queue import Queue
from pyspider.libs.utils import run_in_thread


class TestScheduler(unittest.TestCase):
    taskdb_path = './data/tests/task.db'
    projectdb_path = './data/tests/project.db'
    resultdb_path = './data/tests/result.db'
    check_project_time = 1
    scheduler_xmlrpc_port = 23333

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)
        self.taskdb = get_taskdb()

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % self.scheduler_xmlrpc_port)

        def run_scheduler():
            scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(),
                                  newtask_queue=self.newtask_queue, status_queue=self.status_queue,
                                  out_queue=self.scheduler2fetcher, data_path="./data/tests/",
                                  resultdb=get_resultdb())
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            scheduler.DELETE_TIME = 0
            scheduler.DEFAULT_RETRY_DELAY = {'': 5}
            scheduler._last_tick = int(time.time())  # not dispatch cronjob
            self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        if self.process.is_alive():
            self.rpc._quit()
            self.process.join(5)
        self.xmlrpc_thread.join()
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests', ignore_errors=True)
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(self.scheduler_xmlrpc_port)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)

    def test_10_new_task_ignore(self):
        '''
        task_queue = [ ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url'
        })  # unknown project: test_project
        self.assertEqual(self.rpc.size(), 0)
        self.assertEqual(len(self.rpc.get_active_tasks()), 0)

    def test_20_new_project(self):
        '''
        task_queue = [ ]
        '''
        self.projectdb.insert('test_project', {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })

    def test_30_update_project(self):
        '''
        task_queue = [ ]
        '''
        from six.moves import queue as Queue
        with self.assertRaises(Queue.Empty):
            task = self.scheduler2fetcher.get(timeout=1)
        self.projectdb.update('test_project', status="DEBUG")
        time.sleep(0.1)
        self.rpc.update_project()

        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], '_on_get_info')  # select test_project:_on_get_info data:,_on_get_info

    def test_32_get_info(self):
        self.status_queue.put({
            'taskid': '_on_get_info',
            'project': 'test_project',
            'track': {
                'save': {
                    }
                }
            })
        # test_project on_get_info {}

    def test_34_new_not_used_project(self):
        '''
        task_queue = []
        '''
        self.projectdb.insert('test_project_not_started', {
            'name': 'test_project_not_started',
            'group': 'group',
            'status': 'RUNNING',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        })
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project_not_started:_on_get_info data:,_on_get_info
        self.assertEqual(task['taskid'], '_on_get_info')

    def test_35_new_task(self):
        '''
        task_queue = [ ]
        '''
        time.sleep(0.2)
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
            },
        })  # new task test_project:taskid url
        # task_queue = [ test_project:taskid ]

        time.sleep(0.5)
        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid
        self.assertGreater(len(self.rpc.get_active_tasks()), 0)
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')
        self.assertEqual(task['project'], 'test_project')
        self.assertIn('schedule', task)
        self.assertIn('fetch', task)
        self.assertIn('process', task)
        self.assertIn('track', task)
        self.assertEqual(task['fetch']['data'], 'abc')

    def test_37_force_update_processing_task(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url_force_update',
            'schedule': {
                'age': 10,
                'force_update': True,
            },
        })  # restart task test_project:taskid url_force_update
        time.sleep(0.2)
        # it should not block next

    def test_40_taskdone_error_no_project(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'no_project',
            'url': 'url'
        })  # unknown project: no_project
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_50_taskdone_error_no_track(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url'
        })  # Bad status pack: 'track'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {}
        })  # Bad status pack: 'process'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_60_taskdone_failed_retry(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': False
                },
            }
        })  # task retry 0/3 test_project:taskid url
        from six.moves import queue as Queue
        # with self.assertRaises(Queue.Empty):
            # task = self.scheduler2fetcher.get(timeout=4)
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)

    def test_70_taskdone_ok(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })  # task done test_project:taskid url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_75_on_finished_msg(self):
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:on_finished data:,on_finished

        self.assertEqual(task['taskid'], 'on_finished')

        self.status_queue.put({
            'taskid': 'on_finished',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })  # task done test_project:on_finished url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_80_newtask_age_ignore(self):
        '''
        processing = [ ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 30,
            },
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_82_newtask_via_rpc(self):
        '''
        processing = [ ]
        '''
        self.rpc.newtask({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 30,
            },
        })
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_90_newtask_with_itag(self):
        '''
        task_queue = [ ]
        processing = [ ]
        '''
        time.sleep(0.1)
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'itag': "abc",
                'retries': 1
            },
        })  # restart task test_project:taskid url

        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

        self.test_70_taskdone_ok()  # task done test_project:taskid url
        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

    def test_a10_newtask_restart_by_age(self):
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1
            },
        })  # restart task test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

    def test_a20_failed_retry(self):
        '''
        processing: [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': False
                },
            }
        })  # task retry 0/1 test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': False
                },
                'process': {
                    'ok': False
                },
            }
        })  # task failed test_project:taskid url

        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

        from six.moves import queue as Queue
        with self.assertRaises(Queue.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_a30_task_verify(self):
        self.assertFalse(self.rpc.newtask({
            #'taskid': 'taskid#',
            'project': 'test_project',
            'url': 'url',
        }))  # taskid not in task: {'project': 'test_project', 'url': 'url'}
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            #'project': 'test_project',
            'url': 'url',
        }))  # project not in task: {'url': 'url', 'taskid': 'taskid#'}
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'test_project',
            #'url': 'url',
        }))  # url not in task: {'project': 'test_project', 'taskid': 'taskid#'}
        self.assertFalse(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'not_exist_project',
            'url': 'url',
        }))  # unknown project: not_exist_project
        self.assertTrue(self.rpc.newtask({
            'taskid': 'taskid#',
            'project': 'test_project',
            'url': 'url',
        }))  # new task test_project:taskid# url

    def test_a40_success_recrawl(self):
        '''
        task_queue = [ test_project:taskid# ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
        })  # restart task test_project:taskid url
        task1 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid# url
        task2 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task1)
        self.assertIsNotNone(task2)
        self.assertTrue(task1['taskid'] == 'taskid#' or task2['taskid'] == 'taskid#')

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })  # task done test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_a50_failed_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        '''
        for i in range(3):
            self.status_queue.put({
                'taskid': 'taskid',
                'project': 'test_project',
                'url': 'url',
                'schedule': {
                    'age': 0,
                    'retries': 1,
                    'auto_recrawl': True,
                },
                'track': {
                    'fetch': {
                        'ok': True
                    },
                    'process': {
                        'ok': False
                    },
                }
            })
            # not processing pack: test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            task = self.scheduler2fetcher.get(timeout=10)
            self.assertIsNotNone(task)
            self.assertEqual(task['taskid'], 'taskid')

    def test_a60_disable_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
            },
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })  # task done test_project:taskid url

        from six.moves import queue as Queue
        with self.assertRaises(Queue.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_38_cancel_task(self):
        current_size = self.rpc.size()
        self.newtask_queue.put({
            'taskid': 'taskid_to_cancel',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'exetime': time.time() + 30
            },
        })  # new task test_project:taskid_to_cancel url
        # task_queue = [ test_project:taskid_to_cancel ]

        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), current_size+1)

        self.newtask_queue.put({
            'taskid': 'taskid_to_cancel',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'force_update': True,
                'age': 0,
                'cancel': True
            },
        })  # new cancel test_project:taskid_to_cancel url
        # task_queue = [ ]

        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), current_size)

    def test_x10_inqueue_limit(self):
        self.projectdb.insert('test_inqueue_project', {
            'name': 'test_inqueue_project',
            'group': 'group',
            'status': 'DEBUG',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 0,
            'burst': 0,
        })
        time.sleep(0.1)
        pre_size = self.rpc.size()
        for i in range(20):
            self.newtask_queue.put({
                'taskid': 'taskid%d' % i,
                'project': 'test_inqueue_project',
                'url': 'url',
                'schedule': {
                    'age': 3000,
                    'force_update': True,
                },
            })
        time.sleep(1)
        self.assertEqual(self.rpc.size() - pre_size, 10)

    def test_x20_delete_project(self):
        self.assertIsNotNone(self.projectdb.get('test_inqueue_project'))
        #self.assertIsNotNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))
        self.projectdb.update('test_inqueue_project', status="STOP", group="lock,delete")
        time.sleep(1)
        self.assertIsNone(self.projectdb.get('test_inqueue_project'))
        self.taskdb._list_project()
        self.assertIsNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))
        self.assertNotIn('test_inqueue_project', self.rpc.counter('5m', 'sum'))

    def test_z10_startup(self):
        self.assertTrue(self.process.is_alive())

    def test_z20_quit(self):
        self.rpc._quit()
        time.sleep(0.2)
        self.assertFalse(self.process.is_alive())
        self.assertEqual(
            self.taskdb.get_task('test_project', 'taskid')['status'],
            self.taskdb.SUCCESS
        )


from pyspider.scheduler.scheduler import Project

class TestProject(unittest.TestCase):
    task_pack = {
        'type': Scheduler.TASK_PACK,
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'fetch': {
            'data': 'abc',
        },
        'process': {
            'data': 'abc',
        },
        'schedule': {
            'age': 0,
        },
    }

    status_ok_pack = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': {
            'age': 0,
            'retries': 1,
        },
        'track': {
            'fetch': {
                'ok': True
            },
            'process': {
                'ok': True
            },
        }
    }

    status_fail_pack = {
        'taskid': 'taskid',
        'project': 'test_project',
        'url': 'url',
        'schedule': {
            'age': 0,
            'retries': 1,
        },
        'track': {
            'fetch': {
                'ok': False
            },
            'process': {
                'ok': False
            },
        }
    }

    @classmethod
    def setUpClass(self):
        self.scheduler = Scheduler(taskdb=None, projectdb=None, newtask_queue=None, status_queue=None, out_queue=None)
        self.scheduler.PAUSE_TIME = 2
        self.project = Project(self.scheduler, {
            'name': 'test_project_not_started',
            'group': 'group',
            'status': 'RUNNING',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
            'updatetime': time.time(),
        })

    def test_pause_10_unpaused(self):
        self.assertFalse(self.project.paused)

    def test_pause_20_no_enough_fail_tasks(self):
        for i in range(3):
            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))
        self.assertFalse(self.project.paused)

        for i in range(1):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_ok_pack)))
        for i in range(self.scheduler.FAIL_PAUSE_NUM - 5):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))
        self.assertFalse(self.project.paused)

        for i in range(5):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))
        for i in range(1):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_ok_pack)))
        self.assertFalse(self.project.paused)

        for i in range(self.scheduler.FAIL_PAUSE_NUM):
            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))
        self.assertFalse(self.project.paused)

    def test_pause_30_paused(self):
        for i in range(self.scheduler.FAIL_PAUSE_NUM):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))
        for i in range(self.scheduler.FAIL_PAUSE_NUM):
            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))
        self.assertTrue(self.project.paused)

    def test_pause_40_unpause_checking(self):
        time.sleep(3)
        self.assertFalse(self.project.paused)

    def test_pause_50_paused_again(self):
        for i in range(self.scheduler.UNPAUSE_CHECK_NUM):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))
        self.assertTrue(self.project.paused)

    def test_pause_60_unpause_checking(self):
        time.sleep(3)
        self.assertFalse(self.project.paused)

    def test_pause_70_unpaused(self):
        for i in range(1):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_ok_pack)))
        for i in range(self.scheduler.UNPAUSE_CHECK_NUM):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))
        for i in range(self.scheduler.FAIL_PAUSE_NUM):
            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))
        self.assertFalse(self.project.paused)
        self.assertFalse(self.project._paused)

    def test_pause_x_disable_auto_pause(self):
        fail_pause_num = self.scheduler.FAIL_PAUSE_NUM
        self.scheduler.FAIL_PAUSE_NUM = 0
        for i in range(100):
            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))
        self.assertFalse(self.project.paused)
        self.scheduler.FAIL_PAUSE_NUM = fail_pause_num


if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_task_queue.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time
import unittest

import six
from six.moves import queue as Queue

from pyspider.scheduler.task_queue import InQueueTask, TaskQueue


class TestTaskQueue(unittest.TestCase):
    """
        TestTaskQueue
    """

    def test_task_queue_in_time_order(self):
        tq = TaskQueue(rate=300, burst=1000)

        queues = dict()
        tasks = dict()

        for i in range(0, 100):
            it = InQueueTask(str(i), priority=int(i // 10), exetime=0)
            tq.put(it.taskid, it.priority, it.exetime)

            if it.priority not in queues:
                queues[it.priority] = Queue.Queue()

            q = queues[it.priority]  # type:Queue.Queue
            q.put(it)
            tasks[it.taskid] = it
            # six.print_('put, taskid=', it.taskid, 'priority=', it.priority, 'exetime=', it.exetime)
        for i in range(0, 100):
            task_id = tq.get()
            task = tasks[task_id]
            q = queues[task.priority]  # type: Queue.Queue
            expect_task = q.get()
            self.assertEqual(task_id, expect_task.taskid)
            self.assertEqual(task.priority, int(9 - i // 10))
            # six.print_('get, taskid=', task.taskid, 'priority=', task.priority, 'exetime=', task.exetime)

        self.assertEqual(tq.size(), 100)
        self.assertEqual(tq.priority_queue.qsize(), 0)
        self.assertEqual(tq.processing.qsize(), 100)
        for q in six.itervalues(queues):  # type:Queue.Queue
            self.assertEqual(q.qsize(), 0)
        pass

    pass


class TestTimeQueue(unittest.TestCase):
    def test_time_queue(self):

        # six.print_('Test time queue order by time only')

        tq = TaskQueue(rate=300, burst=1000)

        fifo_queue = Queue.Queue()

        interval = 5.0 / 1000

        for i in range(0, 20):
            it = InQueueTask(str(i), priority=int(i // 10), exetime=time.time() + (i + 1) * interval)
            tq.put(it.taskid, it.priority, it.exetime)
            fifo_queue.put(it)
            # six.print_('put, taskid=', it.taskid, 'priority=', it.priority, 'exetime=', it.exetime)

        self.assertEqual(tq.priority_queue.qsize(), 0)
        self.assertEqual(tq.processing.qsize(), 0)
        self.assertEqual(tq.time_queue.qsize(), 20)

        for i in range(0, 20):
            t1 = fifo_queue.get()
            t2 = tq.time_queue.get()
            self.assertEqual(t1.taskid, t2.taskid)
            # six.print_('get, taskid=', t2.taskid, 'priority=', t2.priority, 'exetime=', t2.exetime)
        self.assertEqual(tq.priority_queue.qsize(), 0)
        self.assertEqual(tq.processing.qsize(), 0)
        self.assertEqual(tq.time_queue.qsize(), 0)

        queues = dict()
        tasks = dict()
        for i in range(0, 20):
            priority = int(i // 10)
            it = InQueueTask(str(i), priority=priority, exetime=time.time() + (i + 1) * interval)
            tq.put(it.taskid, it.priority, it.exetime)
            tasks[it.taskid] = it

            if priority not in queues:
                queues[priority] = Queue.Queue()
            q = queues[priority]
            q.put(it)
            pass

        self.assertEqual(tq.priority_queue.qsize(), 0)
        self.assertEqual(tq.processing.qsize(), 0)
        self.assertEqual(tq.time_queue.qsize(), 20)

        time.sleep(20 * interval)
        tq.check_update()
        self.assertEqual(tq.priority_queue.qsize(), 20)
        self.assertEqual(tq.processing.qsize(), 0)
        self.assertEqual(tq.time_queue.qsize(), 0)
        for i in range(0, 20):
            taskid = tq.get()
            t1 = tasks[taskid]
            t2 = queues[t1.priority].get()
            self.assertEqual(t1.taskid, t2.taskid)

        self.assertEqual(tq.priority_queue.qsize(), 0)
        self.assertEqual(tq.processing.qsize(), 20)
        self.assertEqual(tq.time_queue.qsize(), 0)

        pass

    pass


if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_utils.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-01-18 16:53:49

import sys
import time
import unittest

from pyspider.libs import utils

class TestFetcher(unittest.TestCase):
    def test_readonlydict(self):
        data = dict(a='a', b=123)
        data['c'] = self
        data = utils.ReadOnlyDict(data)

        with self.assertRaises(Exception):
            data['d'] = 9

    def test_getitem(self):
        l = [1, 2]
        self.assertEqual(utils.getitem(l, 0), 1)
        self.assertEqual(utils.getitem(l, 1), 2)
        self.assertEqual(utils.getitem(l, 3), None)
        self.assertEqual(utils.getitem(l, 3, 9), 9)
        self.assertEqual(utils.getitem(l, 'key'), None)
        self.assertEqual(utils.getitem(l, 'key', 8), 8)
        data = dict(a='a', b=123)
        self.assertEqual(utils.getitem(data, 'a'), 'a')
        self.assertEqual(utils.getitem(data, 'b'), 123)
        self.assertEqual(utils.getitem(data, 'c'), None)
        self.assertEqual(utils.getitem(data, 'c', 9), 9)

    def test_format_data(self):
        now = time.time()
        self.assertEqual(utils.format_date(now - 30), '30 seconds ago')
        self.assertEqual(utils.format_date(now - 60), '1 minute ago')
        self.assertEqual(utils.format_date(now - 2*60), '2 minutes ago')
        self.assertEqual(utils.format_date(now - 30*60), '30 minutes ago')
        self.assertEqual(utils.format_date(now - 60*60), '1 hour ago')
        self.assertEqual(utils.format_date(1963475336), 'Mar 21, 2032 at 9:48')
        self.assertEqual(utils.format_date(now - 12*60*60), '12 hours ago')
        self.assertRegex(utils.format_date(now - 24*60*60), r'^yesterday at \d{1,2}:\d{2}$')
        self.assertRegex(utils.format_date(now - 2*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$')
        self.assertRegex(utils.format_date(now - 3*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$')
        self.assertRegex(utils.format_date(now - 4*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$')
        self.assertRegex(utils.format_date(now - 5*24*60*60), r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$')
        self.assertRegex(utils.format_date(now - 333*24*60*60), r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$')
        self.assertRegex(utils.format_date(now - 334*24*60*60), r'^[A-Z][a-z]+ \d{1,2}, \d{4} at \d{1,2}:\d{2}$')


================================================
FILE: tests/test_webdav.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-06-03 21:15

import os
import sys
import six
import time
import shutil
import inspect
import unittest

from six import BytesIO
from pyspider import run
from pyspider.libs import utils
from tests import data_sample_handler, data_handler

@unittest.skipIf(sys.version_info >= (3, 6), "easywebdav doesn't support python 3.6")
class TestWebDav(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        import easywebdav

        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=utils.ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.webui.make_context('webui', [
            '--username', 'binux',
            '--password', '4321',
        ], self.ctx)
        self.app = run.webui.invoke(ctx)
        self.app_thread = utils.run_in_thread(self.app.run)
        time.sleep(5)

        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')
        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',
                                            username='binux', password='4321')

    @classmethod
    def tearDownClass(self):
        for each in self.ctx.obj.instances:
            each.quit()
        self.app_thread.join()
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def test_10_ls(self):
        self.assertEqual(len(self.webdav.ls()), 1)

    def test_20_create_error(self):
        import easywebdav
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.upload(inspect.getsourcefile(data_sample_handler),
                               'bad_file_name')
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.upload(inspect.getsourcefile(data_sample_handler),
                               'bad.file.name')

    def test_30_create_ok(self):
        self.webdav.upload(inspect.getsourcefile(data_handler), 'handler.py')
        self.webdav.upload(inspect.getsourcefile(data_sample_handler), 'sample_handler.py')
        self.assertEqual(len(self.webdav.ls()), 3)

    def test_40_get_404(self):
        io = BytesIO()
        import easywebdav
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.download('not_exitst', io)
        io.close()

    def test_50_get(self):
        io = BytesIO()
        self.webdav.download('handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))
        io.close()

        io = BytesIO()
        self.webdav.download('sample_handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_sample_handler)), utils.text(io.getvalue()))
        io.close()

    def test_60_edit(self):
        self.webdav.upload(inspect.getsourcefile(data_handler), 'sample_handler.py')

    def test_70_get(self):
        io = BytesIO()
        self.webdav.download('sample_handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))
        io.close()

    def test_80_password(self):
        import requests
        rv = requests.post('http://localhost:5000/update', data={
            'name': 'group',
            'value': 'lock',
            'pk': 'sample_handler',
        })
        self.assertEqual(rv.status_code, 200)

        import easywebdav
        with self.assertRaises(easywebdav.OperationFailed):
            self.webdav.upload(inspect.getsourcefile(data_sample_handler), 'sample_handler.py')
        self.webdav_up.upload(inspect.getsourcefile(data_sample_handler), 'sample_handler.py')


@unittest.skipIf(sys.version_info >= (3, 6), "easywebdav doesn't support python 3.6")
class TestWebDavNeedAuth(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        import easywebdav

        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=utils.ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.webui.make_context('webui', [
            '--username', 'binux',
            '--password', '4321',
            '--need-auth',
        ], self.ctx)
        self.app = run.webui.invoke(ctx)
        self.app_thread = utils.run_in_thread(self.app.run)
        time.sleep(5)

        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')
        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',
                                            username='binux', password='4321')

    @classmethod
    def tearDownClass(self):
        for each in self.ctx.obj.instances:
            each.quit()
        self.app_thread.join()
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def test_10_ls(self):
        import easywebdav
        with self.assertRaises(easywebdav.OperationFailed):
            self.assertEqual(len(self.webdav.ls()), 1)
        self.assertEqual(len(self.webdav_up.ls()), 1)

    def test_30_create_ok(self):
        self.webdav_up.upload(inspect.getsourcefile(data_handler), 'handler.py')
        self.assertEqual(len(self.webdav_up.ls()), 2)

    def test_50_get(self):
        import easywebdav
        with self.assertRaises(easywebdav.OperationFailed):
            io = BytesIO()
            self.webdav.download('handler.py', io)
            io.close()

        io = BytesIO()
        self.webdav_up.download('handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))
        io.close()


================================================
FILE: tests/test_webui.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2014-11-18 21:03:22

import os
import re
import time
import json
import shutil
import unittest

from pyspider import run
from pyspider.libs import utils
from pyspider.libs.utils import run_in_thread, ObjectDict


class TestWebUI(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        import tests.data_test_webpage
        import httpbin
        from pyspider.webui import bench_test  # flake8: noqa
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        self.threads = []

        ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        self.scheduler = scheduler = run.scheduler.invoke(ctx)
        self.threads.append(run_in_thread(scheduler.xmlrpc_run))
        self.threads.append(run_in_thread(scheduler.run))

        ctx = run.fetcher.make_context('fetcher', [
            '--xmlrpc-port', '24444',
        ], self.ctx)
        fetcher = run.fetcher.invoke(ctx)
        self.threads.append(run_in_thread(fetcher.xmlrpc_run))
        self.threads.append(run_in_thread(fetcher.run))

        ctx = run.processor.make_context('processor', [], self.ctx)
        processor = run.processor.invoke(ctx)
        self.threads.append(run_in_thread(processor.run))

        ctx = run.result_worker.make_context('result_worker', [], self.ctx)
        result_worker = run.result_worker.invoke(ctx)
        self.threads.append(run_in_thread(result_worker.run))

        ctx = run.webui.make_context('webui', [
            '--scheduler-rpc', 'http://localhost:23333/'
        ], self.ctx)
        app = run.webui.invoke(ctx)
        app.debug = True
        self.app = app.test_client()
        self.rpc = app.config['scheduler_rpc']

        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        for each in self.ctx.obj.instances:
            each.quit()
        time.sleep(1)

        for thread in self.threads:
            thread.join()

        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        shutil.rmtree('./data/tests', ignore_errors=True)

    def test_10_index_page(self):
        rv = self.app.get('/')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'dashboard', rv.data)

    def test_20_debug(self):
        rv = self.app.get('/debug/test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'debugger', rv.data)
        self.assertIn(b'var task_content = ', rv.data)
        self.assertIn(b'var script_content = ', rv.data)

        m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('test_project', json.loads(m.group(1)))

        m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('__START_URL__', json.loads(m.group(1)))

    def test_25_debug_post(self):
        rv = self.app.post('/debug/test_project', data={
            'project-name': 'other_project',
            'start-urls': 'http://127.0.0.1:14887/pyspider/test.html',
            'script-mode': 'script',
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'debugger', rv.data)
        self.assertIn(b'var task_content = ', rv.data)
        self.assertIn(b'var script_content = ', rv.data)

        m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('test_project', m.group(1))
        self.__class__.task_content = json.loads(m.group(1))

        m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('127.0.0.1:14887', m.group(1))
        self.__class__.script_content = json.loads(m.group(1))

    def test_30_run(self):
        rv = self.app.post('/debug/test_project/run', data={
            'script': self.script_content,
            'task': self.task_content
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertIn(b'follows', rv.data)
        self.assertGreater(len(data['follows']), 0)
        self.__class__.task_content2 = data['follows'][0]

    def test_32_run_bad_task(self):
        rv = self.app.post('/debug/test_project/run', data={
            'script': self.script_content,
            'task': self.task_content+'asdfasdf312!@#'
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data['logs']), 0)
        self.assertEqual(len(data['follows']), 0)

    def test_33_run_bad_script(self):
        rv = self.app.post('/debug/test_project/run', data={
            'script': self.script_content+'adfasfasdf',
            'task': self.task_content
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data['logs']), 0)
        self.assertEqual(len(data['follows']), 0)

    def test_35_run_http_task(self):
        rv = self.app.post('/debug/test_project/run', data={
            'script': self.script_content,
            'task': json.dumps(self.task_content2)
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertIn('follows', data)

    def test_40_save(self):
        rv = self.app.post('/debug/test_project/save', data={
            'script': self.script_content,
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

    def test_42_get(self):
        rv = self.app.get('/debug/test_project/get')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertIn('script', data)
        self.assertEqual(data['script'], self.script_content)

    def test_45_run_with_saved_script(self):
        rv = self.app.post('/debug/test_project/run', data={
            'webdav_mode': 'true',
            'script': '',
            'task': self.task_content
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertIn(b'follows', rv.data)
        self.assertGreater(len(data['follows']), 0)
        self.__class__.task_content2 = data['follows'][0]

    def test_50_index_page_list(self):
        rv = self.app.get('/')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'"test_project"', rv.data)

    def test_52_change_status(self):
        rv = self.app.post('/update', data={
            'name': 'status',
            'value': 'RUNNING',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

    def test_55_reopen(self):
        rv = self.app.get('/debug/test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'debugger', rv.data)

    def test_57_resave(self):
        rv = self.app.post('/debug/test_project/save', data={
            'script': self.script_content,
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

    def test_58_index_page_list(self):
        rv = self.app.get('/')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'CHECKING', rv.data)

    def test_60_change_rate(self):
        rv = self.app.post('/update', data={
            'name': 'rate',
            'value': '1/4',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

    def test_70_change_status(self):
        rv = self.app.post('/update', data={
            'name': 'status',
            'value': 'RUNNING',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

    def test_80_change_group(self):
        rv = self.app.post('/update', data={
            'name': 'group',
            'value': 'test_binux',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

        rv = self.app.get('/')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'test_binux', rv.data)

    def test_90_run(self):
        time.sleep(0.5)
        rv = self.app.post('/run', data={
            'project': 'test_project',
        })
        self.assertEqual(rv.status_code, 200)
        self.assertEqual(json.loads(utils.text(rv.data))['result'], True)

    def test_a10_counter(self):
        for i in range(30):
            time.sleep(1)
            if self.rpc.counter('5m', 'sum')\
                    .get('test_project', {}).get('success', 0) > 5:
                break

        rv = self.app.get('/counter')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data['test_project']['5m']['success'], 3)
        self.assertGreater(data['test_project']['1h']['success'], 3)
        self.assertGreater(data['test_project']['1d']['success'], 3)
        self.assertGreater(data['test_project']['all']['success'], 3)

    def test_a15_queues(self):
        rv = self.app.get('/queues')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertIn('scheduler2fetcher', data)
        self.assertIn('fetcher2processor', data)
        self.assertIn('processor2result', data)
        self.assertIn('newtask_queue', data)
        self.assertIn('status_queue', data)

    def test_a20_tasks(self):
        rv = self.app.get('/tasks')
        self.assertEqual(rv.status_code, 200, rv.data)
        self.assertIn(b'SUCCESS</span>', rv.data)
        self.assertNotIn(b'>ERROR</span>', rv.data)
        m = re.search(r'/task/test_project:[^"]+', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.task_url = m.group(0)
        self.assertIsNotNone(self.task_url)
        m = re.search(r'/debug/test_project[^"]+', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.debug_task_url = m.group(0)
        self.assertIsNotNone(self.debug_task_url)

        rv = self.app.get('/tasks?project=test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'SUCCESS</span>', rv.data)
        self.assertNotIn(b'>ERROR</span>', rv.data)

    def test_a22_active_tasks(self):
        rv = self.app.get('/active_tasks')
        data = json.loads(utils.text(rv.data))
        track = False
        self.assertGreater(len(data), 0)
        for task in data:
            for k in ('taskid', 'project', 'url', 'updatetime'):
                self.assertIn(k, task)
            if task.get('track'):
                track = True
                self.assertIn('fetch', task['track'])
                self.assertIn('ok', task['track']['fetch'])
                self.assertIn('time', task['track']['fetch'])
                self.assertIn('process', task['track'])
                self.assertIn('ok', task['track']['process'])
                self.assertIn('time', task['track']['process'])
        self.assertTrue(track)
                    

    def test_a24_task(self):
        rv = self.app.get(self.task_url)
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'lastcrawltime', rv.data)

    def test_a25_task_json(self):
        rv = self.app.get(self.task_url + '.json')
        self.assertEqual(rv.status_code, 200)
        self.assertIn('status_string', json.loads(utils.text(rv.data)))

    def test_a26_debug_task(self):
        rv = self.app.get(self.debug_task_url)
        self.assertEqual(rv.status_code, 200)

    def test_a30_results(self):
        rv = self.app.get('/results?project=test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'<th>url</th>', rv.data)
        self.assertIn(b'open-url', rv.data)

    def test_a30_export_json(self):
        rv = self.app.get('/results/dump/test_project.json')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'"taskid":', rv.data)

    def test_a32_export_json_style_full(self):
        rv = self.app.get('/results/dump/test_project.json?style=full')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(rv.data.decode('utf8'))
        self.assertGreater(len(data), 1)

    def test_a34_export_json_style_full_limit_1(self):
        rv = self.app.get('/results/dump/test_project.json?style=full&limit=1&offset=1')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(rv.data.decode('utf8'))
        self.assertEqual(len(data), 1)

    def test_a40_export_url_json(self):
        rv = self.app.get('/results/dump/test_project.txt')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'"url":', rv.data)

    def test_a50_export_csv(self):
        rv = self.app.get('/results/dump/test_project.csv')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'url,title,url', rv.data)

    def test_a60_fetch_via_cannot_connect_fetcher(self):
        ctx = run.webui.make_context('webui', [
            '--fetcher-rpc', 'http://localhost:20000/',
        ], self.ctx)
        app = run.webui.invoke(ctx)
        app = app.test_client()
        rv = app.post('/debug/test_project/run', data={
            'script': self.script_content,
            'task': self.task_content
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data['logs']), 0)
        self.assertEqual(len(data['follows']), 0)

    def test_a70_fetch_via_fetcher(self):
        ctx = run.webui.make_context('webui', [
            '--fetcher-rpc', 'http://localhost:24444/',
        ], self.ctx)
        app = run.webui.invoke(ctx)
        app = app.test_client()
        rv = app.post('/debug/test_project/run', data={
            'script': self.script_content,
            'task': self.task_content
        })
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertEqual(len(data['logs']), 0, data['logs'])
        self.assertIn(b'follows', rv.data)
        self.assertGreater(len(data['follows']), 0)

    def test_h000_auth(self):
        ctx = run.webui.make_context('webui', [
            '--scheduler-rpc', 'http://localhost:23333/',
            '--username', 'binux',
            '--password', '4321',
        ], self.ctx)
        app = run.webui.invoke(ctx)
        self.__class__.app = app.test_client()
        self.__class__.rpc = app.config['scheduler_rpc']

    def test_h005_no_such_project(self):
        rv = self.app.post('/update', data={
            'name': 'group',
            'value': 'lock',
            'pk': 'not_exist_project'
        })
        self.assertEqual(rv.status_code, 404)

    def test_h005_unknown_field(self):
        rv = self.app.post('/update', data={
            'name': 'unknown_field',
            'value': 'lock',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 400)

    def test_h005_rate_wrong_format(self):
        rv = self.app.post('/update', data={
            'name': 'rate',
            'value': 'xxx',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 400)

    def test_h010_change_group(self):
        rv = self.app.post('/update', data={
            'name': 'group',
            'value': 'lock',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ok', rv.data)

        rv = self.app.get('/')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'lock', rv.data)

    def test_h020_change_group_lock_failed(self):
        rv = self.app.post('/update', data={
            'name': 'group',
            'value': '',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 401)

    def test_h020_change_group_lock_ok(self):
        rv = self.app.post('/update', data={
            'name': 'group',
            'value': 'test_binux',
            'pk': 'test_project'
        }, headers={
            'Authorization': 'Basic YmludXg6NDMyMQ=='
        })
        self.assertEqual(rv.status_code, 200)

    def test_h030_need_auth(self):
        ctx = run.webui.make_context('webui', [
            '--scheduler-rpc', 'http://localhost:23333/',
            '--username', 'binux',
            '--password', '4321',
            '--need-auth',
        ], self.ctx)
        app = run.webui.invoke(ctx)
        self.__class__.app = app.test_client()
        self.__class__.rpc = app.config['scheduler_rpc']

    def test_h040_auth_fail(self):
        rv = self.app.get('/')
        self.assertEqual(rv.status_code, 401)

    def test_h050_auth_fail2(self):
        rv = self.app.get('/', headers={
            'Authorization': 'Basic Ymlasdfsd'
        })
        self.assertEqual(rv.status_code, 401)

    def test_h060_auth_fail3(self):
        rv = self.app.get('/', headers={
            'Authorization': 'Basic YmludXg6MQ=='
        })
        self.assertEqual(rv.status_code, 401)

    def test_h070_auth_ok(self):
        rv = self.app.get('/', headers={
            'Authorization': 'Basic YmludXg6NDMyMQ=='
        })
        self.assertEqual(rv.status_code, 200)

    def test_x0_disconnected_scheduler(self):
        ctx = run.webui.make_context('webui', [
            '--scheduler-rpc', 'http://localhost:23458/'
        ], self.ctx)
        app = run.webui.invoke(ctx)
        self.__class__.app = app.test_client()
        self.__class__.rpc = app.config['scheduler_rpc']

    def test_x10_project_update(self):
        rv = self.app.post('/update', data={
            'name': 'status',
            'value': 'RUNNING',
            'pk': 'test_project'
        })
        self.assertEqual(rv.status_code, 200)
        self.assertNotIn(b'ok', rv.data)

    def test_x20_counter(self):
        rv = self.app.get('/counter?time=5m&type=sum')
        self.assertEqual(rv.status_code, 200)
        self.assertEqual(json.loads(utils.text(rv.data)), {})

    def test_x30_run_not_exists_project(self):
        rv = self.app.post('/run', data={
            'project': 'not_exist_project',
        })
        self.assertEqual(rv.status_code, 404)

    def test_x30_run(self):
        rv = self.app.post('/run', data={
            'project': 'test_project',
        })
        self.assertEqual(rv.status_code, 200)
        self.assertEqual(json.loads(utils.text(rv.data))['result'], False)

    def test_x40_debug_save(self):
        rv = self.app.post('/debug/test_project/save', data={
            'script': self.script_content,
        })
        self.assertEqual(rv.status_code, 200)
        self.assertNotIn(b'ok', rv.data)

    def test_x50_tasks(self):
        rv = self.app.get('/tasks')
        self.assertEqual(rv.status_code, 502)

    def test_x60_robots(self):
        rv = self.app.get('/robots.txt')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'ser-agent', rv.data)

    def test_x70_bench(self):
        rv = self.app.get('/bench?total=10&show=5')
        self.assertEqual(rv.status_code, 200)


================================================
FILE: tests/test_xmlrpc.py
================================================
#   Copyright (c) 2006-2007 Open Source Applications Foundation
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#
#   Origin: https://code.google.com/p/wsgi-xmlrpc/

import unittest
import tornado.wsgi
import tornado.ioloop
import tornado.httpserver
from pyspider.libs import utils

class TestXMLRPCServer(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from pyspider.libs import wsgi_xmlrpc
        
        def test_1():
            return 'test_1'
            
        class Test2(object):
            def test_3(self, obj):
                return obj
                
        test = Test2()
        
        application = wsgi_xmlrpc.WSGIXMLRPCApplication()
        application.register_instance(Test2())
        application.register_function(test_1)

        container = tornado.wsgi.WSGIContainer(application)
        self.io_loop = tornado.ioloop.IOLoop.current()
        http_server = tornado.httpserver.HTTPServer(container, io_loop=self.io_loop)
        http_server.listen(3423)
        self.thread = utils.run_in_thread(self.io_loop.start)

    @classmethod
    def tearDownClass(self):
        self.io_loop.add_callback(self.io_loop.stop)
        self.thread.join()
    
    def test_xmlrpc_server(self, uri='http://127.0.0.1:3423'):
        from six.moves.xmlrpc_client import ServerProxy
        
        client = ServerProxy(uri)
        
        assert client.test_1() == 'test_1'
        assert client.test_3({'asdf':4}) == {'asdf':4}


================================================
FILE: tools/migrate.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<roy@binux.me>
#         http://binux.me
# Created on 2015-09-30 23:22:46

import click
import logging
from pyspider.database.base.projectdb import ProjectDB
from pyspider.database.base.taskdb import TaskDB
from pyspider.database.base.resultdb import ResultDB
from pyspider.database import connect_database
from pyspider.libs.utils import unicode_obj
from multiprocessing.pool import ThreadPool as Pool

logging.getLogger().setLevel(logging.INFO)


def taskdb_migrating(project, from_connection, to_connection):
    logging.info("taskdb: %s", project)
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    t.drop(project)
    for status in range(1, 5):
        for task in f.load_tasks(status, project=project):
            t.insert(project, task['taskid'], task)


def resultdb_migrating(project, from_connection, to_connection):
    logging.info("resultdb: %s", project)
    f = connect_database(from_connection)
    t = connect_database(to_connection)
    t.drop(project)
    for result in f.select(project):
        t.save(project, result['taskid'], result['url'], result['result'])


@click.command()
@click.option('--pool', default=10, help='cocurrent worker size.')
@click.argument('from_connection', required=1)
@click.argument('to_connection', required=1)
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),
            f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),
            f.projects)


if __name__ == '__main__':
    migrate()


================================================
FILE: tox.ini
================================================
[tox]
envlist = py35,py36,py37,py38
[testenv]
install_command = 
    pip install --allow-all-external 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1'  {opts} -e .[all,test] {packages}
commands =
    python setup.py test []