[
  {
    "path": ".coveragerc",
    "content": "[run]\nsource =\n    pyspider\nparallel = True\n\n[report]\nomit =\n    pyspider/libs/sample_handler.py\n    pyspider/libs/pprint.py\n\nexclude_lines =\n    pragma: no cover\n    def __repr__\n    if self.debug:\n    if settings.DEBUG\n    raise AssertionError\n    raise NotImplementedError\n    if 0:\n    if __name__ == .__main__.:\n    except ImportError:\n    pass\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE.md",
    "content": "<!--\nThanks for using pyspider!\n\n如果你需要使用中文提问，请将问题提交到 https://segmentfault.com/t/pyspider\n-->\n\n* pyspider version:\n* Operating system:\n* Start up command:\n\n### Expected behavior\n\n<!-- What do you think should happen? -->\n\n### Actual behavior\n\n<!-- What actually happens? -->\n\n### How to reproduce\n\n<!-- \n\nThe best chance of getting help is providing enough information to reproduce the issue you have.\n\nIf it's related to API or extraction behavior, please paste the script of your project.\nIf it's related to scheduling of the whole project, please paste a screenshot of the queue status at the top of the dashboard.\n\n-->\n"
  },
  {
    "path": ".gitignore",
    "content": "*.py[cod]\ndata/*\n.venv\n.idea\n# C extensions\n*.so\n\n# Packages\n*.egg\n*.egg-info\ndist\nbuild\neggs\nparts\nbin\nvar\nsdist\ndevelop-eggs\n.installed.cfg\nlib\nlib64\n__pycache__\n\n# Installer logs\npip-log.txt\n\n# Unit test / coverage reports\n.coverage\n.tox\nnosetests.xml\n\n# Translations\n*.mo\n\n# Mr Developer\n.mr.developer.cfg\n.project\n.pydevproject\n.idea\n"
  },
  {
    "path": ".travis.yml",
    "content": "language: python\ncache: pip\npython:\n  - 3.5\n  - 3.6\n  - 3.7\n  #- 3.8\nservices:\n    - docker\n    - mongodb\n    - rabbitmq\n    - redis\n    - mysql\n    # - elasticsearch\n    - postgresql\naddons:\n  postgresql: \"9.4\"\n  apt:\n    packages:\n    - rabbitmq-server\nenv:\n    - IGNORE_COUCHDB=1\n\nbefore_install:\n    - sudo apt-get update -qq\n    - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart\n    - npm install express puppeteer\n    - sudo docker pull scrapinghub/splash\n    - sudo docker run -d --net=host scrapinghub/splash\nbefore_script:\n    - psql -c \"CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;\" -U postgres\n    - psql -c \"CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;\" -U postgres\n    - psql -c \"CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;\" -U postgres\n    - sleep 10\ninstall:\n    - pip install https://github.com/marcus67/easywebdav/archive/master.zip\n    - sudo apt-get install libgnutls28-dev\n    - pip install -e .[all,test]\n    - pip install coveralls\nscript:\n    - coverage run setup.py test\nafter_success:\n    - coverage combine\n    - coveralls\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM python:3.6\nMAINTAINER binux <roy@binux.me>\n\n# install phantomjs\nRUN mkdir -p /opt/phantomjs \\\n        && cd /opt/phantomjs \\\n        && wget -O phantomjs.tar.bz2 https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \\\n        && tar xavf phantomjs.tar.bz2 --strip-components 1 \\\n        && ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \\\n        && rm phantomjs.tar.bz2\n# Fix Error: libssl_conf.so: cannot open shared object file: No such file or directory\nENV OPENSSL_CONF=/etc/ssl/\n\n# install nodejs\nENV NODEJS_VERSION=8.15.0 \\\n    PATH=$PATH:/opt/node/bin\nWORKDIR \"/opt/node\"\nRUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \\\n    curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \\\n    rm -rf /var/lib/apt/lists/*\nRUN npm install puppeteer express\n\n# install requirements\nCOPY requirements.txt /opt/pyspider/requirements.txt\nRUN pip install -r /opt/pyspider/requirements.txt\n\n# add all repo\nADD ./ /opt/pyspider\n\n# run test\nWORKDIR /opt/pyspider\nRUN pip install -e .[all]\n\n# Create a symbolic link to node_modules\nRUN ln -s /opt/node/node_modules ./node_modules\n\n#VOLUME [\"/opt/pyspider\"]\nENTRYPOINT [\"pyspider\"]\n\nEXPOSE 5000 23333 24444 25555 22222\n"
  },
  {
    "path": "LICENSE",
    "content": "Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived 
from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"{}\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2014 Binux\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include README.md\ninclude requirements.txt\ninclude Dockerfile\ninclude LICENSE\ninclude pyspider/logging.conf\ninclude pyspider/webui/static/*\ninclude pyspider/webui/templates/*\n"
  },
  {
    "path": "README.md",
    "content": "pyspider [![Build Status]][Travis CI] [![Coverage Status]][Coverage]\n========\n\nA Powerful Spider(Web Crawler) System in Python.\n\n- Write script in Python\n- Powerful WebUI with script editor, task monitor, project manager and result viewer\n- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend\n- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue\n- Task priority, retry, periodical, recrawl by age, etc...\n- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...\n\nTutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/)  \nDocumentation: [http://docs.pyspider.org/](http://docs.pyspider.org/)  \nRelease notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases)  \n\nSample Code \n-----------\n\n```python\nfrom pyspider.libs.base_handler import *\n\n\nclass Handler(BaseHandler):\n    crawl_config = {\n    }\n\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl('http://scrapy.org/', callback=self.index_page)\n\n    @config(age=10 * 24 * 60 * 60)\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            self.crawl(each.attr.href, callback=self.detail_page)\n\n    def detail_page(self, response):\n        return {\n            \"url\": response.url,\n            \"title\": response.doc('title').text(),\n        }\n```\n\n\nInstallation\n------------\n\n* `pip install pyspider`\n* run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/)\n\n**WARNING:** WebUI is open to the public by default, it can be used to 
execute any command which may harm your system. Please use it in an internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config).\n\nQuickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/)\n\nContribute\n----------\n\n* Use It\n* Open [Issue], send PR\n* [User Group]\n* [中文问答](http://segmentfault.com/t/pyspider)\n\n\nTODO\n----\n\n### v0.4.0\n\n- [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia)\n\n\nLicense\n-------\nLicensed under the Apache License, Version 2.0\n\n\n[Build Status]:         https://img.shields.io/travis/binux/pyspider/master.svg?style=flat\n[Travis CI]:            https://travis-ci.org/binux/pyspider\n[Coverage Status]:      https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat\n[Coverage]:             https://coveralls.io/r/binux/pyspider\n[Try]:                  https://img.shields.io/badge/try-pyspider-blue.svg?style=flat\n[Issue]:                https://github.com/binux/pyspider/issues\n[User Group]:           https://groups.google.com/group/pyspider-users\n"
  },
  {
    "path": "config_example.json",
    "content": "{\n  \"taskdb\": \"couchdb+taskdb://user:password@couchdb:5984\",\n  \"projectdb\": \"couchdb+projectdb://user:password@couchdb:5984\",\n  \"resultdb\": \"couchdb+resultdb://user:password@couchdb:5984\",\n  \"message_queue\": \"amqp://rabbitmq:5672/%2F\",\n  \"webui\": {\n    \"username\": \"username\",\n    \"password\": \"password\",\n    \"need-auth\": true,\n    \"scheduler-rpc\": \"http://scheduler:23333\",\n    \"fetcher-rpc\": \"http://fetcher:24444\"\n  }\n}\n"
  },
  {
    "path": "docker-compose.yaml",
    "content": "version: \"3.7\"\n\n# replace /path/to/dir/ to point to config.json\n\n# The RabbitMQ and CouchDB services can take some time to startup.\n# During this time most of the pyspider services will exit and restart.\n# Once RabbitMQ and CouchDB are fully up and running everything should run as normal.\n\nservices:\n  rabbitmq:\n    image: rabbitmq:alpine\n    container_name: rabbitmq\n    networks:\n      - pyspider\n    command: rabbitmq-server\n  mysql:\n    image: mysql:latest\n    container_name: mysql\n    volumes:\n      - /tmp:/var/lib/mysql\n    environment:\n      - MYSQL_ALLOW_EMPTY_PASSWORD=yes\n    networks:\n      - pyspider\n  phantomjs:\n    image: pyspider:latest\n    container_name: phantomjs\n    networks:\n      - pyspider\n    volumes:\n      - ./config_example.json:/opt/pyspider/config.json\n    command: -c config.json phantomjs\n    depends_on:\n      - couchdb\n      - rabbitmq\n    restart: unless-stopped\n  result:\n    image: pyspider:latest\n    container_name: result\n    networks:\n      - pyspider\n    volumes:\n      - ./config_example.json:/opt/pyspider/config.json\n    command: -c config.json result_worker\n    depends_on:\n      - couchdb\n      - rabbitmq\n    restart: unless-stopped # Sometimes we'll get a connection refused error because couchdb has yet to fully start\n  processor:\n    container_name: processor\n    image: pyspider:latest\n    networks:\n      - pyspider\n    volumes:\n      - ./config_example.json:/opt/pyspider/config.json\n    command: -c config.json processor\n    depends_on:\n      - couchdb\n      - rabbitmq\n    restart: unless-stopped\n  fetcher:\n    image: pyspider:latest\n    container_name: fetcher\n    networks:\n      - pyspider\n    volumes:\n      - ./config_example.json:/opt/pyspider/config.json\n    command : -c config.json fetcher\n    depends_on:\n      - couchdb\n      - rabbitmq\n    restart: unless-stopped\n  scheduler:\n    image: pyspider:latest\n    container_name: 
scheduler\n    networks:\n      - pyspider\n    volumes:\n      - ./config_example.json:/opt/pyspider/config.json\n    command: -c config.json scheduler\n    depends_on:\n      - couchdb\n      - rabbitmq\n    restart: unless-stopped\n  webui:\n    image: pyspider:latest\n    container_name: webui\n    ports:\n      - \"5050:5000\"\n    networks:\n      - pyspider\n    volumes:\n      - ./config_example.json:/opt/pyspider/config.json\n    command: -c config.json webui\n    depends_on:\n      - couchdb\n      - rabbitmq\n    restart: unless-stopped\n\nnetworks:\n  pyspider:\n    external:\n      name: pyspider\n  default:\n    driver: bridge\n"
  },
  {
    "path": "docs/About-Projects.md",
    "content": "About Projects\n==============\n\nIn most cases, a project is one script you write for one website.\n\n* Projects are independent, but you can import another project as a module with `from projects import other_project`\n* A project has 5 statuses: `TODO`, `STOP`, `CHECKING`, `DEBUG` and `RUNNING`\n    - `TODO` - a script is just created to be written\n    - `STOP` - you can mark a project as `STOP` if you want it to STOP (= =).\n    - `CHECKING` - when a running project is modified, to prevent incomplete modification, project status will be set as `CHECKING` automatically.\n    - `DEBUG`/`RUNNING` - these two statuses make no difference to the spider. But it's good to mark it as `DEBUG` when it's running the first time then change it to `RUNNING` after being checked.\n* The crawl rate is controlled by `rate` and `burst` with [token-bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm.\n    - `rate` - how many requests in one second\n    - `burst` - consider this situation, `rate/burst = 0.1/3`, it means that the spider crawls 1 page every 10 seconds. All tasks are finished, project is checking last updated items every minute. Assume that 3 new items are found, pyspider will \"burst\" and crawl 3 tasks without waiting 3*10 seconds. 
However, the fourth task needs to wait 10 seconds.\n* To delete a project, set `group` to `delete` and status to `STOP`, wait 24 hours.\n\n\n`on_finished` callback\n--------------------\nYou can override `on_finished` method in the project, the method would be triggered when the task_queue goes to 0.\n\nExample 1: When you start a project to crawl a website with 100 pages, the `on_finished` callback will be fired when 100 pages are successfully crawled or failed after retries.\n\nExample 2: A project with `auto_recrawl` tasks will **NEVER** trigger the `on_finished` callback, because time queue will never become 0 when there are auto_recrawl tasks in it.\n\nExample 3: A project with `@every` decorated method will trigger the `on_finished` callback every time when the newly submitted tasks are finished.\n"
  },
  {
    "path": "docs/About-Tasks.md",
    "content": "About Tasks\n===========\n\nTasks are the basic unit to be scheduled.\n\nBasis\n-----\n\n* A task is differentiated by its `taskid`. (Default: `md5(url)`, can be changed by overriding the `def get_taskid(self, task)` method)\n* Tasks are isolated between different projects.\n* A task has 4 statuses:\n    - active\n    - failed\n    - success\n    - bad - not used\n* Only tasks in active status will be scheduled.\n* Tasks are served in order of `priority`.\n\nSchedule\n--------\n\n#### new task\n\nWhen a new task (never seen before) comes in:\n\n* If `exetime` is set but not arrived, it will be put into a time-based queue to wait.\n* Otherwise it will be accepted.\n\nWhen the task is already in the queue:\n\n* Ignored unless `force_update`\n\nWhen a completed task comes out:\n\n* If `age` is set and `last_crawl_time + age < now`, it will be accepted. Otherwise discarded.\n* If `itag` is set and not equal to its previous value, it will be accepted. Otherwise discarded.\n\n\n#### task retry\n\nWhen a fetch error or script error happens, the task will retry 3 times by default.\n\nThe retries are executed after 30 seconds, 1 hour, 6 hours and 12 hours respectively, and any further retries are postponed by 24 hours.\n\nIf `age` is specified, the retry delay will not be larger than `age`.\n\nYou can configure the retry delay by adding a variable named `retry_delay` to the handler. `retry_delay` is a dict to specify retry intervals. The items in the dict are {retried: seconds}, and a special key: '' (empty string) is used to specify the default retry delay if not specified.\n\ne.g. the default `retry_delay` is declared like:\n\n\n```\nclass MyHandler(BaseHandler):\n    retry_delay = {\n        0: 30,\n        1: 1*60*60,\n        2: 6*60*60,\n        3: 12*60*60,\n        '': 24*60*60\n    }\n```\n"
  },
  {
    "path": "docs/Architecture.md",
    "content": "Architecture\n============\n\nThis document describes the reason why I made pyspider and the architecture.\n\nWhy\n---\nTwo years ago, I was working on a vertical search engine. We are facing following needs on crawling:\n\n1. collect 100-200 websites, they may on/offline or change their templates at any time\n> We need a really powerful monitor to find out which website is changing. And a good tool to help us write script/template for each website.\n\n2. data should be collected in 5min when website updated\n> We solve this problem by check index page frequently, and use something like 'last update time' or 'last reply time' to determine which page is changed. In addition to this, we recheck pages after X days in case to prevent the omission.  \n> **pyspider will never stop as WWW is changing all the time**\n\nFurthermore, we have some APIs from our cooperators, the API may need POST, proxy, request signature etc. Full control from script is more convenient than some global parameters of components.\n\nOverview\n--------\nThe following diagram shows an overview of the pyspider architecture with its components and an outline of the data flow that takes place inside the system.\n\n![pyspider](imgs/pyspider-arch.png)\n\nComponents are connected by message queue. Every component, including message queue, is running in their own process/thread, and replaceable. That means, when process is slow, you can have many instances of processor and make full use of multiple CPUs, or deploy to multiple machines. This architecture makes pyspider really fast. [benchmarking](https://gist.github.com/binux/67b276c51e988f8e2c31#comment-1339242).\n\nComponents\n----------\n\n### Scheduler\nThe Scheduler receives tasks from newtask_queue from processor. Decide whether the task is new or requires re-crawl. Sort tasks according to priority and feeding them to fetcher with traffic control ([token bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm). 
Take care of periodic tasks, lost tasks and failed tasks and retry later.\n\nAll of the above can be set via `self.crawl` [API](apis/). \n\nNote that in the current implementation of the scheduler, only one scheduler is allowed.\n\n### Fetcher\nThe Fetcher is responsible for fetching web pages and then sending the results to the processor. For flexibility, the fetcher supports [Data URI](http://en.wikipedia.org/wiki/Data_URI_scheme) and pages that are rendered by JavaScript (via [phantomjs](http://phantomjs.org/)). Fetch method, headers, cookies, proxy, etag etc can be controlled by script via [API](apis/self.crawl/#fetch).\n\n### Phantomjs Fetcher\nPhantomjs Fetcher works like a proxy. It's connected to general Fetcher, fetch and render pages with JavaScript enabled, output a general HTML back to Fetcher:\n\n```\nscheduler -> fetcher -> processor\n                |\n            phantomjs\n                |\n             internet\n```\n\n### Processor\nThe Processor is responsible for running the script written by users to parse and extract information. Your script is running in an unlimited environment. Although we have various tools(like [PyQuery](https://pythonhosted.org/pyquery/)) for you to extract information and links, you can use anything you want to deal with the response. You may refer to [Script Environment](Script-Environment) and [API Reference](apis/) to get more information about script.\n\nProcessor will capture the exceptions and logs, send status(task track) and new tasks to `scheduler`, send results to `Result Worker`.\n\n### Result Worker (optional)\nResult worker receives results from `Processor`. Pyspider has a built-in result worker to save result to `resultdb`. Overwrite it to deal with result by your needs.\n\n### WebUI\nWebUI is a web frontend for everything. It contains:\n\n* script editor, debugger\n* project manager\n* task monitor\n* result viewer, exporter\n\nMaybe webui is the most attractive part of pyspider. 
With this powerful UI, you can debug your scripts step by step just as pyspider does. You can start or stop a project, find which project is going wrong and which request failed, and try it again with the debugger.\n\nData flow\n---------\nThe data flow in pyspider is just as you have seen in the diagram above:\n\n1. Each script has a callback named `on_start`. When you press the `Run` button on WebUI, a new task of `on_start` is submitted to Scheduler as the entry of the project.\n2. Scheduler dispatches this `on_start` task with a Data URI as a normal task to Fetcher.\n3. Fetcher makes a request and a response to it (for Data URI, it's a fake request and response, but has no difference with other normal tasks), then feeds to Processor.\n4. Processor calls the `on_start` method and generates some new URLs to crawl. Processor sends a message to Scheduler that this task is finished and new tasks via message queue to Scheduler (there are no results for `on_start` in most cases. If there are results, Processor sends them to `result_queue`).\n5. Scheduler receives the new tasks, looks them up in the database, determines whether each task is new or requires re-crawl, and if so, puts them into the task queue. Tasks are dispatched in order.\n6. The process repeats (from step 3) and wouldn't stop till WWW is dead ;-). Scheduler will check periodic tasks to crawl latest data.\n"
  },
  {
    "path": "docs/Command-Line.md",
    "content": "Command Line\n============\n\nGlobal Config\n-------------\n\nYou can get command help via `pyspider --help` and `pyspider all --help` for subcommand help.\n\nglobal options work for all subcommands.\n\n```\nUsage: pyspider [OPTIONS] COMMAND [ARGS]...\n\n  A powerful spider system in python.\n\nOptions:\n  -c, --config FILENAME    a json file with default values for subcommands.\n                           {“webui”: {“port”:5001}}\n  --logging-config TEXT    logging config file for built-in python logging\n                           module  [default: pyspider/pyspider/logging.conf]\n  --debug                  debug mode\n  --queue-maxsize INTEGER  maxsize of queue\n  --taskdb TEXT            database url for taskdb, default: sqlite\n  --projectdb TEXT         database url for projectdb, default: sqlite\n  --resultdb TEXT          database url for resultdb, default: sqlite\n  --message-queue TEXT     connection url to message queue, default: builtin\n                           multiprocessing.Queue\n  --amqp-url TEXT          [deprecated] amqp url for rabbitmq. please use\n                           --message-queue instead.\n  --beanstalk TEXT         [deprecated] beanstalk config for beanstalk queue.\n                           please use --message-queue instead.\n  --phantomjs-proxy TEXT   phantomjs proxy ip:port\n  --data-path TEXT         data dir path\n  --version                Show the version and exit.\n  --help                   Show this message and exit.\n```\n\n#### --config\n\nConfig file is a JSON file with config values for global options or subcommands (a sub-dict named after subcommand). 
[example](/Deployment/#configjson)\n\n``` json\n{\n  \"taskdb\": \"mysql+taskdb://username:password@host:port/taskdb\",\n  \"projectdb\": \"mysql+projectdb://username:password@host:port/projectdb\",\n  \"resultdb\": \"mysql+resultdb://username:password@host:port/resultdb\",\n  \"message_queue\": \"amqp://username:password@host:port/%2F\",\n  \"webui\": {\n    \"username\": \"some_name\",\n    \"password\": \"some_passwd\",\n    \"need-auth\": true\n  }\n}\n```\n\n#### --queue-maxsize\n\nQueue size limit, 0 for not limit\n\n#### --taskdb, --projectdb, --resultdb\n\n```\nmysql:\n    mysql+type://user:passwd@host:port/database\nsqlite:\n    # relative path\n    sqlite+type:///path/to/database.db\n    # absolute path\n    sqlite+type:////path/to/database.db\n    # memory database\n    sqlite+type://\nmongodb:\n    mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]\n    more: http://docs.mongodb.org/manual/reference/connection-string/\ncouchdb:\n    couchdb+type://[username:password@]host[:port]\nsqlalchemy:\n    sqlalchemy+postgresql+type://user:passwd@host:port/database\n    sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database\n    more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html\nlocal:\n    local+projectdb://filepath,filepath\n    \ntype:\n    should be one of `taskdb`, `projectdb`, `resultdb`.\n```\n\n\n#### --message-queue\n\n```\nrabbitmq:\n    amqp://username:password@host:5672/%2F\n    see https://www.rabbitmq.com/uri-spec.html\nredis:\n    redis://host:6379/db\n    redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)\nkombu:\n    kombu+transport://userid:password@hostname:port/virtual_host\n    see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls\nbuiltin:\n    None\n```\n\n#### --phantomjs-proxy\n\nThe phantomjs proxy address, you need a phantomjs installed and running phantomjs proxy with command: [`pyspider 
phantomjs`](#phantomjs).\n\n#### --data-path\n\nSQLite database and counter dump files saved path\n\n\nall\n---\n\n```\nUsage: pyspider all [OPTIONS]\n\n  Run all the components in subprocess or thread\n\nOptions:\n  --fetcher-num INTEGER         instance num of fetcher\n  --processor-num INTEGER       instance num of processor\n  --result-worker-num INTEGER   instance num of result worker\n  --run-in [subprocess|thread]  run each components in thread or subprocess.\n                                always using thread for windows.\n  --help                        Show this message and exit.\n```\n\n\none\n---\n\n```\nUsage: pyspider one [OPTIONS] [SCRIPTS]...\n\n  One mode not only means all-in-one, it runs every thing in one process\n  over tornado.ioloop, for debug purpose\n\nOptions:\n  -i, --interactive  enable interactive mode, you can choose crawl url.\n  --phantomjs        enable phantomjs, will spawn a subprocess for phantomjs\n  --help             Show this message and exit.\n```\n\n**NOTE: WebUI is not running in one mode.**\n\nIn `one` mode, results will be written to stdout by default. You can capture them via `pyspider one > result.txt`.\n\n#### [SCRIPTS]\n\nThe script file path of projects. Project status is RUNNING, `rate` and `burst` can be set via script comments:\n\n```\n# rate: 1.0\n# burst: 3\n```\n\nWhen SCRIPTS is set, `taskdb` and `resultdb` will use a in-memory sqlite db by default (can be overridden by global config `--taskdb`, `--resultdb`). on_start callback will be triggered on start.\n\n#### -i, --interactive\n\nWith interactive mode, pyspider will start an interactive console asking what to do in next loop of process. 
In the console, you can use:\n\n``` python\ncrawl(url, project=None, **kwargs)\n    Crawl given url, same parameters as BaseHandler.crawl\n\n    url - url or taskid, parameters will be used if in taskdb\n    project - can be omitted if only one project exists.\n    \nquit_interactive()\n    Quit interactive mode\n    \nquit_pyspider()\n    Close pyspider\n```\n\nYou can use `pyspider.libs.utils.python_console()` to open an interactive console in your script.\n\nbench\n-----\n\n```\nUsage: pyspider bench [OPTIONS]\n\n  Run Benchmark test. In bench mode, in-memory sqlite database is used\n  instead of on-disk sqlite database.\n\nOptions:\n  --fetcher-num INTEGER         instance num of fetcher\n  --processor-num INTEGER       instance num of processor\n  --result-worker-num INTEGER   instance num of result worker\n  --run-in [subprocess|thread]  run each components in thread or subprocess.\n                                always using thread for windows.\n  --total INTEGER               total url in test page\n  --show INTEGER                show how many urls in a page\n  --help                        Show this message and exit.\n```\n\n\nscheduler\n---------\n\n```\nUsage: pyspider scheduler [OPTIONS]\n\n  Run Scheduler, only one scheduler is allowed.\n\nOptions:\n  --xmlrpc / --no-xmlrpc\n  --xmlrpc-host TEXT\n  --xmlrpc-port INTEGER\n  --inqueue-limit INTEGER  size limit of task queue for each project, tasks\n                           will been ignored when overflow\n  --delete-time INTEGER    delete time before marked as delete\n  --active-tasks INTEGER   active log size\n  --loop-limit INTEGER     maximum number of tasks due with in a loop\n  --scheduler-cls TEXT     scheduler class to be used.\n  --help                   Show this message and exit.\n```\n\n#### --scheduler-cls\n\nset this option to use customized Scheduler class\n\nphantomjs\n---------\n\n```\nUsage: run.py phantomjs [OPTIONS] [ARGS]...\n\n  Run phantomjs fetcher if phantomjs is 
installed.\n\nOptions:\n  --phantomjs-path TEXT  phantomjs path\n  --port INTEGER         phantomjs port\n  --auto-restart TEXT    auto restart phantomjs if crashed\n  --help                 Show this message and exit.\n```\n\n#### ARGS\n\nAddition args pass to phantomjs command line.\n\nfetcher\n-------\n\n```\nUsage: pyspider fetcher [OPTIONS]\n\n  Run Fetcher.\n\nOptions:\n  --xmlrpc / --no-xmlrpc\n  --xmlrpc-host TEXT\n  --xmlrpc-port INTEGER\n  --poolsize INTEGER      max simultaneous fetches\n  --proxy TEXT            proxy host:port\n  --user-agent TEXT       user agent\n  --timeout TEXT          default fetch timeout\n  --fetcher-cls TEXT      Fetcher class to be used.\n  --help                  Show this message and exit.\n```\n\n#### --proxy\n\nDefault proxy used by fetcher, can been override by `self.crawl` option. [DOC](apis/self.crawl/#fetch)\n\n\nprocessor\n---------\n\n```\nUsage: pyspider processor [OPTIONS]\n\n  Run Processor.\n\nOptions:\n  --processor-cls TEXT  Processor class to be used.\n  --help                Show this message and exit.\n```\n\nresult_worker\n-------------\n\n```\nUsage: pyspider result_worker [OPTIONS]\n\n  Run result worker.\n\nOptions:\n  --result-cls TEXT  ResultWorker class to be used.\n  --help             Show this message and exit.\n```\n\n\nwebui\n-----\n\n```\nUsage: pyspider webui [OPTIONS]\n\n  Run WebUI\n\nOptions:\n  --host TEXT            webui bind to host\n  --port INTEGER         webui bind to host\n  --cdn TEXT             js/css cdn server\n  --scheduler-rpc TEXT   xmlrpc path of scheduler\n  --fetcher-rpc TEXT     xmlrpc path of fetcher\n  --max-rate FLOAT       max rate for each project\n  --max-burst FLOAT      max burst for each project\n  --username TEXT        username of lock -ed projects\n  --password TEXT        password of lock -ed projects\n  --need-auth            need username and password\n  --webui-instance TEXT  webui Flask Application instance to be used.\n  --help                 Show 
this message and exit.\n```\n\n#### --cdn\n\nJS/CSS libs CDN service, URL must be compatible with [cdnjs](https://cdnjs.com/)\n\n#### --fetcher-rpc\n\nXML-RPC path URI for fetcher XMLRPC server. If not set, use a Fetcher instance.\n\n#### --need-auth\n\nIf true, all pages require username and password specified via `--username` and `--password`.\n\n\n"
  },
  {
    "path": "docs/Deployment-demo.pyspider.org.md",
    "content": "Deployment of demo.pyspider.org\n===============================\n\n[demo.pyspider.org](http://demo.pyspider.org/) is running on three VPSs connected together with private network using [tinc](http://www.tinc-vpn.org/).\n\n1vCore 4GB RAM | 1vCore 2GB RAM * 2\n---------------|----------------\ndatabase<br>message queue<br>scheduler | phantomjs * 2<br>phantomjs-lb * 1<br>fetcher * 1<br>fetcher-lb * 1<br>processor * 2<br>result-worker * 1<br>webui * 4<br>webui-lb * 1<br>nginx * 1<br>\n\nAll components are running inside docker containers.\n\ndatabase / message queue / scheduler\n------------------------------------\n\nThe database is postgresql and the message queue is redis.\n\nScheduler may have a lot of database operations, it's better to put it close to the database.\n\n```bash\ndocker run --name postgres -v /data/postgres/:/var/lib/postgresql/data -d -p $LOCAL_IP:5432:5432 -e POSTGRES_PASSWORD=\"\" postgres\ndocker run --name redis -d -p  $LOCAL_IP:6379:6379 redis\ndocker run --name scheduler -d -p $LOCAL_IP:23333:23333 --restart=always binux/pyspider \\\n --taskdb \"sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb\" \\\n --resultdb \"sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb\" \\\n --projectdb \"sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb\" \\\n --message-queue \"redis://10.21.0.7:6379/1\" \\\n scheduler --inqueue-limit 5000 --delete-time 43200\n```\n\nother components\n----------------\n\nfetcher, processor, result_worker are running on two boxes with same configuration managed with [docker-compose](https://docs.docker.com/compose/).\n\n```yaml\nphantomjs:\n  image: 'binux/pyspider:latest'\n  command: phantomjs\n  cpu_shares: 512\n  environment:\n    - 'EXCLUDE_PORTS=5000,23333,24444'\n  expose:\n    - '25555'\n  mem_limit: 512m\n  restart: always\nphantomjs-lb:\n  image: 'dockercloud/haproxy:latest'\n  links:\n    - phantomjs\n  restart: always\n  \nfetcher:\n  image: 'binux/pyspider:latest'\n  command: 
'--message-queue \"redis://10.21.0.7:6379/1\" --phantomjs-proxy \"phantomjs:80\" fetcher --xmlrpc'\n  cpu_shares: 512\n  environment:\n    - 'EXCLUDE_PORTS=5000,25555,23333'\n  links:\n    - 'phantomjs-lb:phantomjs'\n  mem_limit: 128m\n  restart: always\nfetcher-lb:\n  image: 'dockercloud/haproxy:latest'\n  links:\n    - fetcher\n  restart: always\n  \nprocessor:\n  image: 'binux/pyspider:latest'\n  command: '--projectdb \"sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb\" --message-queue \"redis://10.21.0.7:6379/1\" processor'\n  cpu_shares: 512\n  mem_limit: 256m\n  restart: always\n  \nresult-worker:\n  image: 'binux/pyspider:latest'\n  command: '--taskdb \"sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb\"  --projectdb \"sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb\" --resultdb \"sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb\" --message-queue \"redis://10.21.0.7:6379/1\" result_worker'\n  cpu_shares: 512\n  mem_limit: 256m\n  restart: always\n  \nwebui:\n  image: 'binux/pyspider:latest'\n  command: '--taskdb \"sqlalchemy+postgresql+taskdb://binux@10.21.0.7/taskdb\"  --projectdb \"sqlalchemy+postgresql+projectdb://binux@10.21.0.7/projectdb\" --resultdb \"sqlalchemy+postgresql+resultdb://binux@10.21.0.7/resultdb\" --message-queue \"redis://10.21.0.7:6379/1\" webui --max-rate 0.2 --max-burst 3 --scheduler-rpc \"http://o4.i.binux.me:23333/\" --fetcher-rpc \"http://fetcher/\"'\n\n  cpu_shares: 512\n  environment:\n    - 'EXCLUDE_PORTS=24444,25555,23333'\n  links:\n    - 'fetcher-lb:fetcher'\n  mem_limit: 256m\n  restart: always\nwebui-lb:\n  image: 'dockercloud/haproxy:latest'\n  links:\n    - webui\n  restart: always\n  \nnginx:\n  image: 'nginx'\n  links:\n    - 'webui-lb:HAPROXY'\n  ports:\n    - '0.0.0.0:80:80'\n  volumes:\n    - /home/binux/nfs/profile/nginx/nginx.conf:/etc/nginx/nginx.conf\n    - /home/binux/nfs/profile/nginx/conf.d/:/etc/nginx/conf.d/\n  restart: always\n```\n\nWith the config, you can change the 
scale by `docker-compose scale phantomjs=2 processor=2 webui=4` when you need. \n\n#### load balance\n\nphantomjs-lb, fetcher-lb, webui-lb are automatically configured haproxy instances that allow any number of upstreams.\n\n#### phantomjs\n\nphantomjs has a memory leak issue, so a memory limit is applied, and it's recommended to restart it every hour.\n\n#### fetcher\n\nfetcher is implemented with async IO, it supports 100 concurrent connections. If the upstream queue is not choked, one fetcher should be enough.\n\n#### processor\n\nprocessor is a CPU bound component, the recommended number of instances is number of CPU cores + 1~2 or CPU cores * 10%~15% when you have more than 20 cores.\n\n#### result-worker\n\nIf you didn't override result-worker, it only writes results into the database, and should be very fast.\n"
  },
  {
    "path": "docs/Deployment.md",
    "content": "Deployment\n===========\n\nSince pyspider has various components, you can just run `pyspider` to start a standalone and third service free instance. Or using MySQL or MongoDB and RabbitMQ to deploy a distributed crawl cluster.\n\nTo deploy pyspider in product environment, running component in each process and store data in database service is more reliable and flexible.\n\nInstallation\n------------\n\nTo deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.\n\nAnd you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue.\n\n`pip install --allow-all-external pyspider[all]`\n\n> Even if you had install pyspider using `pip` before. Install with `pyspider[all]` is necessary to install the requirements for MySQL/MongoDB/RabbitMQ.\n\nif you are using Ubuntu, try:\n```\napt-get install python python-dev python-distribute python-pip libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml\n```\nto install binary packages.\n\nDeployment\n----------\n\n**This document is based on MySQL + RabbitMQ**\n\n### config.json\n\nAlthough you can use command-line to specify the parameters. 
A config file is a better choice.\n\n```\n{\n  \"taskdb\": \"mysql+taskdb://username:password@host:port/taskdb\",\n  \"projectdb\": \"mysql+projectdb://username:password@host:port/projectdb\",\n  \"resultdb\": \"mysql+resultdb://username:password@host:port/resultdb\",\n  \"message_queue\": \"amqp://username:password@host:port/%2F\",\n  \"webui\": {\n    \"username\": \"some_name\",\n    \"password\": \"some_passwd\",\n    \"need-auth\": true\n  }\n}\n```\n\nyou can get complete options by running `pyspider --help` and `pyspider webui --help` for subcommands. `\"webui\"` in JSON  is configs for subcommands. You can add parameters for other components similar to this one.\n\n#### Database Connection URI\n`\"taskdb\"`, `\"projectdb”`, `\"resultdb\"` is using database connection URI with format below:\n\n```\nmysql:\n    mysql+type://user:passwd@host:port/database\nsqlite:\n    # relative path\n    sqlite+type:///path/to/database.db\n    # absolute path\n    sqlite+type:////path/to/database.db\n    # memory database\n    sqlite+type://\nmongodb:\n    mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]\n    more: http://docs.mongodb.org/manual/reference/connection-string/\ncouchdb:\n    couchdb+type://[username:password@]host[:port][?options]]\nsqlalchemy:\n    sqlalchemy+postgresql+type://user:passwd@host:port/database\n    sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database\n    more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html\nlocal:\n    local+projectdb://filepath,filepath\n    \ntype:\n    should be one of `taskdb`, `projectdb`, `resultdb`.\n```\n\n#### Message Queue URL\nYou can use connection URL to specify the message queue:\n\n```\nrabbitmq:\n    amqp://username:password@host:5672/%2F\n    Refer: https://www.rabbitmq.com/uri-spec.html\nredis:\n    redis://host:6379/db\n    redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)\nbuiltin:\n    
None\n```\n\n> Hint for postgresql: you need to create database with encoding utf8 by your own. pyspider will not create database for you.\n\nrunning\n-------\n\nYou should run components alone with subcommands. You may add `&` after command to make it running in background and use [screen](http://linux.die.net/man/1/screen) or [nohup](http://linux.die.net/man/1/nohup) to prevent exit after your ssh session ends. **It's recommended to manage components with [Supervisor](http://supervisord.org/).**\n\n```\n# start **only one** scheduler instance\npyspider -c config.json scheduler\n\n# phantomjs\npyspider -c config.json phantomjs\n\n# start fetcher / processor / result_worker instances as many as your needs\npyspider -c config.json --phantomjs-proxy=\"localhost:25555\" fetcher\npyspider -c config.json processor\npyspider -c config.json result_worker\n\n# start webui, set `--scheduler-rpc` if scheduler is not running on the same host as webui\npyspider -c config.json webui\n```\n\nRunning with Docker\n-------------------\n[Running pyspider with Docker](Running-pyspider-with-Docker)\n\n\nDeployment of demo.pyspider.org\n-------------------------------\n[Deployment of demo.pyspider.org](Deployment-demo.pyspider.org)\n\n"
  },
  {
    "path": "docs/Frequently-Asked-Questions.md",
    "content": "Frequently Asked Questions\n==========================\n\nDoes pyspider Work with Windows?\n--------------------------------\nYes, it should, some users have made it work on Windows. But as I don't have windows development environment, I cannot test. Only some tips for users who want to use pyspider on Windows:\n\n- Some package needs binary libs (e.g. pycurl, lxml), that maybe you cannot install it from pip, Windowns binaries packages could be found in [http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/).\n- Make a clean environment with [virtualenv](https://virtualenv.readthedocs.org/en/latest/)\n- Try 32bit version of Python, especially your are facing crash issue.\n- Avoid using Python 3.4.1 ([#194](https://github.com/binux/pyspider/issues/194), [#217](https://github.com/binux/pyspider/issues/217))\n\nUnreadable Code (乱码) Returned from Phantomjs\n---------------------------------------------\n\nPhantomjs doesn't support gzip, don't set `Accept-Encoding` header with `gzip`.\n\n\nHow to Delete a Project?\n------------------------\n\nset `group` to `delete` and `status` to `STOP` then wait 24 hours. You can change the time before a project deleted via `scheduler.DELETE_TIME`.\n\nHow to Restart a Project?\n-------------------------\n#### Why\nIt happens after you modified a script, and wants to crawl everything again with new strategy. But as the [age](/apis/self.crawl/#age) of urls are not expired. Scheduler will discard all of the new requests.\n\n#### Solution\n1. Create a new project.\n2. 
Use an [itag](/apis/self.crawl/#itag) within `Handler.crawl_config` to specify the version of your script.\n\nHow to Use WebDAV Mode?\n-----------------------\nMount `http://hostname/dav/` to your filesystem, edit or create scripts with your favourite editor.\n\n> OSX: `mount_webdav http://hostname/dav/ /Volumes/dav`  \n> Linux: Install davfs2, `mount.davfs http://hostname/dav/ /mnt/dav`  \n> VIM: `vim http://hostname/dav/script_name.py`\n\nWhen you are editing a script without WebUI, you need to change it to `WebDAV Mode` while debugging. After you have saved the script in your editor, WebUI can load and use the latest script to debug your code.\n\nWhat does the progress bar mean on the dashboard?\n-------------------------------------------------\nWhen the mouse moves onto the progress bar, you can see the explanations.\n\nFor 5m, 1h, 1d the numbers are the events triggered in 5m, 1h, 1d. For the all progress bar, they are the numbers of total tasks in the corresponding status.\n\nOnly the tasks in DEBUG/RUNNING status will show the progress.\n\nHow many scheduler/fetcher/processor/result_worker do I need? or pyspider stop working\n--------------------------------------------------------------------------------------\nYou can only have one scheduler, and multiple fetcher/processor/result_worker instances depending on the bottleneck. You can use the queue status on dashboard to view the bottleneck of the system:\n\n![run one step](imgs/queue_status.png)\n\nFor example, the number between scheduler and fetcher indicates the queue size of scheduler to fetchers, when it's hitting 100 (default maximum queue size), fetcher might have crashed, or you should consider adding more fetchers.\n\nThe number `0+0` below fetcher indicates the queue size of new tasks and status packs between processors and scheduler. You can put your mouse over the numbers to see the tips."
  },
  {
    "path": "docs/Quickstart.md",
    "content": "Quickstart\n==========\n\nInstallation\n------------\n\n* `pip install pyspider`\n* run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/)\n\nif you are using ubuntu, try:\n```\napt-get install python python-dev python-distribute python-pip \\\nlibcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml \\\nlibssl-dev zlib1g-dev\n```\nto install binary packages first.\n\n\nplease install PhantomJS if needed: http://phantomjs.org/build.html\n\nnote that PhantomJS will be enabled only if it is excutable in the `PATH` or in the System Environment\n\n**Note:** `pyspider` command is running pyspider in `all` mode, which running components in threads or subprocesses. For production environment, please refer to [Deployment](Deployment).\n\n**WARNING:** WebUI is opened to public by default, it can be used to execute any command which may harm to you system. Please use it in internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config).\n\nYour First Script\n-----------------\n\n```python\nfrom pyspider.libs.base_handler import *\n\n\nclass Handler(BaseHandler):\n    crawl_config = {\n    }\n\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl('http://scrapy.org/', callback=self.index_page)\n\n    @config(age=10 * 24 * 60 * 60)\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            self.crawl(each.attr.href, callback=self.detail_page)\n\n    @config(priority=2)\n    def detail_page(self, response):\n        return {\n            \"url\": response.url,\n            \"title\": response.doc('title').text(),\n        }\n```\n\n> * `def on_start(self)` is the entry point of the script. It will be called when you click the `run` button on dashboard.\n> * [`self.crawl(url, callback=self.index_page)`*](/apis/self.crawl) is the most important API here. It will add a new task to be crawled. 
Most of the options will be spicified via `self.crawl` arguments.\n> * `def index_page(self, response)` get a [`Response`*](/apis/Response) object. [`response.doc`*](/apis/Response/#responsedoc) is a [pyquery](https://pythonhosted.org/pyquery/) object which has jQuery-like API to select elements to be extracted.\n> * `def detail_page(self, response)` return a `dict` object as result. The result will be captured into `resultdb` by default. You can override `on_result(self, result)` method to manage the result yourself.\n\n\nMore things you may want to know:\n\n> * [`@every(minutes=24*60, seconds=0)`*](/apis/@every/) is a helper to tell the scheduler that `on_start` method should be called everyday.\n> * [`@config(age=10 * 24 * 60 * 60)`*](/apis/self.crawl/#configkwargs) specified the default `age` parameter of `self.crawl` with page type `index_page` (when `callback=self.index_page`). The parameter [`age`*](/apis/self.crawl/#age) can be specified via `self.crawl(url, age=10*24*60*60)` (highest priority) and `crawl_config` (lowest priority).\n> * [`age=10 * 24 * 60 * 60`*](/apis/self.crawl/#age) tell scheduler discard the request if it have been crawled in 10 days. pyspider will not crawl a same URL twice by default (discard forever), even you had modified the code, it's very common for beginners that runs the project the first time and modified it and run it the second time, it will not crawl again (read [`itag`](/apis/self.crawl/#itag) for solution)\n> * [`@config(priority=2)`*](/apis/self.crawl/#schedule) mark that detail pages should be crawled first.\n\nYou can test your script step by step by click the green `run` button. Switch to `follows` panel, click the play button to move on.\n\n![run one step](imgs/run_one_step.png)\n\nStart Running\n-------------\n\n1. Save your script.\n2. Back to dashboard find your project.\n3. Changing the `status` to `DEBUG` or `RUNNING`.\n4. 
Click the `run` button.\n\n![index demo](imgs/index_page.png)\n\nYour script is running now!\n"
  },
  {
    "path": "docs/Running-pyspider-with-Docker.md",
    "content": "```shell\n# mysql\ndocker run --name mysql -d -v /data/mysql:/var/lib/mysql -e MYSQL_ALLOW_EMPTY_PASSWORD=yes mysql:latest\n# rabbitmq\ndocker run --name rabbitmq -d rabbitmq:latest\n\n# phantomjs\ndocker run --name phantomjs -d binux/pyspider:latest phantomjs\n\n# result worker\ndocker run --name result_worker -m 128m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest result_worker\n# processor, run multiple instance if needed.\ndocker run --name processor -m 256m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest processor\n# fetcher, run multiple instance if needed.\ndocker run --name fetcher -m 256m -d --link phantomjs:phantomjs --link rabbitmq:rabbitmq binux/pyspider:latest fetcher --no-xmlrpc\n# scheduler\ndocker run --name scheduler -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest scheduler\n# webui\ndocker run --name webui -m 256m -d -p 5000:5000 --link mysql:mysql --link rabbitmq:rabbitmq --link scheduler:scheduler --link phantomjs:phantomjs binux/pyspider:latest webui\n```\n\nor running with [Docker Compose](https://docs.docker.com/compose/) with `docker-compose.yml`:\n\nNOTE: It's recommended to run mysql and rabbitmq outside compose as they may not been restarted with pyspider. 
You can find commands to start mysql and rabbitmq service above.\n\n```\nphantomjs:\n  image: binux/pyspider:latest\n  command: phantomjs\nresult:\n  image: binux/pyspider:latest\n  external_links:\n    - mysql\n    - rabbitmq\n  command: result_worker\nprocessor:\n  image: binux/pyspider:latest\n  external_links:\n    - mysql\n    - rabbitmq\n  command: processor\nfetcher:\n  image: binux/pyspider:latest\n  external_links:\n    - rabbitmq\n  links:\n    - phantomjs\n  command : fetcher\nscheduler:\n  image: binux/pyspider:latest\n  external_links:\n    - mysql\n    - rabbitmq\n  command: scheduler\nwebui:\n  image: binux/pyspider:latest\n  external_links:\n    - mysql\n    - rabbitmq\n  links:\n    - scheduler\n    - phantomjs\n  command: webui\n  ports:\n    - \"5000:5000\"\n```\n\n`docker-compose up`\n\n\n"
  },
  {
    "path": "docs/Script-Environment.md",
    "content": "Script Environment\n==================\n\nVariables\n---------\n* `self.project_name`\n* `self.project` information about current project\n* `self.response`\n* `self.task`\n\nAbout Script\n------------\n* The name of `Handler` does not matter, but you need at least one class inheriting from `BaseHandler`\n* A third parameter can be set to get task object: `def callback(self, response, task)`\n* Non-200 responses will not be submitted to the callback by default. Use `@catch_status_code_error` to receive them.\n\nAbout Environment\n-----------------\n* `logging`, `print` and exceptions will be captured.\n* You can import other projects as module with `from projects import some_project`\n\n### Web view\n\n* view the page as a browser would render (approximately)\n\n### HTML view\n\n* view the HTML of the current callback (index_page, detail_page, etc.)\n\n### Follows view\n\n* view the callbacks that can be made from the current callback\n* index_page follows view will show the detail_page callbacks that can be executed.\n\n### Messages view\n\n* shows the messages sent by [`self.send_message`](apis/self.send_message) API.\n\n### Enable CSS Selector Helper\n\n* Enable a CSS Selector Helper of the Web view. It gets the CSS Selector of the element you clicked then adds it to your script.\n"
  },
  {
    "path": "docs/Working-with-Results.md",
    "content": "Working with Results\n====================\nDownloading and viewing your data from WebUI is convenient, but may not suitable for computer.\n\nWorking with ResultDB\n---------------------\nAlthough resultdb is only designed for result preview, not suitable for large scale storage. But if you want to grab data from resultdb, there are some simple snippets using database API that can help you to connect and select the data.\n\n```\nfrom pyspider.database import connect_database\nresultdb = connect_database(\"<your resutldb connection url>\")\nfor project in resultdb.projects:\n    for result in resultdb.select(project):\n        assert result['taskid']\n        assert result['url']\n        assert result['result']\n```\n\nThe `result['result']` is the object submitted by `return` statement from your script.\n\nWorking with ResultWorker\n-------------------------\nIn product environment, you may want to connect pyspider to your system / post-processing pipeline, rather than store it into resultdb. It's highly recommended to override ResultWorker.\n\n```\nfrom pyspider.result import ResultWorker\n\nclass MyResultWorker(ResultWorker):\n    def on_result(self, task, result):\n        assert task['taskid']\n        assert task['project']\n        assert task['url']\n        assert result\n        # your processing code goes here\n```\n\n`result` is the object submitted by `return` statement from your script.\n\nYou can put this script (e.g., `my_result_worker.py`) at the folder where you launch pyspider. Add argument for `result_worker` subcommand:\n\n`pyspider result_worker --result-cls=my_result_worker.MyResultWorker`\n\nOr\n\n```\n{\n  ...\n  \"result_worker\": {\n    \"result_cls\": \"my_result_worker.MyResultWorker\"\n  }\n  ...\n}\n```\n\nif you are using config file. [Please refer to Deployment](/Deployment)\n\nDesign Your Own Database Schema\n-------------------------------\nThe results stored in database is encoded as JSON for compatibility. 
It's highly recommended to design your own database, and override the ResultWorker described above.\n\nTIPS about Results\n-------------------\n#### Want to return more than one result in callback?\nAs resultdb de-duplicates results by taskid (url), the latest result will overwrite previous results.\n\nOne workaround is to use the `send_message` API to make a `fake` taskid for each result.\n\n```\ndef detail_page(self, response):\n    for li in response.doc('li').items():\n        self.send_message(self.project_name, {\n            ...\n        }, url=response.url+\"#\"+li('a.product-sku').text())\n\ndef on_message(self, project, msg):\n    return msg\n```\n\nSee Also: [apis/self.send_message](/apis/self.send_message)\n"
  },
  {
    "path": "docs/apis/@catch_status_code_error.md",
    "content": "@catch_status_code_error\n========================\n\nnon-200 response will been regarded as fetch failed and will not pass to callback. use this decorator to override this feature.\n\n```python\ndef on_start(self):\n    self.crawl('http://httpbin.org/status/404', self.callback)\n\n@catch_status_code_error  \ndef callback(self, response):\n    ...\n```\n\n>  The `callback` would not be executed as the request is failed (with status code 404). With the `@catch_status_code_error` decorater, the `callback` would be executed even if the request failed.\n\n"
  },
  {
    "path": "docs/apis/@every.md",
    "content": "@every(minutes=0, seconds=0)\n============================\n\nmethod will been called every `minutes` or `seconds`\n\n\n```python\n@every(minutes=24 * 60)\ndef on_start(self):\n    for url in urllist:\n        self.crawl(url, callback=self.index_page)\n```\n\nThe urls would be restarted every 24 hours. Note that, if `age` is also used and the period is longer then `@every`, the crawl request would be discarded as it's regarded as not changed:\n\n```python\n@every(minutes=24 * 60)\ndef on_start(self):\n    self.crawl('http://www.example.org/', callback=self.index_page)\n\n@config(age=10 * 24 * 60 * 60)\ndef index_page(self):\n    ...\n```\n\n> Even though the crawl request triggered every day, but it's discard and only restarted every 10 days.\n\n"
  },
  {
    "path": "docs/apis/Response.md",
    "content": "Response\n========\n\nThe attributes of Response object.\n\n### Response.url\n\nfinal URL.\n\n### Response.text\n\nContent of response, in unicode.\n\nif `Response.encoding` is None and `chardet` module is available, encoding of content will be guessed.\n\n### Response.content\n\nContent of response, in bytes.\n\n### Response.doc\n\nA [PyQuery](https://pythonhosted.org/pyquery/) object of the response's content. Links have made as absolute by default.\n\nRefer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/)\n\nIt's important that I will repeat, refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/)\n\n### Response.etree\n\nA [lxml](http://lxml.de/) object of the response's content.\n\n### Response.json\n\nThe JSON-encoded content of the response, if any.\n\n### Response.status_code\n\n### Response.orig_url\n\nIf there is any redirection during the request, here is the url you just submit via `self.crawl`.\n\n### Response.headers\n\nA case insensitive dict holds the headers of response.\n\n### Response.cookies\n\n### Response.error\n\nMessages when fetch error\n\n### Response.time\n\nTime used during fetching.\n\n### Response.ok\n\nTrue if `status_code` is 200 and no error.\n\n### Response.encoding\n\nEncoding of Response.content.\n\nIf Response.encoding is None, encoding will be guessed by header or content or `chardet`(if available).\n\nSet encoding of content manually will overwrite the guessed encoding.\n\n### Response.save\n\nThe object saved by [`self.crawl`](/apis/self.crawl/#save) API\n\n### Response.js_script_result\n\ncontent returned by JS script\n\n### Response.raise_for_status()\n\nRaise HTTPError if status code is not 200 or `Response.error` exists.\n\n"
  },
  {
    "path": "docs/apis/index.md",
    "content": "API Reference\n=============\n    \n- [self.crawl](self.crawl)\n- [Response](Response)\n- [self.send_message](self.send_message)\n- [@every](@every)\n- [@catch_status_code_error](@catch_status_code_error)\n"
  },
  {
    "path": "docs/apis/self.crawl.md",
    "content": "self.crawl\n===========\n\nself.crawl(url, **kwargs)\n-------------------------\n\n`self.crawl` is the main interface to tell pyspider which url(s) should be crawled.\n\n### Parameters:\n\n##### url\nthe url or url list to be crawled.\n\n##### callback\nthe method to parse the response. _default: `__call__` _\n\n\n```python\ndef on_start(self):\n    self.crawl('http://scrapy.org/', callback=self.index_page)\n```\n\nthe following parameters are optional\n\n##### age\n\nthe period of validity of the task. The page would be regarded as not modified during the period. _default: -1(never recrawl)_ \n\n```python\n@config(age=10 * 24 * 60 * 60)\ndef index_page(self, response):\n    ...\n```\n> Every pages parsed by the callback `index_page` would be regarded not changed within 10 days. If you submit the task within 10 days since last crawled it would be discarded.\n\n##### priority\n\nthe priority of task to be scheduled, higher the better. _default: 0_ \n\n```python\ndef index_page(self):\n    self.crawl('http://www.example.org/page2.html', callback=self.index_page)\n    self.crawl('http://www.example.org/233.html', callback=self.detail_page,\n               priority=1)\n```\n> The page `233.html` would be crawled before `page2.html`. Use this parameter can do a [BFS](http://en.wikipedia.org/wiki/Breadth-first_search) and reduce the number of tasks in queue(which may cost more memory resources).\n\n##### exetime\n\nthe executed time of task in unix timestamp. _default: 0(immediately)_ \n\n```python\nimport time\ndef on_start(self):\n    self.crawl('http://www.example.org/', callback=self.callback,\n               exetime=time.time()+30*60)\n```\n> The page would be crawled 30 minutes later.\n\n##### retries\n\nretry times while failed. _default: 3_ \n\n##### itag\n\na marker from frontier page to reveal the potential modification of the task. It will be compared to its last value, recrawl when it's changed. 
_default: None_ \n\n```python\ndef index_page(self, response):\n    for item in response.doc('.item').items():\n        self.crawl(item.find('a').attr.url, callback=self.detail_page,\n                   itag=item.find('.update-time').text())\n```\n> In the sample, `.update-time` is used as itag. If it's not changed, the request would be discarded.\n\nOr you can use `itag` with `Handler.crawl_config` to specify the script version if you want to restart all of the tasks.\n\n```python\nclass Handler(BaseHandler):\n    crawl_config = {\n        'itag': 'v223'\n    }\n```\n> Change the value of itag after you modified the script and click run button again. It doesn't matter if not set before. \n\n##### auto_recrawl\n\nwhen enabled, task would be recrawled every `age` time. _default: False_ \n\n```python\ndef on_start(self):\n    self.crawl('http://www.example.org/', callback=self.callback,\n               age=5*60*60, auto_recrawl=True)\n```\n> The page would be restarted every `age` 5 hours.\n\n##### method\n    \nHTTP method to use. _default: GET_ \n\n##### params\n\ndictionary of URL parameters to append to the URL. \n\n```python\ndef on_start(self):\n    self.crawl('http://httpbin.org/get', callback=self.callback,\n               params={'a': 123, 'b': 'c'})\n    self.crawl('http://httpbin.org/get?a=123&b=c', callback=self.callback)\n```\n> The two requests are the same.\n\n##### data\n\nthe body to attach to the request. If a dictionary is provided, form-encoding will take place. \n\n```python\ndef on_start(self):\n    self.crawl('http://httpbin.org/post', callback=self.callback,\n               method='POST', data={'a': 123, 'b': 'c'})\n```\n\n##### files\n\ndictionary of `{field: {filename: 'content'}}` files to multipart upload.` \n\n##### user_agent\n\nthe User-Agent of the request\n\n##### headers\n\ndictionary of headers to send. \n\n##### cookies\n\ndictionary of cookies to attach to this request. 
\n\n##### connect_timeout\n\ntimeout for initial connection in seconds. _default: 20_\n\n##### timeout\n\nmaximum time in seconds to fetch the page. _default: 120_ \n\n##### allow_redirects\n\nfollow `30x` redirect _default: True_ \n\n##### validate_cert\n\nFor HTTPS requests, validate the server’s certificate? _default: True_ \n\n##### proxy\n\nproxy server of `username:password@hostname:port` to use, only http proxy is supported currently. \n\n```python\nclass Handler(BaseHandler):\n    crawl_config = {\n        'proxy': 'localhost:8080'\n    }\n```\n> `Handler.crawl_config` can be used with `proxy` to set a proxy for whole project.\n\n##### etag \n\nuse HTTP Etag mechanism to pass the process if the content of the page is not changed. _default: True_ \n\n###### last_modified\n\nuse HTTP Last-Modified header mechanism to pass the process if the content of the page is not changed. _default: True_ \n\n##### fetch_type\n\nset to `js` to enable JavaScript fetcher. _default: None_ \n\n##### js_script\n\nJavaScript run before or after page loaded, should been wrapped by a function like `function() { document.write(\"binux\"); }`. \n\n\n```python\ndef on_start(self):\n    self.crawl('http://www.example.org/', callback=self.callback,\n               fetch_type='js', js_script='''\n               function() {\n                   window.scrollTo(0,document.body.scrollHeight);\n                   return 123;\n               }\n               ''')\n```\n> The script would scroll the page to bottom. The value returned in function could be captured via `Response.js_script_result`.\n\n##### js_run_at\n\nrun JavaScript specified via `js_script` at `document-start` or `document-end`. _default: `document-end`_ \n\n##### js_viewport_width/js_viewport_height\n\nset the size of the viewport for the JavaScript fetcher of the layout process. \n\n##### load_images\n\nload images when JavaScript fetcher enabled. 
_default: False_ \n\n##### save\n\na object pass to the callback method, can be visit via `response.save`. \n\n\n```python\ndef on_start(self):\n    self.crawl('http://www.example.org/', callback=self.callback,\n               save={'a': 123})\n\ndef callback(self, response):\n    return response.save['a']\n```\n> `123` would be returned in `callback`\n\n##### taskid\n    \nunique id to identify the task, default is the MD5 check code of the URL, can be overridden by method `def get_taskid(self, task)` \n\n```python\nimport json\nfrom pyspider.libs.utils import md5string\ndef get_taskid(self, task):\n    return md5string(task['url']+json.dumps(task['fetch'].get('data', '')))\n```\n> Only url is md5 -ed as taskid by default, the code above add `data` of POST request as part of taskid.\n\n##### force_update\n    \nforce update task params even if the task is in `ACTIVE` status. \n\n##### cancel\n\ncancel a task, should be used with `force_update` to cancel a active task. To cancel an `auto_recrawl` task, you should set `auto_recrawl=False` as well.\n\ncURL command\n------------\n\n`self.crawl(curl_command)`\n\ncURL is a command line tool to make a HTTP request. It can easily get form Chrome Devtools > Network panel,  right click the request and \"Copy as cURL\".\n\nYou can use cURL command as the first argument of `self.crawl`. It will parse the command and make the HTTP request just like curl do.\n\n@config(**kwargs)\n-----------------\ndefault parameters of `self.crawl` when use the decorated method as callback. For example:\n\n```python\n@config(age=15*60)\ndef index_page(self, response):\n    self.crawl('http://www.example.org/list-1.html', callback=self.index_page)\n    self.crawl('http://www.example.org/product-233', callback=self.detail_page)\n    \n@config(age=10*24*60*60)\ndef detail_page(self, response):\n    return {...}\n```\n\n`age` of `list-1.html` is 15min while the `age` of `product-233.html` is 10days. 
Because the callback of `product-233.html` is `detail_page`, it is treated as a detail page and shares the config of `detail_page`.\n\nHandler.crawl_config = {}\n-------------------------\ndefault parameters of `self.crawl` for the whole project. The parameters in `crawl_config` for scheduler (priority, retries, exetime, age, itag, force_update, auto_recrawl, cancel) will be joined when the task is created; the parameters for fetcher and processor will be joined when the task is executed. You can use this mechanism to change the fetch config (e.g. cookies) afterwards.\n\n```python\nclass Handler(BaseHandler):\n    crawl_config = {\n        'headers': {\n            'User-Agent': 'GoogleBot',\n        }\n    }\n    \n    ...\n```\n> crawl_config sets a project-level user-agent.\n\n"
  },
  {
    "path": "docs/apis/self.send_message.md",
    "content": "self.send_message\n=================\n\nself.send_message(project, msg, [url])\n--------------------------------------\nsend messages to other project. can been received by `def on_message(self, project, message)` callback.\n\n- `project` - other project name\n- `msg` - any json-able object\n- `url` - result will been overwrite if have same `taskid`. `send_message` share a same `taskid` by default. Change this to return multiple result by one response.\n\n```python\ndef detail_page(self, response):\n    for i, each in enumerate(response.json['products']):\n        self.send_message(self.project_name, {\n                \"name\": each['name'],\n                'price': each['prices'],\n             }, url=\"%s#%s\" % (response.url, i))\n\ndef on_message(self, project, msg):\n    return msg\n``` \n\npyspider send_message [OPTIONS] PROJECT MESSAGE\n-----------------------------------------------\n\nYou can also send message from command line.\n\n```\nUsage: pyspider send_message [OPTIONS] PROJECT MESSAGE\n\n  Send Message to project from command line\n\nOptions:\n  --scheduler-rpc TEXT  xmlrpc path of scheduler\n  --help                Show this message and exit.\n```\n\ndef on_message(self, project, message)\n--------------------------------------\nreceive message from other project\n"
  },
  {
    "path": "docs/conf.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-11-10 01:31:54\n\nimport sys\nfrom unittest.mock import MagicMock\nfrom recommonmark.parser import CommonMarkParser\n\nclass Mock(MagicMock):\n    @classmethod\n    def __getattr__(cls, name):\n            return Mock()\n\nMOCK_MODULES = ['pycurl', 'lxml', 'psycopg2']\nsys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)\n\nsource_parsers = {\n        '.md': CommonMarkParser,\n}\n\nsource_suffix = ['.rst', '.md']\n"
  },
  {
    "path": "docs/index.md",
    "content": "pyspider [![Build Status][Build Status]][Travis CI] [![Coverage Status][Coverage Status]][Coverage] [![Try][Try]][Demo]\n========\n\nA Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**\n\n- Write script in Python\n- Powerful WebUI with script editor, task monitor, project manager and result viewer\n- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend\n- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue\n- Task priority, retry, periodical, recrawl by age, etc...\n- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...\n\nTutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/)  \nDocumentation: [http://docs.pyspider.org/](http://docs.pyspider.org/)  \nRelease notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases)  \n\nSample Code \n-----------\n\n```python\nfrom pyspider.libs.base_handler import *\n\n\nclass Handler(BaseHandler):\n    crawl_config = {\n    }\n\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl('http://scrapy.org/', callback=self.index_page)\n\n    @config(age=10 * 24 * 60 * 60)\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            self.crawl(each.attr.href, callback=self.detail_page)\n\n    def detail_page(self, response):\n        return {\n            \"url\": response.url,\n            \"title\": response.doc('title').text(),\n        }\n```\n\n[![Demo][Demo Img]][Demo]\n\n\nInstallation\n------------\n\n* `pip install pyspider`\n* run command `pyspider`, visit 
[http://localhost:5000/](http://localhost:5000/)\n\nQuickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/)\n\nContribute\n----------\n\n* Use It\n* Open [Issue], send PR\n* [User Group]\n* [中文问答](http://segmentfault.com/t/pyspider)\n\n\nTODO\n----\n\n### v0.4.0\n\n- [x] local mode, load script from file.\n- [x] works as a framework (all components running in one process, no threads)\n- [x] redis\n- [x] shell mode like `scrapy shell` \n- [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia)\n\n\n### more\n\n- [x] edit script with vim via [WebDAV](http://en.wikipedia.org/wiki/WebDAV)\n\n\nLicense\n-------\nLicensed under the Apache License, Version 2.0\n\n\n[Build Status]:         https://img.shields.io/travis/binux/pyspider/master.svg?style=flat\n[Travis CI]:            https://travis-ci.org/binux/pyspider\n[Coverage Status]:      https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat\n[Coverage]:             https://coveralls.io/r/binux/pyspider\n[Try]:                  https://img.shields.io/badge/try-pyspider-blue.svg?style=flat\n[Demo]:                 http://demo.pyspider.org/\n[Demo Img]:             imgs/demo.png\n[Issue]:                https://github.com/binux/pyspider/issues\n[User Group]:           https://groups.google.com/group/pyspider-users\n"
  },
  {
    "path": "docs/tutorial/AJAX-and-more-HTTP.md",
    "content": "Level 2: AJAX and More HTTP\n===========================\n\nIn the last article, we discussed how to extract links and information from HTML documents. However, web contents are becoming more complicated using some technology like AJAX. You may find that page looks different with it in browser, the information you want to extract is not in the HTML of the page.\n\nIn this article, we will not write complete scrape scripts, but some snippets of web page cases using the technology like AJAX or needs some HTTP parameters besides URL.\n\nAJAX\n----\n\n[AJAX] is short for asynchronous JavaScript + XML. AJAX is using existing standards to update parts of a web page without loading the whole page. A common usage of AJAX is loading [JSON] data and render to HTML on the client side.\n\nYou may find elements missing in HTML fetched by pyspider or [wget](https://www.gnu.org/software/wget/). When you open it in browser some elements appear after page loaded with(maybe not) a 'loading' animation or words. For example, we want to scrape all channels of Dota 2 from [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202)\n\n![twitch](../imgs/twitch.png)\n\nBut you may find nothing in the page. \n\n### Finding the request\n\nAs [AJAX] data is transferred in [HTTP], we can find the real request with the help of [Chrome Developer Tools](https://developer.chrome.com/devtools).\n\n0. Open a new tab.\n1. Use `Ctrl`+`Shift`+`I` (or `Cmd`+`Opt`+`I` on Mac) to open the DevTools.\n2. Switch to Network panel.\n3. 
Open the URL [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) in this tab.\n\nWhile resources are been loaded, you may find a table of requested resources.\n\n![developer tools network](../imgs/developer-tools-network.png)\n\nAJAX is using [XMLHttpRequest](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest) object to send and retrieve data which is generally shorted as \"XHR\". Use Filter (funnel icon) to filter out the XHR requests. Glance over each requests using preview:\n\n![find request](../imgs/search-for-request.png)\n\nTo determine which one is the key request, you can use a filter to reduce the number of requests, guess the usage of the request by this path and parameters, then view the response contents for confirmation. Here we found the request: [http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1](http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1)\n\nNow, open the URL in a new tab, you would see a [JSON] data containing channel list. You can use a extension [JSONView](https://chrome.google.com/webstore/detail/jsonview/chklaanhfefbnpoihckbnefhakgolnmc) ([for Firfox](http://jsonview.com/)) to have a pretty printed view of JSON. 
A sample code is trying extract the name, current title and viewers of each channel.\n\n```\nclass Handler(BaseHandler):\n    @every(minutes=10)\n    def on_start(self):\n        self.crawl('http://api.twitch.tv/kraken/streams?limit=20&offset=0&game=Dota+2&broadcaster_language=&on_site=1', callback=self.index_page)\n\n    @config(age=10*60)\n    def index_page(self, response):\n        return [{\n                \"name\": x['channel']['display_name'],\n                \"viewers\": x['viewers'],\n                \"status\": x['channel'].get('status'),\n             } for x in response.json['streams']]\n```\n\n> * You can use `response.json` to convert content to a python `dict` object.\n> * As channel list is changing frequently, we update it every 10 minutes and use [`@config(age=10*60)`](/apis/self.crawl/#configkwargs) to set the age. Otherwise, it will be ignored as scheduler thinks it's new enough and refuse to update the content.\n\nHere is an online demo for twitch as well as a measure using [PhantomJS] which will be discussed in the next level: [http://demo.pyspider.org/debug/tutorial_twitch](http://demo.pyspider.org/debug/tutorial_twitch)\n\nHTTP\n----\n\n[HTTP] is the protocol to exchange or transfer hypertext. We had used it in last article, we used `self.crawl` and a URL to fetch HTML content which is transferred by [HTTP].\n\nWhen you got `403 Forbidden` or needed login. 
You need right parameters of HTTP request.\n\nA typical HTTP request message to [http://example.com/](http://example.com/) looks like:\n\n```\nGET / HTTP/1.1\nHost: example.com\nConnection: keep-alive\nCache-Control: max-age=0\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\nUser-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.45 Safari/537.36\nReferer: http://en.wikipedia.org/wiki/Example.com\nAccept-Encoding: gzip, deflate, sdch\nAccept-Language: zh-CN,zh;q=0.8\nIf-None-Match: \"359670651\"\nIf-Modified-Since: Fri, 09 Aug 2013 23:54:35 GMT\n```\n\n> * the first line contains [HTTP method](http://www.w3schools.com/tags/ref_httpmethods.asp), path and HTTP version\n> * several lines of request header fields in `key: value` format.\n> * if has message body(say POST request), an empty line and message body would be appended to end of request message.\n\nYou can get this with [Chrome Developer Tools](https://developer.chrome.com/devtools) - Network panel we used in above section:\n\n![request header](../imgs/request-headers.png)\n\nIn most case, the last thing you need is to copy right URL + method + headers + body from Network panel.\n\ncURL command\n------------\n\n`self.crawl` supports `cURL` command as argument to make the HTTP request. It will parse the arguments in the command and use it as fetch parameters.\n\nWith `Copy as cURL` of a request, you can get a `cURL` command and paste to `self.crawl(command)` to make crawling easy.\n\nHTTP Method\n-----------\n\n[HTTP] defines methods to indicate the desired action to be performed on the identified resource. Two commonly used methods are: GET and POST. GET is when you open a URL, requests the content of a specified resource. 
POST is used to submit data to the server.\n\nTODO: need example here.\n\nHTTP Headers\n------------\n\n[HTTP Headers](http://en.wikipedia.org/wiki/List_of_HTTP_header_fields) are a list of parameters of a request. Some headers you need to pay attention to while scraping:\n\n### User-Agent\n\nA [user agent string](http://en.wikipedia.org/wiki/User_agent_string) tells the server the application type, operating system or software revision of the client sending the HTTP request.\n\npyspider's default user agent string is: `pyspider/VERSION (+http://pyspider.org/)`\n\n### Referer\n\n[Referer](http://en.wikipedia.org/wiki/HTTP_referer) is the address of the previous webpage from which a link to the currently requested page was followed. Some websites use this on image resources to prevent deep linking.\n\nTODO: need example here.\n\nHTTP Cookie\n-----------\n\n[HTTP Cookie](http://en.wikipedia.org/wiki/HTTP_cookie) is a field in HTTP headers used for tracking which user is making the request. It is generally used for user login and to prevent unauthorized requests.\n\nYou can use [`self.crawl(cookies={\"key\": value})`](/apis/self.crawl/#fetch) to set cookies via a dict-like API.\n\nTODO: need example here.\n\n[PhantomJS]:           http://phantomjs.org/\n[AJAX]:          http://en.wikipedia.org/wiki/Ajax_%28programming%29\n[JSON]:          http://en.wikipedia.org/wiki/JSON\n[HTTP]:          http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol\n"
  },
  {
    "path": "docs/tutorial/HTML-and-CSS-Selector.md",
    "content": "Level 1: HTML and CSS Selector\n==============================\n\nIn this tutorial, we will scrape information of movies and TV from [IMDb].\n\nAn online demo with completed code is: [http://demo.pyspider.org/debug/tutorial_imdb](http://demo.pyspider.org/debug/tutorial_imdb) .\n\n\nBefore Start\n------------\n\nYou should have pyspider installed. You can refer to the documentation [QuickStart](Quickstart). Or test your code on [demo.pyspider.org](http://demo.pyspider.org).\n\nSome basic knowledges you should know before scraping:\n\n* [Web][WWW] is a system of interlinked hypertext pages.\n* Pages is identified on the Web via uniform resource locator ([URL]).\n* Pages transferred via the Hypertext Transfer Protocol ([HTTP]).\n* Web Pages structured using HyperText Markup Language ([HTML]).\n\nTo scrape information from a web is\n\n1. Finding URLs of the pages contain the information we want.\n2. Fetching the pages via HTTP.\n3. Extracting the information from HTML.\n4. Finding more URL contains what we want, go back to 2.\n\n\nPick a start URL\n----------------\n\nAs we want to get all of the movies on [IMDb], the first thing is finding a list.  A good list page may:\n\n* containing links to the [movies](http://www.imdb.com/title/tt0167260/) as many as possible.\n* by following next page, you can traverse all of the movies. \n* list sorted by last updated time would be a great help to get latest movies.\n\nBy looking around at the index page of [IMDb], I found this:\n\n![IMDb front page](../imgs/tutorial_imdb_front.png)\n\n[http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1](http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1)\n\n### Creating a project\n\nYou can find \"Create\" on the bottom right of baseboard. 
Click and name a project.\n\n![Creating a project](../imgs/creating_a_project.png)\n\nChanging the crawl URL in `on_start` callback:\n\n```\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl('http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1', callback=self.index_page)\n```\n\n> * `self.crawl` would fetch the page and call the `callback` method to parse the response.  \n> * The [`@every` decorator](http://docs.pyspider.org/en/latest/apis/@every/) represents `on_start` would execute every day, to make sure not missing any new movies.\n\nClick the green `run` button, you should find a red 1 above follows, switch to follows panel, click the green play button:\n\n![Run one step](../imgs/run_one_step.png)\n\nIndex Page\n----------\n\nFrom [index page](http://www.imdb.com/search/title?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1), we need extract two things:\n\n* links of the movies like `http://www.imdb.com/title/tt0167260/`\n* links of [Next](http://www.imdb.com/search/title?count=100&ref_=nv_ch_mm_1&start=101&title_type=feature,tv_series,tv_movie) page\n\n### Find Movies\n\nAs you can see, the sample handler had already extracted 1900+ links from the page. A measure of extracting movie pages is filtering links with regular expression:\n\n```\nimport re\n...\n\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            if re.match(\"http://www.imdb.com/title/tt\\d+/$\", each.attr.href):\n                self.crawl(each.attr.href, callback=self.detail_page)\n```\n\n> * `callback` is `self.detail_page` here to use another callback method to parse.\n\nRemember you can always use the power of python or anything you are familiar with to extract information. 
But using tools like CSS selector is recommended.\n\n### Next page\n\n#### CSS Selectors\n\nCSS selectors are patterns used by [CSS] to select HTML elements which are wanted to style. As elements containing information may have different style in document, It's appropriate to use CSS Selector to select elements we want. More information about CSS selectors could be found in above links:\n\n* [CSS Selectors](http://www.w3schools.com/css/css_selectors.asp)\n* [CSS Selector Reference](http://www.w3schools.com/cssref/css_selectors.asp)\n\nYou can use CSS Selector with built-in `response.doc` object, which is provided by [PyQuery], you may find the full reference there.\n\n#### CSS Selector Helper\n\npyspider provide a tool called `CSS selector helper` to make it easier to generate a selector pattern to element you clicked. Enable CSS selector helper by click the button and switch to `web` panel.\n\n![CSS Selector helper](../imgs/css_selector_helper.png)\n\nThe element will be highlighted in yellow while mouse over. When you click it, a pre-selected CSS Selector pattern is shown on the bar above. You can edit the features to locate the element and add it to your source code.\n\nclick \"Next »\" in the page and add selector pattern to your code:\n\n```\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            if re.match(\"http://www.imdb.com/title/tt\\d+/$\", each.attr.href):\n                self.crawl(each.attr.href, callback=self.detail_page)\n        self.crawl(response.doc('#right a').attr.href, callback=self.index_page)\n```\n\nClick `run` again and move to the next page, we found that \"« Prev\" has the same selector pattern as \"Next »\". When using above code you may find pyspider selected the link of \"« Prev\", not \"Next »\". 
A solution for this is select both of them:\n\n```\n        self.crawl([x.attr.href for x in response.doc('#right a').items()], callback=self.index_page)\n```\n\nExtracting Information\n----------------------\n\nClick `run` again and follow to detail page.\n\nAdd keys you need to result dict and collect value using `CSS selector helper` repeatedly:\n\n```\n    def detail_page(self, response):\n        return {\n            \"url\": response.url,\n            \"title\": response.doc('.header > [itemprop=\"name\"]').text(),\n            \"rating\": response.doc('.star-box-giga-star').text(),\n            \"director\": [x.text() for x in response.doc('[itemprop=\"director\"] span').items()],\n        }\n```\n\nNote that, `CSS Selector helper` may not always work. You could write selector pattern manually with tools like [Chrome Dev Tools](https://developer.chrome.com/devtools):\n\n![inspect element](../imgs/inspect_element.png)\n\nYou doesn't need to write every ancestral element in selector pattern, only the elements which can differentiate with not needed elements, is enough. However, it needs experience on scraping or Web developing to know which attribute is important, can be used as locator. You can also test CSS Selector in the JavaScript Console by using `$$` like `$$('[itemprop=\"director\"] span')`\n\nRunning\n-------\n\n1. After tested you code, don't forget to save it.\n2. Back to dashboard find your project.\n3. Changing the `status` to `DEBUG` or `RUNNING`.\n4. Press the `run` button. 
\n\n![index demo](../imgs/index_page.png)\n\nNotes\n-----\n\nThe script is just a simple example; you may find more issues when scraping IMDb:\n\n* `ref` in the list page URL is used for tracking users; it's better to remove it.\n* IMDb does not serve more than 100000 results for any query; you need to find more lists with fewer results, like [this](http://www.imdb.com/search/title?genres=action&title_type=feature&sort=moviemeter,asc)\n* You may need a list sorted by last updated time and update it with a shorter interval.\n* Some attributes are hard to extract; you may need to write the selector pattern by hand or use [XPATH](http://www.w3schools.com/xpath/xpath_syntax.asp) and/or some python code to extract the information.\n\n[IMDb]:          http://www.imdb.com/\n[WWW]:           http://en.wikipedia.org/wiki/World_Wide_Web\n[HTTP]:          http://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol\n[HTML]:          http://en.wikipedia.org/wiki/HTML\n[URL]:           http://en.wikipedia.org/wiki/Uniform_resource_locator\n[CSS]:           https://developer.mozilla.org/en-US/docs/Web/Guide/CSS/Getting_Started/What_is_CSS\n[PyQuery]:       https://pythonhosted.org/pyquery/\n"
  },
  {
    "path": "docs/tutorial/Render-with-PhantomJS.md",
    "content": "Level 3: Render with PhantomJS\n==============================\n\nSometimes web page is too complex to find out the API request. It's time to meet the power of [PhantomJS].\n\nTo use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if excutable in the `PATH`.\n\nMake sure phantomjs is working by running\n```\n$ pyspider phantomjs\n```\n\nContinue with the rest of the tutorial if the output is\n```\nWeb server running on port 25555\n```\n\nUse PhantomJS\n-------------\n\nWhen pyspider with PhantomJS connected, you can enable this feature by adding a parameter `fetch_type='js'` to `self.crawl`. We use PhantomJS to scrape channel list of  [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) which is loaded with AJAX we discussed in [Level 2](tutorial/AJAX-and-more-HTTP#ajax):\n\n```\nclass Handler(BaseHandler):\n    def on_start(self):\n        self.crawl('http://www.twitch.tv/directory/game/Dota%202',\n                   fetch_type='js', callback=self.index_page)\n             \n    def index_page(self, response):\n        return {\n            \"url\": response.url,\n            \"channels\": [{\n                \"title\": x('.title').text(),\n                \"viewers\": x('.info').contents()[2],\n                \"name\": x('.info a').text(),\n            } for x in response.doc('.stream.item').items()]\n        }\n```\n> I used some API to handle the list of streams. You can find complete API reference from [PyQuery complete API](https://pythonhosted.org/pyquery/api.html)\n\nRunning JavaScript on Page\n--------------------------\n\nWe will try to scrape images from [http://www.pinterest.com/categories/popular/](http://www.pinterest.com/categories/popular/) in this section. 
Only 25 images are shown at the beginning; more images are loaded when you scroll to the bottom of the page.\n\nTo scrape as many images as possible we can use a [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function-wrapped JavaScript code to simulate the scroll action: \n\n```\nclass Handler(BaseHandler):\n    def on_start(self):\n        self.crawl('http://www.pinterest.com/categories/popular/',\n                   fetch_type='js', js_script=\"\"\"\n                   function() {\n                       window.scrollTo(0,document.body.scrollHeight);\n                   }\n                   \"\"\", callback=self.index_page)\n\n    def index_page(self, response):\n        return {\n            \"url\": response.url,\n            \"images\": [{\n                \"title\": x('.richPinGridTitle').text(),\n                \"img\": x('.pinImg').attr('src'),\n                \"author\": x('.creditName').text(),\n            } for x in response.doc('.item').items() if x('.pinImg')]\n        }\n```\n\n> * The script is executed after the page has loaded (this can be changed via [`js_run_at` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher))\n> * We scroll once after the page has loaded; you can scroll multiple times using [`setTimeout`](https://developer.mozilla.org/en-US/docs/Web/API/WindowTimers.setTimeout). PhantomJS will fetch as many items as possible before the timeout is reached.\n\nOnline demo: [http://demo.pyspider.org/debug/tutorial_pinterest](http://demo.pyspider.org/debug/tutorial_pinterest)\n\n\n\n[PhantomJS]:           http://phantomjs.org/\n"
  },
  {
    "path": "docs/tutorial/index.md",
    "content": "pyspider Tutorial\n=================\n\n> The best way to learn how to scrap is learning how to make it.\n\n* [Level 1: HTML and CSS Selector](HTML-and-CSS-Selector)\n* [Level 2: AJAX and More HTTP](AJAX-and-more-HTTP)\n* [Level 3: Render with PhantomJS](Render-with-PhantomJS)\n\nIf you have problem using pyspider, [user group](https://groups.google.com/group/pyspider-users) is a place for discussing.\n"
  },
  {
    "path": "mkdocs.yml",
    "content": "site_name: pyspider\nsite_description: A Powerful Spider(Web Crawler) System in Python.\nsite_author: binux\nrepo_url: https://github.com/binux/pyspider\npages:\n- Introduction: index.md\n- Quickstart: Quickstart.md\n- Command Line: Command-Line.md\n- Tutorial:\n  - Index: tutorial/index.md\n  - 'Level 1: HTML and CSS Selector': tutorial/HTML-and-CSS-Selector.md\n  - 'Level 2: AJAX and More HTTP': tutorial/AJAX-and-more-HTTP.md\n  - 'Level 3: Render with PhantomJS': tutorial/Render-with-PhantomJS.md\n- About pyspider:\n  - Architecture: Architecture.md\n  - About Tasks: About-Tasks.md\n  - About Projects: About-Projects.md\n  - Script Environment: Script-Environment.md\n  - Working with Results: Working-with-Results.md\n- API Reference:\n  - Index: apis/index.md\n  - self.crawl: apis/self.crawl.md\n  - Response: apis/Response.md\n  - self.send_message: apis/self.send_message.md\n  - '@catch_status_code_error': apis/@catch_status_code_error.md\n  - '@every': apis/@every.md\n- Deployment: Deployment.md\n- Running pyspider with Docker: Running-pyspider-with-Docker.md\n- Deployment of demo.pyspider.org: Deployment-demo.pyspider.org.md\n- Frequently Asked Questions: Frequently-Asked-Questions.md\n\ntheme: readthedocs\nmarkdown_extensions: ['toc(permalink=true)', ]\n"
  },
  {
    "path": "pyspider/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-11-17 19:17:12\n\n__version__ = '0.4.0'\n"
  },
  {
    "path": "pyspider/database/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-08 15:04:08\n\nimport os, requests, json\nfrom six.moves.urllib.parse import urlparse, parse_qs\n\n\ndef connect_database(url):\n    \"\"\"\n    create database object by url\n\n    mysql:\n        mysql+type://user:passwd@host:port/database\n    sqlite:\n        # relative path\n        sqlite+type:///path/to/database.db\n        # absolute path\n        sqlite+type:////path/to/database.db\n        # memory database\n        sqlite+type://\n    mongodb:\n        mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]\n        more: http://docs.mongodb.org/manual/reference/connection-string/\n    sqlalchemy:\n        sqlalchemy+postgresql+type://user:passwd@host:port/database\n        sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database\n        more: http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html\n    redis:\n        redis+taskdb://host:port/db\n    elasticsearch:\n        elasticsearch+type://host:port/?index=pyspider\n    couchdb:\n        couchdb+type://[username:password@]host[:port]\n    local:\n        local+projectdb://filepath,filepath\n\n    type:\n        taskdb\n        projectdb\n        resultdb\n\n    \"\"\"\n    db = _connect_database(url)\n    db.copy = lambda: _connect_database(url)\n    return db\n\n\ndef _connect_database(url):  # NOQA\n    parsed = urlparse(url)\n\n    scheme = parsed.scheme.split('+')\n    if len(scheme) == 1:\n        raise Exception('wrong scheme format: %s' % parsed.scheme)\n    else:\n        engine, dbtype = scheme[0], scheme[-1]\n        other_scheme = \"+\".join(scheme[1:-1])\n\n    if dbtype not in ('taskdb', 'projectdb', 'resultdb'):\n        raise LookupError('unknown database type: %s, '\n                          'type should be one of [\"taskdb\", 
\"projectdb\", \"resultdb\"]', dbtype)\n\n    if engine == 'mysql':\n        return _connect_mysql(parsed,dbtype)\n\n    elif engine == 'sqlite':\n        return _connect_sqlite(parsed,dbtype)\n    elif engine == 'mongodb':\n        return _connect_mongodb(parsed,dbtype,url)\n\n    elif engine == 'sqlalchemy':\n        return _connect_sqlalchemy(parsed, dbtype, url, other_scheme)\n\n\n    elif engine == 'redis':\n        if dbtype == 'taskdb':\n            from .redis.taskdb import TaskDB\n            return TaskDB(parsed.hostname, parsed.port,\n                          int(parsed.path.strip('/') or 0))\n        else:\n            raise LookupError('not supported dbtype: %s', dbtype)\n    elif engine == 'local':\n        scripts = url.split('//', 1)[1].split(',')\n        if dbtype == 'projectdb':\n            from .local.projectdb import ProjectDB\n            return ProjectDB(scripts)\n        else:\n            raise LookupError('not supported dbtype: %s', dbtype)\n    elif engine == 'elasticsearch' or engine == 'es':\n        return _connect_elasticsearch(parsed, dbtype)\n\n    elif engine == 'couchdb':\n        return _connect_couchdb(parsed, dbtype, url)\n\n    else:\n        raise Exception('unknown engine: %s' % engine)\n\n\ndef _connect_mysql(parsed,dbtype):\n    parames = {}\n    if parsed.username:\n        parames['user'] = parsed.username\n    if parsed.password:\n        parames['passwd'] = parsed.password\n    if parsed.hostname:\n        parames['host'] = parsed.hostname\n    if parsed.port:\n        parames['port'] = parsed.port\n    if parsed.path.strip('/'):\n        parames['database'] = parsed.path.strip('/')\n\n    if dbtype == 'taskdb':\n        from .mysql.taskdb import TaskDB\n        return TaskDB(**parames)\n    elif dbtype == 'projectdb':\n        from .mysql.projectdb import ProjectDB\n        return ProjectDB(**parames)\n    elif dbtype == 'resultdb':\n        from .mysql.resultdb import ResultDB\n        return ResultDB(**parames)\n  
  else:\n        raise LookupError\n\n\ndef _connect_sqlite(parsed,dbtype):\n    if parsed.path.startswith('//'):\n        path = '/' + parsed.path.strip('/')\n    elif parsed.path.startswith('/'):\n        path = './' + parsed.path.strip('/')\n    elif not parsed.path:\n        path = ':memory:'\n    else:\n        raise Exception('error path: %s' % parsed.path)\n\n    if dbtype == 'taskdb':\n        from .sqlite.taskdb import TaskDB\n        return TaskDB(path)\n    elif dbtype == 'projectdb':\n        from .sqlite.projectdb import ProjectDB\n        return ProjectDB(path)\n    elif dbtype == 'resultdb':\n        from .sqlite.resultdb import ResultDB\n        return ResultDB(path)\n    else:\n        raise LookupError\n\n\ndef _connect_mongodb(parsed,dbtype,url):\n    url = url.replace(parsed.scheme, 'mongodb')\n    parames = {}\n    if parsed.path.strip('/'):\n        parames['database'] = parsed.path.strip('/')\n\n    if dbtype == 'taskdb':\n        from .mongodb.taskdb import TaskDB\n        return TaskDB(url, **parames)\n    elif dbtype == 'projectdb':\n        from .mongodb.projectdb import ProjectDB\n        return ProjectDB(url, **parames)\n    elif dbtype == 'resultdb':\n        from .mongodb.resultdb import ResultDB\n        return ResultDB(url, **parames)\n    else:\n        raise LookupError\n\n\ndef _connect_sqlalchemy(parsed, dbtype,url, other_scheme):\n    if not other_scheme:\n        raise Exception('wrong scheme format: %s' % parsed.scheme)\n    url = url.replace(parsed.scheme, other_scheme)\n    if dbtype == 'taskdb':\n        from .sqlalchemy.taskdb import TaskDB\n        return TaskDB(url)\n    elif dbtype == 'projectdb':\n        from .sqlalchemy.projectdb import ProjectDB\n        return ProjectDB(url)\n    elif dbtype == 'resultdb':\n        from .sqlalchemy.resultdb import ResultDB\n        return ResultDB(url)\n    else:\n        raise LookupError\n\n\ndef _connect_elasticsearch(parsed, dbtype):\n    # in python 2.6 url like 
\"http://host/?query\", query will not been splitted\n    if parsed.path.startswith('/?'):\n        index = parse_qs(parsed.path[2:])\n    else:\n        index = parse_qs(parsed.query)\n    if 'index' in index and index['index']:\n        index = index['index'][0]\n    else:\n        index = 'pyspider'\n\n    if dbtype == 'projectdb':\n        from .elasticsearch.projectdb import ProjectDB\n        return ProjectDB([parsed.netloc], index=index)\n    elif dbtype == 'resultdb':\n        from .elasticsearch.resultdb import ResultDB\n        return ResultDB([parsed.netloc], index=index)\n    elif dbtype == 'taskdb':\n        from .elasticsearch.taskdb import TaskDB\n        return TaskDB([parsed.netloc], index=index)\n\n\ndef _connect_couchdb(parsed, dbtype, url):\n    if os.environ.get('COUCHDB_HTTPS'):\n        url = \"https://\" + parsed.netloc + \"/\"\n    else:\n        url = \"http://\" + parsed.netloc + \"/\"\n    params = {}\n\n    # default to env, then url, then hard coded\n    params['username'] = os.environ.get('COUCHDB_USER') or parsed.username\n    params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password\n\n    if dbtype == 'taskdb':\n        from .couchdb.taskdb import TaskDB\n        return TaskDB(url, **params)\n    elif dbtype == 'projectdb':\n        from .couchdb.projectdb import ProjectDB\n        return ProjectDB(url, **params)\n    elif dbtype == 'resultdb':\n        from .couchdb.resultdb import ResultDB\n        return ResultDB(url, **params)\n    else:\n        raise LookupError\n"
  },
  {
    "path": "pyspider/database/base/__init__.py",
    "content": ""
  },
  {
    "path": "pyspider/database/base/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-09 11:28:52\n\nimport re\n\n# NOTE: When get/get_all/check_update from database with default fields,\n#       all following fields should be included in output dict.\n{\n    'project': {\n        'name': str,\n        'group': str,\n        'status': str,\n        'script': str,\n        # 'config': str,\n        'comments': str,\n        # 'priority': int,\n        'rate': int,\n        'burst': int,\n        'updatetime': int,\n    }\n}\n\n\nclass ProjectDB(object):\n    status_str = [\n        'TODO',\n        'STOP',\n        'CHECKING',\n        'DEBUG',\n        'RUNNING',\n    ]\n\n    def insert(self, name, obj={}):\n        raise NotImplementedError\n\n    def update(self, name, obj={}, **kwargs):\n        raise NotImplementedError\n\n    def get_all(self, fields=None):\n        raise NotImplementedError\n\n    def get(self, name, fields):\n        raise NotImplementedError\n\n    def drop(self, name):\n        raise NotImplementedError\n\n    def check_update(self, timestamp, fields=None):\n        raise NotImplementedError\n\n    def split_group(self, group, lower=True):\n        if lower:\n            return re.split(\"\\W+\", (group or '').lower())\n        else:\n            return re.split(\"\\W+\", group or '')\n\n    def verify_project_name(self, name):\n        if len(name) > 64:\n            return False\n        if re.search(r\"[^\\w]\", name):\n            return False\n        return True\n\n    def copy(self):\n        '''\n        database should be able to copy itself to create new connection\n\n        it's implemented automatically by pyspider.database.connect_database\n        if you are not create database connection via connect_database method,\n        you should implement this\n        '''\n        raise NotImplementedError\n"
  },
  {
    "path": "pyspider/database/base/resultdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-11 18:40:03\n\n# result schema\n{\n    'result': {\n        'taskid': str,  # new, not changeable\n        'project': str,  # new, not changeable\n        'url': str,  # new, not changeable\n        'result': str,  # json string\n        'updatetime': int,\n    }\n}\n\n\nclass ResultDB(object):\n    \"\"\"\n    database for result\n    \"\"\"\n    projects = set()  # projects in resultdb\n\n    def save(self, project, taskid, url, result):\n        raise NotImplementedError\n\n    def select(self, project, fields=None, offset=0, limit=None):\n        raise NotImplementedError\n\n    def count(self, project):\n        raise NotImplementedError\n\n    def get(self, project, taskid, fields=None):\n        raise NotImplementedError\n\n    def drop(self, project):\n        raise NotImplementedError\n\n    def copy(self):\n        '''\n        database should be able to copy itself to create new connection\n\n        it's implemented automatically by pyspider.database.connect_database\n        if you are not create database connection via connect_database method,\n        you should implement this\n        '''\n        raise NotImplementedError\n"
  },
  {
    "path": "pyspider/database/base/taskdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-08 10:28:48\n\n# task schema\n{\n    'task': {\n        'taskid': str,  # new, not change\n        'project': str,  # new, not change\n        'url': str,  # new, not change\n        'status': int,  # change\n        'schedule': {\n            'priority': int,\n            'retries': int,\n            'retried': int,\n            'exetime': int,\n            'age': int,\n            'itag': str,\n            # 'recrawl': int\n        },  # new and restart\n        'fetch': {\n            'method': str,\n            'headers': dict,\n            'data': str,\n            'timeout': int,\n            'save': dict,\n        },  # new and restart\n        'process': {\n            'callback': str,\n        },  # new and restart\n        'track': {\n            'fetch': {\n                'ok': bool,\n                'time': int,\n                'status_code': int,\n                'headers': dict,\n                'encoding': str,\n                'content': str,\n            },\n            'process': {\n                'ok': bool,\n                'time': int,\n                'follows': int,\n                'outputs': int,\n                'logs': str,\n                'exception': str,\n            },\n            'save': object,  # jsonable object saved by processor\n        },  # finish\n        'lastcrawltime': int,  # keep between request\n        'updatetime': int,  # keep between request\n    }\n}\n\n\nclass TaskDB(object):\n    ACTIVE = 1\n    SUCCESS = 2\n    FAILED = 3\n    BAD = 4\n\n    projects = set()  # projects in taskdb\n\n    def load_tasks(self, status, project=None, fields=None):\n        raise NotImplementedError\n\n    def get_task(self, project, taskid, fields=None):\n        raise NotImplementedError\n\n    def status_count(self, project):\n        
'''\n        return a dict\n        '''\n        raise NotImplementedError\n\n    def insert(self, project, taskid, obj={}):\n        raise NotImplementedError\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        raise NotImplementedError\n\n    def drop(self, project):\n        raise NotImplementedError\n\n    @staticmethod\n    def status_to_string(status):\n        return {\n            1: 'ACTIVE',\n            2: 'SUCCESS',\n            3: 'FAILED',\n            4: 'BAD',\n        }.get(status, 'UNKNOWN')\n\n    @staticmethod\n    def status_to_int(status):\n        return {\n            'ACTIVE': 1,\n            'SUCCESS': 2,\n            'FAILED': 3,\n            'BAD': 4,\n        }.get(status, 4)\n\n    def copy(self):\n        '''\n        database should be able to copy itself to create new connection\n\n        it's implemented automatically by pyspider.database.connect_database\n        if you are not create database connection via connect_database method,\n        you should implement this\n        '''\n        raise NotImplementedError\n"
  },
  {
    "path": "pyspider/database/basedb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.com>\n#         http://binux.me\n# Created on 2012-08-30 17:43:49\n\nfrom __future__ import unicode_literals, division, absolute_import\n\nimport logging\nlogger = logging.getLogger('database.basedb')\n\nfrom six import itervalues\nfrom pyspider.libs import utils\n\n\nclass BaseDB:\n\n    '''\n    BaseDB\n\n    dbcur should be overwirte\n    '''\n    __tablename__ = None\n    placeholder = '%s'\n    maxlimit = -1\n\n    @staticmethod\n    def escape(string):\n        return '`%s`' % string\n\n    @property\n    def dbcur(self):\n        raise NotImplementedError\n\n    def _execute(self, sql_query, values=[]):\n        dbcur = self.dbcur\n        dbcur.execute(sql_query, values)\n        return dbcur\n\n    def _select(self, tablename=None, what=\"*\", where=\"\", where_values=[], offset=0, limit=None):\n        tablename = self.escape(tablename or self.__tablename__)\n        if isinstance(what, list) or isinstance(what, tuple) or what is None:\n            what = ','.join(self.escape(f) for f in what) if what else '*'\n\n        sql_query = \"SELECT %s FROM %s\" % (what, tablename)\n        if where:\n            sql_query += \" WHERE %s\" % where\n        if limit:\n            sql_query += \" LIMIT %d, %d\" % (offset, limit)\n        elif offset:\n            sql_query += \" LIMIT %d, %d\" % (offset, self.maxlimit)\n        logger.debug(\"<sql: %s>\", sql_query)\n\n        for row in self._execute(sql_query, where_values):\n            yield row\n\n    def _select2dic(self, tablename=None, what=\"*\", where=\"\", where_values=[],\n                    order=None, offset=0, limit=None):\n        tablename = self.escape(tablename or self.__tablename__)\n        if isinstance(what, list) or isinstance(what, tuple) or what is None:\n            what = ','.join(self.escape(f) for f in what) if what else '*'\n\n        sql_query = 
\"SELECT %s FROM %s\" % (what, tablename)\n        if where:\n            sql_query += \" WHERE %s\" % where\n        if order:\n            sql_query += ' ORDER BY %s' % order\n        if limit:\n            sql_query += \" LIMIT %d, %d\" % (offset, limit)\n        elif offset:\n            sql_query += \" LIMIT %d, %d\" % (offset, self.maxlimit)\n        logger.debug(\"<sql: %s>\", sql_query)\n\n        dbcur = self._execute(sql_query, where_values)\n\n        # f[0] may return bytes type\n        # https://github.com/mysql/mysql-connector-python/pull/37\n        fields = [utils.text(f[0]) for f in dbcur.description]\n\n        for row in dbcur:\n            yield dict(zip(fields, row))\n\n    def _replace(self, tablename=None, **values):\n        tablename = self.escape(tablename or self.__tablename__)\n        if values:\n            _keys = \", \".join(self.escape(k) for k in values)\n            _values = \", \".join([self.placeholder, ] * len(values))\n            sql_query = \"REPLACE INTO %s (%s) VALUES (%s)\" % (tablename, _keys, _values)\n        else:\n            sql_query = \"REPLACE INTO %s DEFAULT VALUES\" % tablename\n        logger.debug(\"<sql: %s>\", sql_query)\n\n        if values:\n            dbcur = self._execute(sql_query, list(itervalues(values)))\n        else:\n            dbcur = self._execute(sql_query)\n        return dbcur.lastrowid\n\n    def _insert(self, tablename=None, **values):\n        tablename = self.escape(tablename or self.__tablename__)\n        if values:\n            _keys = \", \".join((self.escape(k) for k in values))\n            _values = \", \".join([self.placeholder, ] * len(values))\n            sql_query = \"INSERT INTO %s (%s) VALUES (%s)\" % (tablename, _keys, _values)\n        else:\n            sql_query = \"INSERT INTO %s DEFAULT VALUES\" % tablename\n        logger.debug(\"<sql: %s>\", sql_query)\n\n        if values:\n            dbcur = self._execute(sql_query, list(itervalues(values)))\n        else:\n  
          dbcur = self._execute(sql_query)\n        return dbcur.lastrowid\n\n    def _update(self, tablename=None, where=\"1=0\", where_values=[], **values):\n        tablename = self.escape(tablename or self.__tablename__)\n        _key_values = \", \".join([\n            \"%s = %s\" % (self.escape(k), self.placeholder) for k in values\n        ])\n        sql_query = \"UPDATE %s SET %s WHERE %s\" % (tablename, _key_values, where)\n        logger.debug(\"<sql: %s>\", sql_query)\n\n        return self._execute(sql_query, list(itervalues(values)) + list(where_values))\n\n    def _delete(self, tablename=None, where=\"1=0\", where_values=[]):\n        tablename = self.escape(tablename or self.__tablename__)\n        sql_query = \"DELETE FROM %s\" % tablename\n        if where:\n            sql_query += \" WHERE %s\" % where\n        logger.debug(\"<sql: %s>\", sql_query)\n\n        return self._execute(sql_query, where_values)\n\nif __name__ == \"__main__\":\n    import sqlite3\n\n    class DB(BaseDB):\n        __tablename__ = \"test\"\n        placeholder = \"?\"\n\n        def __init__(self):\n            self.conn = sqlite3.connect(\":memory:\")\n            cursor = self.conn.cursor()\n            cursor.execute(\n                '''CREATE TABLE `%s` (id INTEGER PRIMARY KEY AUTOINCREMENT, name, age)'''\n                % self.__tablename__\n            )\n\n        @property\n        def dbcur(self):\n            return self.conn.cursor()\n\n    db = DB()\n    assert db._insert(db.__tablename__, name=\"binux\", age=23) == 1\n    assert db._select(db.__tablename__, \"name, age\").next() == (\"binux\", 23)\n    assert db._select2dic(db.__tablename__, \"name, age\").next()[\"name\"] == \"binux\"\n    assert db._select2dic(db.__tablename__, \"name, age\").next()[\"age\"] == 23\n    db._replace(db.__tablename__, id=1, age=24)\n    assert db._select(db.__tablename__, \"name, age\").next() == (None, 24)\n    db._update(db.__tablename__, \"id = 1\", age=16)\n    assert 
db._select(db.__tablename__, \"name, age\").next() == (None, 16)\n    db._delete(db.__tablename__, \"id = 1\")\n    assert [row for row in db._select(db.__tablename__)] == []\n"
  },
  {
    "path": "pyspider/database/couchdb/__init__.py",
    "content": ""
  },
  {
    "path": "pyspider/database/couchdb/couchdbbase.py",
    "content": "import time, requests, json\nfrom requests.auth import HTTPBasicAuth\n\nclass SplitTableMixin(object):\n    UPDATE_PROJECTS_TIME = 10 * 60\n\n    def __init__(self):\n        self.session = requests.session()\n        if self.username:\n            self.session.auth = HTTPBasicAuth(self.username, self.password)\n        self.session.headers.update({'Content-Type': 'application/json'})\n\n    def _collection_name(self, project):\n        if self.collection_prefix:\n            return \"%s_%s\" % (self.collection_prefix, project)\n        else:\n            return project\n\n\n    @property\n    def projects(self):\n        if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME:\n            self._list_project()\n        return self._projects\n\n\n    @projects.setter\n    def projects(self, value):\n        self._projects = value\n\n\n    def _list_project(self):\n        self._last_update_projects = time.time()\n        self.projects = set()\n        if self.collection_prefix:\n            prefix = \"%s.\" % self.collection_prefix\n        else:\n            prefix = ''\n\n        url = self.base_url + \"_all_dbs\"\n        res = self.session.get(url, json={}).json()\n        for each in res:\n            if each.startswith('_'):\n                continue\n            if each.startswith(self.database):\n                self.projects.add(each[len(self.database)+1+len(prefix):])\n\n\n    def create_database(self, name):\n        url = self.base_url + name\n        res = self.session.put(url).json()\n        if 'error' in res and res['error'] == 'unauthorized':\n            raise Exception(\"Supplied credentials are incorrect. 
Reason: {} for User: {} Password: {}\".format(res['reason'], self.username, self.password))\n        return res\n\n\n    def get_doc(self, db_name, doc_id):\n        url = self.base_url + db_name + \"/\" + doc_id\n        res = self.session.get(url).json()\n        if \"error\" in res and res[\"error\"] == \"not_found\":\n            return None\n        return res\n\n\n    def get_docs(self, db_name, selector):\n        url = self.base_url + db_name + \"/_find\"\n        selector['use_index'] = self.index\n        res = self.session.post(url, json=selector).json()\n        if 'error' in res and res['error'] == 'not_found':\n            return []\n        return res['docs']\n\n\n    def get_all_docs(self, db_name):\n        return self.get_docs(db_name, {\"selector\": {}})\n\n\n    def insert_doc(self, db_name, doc_id, doc):\n        url = self.base_url + db_name + \"/\" + doc_id\n        return self.session.put(url, json=doc).json()\n\n\n    def update_doc(self, db_name, doc_id, new_doc):\n        doc = self.get_doc(db_name, doc_id)\n        if doc is None:\n            return self.insert_doc(db_name, doc_id, new_doc)\n        for key in new_doc:\n            doc[key] = new_doc[key]\n        url = self.base_url + db_name + \"/\" + doc_id\n        return self.session.put(url, json=doc).json()\n\n\n    def delete(self, url):\n        return self.session.delete(url).json()\n\n"
  },
  {
    "path": "pyspider/database/couchdb/projectdb.py",
    "content": "import time, requests, json\nfrom requests.auth import HTTPBasicAuth\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\n\n\nclass ProjectDB(BaseProjectDB):\n    __collection_name__ = 'projectdb'\n\n    def __init__(self, url, database='projectdb', username=None, password=None):\n        self.username = username\n        self.password = password\n        self.url = url + self.__collection_name__ + \"_\" + database + \"/\"\n        self.database = database\n\n        self.session = requests.session()\n        if username:\n            self.session.auth = HTTPBasicAuth(self.username, self.password)\n        self.session.headers.update({'Content-Type': 'application/json'})\n\n        # Create the db\n        res = self.session.put(self.url).json()\n        if 'error' in res and res['error'] == 'unauthorized':\n            raise Exception(\n                \"Supplied credentials are incorrect. Reason: {} for User: {} Password: {}\".format(res['reason'],\n                                                                                                  self.username,\n                                                                                                  self.password))\n        # create index\n        payload = {\n            'index': {\n                'fields': ['name']\n            },\n            'name': self.__collection_name__ + \"_\" + database\n        }\n        res = self.session.post(self.url + \"_index\", json=payload).json()\n        self.index = res['id']\n\n    def _default_fields(self, each):\n        if each is None:\n            return each\n        each.setdefault('group', None)\n        each.setdefault('status', 'TODO')\n        each.setdefault('script', '')\n        each.setdefault('comments', None)\n        each.setdefault('rate', 0)\n        each.setdefault('burst', 0)\n        each.setdefault('updatetime', 0)\n        return each\n\n    def insert(self, name, obj={}):\n        url = self.url + 
name\n        obj = dict(obj)\n        obj['name'] = name\n        obj['updatetime'] = time.time()\n        res = self.session.put(url, json=obj).json()\n        return res\n\n    def update(self, name, obj={}, **kwargs):\n        # object contains the fields to update and their new values\n        update = self.get(name) # update will contain _rev\n        if update is None:\n            return None\n        obj = dict(obj)\n        obj['updatetime'] = time.time()\n        obj.update(kwargs)\n        for key in obj:\n            update[key] = obj[key]\n        return self.insert(name, update)\n\n    def get_all(self, fields=None):\n        if fields is None:\n            fields = []\n        payload = {\n            \"selector\": {},\n            \"fields\": fields,\n            \"use_index\": self.index\n        }\n        url = self.url + \"_find\"\n        res = self.session.post(url, json=payload).json()\n        for doc in res['docs']:\n            yield self._default_fields(doc)\n\n    def get(self, name, fields=None):\n        if fields is None:\n            fields = []\n        payload = {\n            \"selector\": {\"name\": name},\n            \"fields\": fields,\n            \"limit\": 1,\n            \"use_index\": self.index\n        }\n        url = self.url + \"_find\"\n        res = self.session.post(url, json=payload).json()\n        if len(res['docs']) == 0:\n            return None\n        return self._default_fields(res['docs'][0])\n\n    def check_update(self, timestamp, fields=None):\n        if fields is None:\n            fields = []\n        for project in self.get_all(fields=('updatetime', 'name')):\n            if project['updatetime'] > timestamp:\n                project = self.get(project['name'], fields)\n                yield self._default_fields(project)\n\n    def drop(self, name):\n        doc = self.get(name)\n        payload = {\"rev\": doc[\"_rev\"]}\n        url = self.url + name\n        return self.session.delete(url, 
params=payload).json()\n\n    def drop_database(self):\n        return self.session.delete(self.url).json()\n"
  },
  {
    "path": "pyspider/database/couchdb/resultdb.py",
    "content": "import time, json\nfrom pyspider.database.base.resultdb import ResultDB as BaseResultDB\nfrom .couchdbbase import SplitTableMixin\n\n\nclass ResultDB(SplitTableMixin, BaseResultDB):\n    collection_prefix = ''\n\n    def __init__(self, url, database='resultdb', username=None, password=None):\n        self.username = username\n        self.password = password\n        self.base_url = url\n        self.url = url + database + \"/\"\n        self.database = database\n\n        super().__init__()\n        self.create_database(database)\n        self.index = None\n\n    def _get_collection_name(self, project):\n        return self.database + \"_\" + self._collection_name(project)\n\n    def _create_project(self, project):\n        collection_name = self._get_collection_name(project)\n        self.create_database(collection_name)\n        # create index\n        payload = {\n            'index': {\n                'fields': ['taskid']\n            },\n            'name': collection_name\n        }\n\n        res = self.session.post(self.base_url + collection_name + \"/_index\", json=payload).json()\n        self.index = res['id']\n        self._list_project()\n\n    def save(self, project, taskid, url, result):\n        if project not in self.projects:\n            self._create_project(project)\n        collection_name = self._get_collection_name(project)\n        obj = {\n            'taskid': taskid,\n            'url': url,\n            'result': result,\n            'updatetime': time.time(),\n        }\n        return self.update_doc(collection_name, taskid, obj)\n\n    def select(self, project, fields=None, offset=0, limit=0):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        offset = offset or 0\n        limit = limit or 0\n        collection_name = self._get_collection_name(project)\n        if fields is None:\n            fields = []\n        if limit 
== 0:\n            sel = {\n                'selector': {},\n                'fields': fields,\n                'skip': offset\n            }\n        else:\n            sel = {\n              'selector': {},\n              'fields': fields,\n              'skip': offset,\n              'limit': limit\n            }\n        for result in self.get_docs(collection_name, sel):\n            yield result\n\n    def count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        collection_name = self._get_collection_name(project)\n        return len(self.get_all_docs(collection_name))\n\n    def get(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        collection_name = self._get_collection_name(project)\n        if fields is None:\n            fields = []\n        sel = {\n            'selector': {'taskid': taskid},\n            'fields': fields\n        }\n        ret = self.get_docs(collection_name, sel)\n        if len(ret) == 0:\n            return None\n        return ret[0]\n\n    def drop_database(self):\n        return self.delete(self.url)\n\n    def drop(self, project):\n        # drop the project\n        collection_name = self._get_collection_name(project)\n        url = self.base_url + collection_name\n        return self.delete(url)"
  },
  {
    "path": "pyspider/database/couchdb/taskdb.py",
    "content": "import json, time\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\nfrom .couchdbbase import SplitTableMixin\n\n\nclass TaskDB(SplitTableMixin, BaseTaskDB):\n    collection_prefix = ''\n\n    def __init__(self, url, database='taskdb', username=None, password=None):\n        self.username = username\n        self.password = password\n        self.base_url = url\n        self.url = url + database + \"/\"\n        self.database = database\n        self.index = None\n\n        super().__init__()\n\n        self.create_database(database)\n        self.projects = set()\n        self._list_project()\n\n    def _get_collection_name(self, project):\n        return self.database + \"_\" + self._collection_name(project)\n\n    def _create_project(self, project):\n        collection_name = self._get_collection_name(project)\n        self.create_database(collection_name)\n        # create index\n        payload = {\n            'index': {\n                'fields': ['status', 'taskid']\n            },\n            'name': collection_name\n        }\n        res = self.session.post(self.base_url + collection_name + \"/_index\", json=payload).json()\n        self.index = res['id']\n        self._list_project()\n\n    def load_tasks(self, status, project=None, fields=None):\n        if not project:\n            self._list_project()\n        if fields is None:\n            fields = []\n        if project:\n            projects = [project, ]\n        else:\n            projects = self.projects\n        for project in projects:\n            collection_name = self._get_collection_name(project)\n            for task in self.get_docs(collection_name, {\"selector\": {\"status\": status}, \"fields\": fields}):\n                yield task\n\n    def get_task(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        if fields is None:\n 
           fields = []\n        collection_name = self._get_collection_name(project)\n        ret = self.get_docs(collection_name, {\"selector\": {\"taskid\": taskid}, \"fields\": fields})\n        if len(ret) == 0:\n            return None\n        return ret[0]\n\n    def status_count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return {}\n        collection_name = self._get_collection_name(project)\n\n        def _count_for_status(collection_name, status):\n            total = len(self.get_docs(collection_name, {\"selector\": {'status': status}}))\n            return {'total': total, \"_id\": status} if total else None\n\n        c = collection_name\n        ret = filter(lambda x: x,map(lambda s: _count_for_status(c, s), [self.ACTIVE, self.SUCCESS, self.FAILED]))\n\n        result = {}\n        if isinstance(ret, dict):\n            ret = ret.get('result', [])\n        for each in ret:\n            result[each['_id']] = each['total']\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        if project not in self.projects:\n            self._create_project(project)\n        obj = dict(obj)\n        obj['taskid'] = taskid\n        obj['project'] = project\n        obj['updatetime'] = time.time()\n        return self.update(project, taskid, obj=obj)\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        collection_name = self._get_collection_name(project)\n        return self.update_doc(collection_name, taskid, obj)\n\n    def drop_database(self):\n        return self.delete(self.url)\n\n    def drop(self, project):\n        collection_name = self._get_collection_name(project)\n        url = self.base_url + collection_name\n        return self.delete(url)"
  },
  {
    "path": "pyspider/database/elasticsearch/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2016-01-17 18:31:58\n"
  },
  {
    "path": "pyspider/database/elasticsearch/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2016-01-17 18:32:33\n\nimport time\n\nimport elasticsearch.helpers\nfrom elasticsearch import Elasticsearch\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\n\n\nclass ProjectDB(BaseProjectDB):\n    __type__ = 'project'\n\n    def __init__(self, hosts, index='pyspider'):\n        self.index = index\n        self.es = Elasticsearch(hosts=hosts)\n\n        self.es.indices.create(index=self.index, ignore=400)\n        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):\n            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={\n                \"_all\": {\"enabled\": False},\n                \"properties\": {\n                    \"updatetime\": {\"type\": \"double\"}\n                }\n            })\n\n    def insert(self, name, obj={}):\n        obj = dict(obj)\n        obj['name'] = name\n        obj['updatetime'] = time.time()\n\n        obj.setdefault('group', '')\n        obj.setdefault('status', 'TODO')\n        obj.setdefault('script', '')\n        obj.setdefault('comments', '')\n        obj.setdefault('rate', 0)\n        obj.setdefault('burst', 0)\n\n        return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name,\n                             refresh=True)\n\n    def update(self, name, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        return self.es.update(index=self.index, doc_type=self.__type__,\n                              body={'doc': obj}, id=name, refresh=True, ignore=404)\n\n    def get_all(self, fields=None):\n        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,\n                                                 query={'query': {\"match_all\": 
{}}},\n                                                 _source_include=fields or []):\n            yield record['_source']\n\n    def get(self, name, fields=None):\n        ret = self.es.get(index=self.index, doc_type=self.__type__, id=name,\n                          _source_include=fields or [], ignore=404)\n        return ret.get('_source', None)\n\n    def check_update(self, timestamp, fields=None):\n        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,\n                                                 query={'query': {\"range\": {\n                                                     \"updatetime\": {\"gte\": timestamp}\n                                                 }}}, _source_include=fields or []):\n            yield record['_source']\n\n    def drop(self, name):\n        return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True)\n"
  },
  {
    "path": "pyspider/database/elasticsearch/resultdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2016-01-18 19:41:24\n\n\nimport time\n\nimport elasticsearch.helpers\nfrom elasticsearch import Elasticsearch\nfrom pyspider.database.base.resultdb import ResultDB as BaseResultDB\n\n\nclass ResultDB(BaseResultDB):\n    __type__ = 'result'\n\n    def __init__(self, hosts, index='pyspider'):\n        self.index = index\n        self.es = Elasticsearch(hosts=hosts)\n\n        self.es.indices.create(index=self.index, ignore=400)\n        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):\n            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={\n                \"_all\": {\"enabled\": True},\n                \"properties\": {\n                    \"taskid\": {\"enabled\": False},\n                    \"project\": {\"type\": \"string\", \"index\": \"not_analyzed\"},\n                    \"url\": {\"enabled\": False},\n                }\n            })\n\n    @property\n    def projects(self):\n        ret = self.es.search(index=self.index, doc_type=self.__type__,\n                             body={\"aggs\": {\"projects\": {\n                                 \"terms\": {\"field\": \"project\"}\n                             }}}, _source=False)\n        return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])]\n\n    def save(self, project, taskid, url, result):\n        obj = {\n            'taskid': taskid,\n            'project': project,\n            'url': url,\n            'result': result,\n            'updatetime': time.time(),\n        }\n        return self.es.index(index=self.index, doc_type=self.__type__,\n                             body=obj, id='%s:%s' % (project, taskid))\n\n    def select(self, project, fields=None, offset=0, limit=0):\n        offset = offset or 0\n        limit = limit 
or 0\n        if not limit:\n            for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,\n                                                     query={'query': {'term': {'project': project}}},\n                                                     _source_include=fields or [], from_=offset,\n                                                     sort=\"updatetime:desc\"):\n                yield record['_source']\n        else:\n            for record in self.es.search(index=self.index, doc_type=self.__type__,\n                                         body={'query': {'term': {'project': project}}},\n                                         _source_include=fields or [], from_=offset, size=limit,\n                                         sort=\"updatetime:desc\"\n                                         ).get('hits', {}).get('hits', []):\n                yield record['_source']\n\n    def count(self, project):\n        return self.es.count(index=self.index, doc_type=self.__type__,\n                             body={'query': {'term': {'project': project}}}\n                             ).get('count', 0)\n\n    def get(self, project, taskid, fields=None):\n        ret = self.es.get(index=self.index, doc_type=self.__type__, id=\"%s:%s\" % (project, taskid),\n                          _source_include=fields or [], ignore=404)\n        return ret.get('_source', None)\n\n    def drop(self, project):\n        self.refresh()\n        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,\n                                                 query={'query': {'term': {'project': project}}},\n                                                 _source=False):\n            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])\n\n    def refresh(self):\n        \"\"\"\n        Explicitly refresh one or more index, making all operations\n        performed since the last refresh available for 
search.\n        \"\"\"\n        self.es.indices.refresh(index=self.index)\n"
  },
  {
    "path": "pyspider/database/elasticsearch/taskdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2016-01-20 20:20:55\n\n\nimport time\nimport json\n\nimport elasticsearch.helpers\nfrom elasticsearch import Elasticsearch\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\n\n\nclass TaskDB(BaseTaskDB):\n    __type__ = 'task'\n\n    def __init__(self, hosts, index='pyspider'):\n        self.index = index\n        self._changed = False\n        self.es = Elasticsearch(hosts=hosts)\n\n        self.es.indices.create(index=self.index, ignore=400)\n        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):\n            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={\n                \"_all\": {\"enabled\": False},\n                \"properties\": {\n                    \"project\": {\"type\": \"string\", \"index\": \"not_analyzed\"},\n                    \"status\": {\"type\": \"byte\"},\n                }\n            })\n\n    def _parse(self, data):\n        if not data:\n            return data\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    data[each] = json.loads(data[each])\n                else:\n                    data[each] = {}\n        return data\n\n    def _stringify(self, data):\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                data[each] = json.dumps(data[each])\n        return data\n\n    @property\n    def projects(self):\n        ret = self.es.search(index=self.index, doc_type=self.__type__,\n                             body={\"aggs\": {\"projects\": {\n                                 \"terms\": {\"field\": \"project\"}\n                             }}}, _source=False)\n        return [each['key'] for each in 
ret['aggregations']['projects'].get('buckets', [])]\n\n    def load_tasks(self, status, project=None, fields=None):\n        self.refresh()\n        if project is None:\n            for project in self.projects:\n                for each in self.load_tasks(status, project, fields):\n                    yield each\n        else:\n            for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,\n                                                     query={'query': {'bool': {\n                                                         'must': {'term': {'project': project}},\n                                                         'should': [{'term': {'status': status}}],\n                                                         'minimum_should_match': 1,\n                                                     }}}, _source_include=fields or []):\n                yield self._parse(record['_source'])\n\n    def get_task(self, project, taskid, fields=None):\n        if self._changed:\n            self.refresh()\n        ret = self.es.get(index=self.index, doc_type=self.__type__, id=\"%s:%s\" % (project, taskid),\n                          _source_include=fields or [], ignore=404)\n        return self._parse(ret.get('_source', None))\n\n    def status_count(self, project):\n        self.refresh()\n        ret = self.es.search(index=self.index, doc_type=self.__type__,\n                             body={\"query\": {'term': {'project': project}},\n                                   \"aggs\": {\"status\": {\n                                       \"terms\": {\"field\": \"status\"}\n                                   }}}, _source=False)\n        result = {}\n        for each in ret['aggregations']['status'].get('buckets', []):\n            result[each['key']] = each['doc_count']\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        self._changed = True\n        obj = dict(obj)\n        obj['taskid'] = taskid\n      
  obj['project'] = project\n        obj['updatetime'] = time.time()\n        return self.es.index(index=self.index, doc_type=self.__type__,\n                             body=self._stringify(obj), id='%s:%s' % (project, taskid))\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        self._changed = True\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        return self.es.update(index=self.index, doc_type=self.__type__, id='%s:%s' % (project, taskid),\n                              body={\"doc\": self._stringify(obj)}, ignore=404)\n\n    def drop(self, project):\n        self.refresh()\n        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,\n                                                 query={'query': {'term': {'project': project}}},\n                                                 _source=False):\n            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])\n        self.refresh()\n\n    def refresh(self):\n        \"\"\"\n        Explicitly refresh one or more index, making all operations\n        performed since the last refresh available for search.\n        \"\"\"\n        self._changed = False\n        self.es.indices.refresh(index=self.index)\n"
  },
  {
    "path": "pyspider/database/local/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-17 20:56:50\n"
  },
  {
    "path": "pyspider/database/local/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-17 12:32:17\n\nimport os\nimport re\nimport six\nimport glob\nimport logging\n\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\n\n\nclass ProjectDB(BaseProjectDB):\n    \"\"\"ProjectDB loading scripts from local file.\"\"\"\n\n    def __init__(self, files):\n        self.files = files\n        self.projects = {}\n        self.load_scripts()\n\n    def load_scripts(self):\n        project_names = set(self.projects.keys())\n        for path in self.files:\n            for filename in glob.glob(path):\n                name = os.path.splitext(os.path.basename(filename))[0]\n                if name in project_names:\n                    project_names.remove(name)\n                updatetime = os.path.getmtime(filename)\n                if name not in self.projects or updatetime > self.projects[name]['updatetime']:\n                    project = self._build_project(filename)\n                    if not project:\n                        continue\n                    self.projects[project['name']] = project\n\n        for name in project_names:\n            del self.projects[name]\n\n    rate_re = re.compile(r'^\\s*#\\s*rate.*?(\\d+(\\.\\d+)?)', re.I | re.M)\n    burst_re = re.compile(r'^\\s*#\\s*burst.*?(\\d+(\\.\\d+)?)', re.I | re.M)\n\n    def _build_project(self, filename):\n        try:\n            with open(filename) as fp:\n                script = fp.read()\n            m = self.rate_re.search(script)\n            if m:\n                rate = float(m.group(1))\n            else:\n                rate = 1\n\n            m = self.burst_re.search(script)\n            if m:\n                burst = float(m.group(1))\n            else:\n                burst = 3\n\n            return {\n                'name': 
os.path.splitext(os.path.basename(filename))[0],\n                'group': None,\n                'status': 'RUNNING',\n                'script': script,\n                'comments': None,\n                'rate': rate,\n                'burst': burst,\n                'updatetime': os.path.getmtime(filename),\n            }\n        except OSError as e:\n            logging.error('loading project script error: %s', e)\n            return None\n\n    def get_all(self, fields=None):\n        for projectname in self.projects:\n            yield self.get(projectname, fields)\n\n    def get(self, name, fields=None):\n        if name not in self.projects:\n            return None\n        project = self.projects[name]\n        result = {}\n        for f in fields or project:\n            if f in project:\n                result[f] = project[f]\n            else:\n                result[f] = None\n        return result\n\n    def check_update(self, timestamp, fields=None):\n        self.load_scripts()\n        for projectname, project in six.iteritems(self.projects):\n            if project['updatetime'] > timestamp:\n                yield self.get(projectname, fields)\n"
  },
  {
    "path": "pyspider/database/mongodb/__init__.py",
    "content": ""
  },
  {
    "path": "pyspider/database/mongodb/mongodbbase.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-11-22 20:42:01\n\nimport time\n\n\nclass SplitTableMixin(object):\n    UPDATE_PROJECTS_TIME = 10 * 60\n\n    def _collection_name(self, project):\n        if self.collection_prefix:\n            return \"%s.%s\" % (self.collection_prefix, project)\n        else:\n            return project\n\n    @property\n    def projects(self):\n        if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME:\n            self._list_project()\n        return self._projects\n\n    @projects.setter\n    def projects(self, value):\n        self._projects = value\n\n    def _list_project(self):\n        self._last_update_projects = time.time()\n        self.projects = set()\n        if self.collection_prefix:\n            prefix = \"%s.\" % self.collection_prefix\n        else:\n            prefix = ''\n        for each in self.database.collection_names():\n            if each.startswith('system.'):\n                continue\n            if each.startswith(prefix):\n                self.projects.add(each[len(prefix):])\n\n    def drop(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        collection_name = self._collection_name(project)\n        self.database[collection_name].drop()\n        self._list_project()\n"
  },
  {
    "path": "pyspider/database/mongodb/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-12 12:22:42\n\nimport time\nfrom pymongo import MongoClient\n\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\n\n\nclass ProjectDB(BaseProjectDB):\n    __collection_name__ = 'projectdb'\n\n    def __init__(self, url, database='projectdb'):\n        self.conn = MongoClient(url)\n        self.conn.admin.command(\"ismaster\")\n        self.database = self.conn[database]\n        self.collection = self.database[self.__collection_name__]\n\n        self.collection.ensure_index('name', unique=True)\n\n    def _default_fields(self, each):\n        if each is None:\n            return each\n        each.setdefault('group', None)\n        each.setdefault('status', 'TODO')\n        each.setdefault('script', '')\n        each.setdefault('comments', None)\n        each.setdefault('rate', 0)\n        each.setdefault('burst', 0)\n        each.setdefault('updatetime', 0)\n        return each\n\n    def insert(self, name, obj={}):\n        obj = dict(obj)\n        obj['name'] = name\n        obj['updatetime'] = time.time()\n        return self.collection.update({'name': name}, {'$set': obj}, upsert=True)\n\n    def update(self, name, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        return self.collection.update({'name': name}, {'$set': obj})\n\n    def get_all(self, fields=None):\n        for each in self.collection.find({}, fields):\n            if each and '_id' in each:\n                del each['_id']\n            yield self._default_fields(each)\n\n    def get(self, name, fields=None):\n        each = self.collection.find_one({'name': name}, fields)\n        if each and '_id' in each:\n            del each['_id']\n        return self._default_fields(each)\n\n    def check_update(self, 
timestamp, fields=None):\n        for project in self.get_all(fields=('updatetime', 'name')):\n            if project['updatetime'] > timestamp:\n                project = self.get(project['name'], fields)\n                yield self._default_fields(project)\n\n    def drop(self, name):\n        return self.collection.remove({'name': name})\n"
  },
  {
    "path": "pyspider/database/mongodb/resultdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-13 22:18:36\n\nimport json\nimport time\n\nfrom pymongo import MongoClient\n\nfrom pyspider.database.base.resultdb import ResultDB as BaseResultDB\nfrom .mongodbbase import SplitTableMixin\n\n\nclass ResultDB(SplitTableMixin, BaseResultDB):\n    collection_prefix = ''\n\n    def __init__(self, url, database='resultdb'):\n        self.conn = MongoClient(url)\n        self.conn.admin.command(\"ismaster\")\n        self.database = self.conn[database]\n        self.projects = set()\n\n        self._list_project()\n        # we suggest manually build index in advance, instead of indexing\n        #  in the startup process,\n        # for project in self.projects:\n        #     collection_name = self._collection_name(project)\n        #     self.database[collection_name].ensure_index('taskid')\n        pass\n\n    def _create_project(self, project):\n        collection_name = self._collection_name(project)\n        self.database[collection_name].ensure_index('taskid')\n        self._list_project()\n\n    def _parse(self, data):\n        data['_id'] = str(data['_id'])\n        if 'result' in data:\n            data['result'] = json.loads(data['result'])\n        return data\n\n    def _stringify(self, data):\n        if 'result' in data:\n            data['result'] = json.dumps(data['result'])\n        return data\n\n    def save(self, project, taskid, url, result):\n        if project not in self.projects:\n            self._create_project(project)\n        collection_name = self._collection_name(project)\n        obj = {\n            'taskid'    : taskid,\n            'url'       : url,\n            'result'    : result,\n            'updatetime': time.time(),\n        }\n        return self.database[collection_name].update(\n            {'taskid': taskid}, {\"$set\": 
self._stringify(obj)}, upsert=True\n        )\n\n    def select(self, project, fields=None, offset=0, limit=0):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        offset = offset or 0\n        limit = limit or 0\n        collection_name = self._collection_name(project)\n        for result in self.database[collection_name].find({}, fields, skip=offset, limit=limit):\n            yield self._parse(result)\n\n    def count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        collection_name = self._collection_name(project)\n        return self.database[collection_name].count()\n\n    def get(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        collection_name = self._collection_name(project)\n        ret = self.database[collection_name].find_one({'taskid': taskid}, fields)\n        if not ret:\n            return ret\n        return self._parse(ret)\n"
  },
  {
    "path": "pyspider/database/mongodb/taskdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-11 23:54:50\n\nimport json\nimport time\n\nfrom pymongo import MongoClient\n\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\nfrom .mongodbbase import SplitTableMixin\n\n\nclass TaskDB(SplitTableMixin, BaseTaskDB):\n    collection_prefix = ''\n\n    def __init__(self, url, database='taskdb'):\n        self.conn = MongoClient(url)\n        self.conn.admin.command(\"ismaster\")\n        self.database = self.conn[database]\n        self.projects = set()\n\n        self._list_project()\n        # we suggest manually build index in advance, instead of indexing\n        #  in the startup process,\n        # for project in self.projects:\n        #     collection_name = self._collection_name(project)\n        #     self.database[collection_name].ensure_index('status')\n        #     self.database[collection_name].ensure_index('taskid')\n\n    def _create_project(self, project):\n        collection_name = self._collection_name(project)\n        self.database[collection_name].ensure_index('status')\n        self.database[collection_name].ensure_index('taskid')\n        self._list_project()\n\n    def _parse(self, data):\n        if '_id' in data:\n            del data['_id']\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    if isinstance(data[each], bytearray):\n                        data[each] = str(data[each])\n                    data[each] = json.loads(data[each], encoding='utf8')\n                else:\n                    data[each] = {}\n        return data\n\n    def _stringify(self, data):\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                data[each] = json.dumps(data[each])\n        return data\n\n    
def load_tasks(self, status, project=None, fields=None):\n        if not project:\n            self._list_project()\n\n        if project:\n            projects = [project, ]\n        else:\n            projects = self.projects\n\n        for project in projects:\n            collection_name = self._collection_name(project)\n            for task in self.database[collection_name].find({'status': status}, fields):\n                yield self._parse(task)\n\n    def get_task(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        collection_name = self._collection_name(project)\n        ret = self.database[collection_name].find_one({'taskid': taskid}, fields)\n        if not ret:\n            return ret\n        return self._parse(ret)\n\n    def status_count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return {}\n        collection_name = self._collection_name(project)\n\n        # when the task collection holds a lot of data, the aggregate operation takes a very long time,\n        #  and this makes scheduler module startup particularly slow\n\n        # ret = self.database[collection_name].aggregate([\n        #     {'$group': {\n        #         '_id'  : '$status',\n        #         'total': {\n        #             '$sum': 1\n        #         }\n        #     }\n        #     }])\n\n        # Instead of aggregate, use find-count on the indexed status field.\n        def _count_for_status(collection, status):\n            total = collection.find({'status': status}).count()\n            return {'total': total, \"_id\": status} if total else None\n\n        c = self.database[collection_name]\n        ret = filter(\n            lambda x: x,\n            map(\n                lambda s: _count_for_status(c, s), [self.ACTIVE, 
self.SUCCESS, self.FAILED]\n            )\n        )\n\n        result = {}\n        if isinstance(ret, dict):\n            ret = ret.get('result', [])\n        for each in ret:\n            result[each['_id']] = each['total']\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        if project not in self.projects:\n            self._create_project(project)\n        obj = dict(obj)\n        obj['taskid'] = taskid\n        obj['project'] = project\n        obj['updatetime'] = time.time()\n        return self.update(project, taskid, obj=obj)\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        collection_name = self._collection_name(project)\n        return self.database[collection_name].update(\n            {'taskid': taskid},\n            {\"$set\": self._stringify(obj)},\n            upsert=True\n        )\n"
  },
  {
    "path": "pyspider/database/mysql/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-07-17 20:12:54\n"
  },
  {
    "path": "pyspider/database/mysql/mysqlbase.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-11-05 10:42:24\n\nimport time\nimport mysql.connector\n\n\nclass MySQLMixin(object):\n    maxlimit = 18446744073709551615\n\n    @property\n    def dbcur(self):\n        try:\n            if self.conn.unread_result:\n                self.conn.get_rows()\n                if hasattr(self.conn, 'free_result'):\n                    self.conn.free_result()\n            return self.conn.cursor()\n        except (mysql.connector.OperationalError, mysql.connector.InterfaceError):\n            self.conn.ping(reconnect=True)\n            self.conn.database = self.database_name\n            return self.conn.cursor()\n\n\nclass SplitTableMixin(object):\n    UPDATE_PROJECTS_TIME = 10 * 60\n\n    def _tablename(self, project):\n        if self.__tablename__:\n            return '%s_%s' % (self.__tablename__, project)\n        else:\n            return project\n\n    @property\n    def projects(self):\n        if time.time() - getattr(self, '_last_update_projects', 0) \\\n                > self.UPDATE_PROJECTS_TIME:\n            self._list_project()\n        return self._projects\n\n    @projects.setter\n    def projects(self, value):\n        self._projects = value\n\n    def _list_project(self):\n        self._last_update_projects = time.time()\n        self.projects = set()\n        if self.__tablename__:\n            prefix = '%s_' % self.__tablename__\n        else:\n            prefix = ''\n        for project, in self._execute('show tables;'):\n            if project.startswith(prefix):\n                project = project[len(prefix):]\n                self.projects.add(project)\n\n    def drop(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        tablename = 
self._tablename(project)\n        self._execute(\"DROP TABLE %s\" % self.escape(tablename))\n        self._list_project()\n"
  },
  {
    "path": "pyspider/database/mysql/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-07-17 21:06:43\n\nimport time\nimport mysql.connector\n\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\nfrom pyspider.database.basedb import BaseDB\nfrom .mysqlbase import MySQLMixin\n\n\nclass ProjectDB(MySQLMixin, BaseProjectDB, BaseDB):\n    __tablename__ = 'projectdb'\n\n    def __init__(self, host='localhost', port=3306, database='projectdb',\n                 user='root', passwd=None):\n        self.database_name = database\n        self.conn = mysql.connector.connect(user=user, password=passwd,\n                                            host=host, port=port, autocommit=True)\n        if database not in [x[0] for x in self._execute('show databases')]:\n            self._execute('CREATE DATABASE %s' % self.escape(database))\n        self.conn.database = database\n\n        self._execute('''CREATE TABLE IF NOT EXISTS %s (\n            `name` varchar(64) PRIMARY KEY,\n            `group` varchar(64),\n            `status` varchar(16),\n            `script` TEXT,\n            `comments` varchar(1024),\n            `rate` float(11, 4),\n            `burst` float(11, 4),\n            `updatetime` double(16, 4)\n            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__))\n\n    def insert(self, name, obj={}):\n        obj = dict(obj)\n        obj['name'] = name\n        obj['updatetime'] = time.time()\n        return self._insert(**obj)\n\n    def update(self, name, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        ret = self._update(where=\"`name` = %s\" % self.placeholder, where_values=(name, ), **obj)\n        return ret.rowcount\n\n    def get_all(self, fields=None):\n        return self._select2dic(what=fields)\n\n    def get(self, name, fields=None):\n      
  where = \"`name` = %s\" % self.placeholder\n        for each in self._select2dic(what=fields, where=where, where_values=(name, )):\n            return each\n        return None\n\n    def drop(self, name):\n        where = \"`name` = %s\" % self.placeholder\n        return self._delete(where=where, where_values=(name, ))\n\n    def check_update(self, timestamp, fields=None):\n        where = \"`updatetime` >= %f\" % timestamp\n        return self._select2dic(what=fields, where=where)\n"
  },
  {
    "path": "pyspider/database/mysql/resultdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-13 22:02:57\n\nimport re\nimport six\nimport time\nimport json\nimport mysql.connector\n\nfrom pyspider.libs import utils\nfrom pyspider.database.base.resultdb import ResultDB as BaseResultDB\nfrom pyspider.database.basedb import BaseDB\nfrom .mysqlbase import MySQLMixin, SplitTableMixin\n\n\nclass ResultDB(MySQLMixin, SplitTableMixin, BaseResultDB, BaseDB):\n    __tablename__ = ''\n\n    def __init__(self, host='localhost', port=3306, database='resultdb',\n                 user='root', passwd=None):\n        self.database_name = database\n        self.conn = mysql.connector.connect(user=user, password=passwd,\n                                            host=host, port=port, autocommit=True)\n        if database not in [x[0] for x in self._execute('show databases')]:\n            self._execute('CREATE DATABASE %s' % self.escape(database))\n        self.conn.database = database\n        self._list_project()\n\n    def _create_project(self, project):\n        assert re.match(r'^\\w+$', project) is not None\n        tablename = self._tablename(project)\n        if tablename in [x[0] for x in self._execute('show tables')]:\n            return\n        self._execute('''CREATE TABLE %s (\n            `taskid` varchar(64) PRIMARY KEY,\n            `url` varchar(1024),\n            `result` MEDIUMBLOB,\n            `updatetime` double(16, 4)\n            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename))\n\n    def _parse(self, data):\n        for key, value in list(six.iteritems(data)):\n            if isinstance(value, (bytearray, six.binary_type)):\n                data[key] = utils.text(value)\n        if 'result' in data:\n            data['result'] = json.loads(data['result'])\n        return data\n\n    def _stringify(self, data):\n        if 'result' in data:\n        
    data['result'] = json.dumps(data['result'])\n        return data\n\n    def save(self, project, taskid, url, result):\n        tablename = self._tablename(project)\n        if project not in self.projects:\n            self._create_project(project)\n            self._list_project()\n        obj = {\n            'taskid': taskid,\n            'url': url,\n            'result': result,\n            'updatetime': time.time(),\n        }\n        return self._replace(tablename, **self._stringify(obj))\n\n    def select(self, project, fields=None, offset=0, limit=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        tablename = self._tablename(project)\n\n        for task in self._select2dic(tablename, what=fields, order='updatetime DESC',\n                                     offset=offset, limit=limit):\n            yield self._parse(task)\n\n    def count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return 0\n        tablename = self._tablename(project)\n        for count, in self._execute(\"SELECT count(1) FROM %s\" % self.escape(tablename)):\n            return count\n\n    def get(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        tablename = self._tablename(project)\n        where = \"`taskid` = %s\" % self.placeholder\n        for task in self._select2dic(tablename, what=fields,\n                                     where=where, where_values=(taskid, )):\n            return self._parse(task)\n"
  },
  {
    "path": "pyspider/database/mysql/taskdb.py",
    "content": "#!/usr/bin/envutils\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-07-17 18:53:01\n\n\nimport re\nimport six\nimport time\nimport json\nimport mysql.connector\n\nfrom pyspider.libs import utils\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\nfrom pyspider.database.basedb import BaseDB\nfrom .mysqlbase import MySQLMixin, SplitTableMixin\n\n\nclass TaskDB(MySQLMixin, SplitTableMixin, BaseTaskDB, BaseDB):\n    __tablename__ = ''\n\n    def __init__(self, host='localhost', port=3306, database='taskdb',\n                 user='root', passwd=None):\n        self.database_name = database\n        self.conn = mysql.connector.connect(user=user, password=passwd,\n                                            host=host, port=port, autocommit=True)\n        if database not in [x[0] for x in self._execute('show databases')]:\n            self._execute('CREATE DATABASE %s' % self.escape(database))\n        self.conn.database = database\n        self._list_project()\n\n    def _create_project(self, project):\n        assert re.match(r'^\\w+$', project) is not None\n        tablename = self._tablename(project)\n        if tablename in [x[0] for x in self._execute('show tables')]:\n            return\n        self._execute('''CREATE TABLE IF NOT EXISTS %s (\n            `taskid` varchar(64) PRIMARY KEY,\n            `project` varchar(64),\n            `url` varchar(1024),\n            `status` int(1),\n            `schedule` BLOB,\n            `fetch` BLOB,\n            `process` BLOB,\n            `track` BLOB,\n            `lastcrawltime` double(16, 4),\n            `updatetime` double(16, 4),\n            INDEX `status_index` (`status`)\n            ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename))\n\n    def _parse(self, data):\n        for key, value in list(six.iteritems(data)):\n            if isinstance(value, (bytearray, 
six.binary_type)):\n                data[key] = utils.text(value)\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    data[each] = json.loads(data[each])\n                else:\n                    data[each] = {}\n        return data\n\n    def _stringify(self, data):\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                data[each] = json.dumps(data[each])\n        return data\n\n    def load_tasks(self, status, project=None, fields=None):\n        if project and project not in self.projects:\n            return\n        where = \"`status` = %s\" % self.placeholder\n\n        if project:\n            projects = [project, ]\n        else:\n            projects = self.projects\n\n        for project in projects:\n            tablename = self._tablename(project)\n            for each in self._select2dic(\n                tablename, what=fields, where=where, where_values=(status, )\n            ):\n                yield self._parse(each)\n\n    def get_task(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return None\n        where = \"`taskid` = %s\" % self.placeholder\n        tablename = self._tablename(project)\n        for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )):\n            return self._parse(each)\n        return None\n\n    def status_count(self, project):\n        result = dict()\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return result\n        tablename = self._tablename(project)\n        for status, count in self._execute(\"SELECT `status`, count(1) FROM %s GROUP BY `status`\" %\n                                           self.escape(tablename)):\n           
 result[status] = count\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            self._create_project(project)\n            self._list_project()\n        obj = dict(obj)\n        obj['taskid'] = taskid\n        obj['project'] = project\n        obj['updatetime'] = time.time()\n        tablename = self._tablename(project)\n        return self._insert(tablename, **self._stringify(obj))\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            raise LookupError\n        tablename = self._tablename(project)\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        return self._update(\n            tablename,\n            where=\"`taskid` = %s\" % self.placeholder,\n            where_values=(taskid, ),\n            **self._stringify(obj)\n        )\n"
  },
  {
    "path": "pyspider/database/redis/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-05-17 01:34:21\n\n"
  },
  {
    "path": "pyspider/database/redis/taskdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-05-16 21:01:52\n\nimport six\nimport time\nimport json\nimport redis\nimport logging\nimport itertools\n\nfrom pyspider.libs import utils\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\n\n\nclass TaskDB(BaseTaskDB):\n    UPDATE_PROJECTS_TIME = 10 * 60\n    __prefix__ = 'taskdb_'\n\n    def __init__(self, host='localhost', port=6379, db=0):\n        self.redis = redis.StrictRedis(host=host, port=port, db=db)\n        try:\n            self.redis.scan(count=1)\n            self.scan_available = True\n        except Exception as e:\n            logging.debug(\"redis_scan disabled: %r\", e)\n            self.scan_available = False\n\n    def _gen_key(self, project, taskid):\n        return \"%s%s_%s\" % (self.__prefix__, project, taskid)\n\n    def _gen_status_key(self, project, status):\n        return '%s%s_status_%d' % (self.__prefix__, project, status)\n\n    def _parse(self, data):\n        if six.PY3:\n            result = {}\n            for key, value in data.items():\n                if isinstance(value, bytes):\n                    value = utils.text(value)\n                result[utils.text(key)] = value\n            data = result\n\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    data[each] = json.loads(data[each])\n                else:\n                    data[each] = {}\n        if 'status' in data:\n            data['status'] = int(data['status'])\n        if 'lastcrawltime' in data:\n            data['lastcrawltime'] = float(data['lastcrawltime'] or 0)\n        if 'updatetime' in data:\n            data['updatetime'] = float(data['updatetime'] or 0)\n        return data\n\n    def _stringify(self, data):\n        for each in ('schedule', 'fetch', 
'process', 'track'):\n            if each in data:\n                data[each] = json.dumps(data[each])\n        return data\n\n    @property\n    def projects(self):\n        if time.time() - getattr(self, '_last_update_projects', 0) \\\n                > self.UPDATE_PROJECTS_TIME:\n            self._projects = set(utils.text(x) for x in self.redis.smembers(\n                self.__prefix__ + 'projects'))\n        return self._projects\n\n    def load_tasks(self, status, project=None, fields=None):\n        if project is None:\n            project = self.projects\n        elif not isinstance(project, list):\n            project = [project, ]\n\n        if self.scan_available:\n            scan_method = self.redis.sscan_iter\n        else:\n            scan_method = self.redis.smembers\n\n        if fields:\n            def get_method(key):\n                obj = self.redis.hmget(key, fields)\n                if all(x is None for x in obj):\n                    return None\n                return dict(zip(fields, obj))\n        else:\n            get_method = self.redis.hgetall\n\n        for p in project:\n            status_key = self._gen_status_key(p, status)\n            for taskid in scan_method(status_key):\n                obj = get_method(self._gen_key(p, utils.text(taskid)))\n                if not obj:\n                    #self.redis.srem(status_key, taskid)\n                    continue\n                else:\n                    yield self._parse(obj)\n\n    def get_task(self, project, taskid, fields=None):\n        if fields:\n            obj = self.redis.hmget(self._gen_key(project, taskid), fields)\n            if all(x is None for x in obj):\n                return None\n            obj = dict(zip(fields, obj))\n        else:\n            obj = self.redis.hgetall(self._gen_key(project, taskid))\n\n        if not obj:\n            return None\n        return self._parse(obj)\n\n    def status_count(self, project):\n        '''\n        return a 
dict\n        '''\n        pipe = self.redis.pipeline(transaction=False)\n        for status in range(1, 5):\n            pipe.scard(self._gen_status_key(project, status))\n        ret = pipe.execute()\n\n        result = {}\n        for status, count in enumerate(ret):\n            if count > 0:\n                result[status + 1] = count\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        obj = dict(obj)\n        obj['taskid'] = taskid\n        obj['project'] = project\n        obj['updatetime'] = time.time()\n        obj.setdefault('status', self.ACTIVE)\n\n        task_key = self._gen_key(project, taskid)\n\n        pipe = self.redis.pipeline(transaction=False)\n        if project not in self.projects:\n            pipe.sadd(self.__prefix__ + 'projects', project)\n        pipe.hmset(task_key, self._stringify(obj))\n        pipe.sadd(self._gen_status_key(project, obj['status']), taskid)\n        pipe.execute()\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n\n        pipe = self.redis.pipeline(transaction=False)\n        pipe.hmset(self._gen_key(project, taskid), self._stringify(obj))\n        if 'status' in obj:\n            for status in range(1, 5):\n                if status == obj['status']:\n                    pipe.sadd(self._gen_status_key(project, status), taskid)\n                else:\n                    pipe.srem(self._gen_status_key(project, status), taskid)\n        pipe.execute()\n\n    def drop(self, project):\n        self.redis.srem(self.__prefix__ + 'projects', project)\n\n        if self.scan_available:\n            scan_method = self.redis.scan_iter\n        else:\n            scan_method = self.redis.keys\n\n        for each in itertools.tee(scan_method(\"%s%s_*\" % (self.__prefix__, project)), 100):\n            each = list(each)\n            if each:\n                self.redis.delete(*each)\n"
  },
  {
    "path": "pyspider/database/sqlalchemy/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-04 20:11:04\n\n"
  },
  {
    "path": "pyspider/database/sqlalchemy/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-04 23:25:10\n\nimport six\nimport time\nimport sqlalchemy.exc\n\nfrom sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text\nfrom sqlalchemy.engine.url import make_url\nfrom pyspider.libs import utils\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\nfrom .sqlalchemybase import result2dict\n\n\nclass ProjectDB(BaseProjectDB):\n    __tablename__ = 'projectdb'\n\n    def __init__(self, url):\n        self.table = Table(self.__tablename__, MetaData(),\n                           Column('name', String(64), primary_key=True),\n                           Column('group', String(64)),\n                           Column('status', String(16)),\n                           Column('script', Text),\n                           Column('comments', String(1024)),\n                           Column('rate', Float(11)),\n                           Column('burst', Float(11)),\n                           Column('updatetime', Float(32)),\n                           mysql_engine='InnoDB',\n                           mysql_charset='utf8'\n                           )\n\n        self.url = make_url(url)\n        if self.url.database:\n            database = self.url.database\n            self.url.database = None\n            try:\n                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)\n                conn = engine.connect()\n                conn.execute(\"commit\")\n                conn.execute(\"CREATE DATABASE %s\" % database)\n            except sqlalchemy.exc.SQLAlchemyError:\n                pass\n            self.url.database = database\n        self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600)\n        self.table.create(self.engine, checkfirst=True)\n\n    @staticmethod\n    def 
_parse(data):\n        return data\n\n    @staticmethod\n    def _stringify(data):\n        return data\n\n    def insert(self, name, obj={}):\n        obj = dict(obj)\n        obj['name'] = name\n        obj['updatetime'] = time.time()\n        return self.engine.execute(self.table.insert()\n                                   .values(**self._stringify(obj)))\n\n    def update(self, name, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        return self.engine.execute(self.table.update()\n                                   .where(self.table.c.name == name)\n                                   .values(**self._stringify(obj)))\n\n    def get_all(self, fields=None):\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for task in self.engine.execute(self.table.select()\n                                        .with_only_columns(columns)):\n            yield self._parse(result2dict(columns, task))\n\n    def get(self, name, fields=None):\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for task in self.engine.execute(self.table.select()\n                                        .where(self.table.c.name == name)\n                                        .limit(1)\n                                        .with_only_columns(columns)):\n            return self._parse(result2dict(columns, task))\n\n    def drop(self, name):\n        return self.engine.execute(self.table.delete()\n                                   .where(self.table.c.name == name))\n\n    def check_update(self, timestamp, fields=None):\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for task in self.engine.execute(self.table.select()\n                                        .with_only_columns(columns)\n                                        .where(self.table.c.updatetime >= timestamp)):\n  
          yield self._parse(result2dict(columns, task))\n"
  },
  {
    "path": "pyspider/database/sqlalchemy/resultdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-04 18:48:15\n\nimport re\nimport six\nimport time\nimport json\nimport sqlalchemy.exc\n\nfrom sqlalchemy import (create_engine, MetaData, Table, Column,\n                        String, Float, Text)\nfrom sqlalchemy.engine.url import make_url\nfrom pyspider.database.base.resultdb import ResultDB as BaseResultDB\nfrom pyspider.libs import utils\nfrom .sqlalchemybase import SplitTableMixin, result2dict\n\n\nclass ResultDB(SplitTableMixin, BaseResultDB):\n    __tablename__ = ''\n\n    def __init__(self, url):\n        self.table = Table('__tablename__', MetaData(),\n                           Column('taskid', String(64), primary_key=True, nullable=False),\n                           Column('url', String(1024)),\n                           Column('result', Text()),\n                           Column('updatetime', Float(32)),\n                           mysql_engine='InnoDB',\n                           mysql_charset='utf8'\n                           )\n\n        self.url = make_url(url)\n        if self.url.database:\n            database = self.url.database\n            self.url.database = None\n            try:\n                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)\n                conn = engine.connect()\n                conn.execute(\"commit\")\n                conn.execute(\"CREATE DATABASE %s\" % database)\n            except sqlalchemy.exc.SQLAlchemyError:\n                pass\n            self.url.database = database\n        self.engine = create_engine(url, convert_unicode=True,\n                                    pool_recycle=3600)\n\n        self._list_project()\n\n    def _create_project(self, project):\n        assert re.match(r'^\\w+$', project) is not None\n        if project in self.projects:\n            return\n        
self.table.name = self._tablename(project)\n        self.table.create(self.engine)\n\n    @staticmethod\n    def _parse(data):\n        for key, value in list(six.iteritems(data)):\n            if isinstance(value, six.binary_type):\n                data[key] = utils.text(value)\n        if 'result' in data:\n            if data['result']:\n                data['result'] = json.loads(data['result'])\n            else:\n                data['result'] = {}\n        return data\n\n    @staticmethod\n    def _stringify(data):\n        if 'result' in data:\n            if data['result']:\n                data['result'] = json.dumps(data['result'])\n            else:\n                data['result'] = json.dumps({})\n        return data\n\n    def save(self, project, taskid, url, result):\n        if project not in self.projects:\n            self._create_project(project)\n            self._list_project()\n        self.table.name = self._tablename(project)\n        obj = {\n            'taskid': taskid,\n            'url': url,\n            'result': result,\n            'updatetime': time.time(),\n        }\n        if self.get(project, taskid, ('taskid', )):\n            del obj['taskid']\n            return self.engine.execute(self.table.update()\n                                       .where(self.table.c.taskid == taskid)\n                                       .values(**self._stringify(obj)))\n        else:\n            return self.engine.execute(self.table.insert()\n                                       .values(**self._stringify(obj)))\n\n    def select(self, project, fields=None, offset=0, limit=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        self.table.name = self._tablename(project)\n\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for task in self.engine.execute(self.table.select()\n                      
                  .with_only_columns(columns=columns)\n                                        .order_by(self.table.c.updatetime.desc())\n                                        .offset(offset).limit(limit)\n                                        .execution_options(autocommit=True)):\n            yield self._parse(result2dict(columns, task))\n\n    def count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return 0\n        self.table.name = self._tablename(project)\n\n        for count, in self.engine.execute(self.table.count()):\n            return count\n\n    def get(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        self.table.name = self._tablename(project)\n\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for task in self.engine.execute(self.table.select()\n                                        .with_only_columns(columns=columns)\n                                        .where(self.table.c.taskid == taskid)\n                                        .limit(1)):\n            return self._parse(result2dict(columns, task))\n"
  },
  {
    "path": "pyspider/database/sqlalchemy/sqlalchemybase.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-04 18:48:47\n\nimport time\n\n\ndef result2dict(columns, task):\n    return dict(task)\n\n\nclass SplitTableMixin(object):\n    UPDATE_PROJECTS_TIME = 10 * 60\n\n    def _tablename(self, project):\n        if self.__tablename__:\n            return '%s_%s' % (self.__tablename__, project)\n        else:\n            return project\n\n    @property\n    def projects(self):\n        if time.time() - getattr(self, '_last_update_projects', 0) \\\n                > self.UPDATE_PROJECTS_TIME:\n            self._list_project()\n        return self._projects\n\n    @projects.setter\n    def projects(self, value):\n        self._projects = value\n\n    def _list_project(self):\n        self._last_update_projects = time.time()\n        self.projects = set()\n        if self.__tablename__:\n            prefix = '%s_' % self.__tablename__\n        else:\n            prefix = ''\n\n        for project in self.engine.table_names():\n            if project.startswith(prefix):\n                project = project[len(prefix):]\n                self.projects.add(project)\n\n    def drop(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        self.table.name = self._tablename(project)\n        self.table.drop(self.engine)\n        self._list_project()\n"
  },
  {
    "path": "pyspider/database/sqlalchemy/taskdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-04 22:33:43\n\nimport re\nimport six\nimport time\nimport json\nimport sqlalchemy.exc\n\nfrom sqlalchemy import (create_engine, MetaData, Table, Column, Index,\n                        Integer, String, Float, Text, func)\nfrom sqlalchemy.engine.url import make_url\nfrom pyspider.libs import utils\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\nfrom .sqlalchemybase import SplitTableMixin, result2dict\n\n\nclass TaskDB(SplitTableMixin, BaseTaskDB):\n    __tablename__ = ''\n\n    def __init__(self, url):\n        self.table = Table('__tablename__', MetaData(),\n                           Column('taskid', String(64), primary_key=True, nullable=False),\n                           Column('project', String(64)),\n                           Column('url', String(1024)),\n                           Column('status', Integer),\n                           Column('schedule', Text()),\n                           Column('fetch', Text()),\n                           Column('process', Text()),\n                           Column('track', Text()),\n                           Column('lastcrawltime', Float(32)),\n                           Column('updatetime', Float(32)),\n                           mysql_engine='InnoDB',\n                           mysql_charset='utf8'\n                           )\n\n        self.url = make_url(url)\n        if self.url.database:\n            database = self.url.database\n            self.url.database = None\n            try:\n                engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600)\n                conn = engine.connect()\n                conn.execute(\"commit\")\n                conn.execute(\"CREATE DATABASE %s\" % database)\n            except sqlalchemy.exc.SQLAlchemyError:\n                pass\n          
  self.url.database = database\n        self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600)\n\n        self._list_project()\n\n    def _create_project(self, project):\n        assert re.match(r'^\\w+$', project) is not None\n        if project in self.projects:\n            return\n        self.table.name = self._tablename(project)\n        Index('status_%s_index' % self.table.name, self.table.c.status)\n        self.table.create(self.engine, checkfirst=True)\n        self.table.indexes.clear()\n\n    @staticmethod\n    def _parse(data):\n        for key, value in list(six.iteritems(data)):\n            if isinstance(value, six.binary_type):\n                data[key] = utils.text(value)\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    data[each] = json.loads(data[each])\n                else:\n                    data[each] = {}\n        return data\n\n    @staticmethod\n    def _stringify(data):\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    data[each] = json.dumps(data[each])\n                else:\n                    data[each] = json.dumps({})\n        return data\n\n    def load_tasks(self, status, project=None, fields=None):\n        if project and project not in self.projects:\n            return\n\n        if project:\n            projects = [project, ]\n        else:\n            projects = self.projects\n\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for project in projects:\n            self.table.name = self._tablename(project)\n            for task in self.engine.execute(self.table.select()\n                                            .with_only_columns(columns)\n                                            .where(self.table.c.status == status)):\n                yield 
self._parse(result2dict(columns, task))\n\n    def get_task(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return None\n\n        self.table.name = self._tablename(project)\n        columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c\n        for each in self.engine.execute(self.table.select()\n                                        .with_only_columns(columns)\n                                        .limit(1)\n                                        .where(self.table.c.taskid == taskid)):\n            return self._parse(result2dict(columns, each))\n\n    def status_count(self, project):\n        result = dict()\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return result\n\n        self.table.name = self._tablename(project)\n        for status, count in self.engine.execute(\n                self.table.select()\n                .with_only_columns((self.table.c.status, func.count(1)))\n                .group_by(self.table.c.status)):\n            result[status] = count\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            self._create_project(project)\n            self._list_project()\n        obj = dict(obj)\n        obj['taskid'] = taskid\n        obj['project'] = project\n        obj['updatetime'] = time.time()\n        self.table.name = self._tablename(project)\n        return self.engine.execute(self.table.insert()\n                                   .values(**self._stringify(obj)))\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            raise 
LookupError\n        self.table.name = self._tablename(project)\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        return self.engine.execute(self.table.update()\n                                   .where(self.table.c.taskid == taskid)\n                                   .values(**self._stringify(obj)))\n"
  },
  {
    "path": "pyspider/database/sqlite/__init__.py",
    "content": ""
  },
  {
    "path": "pyspider/database/sqlite/projectdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-09 12:05:52\n\nimport time\n\nfrom .sqlitebase import SQLiteMixin\nfrom pyspider.database.base.projectdb import ProjectDB as BaseProjectDB\nfrom pyspider.database.basedb import BaseDB\n\n\nclass ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB):\n    __tablename__ = 'projectdb'\n    placeholder = '?'\n\n    def __init__(self, path):\n        self.path = path\n        self.last_pid = 0\n        self.conn = None\n        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (\n                name PRIMARY KEY,\n                `group`,\n                status, script, comments,\n                rate, burst, updatetime\n                )''' % self.__tablename__)\n\n    def insert(self, name, obj={}):\n        obj = dict(obj)\n        obj['name'] = name\n        obj['updatetime'] = time.time()\n        return self._insert(**obj)\n\n    def update(self, name, obj={}, **kwargs):\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n        ret = self._update(where=\"`name` = %s\" % self.placeholder, where_values=(name, ), **obj)\n        return ret.rowcount\n\n    def get_all(self, fields=None):\n        return self._select2dic(what=fields)\n\n    def get(self, name, fields=None):\n        where = \"`name` = %s\" % self.placeholder\n        for each in self._select2dic(what=fields, where=where, where_values=(name, )):\n            return each\n        return None\n\n    def check_update(self, timestamp, fields=None):\n        where = \"`updatetime` >= %f\" % timestamp\n        return self._select2dic(what=fields, where=where)\n\n    def drop(self, name):\n        where = \"`name` = %s\" % self.placeholder\n        return self._delete(where=where, where_values=(name, ))\n"
  },
  {
    "path": "pyspider/database/sqlite/resultdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-13 17:08:43\n\nimport re\nimport time\nimport json\n\nfrom .sqlitebase import SQLiteMixin, SplitTableMixin\nfrom pyspider.database.base.resultdb import ResultDB as BaseResultDB\nfrom pyspider.database.basedb import BaseDB\n\n\nclass ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB):\n    __tablename__ = 'resultdb'\n    placeholder = '?'\n\n    def __init__(self, path):\n        self.path = path\n        self.last_pid = 0\n        self.conn = None\n        self._list_project()\n\n    def _create_project(self, project):\n        assert re.match(r'^\\w+$', project) is not None\n        tablename = self._tablename(project)\n        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (\n                taskid PRIMARY KEY,\n                url,\n                result,\n                updatetime\n                )''' % tablename)\n\n    def _parse(self, data):\n        if 'result' in data:\n            data['result'] = json.loads(data['result'])\n        return data\n\n    def _stringify(self, data):\n        if 'result' in data:\n            data['result'] = json.dumps(data['result'])\n        return data\n\n    def save(self, project, taskid, url, result):\n        tablename = self._tablename(project)\n        if project not in self.projects:\n            self._create_project(project)\n            self._list_project()\n        obj = {\n            'taskid': taskid,\n            'url': url,\n            'result': result,\n            'updatetime': time.time(),\n        }\n        return self._replace(tablename, **self._stringify(obj))\n\n    def select(self, project, fields=None, offset=0, limit=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        tablename = 
self._tablename(project)\n\n        for task in self._select2dic(tablename, what=fields, order='updatetime DESC',\n                                     offset=offset, limit=limit):\n            yield self._parse(task)\n\n    def count(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return 0\n        tablename = self._tablename(project)\n        for count, in self._execute(\"SELECT count(1) FROM %s\" % self.escape(tablename)):\n            return count\n\n    def get(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        tablename = self._tablename(project)\n        where = \"`taskid` = %s\" % self.placeholder\n        for task in self._select2dic(tablename, what=fields,\n                                     where=where, where_values=(taskid, )):\n            return self._parse(task)\n"
  },
  {
    "path": "pyspider/database/sqlite/sqlitebase.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-11-22 20:30:44\n\nimport os\nimport time\nimport sqlite3\nimport threading\n\n\nclass SQLiteMixin(object):\n\n    @property\n    def dbcur(self):\n        pid = (os.getpid(), threading.current_thread().ident)\n        if not (self.conn and pid == self.last_pid):\n            self.last_pid = pid\n            self.conn = sqlite3.connect(self.path, isolation_level=None)\n        return self.conn.cursor()\n\n\nclass SplitTableMixin(object):\n    UPDATE_PROJECTS_TIME = 10 * 60\n\n    def _tablename(self, project):\n        if self.__tablename__:\n            return '%s_%s' % (self.__tablename__, project)\n        else:\n            return project\n\n    @property\n    def projects(self):\n        if time.time() - getattr(self, '_last_update_projects', 0) \\\n                > self.UPDATE_PROJECTS_TIME:\n            self._list_project()\n        return self._projects\n\n    @projects.setter\n    def projects(self, value):\n        self._projects = value\n\n    def _list_project(self):\n        self._last_update_projects = time.time()\n        self.projects = set()\n        if self.__tablename__:\n            prefix = '%s_' % self.__tablename__\n        else:\n            prefix = ''\n        for project, in self._select('sqlite_master', what='name',\n                                     where='type = \"table\"'):\n            if project.startswith(prefix):\n                project = project[len(prefix):]\n                self.projects.add(project)\n\n    def drop(self, project):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return\n        tablename = self._tablename(project)\n        self._execute(\"DROP TABLE %s\" % self.escape(tablename))\n        self._list_project()\n"
  },
  {
    "path": "pyspider/database/sqlite/taskdb.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-08 10:25:34\n\nimport re\nimport time\nimport json\n\nfrom .sqlitebase import SQLiteMixin, SplitTableMixin\nfrom pyspider.database.base.taskdb import TaskDB as BaseTaskDB\nfrom pyspider.database.basedb import BaseDB\n\n\nclass TaskDB(SQLiteMixin, SplitTableMixin, BaseTaskDB, BaseDB):\n    __tablename__ = 'taskdb'\n    placeholder = '?'\n\n    def __init__(self, path):\n        self.path = path\n        self.last_pid = 0\n        self.conn = None\n        self._list_project()\n\n    def _create_project(self, project):\n        assert re.match(r'^\\w+$', project) is not None\n        tablename = self._tablename(project)\n        self._execute('''CREATE TABLE IF NOT EXISTS `%s` (\n                taskid PRIMARY KEY,\n                project,\n                url, status,\n                schedule, fetch, process, track,\n                lastcrawltime, updatetime\n                )''' % tablename)\n        self._execute(\n            '''CREATE INDEX `status_%s_index` ON %s (status)'''\n            % (tablename, self.escape(tablename))\n        )\n\n    def _parse(self, data):\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                if data[each]:\n                    data[each] = json.loads(data[each])\n                else:\n                    data[each] = {}\n        return data\n\n    def _stringify(self, data):\n        for each in ('schedule', 'fetch', 'process', 'track'):\n            if each in data:\n                data[each] = json.dumps(data[each])\n        return data\n\n    def load_tasks(self, status, project=None, fields=None):\n        if project and project not in self.projects:\n            return\n        where = \"status = %d\" % status\n\n        if project:\n            projects = [project, ]\n        
else:\n            projects = self.projects\n\n        for project in projects:\n            tablename = self._tablename(project)\n            for each in self._select2dic(tablename, what=fields, where=where):\n                yield self._parse(each)\n\n    def get_task(self, project, taskid, fields=None):\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return None\n        where = \"`taskid` = %s\" % self.placeholder\n        if project not in self.projects:\n            return None\n        tablename = self._tablename(project)\n        for each in self._select2dic(tablename, what=fields, where=where, where_values=(taskid, )):\n            return self._parse(each)\n        return None\n\n    def status_count(self, project):\n        '''\n        return a dict\n        '''\n        result = dict()\n        if project not in self.projects:\n            self._list_project()\n        if project not in self.projects:\n            return result\n        tablename = self._tablename(project)\n        for status, count in self._execute(\"SELECT `status`, count(1) FROM %s GROUP BY `status`\" %\n                                           self.escape(tablename)):\n            result[status] = count\n        return result\n\n    def insert(self, project, taskid, obj={}):\n        if project not in self.projects:\n            self._create_project(project)\n            self._list_project()\n        obj = dict(obj)\n        obj['taskid'] = taskid\n        obj['project'] = project\n        obj['updatetime'] = time.time()\n        tablename = self._tablename(project)\n        return self._insert(tablename, **self._stringify(obj))\n\n    def update(self, project, taskid, obj={}, **kwargs):\n        if project not in self.projects:\n            raise LookupError\n        tablename = self._tablename(project)\n        obj = dict(obj)\n        obj.update(kwargs)\n        obj['updatetime'] = time.time()\n  
      return self._update(\n            tablename, where=\"`taskid` = %s\" % self.placeholder, where_values=(taskid, ),\n            **self._stringify(obj)\n        )\n"
  },
  {
    "path": "pyspider/fetcher/__init__.py",
    "content": "from .tornado_fetcher import Fetcher\n"
  },
  {
    "path": "pyspider/fetcher/cookie_utils.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-14 09:07:11\n\nfrom requests.cookies import MockRequest\n\n\nclass MockResponse(object):\n\n    def __init__(self, headers):\n        self._headers = headers\n\n    def info(self):\n        return self\n\n    def getheaders(self, name):\n        \"\"\"make cookie python 2 version use this method to get cookie list\"\"\"\n        return self._headers.get_list(name)\n\n    def get_all(self, name, default=None):\n        \"\"\"make cookie python 3 version use this instead of getheaders\"\"\"\n        if default is None:\n            default = []\n        return self._headers.get_list(name) or default\n\n\ndef extract_cookies_to_jar(jar, request, response):\n    req = MockRequest(request)\n    res = MockResponse(response)\n    jar.extract_cookies(res, req)\n"
  },
  {
    "path": "pyspider/fetcher/phantomjs_fetcher.js",
    "content": "// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:\n// Author: Binux<i@binux.me>\n//         http://binux.me\n// Created on 2014-10-29 22:12:14\n\nvar port, server, service,\n  wait_before_end = 1000,\n  system = require('system'),\n  webpage = require('webpage');\n\nif (system.args.length !== 2) {\n  console.log('Usage: simpleserver.js <portnumber>');\n  phantom.exit(1);\n} else {\n  port = system.args[1];\n  server = require('webserver').create();\n  console.debug = function(){};\n\n  service = server.listen(port, {\n    'keepAlive': false\n  }, function (request, response) {\n    phantom.clearCookies();\n\n    //console.debug(JSON.stringify(request, null, 4));\n    // check method\n    if (request.method == 'GET') {\n      body = \"method not allowed!\";\n      response.statusCode = 403;\n      response.headers = {\n        'Cache': 'no-cache',\n        'Content-Length': body.length\n      };\n      response.write(body);\n      response.closeGracefully();\n      return;\n    }\n    \n    var first_response = null,\n        finished = false,\n        page_loaded = false,\n        start_time = Date.now(),\n        end_time = null,\n        script_executed = false,\n        script_result = null;\n\n    var fetch = JSON.parse(request.postRaw);\n    console.debug(JSON.stringify(fetch, null, 2));\n\n    // create and set page\n    var page = webpage.create();\n    if (fetch.proxy) {\n      if (fetch.proxy.indexOf('://') == -1){\n        fetch.proxy = 'http://' + fetch.proxy\n      }\n      page.setProxy(fetch.proxy);\n    }\n    page.onConsoleMessage = function(msg) {\n        console.log('console: ' + msg);\n    };\n    page.viewportSize = {\n      width: fetch.js_viewport_width || 1024,\n      height: fetch.js_viewport_height || 768*3\n    }\n    if (fetch.headers) {\n      fetch.headers['Accept-Encoding'] = undefined;\n      fetch.headers['Connection'] = undefined;\n      fetch.headers['Content-Length'] = undefined;\n    }\n    if (fetch.headers && 
fetch.headers['User-Agent']) {\n      page.settings.userAgent = fetch.headers['User-Agent'];\n    }\n    // this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903\n    page.settings.loadImages = fetch.load_images === undefined ? true : fetch.load_images;\n    page.settings.resourceTimeout = fetch.timeout ? fetch.timeout * 1000 : 20*1000;\n    if (fetch.headers) {\n      page.customHeaders = fetch.headers;\n    }\n\n    // add callbacks\n    page.onInitialized = function() {\n      if (!script_executed && fetch.js_script && fetch.js_run_at === \"document-start\") {\n        script_executed = true;\n        console.log('running document-start script.');\n        script_result = page.evaluateJavaScript(fetch.js_script);\n      }\n    };\n    page.onLoadFinished = function(status) {\n      page_loaded = true;\n      if (!script_executed && fetch.js_script && fetch.js_run_at !== \"document-start\") {\n        script_executed = true;\n        console.log('running document-end script.');\n        script_result = page.evaluateJavaScript(fetch.js_script);\n      }\n      console.debug(\"waiting \"+wait_before_end+\"ms before finished.\");\n      end_time = Date.now() + wait_before_end;\n      setTimeout(make_result, wait_before_end+10, page);\n    };\n    page.onResourceRequested = function(request) {\n      console.debug(\"Starting request: #\"+request.id+\" [\"+request.method+\"]\"+request.url);\n      end_time = null;\n    };\n    page.onResourceReceived = function(response) {\n      console.debug(\"Request finished: #\"+response.id+\" [\"+response.status+\"]\"+response.url);\n      if (first_response === null && response.status != 301 && response.status != 302) {\n        first_response = response;\n      }\n      if (page_loaded) {\n        console.debug(\"waiting \"+wait_before_end+\"ms before finished.\");\n        end_time = Date.now() + wait_before_end;\n        setTimeout(make_result, wait_before_end+10, page);\n      }\n    }\n    
page.onResourceError = page.onResourceTimeout=function(response) {\n      console.info(\"Request error: #\"+response.id+\" [\"+response.errorCode+\"=\"+response.errorString+\"]\"+response.url);\n      if (first_response === null) {\n        first_response = response;\n      }\n      if (page_loaded) {\n        console.debug(\"waiting \"+wait_before_end+\"ms before finished.\");\n        end_time = Date.now() + wait_before_end;\n        setTimeout(make_result, wait_before_end+10, page);\n      }\n    }\n\n    // make sure request will finished\n    setTimeout(make_result, page.settings.resourceTimeout + 100, page);\n\n    // send request\n    page.open(fetch.url, {\n      operation: fetch.method,\n      data: fetch.data,\n    });\n\n    // make response\n    function make_result(page) {\n      if (finished) {\n        return;\n      }\n      if (Date.now() - start_time < page.settings.resourceTimeout) {\n        if (!!!end_time) {\n          return;\n        }\n        if (end_time > Date.now()) {\n          setTimeout(make_result, Math.min(Date.now() - end_time, 100), page);\n          return;\n        }\n      }\n\n      var result = {};\n      try {\n        result = _make_result(page);\n        page.close();\n        finished = true;\n        console.log(\"[\"+result.status_code+\"] \"+result.orig_url+\" \"+result.time)\n      } catch (e) {\n        result = {\n          orig_url: fetch.url,\n          status_code: 599,\n          error: e.toString(),\n          content: page.content || \"\",\n          headers: {},\n          url: page.url || fetch.url,\n          cookies: {},\n          time: (Date.now() - start_time) / 1000,\n          js_script_result: null,\n          save: fetch.save\n        }\n      }\n\n      var body = JSON.stringify(result, null, 2);\n      response.writeHead(200, {\n        'Cache': 'no-cache',\n        'Content-Type': 'application/json',\n      });\n      response.write(body);\n      response.closeGracefully();\n    }\n\n    
function _make_result(page) {\n      if (first_response === null) {\n        throw \"Timeout before first response.\";\n      }\n\n      var cookies = {};\n      page.cookies.forEach(function(e) {\n        cookies[e.name] = e.value;\n      });\n\n      var headers = {};\n      if (first_response.headers) {\n        first_response.headers.forEach(function(e) {\n          headers[e.name] = e.value;\n        });\n      }\n\n      return {\n        orig_url: fetch.url,\n        status_code: first_response.status || 599,\n        error: first_response.errorString,\n        content:  page.content,\n        headers: headers,\n        url: page.url,\n        cookies: cookies,\n        time: (Date.now() - start_time) / 1000,\n        js_script_result: script_result,\n        save: fetch.save\n      }\n    }\n  });\n\n  if (service) {\n    console.log('phantomjs fetcher running on port ' + port);\n  } else {\n    console.log('Error: Could not create web server listening on port ' + port);\n    phantom.exit();\n  }\n}\n"
  },
  {
    "path": "pyspider/fetcher/puppeteer_fetcher.js",
    "content": "const express = require(\"express\");\nconst puppeteer = require('puppeteer');\nconst bodyParser = require('body-parser');\n\nconst app = express();\n\napp.use(bodyParser.json());\napp.use(bodyParser.urlencoded({extended: false}));\n\nlet init_browser = true;\nlet browser_settings = {};\n\napp.use(async (req, res, next) => {\n    if (init_browser) {\n        var options = req.body;\n        if (options.proxy) {\n            if (options.proxy.indexOf(\"://\") == -1) {\n                options.proxy = \"http://\" + options.proxy;\n            }\n            browser_settings[\"args\"] = ['--no-sandbox', \"--disable-setuid-sandbox\", \"--proxy-server=\"+options.proxy];\n        } else {\n          browser_settings[\"args\"] = ['--no-sandbox', \"--disable-setuid-sandbox\"];\n        }\n        browser_settings[\"headless\"] = options.headless === \"false\"? false:true\n        browser = await puppeteer.launch(browser_settings);\n        init_browser=false;\n        console.log(\"init browser success!\");\n        next();\n    } else {\n        next();\n    };\n});\n\n\nasync function fetch(options) {\n    var page = await browser.newPage();\n    options.start_time = Date.now();\n    try {\n        await _fetch(page, options);\n        var result = await make_result(page, options);\n        await page.close();\n        return result\n    } catch (error) {\n        console.log('catch error ', error);\n        var result = await make_result(page, options, error);\n        await page.close();\n        return result\n    }\n}\n\nasync function _fetch(page, options) {\n\n    width = options.js_viewport_width || 1024;\n    height = options.js_viewport_height || 768 * 3;\n    await page.setViewport({\n        \"width\": width,\n        \"height\": height\n    });\n\n    if (options.headers) {\n        await page.setExtraHTTPHeaders(options.headers);\n    }\n\n    if (options.headers && options.headers[\"User-Agent\"]) {\n        
page.setUserAgent(options.headers[\"User-Agent\"]);\n    }\n\n    page.on(\"console\", msg => {\n        console.log('console: ' + msg.args());\n    });\n\n    // Http post method\n    let first_request = true;\n    let request_reseted = false;\n    await page.setRequestInterception(true);\n    if (options.method && options.method.toLowerCase() === \"post\") {\n        page.on(\"request\", interceptedRequest => {\n            request_reseted = false;\n            end_time = null;\n            if (first_request) {\n                first_request = false;\n                var data = {\n                    \"method\": \"POST\",\n                    \"postData\": options.data\n                };\n                console.log(data);\n                interceptedRequest.continue(data);\n                request_reseted = true\n            }\n        })\n    } else {\n        page.on(\"request\", interceptedRequest => {\n            request_reseted = false;\n            end_time = null;\n        })\n    }\n\n    // load images or not\n    if (options.load_images && options.load_images.toLowerCase() === \"false\") {\n        page.on(\"request\", request => {\n            if (!!!request_reseted) {\n                if (request.resourceType() === 'image')\n                    request.abort();\n                else\n                    request.continue();\n            }\n        })\n    } else {\n        page.on(\"request\", request => {\n            if (!!!request_reseted)\n                request.continue()\n        })\n    }\n\n    let error_message = null;\n    page.on(\"error\", e => {\n        error_message = e\n    });\n\n    let page_settings = {};\n    var page_timeout = options.timeout ? 
options.timeout * 1000 : 20 * 1000;\n    page_settings[\"timeout\"] = page_timeout\n    page_settings[\"waitUntil\"] = [\"domcontentloaded\", \"networkidle0\"];\n\n    console.log('goto ', options.url)\n    var response = await page.goto(options.url, page_settings);\n\n    if (error_message) {\n        throw error_message\n    }\n\n    if (options.js_script) {\n        console.log('running document-end script.');\n        script_result = await page.evaluate(options.js_script);\n        console.log(\"end script_result is: \", script_result);\n        options.script_result = script_result\n    }\n\n    if (options.screenshot_path) {\n        await page.screenshot({path: options.screenshot_path});\n    }\n\n    options.response = response\n}\n\nasync function make_result(page, options, error) {\n    response = options.response;\n\n    var cookies = {};\n    var tmp_cookies = await page.cookies();\n    tmp_cookies.forEach(function (e) {\n        cookies[e.name] = e.value;\n    });\n\n    let status_code = null;\n    let headers = null;\n    let page_content = null;\n\n    if (!!!error) {\n        response = options.response;\n        status_code = response.status();\n        headers = response.headers();\n        page_content = await page.content();\n    }\n\n    return {\n        orig_url: options.url,\n        status_code: status_code || 599,\n        error: error,\n        content: page_content,\n        headers: headers,\n        url: page.url(),\n        cookies: cookies,\n        time: (Date.now() - options.start_time) / 1000,\n        js_script_result: options.script_result,\n        save: options.save\n    }\n}\n\napp.get(\"/\", function (request, response) {\n    body = \"method not allowed!\";\n    response.status(403);\n    response.set({\n        \"cache\": \"no-cache\",\n        \"Content-Length\": body.length\n    });\n    response.send(body);\n});\n\n\n\nlet max_open_pages = 5;\nlet opened_page_nums = 0;\n\napp.post(\"/\", async (request, response) => 
{\n    console.log(\"opened pages: \" + opened_page_nums);\n    if (opened_page_nums >= max_open_pages){\n        body = \"browser pages is too many, open new browser process!\";\n        response.status(403);\n        response.set({\n            \"cache\": \"no-cache\",\n            \"Content-Length\": body.length\n        });\n        response.send(body);\n    } else {\n        opened_page_nums += 1;\n        let options = request.body;\n        result = await fetch(options);\n        opened_page_nums -= 1;\n        response.send(result)\n    }\n});\n\n\nlet port = 22222;\n\nif (process.argv.length === 3) {\n    port = parseInt(process.argv[2])\n}\n\napp.listen(port, function () {\n    console.log(\"puppeteer fetcher running on port \" + port);\n});\n"
  },
  {
    "path": "pyspider/fetcher/splash_fetcher.lua",
    "content": "--#! /usr/bin/env lua\n--\n-- splash_fetcher.lua\n-- Copyright (C) 2016 Binux <roy@binux.me>\n--\n-- Distributed under terms of the Apache license, version 2.0.\n--\n\njson = require(\"json\")\n\nfunction render(splash, fetch)\n    local debug = true\n    local function log_message(message, level)\n        if debug or level ~= nil then\n            print(message)\n        end\n    end\n    if not splash.with_timeout then\n        function with_timeout(self, func, timeout)\n            return true, func()\n        end\n        splash.with_timeout = with_timeout\n    end\n\n    log_message(json.encode(fetch))\n\n    -- create and set page\n    local start_time = os.time()\n\n    splash:clear_cookies()\n    splash:autoload_reset()\n    splash:on_request_reset()\n    splash:on_response_reset()\n\n    splash:set_viewport_size(fetch.js_viewport_width or 1024, fetch.js_viewport_height or 768 * 3)\n    if fetch.headers and fetch.headers[\"User-Agent\"] ~= nil then\n        splash:set_user_agent(fetch.headers[\"User-Agent\"])\n    end\n    if fetch.headers then\n        fetch.headers['Accept-Encoding'] = nil\n        fetch.headers['Connection'] = nil\n        fetch.headers['Content-Length'] = nil\n        splash:set_custom_headers(fetch.headers)\n    end\n    splash.images_enabled = (fetch.load_images == true)\n    splash.resource_timeout = math.min((fetch.timeout or 20), 58)\n    fetch.timeout = splash.resource_timeout\n\n    local wait_before_end = 1.0;\n    local end_time = start_time + fetch.timeout - 0.1\n    \n\n    -- callbacks\n    splash:on_request(function(request)\n        -- wait for new request\n        end_time = start_time + fetch.timeout - 0.1\n        log_message(\"Starting request: [\" .. tostring(request.method) .. \"]\" .. 
tostring(request.url))\n\n        if fetch.proxy_host and fetch.proxy_port then\n            request:set_proxy({\n                host = fetch.proxy_host,\n                port = tonumber(fetch.proxy_port),\n                username = fetch.proxy_username,\n                password = fetch.proxy_password,\n                type = 'HTTP'\n            })\n        end\n    end)\n\n    local first_response = nil\n    splash:on_response(function(response)\n        if first_response == nil then\n            first_response = response\n        end\n        -- wait for some other respond and render\n        end_time = math.min(os.time() + wait_before_end + 0.1, start_time + fetch.timeout - 0.1)\n        log_message(\"Request finished: [\" .. tostring(response.status) .. \"]\" .. tostring(response.url))\n    end)\n\n    -- send request\n    local js_script_result = nil\n    local timeout_ok, ok, reason = splash:with_timeout(function()\n        local js_script = nil\n        if fetch.js_script then\n            ok, js_script = pcall(function()\n                return splash:jsfunc(fetch.js_script)\n            end)\n            if not ok then\n                log_message(\"js_script error: \" .. tostring(js_script), 1)\n                js_script = nil\n            end\n        end\n\n        if js_script and fetch.js_run_at == \"document-start\" then\n            log_message(\"running document-start script.\");\n            ok, js_script_result = pcall(js_script)\n            if not ok then\n                log_message(\"running document-start script error: \" .. 
tostring(js_script_result), 1)\n            end\n        end\n\n        local ok, reason = splash:go{url=fetch.url, http_method=fetch.method, body=fetch.data}\n        end_time = math.min(os.time() + wait_before_end + 0.1, start_time + fetch.timeout - 0.1)\n\n        if js_script and fetch.js_run_at ~= \"document-start\" then\n            splash:wait(0.5)\n            log_message(\"running document-end script.\");\n            ok, js_script_result = pcall(js_script)\n            if not ok then\n                log_message(\"running document-end script error: \" .. tostring(js_script_result), 1)\n            end\n        end\n\n        -- wait for all requests finished\n        local now = os.time()\n        while now <= end_time do\n            splash:wait(math.min(end_time - now, 0.1))\n            now = os.time()\n        end\n\n        return ok, reason\n    end, fetch.timeout + 0.1)\n\n    -- make response\n    local cookies = {}\n    for i, c in ipairs(splash:get_cookies()) do\n        cookies[c.name] = c.value\n    end\n    if (not timeout_ok and first_response.ok) or (timeok and ok) then\n        return {\n            orig_url = fetch.url,\n            status_code = first_response.status == 0 and 599 or first_response.status,\n            error = nil,\n            content = splash:html(),\n            headers = first_response.headers,\n            url = splash:url(),\n            cookies = cookies,\n            time = os.time() - start_time,\n            js_script_result = js_script_result and tostring(js_script_result),\n            save = fetch.save\n        }\n    else\n        if first_response then\n            return {\n                orig_url = fetch.url,\n                status_code = first_response.status == 0 and 599 or first_response.status,\n                error = reason,\n                content = splash:html(),\n                headers = first_response.headers,\n                url = splash:url(),\n                cookies = cookies,\n         
       time = os.time() - start_time,\n                js_script_result = js_script_result and tostring(js_script_result),\n                save = fetch.save\n            }\n        else\n            return {\n                orig_url = fetch.url,\n                status_code = 599,\n                error = reason,\n                content = splash:html(),\n                headers = {},\n                url = splash:url(),\n                cookies = cookies,\n                time = os.time() - start_time,\n                js_script_result = js_script_result and tostring(js_script_result),\n                save = fetch.save\n            }\n        end\n    end\n\nend\n\nfunction main(splash)\n    local fetch = splash.args\n    local start_time = os.time()\n\n    ok, result = pcall(function()\n        return render(splash, fetch)\n    end)\n\n    if ok then\n        return result\n    else\n        return {\n            orig_url = fetch.url,\n            status_code = 599,\n            error = result,\n            content = splash:html(),\n            headers = {},\n            url = splash:url(),\n            cookies = {},\n            time = os.time() - start_time,\n            js_script_result = nil,\n            save = fetch.save\n        }\n    end\nend\n"
  },
  {
    "path": "pyspider/fetcher/tornado_fetcher.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-12-17 11:07:19\n\nfrom __future__ import unicode_literals\n\nimport os\nimport sys\nimport six\nimport copy\nimport time\nimport json\nimport logging\nimport traceback\nimport functools\nimport threading\nimport tornado.ioloop\nimport tornado.httputil\nimport tornado.httpclient\nimport pyspider\n\nfrom six.moves import queue, http_cookies\nfrom six.moves.urllib.robotparser import RobotFileParser\nfrom requests import cookies\nfrom six.moves.urllib.parse import urljoin, urlsplit\nfrom tornado import gen\nfrom tornado.curl_httpclient import CurlAsyncHTTPClient\nfrom tornado.simple_httpclient import SimpleAsyncHTTPClient\n\nfrom pyspider.libs import utils, dataurl, counter\nfrom pyspider.libs.url import quote_chinese\nfrom .cookie_utils import extract_cookies_to_jar\nlogger = logging.getLogger('fetcher')\n\n\nclass MyCurlAsyncHTTPClient(CurlAsyncHTTPClient):\n\n    def free_size(self):\n        return len(self._free_list)\n\n    def size(self):\n        return len(self._curls) - self.free_size()\n\n\nclass MySimpleAsyncHTTPClient(SimpleAsyncHTTPClient):\n\n    def free_size(self):\n        return self.max_clients - self.size()\n\n    def size(self):\n        return len(self.active)\n\nfetcher_output = {\n    \"status_code\": int,\n    \"orig_url\": str,\n    \"url\": str,\n    \"headers\": dict,\n    \"content\": str,\n    \"cookies\": dict,\n}\n\n\nclass Fetcher(object):\n    user_agent = \"pyspider/%s (+http://pyspider.org/)\" % pyspider.__version__\n    default_options = {\n        'method': 'GET',\n        'headers': {\n        },\n        'use_gzip': True,\n        'timeout': 120,\n        'connect_timeout': 20,\n    }\n    phantomjs_proxy = None\n    splash_endpoint = None\n    splash_lua_source = open(os.path.join(os.path.dirname(__file__), \"splash_fetcher.lua\")).read()\n  
  robot_txt_age = 60*60  # 1h\n\n    def __init__(self, inqueue, outqueue, poolsize=100, proxy=None, async_mode=True):\n        self.inqueue = inqueue\n        self.outqueue = outqueue\n\n        self.poolsize = poolsize\n        self._running = False\n        self._quit = False\n        self.proxy = proxy\n        self.async_mode = async_mode\n        self.ioloop = tornado.ioloop.IOLoop()\n\n        self.robots_txt_cache = {}\n\n        # binding io_loop to http_client here\n        if self.async_mode:\n            self.http_client = MyCurlAsyncHTTPClient(max_clients=self.poolsize,\n                                                     io_loop=self.ioloop)\n        else:\n            self.http_client = tornado.httpclient.HTTPClient(MyCurlAsyncHTTPClient, max_clients=self.poolsize)\n\n        self._cnt = {\n            '5m': counter.CounterManager(\n                lambda: counter.TimebaseAverageWindowCounter(30, 10)),\n            '1h': counter.CounterManager(\n                lambda: counter.TimebaseAverageWindowCounter(60, 60)),\n        }\n\n    def send_result(self, type, task, result):\n        '''Send fetch result to processor'''\n        if self.outqueue:\n            try:\n                self.outqueue.put((task, result))\n            except Exception as e:\n                logger.exception(e)\n\n    def fetch(self, task, callback=None):\n        if self.async_mode:\n            return self.async_fetch(task, callback)\n        else:\n            return self.async_fetch(task, callback).result()\n\n    @gen.coroutine\n    def async_fetch(self, task, callback=None):\n        '''Do one fetch'''\n        url = task.get('url', 'data:,')\n        if callback is None:\n            callback = self.send_result\n\n        type = 'None'\n        start_time = time.time()\n        try:\n            if url.startswith('data:'):\n                type = 'data'\n                result = yield gen.maybe_future(self.data_fetch(url, task))\n            elif task.get('fetch', 
{}).get('fetch_type') in ('js', 'phantomjs'):\n                type = 'phantomjs'\n                result = yield self.phantomjs_fetch(url, task)\n            elif task.get('fetch', {}).get('fetch_type') in ('splash', ):\n                type = 'splash'\n                result = yield self.splash_fetch(url, task)\n            elif task.get('fetch', {}).get('fetch_type') in ('puppeteer', ):\n                type = 'puppeteer'\n                result = yield self.puppeteer_fetch(url, task)\n            else:\n                type = 'http'\n                result = yield self.http_fetch(url, task)\n        except Exception as e:\n            logger.exception(e)\n            result = self.handle_error(type, url, task, start_time, e)\n\n        callback(type, task, result)\n        self.on_result(type, task, result)\n        raise gen.Return(result)\n\n    def sync_fetch(self, task):\n        '''Synchronization fetch, usually used in xmlrpc thread'''\n        if not self._running:\n            return self.ioloop.run_sync(functools.partial(self.async_fetch, task, lambda t, _, r: True))\n\n        wait_result = threading.Condition()\n        _result = {}\n\n        def callback(type, task, result):\n            wait_result.acquire()\n            _result['type'] = type\n            _result['task'] = task\n            _result['result'] = result\n            wait_result.notify()\n            wait_result.release()\n\n        wait_result.acquire()\n        self.ioloop.add_callback(self.fetch, task, callback)\n        while 'result' not in _result:\n            wait_result.wait()\n        wait_result.release()\n        return _result['result']\n\n    def data_fetch(self, url, task):\n        '''A fake fetcher for dataurl'''\n        self.on_fetch('data', task)\n        result = {}\n        result['orig_url'] = url\n        result['content'] = dataurl.decode(url)\n        result['headers'] = {}\n        result['status_code'] = 200\n        result['url'] = url\n        
result['cookies'] = {}\n        result['time'] = 0\n        result['save'] = task.get('fetch', {}).get('save')\n        if len(result['content']) < 70:\n            logger.info(\"[200] %s:%s %s 0s\", task.get('project'), task.get('taskid'), url)\n        else:\n            logger.info(\n                \"[200] %s:%s data:,%s...[content:%d] 0s\",\n                task.get('project'), task.get('taskid'),\n                result['content'][:70],\n                len(result['content'])\n            )\n\n        return result\n\n    def handle_error(self, type, url, task, start_time, error):\n        result = {\n            'status_code': getattr(error, 'code', 599),\n            'error': utils.text(error),\n            'traceback': traceback.format_exc() if sys.exc_info()[0] else None,\n            'content': \"\",\n            'time': time.time() - start_time,\n            'orig_url': url,\n            'url': url,\n            \"save\": task.get('fetch', {}).get('save')\n        }\n        logger.error(\"[%d] %s:%s %s, %r %.2fs\",\n                     result['status_code'], task.get('project'), task.get('taskid'),\n                     url, error, result['time'])\n        return result\n\n    allowed_options = ['method', 'data', 'connect_timeout', 'timeout', 'cookies', 'use_gzip', 'validate_cert']\n\n    def pack_tornado_request_parameters(self, url, task):\n        fetch = copy.deepcopy(self.default_options)\n        fetch['url'] = url\n        fetch['headers'] = tornado.httputil.HTTPHeaders(fetch['headers'])\n        fetch['headers']['User-Agent'] = self.user_agent\n        task_fetch = task.get('fetch', {})\n        for each in self.allowed_options:\n            if each in task_fetch:\n                fetch[each] = task_fetch[each]\n        fetch['headers'].update(task_fetch.get('headers', {}))\n\n        if task.get('track'):\n            track_headers = tornado.httputil.HTTPHeaders(\n                task.get('track', {}).get('fetch', {}).get('headers') or {})\n  
          track_ok = task.get('track', {}).get('process', {}).get('ok', False)\n        else:\n            track_headers = {}\n            track_ok = False\n        # proxy\n        proxy_string = None\n        if isinstance(task_fetch.get('proxy'), six.string_types):\n            proxy_string = task_fetch['proxy']\n        elif self.proxy and task_fetch.get('proxy', True):\n            proxy_string = self.proxy\n        if proxy_string:\n            if '://' not in proxy_string:\n                proxy_string = 'http://' + proxy_string\n            proxy_splited = urlsplit(proxy_string)\n            fetch['proxy_host'] = proxy_splited.hostname\n            if proxy_splited.username:\n                fetch['proxy_username'] = proxy_splited.username\n            if proxy_splited.password:\n                fetch['proxy_password'] = proxy_splited.password\n            if six.PY2:\n                for key in ('proxy_host', 'proxy_username', 'proxy_password'):\n                    if key in fetch:\n                        fetch[key] = fetch[key].encode('utf8')\n            fetch['proxy_port'] = proxy_splited.port or 8080\n\n        # etag\n        if task_fetch.get('etag', True):\n            _t = None\n            if isinstance(task_fetch.get('etag'), six.string_types):\n                _t = task_fetch.get('etag')\n            elif track_ok:\n                _t = track_headers.get('etag')\n            if _t and 'If-None-Match' not in fetch['headers']:\n                fetch['headers']['If-None-Match'] = _t\n        # last modifed\n        if task_fetch.get('last_modified', task_fetch.get('last_modifed', True)):\n            last_modified = task_fetch.get('last_modified', task_fetch.get('last_modifed', True))\n            _t = None\n            if isinstance(last_modified, six.string_types):\n                _t = last_modified\n            elif track_ok:\n                _t = track_headers.get('last-modified')\n            if _t and 'If-Modified-Since' not in 
fetch['headers']:\n                fetch['headers']['If-Modified-Since'] = _t\n        # timeout\n        if 'timeout' in fetch:\n            fetch['request_timeout'] = fetch['timeout']\n            del fetch['timeout']\n        # data rename to body\n        if 'data' in fetch:\n            fetch['body'] = fetch['data']\n            del fetch['data']\n\n        return fetch\n\n    @gen.coroutine\n    def can_fetch(self, user_agent, url):\n        parsed = urlsplit(url)\n        domain = parsed.netloc\n        if domain in self.robots_txt_cache:\n            robot_txt = self.robots_txt_cache[domain]\n            if time.time() - robot_txt.mtime() > self.robot_txt_age:\n                robot_txt = None\n        else:\n            robot_txt = None\n\n        if robot_txt is None:\n            robot_txt = RobotFileParser()\n            try:\n                response = yield gen.maybe_future(self.http_client.fetch(\n                    urljoin(url, '/robots.txt'), connect_timeout=10, request_timeout=30))\n                content = response.body\n            except tornado.httpclient.HTTPError as e:\n                logger.error('load robots.txt from %s error: %r', domain, e)\n                content = ''\n\n            try:\n                content = content.decode('utf8', 'ignore')\n            except UnicodeDecodeError:\n                content = ''\n\n            robot_txt.parse(content.splitlines())\n            self.robots_txt_cache[domain] = robot_txt\n\n        raise gen.Return(robot_txt.can_fetch(user_agent, url))\n\n    def clear_robot_txt_cache(self):\n        now = time.time()\n        for domain, robot_txt in self.robots_txt_cache.items():\n            if now - robot_txt.mtime() > self.robot_txt_age:\n                del self.robots_txt_cache[domain]\n\n    @gen.coroutine\n    def http_fetch(self, url, task):\n        '''HTTP fetcher'''\n        start_time = time.time()\n        self.on_fetch('http', task)\n        handle_error = lambda x: 
self.handle_error('http', url, task, start_time, x)\n\n        # setup request parameters\n        fetch = self.pack_tornado_request_parameters(url, task)\n        task_fetch = task.get('fetch', {})\n\n        session = cookies.RequestsCookieJar()\n        # fix for tornado request obj\n        if 'Cookie' in fetch['headers']:\n            c = http_cookies.SimpleCookie()\n            try:\n                c.load(fetch['headers']['Cookie'])\n            except AttributeError:\n                c.load(utils.utf8(fetch['headers']['Cookie']))\n            for key in c:\n                session.set(key, c[key])\n            del fetch['headers']['Cookie']\n        if 'cookies' in fetch:\n            session.update(fetch['cookies'])\n            del fetch['cookies']\n\n        max_redirects = task_fetch.get('max_redirects', 5)\n        # we will handle redirects by hand to capture cookies\n        fetch['follow_redirects'] = False\n\n        # making requests\n        while True:\n            # robots.txt\n            if task_fetch.get('robots_txt', False):\n                can_fetch = yield self.can_fetch(fetch['headers']['User-Agent'], fetch['url'])\n                if not can_fetch:\n                    error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')\n                    raise gen.Return(handle_error(error))\n\n            try:\n                request = tornado.httpclient.HTTPRequest(**fetch)\n                # if cookie already in header, get_cookie_header wouldn't work\n                old_cookie_header = request.headers.get('Cookie')\n                if old_cookie_header:\n                    del request.headers['Cookie']\n                cookie_header = cookies.get_cookie_header(session, request)\n                if cookie_header:\n                    request.headers['Cookie'] = cookie_header\n                elif old_cookie_header:\n                    request.headers['Cookie'] = old_cookie_header\n            except Exception as e:\n         
       logger.exception(fetch)\n                raise gen.Return(handle_error(e))\n\n            try:\n                response = yield gen.maybe_future(self.http_client.fetch(request))\n            except tornado.httpclient.HTTPError as e:\n                if e.response:\n                    response = e.response\n                else:\n                    raise gen.Return(handle_error(e))\n\n            extract_cookies_to_jar(session, response.request, response.headers)\n            if (response.code in (301, 302, 303, 307)\n                    and response.headers.get('Location')\n                    and task_fetch.get('allow_redirects', True)):\n                if max_redirects <= 0:\n                    error = tornado.httpclient.HTTPError(\n                        599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5),\n                        response)\n                    raise gen.Return(handle_error(error))\n                if response.code in (302, 303):\n                    fetch['method'] = 'GET'\n                    if 'body' in fetch:\n                        del fetch['body']\n                fetch['url'] = quote_chinese(urljoin(fetch['url'], response.headers['Location']))\n                fetch['request_timeout'] -= time.time() - start_time\n                if fetch['request_timeout'] < 0:\n                    fetch['request_timeout'] = 0.1\n                max_redirects -= 1\n                continue\n\n            result = {}\n            result['orig_url'] = url\n            result['content'] = response.body or ''\n            result['headers'] = dict(response.headers)\n            result['status_code'] = response.code\n            result['url'] = response.effective_url or url\n            result['time'] = time.time() - start_time\n            result['cookies'] = session.get_dict()\n            result['save'] = task_fetch.get('save')\n            if response.error:\n                result['error'] = 
utils.text(response.error)\n            if 200 <= response.code < 300:\n                logger.info(\"[%d] %s:%s %s %.2fs\", response.code,\n                            task.get('project'), task.get('taskid'),\n                            url, result['time'])\n            else:\n                logger.warning(\"[%d] %s:%s %s %.2fs\", response.code,\n                               task.get('project'), task.get('taskid'),\n                               url, result['time'])\n\n            raise gen.Return(result)\n\n    @gen.coroutine\n    def phantomjs_fetch(self, url, task):\n        '''Fetch with phantomjs proxy'''\n        start_time = time.time()\n        self.on_fetch('phantomjs', task)\n        handle_error = lambda x: self.handle_error('phantomjs', url, task, start_time, x)\n\n        # check phantomjs proxy is enabled\n        if not self.phantomjs_proxy:\n            result = {\n                \"orig_url\": url,\n                \"content\": \"phantomjs is not enabled.\",\n                \"headers\": {},\n                \"status_code\": 501,\n                \"url\": url,\n                \"time\": time.time() - start_time,\n                \"cookies\": {},\n                \"save\": task.get('fetch', {}).get('save')\n            }\n            logger.warning(\"[501] %s:%s %s 0s\", task.get('project'), task.get('taskid'), url)\n            raise gen.Return(result)\n\n        # setup request parameters\n        fetch = self.pack_tornado_request_parameters(url, task)\n        task_fetch = task.get('fetch', {})\n        for each in task_fetch:\n            if each not in fetch:\n                fetch[each] = task_fetch[each]\n\n        # robots.txt\n        if task_fetch.get('robots_txt', False):\n            user_agent = fetch['headers']['User-Agent']\n            can_fetch = yield self.can_fetch(user_agent, url)\n            if not can_fetch:\n                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')\n                raise 
gen.Return(handle_error(error))\n\n        request_conf = {\n            'follow_redirects': False\n        }\n        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)\n        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1\n\n        session = cookies.RequestsCookieJar()\n        if 'Cookie' in fetch['headers']:\n            c = http_cookies.SimpleCookie()\n            try:\n                c.load(fetch['headers']['Cookie'])\n            except AttributeError:\n                c.load(utils.utf8(fetch['headers']['Cookie']))\n            for key in c:\n                session.set(key, c[key])\n            del fetch['headers']['Cookie']\n        if 'cookies' in fetch:\n            session.update(fetch['cookies'])\n            del fetch['cookies']\n\n        request = tornado.httpclient.HTTPRequest(url=fetch['url'])\n        cookie_header = cookies.get_cookie_header(session, request)\n        if cookie_header:\n            fetch['headers']['Cookie'] = cookie_header\n\n        # making requests\n        fetch['headers'] = dict(fetch['headers'])\n        try:\n            request = tornado.httpclient.HTTPRequest(\n                url=self.phantomjs_proxy, method=\"POST\",\n                body=json.dumps(fetch), **request_conf)\n        except Exception as e:\n            raise gen.Return(handle_error(e))\n\n        try:\n            response = yield gen.maybe_future(self.http_client.fetch(request))\n        except tornado.httpclient.HTTPError as e:\n            if e.response:\n                response = e.response\n            else:\n                raise gen.Return(handle_error(e))\n\n        if not response.body:\n            raise gen.Return(handle_error(Exception('no response from phantomjs: %r' % response)))\n\n        result = {}\n        try:\n            result = json.loads(utils.text(response.body))\n            assert 'status_code' in result, result\n        except Exception as e:\n            if response.error:\n    
            result['error'] = utils.text(response.error)\n            raise gen.Return(handle_error(e))\n\n        if result.get('status_code', 200):\n            logger.info(\"[%d] %s:%s %s %.2fs\", result['status_code'],\n                        task.get('project'), task.get('taskid'), url, result['time'])\n        else:\n            logger.error(\"[%d] %s:%s %s, %r %.2fs\", result['status_code'],\n                         task.get('project'), task.get('taskid'),\n                         url, result['content'], result['time'])\n\n        raise gen.Return(result)\n\n    @gen.coroutine\n    def splash_fetch(self, url, task):\n        '''Fetch with splash'''\n        start_time = time.time()\n        self.on_fetch('splash', task)\n        handle_error = lambda x: self.handle_error('splash', url, task, start_time, x)\n\n        # check phantomjs proxy is enabled\n        if not self.splash_endpoint:\n            result = {\n                \"orig_url\": url,\n                \"content\": \"splash is not enabled.\",\n                \"headers\": {},\n                \"status_code\": 501,\n                \"url\": url,\n                \"time\": time.time() - start_time,\n                \"cookies\": {},\n                \"save\": task.get('fetch', {}).get('save')\n            }\n            logger.warning(\"[501] %s:%s %s 0s\", task.get('project'), task.get('taskid'), url)\n            raise gen.Return(result)\n\n        # setup request parameters\n        fetch = self.pack_tornado_request_parameters(url, task)\n        task_fetch = task.get('fetch', {})\n        for each in task_fetch:\n            if each not in fetch:\n                fetch[each] = task_fetch[each]\n\n        # robots.txt\n        if task_fetch.get('robots_txt', False):\n            user_agent = fetch['headers']['User-Agent']\n            can_fetch = yield self.can_fetch(user_agent, url)\n            if not can_fetch:\n                error = tornado.httpclient.HTTPError(403, 'Disallowed by 
robots.txt')\n                raise gen.Return(handle_error(error))\n\n        request_conf = {\n            'follow_redirects': False,\n            'headers': {\n                'Content-Type': 'application/json',\n            }\n        }\n        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)\n        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1\n\n        session = cookies.RequestsCookieJar()\n        if 'Cookie' in fetch['headers']:\n            c = http_cookies.SimpleCookie()\n            try:\n                c.load(fetch['headers']['Cookie'])\n            except AttributeError:\n                c.load(utils.utf8(fetch['headers']['Cookie']))\n            for key in c:\n                session.set(key, c[key])\n            del fetch['headers']['Cookie']\n        if 'cookies' in fetch:\n            session.update(fetch['cookies'])\n            del fetch['cookies']\n\n        request = tornado.httpclient.HTTPRequest(url=fetch['url'])\n        cookie_header = cookies.get_cookie_header(session, request)\n        if cookie_header:\n            fetch['headers']['Cookie'] = cookie_header\n\n        # making requests\n        fetch['lua_source'] = self.splash_lua_source\n        fetch['headers'] = dict(fetch['headers'])\n        try:\n            request = tornado.httpclient.HTTPRequest(\n                url=self.splash_endpoint, method=\"POST\",\n                body=json.dumps(fetch), **request_conf)\n        except Exception as e:\n            raise gen.Return(handle_error(e))\n\n        try:\n            response = yield gen.maybe_future(self.http_client.fetch(request))\n        except tornado.httpclient.HTTPError as e:\n            if e.response:\n                response = e.response\n            else:\n                raise gen.Return(handle_error(e))\n\n        if not response.body:\n            raise gen.Return(handle_error(Exception('no response from phantomjs')))\n\n        result = {}\n        try:\n          
  result = json.loads(utils.text(response.body))\n            assert 'status_code' in result, result\n        except ValueError as e:\n            logger.error(\"result is not json: %r\", response.body[:500])\n            raise gen.Return(handle_error(e))\n        except Exception as e:\n            if response.error:\n                result['error'] = utils.text(response.error)\n            raise gen.Return(handle_error(e))\n\n        if result.get('status_code', 200):\n            logger.info(\"[%d] %s:%s %s %.2fs\", result['status_code'],\n                        task.get('project'), task.get('taskid'), url, result['time'])\n        else:\n            logger.error(\"[%d] %s:%s %s, %r %.2fs\", result['status_code'],\n                         task.get('project'), task.get('taskid'),\n                         url, result['content'], result['time'])\n\n        raise gen.Return(result)\n\n    @gen.coroutine\n    def puppeteer_fetch(self, url, task):\n        '''Fetch with puppeteer proxy'''\n        start_time = time.time()\n        self.on_fetch('puppeteer', task)\n        handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)\n\n        # check puppeteer proxy is enabled\n        if not self.puppeteer_proxy:\n            result = {\n                \"orig_url\": url,\n                \"content\": \"puppeteer is not enabled.\",\n                \"headers\": {},\n                \"status_code\": 501,\n                \"url\": url,\n                \"time\": time.time() - start_time,\n                \"cookies\": {},\n                \"save\": task.get('fetch', {}).get('save')\n            }\n            logger.warning(\"[501] %s:%s %s 0s\", task.get('project'), task.get('taskid'), url)\n            raise gen.Return(result)\n\n        # setup request parameters\n        fetch = self.pack_tornado_request_parameters(url, task)\n        task_fetch = task.get('fetch', {})\n        for each in task_fetch:\n            if each not in fetch:\n      
          fetch[each] = task_fetch[each]\n\n        # robots.txt\n        if task_fetch.get('robots_txt', False):\n            user_agent = fetch['headers']['User-Agent']\n            can_fetch = yield self.can_fetch(user_agent, url)\n            if not can_fetch:\n                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')\n                raise gen.Return(handle_error(error))\n\n        request_conf = {\n            'follow_redirects': False\n        }\n        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)\n        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1\n\n        session = cookies.RequestsCookieJar()\n        if 'Cookie' in fetch['headers']:\n            c = http_cookies.SimpleCookie()\n            try:\n                c.load(fetch['headers']['Cookie'])\n            except AttributeError:\n                c.load(utils.utf8(fetch['headers']['Cookie']))\n            for key in c:\n                session.set(key, c[key])\n            del fetch['headers']['Cookie']\n        if 'cookies' in fetch:\n            session.update(fetch['cookies'])\n            del fetch['cookies']\n\n        request = tornado.httpclient.HTTPRequest(url=fetch['url'])\n        cookie_header = cookies.get_cookie_header(session, request)\n        if cookie_header:\n            fetch['headers']['Cookie'] = cookie_header\n\n        logger.info(\"%s\", self.puppeteer_proxy)\n        # making requests\n        fetch['headers'] = dict(fetch['headers'])\n        headers = {}\n        headers['Content-Type'] = 'application/json; charset=UTF-8'\n        try:\n            request = tornado.httpclient.HTTPRequest(\n                url=self.puppeteer_proxy, method=\"POST\", headers=headers,\n                body=json.dumps(fetch), **request_conf)\n        except Exception as e:\n            raise gen.Return(handle_error(e))\n\n        try:\n            response = yield gen.maybe_future(self.http_client.fetch(request))\n    
    except tornado.httpclient.HTTPError as e:\n            if e.response:\n                response = e.response\n            else:\n                raise gen.Return(handle_error(e))\n\n        if not response.body:\n            raise gen.Return(handle_error(Exception('no response from puppeteer: %r' % response)))\n\n        result = {}\n        try:\n            result = json.loads(utils.text(response.body))\n            assert 'status_code' in result, result\n        except Exception as e:\n            if response.error:\n                result['error'] = utils.text(response.error)\n            raise gen.Return(handle_error(e))\n\n        if result.get('status_code', 200):\n            logger.info(\"[%d] %s:%s %s %.2fs\", result['status_code'],\n                        task.get('project'), task.get('taskid'), url, result['time'])\n        else:\n            logger.error(\"[%d] %s:%s %s, %r %.2fs\", result['status_code'],\n                         task.get('project'), task.get('taskid'),\n                         url, result['content'], result['time'])\n\n        raise gen.Return(result)\n\n    def run(self):\n        '''Run loop'''\n        logger.info(\"fetcher starting...\")\n\n        def queue_loop():\n            if not self.outqueue or not self.inqueue:\n                return\n            while not self._quit:\n                try:\n                    if self.outqueue.full():\n                        break\n                    if self.http_client.free_size() <= 0:\n                        break\n                    task = self.inqueue.get_nowait()\n                    # FIXME: decode unicode_obj should used after data selete from\n                    # database, it's used here for performance\n                    task = utils.decode_unicode_obj(task)\n                    self.fetch(task)\n                except queue.Empty:\n                    break\n                except KeyboardInterrupt:\n                    break\n                except Exception as 
e:\n                    logger.exception(e)\n                    break\n\n        tornado.ioloop.PeriodicCallback(queue_loop, 100, io_loop=self.ioloop).start()\n        tornado.ioloop.PeriodicCallback(self.clear_robot_txt_cache, 10000, io_loop=self.ioloop).start()\n        self._running = True\n\n        try:\n            self.ioloop.start()\n        except KeyboardInterrupt:\n            pass\n\n        logger.info(\"fetcher exiting...\")\n\n    def quit(self):\n        '''Quit fetcher'''\n        self._running = False\n        self._quit = True\n        self.ioloop.add_callback(self.ioloop.stop)\n        if hasattr(self, 'xmlrpc_server'):\n            self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop)\n            self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop)\n\n    def size(self):\n        return self.http_client.size()\n\n    def xmlrpc_run(self, port=24444, bind='127.0.0.1', logRequests=False):\n        '''Run xmlrpc server'''\n        import umsgpack\n        from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication\n        try:\n            from xmlrpc.client import Binary\n        except ImportError:\n            from xmlrpclib import Binary\n\n        application = WSGIXMLRPCApplication()\n\n        application.register_function(self.quit, '_quit')\n        application.register_function(self.size)\n\n        def sync_fetch(task):\n            result = self.sync_fetch(task)\n            result = Binary(umsgpack.packb(result))\n            return result\n        application.register_function(sync_fetch, 'fetch')\n\n        def dump_counter(_time, _type):\n            return self._cnt[_time].to_dict(_type)\n        application.register_function(dump_counter, 'counter')\n\n        import tornado.wsgi\n        import tornado.ioloop\n        import tornado.httpserver\n\n        container = tornado.wsgi.WSGIContainer(application)\n        self.xmlrpc_ioloop = tornado.ioloop.IOLoop()\n        self.xmlrpc_server = 
tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop)\n        self.xmlrpc_server.listen(port=port, address=bind)\n        logger.info('fetcher.xmlrpc listening on %s:%s', bind, port)\n        self.xmlrpc_ioloop.start()\n\n    def on_fetch(self, type, task):\n        '''Called before task fetch'''\n        logger.info('on fetch %s:%s', type, task)\n\n    def on_result(self, type, task, result):\n        '''Called after task fetched'''\n        status_code = result.get('status_code', 599)\n        if status_code != 599:\n            status_code = (int(status_code) / 100 * 100)\n        self._cnt['5m'].event((task.get('project'), status_code), +1)\n        self._cnt['1h'].event((task.get('project'), status_code), +1)\n\n        if type in ('http', 'phantomjs') and result.get('time'):\n            content_len = len(result.get('content', ''))\n            self._cnt['5m'].event((task.get('project'), 'speed'),\n                                  float(content_len) / result.get('time'))\n            self._cnt['1h'].event((task.get('project'), 'speed'),\n                                  float(content_len) / result.get('time'))\n            self._cnt['5m'].event((task.get('project'), 'time'), result.get('time'))\n            self._cnt['1h'].event((task.get('project'), 'time'), result.get('time'))\n"
  },
  {
    "path": "pyspider/libs/ListIO.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-26 23:41:51\n\n\nclass ListO(object):\n\n    \"\"\"A write-only, StringIO-like file object that appends written data to a list.\"\"\"\n\n    def __init__(self, buffer=None):\n        self._buffer = buffer\n        if self._buffer is None:\n            self._buffer = []\n\n    def isatty(self):\n        return False\n\n    def close(self):\n        pass\n\n    def flush(self):\n        pass\n\n    def seek(self, n, mode=0):\n        pass\n\n    def readline(self):\n        pass\n\n    def reset(self):\n        pass\n\n    def write(self, x):\n        self._buffer.append(x)\n\n    def writelines(self, x):\n        self._buffer.extend(x)\n"
  },
  {
    "path": "pyspider/libs/__init__.py",
    "content": ""
  },
  {
    "path": "pyspider/libs/base_handler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-16 23:12:48\n\nimport sys\nimport inspect\nimport functools\nimport fractions\n\nimport six\nfrom six import add_metaclass, iteritems\n\nfrom pyspider.libs.url import (\n    quote_chinese, _build_url, _encode_params,\n    _encode_multipart_formdata, curl_to_arguments)\nfrom pyspider.libs.utils import md5string, timeout\nfrom pyspider.libs.ListIO import ListO\nfrom pyspider.libs.response import rebuild_response\nfrom pyspider.libs.pprint import pprint\nfrom pyspider.processor import ProcessorResult\n\n\ndef catch_status_code_error(func):\n    \"\"\"\n    Non-200 response will been regarded as fetch failed and will not pass to callback.\n    Use this decorator to override this feature.\n    \"\"\"\n    func._catch_status_code_error = True\n    return func\n\n\ndef not_send_status(func):\n    \"\"\"\n    Do not send process status package back to scheduler.\n\n    It's used by callbacks like on_message, on_result etc...\n    \"\"\"\n    @functools.wraps(func)\n    def wrapper(self, response, task):\n        self._extinfo['not_send_status'] = True\n        function = func.__get__(self, self.__class__)\n        return self._run_func(function, response, task)\n    return wrapper\n\n\ndef config(_config=None, **kwargs):\n    \"\"\"\n    A decorator for setting the default kwargs of `BaseHandler.crawl`.\n    Any self.crawl with this callback will use this config.\n    \"\"\"\n    if _config is None:\n        _config = {}\n    _config.update(kwargs)\n\n    def wrapper(func):\n        func._config = _config\n        return func\n    return wrapper\n\n\nclass NOTSET(object):\n    pass\n\n\ndef every(minutes=NOTSET, seconds=NOTSET):\n    \"\"\"\n    method will been called every minutes or seconds\n    \"\"\"\n    def wrapper(func):\n        # mark the function with variable 
'is_cronjob=True', the function would be\n        # collected into the list Handler._cron_jobs by meta class\n        func.is_cronjob = True\n\n        # collect interval and unify to seconds, it's used in meta class. See the\n        # comments in meta class.\n        func.tick = minutes * 60 + seconds\n        return func\n\n    if inspect.isfunction(minutes):\n        func = minutes\n        minutes = 1\n        seconds = 0\n        return wrapper(func)\n\n    if minutes is NOTSET:\n        if seconds is NOTSET:\n            minutes = 1\n            seconds = 0\n        else:\n            minutes = 0\n    if seconds is NOTSET:\n        seconds = 0\n\n    return wrapper\n\n\nclass BaseHandlerMeta(type):\n\n    def __new__(cls, name, bases, attrs):\n        # A list of all functions which is marked as 'is_cronjob=True'\n        cron_jobs = []\n\n        # The min_tick is the greatest common divisor(GCD) of the interval of cronjobs\n        # this value would be queried by scheduler when the project initial loaded.\n        # Scheudler may only send _on_cronjob task every min_tick seconds. 
It can reduce\n        # the number of tasks sent from scheduler.\n        min_tick = 0\n\n        for each in attrs.values():\n            if inspect.isfunction(each) and getattr(each, 'is_cronjob', False):\n                cron_jobs.append(each)\n                min_tick = fractions.gcd(min_tick, each.tick)\n        newcls = type.__new__(cls, name, bases, attrs)\n        newcls._cron_jobs = cron_jobs\n        newcls._min_tick = min_tick\n        return newcls\n\n\n@add_metaclass(BaseHandlerMeta)\nclass BaseHandler(object):\n    \"\"\"\n    BaseHandler for all scripts.\n\n    `BaseHandler.run` is the main method to handler the task.\n    \"\"\"\n    crawl_config = {}\n    project_name = None\n    _cron_jobs = []\n    _min_tick = 0\n    __env__ = {'not_inited': True}\n    retry_delay = {}\n\n    def _reset(self):\n        \"\"\"\n        reset before each task\n        \"\"\"\n        self._extinfo = {}\n        self._messages = []\n        self._follows = []\n        self._follows_keys = set()\n\n    def _run_func(self, function, *arguments):\n        \"\"\"\n        Running callback function with requested number of arguments\n        \"\"\"\n        args, varargs, keywords, defaults = inspect.getargspec(function)\n        task = arguments[-1]\n        process_time_limit = task['process'].get('process_time_limit',\n                                                 self.__env__.get('process_time_limit', 0))\n        if process_time_limit > 0:\n            with timeout(process_time_limit, 'process timeout'):\n                ret = function(*arguments[:len(args) - 1])\n        else:\n            ret = function(*arguments[:len(args) - 1])\n        return ret\n\n    def _run_task(self, task, response):\n        \"\"\"\n        Finding callback specified by `task['callback']`\n        raising status error for it if needed.\n        \"\"\"\n        process = task.get('process', {})\n        callback = process.get('callback', '__call__')\n        if not hasattr(self, 
callback):\n            raise NotImplementedError(\"self.%s() not implemented!\" % callback)\n\n        function = getattr(self, callback)\n        # do not run_func when 304\n        if response.status_code == 304 and not getattr(function, '_catch_status_code_error', False):\n            return None\n        if not getattr(function, '_catch_status_code_error', False):\n            response.raise_for_status()\n        return self._run_func(function, response, task)\n\n    def run_task(self, module, task, response):\n        \"\"\"\n        Processing the task, catching exceptions and logs, return a `ProcessorResult` object\n        \"\"\"\n        self.logger = logger = module.logger\n        result = None\n        exception = None\n        stdout = sys.stdout\n        self.task = task\n        if isinstance(response, dict):\n            response = rebuild_response(response)\n        self.response = response\n        self.save = (task.get('track') or {}).get('save', {})\n\n        try:\n            if self.__env__.get('enable_stdout_capture', True):\n                sys.stdout = ListO(module.log_buffer)\n            self._reset()\n            result = self._run_task(task, response)\n            if inspect.isgenerator(result):\n                for r in result:\n                    self._run_func(self.on_result, r, response, task)\n            else:\n                self._run_func(self.on_result, result, response, task)\n        except Exception as e:\n            logger.exception(e)\n            exception = e\n        finally:\n            follows = self._follows\n            messages = self._messages\n            logs = list(module.log_buffer)\n            extinfo = self._extinfo\n            save = self.save\n\n            sys.stdout = stdout\n            self.task = None\n            self.response = None\n            self.save = None\n\n        module.log_buffer[:] = []\n        return ProcessorResult(result, follows, messages, logs, exception, extinfo, save)\n\n 
   schedule_fields = ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update', 'auto_recrawl', 'cancel')\n    fetch_fields = ('method', 'headers', 'user_agent', 'data', 'connect_timeout', 'timeout', 'allow_redirects', 'cookies',\n                    'proxy', 'etag', 'last_modifed', 'last_modified', 'save', 'js_run_at', 'js_script',\n                    'js_viewport_width', 'js_viewport_height', 'load_images', 'fetch_type', 'use_gzip', 'validate_cert',\n                    'max_redirects', 'robots_txt')\n    process_fields = ('callback', 'process_time_limit')\n\n    @staticmethod\n    def task_join_crawl_config(task, crawl_config):\n        task_fetch = task.get('fetch', {})\n        for k in BaseHandler.fetch_fields:\n            if k in crawl_config:\n                v = crawl_config[k]\n                if isinstance(v, dict) and isinstance(task_fetch.get(k), dict):\n                    v = dict(v)\n                    v.update(task_fetch[k])\n                    task_fetch[k] = v\n                else:\n                    task_fetch.setdefault(k, v)\n        if task_fetch:\n            task['fetch'] = task_fetch\n\n        task_process = task.get('process', {})\n        for k in BaseHandler.process_fields:\n            if k in crawl_config:\n                v = crawl_config[k]\n                if isinstance(v, dict) and isinstance(task_process.get(k), dict):\n                    task_process[k].update(v)\n                else:\n                    task_process.setdefault(k, v)\n        if task_process:\n            task['process'] = task_process\n\n        return task\n\n    def _crawl(self, url, **kwargs):\n        \"\"\"\n        real crawl API\n\n        checking kwargs, and repack them to each sub-dict\n        \"\"\"\n        task = {}\n\n        assert len(url) < 1024, \"Maximum (1024) URL length error.\"\n\n        if kwargs.get('callback'):\n            callback = kwargs['callback']\n            if isinstance(callback, six.string_types) and 
hasattr(self, callback):\n                func = getattr(self, callback)\n            elif six.callable(callback) and six.get_method_self(callback) is self:\n                func = callback\n                kwargs['callback'] = func.__name__\n            elif six.callable(callback) and hasattr(self, callback.__name__):\n                func = getattr(self, callback.__name__)\n                kwargs['callback'] = func.__name__\n            else:\n                raise NotImplementedError(\"self.%s() not implemented!\" % callback)\n            if hasattr(func, '_config'):\n                for k, v in iteritems(func._config):\n                    if isinstance(v, dict) and isinstance(kwargs.get(k), dict):\n                        kwargs[k].update(v)\n                    else:\n                        kwargs.setdefault(k, v)\n\n        url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))\n        if kwargs.get('files'):\n            assert isinstance(\n                kwargs.get('data', {}), dict), \"data must be a dict when using with files!\"\n            content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),\n                                                            kwargs.pop('files', {}))\n            kwargs.setdefault('headers', {})\n            kwargs['headers']['Content-Type'] = content_type\n            kwargs['data'] = data\n        if kwargs.get('data'):\n            kwargs['data'] = _encode_params(kwargs['data'])\n        if kwargs.get('data'):\n            kwargs.setdefault('method', 'POST')\n\n        if kwargs.get('user_agent'):\n            kwargs.setdefault('headers', {})\n            kwargs['headers']['User-Agent'] = kwargs.get('user_agent')\n\n        schedule = {}\n        for key in self.schedule_fields:\n            if key in kwargs:\n                schedule[key] = kwargs.pop(key)\n            elif key in self.crawl_config:\n                schedule[key] = self.crawl_config[key]\n\n        task['schedule'] = 
schedule\n\n        fetch = {}\n        for key in self.fetch_fields:\n            if key in kwargs:\n                fetch[key] = kwargs.pop(key)\n        task['fetch'] = fetch\n\n        process = {}\n        for key in self.process_fields:\n            if key in kwargs:\n                process[key] = kwargs.pop(key)\n        task['process'] = process\n\n        task['project'] = self.project_name\n        task['url'] = url\n        if 'taskid' in kwargs:\n            task['taskid'] = kwargs.pop('taskid')\n        else:\n            task['taskid'] = self.get_taskid(task)\n\n        if kwargs:\n            raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())\n\n        if self.is_debugger():\n            task = self.task_join_crawl_config(task, self.crawl_config)\n\n        cache_key = \"%(project)s:%(taskid)s\" % task\n        if cache_key not in self._follows_keys:\n            self._follows_keys.add(cache_key)\n            self._follows.append(task)\n        return task\n\n    def get_taskid(self, task):\n        '''Generate taskid by information of task md5(url) by default, override me'''\n        return md5string(task['url'])\n\n    # apis\n    def crawl(self, url, **kwargs):\n        '''\n        available params:\n          url\n          callback\n\n          method\n          params\n          data\n          files\n          headers\n          timeout\n          allow_redirects\n          cookies\n          proxy\n          etag\n          last_modified\n          auto_recrawl\n\n          fetch_type\n          js_run_at\n          js_script\n          js_viewport_width\n          js_viewport_height\n          load_images\n\n          priority\n          retries\n          exetime\n          age\n          itag\n          cancel\n\n          save\n          taskid\n\n          full documents: http://pyspider.readthedocs.org/en/latest/apis/self.crawl/\n        '''\n\n        if isinstance(url, six.string_types) and 
url.startswith('curl '):\n            curl_kwargs = curl_to_arguments(url)\n            url = curl_kwargs.pop('urls')\n            for k, v in iteritems(curl_kwargs):\n                kwargs.setdefault(k, v)\n\n        if isinstance(url, six.string_types):\n            return self._crawl(url, **kwargs)\n        elif hasattr(url, \"__iter__\"):\n            result = []\n            for each in url:\n                result.append(self._crawl(each, **kwargs))\n            return result\n\n    def is_debugger(self):\n        \"\"\"Return true if running in debugger\"\"\"\n        return self.__env__.get('debugger')\n\n    def send_message(self, project, msg, url='data:,on_message'):\n        \"\"\"Send messages to other project.\"\"\"\n        self._messages.append((project, msg, url))\n\n    def on_message(self, project, msg):\n        \"\"\"Receive message from other project, override me.\"\"\"\n        pass\n\n    def on_result(self, result):\n        \"\"\"Receiving returns from other callback, override me.\"\"\"\n        if not result:\n            return\n        assert self.task, \"on_result can't outside a callback.\"\n        if self.is_debugger():\n            pprint(result)\n        if self.__env__.get('result_queue'):\n            self.__env__['result_queue'].put((self.task, result))\n\n    def on_finished(self, response, task):\n        \"\"\"\n        Triggered when all tasks in task queue finished.\n        http://docs.pyspider.org/en/latest/About-Projects/#on_finished-callback\n        \"\"\"\n        pass\n\n    @not_send_status\n    def _on_message(self, response):\n        project, msg = response.save\n        return self.on_message(project, msg)\n\n    @not_send_status\n    def _on_cronjob(self, response, task):\n        if (not response.save\n                or not isinstance(response.save, dict)\n                or 'tick' not in response.save):\n            return\n\n        # When triggered, a '_on_cronjob' task is sent from scheudler with 'tick' 
in\n        # Response.save. Scheduler may at least send the trigger task every GCD of the\n        # interval of the cronjobs. The method should check the tick for each cronjob\n        # function to confirm the execution interval.\n        for cronjob in self._cron_jobs:\n            if response.save['tick'] % cronjob.tick != 0:\n                continue\n            function = cronjob.__get__(self, self.__class__)\n            self._run_func(function, response, task)\n\n    def _on_get_info(self, response, task):\n        \"\"\"Sending runtime information about this script.\"\"\"\n        for each in response.save or []:\n            if each == 'min_tick':\n                self.save[each] = self._min_tick\n            elif each == 'retry_delay':\n                if not isinstance(self.retry_delay, dict):\n                    self.retry_delay = {'': self.retry_delay}\n                self.save[each] = self.retry_delay\n            elif each == 'crawl_config':\n                self.save[each] = self.crawl_config\n"
  },
  {
    "path": "pyspider/libs/bench.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-08 22:23:10\n# rate: 10000000000\n# burst: 10000000000\n\nimport time\nimport logging\nlogger = logging.getLogger('bench')\n\nfrom six.moves import queue as Queue\nfrom pyspider.scheduler import ThreadBaseScheduler as Scheduler\nfrom pyspider.fetcher.tornado_fetcher import Fetcher\nfrom pyspider.processor import Processor\nfrom pyspider.result import ResultWorker\nfrom pyspider.libs.utils import md5string\n\n\ndef bench_test_taskdb(taskdb):\n    project_name = '__bench_test__'\n    task = {\n        \"fetch\": {\n            \"fetch_type\": \"js\",\n            \"headers\": {\n                \"User-Agent\": \"BaiDuSpider\"\n            }\n        },\n        \"process\": {\n            \"callback\": \"detail_page\"\n        },\n        \"project\": project_name,\n        \"taskid\": \"553300d2582154413b4982c00c34a2d5\",\n        \"url\": \"http://www.sciencedirect.com/science/article/pii/S1674200109000704\"\n    }\n\n    track = {\n        \"fetch\": {\n            \"content\": None,\n            \"encoding\": \"unicode\",\n            \"error\": None,\n            \"headers\": {\n                \"last-modified\": \"Wed, 04 Mar 2015 09:24:33 GMT\"\n            },\n            \"ok\": True,\n            \"redirect_url\": None,\n            \"status_code\": 200,\n            \"time\": 5.543\n        },\n        \"process\": {\n            \"exception\": None,\n            \"follows\": 4,\n            \"logs\": \"\",\n            \"ok\": True,\n            \"result\": \"{'url': u'\",\n            \"time\": 0.07105398178100586\n        }\n    }\n\n    def test_insert(n, start=0):\n        logger.info(\"taskdb insert %d\", n)\n        start_time = time.time()\n        for i in range(n):\n            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)\n            
task['taskid'] = md5string(task['url'])\n            task['track'] = {}\n            taskdb.insert(task['project'], task['taskid'], task)\n        end_time = time.time()\n        cost_time = end_time - start_time\n        logger.info(\"cost %.2fs, %.2f/s %.2fms\",\n                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)\n\n    def test_update(n, start=0):\n        logger.info(\"taskdb update %d\" % n)\n        start_time = time.time()\n        for i in range(n):\n            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)\n            task['taskid'] = md5string(task['url'])\n            task['track'] = track\n            taskdb.update(task['project'], task['taskid'], task)\n        end_time = time.time()\n        cost_time = end_time - start_time\n        logger.info(\"cost %.2fs, %.2f/s %.2fms\",\n                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)\n\n    request_task_fields = [\n        'taskid',\n        'project',\n        'url',\n        'status',\n        'fetch',\n        'process',\n        'track',\n        'lastcrawltime'\n    ]\n\n    def test_get(n, start=0, random=True, fields=request_task_fields):\n        logger.info(\"taskdb get %d %s\" % (n, \"randomly\" if random else \"\"))\n        range_n = list(range(n))\n        if random:\n            from random import shuffle\n            shuffle(range_n)\n        start_time = time.time()\n        for i in range_n:\n            task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)\n            task['taskid'] = md5string(task['url'])\n            task['track'] = track\n            taskdb.get_task(task['project'], task['taskid'], fields=fields)\n        end_time = time.time()\n        cost_time = end_time - start_time\n        logger.info(\"cost %.2fs, %.2f/s %.2fms\",\n                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)\n\n    try:\n        test_insert(1000)\n        test_update(1000)\n        test_get(1000)\n        
test_insert(10000, 1000)\n        test_update(10000, 1000)\n        test_get(10000, 1000)\n    except Exception as e:\n        logger.exception(e)\n    finally:\n        taskdb.drop(project_name)\n\n\ndef bench_test_message_queue(queue):\n    task = {\n        \"fetch\": {\n            \"fetch_type\": \"js\",\n            \"headers\": {\n                \"User-Agent\": \"BaiDuSpider\"\n            }\n        },\n        \"process\": {\n            \"callback\": \"detail_page\"\n        },\n        \"project\": \"__bench_test__\",\n        \"taskid\": \"553300d2582154413b4982c00c34a2d5\",\n        \"url\": \"http://www.sciencedirect.com/science/article/pii/S1674200109000704\"\n    }\n\n    def test_put(n):\n        logger.info(\"message queue put %d\", n)\n        start_time = time.time()\n        for i in range(n):\n            task['url'] = 'http://bench.pyspider.org/?l=%d' % i\n            task['taskid'] = md5string(task['url'])\n            queue.put(task, block=True, timeout=1)\n        end_time = time.time()\n        cost_time = end_time - start_time\n        logger.info(\"cost %.2fs, %.2f/s %.2fms\",\n                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)\n\n    def test_get(n):\n        logger.info(\"message queue get %d\", n)\n        start_time = time.time()\n        for i in range(n):\n            try:\n                queue.get(True, 1)\n            except Queue.Empty:\n                logger.error('message queue empty while get %d', i)\n                raise\n        end_time = time.time()\n        cost_time = end_time - start_time\n        logger.info(\"cost %.2fs, %.2f/s %.2fms\",\n                    cost_time, n * 1.0 / cost_time, cost_time / n * 1000)\n\n    try:\n        test_put(1000)\n        test_get(1000)\n        test_put(10000)\n        test_get(10000)\n    except Exception as e:\n        logger.exception(e)\n    finally:\n        if hasattr(queue, 'channel'):\n            queue.channel.queue_purge(queue.name)\n\n        # 
clear message queue\n        try:\n            while queue.get(False):\n                continue\n        except Queue.Empty:\n            pass\n\n\nclass BenchMixin(object):\n    \"\"\"Report to logger for bench test\"\"\"\n    def _bench_init(self):\n        self.done_cnt = 0\n        self.start_time = time.time()\n        self.last_cnt = 0\n        self.last_report = 0\n\n    def _bench_report(self, name, prefix=0, rjust=0):\n        self.done_cnt += 1\n        now = time.time()\n        if now - self.last_report >= 1:\n            rps = float(self.done_cnt - self.last_cnt) / (now - self.last_report)\n            output = ''\n            if prefix:\n                output += \" \" * prefix\n            output += (\"%s %s pages (at %d pages/min)\" % (\n                name, self.done_cnt, rps * 60.0)).rjust(rjust)\n            logger.info(output)\n            self.last_cnt = self.done_cnt\n            self.last_report = now\n\n\nclass BenchScheduler(Scheduler, BenchMixin):\n    def __init__(self, *args, **kwargs):\n        super(BenchScheduler, self).__init__(*args, **kwargs)\n        self._bench_init()\n\n    def on_task_status(self, task):\n        self._bench_report('Crawled')\n        return super(BenchScheduler, self).on_task_status(task)\n\n\nclass BenchFetcher(Fetcher, BenchMixin):\n    def __init__(self, *args, **kwargs):\n        super(BenchFetcher, self).__init__(*args, **kwargs)\n        self._bench_init()\n\n    def on_result(self, type, task, result):\n        self._bench_report(\"Fetched\", 0, 75)\n        return super(BenchFetcher, self).on_result(type, task, result)\n\n\nclass BenchProcessor(Processor, BenchMixin):\n    def __init__(self, *args, **kwargs):\n        super(BenchProcessor, self).__init__(*args, **kwargs)\n        self._bench_init()\n\n    def on_task(self, task, response):\n        self._bench_report(\"Processed\", 75)\n        return super(BenchProcessor, self).on_task(task, response)\n\n\nclass BenchResultWorker(ResultWorker, 
BenchMixin):\n    def __init__(self, *args, **kwargs):\n        super(BenchResultWorker, self).__init__(*args, **kwargs)\n        self._bench_init()\n\n    def on_result(self, task, result):\n        self._bench_report(\"Saved\", 0, 150)\n        super(BenchResultWorker, self).on_result(task, result)\n\n\nfrom pyspider.libs.base_handler import BaseHandler\n\n\nclass Handler(BaseHandler):\n    def on_start(self, response):\n        self.crawl('http://127.0.0.1:5000/bench',\n                   params={'total': response.save.get('total', 10000), 'show': response.save.get('show', 20)},\n                   callback=self.index_page)\n\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http://\"]').items():\n            self.crawl(each.attr.href, callback=self.index_page)\n        return response.url\n"
  },
  {
    "path": "pyspider/libs/counter.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-11-14 17:09:50\n\nfrom __future__ import unicode_literals, division, absolute_import\n\nimport time\nimport logging\nfrom collections import deque\ntry:\n    from UserDict import DictMixin\nexcept ImportError:\n    from collections import Mapping as DictMixin\n\nimport six\nfrom six import iteritems\nfrom six.moves import cPickle\n\n\nclass BaseCounter(object):\n\n    def __init__(self):\n        pass\n\n    def event(self, value=1):\n        \"\"\"Fire a event.\"\"\"\n        raise NotImplementedError\n\n    def value(self, value):\n        \"\"\"Set counter value.\"\"\"\n        raise NotImplementedError\n\n    @property\n    def avg(self):\n        \"\"\"Get average value\"\"\"\n        raise NotImplementedError\n\n    @property\n    def sum(self):\n        \"\"\"Get sum of counter\"\"\"\n        raise NotImplementedError\n\n    def empty(self):\n        \"\"\"Clear counter\"\"\"\n        raise NotImplementedError\n\n\nclass TotalCounter(BaseCounter):\n    \"\"\"Total counter\"\"\"\n\n    def __init__(self):\n        super(TotalCounter, self).__init__()\n        self.cnt = 0\n\n    def event(self, value=1):\n        self.cnt += value\n\n    def value(self, value):\n        self.cnt = value\n\n    @property\n    def avg(self):\n        return self.cnt\n\n    @property\n    def sum(self):\n        return self.cnt\n\n    def empty(self):\n        return self.cnt == 0\n\n\nclass AverageWindowCounter(BaseCounter):\n    \"\"\"\n    Record last N(window) value\n    \"\"\"\n\n    def __init__(self, window_size=300):\n        super(AverageWindowCounter, self).__init__()\n        self.window_size = window_size\n        self.values = deque(maxlen=window_size)\n\n    def event(self, value=1):\n        self.values.append(value)\n\n    value = event\n\n    @property\n    def avg(self):\n   
     return self.sum / len(self.values)\n\n    @property\n    def sum(self):\n        return sum(self.values)\n\n    def empty(self):\n        if not self.values:\n            return True\n\n\nclass TimebaseAverageEventCounter(BaseCounter):\n    \"\"\"\n    Record last window_size * window_interval seconds event.\n\n    records will trim ever window_interval seconds\n    \"\"\"\n\n    def __init__(self, window_size=30, window_interval=10):\n        super(TimebaseAverageEventCounter, self).__init__()\n        self.max_window_size = window_size\n        self.window_size = 0\n        self.window_interval = window_interval\n        self.values = deque(maxlen=window_size)\n        self.events = deque(maxlen=window_size)\n        self.times = deque(maxlen=window_size)\n\n        self.cache_value = 0\n        self.cache_event = 0\n        self.cache_start = None\n        self._first_data_time = None\n\n    def event(self, value=1):\n        now = time.time()\n        if self._first_data_time is None:\n            self._first_data_time = now\n\n        if self.cache_start is None:\n            self.cache_value = value\n            self.cache_event = 1\n            self.cache_start = now\n        elif now - self.cache_start > self.window_interval:\n            self.values.append(self.cache_value)\n            self.events.append(self.cache_event)\n            self.times.append(self.cache_start)\n            self.on_append(self.cache_value, self.cache_start)\n            self.cache_value = value\n            self.cache_event = 1\n            self.cache_start = now\n        else:\n            self.cache_value += value\n            self.cache_event += 1\n        return self\n\n    def value(self, value):\n        self.cache_value = value\n\n    def _trim_window(self):\n        now = time.time()\n        if self.cache_start and now - self.cache_start > self.window_interval:\n            self.values.append(self.cache_value)\n            self.events.append(self.cache_event)\n      
      self.times.append(self.cache_start)\n            self.on_append(self.cache_value, self.cache_start)\n            self.cache_value = 0\n            self.cache_start = None\n\n        if self.window_size != self.max_window_size and self._first_data_time is not None:\n            time_passed = now - self._first_data_time\n            self.window_size = min(self.max_window_size, time_passed / self.window_interval)\n        window_limit = now - self.window_size * self.window_interval\n        while self.times and self.times[0] < window_limit:\n            self.times.popleft()\n            self.events.popleft()\n            self.values.popleft()\n\n    @property\n    def avg(self):\n        events = (sum(self.events) + self.cache_event)\n        if not events:\n            return 0\n        return float(self.sum) / events\n\n    @property\n    def sum(self):\n        self._trim_window()\n        return sum(self.values) + self.cache_value\n\n    def empty(self):\n        self._trim_window()\n        if not self.values and not self.cache_start:\n            return True\n\n    def on_append(self, value, time):\n        pass\n\n\nclass TimebaseAverageWindowCounter(BaseCounter):\n    \"\"\"\n    Record last window_size * window_interval seconds values.\n\n    records will trim ever window_interval seconds\n    \"\"\"\n\n    def __init__(self, window_size=30, window_interval=10):\n        super(TimebaseAverageWindowCounter, self).__init__()\n        self.max_window_size = window_size\n        self.window_size = 0\n        self.window_interval = window_interval\n        self.values = deque(maxlen=window_size)\n        self.times = deque(maxlen=window_size)\n\n        self.cache_value = 0\n        self.cache_start = None\n        self._first_data_time = None\n\n    def event(self, value=1):\n        now = time.time()\n        if self._first_data_time is None:\n            self._first_data_time = now\n\n        if self.cache_start is None:\n            self.cache_value = 
value\n            self.cache_start = now\n        elif now - self.cache_start > self.window_interval:\n            self.values.append(self.cache_value)\n            self.times.append(self.cache_start)\n            self.on_append(self.cache_value, self.cache_start)\n            self.cache_value = value\n            self.cache_start = now\n        else:\n            self.cache_value += value\n        return self\n\n    def value(self, value):\n        self.cache_value = value\n\n    def _trim_window(self):\n        now = time.time()\n        if self.cache_start and now - self.cache_start > self.window_interval:\n            self.values.append(self.cache_value)\n            self.times.append(self.cache_start)\n            self.on_append(self.cache_value, self.cache_start)\n            self.cache_value = 0\n            self.cache_start = None\n\n        if self.window_size != self.max_window_size and self._first_data_time is not None:\n            time_passed = now - self._first_data_time\n            self.window_size = min(self.max_window_size, time_passed / self.window_interval)\n        window_limit = now - self.window_size * self.window_interval\n        while self.times and self.times[0] < window_limit:\n            self.times.popleft()\n            self.values.popleft()\n\n    @property\n    def avg(self):\n        sum = float(self.sum)\n        if not self.window_size:\n            return 0\n        return sum / self.window_size / self.window_interval\n\n    @property\n    def sum(self):\n        self._trim_window()\n        return sum(self.values) + self.cache_value\n\n    def empty(self):\n        self._trim_window()\n        if not self.values and not self.cache_start:\n            return True\n\n    def on_append(self, value, time):\n        pass\n\n\nclass CounterValue(DictMixin):\n    \"\"\"\n    A dict like value item for CounterManager.\n    \"\"\"\n\n    def __init__(self, manager, keys):\n        self.manager = manager\n        self._keys = keys\n\n   
 def __getitem__(self, key):\n        if key == '__value__':\n            key = self._keys\n            return self.manager.counters[key]\n        else:\n            key = self._keys + (key, )\n\n        available_keys = []\n        for _key in list(self.manager.counters.keys()):\n            if _key[:len(key)] == key:\n                available_keys.append(_key)\n\n        if len(available_keys) == 0:\n            raise KeyError\n        elif len(available_keys) == 1:\n            if available_keys[0] == key:\n                return self.manager.counters.get(key)\n            else:\n                return CounterValue(self.manager, key)\n        else:\n            return CounterValue(self.manager, key)\n\n    def __len__(self):\n        return len(self.keys())\n\n    def __iter__(self):\n        return iter(self.keys())\n\n    def __contains__(self, key):\n        return key in self.keys()\n\n    def keys(self):\n        result = set()\n        for key in list(self.manager.counters.keys()):\n            if key[:len(self._keys)] == self._keys:\n                key = key[len(self._keys):]\n                result.add(key[0] if key else '__value__')\n        return result\n\n    def to_dict(self, get_value=None):\n        \"\"\"Dump counters as a dict\"\"\"\n        result = {}\n        for key, value in iteritems(self):\n            if isinstance(value, BaseCounter):\n                if get_value is not None:\n                    value = getattr(value, get_value)\n                result[key] = value\n            else:\n                result[key] = value.to_dict(get_value)\n        return result\n\n\nclass CounterManager(DictMixin):\n    \"\"\"\n    A dict like counter manager.\n\n    When using a tuple as event key, say: ('foo', 'bar'), You can visite counter\n    with manager['foo']['bar'].  
Or get all counters which first element is 'foo'\n    by manager['foo'].\n\n    It's useful for a group of counters.\n    \"\"\"\n\n    def __init__(self, cls=TimebaseAverageWindowCounter):\n        \"\"\"init manager with Counter cls\"\"\"\n        self.cls = cls\n        self.counters = {}\n\n    def event(self, key, value=1):\n        \"\"\"Fire a event of a counter by counter key\"\"\"\n        if isinstance(key, six.string_types):\n            key = (key, )\n        assert isinstance(key, tuple), \"event key type error\"\n        if key not in self.counters:\n            self.counters[key] = self.cls()\n        self.counters[key].event(value)\n        return self\n\n    def value(self, key, value=1):\n        \"\"\"Set value of a counter by counter key\"\"\"\n        if isinstance(key, six.string_types):\n            key = (key, )\n        # assert all(isinstance(k, six.string_types) for k in key)\n        assert isinstance(key, tuple), \"event key type error\"\n        if key not in self.counters:\n            self.counters[key] = self.cls()\n        self.counters[key].value(value)\n        return self\n\n    def trim(self):\n        \"\"\"Clear not used counters\"\"\"\n        for key, value in list(iteritems(self.counters)):\n            if value.empty():\n                del self.counters[key]\n\n    def __getitem__(self, key):\n        key = (key, )\n        available_keys = []\n        for _key in list(self.counters.keys()):\n            if _key[:len(key)] == key:\n                available_keys.append(_key)\n\n        if len(available_keys) == 0:\n            raise KeyError\n        elif len(available_keys) == 1:\n            if available_keys[0] == key:\n                return self.counters.get(key)\n            else:\n                return CounterValue(self, key)\n        else:\n            return CounterValue(self, key)\n\n    def __delitem__(self, key):\n        key = (key, )\n        available_keys = []\n        for _key in 
list(self.counters.keys()):\n            if _key[:len(key)] == key:\n                available_keys.append(_key)\n        for _key in available_keys:\n            del self.counters[_key]\n\n    def __iter__(self):\n        return iter(self.keys())\n\n    def __len__(self):\n        return len(self.keys())\n\n    def keys(self):\n        result = set()\n        for key in self.counters.keys():\n            result.add(key[0] if key else ())\n        return result\n\n    def to_dict(self, get_value=None):\n        \"\"\"Dump counters as a dict\"\"\"\n        self.trim()\n        result = {}\n        for key, value in iteritems(self.counters):\n            if get_value is not None:\n                value = getattr(value, get_value)\n            r = result\n            for _key in key[:-1]:\n                r = r.setdefault(_key, {})\n            r[key[-1]] = value\n        return result\n\n    def dump(self, filename):\n        \"\"\"Dump counters to file\"\"\"\n        try:\n            with open(filename, 'wb') as fp:\n                cPickle.dump(self.counters, fp)\n        except Exception as e:\n            logging.warning(\"can't dump counter to file %s: %s\", filename, e)\n            return False\n        return True\n\n    def load(self, filename):\n        \"\"\"Load counters to file\"\"\"\n        try:\n            with open(filename, 'rb') as fp:\n                self.counters = cPickle.load(fp)\n        except:\n            logging.debug(\"can't load counter from file: %s\", filename)\n            return False\n        return True\n"
  },
  {
    "path": "pyspider/libs/dataurl.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-11-16 10:33:20\n\nimport six\nfrom base64 import b64encode, b64decode\nfrom . import utils\nfrom six.moves.urllib.parse import quote, unquote\n\n\ndef encode(data, mime_type='', charset='utf-8', base64=True):\n    \"\"\"\n    Encode data to DataURL\n    \"\"\"\n    if isinstance(data, six.text_type):\n        data = data.encode(charset)\n    else:\n        charset = None\n    if base64:\n        data = utils.text(b64encode(data))\n    else:\n        data = utils.text(quote(data))\n\n    result = ['data:', ]\n    if mime_type:\n        result.append(mime_type)\n    if charset:\n        result.append(';charset=')\n        result.append(charset)\n    if base64:\n        result.append(';base64')\n    result.append(',')\n    result.append(data)\n\n    return ''.join(result)\n\n\ndef decode(data_url):\n    \"\"\"\n    Decode DataURL data\n    \"\"\"\n    metadata, data = data_url.rsplit(',', 1)\n    _, metadata = metadata.split('data:', 1)\n    parts = metadata.split(';')\n    if parts[-1] == 'base64':\n        data = b64decode(data)\n    else:\n        data = unquote(data)\n\n    for part in parts:\n        if part.startswith(\"charset=\"):\n            data = data.decode(part[8:])\n    return data\n"
  },
  {
    "path": "pyspider/libs/log.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-10-24 16:08:17\n\nimport logging\n\ntry:\n    import curses\nexcept ImportError:\n    curses = None\n\nfrom tornado.log import LogFormatter as _LogFormatter\n\n\nclass LogFormatter(_LogFormatter, object):\n    \"\"\"Init tornado.log.LogFormatter from logging.config.fileConfig\"\"\"\n    def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs):\n        if fmt is None:\n            fmt = _LogFormatter.DEFAULT_FORMAT\n        super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs)\n\n\nclass SaveLogHandler(logging.Handler):\n    \"\"\"LogHandler that save records to a list\"\"\"\n\n    def __init__(self, saveto=None, *args, **kwargs):\n        self.saveto = saveto\n        logging.Handler.__init__(self, *args, **kwargs)\n\n    def emit(self, record):\n        if self.saveto is not None:\n            self.saveto.append(record)\n\n    handle = emit\n\n\ndef enable_pretty_logging(logger=logging.getLogger()):\n    channel = logging.StreamHandler()\n    channel.setFormatter(LogFormatter())\n    logger.addHandler(channel)\n"
  },
  {
    "path": "pyspider/libs/multiprocessing_queue.py",
    "content": "import six\nimport platform\nimport multiprocessing\nfrom multiprocessing.queues import Queue as BaseQueue\n\n\n# The SharedCounter and Queue classes come from:\n# https://github.com/vterron/lemon/commit/9ca6b4b\n\nclass SharedCounter(object):\n    \"\"\" A synchronized shared counter.\n    The locking done by multiprocessing.Value ensures that only a single\n    process or thread may read or write the in-memory ctypes object. However,\n    in order to do n += 1, Python performs a read followed by a write, so a\n    second process may read the old value before the new one is written by the\n    first process. The solution is to use a multiprocessing.Lock to guarantee\n    the atomicity of the modifications to Value.\n    This class comes almost entirely from Eli Bendersky's blog:\n    http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/\n    \"\"\"\n\n    def __init__(self, n=0):\n        self.count = multiprocessing.Value('i', n)\n\n    def increment(self, n=1):\n        \"\"\" Increment the counter by n (default = 1) \"\"\"\n        with self.count.get_lock():\n            self.count.value += n\n\n    @property\n    def value(self):\n        \"\"\" Return the value of the counter \"\"\"\n        return self.count.value\n\n\nclass MultiProcessingQueue(BaseQueue):\n    \"\"\" A portable implementation of multiprocessing.Queue.\n    Because of multithreading / multiprocessing semantics, Queue.qsize() may\n    raise the NotImplementedError exception on Unix platforms like Mac OS X\n    where sem_getvalue() is not implemented. This subclass addresses this\n    problem by using a synchronized shared counter (initialized to zero) and\n    increasing / decreasing its value every time the put() and get() methods\n    are called, respectively. 
This not only prevents NotImplementedError from\n    being raised, but also allows us to implement a reliable version of both\n    qsize() and empty().\n    \"\"\"\n    def __init__(self, *args, **kwargs):\n        super(MultiProcessingQueue, self).__init__(*args, **kwargs)\n        self.size = SharedCounter(0)\n\n    def put(self, *args, **kwargs):\n        self.size.increment(1)\n        super(MultiProcessingQueue, self).put(*args, **kwargs)\n\n    def get(self, *args, **kwargs):\n        v = super(MultiProcessingQueue, self).get(*args, **kwargs)\n        self.size.increment(-1)\n        return v\n\n    def qsize(self):\n        \"\"\" Reliable implementation of multiprocessing.Queue.qsize() \"\"\"\n        return self.size.value\n\n\nif platform.system() == 'Darwin':\n    if hasattr(multiprocessing, 'get_context'):  # for py34\n        def Queue(maxsize=0):\n            return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context())\n    else:\n        def Queue(maxsize=0):\n            return MultiProcessingQueue(maxsize)\nelse:\n    from multiprocessing import Queue  # flake8: noqa\n"
  },
  {
    "path": "pyspider/libs/pprint.py",
    "content": "#  Author:      Fred L. Drake, Jr.\n#               fdrake@...\n#\n#  This is a simple little module I wrote to make life easier.  I didn't\n#  see anything quite like it in the library, though I may have overlooked\n#  something.  I wrote this when I was trying to read some heavily nested\n#  tuples with fairly non-descriptive content.  This is modeled very much\n#  after Lisp/Scheme - style pretty-printing of lists.  If you find it\n#  useful, thank small children who sleep at night.\n\n\"\"\"Support to pretty-print lists, tuples, & dictionaries recursively.\n\nVery simple, but useful, especially in debugging data structures.\n\nClasses\n-------\n\nPrettyPrinter()\n    Handle pretty-printing operations onto a stream using a configured\n    set of formatting parameters.\n\nFunctions\n---------\n\npformat()\n    Format a Python object into a pretty-printed representation.\n\npprint()\n    Pretty-print a Python object to a stream [default is sys.stdout].\n\nsaferepr()\n    Generate a 'standard' repr()-like value, but protect against recursive\n    data structures.\n\n\"\"\"\n\nfrom __future__ import print_function\n\nimport six\nimport sys as _sys\n\nfrom io import BytesIO, StringIO\n\n__all__ = [\"pprint\", \"pformat\", \"isreadable\", \"isrecursive\", \"saferepr\",\n           \"PrettyPrinter\"]\n\n# cache these for faster access:\n_commajoin = \", \".join\n_id = id\n_len = len\n_type = type\n\n\ndef pprint(object, stream=None, indent=1, width=80, depth=None):\n    \"\"\"Pretty-print a Python object to a stream [default is sys.stdout].\"\"\"\n    printer = PrettyPrinter(\n        stream=stream, indent=indent, width=width, depth=depth)\n    printer.pprint(object)\n\n\ndef pformat(object, indent=1, width=80, depth=None):\n    \"\"\"Format a Python object into a pretty-printed representation.\"\"\"\n    return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)\n\n\ndef saferepr(object):\n    \"\"\"Version of repr() which can handle 
recursive data structures.\"\"\"\n    return _safe_repr(object, {}, None, 0)[0]\n\n\ndef isreadable(object):\n    \"\"\"Determine if saferepr(object) is readable by eval().\"\"\"\n    return _safe_repr(object, {}, None, 0)[1]\n\n\ndef isrecursive(object):\n    \"\"\"Determine if object requires a recursive representation.\"\"\"\n    return _safe_repr(object, {}, None, 0)[2]\n\n\ndef _sorted(iterable):\n    return sorted(iterable)\n\n\nclass PrettyPrinter:\n\n    def __init__(self, indent=1, width=80, depth=None, stream=None):\n        \"\"\"Handle pretty printing operations onto a stream using a set of\n        configured parameters.\n\n        indent\n            Number of spaces to indent for each level of nesting.\n\n        width\n            Attempted maximum number of columns in the output.\n\n        depth\n            The maximum depth to print out nested structures.\n\n        stream\n            The desired output stream.  If omitted (or false), the standard\n            output stream available at construction will be used.\n\n        \"\"\"\n        indent = int(indent)\n        width = int(width)\n        assert indent >= 0, \"indent must be >= 0\"\n        assert depth is None or depth > 0, \"depth must be > 0\"\n        assert width, \"width must be != 0\"\n        self._depth = depth\n        self._indent_per_level = indent\n        self._width = width\n        if stream is not None:\n            self._stream = stream\n        else:\n            self._stream = _sys.stdout\n\n    def pprint(self, object):\n        self._format(object, self._stream, 0, 0, {}, 0)\n        self._stream.write(\"\\n\")\n\n    def pformat(self, object):\n        sio = BytesIO()\n        self._format(object, sio, 0, 0, {}, 0)\n        return sio.getvalue()\n\n    def isrecursive(self, object):\n        return self.format(object, {}, 0, 0)[2]\n\n    def isreadable(self, object):\n        s, readable, recursive = self.format(object, {}, 0, 0)\n        return readable and not 
recursive\n\n    def _format(self, object, stream, indent, allowance, context, level):\n        level = level + 1\n        objid = _id(object)\n        if objid in context:\n            stream.write(_recursion(object))\n            self._recursive = True\n            self._readable = False\n            return\n        rep = self._repr(object, context, level - 1)\n        typ = _type(object)\n        sepLines = _len(rep) > (self._width - 1 - indent - allowance)\n        write = stream.write\n\n        if self._depth and level > self._depth:\n            write(rep)\n            return\n\n        r = getattr(typ, \"__repr__\", None)\n        if issubclass(typ, dict) and r is dict.__repr__:\n            write('{')\n            if self._indent_per_level > 1:\n                write((self._indent_per_level - 1) * ' ')\n            length = _len(object)\n            if length:\n                context[objid] = 1\n                indent = indent + self._indent_per_level\n                items = _sorted(object.items())\n                key, ent = items[0]\n                rep = self._repr(key, context, level)\n                write(rep)\n                write(': ')\n                self._format(ent, stream, indent + _len(rep) + 2,\n                             allowance + 1, context, level)\n                if length > 1:\n                    for key, ent in items[1:]:\n                        rep = self._repr(key, context, level)\n                        if sepLines:\n                            write(',\\n%s%s: ' % (' ' * indent, rep))\n                        else:\n                            write(', %s: ' % rep)\n                        self._format(ent, stream, indent + _len(rep) + 2,\n                                     allowance + 1, context, level)\n                indent = indent - self._indent_per_level\n                del context[objid]\n            write('}')\n            return\n\n        if (\n                (issubclass(typ, list) and r is list.__repr__) 
or\n                (issubclass(typ, tuple) and r is tuple.__repr__) or\n                (issubclass(typ, set) and r is set.__repr__) or\n                (issubclass(typ, frozenset) and r is frozenset.__repr__)\n        ):\n            length = _len(object)\n            if issubclass(typ, list):\n                write('[')\n                endchar = ']'\n            elif issubclass(typ, set):\n                if not length:\n                    write('set()')\n                    return\n                write('set([')\n                endchar = '])'\n                object = _sorted(object)\n                indent += 4\n            elif issubclass(typ, frozenset):\n                if not length:\n                    write('frozenset()')\n                    return\n                write('frozenset([')\n                endchar = '])'\n                object = _sorted(object)\n                indent += 10\n            else:\n                write('(')\n                endchar = ')'\n            if self._indent_per_level > 1 and sepLines:\n                write((self._indent_per_level - 1) * ' ')\n            if length:\n                context[objid] = 1\n                indent = indent + self._indent_per_level\n                self._format(object[0], stream, indent, allowance + 1,\n                             context, level)\n                if length > 1:\n                    for ent in object[1:]:\n                        if sepLines:\n                            write(',\\n' + ' ' * indent)\n                        else:\n                            write(', ')\n                        self._format(ent, stream, indent,\n                                     allowance + 1, context, level)\n                indent = indent - self._indent_per_level\n                del context[objid]\n            if issubclass(typ, tuple) and length == 1:\n                write(',')\n            write(endchar)\n            return\n\n        write(rep)\n\n    def _repr(self, object, 
context, level):\n        repr, readable, recursive = self.format(object, context.copy(),\n                                                self._depth, level)\n        if not readable:\n            self._readable = False\n        if recursive:\n            self._recursive = True\n        return repr\n\n    def format(self, object, context, maxlevels, level):\n        \"\"\"Format object for a specific context, returning a string\n        and flags indicating whether the representation is 'readable'\n        and whether the object represents a recursive construct.\n        \"\"\"\n        return _safe_repr(object, context, maxlevels, level)\n\n\n# Return triple (repr_string, isreadable, isrecursive).\n\ndef _safe_repr(object, context, maxlevels, level):\n    typ = _type(object)\n    if typ is str:\n        string = object\n        string = string.replace('\\n', '\\\\n').replace('\\r', '\\\\r').replace('\\t', '\\\\t')\n        if 'locale' not in _sys.modules:\n            return repr(object), True, False\n        if \"'\" in object and '\"' not in object:\n            closure = '\"'\n            quotes = {'\"': '\\\\\"'}\n            string = string.replace('\"', '\\\\\"')\n        else:\n            closure = \"'\"\n            quotes = {\"'\": \"\\\\'\"}\n            string = string.replace(\"'\", \"\\\\'\")\n        try:\n            string.decode('utf8').encode('gbk', 'replace')\n            return (\"%s%s%s\" % (closure, string, closure)), True, False\n        except:\n            pass\n        qget = quotes.get\n        sio = StringIO()\n        write = sio.write\n        for char in object:\n            if char.isalpha():\n                write(char)\n            else:\n                write(qget(char, repr(char)[1:-1]))\n        return (\"%s%s%s\" % (closure, sio.getvalue(), closure)), True, False\n\n    if typ is six.text_type:\n        string = object.encode(\"utf8\", 'replace')\n        string = string.replace('\\n', '\\\\n').replace('\\r', 
'\\\\r').replace('\\t', '\\\\t')\n        if \"'\" in object and '\"' not in object:\n            closure = '\"'\n            quotes = {'\"': '\\\\\"'}\n            string = string.replace('\"', '\\\\\"')\n        else:\n            closure = \"'\"\n            quotes = {\"'\": \"\\\\'\"}\n            string = string.replace(\"'\", \"\\\\'\")\n        return (\"u%s%s%s\" % (closure, string, closure)), True, False\n\n    r = getattr(typ, \"__repr__\", None)\n    if issubclass(typ, dict) and r is dict.__repr__:\n        if not object:\n            return \"{}\", True, False\n        objid = _id(object)\n        if maxlevels and level >= maxlevels:\n            return \"{...}\", False, objid in context\n        if objid in context:\n            return _recursion(object), False, True\n        context[objid] = 1\n        readable = True\n        recursive = False\n        components = []\n        append = components.append\n        level += 1\n        saferepr = _safe_repr\n        for k, v in _sorted(object.items()):\n            krepr, kreadable, krecur = saferepr(k, context, maxlevels, level)\n            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)\n            append(\"%s: %s\" % (krepr, vrepr))\n            readable = readable and kreadable and vreadable\n            if krecur or vrecur:\n                recursive = True\n        del context[objid]\n        return \"{%s}\" % _commajoin(components), readable, recursive\n\n    if (issubclass(typ, list) and r is list.__repr__) or \\\n            (issubclass(typ, tuple) and r is tuple.__repr__):\n        if issubclass(typ, list):\n            if not object:\n                return \"[]\", True, False\n            format = \"[%s]\"\n        elif _len(object) == 1:\n            format = \"(%s,)\"\n        else:\n            if not object:\n                return \"()\", True, False\n            format = \"(%s)\"\n        objid = _id(object)\n        if maxlevels and level >= maxlevels:\n            
return format % \"...\", False, objid in context\n        if objid in context:\n            return _recursion(object), False, True\n        context[objid] = 1\n        readable = True\n        recursive = False\n        components = []\n        append = components.append\n        level += 1\n        for o in object:\n            orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level)\n            append(orepr)\n            if not oreadable:\n                readable = False\n            if orecur:\n                recursive = True\n        del context[objid]\n        return format % _commajoin(components), readable, recursive\n\n    rep = repr(object)\n    return rep, (rep and not rep.startswith('<')), False\n\n\ndef _recursion(object):\n    return (\"<Recursion on %s with id=%s>\"\n            % (_type(object).__name__, _id(object)))\n\n\ndef _perfcheck(object=None):\n    import time\n    if object is None:\n        object = [(\"string\", (1, 2), [3, 4], {5: 6, 7: 8})] * 100000\n    p = PrettyPrinter()\n    t1 = time.time()\n    _safe_repr(object, {}, None, 0)\n    t2 = time.time()\n    p.pformat(object)\n    t3 = time.time()\n    print(\"_safe_repr:\", t2 - t1)\n    print(\"pformat:\", t3 - t2)\n\nif __name__ == \"__main__\":\n    _perfcheck()\n"
  },
  {
    "path": "pyspider/libs/response.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-11-02 11:16:02\n\nimport cgi\nimport re\nimport six\nimport json\nimport chardet\nimport lxml.html\nimport lxml.etree\nfrom tblib import Traceback\nfrom pyquery import PyQuery\nfrom requests.structures import CaseInsensitiveDict\nfrom requests import HTTPError\nfrom pyspider.libs import utils\n\n\nclass Response(object):\n\n    def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsensitiveDict(),\n                 content='', cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0):\n        if cookies is None:\n            cookies = {}\n        self.status_code = status_code\n        self.url = url\n        self.orig_url = orig_url\n        self.headers = headers\n        self.content = content\n        self.cookies = cookies\n        self.error = error\n        self.traceback = traceback\n        self.save = save\n        self.js_script_result = js_script_result\n        self.time = time\n\n    def __repr__(self):\n        return u'<Response [%d]>' % self.status_code\n\n    def __bool__(self):\n        \"\"\"Returns true if `status_code` is 200 and no error\"\"\"\n        return self.ok\n\n    def __nonzero__(self):\n        \"\"\"Returns true if `status_code` is 200 and no error.\"\"\"\n        return self.ok\n\n    @property\n    def ok(self):\n        \"\"\"Return true if `status_code` is 200 and no error.\"\"\"\n        try:\n            self.raise_for_status()\n        except:\n            return False\n        return True\n\n    @property\n    def encoding(self):\n        \"\"\"\n        encoding of Response.content.\n\n        if Response.encoding is None, encoding will be guessed\n        by header or content or chardet if available.\n        \"\"\"\n        if hasattr(self, '_encoding'):\n            return 
self._encoding\n\n        # content is unicode\n        if isinstance(self.content, six.text_type):\n            return 'unicode'\n\n        # Try charset from content-type or content\n        encoding = get_encoding(self.headers, self.content)\n\n        # Fallback to auto-detected encoding.\n        if not encoding and chardet is not None:\n            encoding = chardet.detect(self.content[:600])['encoding']\n\n        if encoding and encoding.lower() == 'gb2312':\n            encoding = 'gb18030'\n\n        self._encoding = encoding or 'utf-8'\n        return self._encoding\n\n    @encoding.setter\n    def encoding(self, value):\n        \"\"\"\n        set encoding of content manually\n        it will overwrite the guessed encoding\n        \"\"\"\n        self._encoding = value\n        self._text = None\n\n    @property\n    def text(self):\n        \"\"\"\n        Content of the response, in unicode.\n\n        if Response.encoding is None and chardet module is available, encoding\n        will be guessed.\n        \"\"\"\n        if hasattr(self, '_text') and self._text:\n            return self._text\n        if not self.content:\n            return u''\n        if isinstance(self.content, six.text_type):\n            return self.content\n\n        content = None\n        encoding = self.encoding\n\n        # Decode unicode from given encoding.\n        try:\n            content = self.content.decode(encoding, 'replace')\n        except LookupError:\n            # A LookupError is raised if the encoding was not found which could\n            # indicate a misspelling or similar mistake.\n            #\n            # So we try blindly encoding.\n            content = self.content.decode('utf-8', 'replace')\n\n        self._text = content\n        return content\n\n    @property\n    def json(self):\n        \"\"\"Returns the json-encoded content of the response, if any.\"\"\"\n        if hasattr(self, '_json'):\n            return self._json\n        try:\n 
           self._json = json.loads(self.text or self.content)\n        except ValueError:\n            self._json = None\n        return self._json\n\n    @property\n    def doc(self):\n        \"\"\"Returns a PyQuery object of the response's content\"\"\"\n        if hasattr(self, '_doc'):\n            return self._doc\n        elements = self.etree\n        doc = self._doc = PyQuery(elements)\n        doc.make_links_absolute(utils.text(self.url))\n        return doc\n\n    @property\n    def etree(self):\n        \"\"\"Returns a lxml object of the response's content that can be selected by xpath\"\"\"\n        if not hasattr(self, '_elements'):\n            try:\n                parser = lxml.html.HTMLParser(encoding=self.encoding)\n                self._elements = lxml.html.fromstring(self.content, parser=parser)\n            except LookupError:\n                # lxml would raise LookupError when encoding not supported\n                # try fromstring without encoding instead.\n                # on windows, unicode is not availabe as encoding for lxml\n                self._elements = lxml.html.fromstring(self.content)\n        if isinstance(self._elements, lxml.etree._ElementTree):\n            self._elements = self._elements.getroot()\n        return self._elements\n\n    def raise_for_status(self, allow_redirects=True):\n        \"\"\"Raises stored :class:`HTTPError` or :class:`URLError`, if one occurred.\"\"\"\n\n        if self.status_code == 304:\n            return\n        elif self.error:\n            if self.traceback:\n                six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback())\n            http_error = HTTPError(self.error)\n        elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:\n            http_error = HTTPError('%s Redirection' % (self.status_code))\n        elif (self.status_code >= 400) and (self.status_code < 500):\n            http_error = 
HTTPError('%s Client Error' % (self.status_code))\n        elif (self.status_code >= 500) and (self.status_code < 600):\n            http_error = HTTPError('%s Server Error' % (self.status_code))\n        else:\n            return\n\n        http_error.response = self\n        raise http_error\n\n    def isok(self):\n        try:\n            self.raise_for_status()\n            return True\n        except:\n            return False\n\n\ndef rebuild_response(r):\n    response = Response(\n        status_code=r.get('status_code', 599),\n        url=r.get('url', ''),\n        headers=CaseInsensitiveDict(r.get('headers', {})),\n        content=r.get('content', ''),\n        cookies=r.get('cookies', {}),\n        error=r.get('error'),\n        traceback=r.get('traceback'),\n        time=r.get('time', 0),\n        orig_url=r.get('orig_url', r.get('url', '')),\n        js_script_result=r.get('js_script_result'),\n        save=r.get('save'),\n    )\n    return response\n\n\ndef get_encoding(headers, content):\n    \"\"\"Get encoding from request headers or page head.\"\"\"\n    encoding = None\n\n    content_type = headers.get('content-type')\n    if content_type:\n        _, params = cgi.parse_header(content_type)\n        if 'charset' in params:\n            encoding = params['charset'].strip(\"'\\\"\")\n\n    if not encoding:\n        content = utils.pretty_unicode(content[:1000]) if six.PY3 else content\n\n        charset_re = re.compile(r'<meta.*?charset=[\"\\']*(.+?)[\"\\'>]',\n                                flags=re.I)\n        pragma_re = re.compile(r'<meta.*?content=[\"\\']*;?charset=(.+?)[\"\\'>]',\n                               flags=re.I)\n        xml_re = re.compile(r'^<\\?xml.*?encoding=[\"\\']*(.+?)[\"\\'>]')\n        encoding = (charset_re.findall(content) +\n                    pragma_re.findall(content) +\n                    xml_re.findall(content))\n        encoding = encoding and encoding[0] or None\n\n    return encoding\n"
  },
  {
    "path": "pyspider/libs/result_dump.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-03-27 20:12:11\n\nimport six\nimport csv\nimport json\nimport itertools\nfrom io import StringIO, BytesIO\nfrom six import iteritems\n\n\ndef result_formater(results):\n    common_fields = None\n    for result in results:\n        result.setdefault('result', None)\n        if isinstance(result['result'], dict):\n            if common_fields is None:\n                common_fields = set(result['result'].keys())\n            else:\n                common_fields &= set(result['result'].keys())\n        else:\n            common_fields = set()\n    for result in results:\n        result['result_formated'] = {}\n        if not common_fields:\n            result['others'] = result['result']\n        elif not isinstance(result['result'], dict):\n            result['others'] = result['result']\n        else:\n            result_formated = {}\n            others = {}\n            for key, value in iteritems(result['result']):\n                if key in common_fields:\n                    result_formated[key] = value\n                else:\n                    others[key] = value\n            result['result_formated'] = result_formated\n            result['others'] = others\n    return common_fields or set(), results\n\n\ndef dump_as_json(results, valid=False):\n    first = True\n    if valid:\n        yield '['\n\n    for result in results:\n        if valid:\n            if first:\n                first = False\n            else:\n                yield ', '\n\n        yield json.dumps(result, ensure_ascii=False) + '\\n'\n\n    if valid:\n        yield ']'\n\n\ndef dump_as_txt(results):\n    for result in results:\n        yield (\n            result.get('url', None) + '\\t' +\n            json.dumps(result.get('result', None), ensure_ascii=False) + '\\n'\n        )\n\n\ndef 
dump_as_csv(results):\n    def toString(obj):\n        if isinstance(obj, six.binary_type):\n            if six.PY2:\n                return obj\n            else:\n                return obj.decode('utf8')\n        elif isinstance(obj, six.text_type):\n            if six.PY2:\n                return obj.encode('utf8')\n            else:\n                return obj\n        else:\n            if six.PY2:\n                return json.dumps(obj, ensure_ascii=False).encode('utf8')\n            else:\n                return json.dumps(obj, ensure_ascii=False)\n\n    # python2 needs byes when python3 needs unicode\n    if six.PY2:\n        stringio = BytesIO()\n    else:\n        stringio = StringIO()\n    csv_writer = csv.writer(stringio)\n\n    it = iter(results)\n    first_30 = []\n    for result in it:\n        first_30.append(result)\n        if len(first_30) >= 30:\n            break\n    common_fields, _ = result_formater(first_30)\n    common_fields_l = sorted(common_fields)\n\n    csv_writer.writerow([toString('url')]\n                        + [toString(x) for x in common_fields_l]\n                        + [toString('...')])\n    for result in itertools.chain(first_30, it):\n        result['result_formated'] = {}\n        if not common_fields:\n            result['others'] = result['result']\n        elif not isinstance(result['result'], dict):\n            result['others'] = result['result']\n        else:\n            result_formated = {}\n            others = {}\n            for key, value in iteritems(result['result']):\n                if key in common_fields:\n                    result_formated[key] = value\n                else:\n                    others[key] = value\n            result['result_formated'] = result_formated\n            result['others'] = others\n        csv_writer.writerow(\n            [toString(result['url'])]\n            + [toString(result['result_formated'].get(k, '')) for k in common_fields_l]\n            + 
[toString(result['others'])]\n        )\n        yield stringio.getvalue()\n        stringio.truncate(0)\n        stringio.seek(0)\n"
  },
  {
    "path": "pyspider/libs/sample_handler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# Created on __DATE__\n# Project: __PROJECT_NAME__\n\nfrom pyspider.libs.base_handler import *\n\n\nclass Handler(BaseHandler):\n    crawl_config = {\n    }\n\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl('__START_URL__', callback=self.index_page)\n\n    @config(age=10 * 24 * 60 * 60)\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            self.crawl(each.attr.href, callback=self.detail_page)\n\n    @config(priority=2)\n    def detail_page(self, response):\n        return {\n            \"url\": response.url,\n            \"title\": response.doc('title').text(),\n        }\n"
  },
  {
    "path": "pyspider/libs/url.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-11-09 14:39:57\n\nimport mimetypes\n\nimport six\nimport shlex\nfrom six.moves.urllib.parse import urlparse, urlunparse\nfrom requests.models import RequestEncodingMixin\n\n\ndef get_content_type(filename):\n    \"\"\"Guessing file type by filename\"\"\"\n    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'\n\n\n_encode_params = RequestEncodingMixin._encode_params\n\n\ndef _encode_multipart_formdata(fields, files):\n    body, content_type = RequestEncodingMixin._encode_files(files, fields)\n    return content_type, body\n\n\ndef _build_url(url, _params):\n    \"\"\"Build the actual URL to use.\"\"\"\n\n    # Support for unicode domain names and paths.\n    scheme, netloc, path, params, query, fragment = urlparse(url)\n    netloc = netloc.encode('idna').decode('utf-8')\n    if not path:\n        path = '/'\n\n    if six.PY2:\n        if isinstance(scheme, six.text_type):\n            scheme = scheme.encode('utf-8')\n        if isinstance(netloc, six.text_type):\n            netloc = netloc.encode('utf-8')\n        if isinstance(path, six.text_type):\n            path = path.encode('utf-8')\n        if isinstance(params, six.text_type):\n            params = params.encode('utf-8')\n        if isinstance(query, six.text_type):\n            query = query.encode('utf-8')\n        if isinstance(fragment, six.text_type):\n            fragment = fragment.encode('utf-8')\n\n    enc_params = _encode_params(_params)\n    if enc_params:\n        if query:\n            query = '%s&%s' % (query, enc_params)\n        else:\n            query = enc_params\n    url = (urlunparse([scheme, netloc, path, params, query, fragment]))\n    return url\n\n\ndef quote_chinese(url, encodeing=\"utf-8\"):\n    \"\"\"Quote non-ascii characters\"\"\"\n    if isinstance(url, 
six.text_type):\n        return quote_chinese(url.encode(encodeing))\n    if six.PY3:\n        res = [six.int2byte(b).decode('latin-1') if b < 128 else '%%%02X' % b for b in url]\n    else:\n        res = [b if ord(b) < 128 else '%%%02X' % ord(b) for b in url]\n    return \"\".join(res)\n\n\ndef curl_to_arguments(curl):\n    kwargs = {}\n    headers = {}\n    command = None\n    urls = []\n    current_opt = None\n\n    for part in shlex.split(curl):\n        if command is None:\n            # curl\n            command = part\n        elif not part.startswith('-') and not current_opt:\n            # waiting for url\n            urls.append(part)\n        elif current_opt is None and part.startswith('-'):\n            # flags\n            if part == '--compressed':\n                kwargs['use_gzip'] = True\n            else:\n                current_opt = part\n        else:\n            # option\n            if current_opt is None:\n                raise TypeError('Unknow curl argument: %s' % part)\n            elif current_opt in ('-H', '--header'):\n                key_value = part.split(':', 1)\n                if len(key_value) == 2:\n                    key, value = key_value\n                    headers[key.strip()] = value.strip()\n            elif current_opt in ('-d', '--data'):\n                kwargs['data'] = part\n            elif current_opt in ('--data-binary'):\n                if part[0] == '$':\n                    part = part[1:]\n                kwargs['data'] = part\n            elif current_opt in ('-X', '--request'):\n                kwargs['method'] = part\n            else:\n                raise TypeError('Unknow curl option: %s' % current_opt)\n            current_opt = None\n\n    if not urls:\n        raise TypeError('curl: no URL specified!')\n    if current_opt:\n        raise TypeError('Unknow curl option: %s' % current_opt)\n\n    kwargs['urls'] = urls\n    if headers:\n        kwargs['headers'] = headers\n\n    return kwargs\n"
  },
  {
    "path": "pyspider/libs/utils.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2012-11-06 11:50:13\n\nimport math\nimport logging\nimport hashlib\nimport datetime\nimport socket\nimport base64\nimport warnings\nimport threading\n\nimport six\nfrom six import iteritems\n\nmd5string = lambda x: hashlib.md5(utf8(x)).hexdigest()\n\n\nclass ReadOnlyDict(dict):\n    \"\"\"A Read Only Dict\"\"\"\n\n    def __setitem__(self, key, value):\n        raise Exception(\"dict is read-only\")\n\n\ndef getitem(obj, key=0, default=None):\n    \"\"\"Get first element of list or return default\"\"\"\n    try:\n        return obj[key]\n    except:\n        return default\n\n\ndef hide_me(tb, g=globals()):\n    \"\"\"Hide stack traceback of given stack\"\"\"\n    base_tb = tb\n    try:\n        while tb and tb.tb_frame.f_globals is not g:\n            tb = tb.tb_next\n        while tb and tb.tb_frame.f_globals is g:\n            tb = tb.tb_next\n    except Exception as e:\n        logging.exception(e)\n        tb = base_tb\n    if not tb:\n        tb = base_tb\n    return tb\n\n\ndef run_in_thread(func, *args, **kwargs):\n    \"\"\"Run function in thread, return a Thread object\"\"\"\n    from threading import Thread\n    thread = Thread(target=func, args=args, kwargs=kwargs)\n    thread.daemon = True\n    thread.start()\n    return thread\n\n\ndef run_in_subprocess(func, *args, **kwargs):\n    \"\"\"Run function in subprocess, return a Process object\"\"\"\n    from multiprocessing import Process\n    thread = Process(target=func, args=args, kwargs=kwargs)\n    thread.daemon = True\n    thread.start()\n    return thread\n\n\ndef format_date(date, gmt_offset=0, relative=True, shorter=False, full_format=False):\n    \"\"\"Formats the given date (which should be GMT).\n\n    By default, we return a relative time (e.g., \"2 minutes ago\"). 
You\n    can return an absolute date string with ``relative=False``.\n\n    You can force a full format date (\"July 10, 1980\") with\n    ``full_format=True``.\n\n    This method is primarily intended for dates in the past.\n    For dates in the future, we fall back to full format.\n\n    From tornado\n    \"\"\"\n\n    if not date:\n        return '-'\n    if isinstance(date, float) or isinstance(date, int):\n        date = datetime.datetime.utcfromtimestamp(date)\n    now = datetime.datetime.utcnow()\n    if date > now:\n        if relative and (date - now).seconds < 60:\n            # Due to click skew, things are some things slightly\n            # in the future. Round timestamps in the immediate\n            # future down to now in relative mode.\n            date = now\n        else:\n            # Otherwise, future dates always use the full format.\n            full_format = True\n    local_date = date - datetime.timedelta(minutes=gmt_offset)\n    local_now = now - datetime.timedelta(minutes=gmt_offset)\n    local_yesterday = local_now - datetime.timedelta(hours=24)\n    difference = now - date\n    seconds = difference.seconds\n    days = difference.days\n\n    format = None\n    if not full_format:\n        ret_, fff_format = fix_full_format(days, seconds, relative, shorter, local_date, local_yesterday)\n        format = fff_format\n        if ret_:\n            return format\n        else:\n            format = format\n\n    if format is None:\n        format = \"%(month_name)s %(day)s, %(year)s\" if shorter else \\\n            \"%(month_name)s %(day)s, %(year)s at %(time)s\"\n\n    str_time = \"%d:%02d\" % (local_date.hour, local_date.minute)\n\n    return format % {\n        \"month_name\": local_date.strftime('%b'),\n        \"weekday\": local_date.strftime('%A'),\n        \"day\": str(local_date.day),\n        \"year\": str(local_date.year),\n        \"month\": local_date.month,\n        \"time\": str_time\n    }\n\n\ndef fix_full_format(days, 
seconds, relative, shorter, local_date, local_yesterday):\n    if relative and days == 0:\n        if seconds < 50:\n            return True, ((\"1 second ago\" if seconds <= 1 else\n                    \"%(seconds)d seconds ago\") % {\"seconds\": seconds})\n\n        if seconds < 50 * 60:\n            minutes = round(seconds / 60.0)\n            return True, ((\"1 minute ago\" if minutes <= 1 else\n                    \"%(minutes)d minutes ago\") % {\"minutes\": minutes})\n\n        hours = round(seconds / (60.0 * 60))\n        return True, ((\"1 hour ago\" if hours <= 1 else\n                \"%(hours)d hours ago\") % {\"hours\": hours})\n    format = None\n    if days == 0:\n        format = \"%(time)s\"\n    elif days == 1 and local_date.day == local_yesterday.day and \\\n            relative:\n        format = \"yesterday\" if shorter else \"yesterday at %(time)s\"\n    elif days < 5:\n        format = \"%(weekday)s\" if shorter else \"%(weekday)s at %(time)s\"\n    elif days < 334:  # 11mo, since confusing for same month last year\n        format = \"%(month)s-%(day)s\" if shorter else \\\n            \"%(month)s-%(day)s at %(time)s\"\n    return False, format\n\nclass TimeoutError(Exception):\n    pass\n\ntry:\n    import signal\n    if not hasattr(signal, 'SIGALRM'):\n        raise ImportError('signal')\n\n    class timeout:\n        \"\"\"\n        Time limit of command\n\n        with timeout(3):\n            time.sleep(10)\n        \"\"\"\n\n        def __init__(self, seconds=1, error_message='Timeout'):\n            self.seconds = seconds\n            self.error_message = error_message\n\n        def handle_timeout(self, signum, frame):\n            raise TimeoutError(self.error_message)\n\n        def __enter__(self):\n            if not isinstance(threading.current_thread(), threading._MainThread):\n                logging.warning(\"timeout only works on main thread, are you running pyspider in threads?\")\n                self.seconds = 0\n           
 if self.seconds:\n                signal.signal(signal.SIGALRM, self.handle_timeout)\n                signal.alarm(int(math.ceil(self.seconds)))\n\n        def __exit__(self, type, value, traceback):\n            if self.seconds:\n                signal.alarm(0)\n\nexcept ImportError as e:\n    warnings.warn(\"timeout is not supported on your platform.\", FutureWarning)\n\n    class timeout:\n        \"\"\"\n        Time limit of command (for windows)\n        \"\"\"\n\n        def __init__(self, seconds=1, error_message='Timeout'):\n            pass\n\n        def __enter__(self):\n            pass\n\n        def __exit__(self, type, value, traceback):\n            pass\n\n\ndef utf8(string):\n    \"\"\"\n    Make sure string is utf8 encoded bytes.\n\n    If parameter is a object, object.__str__ will been called before encode as bytes\n    \"\"\"\n    if isinstance(string, six.text_type):\n        return string.encode('utf8')\n    elif isinstance(string, six.binary_type):\n        return string\n    else:\n        return six.text_type(string).encode('utf8')\n\n\ndef text(string, encoding='utf8'):\n    \"\"\"\n    Make sure string is unicode type, decode with given encoding if it's not.\n\n    If parameter is a object, object.__str__ will been called\n    \"\"\"\n    if isinstance(string, six.text_type):\n        return string\n    elif isinstance(string, six.binary_type):\n        return string.decode(encoding)\n    else:\n        return six.text_type(string)\n\n\ndef pretty_unicode(string):\n    \"\"\"\n    Make sure string is unicode, try to decode with utf8, or unicode escaped string if failed.\n    \"\"\"\n    if isinstance(string, six.text_type):\n        return string\n    try:\n        return string.decode(\"utf8\")\n    except UnicodeDecodeError:\n        return string.decode('Latin-1').encode('unicode_escape').decode(\"utf8\")\n\n\ndef unicode_string(string):\n    \"\"\"\n    Make sure string is unicode, try to default with utf8, or base64 if failed.\n\n 
   can been decode by `decode_unicode_string`\n    \"\"\"\n    if isinstance(string, six.text_type):\n        return string\n    try:\n        return string.decode(\"utf8\")\n    except UnicodeDecodeError:\n        return '[BASE64-DATA]' + base64.b64encode(string) + '[/BASE64-DATA]'\n\n\ndef unicode_dict(_dict):\n    \"\"\"\n    Make sure keys and values of dict is unicode.\n    \"\"\"\n    r = {}\n    for k, v in iteritems(_dict):\n        r[unicode_obj(k)] = unicode_obj(v)\n    return r\n\n\ndef unicode_list(_list):\n    \"\"\"\n    Make sure every element in list is unicode. bytes will encode in base64\n    \"\"\"\n    return [unicode_obj(x) for x in _list]\n\n\ndef unicode_obj(obj):\n    \"\"\"\n    Make sure keys and values of dict/list/tuple is unicode. bytes will encode in base64.\n\n    Can been decode by `decode_unicode_obj`\n    \"\"\"\n    if isinstance(obj, dict):\n        return unicode_dict(obj)\n    elif isinstance(obj, (list, tuple)):\n        return unicode_list(obj)\n    elif isinstance(obj, six.string_types):\n        return unicode_string(obj)\n    elif isinstance(obj, (int, float)):\n        return obj\n    elif obj is None:\n        return obj\n    else:\n        try:\n            return text(obj)\n        except:\n            return text(repr(obj))\n\n\ndef decode_unicode_string(string):\n    \"\"\"\n    Decode string encoded by `unicode_string`\n    \"\"\"\n    if string.startswith('[BASE64-DATA]') and string.endswith('[/BASE64-DATA]'):\n        return base64.b64decode(string[len('[BASE64-DATA]'):-len('[/BASE64-DATA]')])\n    return string\n\n\ndef decode_unicode_obj(obj):\n    \"\"\"\n    Decode unicoded dict/list/tuple encoded by `unicode_obj`\n    \"\"\"\n    if isinstance(obj, dict):\n        r = {}\n        for k, v in iteritems(obj):\n            r[decode_unicode_string(k)] = decode_unicode_obj(v)\n        return r\n    elif isinstance(obj, six.string_types):\n        return decode_unicode_string(obj)\n    elif isinstance(obj, (list, 
tuple)):\n        return [decode_unicode_obj(x) for x in obj]\n    else:\n        return obj\n\n\nclass Get(object):\n    \"\"\"\n    Lazy value calculate for object\n    \"\"\"\n\n    def __init__(self, getter):\n        self.getter = getter\n\n    def __get__(self, instance, owner):\n        return self.getter()\n\n\nclass ObjectDict(dict):\n    \"\"\"\n    Object like dict, every dict[key] can visite by dict.key\n\n    If dict[key] is `Get`, calculate it's value.\n    \"\"\"\n\n    def __getattr__(self, name):\n        ret = self.__getitem__(name)\n        if hasattr(ret, '__get__'):\n            return ret.__get__(self, ObjectDict)\n        return ret\n\n\ndef load_object(name):\n    \"\"\"Load object from module\"\"\"\n\n    if \".\" not in name:\n        raise Exception('load object need module.object')\n\n    module_name, object_name = name.rsplit('.', 1)\n    if six.PY2:\n        module = __import__(module_name, globals(), locals(), [utf8(object_name)], -1)\n    else:\n        module = __import__(module_name, globals(), locals(), [object_name])\n    return getattr(module, object_name)\n\n\ndef get_python_console(namespace=None):\n    \"\"\"\n    Return a interactive python console instance with caller's stack\n    \"\"\"\n\n    if namespace is None:\n        import inspect\n        frame = inspect.currentframe()\n        caller = frame.f_back\n        if not caller:\n            logging.error(\"can't find caller who start this console.\")\n            caller = frame\n        namespace = dict(caller.f_globals)\n        namespace.update(caller.f_locals)\n\n    try:\n        from IPython.terminal.interactiveshell import TerminalInteractiveShell\n        shell = TerminalInteractiveShell(user_ns=namespace)\n    except ImportError:\n        try:\n            import readline\n            import rlcompleter\n            readline.set_completer(rlcompleter.Completer(namespace).complete)\n            readline.parse_and_bind(\"tab: complete\")\n        except 
ImportError:\n            pass\n        import code\n        shell = code.InteractiveConsole(namespace)\n        shell._quit = False\n\n        def exit():\n            shell._quit = True\n\n        def readfunc(prompt=\"\"):\n            if shell._quit:\n                raise EOFError\n            return six.moves.input(prompt)\n\n        # inject exit method\n        shell.ask_exit = exit\n        shell.raw_input = readfunc\n\n    return shell\n\n\ndef python_console(namespace=None):\n    \"\"\"Start a interactive python console with caller's stack\"\"\"\n\n    if namespace is None:\n        import inspect\n        frame = inspect.currentframe()\n        caller = frame.f_back\n        if not caller:\n            logging.error(\"can't find caller who start this console.\")\n            caller = frame\n        namespace = dict(caller.f_globals)\n        namespace.update(caller.f_locals)\n\n    return get_python_console(namespace=namespace).interact()\n\n\ndef check_port_open(port, addr='127.0.0.1'):\n    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:\n        result = sock.connect_ex((addr, port))\n        if result == 0:\n            return True\n        else:\n            return False\n"
  },
  {
    "path": "pyspider/libs/wsgi_xmlrpc.py",
    "content": "#   Copyright (c) 2006-2007 Open Source Applications Foundation\n#\n#   Licensed under the Apache License, Version 2.0 (the \"License\");\n#   you may not use this file except in compliance with the License.\n#   You may obtain a copy of the License at\n#\n#       http://www.apache.org/licenses/LICENSE-2.0\n#\n#   Unless required by applicable law or agreed to in writing, software\n#   distributed under the License is distributed on an \"AS IS\" BASIS,\n#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#   See the License for the specific language governing permissions and\n#   limitations under the License.\n#\n#   Origin: https://code.google.com/p/wsgi-xmlrpc/\n\n\nfrom six.moves.xmlrpc_server import SimpleXMLRPCDispatcher\nimport logging\n\nlogger = logging.getLogger(__name__)\n\n\nclass WSGIXMLRPCApplication(object):\n    \"\"\"Application to handle requests to the XMLRPC service\"\"\"\n\n    def __init__(self, instance=None, methods=None):\n        \"\"\"Create windmill xmlrpc dispatcher\"\"\"\n        if methods is None:\n            methods = []\n        try:\n            self.dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None)\n        except TypeError:\n            # python 2.4\n            self.dispatcher = SimpleXMLRPCDispatcher()\n        if instance is not None:\n            self.dispatcher.register_instance(instance)\n        for method in methods:\n            self.dispatcher.register_function(method)\n        self.dispatcher.register_introspection_functions()\n\n    def register_instance(self, instance):\n        return self.dispatcher.register_instance(instance)\n\n    def register_function(self, function, name=None):\n        return self.dispatcher.register_function(function, name)\n\n    def handler(self, environ, start_response):\n        \"\"\"XMLRPC service for windmill browser core to communicate with\"\"\"\n\n        if environ['REQUEST_METHOD'] == 'POST':\n            return 
self.handle_POST(environ, start_response)\n        else:\n            start_response(\"400 Bad request\", [('Content-Type', 'text/plain')])\n            return ['']\n\n    def handle_POST(self, environ, start_response):\n        \"\"\"Handles the HTTP POST request.\n\n        Attempts to interpret all HTTP POST requests as XML-RPC calls,\n        which are forwarded to the server's _dispatch method for handling.\n\n        Most code taken from SimpleXMLRPCServer with modifications for wsgi and my custom dispatcher.\n        \"\"\"\n\n        try:\n            # Get arguments by reading body of request.\n            # We read this in chunks to avoid straining\n            # socket.read(); around the 10 or 15Mb mark, some platforms\n            # begin to have problems (bug #792570).\n\n            length = int(environ['CONTENT_LENGTH'])\n            data = environ['wsgi.input'].read(length)\n\n            # In previous versions of SimpleXMLRPCServer, _dispatch\n            # could be overridden in this class, instead of in\n            # SimpleXMLRPCDispatcher. To maintain backwards compatibility,\n            # check to see if a subclass implements _dispatch and\n            # using that method if present.\n            response = self.dispatcher._marshaled_dispatch(\n                data, getattr(self.dispatcher, '_dispatch', None)\n            )\n            response += b'\\n'\n        except Exception as e:  # This should only happen if the module is buggy\n            # internal error, report as HTTP server error\n            logger.exception(e)\n            start_response(\"500 Server error\", [('Content-Type', 'text/plain')])\n            return []\n        else:\n            # got a valid XML RPC response\n            start_response(\"200 OK\", [('Content-Type', 'text/xml'), ('Content-Length', str(len(response)),)])\n            return [response]\n\n    def __call__(self, environ, start_response):\n        return self.handler(environ, start_response)\n"
  },
  {
    "path": "pyspider/logging.conf",
    "content": "[loggers]\nkeys=root,scheduler,fetcher,processor,webui,bench,werkzeug\n\n[logger_root]\nlevel=INFO\nhandlers=screen\n\n[logger_scheduler]\nlevel=INFO\nhandlers=screen\nqualname=scheduler\npropagate=0\n\n[logger_fetcher]\nlevel=DEBUG\nhandlers=screen\nqualname=fetcher\npropagate=0\n\n[logger_processor]\nlevel=DEBUG\nhandlers=screen\nqualname=processor\npropagate=0\n\n[logger_webui]\nlevel=DEBUG\nhandlers=screen\nqualname=webui\npropagate=0\n\n[logger_bench]\nlevel=DEBUG\nhandlers=screen\nqualname=bench\npropagate=0\n\n[logger_werkzeug]\nlevel=INFO\nhandlers=screen\nqualname=werkzeug\npropagate=0\n\n[handlers]\nkeys=screen\n\n[handler_screen]\nclass=logging.StreamHandler\nformatter=pretty\nlevel=DEBUG\nargs=(sys.stderr, )\n\n[formatters]\nkeys=pretty\n\n[formatter_pretty]\nclass=pyspider.libs.log.LogFormatter\n"
  },
  {
    "path": "pyspider/message_queue/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-04-30 21:47:08\n\nimport logging\n\ntry:\n    from urllib import parse as urlparse\nexcept ImportError:\n    import urlparse\n\n\ndef connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):\n    \"\"\"\n    create connection to message queue\n\n    name:\n        name of message queue\n\n    rabbitmq:\n        amqp://username:password@host:5672/%2F\n        see https://www.rabbitmq.com/uri-spec.html\n    redis:\n        redis://host:6379/db\n        redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)\n    kombu:\n        kombu+transport://userid:password@hostname:port/virtual_host\n        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls\n    builtin:\n        None\n    \"\"\"\n\n    if not url:\n        from pyspider.libs.multiprocessing_queue import Queue\n        return Queue(maxsize=maxsize)\n\n    parsed = urlparse.urlparse(url)\n    if parsed.scheme == 'amqp':\n        from .rabbitmq import Queue\n        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)\n    elif parsed.scheme == 'redis':\n        from .redis_queue import Queue\n        if ',' in parsed.netloc:\n            \"\"\"\n            redis in cluster mode (there is no concept of 'db' in cluster mode)\n            ex. 
redis://host1:port1,host2:port2,...,hostn:portn\n            \"\"\"\n            cluster_nodes = []\n            for netloc in parsed.netloc.split(','):\n                cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])})\n\n            return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes)\n\n        else:\n            db = parsed.path.lstrip('/').split('/')\n            try:\n                db = int(db[0])\n            except:\n                logging.warning('redis DB must zero-based numeric index, using 0 instead')\n                db = 0\n\n            password = parsed.password or None\n\n            return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit)\n    elif url.startswith('kombu+'):\n        url = url[len('kombu+'):]\n        from .kombu_queue import Queue\n        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)\n    else:\n        raise Exception('unknown connection url: %s', url)\n"
  },
  {
    "path": "pyspider/message_queue/kombu_queue.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-05-22 20:54:01\n\nimport time\nimport umsgpack\nfrom kombu import Connection, enable_insecure_serializers\nfrom kombu.serialization import register\nfrom kombu.exceptions import ChannelError\nfrom six.moves import queue as BaseQueue\n\n\nregister('umsgpack', umsgpack.packb, umsgpack.unpackb, 'application/x-msgpack')\nenable_insecure_serializers(['umsgpack'])\n\n\nclass KombuQueue(object):\n    \"\"\"\n    kombu is a high-level interface for multiple message queue backends.\n\n    KombuQueue is built on top of kombu API.\n    \"\"\"\n\n    Empty = BaseQueue.Empty\n    Full = BaseQueue.Full\n    max_timeout = 0.3\n\n    def __init__(self, name, url=\"amqp://\", maxsize=0, lazy_limit=True):\n        \"\"\"\n        Constructor for KombuQueue\n\n        url:        http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls\n        maxsize:    an integer that sets the upperbound limit on the number of\n                    items that can be placed in the queue.\n        \"\"\"\n        self.name = name\n        self.conn = Connection(url)\n        self.queue = self.conn.SimpleQueue(self.name, no_ack=True, serializer='umsgpack')\n\n        self.maxsize = maxsize\n        self.lazy_limit = lazy_limit\n        if self.lazy_limit and self.maxsize:\n            self.qsize_diff_limit = int(self.maxsize * 0.1)\n        else:\n            self.qsize_diff_limit = 0\n        self.qsize_diff = 0\n\n    def qsize(self):\n        try:\n            return self.queue.qsize()\n        except ChannelError:\n            return 0\n\n    def empty(self):\n        if self.qsize() == 0:\n            return True\n        else:\n            return False\n\n    def full(self):\n        if self.maxsize and self.qsize() >= self.maxsize:\n            return True\n        else:\n            return 
False\n\n    def put(self, obj, block=True, timeout=None):\n        if not block:\n            return self.put_nowait(obj)\n\n        start_time = time.time()\n        while True:\n            try:\n                return self.put_nowait(obj)\n            except BaseQueue.Full:\n                if timeout:\n                    lasted = time.time() - start_time\n                    if timeout > lasted:\n                        time.sleep(min(self.max_timeout, timeout - lasted))\n                    else:\n                        raise\n                else:\n                    time.sleep(self.max_timeout)\n\n    def put_nowait(self, obj):\n        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:\n            pass\n        elif self.full():\n            raise BaseQueue.Full\n        else:\n            self.qsize_diff = 0\n        return self.queue.put(obj)\n\n    def get(self, block=True, timeout=None):\n        try:\n            ret = self.queue.get(block, timeout)\n            return ret.payload\n        except self.queue.Empty:\n            raise BaseQueue.Empty\n\n    def get_nowait(self):\n        try:\n            ret = self.queue.get_nowait()\n            return ret.payload\n        except self.queue.Empty:\n            raise BaseQueue.Empty\n\n    def delete(self):\n        self.queue.queue.delete()\n\n    def __del__(self):\n        self.queue.close()\n\n\nQueue = KombuQueue\n"
  },
  {
    "path": "pyspider/message_queue/rabbitmq.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<17175297.hk@gmail.com>\n#         http://binux.me\n# Created on 2012-11-15 17:27:54\n\nimport time\nimport socket\nimport select\nimport logging\nimport umsgpack\nimport threading\n\nimport amqp\nfrom six.moves.urllib.parse import unquote\ntry:\n    from urllib import parse as urlparse\nexcept ImportError:\n    import urlparse\nfrom six.moves import queue as BaseQueue\n\n\ndef catch_error(func):\n    \"\"\"Catch errors of rabbitmq then reconnect\"\"\"\n    import amqp\n    try:\n        import pika.exceptions\n        connect_exceptions = (\n            pika.exceptions.ConnectionClosed,\n            pika.exceptions.AMQPConnectionError,\n        )\n    except ImportError:\n        connect_exceptions = ()\n\n    connect_exceptions += (\n        select.error,\n        socket.error,\n        amqp.ConnectionError\n    )\n\n    def wrap(self, *args, **kwargs):\n        try:\n            return func(self, *args, **kwargs)\n        except connect_exceptions as e:\n            logging.error('RabbitMQ error: %r, reconnect.', e)\n            self.reconnect()\n            return func(self, *args, **kwargs)\n    return wrap\n\n\nclass PikaQueue(object):\n    \"\"\"\n    A Queue like rabbitmq connector\n    \"\"\"\n\n    Empty = BaseQueue.Empty\n    Full = BaseQueue.Full\n    max_timeout = 0.3\n\n    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',\n                 maxsize=0, lazy_limit=True):\n        \"\"\"\n        Constructor for a PikaQueue.\n\n        Not works with python 3. 
Default for python 2.\n\n        amqp_url:   https://www.rabbitmq.com/uri-spec.html\n        maxsize:    an integer that sets the upperbound limit on the number of\n                    items that can be placed in the queue.\n        lazy_limit: as rabbitmq is shared between multipul instance, for a strict\n                    limit on the number of items in the queue. PikaQueue have to\n                    update current queue size before every put operation. When\n                    `lazy_limit` is enabled, PikaQueue will check queue size every\n                    max_size / 10 put operation for better performace.\n        \"\"\"\n        self.name = name\n        self.amqp_url = amqp_url\n        self.maxsize = maxsize\n        self.lock = threading.RLock()\n\n        self.lazy_limit = lazy_limit\n        if self.lazy_limit and self.maxsize:\n            self.qsize_diff_limit = int(self.maxsize * 0.1)\n        else:\n            self.qsize_diff_limit = 0\n        self.qsize_diff = 0\n\n        self.reconnect()\n\n    def reconnect(self):\n        \"\"\"Reconnect to rabbitmq server\"\"\"\n        import pika\n        import pika.exceptions\n\n        self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))\n        self.channel = self.connection.channel()\n        try:\n            self.channel.queue_declare(self.name)\n        except pika.exceptions.ChannelClosed:\n            self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))\n            self.channel = self.connection.channel()\n        #self.channel.queue_purge(self.name)\n\n    @catch_error\n    def qsize(self):\n        with self.lock:\n            ret = self.channel.queue_declare(self.name, passive=True)\n        return ret.method.message_count\n\n    def empty(self):\n        if self.qsize() == 0:\n            return True\n        else:\n            return False\n\n    def full(self):\n        if self.maxsize and self.qsize() >= self.maxsize:\n            
return True\n        else:\n            return False\n\n    @catch_error\n    def put(self, obj, block=True, timeout=None):\n        if not block:\n            return self.put_nowait()\n\n        start_time = time.time()\n        while True:\n            try:\n                return self.put_nowait(obj)\n            except BaseQueue.Full:\n                if timeout:\n                    lasted = time.time() - start_time\n                    if timeout > lasted:\n                        time.sleep(min(self.max_timeout, timeout - lasted))\n                    else:\n                        raise\n                else:\n                    time.sleep(self.max_timeout)\n\n    @catch_error\n    def put_nowait(self, obj):\n        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:\n            pass\n        elif self.full():\n            raise BaseQueue.Full\n        else:\n            self.qsize_diff = 0\n        with self.lock:\n            self.qsize_diff += 1\n            return self.channel.basic_publish(\"\", self.name, umsgpack.packb(obj))\n\n    @catch_error\n    def get(self, block=True, timeout=None, ack=False):\n        if not block:\n            return self.get_nowait()\n\n        start_time = time.time()\n        while True:\n            try:\n                return self.get_nowait(ack)\n            except BaseQueue.Empty:\n                if timeout:\n                    lasted = time.time() - start_time\n                    if timeout > lasted:\n                        time.sleep(min(self.max_timeout, timeout - lasted))\n                    else:\n                        raise\n                else:\n                    time.sleep(self.max_timeout)\n\n    @catch_error\n    def get_nowait(self, ack=False):\n        with self.lock:\n            method_frame, header_frame, body = self.channel.basic_get(self.name, not ack)\n            if method_frame is None:\n                raise BaseQueue.Empty\n            if ack:\n                
self.channel.basic_ack(method_frame.delivery_tag)\n        return umsgpack.unpackb(body)\n\n    @catch_error\n    def delete(self):\n        with self.lock:\n            return self.channel.queue_delete(queue=self.name)\n\n\nclass AmqpQueue(PikaQueue):\n    Empty = BaseQueue.Empty\n    Full = BaseQueue.Full\n    max_timeout = 0.3\n\n    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',\n                 maxsize=0, lazy_limit=True):\n        \"\"\"\n        Constructor for a AmqpQueue.\n\n        Default for python 3.\n\n        amqp_url:   https://www.rabbitmq.com/uri-spec.html\n        maxsize:    an integer that sets the upperbound limit on the number of\n                    items that can be placed in the queue.\n        lazy_limit: as rabbitmq is shared between multipul instance, for a strict\n                    limit on the number of items in the queue. PikaQueue have to\n                    update current queue size before every put operation. When\n                    `lazy_limit` is enabled, PikaQueue will check queue size every\n                    max_size / 10 put operation for better performace.\n        \"\"\"\n        self.name = name\n        self.amqp_url = amqp_url\n        self.maxsize = maxsize\n        self.lock = threading.RLock()\n\n        self.lazy_limit = lazy_limit\n        if self.lazy_limit and self.maxsize:\n            self.qsize_diff_limit = int(self.maxsize * 0.1)\n        else:\n            self.qsize_diff_limit = 0\n        self.qsize_diff = 0\n\n        self.reconnect()\n\n    def reconnect(self):\n        \"\"\"Reconnect to rabbitmq server\"\"\"\n        parsed = urlparse.urlparse(self.amqp_url)\n        port = parsed.port or 5672\n        self.connection = amqp.Connection(host=\"%s:%s\" % (parsed.hostname, port),\n                                          userid=parsed.username or 'guest',\n                                          password=parsed.password or 'guest',\n                                   
       virtual_host=unquote(\n                                              parsed.path.lstrip('/') or '%2F')).connect()\n        self.channel = self.connection.channel()\n        try:\n            self.channel.queue_declare(self.name)\n        except amqp.exceptions.PreconditionFailed:\n            pass\n        #self.channel.queue_purge(self.name)\n\n    @catch_error\n    def qsize(self):\n        with self.lock:\n            name, message_count, consumer_count = self.channel.queue_declare(\n                self.name, passive=True)\n        return message_count\n\n    @catch_error\n    def put_nowait(self, obj):\n        if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit:\n            pass\n        elif self.full():\n            raise BaseQueue.Full\n        else:\n            self.qsize_diff = 0\n        with self.lock:\n            self.qsize_diff += 1\n            msg = amqp.Message(umsgpack.packb(obj))\n            return self.channel.basic_publish(msg, exchange=\"\", routing_key=self.name)\n\n    @catch_error\n    def get_nowait(self, ack=False):\n        with self.lock:\n            message = self.channel.basic_get(self.name, not ack)\n            if message is None:\n                raise BaseQueue.Empty\n            if ack:\n                self.channel.basic_ack(message.delivery_tag)\n        return umsgpack.unpackb(message.body)\n\nQueue = PikaQueue\n"
  },
  {
    "path": "pyspider/message_queue/redis_queue.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-04-27 22:48:04\n\nimport time\nimport redis\nimport umsgpack\nfrom six.moves import queue as BaseQueue\n\n\nclass RedisQueue(object):\n    \"\"\"\n    A Queue like message built over redis\n    \"\"\"\n\n    Empty = BaseQueue.Empty\n    Full = BaseQueue.Full\n    max_timeout = 0.3\n\n    def __init__(self, name, host='localhost', port=6379, db=0,\n                 maxsize=0, lazy_limit=True, password=None, cluster_nodes=None):\n        \"\"\"\n        Constructor for RedisQueue\n\n        maxsize:    an integer that sets the upperbound limit on the number of\n                    items that can be placed in the queue.\n        lazy_limit: redis queue is shared via instance, a lazy size limit is used\n                    for better performance.\n        \"\"\"\n        self.name = name\n        if(cluster_nodes is not None):\n            from rediscluster import StrictRedisCluster\n            self.redis = StrictRedisCluster(startup_nodes=cluster_nodes)\n        else:\n            self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password)\n        self.maxsize = maxsize\n        self.lazy_limit = lazy_limit\n        self.last_qsize = 0\n\n    def qsize(self):\n        self.last_qsize = self.redis.llen(self.name)\n        return self.last_qsize\n\n    def empty(self):\n        if self.qsize() == 0:\n            return True\n        else:\n            return False\n\n    def full(self):\n        if self.maxsize and self.qsize() >= self.maxsize:\n            return True\n        else:\n            return False\n\n    def put_nowait(self, obj):\n        if self.lazy_limit and self.last_qsize < self.maxsize:\n            pass\n        elif self.full():\n            raise self.Full\n        self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj))\n        
return True\n\n    def put(self, obj, block=True, timeout=None):\n        if not block:\n            return self.put_nowait(obj)\n\n        start_time = time.time()\n        while True:\n            try:\n                return self.put_nowait(obj)\n            except self.Full:\n                if timeout:\n                    lasted = time.time() - start_time\n                    if timeout > lasted:\n                        time.sleep(min(self.max_timeout, timeout - lasted))\n                    else:\n                        raise\n                else:\n                    time.sleep(self.max_timeout)\n\n    def get_nowait(self):\n        ret = self.redis.lpop(self.name)\n        if ret is None:\n            raise self.Empty\n        return umsgpack.unpackb(ret)\n\n    def get(self, block=True, timeout=None):\n        if not block:\n            return self.get_nowait()\n\n        start_time = time.time()\n        while True:\n            try:\n                return self.get_nowait()\n            except self.Empty:\n                if timeout:\n                    lasted = time.time() - start_time\n                    if timeout > lasted:\n                        time.sleep(min(self.max_timeout, timeout - lasted))\n                    else:\n                        raise\n                else:\n                    time.sleep(self.max_timeout)\n\nQueue = RedisQueue\n"
  },
  {
    "path": "pyspider/processor/__init__.py",
    "content": "from .processor import ProcessorResult, Processor\n"
  },
  {
    "path": "pyspider/processor/processor.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-16 22:59:56\n\nimport sys\nimport six\nimport time\nimport logging\nimport traceback\nlogger = logging.getLogger(\"processor\")\n\nfrom six.moves import queue as Queue\nfrom pyspider.libs import utils\nfrom pyspider.libs.log import LogFormatter\nfrom pyspider.libs.utils import pretty_unicode, hide_me\nfrom pyspider.libs.response import rebuild_response\nfrom .project_module import ProjectManager, ProjectFinder\n\n\nclass ProcessorResult(object):\n    \"\"\"The result and logs producted by a callback\"\"\"\n\n    def __init__(self, result=None, follows=(), messages=(),\n                 logs=(), exception=None, extinfo=None, save=None):\n        if extinfo is None:\n            extinfo = {}\n        self.result = result\n        self.follows = follows\n        self.messages = messages\n        self.logs = logs\n        self.exception = exception\n        self.extinfo = extinfo\n        self.save = save\n\n    def rethrow(self):\n        \"\"\"rethrow the exception\"\"\"\n\n        if self.exception:\n            raise self.exception\n\n    def logstr(self):\n        \"\"\"handler the log records to formatted string\"\"\"\n\n        result = []\n        formater = LogFormatter(color=False)\n        for record in self.logs:\n            if isinstance(record, six.string_types):\n                result.append(pretty_unicode(record))\n            else:\n                if record.exc_info:\n                    a, b, tb = record.exc_info\n                    tb = hide_me(tb, globals())\n                    record.exc_info = a, b, tb\n                result.append(pretty_unicode(formater.format(record)))\n                result.append(u'\\n')\n        return u''.join(result)\n\n\nclass Processor(object):\n    PROCESS_TIME_LIMIT = 30\n    EXCEPTION_LIMIT = 3\n\n    RESULT_LOGS_LIMIT = 
1000\n    RESULT_RESULT_LIMIT = 10\n\n    def __init__(self, projectdb, inqueue, status_queue, newtask_queue, result_queue,\n                 enable_stdout_capture=True,\n                 enable_projects_import=True,\n                 process_time_limit=PROCESS_TIME_LIMIT):\n        self.inqueue = inqueue\n        self.status_queue = status_queue\n        self.newtask_queue = newtask_queue\n        self.result_queue = result_queue\n        self.projectdb = projectdb\n        self.enable_stdout_capture = enable_stdout_capture\n\n        self._quit = False\n        self._exceptions = 10\n        self.project_manager = ProjectManager(projectdb, dict(\n            result_queue=self.result_queue,\n            enable_stdout_capture=self.enable_stdout_capture,\n            process_time_limit=process_time_limit,\n        ))\n\n        if enable_projects_import:\n            self.enable_projects_import()\n\n    def enable_projects_import(self):\n        '''\n        Enable import other project as module\n\n        `from project import project_name`\n        '''\n        sys.meta_path.append(ProjectFinder(self.projectdb))\n\n    def __del__(self):\n        pass\n\n    def on_task(self, task, response):\n        '''Deal one task'''\n        start_time = time.time()\n        response = rebuild_response(response)\n\n        try:\n            assert 'taskid' in task, 'need taskid in task'\n            project = task['project']\n            updatetime = task.get('project_updatetime', None)\n            md5sum = task.get('project_md5sum', None)\n            project_data = self.project_manager.get(project, updatetime, md5sum)\n            assert project_data, \"no such project!\"\n            if project_data.get('exception'):\n                ret = ProcessorResult(logs=(project_data.get('exception_log'), ),\n                                      exception=project_data['exception'])\n            else:\n                ret = project_data['instance'].run_task(\n                    
project_data['module'], task, response)\n        except Exception as e:\n            logstr = traceback.format_exc()\n            ret = ProcessorResult(logs=(logstr, ), exception=e)\n        process_time = time.time() - start_time\n\n        if not ret.extinfo.get('not_send_status', False):\n            if ret.exception:\n                track_headers = dict(response.headers)\n            else:\n                track_headers = {}\n                for name in ('etag', 'last-modified'):\n                    if name not in response.headers:\n                        continue\n                    track_headers[name] = response.headers[name]\n\n            status_pack = {\n                'taskid': task['taskid'],\n                'project': task['project'],\n                'url': task.get('url'),\n                'track': {\n                    'fetch': {\n                        'ok': response.isok(),\n                        'redirect_url': response.url if response.url != response.orig_url else None,\n                        'time': response.time,\n                        'error': response.error,\n                        'status_code': response.status_code,\n                        'encoding': getattr(response, '_encoding', None),\n                        'headers': track_headers,\n                        'content': response.text[:500] if ret.exception else None,\n                    },\n                    'process': {\n                        'ok': not ret.exception,\n                        'time': process_time,\n                        'follows': len(ret.follows),\n                        'result': (\n                            None if ret.result is None\n                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]\n                        ),\n                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],\n                        'exception': ret.exception,\n                    },\n                    'save': ret.save,\n            
    },\n            }\n            if 'schedule' in task:\n                status_pack['schedule'] = task['schedule']\n\n            # FIXME: unicode_obj should used in scheduler before store to database\n            # it's used here for performance.\n            self.status_queue.put(utils.unicode_obj(status_pack))\n\n        # FIXME: unicode_obj should used in scheduler before store to database\n        # it's used here for performance.\n        if ret.follows:\n            for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):\n                self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])\n\n        for project, msg, url in ret.messages:\n            try:\n                self.on_task({\n                    'taskid': utils.md5string(url),\n                    'project': project,\n                    'url': url,\n                    'process': {\n                        'callback': '_on_message',\n                    }\n                }, {\n                    'status_code': 200,\n                    'url': url,\n                    'save': (task['project'], msg),\n                })\n            except Exception as e:\n                logger.exception('Sending message error.')\n                continue\n\n        if ret.exception:\n            logger_func = logger.error\n        else:\n            logger_func = logger.info\n        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (\n            task['project'], task['taskid'],\n            task.get('url'), response.status_code, len(response.content),\n            ret.result, len(ret.follows), len(ret.messages), ret.exception))\n        return True\n\n    def quit(self):\n        '''Set quit signal'''\n        self._quit = True\n\n    def run(self):\n        '''Run loop'''\n        logger.info(\"processor starting...\")\n\n        while not self._quit:\n            try:\n                task, response = 
self.inqueue.get(timeout=1)\n                self.on_task(task, response)\n                self._exceptions = 0\n            except Queue.Empty as e:\n                continue\n            except KeyboardInterrupt:\n                break\n            except Exception as e:\n                logger.exception(e)\n                self._exceptions += 1\n                if self._exceptions > self.EXCEPTION_LIMIT:\n                    break\n                continue\n\n        logger.info(\"processor exiting...\")\n"
  },
  {
    "path": "pyspider/processor/project_module.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-16 22:24:20\n\nimport os\nimport six\nimport sys\nimport imp\nimport time\nimport weakref\nimport logging\nimport inspect\nimport traceback\nimport linecache\nfrom pyspider.libs import utils\nfrom pyspider.libs.log import SaveLogHandler, LogFormatter\nlogger = logging.getLogger(\"processor\")\n\n\nclass ProjectManager(object):\n    \"\"\"\n    load projects from projectdb, update project\n    \"\"\"\n\n    CHECK_PROJECTS_INTERVAL = 5 * 60\n    RELOAD_PROJECT_INTERVAL = 60 * 60\n\n    @staticmethod\n    def build_module(project, env=None):\n        '''Build project script as module'''\n        from pyspider.libs import base_handler\n        assert 'name' in project, 'need name of project'\n        assert 'script' in project, 'need script of project'\n\n        if env is None:\n            env = {}\n        # fix for old non-package version scripts\n        pyspider_path = os.path.join(os.path.dirname(__file__), \"..\")\n        if pyspider_path not in sys.path:\n            sys.path.insert(1, pyspider_path)\n\n        env = dict(env)\n        env.update({\n            'debug': project.get('status', 'DEBUG') == 'DEBUG',\n        })\n\n        loader = ProjectLoader(project)\n        module = loader.load_module(project['name'])\n\n        # logger inject\n        module.log_buffer = []\n        module.logging = module.logger = logging.Logger(project['name'])\n        if env.get('enable_stdout_capture', True):\n            handler = SaveLogHandler(module.log_buffer)\n            handler.setFormatter(LogFormatter(color=False))\n        else:\n            handler = logging.StreamHandler()\n            handler.setFormatter(LogFormatter(color=True))\n        module.logger.addHandler(handler)\n\n        if '__handler_cls__' not in module.__dict__:\n            BaseHandler = 
module.__dict__.get('BaseHandler', base_handler.BaseHandler)\n            for each in list(six.itervalues(module.__dict__)):\n                if inspect.isclass(each) and each is not BaseHandler \\\n                        and issubclass(each, BaseHandler):\n                    module.__dict__['__handler_cls__'] = each\n        _class = module.__dict__.get('__handler_cls__')\n        assert _class is not None, \"need BaseHandler in project module\"\n\n        instance = _class()\n        instance.__env__ = env\n        instance.project_name = project['name']\n        instance.project = project\n\n        return {\n            'loader': loader,\n            'module': module,\n            'class': _class,\n            'instance': instance,\n            'exception': None,\n            'exception_log': '',\n            'info': project,\n            'load_time': time.time(),\n        }\n\n    def __init__(self, projectdb, env):\n        self.projectdb = projectdb\n        self.env = env\n\n        self.projects = {}\n        self.last_check_projects = time.time()\n\n    def _need_update(self, project_name, updatetime=None, md5sum=None):\n        '''Check if project_name need update'''\n        if project_name not in self.projects:\n            return True\n        elif md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'):\n            return True\n        elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0):\n            return True\n        elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL:\n            return True\n        return False\n\n    def _check_projects(self):\n        '''Check projects by last update time'''\n        for project in self.projectdb.check_update(self.last_check_projects,\n                                                   ['name', 'updatetime']):\n            if project['name'] not in self.projects:\n                continue\n            if 
project['updatetime'] > self.projects[project['name']]['info'].get('updatetime', 0):\n                self._update_project(project['name'])\n        self.last_check_projects = time.time()\n\n    def _update_project(self, project_name):\n        '''Update one project from database'''\n        project = self.projectdb.get(project_name)\n        if not project:\n            return None\n        return self._load_project(project)\n\n    def _load_project(self, project):\n        '''Load project into self.projects from project info dict'''\n        try:\n            project['md5sum'] = utils.md5string(project['script'])\n            ret = self.build_module(project, self.env)\n            self.projects[project['name']] = ret\n        except Exception as e:\n            logger.exception(\"load project %s error\", project.get('name', None))\n            ret = {\n                'loader': None,\n                'module': None,\n                'class': None,\n                'instance': None,\n                'exception': e,\n                'exception_log': traceback.format_exc(),\n                'info': project,\n                'load_time': time.time(),\n            }\n            self.projects[project['name']] = ret\n            return False\n        logger.debug('project: %s updated.', project.get('name', None))\n        return True\n\n    def get(self, project_name, updatetime=None, md5sum=None):\n        '''get project data object, return None if not exists'''\n        if time.time() - self.last_check_projects > self.CHECK_PROJECTS_INTERVAL:\n            self._check_projects()\n        if self._need_update(project_name, updatetime, md5sum):\n            self._update_project(project_name)\n        return self.projects.get(project_name, None)\n\n\nclass ProjectLoader(object):\n    '''ProjectLoader class for sys.meta_path'''\n\n    def __init__(self, project, mod=None):\n        self.project = project\n        self.name = project['name']\n        self.mod = mod\n       
 pass\n\n    def load_module(self, fullname):\n        if self.mod is None:\n            self.mod = mod = imp.new_module(fullname)\n        else:\n            mod = self.mod\n        mod.__file__ = '<%s>' % self.name\n        mod.__loader__ = self\n        mod.__project__ = self.project\n        mod.__package__ = ''\n        code = self.get_code(fullname)\n        six.exec_(code, mod.__dict__)\n        linecache.clearcache()\n        if sys.version_info[:2] == (3, 3):\n            sys.modules[fullname] = mod\n        return mod\n\n    def is_package(self, fullname):\n        return False\n\n    def get_code(self, fullname):\n        return compile(self.get_source(fullname), '<%s>' % self.name, 'exec')\n\n    def get_source(self, fullname):\n        script = self.project['script']\n        if isinstance(script, six.text_type):\n            return script.encode('utf8')\n        return script\n\n\nif six.PY2:\n    class ProjectFinder(object):\n        '''ProjectFinder class for sys.meta_path'''\n\n        def __init__(self, projectdb):\n            self.get_projectdb = weakref.ref(projectdb)\n\n        @property\n        def projectdb(self):\n            return self.get_projectdb()\n\n        def find_module(self, fullname, path=None):\n            if fullname == 'projects':\n                return self\n            parts = fullname.split('.')\n            if len(parts) == 2 and parts[0] == 'projects':\n                name = parts[1]\n                if not self.projectdb:\n                    return\n                info = self.projectdb.get(name)\n                if info:\n                    return ProjectLoader(info)\n\n        def load_module(self, fullname):\n            mod = imp.new_module(fullname)\n            mod.__file__ = '<projects>'\n            mod.__loader__ = self\n            mod.__path__ = ['<projects>']\n            mod.__package__ = 'projects'\n            return mod\n\n        def is_package(self, fullname):\n            return True\nelse:\n    
import importlib.abc\n\n    class ProjectFinder(importlib.abc.MetaPathFinder):\n        '''ProjectFinder class for sys.meta_path'''\n\n        def __init__(self, projectdb):\n            self.get_projectdb = weakref.ref(projectdb)\n\n        @property\n        def projectdb(self):\n            return self.get_projectdb()\n\n        def find_spec(self, fullname, path, target=None):\n            loader = self.find_module(fullname, path)\n            if loader:\n                return importlib.util.spec_from_loader(fullname, loader)\n\n        def find_module(self, fullname, path):\n            if fullname == 'projects':\n                return ProjectsLoader()\n            parts = fullname.split('.')\n            if len(parts) == 2 and parts[0] == 'projects':\n                name = parts[1]\n                if not self.projectdb:\n                    return\n                info = self.projectdb.get(name)\n                if info:\n                    return ProjectLoader(info)\n\n    class ProjectsLoader(importlib.abc.InspectLoader):\n        def load_module(self, fullname):\n            mod = imp.new_module(fullname)\n            mod.__file__ = '<projects>'\n            mod.__loader__ = self\n            mod.__path__ = ['<projects>']\n            mod.__package__ = 'projects'\n            if sys.version_info[:2] == (3, 3):\n                sys.modules[fullname] = mod\n            return mod\n\n        def module_repr(self, module):\n            return '<Module projects>'\n\n        def is_package(self, fullname):\n            return True\n\n        def get_source(self, path):\n            return ''\n\n        def get_code(self, fullname):\n            return compile(self.get_source(fullname), '<projects>', 'exec')\n\n    class ProjectLoader(ProjectLoader, importlib.abc.Loader):\n        def create_module(self, spec):\n            return self.load_module(spec.name)\n\n        def exec_module(self, module):\n            return module\n\n        def module_repr(self, 
module):\n            return '<Module projects.%s>' % self.name\n"
  },
  {
    "path": "pyspider/result/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-19 16:10:19\n\nfrom .result_worker import ResultWorker, OneResultWorker\n"
  },
  {
    "path": "pyspider/result/result_worker.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-19 15:37:46\n\nimport time\nimport json\nimport logging\nfrom six.moves import queue as Queue\nlogger = logging.getLogger(\"result\")\n\n\nclass ResultWorker(object):\n\n    \"\"\"\n    do with result\n    override this if needed.\n    \"\"\"\n\n    def __init__(self, resultdb, inqueue):\n        self.resultdb = resultdb\n        self.inqueue = inqueue\n        self._quit = False\n\n    def on_result(self, task, result):\n        '''Called every result'''\n        if not result:\n            return\n        if 'taskid' in task and 'project' in task and 'url' in task:\n            logger.info('result %s:%s %s -> %.30r' % (\n                task['project'], task['taskid'], task['url'], result))\n            return self.resultdb.save(\n                project=task['project'],\n                taskid=task['taskid'],\n                url=task['url'],\n                result=result\n            )\n        else:\n            logger.warning('result UNKNOW -> %.30r' % result)\n            return\n\n    def quit(self):\n        self._quit = True\n\n    def run(self):\n        '''Run loop'''\n        logger.info(\"result_worker starting...\")\n\n        while not self._quit:\n            try:\n                task, result = self.inqueue.get(timeout=1)\n                self.on_result(task, result)\n            except Queue.Empty as e:\n                continue\n            except KeyboardInterrupt:\n                break\n            except AssertionError as e:\n                logger.error(e)\n                continue\n            except Exception as e:\n                logger.exception(e)\n                continue\n\n        logger.info(\"result_worker exiting...\")\n\n\nclass OneResultWorker(ResultWorker):\n    '''Result Worker for one mode, write results to stdout'''\n    def 
on_result(self, task, result):\n        '''Called every result'''\n        if not result:\n            return\n        if 'taskid' in task and 'project' in task and 'url' in task:\n            logger.info('result %s:%s %s -> %.30r' % (\n                task['project'], task['taskid'], task['url'], result))\n            print(json.dumps({\n                'taskid': task['taskid'],\n                'project': task['project'],\n                'url': task['url'],\n                'result': result,\n                'updatetime': time.time()\n            }))\n        else:\n            logger.warning('result UNKNOW -> %.30r' % result)\n            return\n"
  },
  {
    "path": "pyspider/run.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-03-05 00:11:49\n\n\nimport os\nimport sys\nimport six\nimport copy\nimport time\nimport shutil\nimport logging\nimport logging.config\n\nimport click\nimport pyspider\nfrom pyspider.message_queue import connect_message_queue\nfrom pyspider.database import connect_database\nfrom pyspider.libs import utils\n\n\ndef read_config(ctx, param, value):\n    if not value:\n        return {}\n    import json\n\n    def underline_dict(d):\n        if not isinstance(d, dict):\n            return d\n        return dict((k.replace('-', '_'), underline_dict(v)) for k, v in six.iteritems(d))\n\n    config = underline_dict(json.load(value))\n    ctx.default_map = config\n    return config\n\n\ndef connect_db(ctx, param, value):\n    if not value:\n        return\n    return utils.Get(lambda: connect_database(value))\n\n\ndef load_cls(ctx, param, value):\n    if isinstance(value, six.string_types):\n        return utils.load_object(value)\n    return value\n\n\ndef connect_rpc(ctx, param, value):\n    if not value:\n        return\n    try:\n        from six.moves import xmlrpc_client\n    except ImportError:\n        import xmlrpclib as xmlrpc_client\n    return xmlrpc_client.ServerProxy(value, allow_none=True)\n\n\n@click.group(invoke_without_command=True)\n@click.option('-c', '--config', callback=read_config, type=click.File('r'),\n              help='a json file with default values for subcommands. 
{\"webui\": {\"port\":5001}}')\n@click.option('--logging-config', default=os.path.join(os.path.dirname(__file__), \"logging.conf\"),\n              help=\"logging config file for built-in python logging module\", show_default=True)\n@click.option('--debug', envvar='DEBUG', default=False, is_flag=True, help='debug mode')\n@click.option('--queue-maxsize', envvar='QUEUE_MAXSIZE', default=100,\n              help='maxsize of queue')\n@click.option('--taskdb', envvar='TASKDB', callback=connect_db,\n              help='database url for taskdb, default: sqlite')\n@click.option('--projectdb', envvar='PROJECTDB', callback=connect_db,\n              help='database url for projectdb, default: sqlite')\n@click.option('--resultdb', envvar='RESULTDB', callback=connect_db,\n              help='database url for resultdb, default: sqlite')\n@click.option('--message-queue', envvar='AMQP_URL',\n              help='connection url to message queue, '\n              'default: builtin multiprocessing.Queue')\n@click.option('--amqp-url', help='[deprecated] amqp url for rabbitmq. '\n              'please use --message-queue instead.')\n@click.option('--beanstalk', envvar='BEANSTALK_HOST',\n              help='[deprecated] beanstalk config for beanstalk queue. 
'\n              'please use --message-queue instead.')\n@click.option('--phantomjs-proxy', envvar='PHANTOMJS_PROXY', help=\"phantomjs proxy ip:port\")\n@click.option('--puppeteer-proxy', envvar='PUPPETEER_PROXY', help=\"puppeteer proxy ip:port\")\n@click.option('--data-path', default='./data', help='data dir path')\n@click.option('--add-sys-path/--not-add-sys-path', default=True, is_flag=True,\n              help='add current working directory to python lib search path')\n@click.version_option(version=pyspider.__version__, prog_name=pyspider.__name__)\n@click.pass_context\ndef cli(ctx, **kwargs):\n    \"\"\"\n    A powerful spider system in python.\n    \"\"\"\n    if kwargs['add_sys_path']:\n        sys.path.append(os.getcwd())\n\n    logging.config.fileConfig(kwargs['logging_config'])\n\n    # get db from env\n    for db in ('taskdb', 'projectdb', 'resultdb'):\n        if kwargs[db] is not None:\n            continue\n        if os.environ.get('MYSQL_NAME'):\n            kwargs[db] = utils.Get(lambda db=db: connect_database(\n                'sqlalchemy+mysql+%s://%s:%s/%s' % (\n                    db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],\n                    os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))\n        elif os.environ.get('MONGODB_NAME'):\n            kwargs[db] = utils.Get(lambda db=db: connect_database(\n                'mongodb+%s://%s:%s/%s' % (\n                    db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],\n                    os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))\n        elif os.environ.get('COUCHDB_NAME'):\n            kwargs[db] = utils.Get(lambda db=db: connect_database(\n                'couchdb+%s://%s:%s/%s' % (\n                    db,\n                    os.environ['COUCHDB_PORT_5984_TCP_ADDR'] or 'couchdb',\n                    os.environ['COUCHDB_PORT_5984_TCP_PORT'] or '5984',\n                    db)))\n        elif ctx.invoked_subcommand == 'bench':\n            if kwargs['data_path'] == './data':\n              
  kwargs['data_path'] += '/bench'\n                shutil.rmtree(kwargs['data_path'], ignore_errors=True)\n                os.mkdir(kwargs['data_path'])\n            if db in ('taskdb', 'resultdb'):\n                kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))\n            elif db in ('projectdb', ):\n                kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (\n                    db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))\n        else:\n            if not os.path.exists(kwargs['data_path']):\n                os.mkdir(kwargs['data_path'])\n            kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (\n                db, kwargs['data_path'], db[:-2])))\n            kwargs['is_%s_default' % db] = True\n\n    # create folder for counter.dump\n    if not os.path.exists(kwargs['data_path']):\n        os.mkdir(kwargs['data_path'])\n\n    # message queue, compatible with old version\n    if kwargs.get('message_queue'):\n        pass\n    elif kwargs.get('amqp_url'):\n        kwargs['message_queue'] = kwargs['amqp_url']\n    elif os.environ.get('RABBITMQ_NAME'):\n        kwargs['message_queue'] = (\"amqp://guest:guest@%(RABBITMQ_PORT_5672_TCP_ADDR)s\"\n                                   \":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F\" % os.environ)\n\n    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',\n                 'fetcher2processor', 'processor2result'):\n        if kwargs.get('message_queue'):\n            kwargs[name] = utils.Get(lambda name=name: connect_message_queue(\n                name, kwargs.get('message_queue'), kwargs['queue_maxsize']))\n        else:\n            kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'),\n                                                 kwargs['queue_maxsize'])\n\n    # phantomjs-proxy\n    if kwargs.get('phantomjs_proxy'):\n        pass\n    elif 
os.environ.get('PHANTOMJS_NAME'):\n        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):]\n\n    # puppeteer-proxy\n    if kwargs.get('puppeteer_proxy'):\n        pass\n    elif os.environ.get('PUPPETEER_NAME'):\n        kwargs['puppeteer_proxy'] = os.environ['PUPPETEER_PORT_22222_TCP'][len('tcp://'):]\n\n    ctx.obj = utils.ObjectDict(ctx.obj or {})\n    ctx.obj['instances'] = []\n    ctx.obj.update(kwargs)\n\n    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):\n        ctx.invoke(all)\n    return ctx\n\n\n@cli.command()\n@click.option('--xmlrpc', is_flag=True, help=\"Enable xmlrpc (Default=True)\")\n@click.option('--no-xmlrpc', is_flag=True, help=\"Disable xmlrpc\")\n@click.option('--xmlrpc-host', default='0.0.0.0')\n@click.option('--xmlrpc-port', envvar='SCHEDULER_XMLRPC_PORT', default=23333)\n@click.option('--inqueue-limit', default=0,\n              help='size limit of task queue for each project, '\n              'tasks will been ignored when overflow')\n@click.option('--delete-time', default=24 * 60 * 60,\n              help='delete time before marked as delete')\n@click.option('--active-tasks', default=100, help='active log size')\n@click.option('--loop-limit', default=1000, help='maximum number of tasks due with in a loop')\n@click.option('--fail-pause-num', default=10, help='auto pause the project when last FAIL_PAUSE_NUM task failed, set 0 to disable')\n@click.option('--scheduler-cls', default='pyspider.scheduler.ThreadBaseScheduler', callback=load_cls,\n              help='scheduler class to be used.')\n@click.option('--threads', default=None, help='thread number for ThreadBaseScheduler, default: 4')\n@click.pass_context\ndef scheduler(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port,\n              inqueue_limit, delete_time, active_tasks, loop_limit, fail_pause_num,\n              scheduler_cls, threads, get_object=False):\n    \"\"\"\n    Run Scheduler, only one scheduler is allowed.\n    
\"\"\"\n    g = ctx.obj\n    Scheduler = load_cls(None, None, scheduler_cls)\n\n    kwargs = dict(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,\n                  newtask_queue=g.newtask_queue, status_queue=g.status_queue,\n                  out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data'))\n    if threads:\n        kwargs['threads'] = int(threads)\n\n    scheduler = Scheduler(**kwargs)\n    scheduler.INQUEUE_LIMIT = inqueue_limit\n    scheduler.DELETE_TIME = delete_time\n    scheduler.ACTIVE_TASKS = active_tasks\n    scheduler.LOOP_LIMIT = loop_limit\n    scheduler.FAIL_PAUSE_NUM = fail_pause_num\n\n    g.instances.append(scheduler)\n    if g.get('testing_mode') or get_object:\n        return scheduler\n\n    if not no_xmlrpc:\n        utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)\n\n    scheduler.run()\n\n\n@cli.command()\n@click.option('--xmlrpc', is_flag=True, help=\"Enable xmlrpc (Default=True)\")\n@click.option('--no-xmlrpc', is_flag=True, help=\"Disable xmlrpc\")\n@click.option('--xmlrpc-host', default='0.0.0.0')\n@click.option('--xmlrpc-port', envvar='FETCHER_XMLRPC_PORT', default=24444)\n@click.option('--poolsize', default=100, help=\"max simultaneous fetches\")\n@click.option('--proxy', help=\"proxy host:port\")\n@click.option('--user-agent', help='user agent')\n@click.option('--timeout', help='default fetch timeout')\n@click.option('--phantomjs-endpoint', help=\"endpoint of phantomjs, start via pyspider phantomjs\")\n@click.option('--puppeteer-endpoint', help=\"endpoint of puppeteer, start via pyspider puppeteer\")\n@click.option('--splash-endpoint', help=\"execute endpoint of splash: http://splash.readthedocs.io/en/stable/api.html#execute\")\n@click.option('--fetcher-cls', default='pyspider.fetcher.Fetcher', callback=load_cls,\n              help='Fetcher class to be used.')\n@click.pass_context\ndef fetcher(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent,\n  
          timeout, phantomjs_endpoint, puppeteer_endpoint, splash_endpoint, fetcher_cls,\n            async_mode=True, get_object=False, no_input=False):\n    \"\"\"\n    Run Fetcher.\n    \"\"\"\n    g = ctx.obj\n    Fetcher = load_cls(None, None, fetcher_cls)\n\n    if no_input:\n        inqueue = None\n        outqueue = None\n    else:\n        inqueue = g.scheduler2fetcher\n        outqueue = g.fetcher2processor\n    fetcher = Fetcher(inqueue=inqueue, outqueue=outqueue,\n                      poolsize=poolsize, proxy=proxy, async_mode=async_mode)\n    fetcher.phantomjs_proxy = phantomjs_endpoint or g.phantomjs_proxy\n    fetcher.puppeteer_proxy = puppeteer_endpoint or g.puppeteer_proxy\n    fetcher.splash_endpoint = splash_endpoint\n    if user_agent:\n        fetcher.user_agent = user_agent\n    if timeout:\n        fetcher.default_options = copy.deepcopy(fetcher.default_options)\n        fetcher.default_options['timeout'] = timeout\n\n    g.instances.append(fetcher)\n    if g.get('testing_mode') or get_object:\n        return fetcher\n\n    if not no_xmlrpc:\n        utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)\n\n    fetcher.run()\n\n\n@cli.command()\n@click.option('--processor-cls', default='pyspider.processor.Processor',\n              callback=load_cls, help='Processor class to be used.')\n@click.option('--process-time-limit', default=30, help='script process time limit')\n@click.pass_context\ndef processor(ctx, processor_cls, process_time_limit, enable_stdout_capture=True, get_object=False):\n    \"\"\"\n    Run Processor.\n    \"\"\"\n    g = ctx.obj\n    Processor = load_cls(None, None, processor_cls)\n\n    processor = Processor(projectdb=g.projectdb,\n                          inqueue=g.fetcher2processor, status_queue=g.status_queue,\n                          newtask_queue=g.newtask_queue, result_queue=g.processor2result,\n                          enable_stdout_capture=enable_stdout_capture,\n                         
 process_time_limit=process_time_limit)\n\n    g.instances.append(processor)\n    if g.get('testing_mode') or get_object:\n        return processor\n\n    processor.run()\n\n\n@cli.command()\n@click.option('--result-cls', default='pyspider.result.ResultWorker', callback=load_cls,\n              help='ResultWorker class to be used.')\n@click.pass_context\ndef result_worker(ctx, result_cls, get_object=False):\n    \"\"\"\n    Run result worker.\n    \"\"\"\n    g = ctx.obj\n    ResultWorker = load_cls(None, None, result_cls)\n\n    result_worker = ResultWorker(resultdb=g.resultdb, inqueue=g.processor2result)\n\n    g.instances.append(result_worker)\n    if g.get('testing_mode') or get_object:\n        return result_worker\n\n    result_worker.run()\n\n\n@cli.command()\n@click.option('--host', default='0.0.0.0', envvar='WEBUI_HOST',\n              help='webui bind to host')\n@click.option('--port', default=5000, envvar='WEBUI_PORT',\n              help='webui bind to port')\n@click.option('--cdn', default='//cdnjs.cloudflare.com/ajax/libs/',\n              help='js/css cdn server')\n@click.option('--scheduler-rpc', help='xmlrpc path of scheduler')\n@click.option('--fetcher-rpc', help='xmlrpc path of fetcher')\n@click.option('--max-rate', type=float, help='max rate for each project')\n@click.option('--max-burst', type=float, help='max burst for each project')\n@click.option('--username', envvar='WEBUI_USERNAME',\n              help='username of lock -ed projects')\n@click.option('--password', envvar='WEBUI_PASSWORD',\n              help='password of lock -ed projects')\n@click.option('--need-auth', is_flag=True, default=False, help='need username and password')\n@click.option('--webui-instance', default='pyspider.webui.app.app', callback=load_cls,\n              help='webui Flask Application instance to be used.')\n@click.option('--process-time-limit', default=30, help='script process time limit in debug')\n@click.pass_context\ndef webui(ctx, host, port, cdn, 
scheduler_rpc, fetcher_rpc, max_rate, max_burst,\n          username, password, need_auth, webui_instance, process_time_limit, get_object=False):\n    \"\"\"\n    Run WebUI\n    \"\"\"\n    app = load_cls(None, None, webui_instance)\n\n    g = ctx.obj\n    app.config['taskdb'] = g.taskdb\n    app.config['projectdb'] = g.projectdb\n    app.config['resultdb'] = g.resultdb\n    app.config['cdn'] = cdn\n\n    if max_rate:\n        app.config['max_rate'] = max_rate\n    if max_burst:\n        app.config['max_burst'] = max_burst\n    if username:\n        app.config['webui_username'] = username\n    if password:\n        app.config['webui_password'] = password\n    app.config['need_auth'] = need_auth\n    app.config['process_time_limit'] = process_time_limit\n\n    # inject queues for webui\n    for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',\n                 'fetcher2processor', 'processor2result'):\n        app.config['queues'][name] = getattr(g, name, None)\n\n    # fetcher rpc\n    if isinstance(fetcher_rpc, six.string_types):\n        import umsgpack\n        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)\n        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)\n    else:\n        # get fetcher instance for webui\n        fetcher_config = g.config.get('fetcher', {})\n        webui_fetcher = ctx.invoke(fetcher, async_mode=False, get_object=True, no_input=True, **fetcher_config)\n\n        app.config['fetch'] = lambda x: webui_fetcher.fetch(x)\n\n    # scheduler rpc\n    if isinstance(scheduler_rpc, six.string_types):\n        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)\n    if scheduler_rpc is None and os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'):\n        app.config['scheduler_rpc'] = connect_rpc(ctx, None,\n                                                  'http://{}:{}/'.format(os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'),\n                                                                         
os.environ.get('SCHEDULER_PORT_23333_TCP_PORT') or 23333))\n    elif scheduler_rpc is None:\n        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')\n    else:\n        app.config['scheduler_rpc'] = scheduler_rpc\n\n\n    app.debug = g.debug\n    g.instances.append(app)\n    if g.get('testing_mode') or get_object:\n        return app\n\n    app.run(host=host, port=port)\n\n\n@cli.command()\n@click.option('--phantomjs-path', default='phantomjs', help='phantomjs path')\n@click.option('--port', default=25555, help='phantomjs port')\n@click.option('--auto-restart', default=False, help='auto restart phantomjs if crashed')\n@click.argument('args', nargs=-1)\n@click.pass_context\ndef phantomjs(ctx, phantomjs_path, port, auto_restart, args):\n    \"\"\"\n    Run phantomjs fetcher if phantomjs is installed.\n    \"\"\"\n    args = args or ctx.default_map and ctx.default_map.get('args', [])\n\n    import subprocess\n    g = ctx.obj\n    _quit = []\n    phantomjs_fetcher = os.path.join(\n        os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')\n    cmd = [phantomjs_path,\n           # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903\n           #'--load-images=false',\n           '--ssl-protocol=any',\n           '--disk-cache=true'] + list(args or []) + [phantomjs_fetcher, str(port)]\n\n    try:\n        _phantomjs = subprocess.Popen(cmd)\n    except OSError:\n        logging.warning('phantomjs not found, continue running without it.')\n        return None\n\n    def quit(*args, **kwargs):\n        _quit.append(1)\n        _phantomjs.kill()\n        _phantomjs.wait()\n        logging.info('phantomjs exited.')\n\n    if not g.get('phantomjs_proxy'):\n        g['phantomjs_proxy'] = '127.0.0.1:%s' % port\n\n    phantomjs = utils.ObjectDict(port=port, quit=quit)\n    g.instances.append(phantomjs)\n    if g.get('testing_mode'):\n        return phantomjs\n\n    while True:\n        _phantomjs.wait()\n    
    if _quit or not auto_restart:\n            break\n        _phantomjs = subprocess.Popen(cmd)\n\n@cli.command()\n@click.option('--port', default=22222, help='puppeteer port')\n@click.option('--auto-restart', default=False, help='auto restart puppeteer if crashed')\n@click.argument('args', nargs=-1)\n@click.pass_context\ndef puppeteer(ctx, port, auto_restart, args):\n    \"\"\"\n    Run puppeteer fetcher if puppeteer is installed.\n    \"\"\"\n\n    import subprocess\n    g = ctx.obj\n    _quit = []\n    puppeteer_fetcher = os.path.join(\n        os.path.dirname(pyspider.__file__), 'fetcher/puppeteer_fetcher.js')\n\n    cmd = ['node', puppeteer_fetcher, str(port)]\n    try:\n        _puppeteer = subprocess.Popen(cmd)\n    except OSError:\n        logging.warning('puppeteer not found, continue running without it.')\n        return None\n\n    def quit(*args, **kwargs):\n        _quit.append(1)\n        _puppeteer.kill()\n        _puppeteer.wait()\n        logging.info('puppeteer exited.')\n\n    if not g.get('puppeteer_proxy'):\n        g['puppeteer_proxy'] = '127.0.0.1:%s' % port\n\n    puppeteer = utils.ObjectDict(port=port, quit=quit)\n    g.instances.append(puppeteer)\n    if g.get('testing_mode'):\n        return puppeteer\n\n    while True:\n        _puppeteer.wait()\n        if _quit or not auto_restart:\n            break\n        _puppeteer = subprocess.Popen(cmd)\n\n\n@cli.command()\n@click.option('--fetcher-num', default=1, help='instance num of fetcher')\n@click.option('--processor-num', default=1, help='instance num of processor')\n@click.option('--result-worker-num', default=1,\n              help='instance num of result worker')\n@click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),\n              help='run each components in thread or subprocess. 
'\n              'always using thread for windows.')\n@click.pass_context\ndef all(ctx, fetcher_num, processor_num, result_worker_num, run_in):\n    \"\"\"\n    Run all the components in subprocess or thread\n    \"\"\"\n\n    ctx.obj['debug'] = False\n    g = ctx.obj\n\n    # FIXME: py34 cannot run components with threads\n    if run_in == 'subprocess' and os.name != 'nt':\n        run_in = utils.run_in_subprocess\n    else:\n        run_in = utils.run_in_thread\n\n    threads = []\n\n    try:\n        # phantomjs\n        if not g.get('phantomjs_proxy'):\n            phantomjs_config = g.config.get('phantomjs', {})\n            phantomjs_config.setdefault('auto_restart', True)\n            threads.append(run_in(ctx.invoke, phantomjs, **phantomjs_config))\n            time.sleep(2)\n            if threads[-1].is_alive() and not g.get('phantomjs_proxy'):\n                g['phantomjs_proxy'] = '127.0.0.1:%s' % phantomjs_config.get('port', 25555)\n\n        # puppeteer\n        if not g.get('puppeteer_proxy'):\n            puppeteer_config = g.config.get('puppeteer', {})\n            puppeteer_config.setdefault('auto_restart', True)\n            threads.append(run_in(ctx.invoke, puppeteer, **puppeteer_config))\n            time.sleep(2)\n            if threads[-1].is_alive() and not g.get('puppeteer_proxy'):\n                g['puppeteer_proxy'] = '127.0.0.1:%s' % puppeteer_config.get('port', 22222)\n\n        # result worker\n        result_worker_config = g.config.get('result_worker', {})\n        for i in range(result_worker_num):\n            threads.append(run_in(ctx.invoke, result_worker, **result_worker_config))\n\n        # processor\n        processor_config = g.config.get('processor', {})\n        for i in range(processor_num):\n            threads.append(run_in(ctx.invoke, processor, **processor_config))\n\n        # fetcher\n        fetcher_config = g.config.get('fetcher', {})\n        fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')\n        for i 
in range(fetcher_num):\n            threads.append(run_in(ctx.invoke, fetcher, **fetcher_config))\n\n        # scheduler\n        scheduler_config = g.config.get('scheduler', {})\n        scheduler_config.setdefault('xmlrpc_host', '127.0.0.1')\n        threads.append(run_in(ctx.invoke, scheduler, **scheduler_config))\n\n        # running webui in main thread to make it exitable\n        webui_config = g.config.get('webui', {})\n        webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/'\n                                % g.config.get('scheduler', {}).get('xmlrpc_port', 23333))\n        ctx.invoke(webui, **webui_config)\n    finally:\n        # exit components run in threading\n        for each in g.instances:\n            each.quit()\n\n        # exit components run in subprocess\n        for each in threads:\n            if not each.is_alive():\n                continue\n            if hasattr(each, 'terminate'):\n                each.terminate()\n            each.join()\n\n\n@cli.command()\n@click.option('--fetcher-num', default=1, help='instance num of fetcher')\n@click.option('--processor-num', default=2, help='instance num of processor')\n@click.option('--result-worker-num', default=1, help='instance num of result worker')\n@click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),\n              help='run each components in thread or subprocess. 
'\n              'always using thread for windows.')\n@click.option('--total', default=10000, help=\"total url in test page\")\n@click.option('--show', default=20, help=\"show how many urls in a page\")\n@click.option('--taskdb-bench', default=False, is_flag=True,\n              help=\"only run taskdb bench test\")\n@click.option('--message-queue-bench', default=False, is_flag=True,\n              help=\"only run message queue bench test\")\n@click.option('--all-bench', default=False, is_flag=True,\n              help=\"only run all bench test\")\n@click.pass_context\ndef bench(ctx, fetcher_num, processor_num, result_worker_num, run_in, total, show,\n          taskdb_bench, message_queue_bench, all_bench):\n    \"\"\"\n    Run Benchmark test.\n    In bench mode, in-memory sqlite database is used instead of on-disk sqlite database.\n    \"\"\"\n    from pyspider.libs import bench\n    from pyspider.webui import bench_test  # flake8: noqa\n\n    ctx.obj['debug'] = False\n    g = ctx.obj\n    if result_worker_num == 0:\n        g['processor2result'] = None\n\n    if run_in == 'subprocess' and os.name != 'nt':\n        run_in = utils.run_in_subprocess\n    else:\n        run_in = utils.run_in_thread\n\n    all_test = not taskdb_bench and not message_queue_bench and not all_bench\n\n    # test taskdb\n    if all_test or taskdb_bench:\n        bench.bench_test_taskdb(g.taskdb)\n    # test message queue\n    if all_test or message_queue_bench:\n        bench.bench_test_message_queue(g.scheduler2fetcher)\n    # test all\n    if not all_test and not all_bench:\n        return\n\n    project_name = 'bench'\n\n    def clear_project():\n        g.taskdb.drop(project_name)\n        g.resultdb.drop(project_name)\n\n    clear_project()\n\n    # disable log\n    logging.getLogger().setLevel(logging.ERROR)\n    logging.getLogger('scheduler').setLevel(logging.ERROR)\n    logging.getLogger('fetcher').setLevel(logging.ERROR)\n    
logging.getLogger('processor').setLevel(logging.ERROR)\n    logging.getLogger('result').setLevel(logging.ERROR)\n    logging.getLogger('webui').setLevel(logging.ERROR)\n    logging.getLogger('werkzeug').setLevel(logging.ERROR)\n\n    try:\n        threads = []\n\n        # result worker\n        result_worker_config = g.config.get('result_worker', {})\n        for i in range(result_worker_num):\n            threads.append(run_in(ctx.invoke, result_worker,\n                                  result_cls='pyspider.libs.bench.BenchResultWorker',\n                                  **result_worker_config))\n\n        # processor\n        processor_config = g.config.get('processor', {})\n        for i in range(processor_num):\n            threads.append(run_in(ctx.invoke, processor,\n                                  processor_cls='pyspider.libs.bench.BenchProcessor',\n                                  **processor_config))\n\n        # fetcher\n        fetcher_config = g.config.get('fetcher', {})\n        fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')\n        for i in range(fetcher_num):\n            threads.append(run_in(ctx.invoke, fetcher,\n                                  fetcher_cls='pyspider.libs.bench.BenchFetcher',\n                                  **fetcher_config))\n\n        # webui\n        webui_config = g.config.get('webui', {})\n        webui_config.setdefault('scheduler_rpc', 'http://127.0.0.1:%s/'\n                                % g.config.get('scheduler', {}).get('xmlrpc_port', 23333))\n        threads.append(run_in(ctx.invoke, webui, **webui_config))\n\n        # scheduler\n        scheduler_config = g.config.get('scheduler', {})\n        scheduler_config.setdefault('xmlrpc_host', '127.0.0.1')\n        scheduler_config.setdefault('xmlrpc_port', 23333)\n        threads.append(run_in(ctx.invoke, scheduler,\n                              scheduler_cls='pyspider.libs.bench.BenchScheduler',\n                              **scheduler_config))\n      
  scheduler_rpc = connect_rpc(ctx, None,\n                                    'http://%(xmlrpc_host)s:%(xmlrpc_port)s/' % scheduler_config)\n\n        for _ in range(20):\n            if utils.check_port_open(23333):\n                break\n            time.sleep(1)\n\n        scheduler_rpc.newtask({\n            \"project\": project_name,\n            \"taskid\": \"on_start\",\n            \"url\": \"data:,on_start\",\n            \"fetch\": {\n                \"save\": {\"total\": total, \"show\": show}\n            },\n            \"process\": {\n                \"callback\": \"on_start\",\n            },\n        })\n\n        # wait bench test finished\n        while True:\n            time.sleep(1)\n            if scheduler_rpc.size() == 0:\n                break\n    finally:\n        # exit components run in threading\n        for each in g.instances:\n            each.quit()\n\n        # exit components run in subprocess\n        for each in threads:\n            if hasattr(each, 'terminate'):\n                each.terminate()\n            each.join(1)\n\n        clear_project()\n\n\n@cli.command()\n@click.option('-i', '--interactive', default=False, is_flag=True,\n              help='enable interactive mode, you can choose crawl url.')\n@click.option('--phantomjs', 'enable_phantomjs', default=False, is_flag=True,\n              help='enable phantomjs, will spawn a subprocess for phantomjs')\n@click.option('--puppeteer', 'enable_puppeteer', default=False, is_flag=True,\n              help='enable puppeteer, will spawn a subprocess for puppeteer')\n@click.argument('scripts', nargs=-1)\n@click.pass_context\ndef one(ctx, interactive, enable_phantomjs, enable_puppeteer, scripts):\n    \"\"\"\n    One mode not only means all-in-one, it runs every thing in one process over\n    tornado.ioloop, for debug purpose\n    \"\"\"\n\n    ctx.obj['debug'] = False\n    g = ctx.obj\n    g['testing_mode'] = True\n\n    if scripts:\n        from 
pyspider.database.local.projectdb import ProjectDB\n        g['projectdb'] = ProjectDB(scripts)\n        if g.get('is_taskdb_default'):\n            g['taskdb'] = connect_database('sqlite+taskdb://')\n        if g.get('is_resultdb_default'):\n            g['resultdb'] = None\n\n    if enable_phantomjs:\n        phantomjs_config = g.config.get('phantomjs', {})\n        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)\n        if phantomjs_obj:\n            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)\n    else:\n        phantomjs_obj = None\n\n    if enable_puppeteer:\n        puppeteer_config = g.config.get('puppeteer', {})\n        puppeteer_obj = ctx.invoke(puppeteer, **puppeteer_config)\n        if puppeteer_obj:\n            g.setdefault('puppeteer_proxy', '127.0.0.1:%s' % puppeteer_obj.port)\n    else:\n        puppeteer_obj = None\n\n    result_worker_config = g.config.get('result_worker', {})\n    if g.resultdb is None:\n        result_worker_config.setdefault('result_cls',\n                                        'pyspider.result.OneResultWorker')\n    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)\n\n    processor_config = g.config.get('processor', {})\n    processor_config.setdefault('enable_stdout_capture', False)\n    processor_obj = ctx.invoke(processor, **processor_config)\n\n    fetcher_config = g.config.get('fetcher', {})\n    fetcher_config.setdefault('xmlrpc', False)\n    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)\n\n    scheduler_config = g.config.get('scheduler', {})\n    scheduler_config.setdefault('xmlrpc', False)\n    scheduler_config.setdefault('scheduler_cls',\n                                'pyspider.scheduler.OneScheduler')\n    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)\n\n    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,\n                           fetcher=fetcher_obj,\n                           processor=processor_obj,\n                           
result_worker=result_worker_obj,\n                           interactive=interactive)\n    if scripts:\n        for project in g.projectdb.projects:\n            scheduler_obj.trigger_on_start(project)\n\n    try:\n        scheduler_obj.run()\n    finally:\n        scheduler_obj.quit()\n        if phantomjs_obj:\n            phantomjs_obj.quit()\n        if puppeteer_obj:\n            puppeteer_obj.quit()\n\n\n@cli.command()\n@click.option('--scheduler-rpc', callback=connect_rpc, help='xmlrpc path of scheduler')\n@click.argument('project', nargs=1)\n@click.argument('message', nargs=1)\n@click.pass_context\ndef send_message(ctx, scheduler_rpc, project, message):\n    \"\"\"\n    Send Message to project from command line\n    \"\"\"\n    if isinstance(scheduler_rpc, six.string_types):\n        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)\n    if scheduler_rpc is None and os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'):\n        scheduler_rpc = connect_rpc(ctx, None, 'http://%s:%s/' % (os.environ['SCHEDULER_PORT_23333_TCP_ADDR'],\n                                                                  os.environ['SCHEDULER_PORT_23333_TCP_PORT'] or 23333))\n    if scheduler_rpc is None:\n        scheduler_rpc = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')\n\n    return scheduler_rpc.send_task({\n        'taskid': utils.md5string('data:,on_message'),\n        'project': project,\n        'url': 'data:,on_message',\n        'fetch': {\n            'save': ('__command__', message),\n        },\n        'process': {\n            'callback': '_on_message',\n        }\n    })\n\n\ndef main():\n    cli()\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "pyspider/scheduler/__init__.py",
    "content": "from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler  # NOQA\n"
  },
  {
    "path": "pyspider/scheduler/scheduler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-07 17:05:11\n\n\nimport itertools\nimport json\nimport logging\nimport os\nimport time\nfrom collections import deque\n\nfrom six import iteritems, itervalues\nfrom six.moves import queue as Queue\n\nfrom pyspider.libs import counter, utils\nfrom pyspider.libs.base_handler import BaseHandler\nfrom .task_queue import TaskQueue\n\nlogger = logging.getLogger('scheduler')\n\n\nclass Project(object):\n    '''\n    project for scheduler\n    '''\n    def __init__(self, scheduler, project_info):\n        '''\n        '''\n        self.scheduler = scheduler\n\n        self.active_tasks = deque(maxlen=scheduler.ACTIVE_TASKS)\n        self.task_queue = TaskQueue()\n        self.task_loaded = False\n        self._selected_tasks = False  # selected tasks after recent pause\n        self._send_finished_event_wait = 0  # wait for scheduler.FAIL_PAUSE_NUM loop steps before sending the event\n\n        self.md5sum = None\n        self._send_on_get_info = False\n        self.waiting_get_info = True\n\n        self._paused = False\n        self._paused_time = 0\n        self._unpause_last_seen = None\n\n        self.update(project_info)\n\n    @property\n    def paused(self):\n        if self.scheduler.FAIL_PAUSE_NUM <= 0:\n            return False\n\n        # unpaused --(last FAIL_PAUSE_NUM task failed)--> paused --(PAUSE_TIME)--> unpause_checking\n        #                         unpaused <--(last UNPAUSE_CHECK_NUM task have success)--|\n        #                             paused <--(last UNPAUSE_CHECK_NUM task no success)--|\n        if not self._paused:\n            fail_cnt = 0\n            for _, task in self.active_tasks:\n                # ignore select task\n                if task.get('type') == self.scheduler.TASK_PACK:\n                    continue\n                if 'process' 
not in task['track']:\n                    logger.error('process not in task, %r', task)\n                if task['track']['process']['ok']:\n                    break\n                else:\n                    fail_cnt += 1\n                if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM:\n                    break\n            if fail_cnt >= self.scheduler.FAIL_PAUSE_NUM:\n                self._paused = True\n                self._paused_time = time.time()\n        elif self._paused is True and (self._paused_time + self.scheduler.PAUSE_TIME < time.time()):\n            self._paused = 'checking'\n            self._unpause_last_seen = self.active_tasks[0][1] if len(self.active_tasks) else None\n        elif self._paused == 'checking':\n            cnt = 0\n            fail_cnt = 0\n            for _, task in self.active_tasks:\n                if task is self._unpause_last_seen:\n                    break\n                # ignore select task\n                if task.get('type') == self.scheduler.TASK_PACK:\n                    continue\n                cnt += 1\n                if task['track']['process']['ok']:\n                    # break with enough check cnt\n                    cnt = max(cnt, self.scheduler.UNPAUSE_CHECK_NUM)\n                    break\n                else:\n                    fail_cnt += 1\n            if cnt >= self.scheduler.UNPAUSE_CHECK_NUM:\n                if fail_cnt == cnt:\n                    self._paused = True\n                    self._paused_time = time.time()\n                else:\n                    self._paused = False\n\n        return self._paused is True\n\n    def update(self, project_info):\n        self.project_info = project_info\n\n        self.name = project_info['name']\n        self.group = project_info['group']\n        self.db_status = project_info['status']\n        self.updatetime = project_info['updatetime']\n\n        md5sum = utils.md5string(project_info['script'])\n        if self.md5sum != md5sum:\n     
       self.waiting_get_info = True\n            self.md5sum = md5sum\n        if self.waiting_get_info and self.active:\n            self._send_on_get_info = True\n\n        if self.active:\n            self.task_queue.rate = project_info['rate']\n            self.task_queue.burst = project_info['burst']\n        else:\n            self.task_queue.rate = 0\n            self.task_queue.burst = 0\n\n        logger.info('project %s updated, status:%s, paused:%s, %d tasks',\n                    self.name, self.db_status, self.paused, len(self.task_queue))\n\n    def on_get_info(self, info):\n        self.waiting_get_info = False\n        self.min_tick = info.get('min_tick', 0)\n        self.retry_delay = info.get('retry_delay', {})\n        self.crawl_config = info.get('crawl_config', {})\n\n    @property\n    def active(self):\n        return self.db_status in ('RUNNING', 'DEBUG')\n\n\nclass Scheduler(object):\n    UPDATE_PROJECT_INTERVAL = 5 * 60\n    default_schedule = {\n        'priority': 0,\n        'retries': 3,\n        'exetime': 0,\n        'age': -1,\n        'itag': None,\n    }\n    LOOP_LIMIT = 1000\n    LOOP_INTERVAL = 0.1\n    ACTIVE_TASKS = 100\n    INQUEUE_LIMIT = 0\n    EXCEPTION_LIMIT = 3\n    DELETE_TIME = 24 * 60 * 60\n    DEFAULT_RETRY_DELAY = {\n        0: 30,\n        1: 1*60*60,\n        2: 6*60*60,\n        3: 12*60*60,\n        '': 24*60*60\n    }\n    FAIL_PAUSE_NUM = 10\n    PAUSE_TIME = 5*60\n    UNPAUSE_CHECK_NUM = 3\n\n    TASK_PACK = 1\n    STATUS_PACK = 2  # current not used\n    REQUEST_PACK = 3  # current not used\n\n    def __init__(self, taskdb, projectdb, newtask_queue, status_queue,\n                 out_queue, data_path='./data', resultdb=None):\n        self.taskdb = taskdb\n        self.projectdb = projectdb\n        self.resultdb = resultdb\n        self.newtask_queue = newtask_queue\n        self.status_queue = status_queue\n        self.out_queue = out_queue\n        self.data_path = data_path\n\n        
self._send_buffer = deque()\n        self._quit = False\n        self._exceptions = 0\n        self.projects = dict()\n        self._force_update_project = False\n        self._last_update_project = 0\n        self._last_tick = int(time.time())\n        self._postpone_request = []\n\n        self._cnt = {\n            \"5m_time\": counter.CounterManager(\n                lambda: counter.TimebaseAverageEventCounter(30, 10)),\n            \"5m\": counter.CounterManager(\n                lambda: counter.TimebaseAverageWindowCounter(30, 10)),\n            \"1h\": counter.CounterManager(\n                lambda: counter.TimebaseAverageWindowCounter(60, 60)),\n            \"1d\": counter.CounterManager(\n                lambda: counter.TimebaseAverageWindowCounter(10 * 60, 24 * 6)),\n            \"all\": counter.CounterManager(\n                lambda: counter.TotalCounter()),\n        }\n        self._cnt['1h'].load(os.path.join(self.data_path, 'scheduler.1h'))\n        self._cnt['1d'].load(os.path.join(self.data_path, 'scheduler.1d'))\n        self._cnt['all'].load(os.path.join(self.data_path, 'scheduler.all'))\n        self._last_dump_cnt = 0\n\n    def _update_projects(self):\n        '''Check project update'''\n        now = time.time()\n        if (\n                not self._force_update_project\n                and self._last_update_project + self.UPDATE_PROJECT_INTERVAL > now\n        ):\n            return\n        for project in self.projectdb.check_update(self._last_update_project):\n            self._update_project(project)\n            logger.debug(\"project: %s updated.\", project['name'])\n        self._force_update_project = False\n        self._last_update_project = now\n\n    get_info_attributes = ['min_tick', 'retry_delay', 'crawl_config']\n\n    def _update_project(self, project):\n        '''update one project'''\n        if project['name'] not in self.projects:\n            self.projects[project['name']] = Project(self, project)\n        else:\n    
        self.projects[project['name']].update(project)\n\n        project = self.projects[project['name']]\n\n        if project._send_on_get_info:\n            # update project runtime info from processor by sending a _on_get_info\n            # request, result is in status_page.track.save\n            project._send_on_get_info = False\n            self.on_select_task({\n                'taskid': '_on_get_info',\n                'project': project.name,\n                'url': 'data:,_on_get_info',\n                'status': self.taskdb.SUCCESS,\n                'fetch': {\n                    'save': self.get_info_attributes,\n                },\n                'process': {\n                    'callback': '_on_get_info',\n                },\n            })\n\n        # load task queue when project is running and delete task_queue when project is stoped\n        if project.active:\n            if not project.task_loaded:\n                self._load_tasks(project)\n                project.task_loaded = True\n        else:\n            if project.task_loaded:\n                project.task_queue = TaskQueue()\n                project.task_loaded = False\n\n            if project not in self._cnt['all']:\n                self._update_project_cnt(project.name)\n\n    scheduler_task_fields = ['taskid', 'project', 'schedule', ]\n\n    def _load_tasks(self, project):\n        '''load tasks from database'''\n        task_queue = project.task_queue\n\n        for task in self.taskdb.load_tasks(\n                self.taskdb.ACTIVE, project.name, self.scheduler_task_fields\n        ):\n            taskid = task['taskid']\n            _schedule = task.get('schedule', self.default_schedule)\n            priority = _schedule.get('priority', self.default_schedule['priority'])\n            exetime = _schedule.get('exetime', self.default_schedule['exetime'])\n            task_queue.put(taskid, priority, exetime)\n        project.task_loaded = True\n        logger.debug('project: 
%s loaded %d tasks.', project.name, len(task_queue))\n\n        if project not in self._cnt['all']:\n            self._update_project_cnt(project.name)\n        self._cnt['all'].value((project.name, 'pending'), len(project.task_queue))\n\n    def _update_project_cnt(self, project_name):\n        status_count = self.taskdb.status_count(project_name)\n        self._cnt['all'].value(\n            (project_name, 'success'),\n            status_count.get(self.taskdb.SUCCESS, 0)\n        )\n        self._cnt['all'].value(\n            (project_name, 'failed'),\n            status_count.get(self.taskdb.FAILED, 0) + status_count.get(self.taskdb.BAD, 0)\n        )\n        self._cnt['all'].value(\n            (project_name, 'pending'),\n            status_count.get(self.taskdb.ACTIVE, 0)\n        )\n\n    def task_verify(self, task):\n        '''\n        return False if any of 'taskid', 'project', 'url' is missing or empty in task dict\n                        or the project is unknown or not active\n        '''\n        for each in ('taskid', 'project', 'url', ):\n            if each not in task or not task[each]:\n                logger.error('%s not in task: %.200r', each, task)\n                return False\n        if task['project'] not in self.projects:\n            logger.error('unknown project: %s', task['project'])\n            return False\n\n        project = self.projects[task['project']]\n        if not project.active:\n            logger.error('project %s not started, please set status to RUNNING or DEBUG',\n                         task['project'])\n            return False\n        return True\n\n    def insert_task(self, task):\n        '''insert task into database'''\n        return self.taskdb.insert(task['project'], task['taskid'], task)\n\n    def update_task(self, task):\n        '''update task in database'''\n        return self.taskdb.update(task['project'], task['taskid'], task)\n\n    def put_task(self, task):\n        '''put task to task queue'''\n        _schedule 
= task.get('schedule', self.default_schedule)\n        self.projects[task['project']].task_queue.put(\n            task['taskid'],\n            priority=_schedule.get('priority', self.default_schedule['priority']),\n            exetime=_schedule.get('exetime', self.default_schedule['exetime'])\n        )\n\n    def send_task(self, task, force=True):\n        '''\n        dispatch task to fetcher\n\n        out queue may have size limit to prevent block, a send_buffer is used\n        '''\n        try:\n            self.out_queue.put_nowait(task)\n        except Queue.Full:\n            if force:\n                self._send_buffer.appendleft(task)\n            else:\n                raise\n\n    def _check_task_done(self):\n        '''Check status queue'''\n        cnt = 0\n        try:\n            while True:\n                task = self.status_queue.get_nowait()\n                # check _on_get_info result here\n                if task.get('taskid') == '_on_get_info' and 'project' in task and 'track' in task:\n                    if task['project'] not in self.projects:\n                        continue\n                    project = self.projects[task['project']]\n                    project.on_get_info(task['track'].get('save') or {})\n                    logger.info(\n                        '%s on_get_info %r', task['project'], task['track'].get('save', {})\n                    )\n                    continue\n                elif not self.task_verify(task):\n                    continue\n                self.on_task_status(task)\n                cnt += 1\n        except Queue.Empty:\n            pass\n        return cnt\n\n    merge_task_fields = ['taskid', 'project', 'url', 'status', 'schedule', 'lastcrawltime']\n\n    def _check_request(self):\n        '''Check new task queue'''\n        # check _postpone_request first\n        todo = []\n        for task in self._postpone_request:\n            if task['project'] not in self.projects:\n                
continue\n            if self.projects[task['project']].task_queue.is_processing(task['taskid']):\n                todo.append(task)\n            else:\n                self.on_request(task)\n        self._postpone_request = todo\n\n        tasks = {}\n        while len(tasks) < self.LOOP_LIMIT:\n            try:\n                task = self.newtask_queue.get_nowait()\n            except Queue.Empty:\n                break\n\n            if isinstance(task, list):\n                _tasks = task\n            else:\n                _tasks = (task, )\n\n            for task in _tasks:\n                if not self.task_verify(task):\n                    continue\n\n                if task['taskid'] in self.projects[task['project']].task_queue:\n                    if not task.get('schedule', {}).get('force_update', False):\n                        logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task)\n                        continue\n\n                if task['taskid'] in tasks:\n                    if not task.get('schedule', {}).get('force_update', False):\n                        continue\n\n                tasks[task['taskid']] = task\n\n        for task in itervalues(tasks):\n            self.on_request(task)\n\n        return len(tasks)\n\n    def _check_cronjob(self):\n        \"\"\"Check projects cronjob tick, return True when a new tick is sended\"\"\"\n        now = time.time()\n        self._last_tick = int(self._last_tick)\n        if now - self._last_tick < 1:\n            return False\n        self._last_tick += 1\n        for project in itervalues(self.projects):\n            if not project.active:\n                continue\n            if project.waiting_get_info:\n                continue\n            if int(project.min_tick) == 0:\n                continue\n            if self._last_tick % int(project.min_tick) != 0:\n                continue\n            self.on_select_task({\n                'taskid': '_on_cronjob',\n                
'project': project.name,\n                'url': 'data:,_on_cronjob',\n                'status': self.taskdb.SUCCESS,\n                'fetch': {\n                    'save': {\n                        'tick': self._last_tick,\n                    },\n                },\n                'process': {\n                    'callback': '_on_cronjob',\n                },\n            })\n        return True\n\n    request_task_fields = [\n        'taskid',\n        'project',\n        'url',\n        'status',\n        'schedule',\n        'fetch',\n        'process',\n        'track',\n        'lastcrawltime'\n    ]\n\n    def _check_select(self):\n        '''Select task to fetch & process'''\n        while self._send_buffer:\n            _task = self._send_buffer.pop()\n            try:\n                # use force=False here to prevent automatic send_buffer append and get exception\n                self.send_task(_task, False)\n            except Queue.Full:\n                self._send_buffer.append(_task)\n                break\n\n        if self.out_queue.full():\n            return {}\n\n        taskids = []\n        cnt = 0\n        cnt_dict = dict()\n        limit = self.LOOP_LIMIT\n\n        # dynamic assign select limit for each project, use qsize as weight\n        project_weights, total_weight = dict(), 0\n        for project in itervalues(self.projects):  # type:Project\n            if not project.active:\n                continue\n            # only check project pause when select new tasks, cronjob and new request still working\n            if project.paused:\n                continue\n            if project.waiting_get_info:\n                continue\n\n            # task queue\n            task_queue = project.task_queue  # type:TaskQueue\n            pro_weight = task_queue.size()\n            total_weight += pro_weight\n            project_weights[project.name] = pro_weight\n            pass\n\n        min_project_limit = int(limit / 10.)  
# ensure minimum select limit for each project\n        max_project_limit = int(limit / 3.0)  # ensure maximum select limit for each project\n\n        for pro_name, pro_weight in iteritems(project_weights):\n            if cnt >= limit:\n                break\n\n            project = self.projects[pro_name]  # type:Project\n\n            # task queue\n            task_queue = project.task_queue\n            task_queue.check_update()\n            project_cnt = 0\n\n            # calculate select limit for project\n            if total_weight < 1 or pro_weight < 1:\n                project_limit = min_project_limit\n            else:\n                project_limit = int((1.0 * pro_weight / total_weight) * limit)\n                if project_limit < min_project_limit:\n                    project_limit = min_project_limit\n                elif project_limit > max_project_limit:\n                    project_limit = max_project_limit\n\n            # check send_buffer here. when not empty, out_queue may blocked. 
Not sending tasks\n            while cnt < limit and project_cnt < project_limit:\n                taskid = task_queue.get()\n                if not taskid:\n                    break\n\n                taskids.append((project.name, taskid))\n                if taskid != 'on_finished':\n                    project_cnt += 1\n                cnt += 1\n\n            cnt_dict[project.name] = project_cnt\n            if project_cnt:\n                project._selected_tasks = True\n                project._send_finished_event_wait = 0\n\n            # check and send finished event to project\n            if not project_cnt and len(task_queue) == 0 and project._selected_tasks:\n                # wait for self.FAIL_PAUSE_NUM steps to make sure all tasks in queue have been processed\n                if project._send_finished_event_wait < self.FAIL_PAUSE_NUM:\n                    project._send_finished_event_wait += 1\n                else:\n                    project._selected_tasks = False\n                    project._send_finished_event_wait = 0\n\n                    self._postpone_request.append({\n                        'project': project.name,\n                        'taskid': 'on_finished',\n                        'url': 'data:,on_finished',\n                        'process': {\n                            'callback': 'on_finished',\n                        },\n                        \"schedule\": {\n                            \"age\": 0,\n                            \"priority\": 9,\n                            \"force_update\": True,\n                        },\n                    })\n\n        for project, taskid in taskids:\n            self._load_put_task(project, taskid)\n\n        return cnt_dict\n\n    def _load_put_task(self, project, taskid):\n        try:\n            task = self.taskdb.get_task(project, taskid, fields=self.request_task_fields)\n        except ValueError:\n            logger.error('bad task pack %s:%s', project, taskid)\n          
  return\n        if not task:\n            return\n        task = self.on_select_task(task)\n\n    def _print_counter_log(self):\n        # print top 5 active counters\n        keywords = ('pending', 'success', 'retry', 'failed')\n        total_cnt = {}\n        project_actives = []\n        project_fails = []\n        for key in keywords:\n            total_cnt[key] = 0\n        for project, subcounter in iteritems(self._cnt['5m']):\n            actives = 0\n            for key in keywords:\n                cnt = subcounter.get(key, None)\n                if cnt:\n                    cnt = cnt.sum\n                    total_cnt[key] += cnt\n                    actives += cnt\n\n            project_actives.append((actives, project))\n\n            fails = subcounter.get('failed', None)\n            if fails:\n                project_fails.append((fails.sum, project))\n\n        top_2_fails = sorted(project_fails, reverse=True)[:2]\n        top_3_actives = sorted([x for x in project_actives if x[1] not in top_2_fails],\n                               reverse=True)[:5 - len(top_2_fails)]\n\n        log_str = (\"in 5m: new:%(pending)d,success:%(success)d,\"\n                   \"retry:%(retry)d,failed:%(failed)d\" % total_cnt)\n        for _, project in itertools.chain(top_3_actives, top_2_fails):\n            subcounter = self._cnt['5m'][project].to_dict(get_value='sum')\n            log_str += \" %s:%d,%d,%d,%d\" % (project,\n                                            subcounter.get('pending', 0),\n                                            subcounter.get('success', 0),\n                                            subcounter.get('retry', 0),\n                                            subcounter.get('failed', 0))\n        logger.info(log_str)\n\n    def _dump_cnt(self):\n        '''Dump counters to file'''\n        self._cnt['1h'].dump(os.path.join(self.data_path, 'scheduler.1h'))\n        self._cnt['1d'].dump(os.path.join(self.data_path, 'scheduler.1d'))\n      
  self._cnt['all'].dump(os.path.join(self.data_path, 'scheduler.all'))\n\n    def _try_dump_cnt(self):\n        '''Dump counters every 60 seconds'''\n        now = time.time()\n        if now - self._last_dump_cnt > 60:\n            self._last_dump_cnt = now\n            self._dump_cnt()\n            self._print_counter_log()\n\n    def _check_delete(self):\n        '''Check project delete'''\n        now = time.time()\n        for project in list(itervalues(self.projects)):\n            if project.db_status != 'STOP':\n                continue\n            if now - project.updatetime < self.DELETE_TIME:\n                continue\n            if 'delete' not in self.projectdb.split_group(project.group):\n                continue\n\n            logger.warning(\"deleting project: %s!\", project.name)\n            del self.projects[project.name]\n            self.taskdb.drop(project.name)\n            self.projectdb.drop(project.name)\n            if self.resultdb:\n                self.resultdb.drop(project.name)\n            for each in self._cnt.values():\n                del each[project.name]\n\n    def __len__(self):\n        return sum(len(x.task_queue) for x in itervalues(self.projects))\n\n    def quit(self):\n        '''Set quit signal'''\n        self._quit = True\n        # stop xmlrpc server\n        if hasattr(self, 'xmlrpc_server'):\n            self.xmlrpc_ioloop.add_callback(self.xmlrpc_server.stop)\n            self.xmlrpc_ioloop.add_callback(self.xmlrpc_ioloop.stop)\n\n    def run_once(self):\n        '''comsume queues and feed tasks to fetcher, once'''\n\n        self._update_projects()\n        self._check_task_done()\n        self._check_request()\n        while self._check_cronjob():\n            pass\n        self._check_select()\n        self._check_delete()\n        self._try_dump_cnt()\n\n    def run(self):\n        '''Start scheduler loop'''\n        logger.info(\"scheduler starting...\")\n\n        while not self._quit:\n            try:\n 
               time.sleep(self.LOOP_INTERVAL)\n                self.run_once()\n                self._exceptions = 0\n            except KeyboardInterrupt:\n                break\n            except Exception as e:\n                logger.exception(e)\n                self._exceptions += 1\n                if self._exceptions > self.EXCEPTION_LIMIT:\n                    break\n                continue\n\n        logger.info(\"scheduler exiting...\")\n        self._dump_cnt()\n\n    def trigger_on_start(self, project):\n        '''trigger an on_start callback of project'''\n        self.newtask_queue.put({\n            \"project\": project,\n            \"taskid\": \"on_start\",\n            \"url\": \"data:,on_start\",\n            \"process\": {\n                \"callback\": \"on_start\",\n            },\n        })\n\n    def xmlrpc_run(self, port=23333, bind='127.0.0.1', logRequests=False):\n        '''Start xmlrpc interface'''\n        from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication\n\n        application = WSGIXMLRPCApplication()\n\n        application.register_function(self.quit, '_quit')\n        application.register_function(self.__len__, 'size')\n\n        def dump_counter(_time, _type):\n            try:\n                return self._cnt[_time].to_dict(_type)\n            except:\n                logger.exception('')\n        application.register_function(dump_counter, 'counter')\n\n        def new_task(task):\n            if self.task_verify(task):\n                self.newtask_queue.put(task)\n                return True\n            return False\n        application.register_function(new_task, 'newtask')\n\n        def send_task(task):\n            '''dispatch task to fetcher'''\n            self.send_task(task)\n            return True\n        application.register_function(send_task, 'send_task')\n\n        def update_project():\n            self._force_update_project = True\n        application.register_function(update_project, 
'update_project')\n\n        def get_active_tasks(project=None, limit=100):\n            allowed_keys = set((\n                'type',\n                'taskid',\n                'project',\n                'status',\n                'url',\n                'lastcrawltime',\n                'updatetime',\n                'track',\n            ))\n            track_allowed_keys = set((\n                'ok',\n                'time',\n                'follows',\n                'status_code',\n            ))\n\n            iters = [iter(x.active_tasks) for k, x in iteritems(self.projects)\n                     if x and (k == project if project else True)]\n            tasks = [next(x, None) for x in iters]\n            result = []\n\n            while len(result) < limit and tasks and not all(x is None for x in tasks):\n                updatetime, task = t = max(t for t in tasks if t)\n                i = tasks.index(t)\n                tasks[i] = next(iters[i], None)\n                for key in list(task):\n                    if key == 'track':\n                        for k in list(task[key].get('fetch', [])):\n                            if k not in track_allowed_keys:\n                                del task[key]['fetch'][k]\n                        for k in list(task[key].get('process', [])):\n                            if k not in track_allowed_keys:\n                                del task[key]['process'][k]\n                    if key in allowed_keys:\n                        continue\n                    del task[key]\n                result.append(t)\n            # fix for \"<type 'exceptions.TypeError'>:dictionary key must be string\"\n            # have no idea why\n            return json.loads(json.dumps(result))\n        application.register_function(get_active_tasks, 'get_active_tasks')\n\n        def get_projects_pause_status():\n            result = {}\n            for project_name, project in iteritems(self.projects):\n                
result[project_name] = project.paused\n            return result\n        application.register_function(get_projects_pause_status, 'get_projects_pause_status')\n\n        def webui_update():\n            return {\n                'pause_status': get_projects_pause_status(),\n                'counter': {\n                    '5m_time': dump_counter('5m_time', 'avg'),\n                    '5m': dump_counter('5m', 'sum'),\n                    '1h': dump_counter('1h', 'sum'),\n                    '1d': dump_counter('1d', 'sum'),\n                    'all': dump_counter('all', 'sum'),\n                },\n            }\n        application.register_function(webui_update, 'webui_update')\n\n        import tornado.wsgi\n        import tornado.ioloop\n        import tornado.httpserver\n\n        container = tornado.wsgi.WSGIContainer(application)\n        self.xmlrpc_ioloop = tornado.ioloop.IOLoop()\n        self.xmlrpc_server = tornado.httpserver.HTTPServer(container, io_loop=self.xmlrpc_ioloop)\n        self.xmlrpc_server.listen(port=port, address=bind)\n        logger.info('scheduler.xmlrpc listening on %s:%s', bind, port)\n        self.xmlrpc_ioloop.start()\n\n    def on_request(self, task):\n        if self.INQUEUE_LIMIT and len(self.projects[task['project']].task_queue) >= self.INQUEUE_LIMIT:\n            logger.debug('overflow task %(project)s:%(taskid)s %(url)s', task)\n            return\n\n        oldtask = self.taskdb.get_task(task['project'], task['taskid'],\n                                       fields=self.merge_task_fields)\n        if oldtask:\n            return self.on_old_request(task, oldtask)\n        else:\n            return self.on_new_request(task)\n\n    def on_new_request(self, task):\n        '''Called when a new request is arrived'''\n        task['status'] = self.taskdb.ACTIVE\n        self.insert_task(task)\n        self.put_task(task)\n\n        project = task['project']\n        self._cnt['5m'].event((project, 'pending'), +1)\n        
self._cnt['1h'].event((project, 'pending'), +1)\n        self._cnt['1d'].event((project, 'pending'), +1)\n        self._cnt['all'].event((project, 'pending'), +1)\n        logger.info('new task %(project)s:%(taskid)s %(url)s', task)\n        return task\n\n    def on_old_request(self, task, old_task):\n        '''Called when a crawled task is arrived'''\n        now = time.time()\n\n        _schedule = task.get('schedule', self.default_schedule)\n        old_schedule = old_task.get('schedule', {})\n\n        if _schedule.get('force_update') and self.projects[task['project']].task_queue.is_processing(task['taskid']):\n            # when a task is in processing, the modify may conflict with the running task.\n            # postpone the modify after task finished.\n            logger.info('postpone modify task %(project)s:%(taskid)s %(url)s', task)\n            self._postpone_request.append(task)\n            return\n\n        restart = False\n        schedule_age = _schedule.get('age', self.default_schedule['age'])\n        if _schedule.get('itag') and _schedule['itag'] != old_schedule.get('itag'):\n            restart = True\n        elif schedule_age >= 0 and schedule_age + (old_task.get('lastcrawltime', 0) or 0) < now:\n            restart = True\n        elif _schedule.get('force_update'):\n            restart = True\n\n        if not restart:\n            logger.debug('ignore newtask %(project)s:%(taskid)s %(url)s', task)\n            return\n\n        if _schedule.get('cancel'):\n            logger.info('cancel task %(project)s:%(taskid)s %(url)s', task)\n            task['status'] = self.taskdb.BAD\n            self.update_task(task)\n            self.projects[task['project']].task_queue.delete(task['taskid'])\n            return task\n\n        task['status'] = self.taskdb.ACTIVE\n        self.update_task(task)\n        self.put_task(task)\n\n        project = task['project']\n        if old_task['status'] != self.taskdb.ACTIVE:\n            
self._cnt['5m'].event((project, 'pending'), +1)\n            self._cnt['1h'].event((project, 'pending'), +1)\n            self._cnt['1d'].event((project, 'pending'), +1)\n        if old_task['status'] == self.taskdb.SUCCESS:\n            self._cnt['all'].event((project, 'success'), -1).event((project, 'pending'), +1)\n        elif old_task['status'] == self.taskdb.FAILED:\n            self._cnt['all'].event((project, 'failed'), -1).event((project, 'pending'), +1)\n        logger.info('restart task %(project)s:%(taskid)s %(url)s', task)\n        return task\n\n    def on_task_status(self, task):\n        '''Called when a status pack is arrived'''\n        try:\n            procesok = task['track']['process']['ok']\n            if not self.projects[task['project']].task_queue.done(task['taskid']):\n                logging.error('not processing pack: %(project)s:%(taskid)s %(url)s', task)\n                return None\n        except KeyError as e:\n            logger.error(\"Bad status pack: %s\", e)\n            return None\n\n        if procesok:\n            ret = self.on_task_done(task)\n        else:\n            ret = self.on_task_failed(task)\n\n        if task['track']['fetch'].get('time'):\n            self._cnt['5m_time'].event((task['project'], 'fetch_time'),\n                                       task['track']['fetch']['time'])\n        if task['track']['process'].get('time'):\n            self._cnt['5m_time'].event((task['project'], 'process_time'),\n                                       task['track']['process'].get('time'))\n        self.projects[task['project']].active_tasks.appendleft((time.time(), task))\n        return ret\n\n    def on_task_done(self, task):\n        '''Called when a task is done and success, called by `on_task_status`'''\n        task['status'] = self.taskdb.SUCCESS\n        task['lastcrawltime'] = time.time()\n\n        if 'schedule' in task:\n            if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']:\n   
             task['status'] = self.taskdb.ACTIVE\n                next_exetime = task['schedule'].get('age')\n                task['schedule']['exetime'] = time.time() + next_exetime\n                self.put_task(task)\n            else:\n                del task['schedule']\n        self.update_task(task)\n\n        project = task['project']\n        self._cnt['5m'].event((project, 'success'), +1)\n        self._cnt['1h'].event((project, 'success'), +1)\n        self._cnt['1d'].event((project, 'success'), +1)\n        self._cnt['all'].event((project, 'success'), +1).event((project, 'pending'), -1)\n        logger.info('task done %(project)s:%(taskid)s %(url)s', task)\n        return task\n\n    def on_task_failed(self, task):\n        '''Called when a task is failed, called by `on_task_status`'''\n\n        if 'schedule' not in task:\n            old_task = self.taskdb.get_task(task['project'], task['taskid'], fields=['schedule'])\n            if old_task is None:\n                logging.error('unknown status pack: %s' % task)\n                return\n            task['schedule'] = old_task.get('schedule', {})\n\n        retries = task['schedule'].get('retries', self.default_schedule['retries'])\n        retried = task['schedule'].get('retried', 0)\n\n        project_info = self.projects[task['project']]\n        retry_delay = project_info.retry_delay or self.DEFAULT_RETRY_DELAY\n        next_exetime = retry_delay.get(retried, retry_delay.get('', self.DEFAULT_RETRY_DELAY['']))\n\n        if task['schedule'].get('auto_recrawl') and 'age' in task['schedule']:\n            next_exetime = min(next_exetime, task['schedule'].get('age'))\n        else:\n            if retried >= retries:\n                next_exetime = -1\n            elif 'age' in task['schedule'] and next_exetime > task['schedule'].get('age'):\n                next_exetime = task['schedule'].get('age')\n\n        if next_exetime < 0:\n            task['status'] = self.taskdb.FAILED\n            
task['lastcrawltime'] = time.time()\n            self.update_task(task)\n\n            project = task['project']\n            self._cnt['5m'].event((project, 'failed'), +1)\n            self._cnt['1h'].event((project, 'failed'), +1)\n            self._cnt['1d'].event((project, 'failed'), +1)\n            self._cnt['all'].event((project, 'failed'), +1).event((project, 'pending'), -1)\n            logger.info('task failed %(project)s:%(taskid)s %(url)s' % task)\n            return task\n        else:\n            task['schedule']['retried'] = retried + 1\n            task['schedule']['exetime'] = time.time() + next_exetime\n            task['lastcrawltime'] = time.time()\n            self.update_task(task)\n            self.put_task(task)\n\n            project = task['project']\n            self._cnt['5m'].event((project, 'retry'), +1)\n            self._cnt['1h'].event((project, 'retry'), +1)\n            self._cnt['1d'].event((project, 'retry'), +1)\n            # self._cnt['all'].event((project, 'retry'), +1)\n            logger.info('task retry %d/%d %%(project)s:%%(taskid)s %%(url)s' % (\n                retried, retries), task)\n            return task\n\n    def on_select_task(self, task):\n        '''Called when a task is selected to fetch & process'''\n        # inject informations about project\n        logger.info('select %(project)s:%(taskid)s %(url)s', task)\n\n        project_info = self.projects.get(task['project'])\n        assert project_info, 'no such project'\n        task['type'] = self.TASK_PACK\n        task['group'] = project_info.group\n        task['project_md5sum'] = project_info.md5sum\n        task['project_updatetime'] = project_info.updatetime\n\n        # lazy join project.crawl_config\n        if getattr(project_info, 'crawl_config', None):\n            task = BaseHandler.task_join_crawl_config(task, project_info.crawl_config)\n\n        project_info.active_tasks.appendleft((time.time(), task))\n        self.send_task(task)\n        
return task\n\n\nfrom tornado import gen\n\n\nclass OneScheduler(Scheduler):\n    \"\"\"\n    Scheduler Mixin class for one mode\n\n    overwirted send_task method\n    call processor.on_task(fetcher.fetch(task)) instead of consuming queue\n    \"\"\"\n\n    def _check_select(self):\n        \"\"\"\n        interactive mode of select tasks\n        \"\"\"\n        if not self.interactive:\n            return super(OneScheduler, self)._check_select()\n\n        # waiting for running tasks\n        if self.running_task > 0:\n            return\n\n        is_crawled = []\n\n        def run(project=None):\n            return crawl('on_start', project=project)\n\n        def crawl(url, project=None, **kwargs):\n            \"\"\"\n            Crawl given url, same parameters as BaseHandler.crawl\n\n            url - url or taskid, parameters will be used if in taskdb\n            project - can be ignored if only one project exists.\n            \"\"\"\n\n            # looking up the project instance\n            if project is None:\n                if len(self.projects) == 1:\n                    project = list(self.projects.keys())[0]\n                else:\n                    raise LookupError('You need specify the project: %r'\n                                      % list(self.projects.keys()))\n            project_data = self.processor.project_manager.get(project)\n            if not project_data:\n                raise LookupError('no such project: %s' % project)\n\n            # get task package\n            instance = project_data['instance']\n            instance._reset()\n            task = instance.crawl(url, **kwargs)\n            if isinstance(task, list):\n                raise Exception('url list is not allowed in interactive mode')\n\n            # check task in taskdb\n            if not kwargs:\n                dbtask = self.taskdb.get_task(task['project'], task['taskid'],\n                                              
fields=self.request_task_fields)\n                if not dbtask:\n                    dbtask = self.taskdb.get_task(task['project'], task['url'],\n                                                  fields=self.request_task_fields)\n                if dbtask:\n                    task = dbtask\n\n            # select the task\n            self.on_select_task(task)\n            is_crawled.append(True)\n\n            shell.ask_exit()\n\n        def quit_interactive():\n            '''Quit interactive mode'''\n            is_crawled.append(True)\n            self.interactive = False\n            shell.ask_exit()\n\n        def quit_pyspider():\n            '''Close pyspider'''\n            is_crawled[:] = []\n            shell.ask_exit()\n\n        shell = utils.get_python_console()\n        banner = (\n            'pyspider shell - Select task\\n'\n            'crawl(url, project=None, **kwargs) - same parameters as BaseHandler.crawl\\n'\n            'quit_interactive() - Quit interactive mode\\n'\n            'quit_pyspider() - Close pyspider'\n        )\n        if hasattr(shell, 'show_banner'):\n            shell.show_banner(banner)\n            shell.interact()\n        else:\n            shell.interact(banner)\n        if not is_crawled:\n            self.ioloop.add_callback(self.ioloop.stop)\n\n    def __getattr__(self, name):\n        \"\"\"patch for crawl(url, callback=self.index_page) API\"\"\"\n        if self.interactive:\n            return name\n        raise AttributeError(name)\n\n    def on_task_status(self, task):\n        \"\"\"Ignore not processing error in interactive mode\"\"\"\n        if not self.interactive:\n            super(OneScheduler, self).on_task_status(task)\n\n        try:\n            procesok = task['track']['process']['ok']\n        except KeyError as e:\n            logger.error(\"Bad status pack: %s\", e)\n            return None\n\n        if procesok:\n            ret = self.on_task_done(task)\n        else:\n            ret = 
self.on_task_failed(task)\n        if task['track']['fetch'].get('time'):\n            self._cnt['5m_time'].event((task['project'], 'fetch_time'),\n                                       task['track']['fetch']['time'])\n        if task['track']['process'].get('time'):\n            self._cnt['5m_time'].event((task['project'], 'process_time'),\n                                       task['track']['process'].get('time'))\n        self.projects[task['project']].active_tasks.appendleft((time.time(), task))\n        return ret\n\n    def init_one(self, ioloop, fetcher, processor,\n                 result_worker=None, interactive=False):\n        self.ioloop = ioloop\n        self.fetcher = fetcher\n        self.processor = processor\n        self.result_worker = result_worker\n        self.interactive = interactive\n        self.running_task = 0\n\n    @gen.coroutine\n    def do_task(self, task):\n        self.running_task += 1\n        result = yield gen.Task(self.fetcher.fetch, task)\n        type, task, response = result.args\n        self.processor.on_task(task, response)\n        # do with message\n        while not self.processor.inqueue.empty():\n            _task, _response = self.processor.inqueue.get()\n            self.processor.on_task(_task, _response)\n        # do with results\n        while not self.processor.result_queue.empty():\n            _task, _result = self.processor.result_queue.get()\n            if self.result_worker:\n                self.result_worker.on_result(_task, _result)\n        self.running_task -= 1\n\n    def send_task(self, task, force=True):\n        if self.fetcher.http_client.free_size() <= 0:\n            if force:\n                self._send_buffer.appendleft(task)\n            else:\n                raise self.outqueue.Full\n        self.ioloop.add_future(self.do_task(task), lambda x: x.result())\n\n    def run(self):\n        import tornado.ioloop\n        tornado.ioloop.PeriodicCallback(self.run_once, 100,\n                 
                       io_loop=self.ioloop).start()\n        self.ioloop.start()\n\n    def quit(self):\n        self.ioloop.stop()\n        logger.info(\"scheduler exiting...\")\n\n\nimport random\nimport threading\nfrom pyspider.database.sqlite.sqlitebase import SQLiteMixin\n\n\nclass ThreadBaseScheduler(Scheduler):\n    def __init__(self, threads=4, *args, **kwargs):\n        self.local = threading.local()\n\n        super(ThreadBaseScheduler, self).__init__(*args, **kwargs)\n\n        if isinstance(self.taskdb, SQLiteMixin):\n            self.threads = 1\n        else:\n            self.threads = threads\n\n        self._taskdb = self.taskdb\n        self._projectdb = self.projectdb\n        self._resultdb = self.resultdb\n\n        self.thread_objs = []\n        self.thread_queues = []\n        self._start_threads()\n        assert len(self.thread_queues) > 0\n\n    @property\n    def taskdb(self):\n        if not hasattr(self.local, 'taskdb'):\n            self.taskdb = self._taskdb.copy()\n        return self.local.taskdb\n\n    @taskdb.setter\n    def taskdb(self, taskdb):\n        self.local.taskdb = taskdb\n\n    @property\n    def projectdb(self):\n        if not hasattr(self.local, 'projectdb'):\n            self.projectdb = self._projectdb.copy()\n        return self.local.projectdb\n\n    @projectdb.setter\n    def projectdb(self, projectdb):\n        self.local.projectdb = projectdb\n\n    @property\n    def resultdb(self):\n        if not hasattr(self.local, 'resultdb'):\n            self.resultdb = self._resultdb.copy()\n        return self.local.resultdb\n\n    @resultdb.setter\n    def resultdb(self, resultdb):\n        self.local.resultdb = resultdb\n\n    def _start_threads(self):\n        for i in range(self.threads):\n            queue = Queue.Queue()\n            thread = threading.Thread(target=self._thread_worker, args=(queue, ))\n            thread.daemon = True\n            thread.start()\n            self.thread_objs.append(thread)\n    
        self.thread_queues.append(queue)\n\n    def _thread_worker(self, queue):\n        while True:\n            method, args, kwargs = queue.get()\n            try:\n                method(*args, **kwargs)\n            except Exception as e:\n                logger.exception(e)\n\n    def _run_in_thread(self, method, *args, **kwargs):\n        i = kwargs.pop('_i', None)\n        block = kwargs.pop('_block', False)\n\n        if i is None:\n            while True:\n                for queue in self.thread_queues:\n                    if queue.empty():\n                        break\n                else:\n                    if block:\n                        time.sleep(0.1)\n                        continue\n                    else:\n                        queue = self.thread_queues[random.randint(0, len(self.thread_queues)-1)]\n                break\n        else:\n            queue = self.thread_queues[i % len(self.thread_queues)]\n\n        queue.put((method, args, kwargs))\n\n        if block:\n            self._wait_thread()\n\n    def _wait_thread(self):\n        while True:\n            if all(queue.empty() for queue in self.thread_queues):\n                break\n            time.sleep(0.1)\n\n    def _update_project(self, project):\n        self._run_in_thread(Scheduler._update_project, self, project)\n\n    def on_task_status(self, task):\n        i = hash(task['taskid'])\n        self._run_in_thread(Scheduler.on_task_status, self, task, _i=i)\n\n    def on_request(self, task):\n        i = hash(task['taskid'])\n        self._run_in_thread(Scheduler.on_request, self, task, _i=i)\n\n    def _load_put_task(self, project, taskid):\n        i = hash(taskid)\n        self._run_in_thread(Scheduler._load_put_task, self, project, taskid, _i=i)\n\n    def run_once(self):\n        super(ThreadBaseScheduler, self).run_once()\n        self._wait_thread()\n"
  },
  {
    "path": "pyspider/scheduler/task_queue.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-07 13:12:10\n\nimport heapq\nimport logging\nimport threading\nimport time\n\ntry:\n    from UserDict import DictMixin\nexcept ImportError:\n    from collections import Mapping as DictMixin\nfrom .token_bucket import Bucket\nfrom six.moves import queue as Queue\n\nlogger = logging.getLogger('scheduler')\n\ntry:\n    cmp\nexcept NameError:\n    cmp = lambda x, y: (x > y) - (x < y)\n\n\nclass AtomInt(object):\n    __value__ = 0\n    __mutex__ = threading.RLock()\n\n    @classmethod\n    def get_value(cls):\n        cls.__mutex__.acquire()\n        cls.__value__ = cls.__value__ + 1\n        value = cls.__value__\n        cls.__mutex__.release()\n        return value\n\n\nclass InQueueTask(DictMixin):\n    __slots__ = ('taskid', 'priority', 'exetime', 'sequence')\n    __getitem__ = lambda *x: getattr(*x)\n    __setitem__ = lambda *x: setattr(*x)\n    __iter__ = lambda self: iter(self.__slots__)\n    __len__ = lambda self: len(self.__slots__)\n    keys = lambda self: self.__slots__\n\n    def __init__(self, taskid, priority=0, exetime=0):\n        self.taskid = taskid\n        self.priority = priority\n        self.exetime = exetime\n        self.sequence = AtomInt.get_value()\n\n    def __cmp__(self, other):\n        if self.exetime == 0 and other.exetime == 0:\n            diff = -cmp(self.priority, other.priority)\n        else:\n            diff = cmp(self.exetime, other.exetime)\n\n        # compare in-queue sequence number finally if two element has the same\n        # priority or exetime\n        return diff if diff != 0 else cmp(self.sequence, other.sequence)\n\n    def __lt__(self, other):\n        return self.__cmp__(other) < 0\n\n\nclass PriorityTaskQueue(Queue.Queue):\n    '''\n    TaskQueue\n\n    Same taskid items will been merged\n    '''\n\n    def _init(self, 
maxsize):\n        self.queue = []\n        self.queue_dict = dict()\n\n    def _qsize(self, len=len):\n        return len(self.queue_dict)\n\n    def _put(self, item, heappush=heapq.heappush):\n        if item.taskid in self.queue_dict:\n            task = self.queue_dict[item.taskid]\n            changed = False\n            if item < task:\n                changed = True\n            task.priority = max(item.priority, task.priority)\n            task.exetime = min(item.exetime, task.exetime)\n            if changed:\n                self._resort()\n        else:\n            heappush(self.queue, item)\n            self.queue_dict[item.taskid] = item\n\n    def _get(self, heappop=heapq.heappop):\n        while self.queue:\n            item = heappop(self.queue)\n            if item.taskid is None:\n                continue\n            self.queue_dict.pop(item.taskid, None)\n            return item\n        return None\n\n    @property\n    def top(self):\n        while self.queue and self.queue[0].taskid is None:\n            heapq.heappop(self.queue)\n        if self.queue:\n            return self.queue[0]\n        return None\n\n    def _resort(self):\n        heapq.heapify(self.queue)\n\n    def __contains__(self, taskid):\n        return taskid in self.queue_dict\n\n    def __getitem__(self, taskid):\n        return self.queue_dict[taskid]\n\n    def __setitem__(self, taskid, item):\n        assert item.taskid == taskid\n        self.put(item)\n\n    def __delitem__(self, taskid):\n        self.queue_dict.pop(taskid).taskid = None\n\n\nclass TaskQueue(object):\n    '''\n    task queue for scheduler, have a priority queue and a time queue for delayed tasks\n    '''\n    processing_timeout = 10 * 60\n\n    def __init__(self, rate=0, burst=0):\n        self.mutex = threading.RLock()\n        self.priority_queue = PriorityTaskQueue()\n        self.time_queue = PriorityTaskQueue()\n        self.processing = PriorityTaskQueue()\n        self.bucket = 
Bucket(rate=rate, burst=burst)\n\n    @property\n    def rate(self):\n        return self.bucket.rate\n\n    @rate.setter\n    def rate(self, value):\n        self.bucket.rate = value\n\n    @property\n    def burst(self):\n        return self.bucket.burst\n\n    @burst.setter\n    def burst(self, value):\n        self.bucket.burst = value\n\n    def check_update(self):\n        '''\n        Check time queue and processing queue\n\n        put tasks to priority queue when execute time arrived or process timeout\n        '''\n        self._check_time_queue()\n        self._check_processing()\n\n    def _check_time_queue(self):\n        now = time.time()\n        self.mutex.acquire()\n        while self.time_queue.qsize() and self.time_queue.top and self.time_queue.top.exetime < now:\n            task = self.time_queue.get_nowait()  # type: InQueueTask\n            task.exetime = 0\n            self.priority_queue.put(task)\n        self.mutex.release()\n\n    def _check_processing(self):\n        now = time.time()\n        self.mutex.acquire()\n        while self.processing.qsize() and self.processing.top and self.processing.top.exetime < now:\n            task = self.processing.get_nowait()\n            if task.taskid is None:\n                continue\n            task.exetime = 0\n            self.priority_queue.put(task)\n            logger.info(\"processing: retry %s\", task.taskid)\n        self.mutex.release()\n\n    def put(self, taskid, priority=0, exetime=0):\n        \"\"\"\n        Put a task into task queue\n        \n        when use heap sort, if we put tasks(with the same priority and exetime=0) into queue,\n        the queue is not a strict FIFO queue, but more like a FILO stack.\n        It is very possible that when there are continuous big flow, the speed of select is \n        slower than request, resulting in priority-queue accumulation in short time.\n        In this scenario, the tasks more earlier entering the priority-queue will not get \n  
      processed until the request flow becomes small. \n        \n        Thus, we store a global atom self increasing value into task.sequence which represent \n        the task enqueue sequence. When the comparison of exetime and priority have no \n        difference, we compare task.sequence to ensure that the entire queue is ordered.\n        \"\"\"\n        now = time.time()\n\n        task = InQueueTask(taskid, priority, exetime)\n\n        self.mutex.acquire()\n        if taskid in self.priority_queue:\n            self.priority_queue.put(task)\n        elif taskid in self.time_queue:\n            self.time_queue.put(task)\n        elif taskid in self.processing and self.processing[taskid].taskid:\n            # force update a processing task is not allowed as there are so many\n            # problems may happen\n            pass\n        else:\n            if exetime and exetime > now:\n                self.time_queue.put(task)\n            else:\n                task.exetime = 0\n                self.priority_queue.put(task)\n\n        self.mutex.release()\n\n    def get(self):\n        '''Get a task from queue when bucket available'''\n        if self.bucket.get() < 1:\n            return None\n        now = time.time()\n        self.mutex.acquire()\n        try:\n            task = self.priority_queue.get_nowait()\n            self.bucket.desc()\n        except Queue.Empty:\n            self.mutex.release()\n            return None\n        task.exetime = now + self.processing_timeout\n        self.processing.put(task)\n        self.mutex.release()\n        return task.taskid\n\n    def done(self, taskid):\n        '''Mark task done'''\n        if taskid in self.processing:\n            self.mutex.acquire()\n            if taskid in self.processing:\n                del self.processing[taskid]\n            self.mutex.release()\n            return True\n        return False\n\n    def delete(self, taskid):\n        if taskid not in self:\n            
return False\n        if taskid in self.priority_queue:\n            self.mutex.acquire()\n            del self.priority_queue[taskid]\n            self.mutex.release()\n        elif taskid in self.time_queue:\n            self.mutex.acquire()\n            del self.time_queue[taskid]\n            self.mutex.release()\n        elif taskid in self.processing:\n            self.done(taskid)\n        return True\n\n    def size(self):\n        return self.priority_queue.qsize() + self.time_queue.qsize() + self.processing.qsize()\n\n    def is_processing(self, taskid):\n        '''\n        return True if taskid is in processing\n        '''\n        return taskid in self.processing and self.processing[taskid].taskid\n\n    def __len__(self):\n        return self.size()\n\n    def __contains__(self, taskid):\n        if taskid in self.priority_queue or taskid in self.time_queue:\n            return True\n        if taskid in self.processing and self.processing[taskid].taskid:\n            return True\n        return False\n\n\nif __name__ == '__main__':\n    task_queue = TaskQueue()\n    task_queue.processing_timeout = 0.1\n    task_queue.put('a3', 3, time.time() + 0.1)\n    task_queue.put('a1', 1)\n    task_queue.put('a2', 2)\n    assert task_queue.get() == 'a2'\n    time.sleep(0.1)\n    task_queue._check_time_queue()\n    assert task_queue.get() == 'a3'\n    assert task_queue.get() == 'a1'\n    task_queue._check_processing()\n    assert task_queue.get() == 'a2'\n    assert len(task_queue) == 0\n"
  },
  {
    "path": "pyspider/scheduler/token_bucket.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-07 16:53:08\n\nimport time\ntry:\n    import threading as _threading\nexcept ImportError:\n    import dummy_threading as _threading\n\n\nclass Bucket(object):\n\n    '''\n    traffic flow control with token bucket\n    '''\n\n    update_interval = 30\n\n    def __init__(self, rate=1, burst=None):\n        self.rate = float(rate)\n        if burst is None:\n            self.burst = float(rate) * 10\n        else:\n            self.burst = float(burst)\n        self.mutex = _threading.Lock()\n        self.bucket = self.burst\n        self.last_update = time.time()\n\n    def get(self):\n        '''Get the number of tokens in bucket'''\n        now = time.time()\n        if self.bucket >= self.burst:\n            self.last_update = now\n            return self.bucket\n        bucket = self.rate * (now - self.last_update)\n        self.mutex.acquire()\n        if bucket > 1:\n            self.bucket += bucket\n            if self.bucket > self.burst:\n                self.bucket = self.burst\n            self.last_update = now\n        self.mutex.release()\n        return self.bucket\n\n    def set(self, value):\n        '''Set number of tokens in bucket'''\n        self.bucket = value\n\n    def desc(self, value=1):\n        '''Use value tokens'''\n        self.bucket -= value\n"
  },
  {
    "path": "pyspider/webui/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-22 23:20:40\n\nfrom . import app, index, debug, task, result, login\n"
  },
  {
    "path": "pyspider/webui/app.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-22 23:17:13\n\nimport os\nimport sys\nimport logging\nlogger = logging.getLogger(\"webui\")\n\nfrom six import reraise\nfrom six.moves import builtins\nfrom six.moves.urllib.parse import urljoin\nfrom flask import Flask\nfrom pyspider.fetcher import tornado_fetcher\n\nif os.name == 'nt':\n    import mimetypes\n    mimetypes.add_type(\"text/css\", \".css\", True)\n\n\nclass QuitableFlask(Flask):\n    \"\"\"Add quit() method to Flask object\"\"\"\n\n    @property\n    def logger(self):\n        return logger\n\n    def run(self, host=None, port=None, debug=None, **options):\n        import tornado.wsgi\n        import tornado.ioloop\n        import tornado.httpserver\n        import tornado.web\n\n        if host is None:\n            host = '127.0.0.1'\n        if port is None:\n            server_name = self.config['SERVER_NAME']\n            if server_name and ':' in server_name:\n                port = int(server_name.rsplit(':', 1)[1])\n            else:\n                port = 5000\n        if debug is not None:\n            self.debug = bool(debug)\n\n        hostname = host\n        port = port\n        application = self\n        use_reloader = self.debug\n        use_debugger = self.debug\n\n        if use_debugger:\n            from werkzeug.debug import DebuggedApplication\n            application = DebuggedApplication(application, True)\n\n        try:\n            from .webdav import dav_app\n        except ImportError as e:\n            logger.warning('WebDav interface not enabled: %r', e)\n            dav_app = None\n        if dav_app:\n            from werkzeug.wsgi import DispatcherMiddleware\n            application = DispatcherMiddleware(application, {\n                '/dav': dav_app\n            })\n\n        container = 
tornado.wsgi.WSGIContainer(application)\n        self.http_server = tornado.httpserver.HTTPServer(container)\n        self.http_server.listen(port, hostname)\n        if use_reloader:\n            from tornado import autoreload\n            autoreload.start()\n\n        self.logger.info('webui running on %s:%s', hostname, port)\n        self.ioloop = tornado.ioloop.IOLoop.current()\n        self.ioloop.start()\n\n    def quit(self):\n        if hasattr(self, 'ioloop'):\n            self.ioloop.add_callback(self.http_server.stop)\n            self.ioloop.add_callback(self.ioloop.stop)\n        self.logger.info('webui exiting...')\n\n\napp = QuitableFlask('webui',\n                    static_folder=os.path.join(os.path.dirname(__file__), 'static'),\n                    template_folder=os.path.join(os.path.dirname(__file__), 'templates'))\napp.secret_key = os.urandom(24)\napp.jinja_env.line_statement_prefix = '#'\napp.jinja_env.globals.update(builtins.__dict__)\n\napp.config.update({\n    'fetch': lambda x: tornado_fetcher.Fetcher(None, None, async_mode=False).fetch(x),\n    'taskdb': None,\n    'projectdb': None,\n    'scheduler_rpc': None,\n    'queues': dict(),\n    'process_time_limit': 30,\n})\n\n\ndef cdn_url_handler(error, endpoint, kwargs):\n    if endpoint == 'cdn':\n        path = kwargs.pop('path')\n        # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/')\n        # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/')\n        cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/')\n        return urljoin(cdn, path)\n    else:\n        exc_type, exc_value, tb = sys.exc_info()\n        if exc_value is error:\n            reraise(exc_type, exc_value, tb)\n        else:\n            raise error\napp.handle_url_build_error = cdn_url_handler\n"
  },
  {
    "path": "pyspider/webui/bench_test.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-08 22:31:17\n\nimport random\ntry:\n    from urllib import urlencode\nexcept ImportError:\n    from urllib.parse import urlencode\n\nfrom flask import request\nfrom .app import app\n\n\n@app.route('/bench')\ndef bench_test():\n    total = int(request.args.get('total', 10000))\n    show = int(request.args.get('show', 20))\n    nlist = [random.randint(1, total) for _ in range(show)]\n    result = []\n    result.append(\"<html><head></head><body>\")\n    args = dict(request.args)\n    for nl in nlist:\n        args['n'] = nl\n        argstr = urlencode(sorted(args.items()), doseq=True)\n        result.append(\"<a href='/bench?{0}'>follow {1}</a><br>\".format(argstr, nl))\n    result.append(\"</body></html>\")\n    return \"\".join(result)\n"
  },
  {
    "path": "pyspider/webui/debug.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-23 00:19:06\n\n\nimport sys\nimport time\nimport socket\nimport inspect\nimport datetime\nimport traceback\nfrom flask import render_template, request, json\n\ntry:\n    import flask_login as login\nexcept ImportError:\n    from flask.ext import login\n\nfrom pyspider.libs import utils, sample_handler, dataurl\nfrom pyspider.libs.response import rebuild_response\nfrom pyspider.processor.project_module import ProjectManager, ProjectFinder\nfrom .app import app\n\ndefault_task = {\n    'taskid': 'data:,on_start',\n    'project': '',\n    'url': 'data:,on_start',\n    'process': {\n        'callback': 'on_start',\n    },\n}\ndefault_script = inspect.getsource(sample_handler)\n\n\n@app.route('/debug/<project>', methods=['GET', 'POST'])\ndef debug(project):\n    projectdb = app.config['projectdb']\n    if not projectdb.verify_project_name(project):\n        return 'project name is not allowed!', 400\n    info = projectdb.get(project, fields=['name', 'script'])\n    if info:\n        script = info['script']\n    else:\n        script = (default_script\n                  .replace('__DATE__', datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"))\n                  .replace('__PROJECT_NAME__', project)\n                  .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__'))\n\n    taskid = request.args.get('taskid')\n    if taskid:\n        taskdb = app.config['taskdb']\n        task = taskdb.get_task(\n            project, taskid, ['taskid', 'project', 'url', 'fetch', 'process'])\n    else:\n        task = default_task\n\n    default_task['project'] = project\n    return render_template(\"debug.html\", task=task, script=script, project_name=project)\n\n\n@app.before_first_request\ndef enable_projects_import():\n    
sys.meta_path.append(ProjectFinder(app.config['projectdb']))\n\n\n@app.route('/debug/<project>/run', methods=['POST', ])\ndef run(project):\n    start_time = time.time()\n    try:\n        task = utils.decode_unicode_obj(json.loads(request.form['task']))\n    except Exception:\n        result = {\n            'fetch_result': \"\",\n            'logs': u'task json error',\n            'follows': [],\n            'messages': [],\n            'result': None,\n            'time': time.time() - start_time,\n        }\n        return json.dumps(utils.unicode_obj(result)), \\\n            200, {'Content-Type': 'application/json'}\n\n    project_info = {\n        'name': project,\n        'status': 'DEBUG',\n        'script': request.form['script'],\n    }\n\n    if request.form.get('webdav_mode') == 'true':\n        projectdb = app.config['projectdb']\n        info = projectdb.get(project, fields=['name', 'script'])\n        if not info:\n            result = {\n                'fetch_result': \"\",\n                'logs': u' in wevdav mode, cannot load script',\n                'follows': [],\n                'messages': [],\n                'result': None,\n                'time': time.time() - start_time,\n            }\n            return json.dumps(utils.unicode_obj(result)), \\\n                200, {'Content-Type': 'application/json'}\n        project_info['script'] = info['script']\n\n    fetch_result = {}\n    try:\n        module = ProjectManager.build_module(project_info, {\n            'debugger': True,\n            'process_time_limit': app.config['process_time_limit'],\n        })\n\n        # The code below is to mock the behavior that crawl_config been joined when selected by scheduler.\n        # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True`\n        # crawl_config = module['instance'].crawl_config\n        # task = module['instance'].task_join_crawl_config(task, crawl_config)\n\n        
fetch_result = app.config['fetch'](task)\n        response = rebuild_response(fetch_result)\n\n        ret = module['instance'].run_task(module['module'], task, response)\n    except Exception:\n        type, value, tb = sys.exc_info()\n        tb = utils.hide_me(tb, globals())\n        logs = ''.join(traceback.format_exception(type, value, tb))\n        result = {\n            'fetch_result': fetch_result,\n            'logs': logs,\n            'follows': [],\n            'messages': [],\n            'result': None,\n            'time': time.time() - start_time,\n        }\n    else:\n        result = {\n            'fetch_result': fetch_result,\n            'logs': ret.logstr(),\n            'follows': ret.follows,\n            'messages': ret.messages,\n            'result': ret.result,\n            'time': time.time() - start_time,\n        }\n        result['fetch_result']['content'] = response.text\n        if (response.headers.get('content-type', '').startswith('image')):\n            result['fetch_result']['dataurl'] = dataurl.encode(\n                response.content, response.headers['content-type'])\n\n    try:\n        # binary data can't encode to JSON, encode result as unicode obj\n        # before send it to frontend\n        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}\n    except Exception:\n        type, value, tb = sys.exc_info()\n        tb = utils.hide_me(tb, globals())\n        logs = ''.join(traceback.format_exception(type, value, tb))\n        result = {\n            'fetch_result': \"\",\n            'logs': logs,\n            'follows': [],\n            'messages': [],\n            'result': None,\n            'time': time.time() - start_time,\n        }\n        return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}\n\n\n@app.route('/debug/<project>/save', methods=['POST', ])\ndef save(project):\n    projectdb = app.config['projectdb']\n    if not 
projectdb.verify_project_name(project):\n        return 'project name is not allowed!', 400\n    script = request.form['script']\n    project_info = projectdb.get(project, fields=['name', 'status', 'group'])\n    if project_info and 'lock' in projectdb.split_group(project_info.get('group')) \\\n            and not login.current_user.is_active():\n        return app.login_response\n\n    if project_info:\n        info = {\n            'script': script,\n        }\n        if project_info.get('status') in ('DEBUG', 'RUNNING', ):\n            info['status'] = 'CHECKING'\n        projectdb.update(project, info)\n    else:\n        info = {\n            'name': project,\n            'script': script,\n            'status': 'TODO',\n            'rate': app.config.get('max_rate', 1),\n            'burst': app.config.get('max_burst', 3),\n        }\n        projectdb.insert(project, info)\n\n    rpc = app.config['scheduler_rpc']\n    if rpc is not None:\n        try:\n            rpc.update_project()\n        except socket.error as e:\n            app.logger.warning('connect to scheduler rpc error: %r', e)\n            return 'rpc error', 200\n\n    return 'ok', 200\n\n\n@app.route('/debug/<project>/get')\ndef get_script(project):\n    projectdb = app.config['projectdb']\n    if not projectdb.verify_project_name(project):\n        return 'project name is not allowed!', 400\n    info = projectdb.get(project, fields=['name', 'script'])\n    return json.dumps(utils.unicode_obj(info)), \\\n        200, {'Content-Type': 'application/json'}\n\n\n@app.route('/blank.html')\ndef blank_html():\n    return \"\"\n"
  },
  {
    "path": "pyspider/webui/index.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-22 23:20:39\n\nimport socket\n\nfrom six import iteritems, itervalues\nfrom flask import render_template, request, json\n\ntry:\n    import flask_login as login\nexcept ImportError:\n    from flask.ext import login\n\nfrom .app import app\n\nindex_fields = ['name', 'group', 'status', 'comments', 'rate', 'burst', 'updatetime']\n\n\n@app.route('/')\ndef index():\n    projectdb = app.config['projectdb']\n    projects = sorted(projectdb.get_all(fields=index_fields),\n                      key=lambda k: (0 if k['group'] else 1, k['group'] or '', k['name']))\n    return render_template(\"index.html\", projects=projects)\n\n\n@app.route('/queues')\ndef get_queues():\n    def try_get_qsize(queue):\n        if queue is None:\n            return 'None'\n        try:\n            return queue.qsize()\n        except Exception as e:\n            return \"%r\" % e\n\n    result = {}\n    queues = app.config.get('queues', {})\n    for key in queues:\n        result[key] = try_get_qsize(queues[key])\n    return json.dumps(result), 200, {'Content-Type': 'application/json'}\n\n\n@app.route('/update', methods=['POST', ])\ndef project_update():\n    projectdb = app.config['projectdb']\n    project = request.form['pk']\n    name = request.form['name']\n    value = request.form['value']\n\n    project_info = projectdb.get(project, fields=('name', 'group'))\n    if not project_info:\n        return \"no such project.\", 404\n    if 'lock' in projectdb.split_group(project_info.get('group')) \\\n            and not login.current_user.is_active():\n        return app.login_response\n\n    if name not in ('group', 'status', 'rate'):\n        return 'unknown field: %s' % name, 400\n    if name == 'rate':\n        value = value.split('/')\n        if len(value) != 2:\n            return 'format error: 
rate/burst', 400\n        rate = float(value[0])\n        burst = float(value[1])\n        update = {\n            'rate': min(rate, app.config.get('max_rate', rate)),\n            'burst': min(burst, app.config.get('max_burst', burst)),\n        }\n    else:\n        update = {\n            name: value\n        }\n\n    ret = projectdb.update(project, update)\n    if ret:\n        rpc = app.config['scheduler_rpc']\n        if rpc is not None:\n            try:\n                rpc.update_project()\n            except socket.error as e:\n                app.logger.warning('connect to scheduler rpc error: %r', e)\n                return 'rpc error', 200\n        return 'ok', 200\n    else:\n        app.logger.warning(\"[webui index] projectdb.update() error - res: {}\".format(ret))\n        return 'update error', 500\n\n\n@app.route('/counter')\ndef counter():\n    rpc = app.config['scheduler_rpc']\n    if rpc is None:\n        return json.dumps({})\n\n    result = {}\n    try:\n        data = rpc.webui_update()\n        for type, counters in iteritems(data['counter']):\n            for project, counter in iteritems(counters):\n                result.setdefault(project, {})[type] = counter\n        for project, paused in iteritems(data['pause_status']):\n            result.setdefault(project, {})['paused'] = paused\n    except socket.error as e:\n        app.logger.warning('connect to scheduler rpc error: %r', e)\n        return json.dumps({}), 200, {'Content-Type': 'application/json'}\n\n    return json.dumps(result), 200, {'Content-Type': 'application/json'}\n\n\n@app.route('/run', methods=['POST', ])\ndef runtask():\n    rpc = app.config['scheduler_rpc']\n    if rpc is None:\n        return json.dumps({})\n\n    projectdb = app.config['projectdb']\n    project = request.form['project']\n    project_info = projectdb.get(project, fields=('name', 'group'))\n    if not project_info:\n        return \"no such project.\", 404\n    if 'lock' in 
projectdb.split_group(project_info.get('group')) \\\n            and not login.current_user.is_active():\n        return app.login_response\n\n    newtask = {\n        \"project\": project,\n        \"taskid\": \"on_start\",\n        \"url\": \"data:,on_start\",\n        \"process\": {\n            \"callback\": \"on_start\",\n        },\n        \"schedule\": {\n            \"age\": 0,\n            \"priority\": 9,\n            \"force_update\": True,\n        },\n    }\n\n    try:\n        ret = rpc.newtask(newtask)\n    except socket.error as e:\n        app.logger.warning('connect to scheduler rpc error: %r', e)\n        return json.dumps({\"result\": False}), 200, {'Content-Type': 'application/json'}\n    return json.dumps({\"result\": ret}), 200, {'Content-Type': 'application/json'}\n\n\n@app.route('/robots.txt')\ndef robots():\n    return \"\"\"User-agent: *\nDisallow: /\nAllow: /$\nAllow: /debug\nDisallow: /debug/*?taskid=*\n\"\"\", 200, {'Content-Type': 'text/plain'}\n"
  },
  {
    "path": "pyspider/webui/login.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-10 20:36:27\n\nimport base64\nfrom flask import Response\ntry:\n    import flask_login as login\nexcept ImportError:\n    from flask.ext import login\nfrom .app import app\n\nlogin_manager = login.LoginManager()\nlogin_manager.init_app(app)\n\n\nclass AnonymousUser(login.AnonymousUserMixin):\n\n    def is_anonymous(self):\n        return True\n\n    def is_active(self):\n        return False\n\n    def is_authenticated(self):\n        return False\n\n    def get_id(self):\n        return\n\n\nclass User(login.UserMixin):\n\n    def __init__(self, id, password):\n        self.id = id\n        self.password = password\n\n    def is_authenticated(self):\n        if not app.config.get('webui_username'):\n            return True\n        if self.id == app.config.get('webui_username') \\\n                and self.password == app.config.get('webui_password'):\n            return True\n        return False\n\n    def is_active(self):\n        return self.is_authenticated()\n\n\nlogin_manager.anonymous_user = AnonymousUser\n\n\n@login_manager.request_loader\ndef load_user_from_request(request):\n    api_key = request.headers.get('Authorization')\n    if api_key:\n        api_key = api_key[len(\"Basic \"):]\n        try:\n            api_key = base64.b64decode(api_key).decode('utf8')\n            return User(*api_key.split(\":\", 1))\n        except Exception as e:\n            app.logger.error('wrong api key: %r, %r', api_key, e)\n            return None\n    return None\napp.login_response = Response(\n    \"need auth.\", 401, {'WWW-Authenticate': 'Basic realm=\"Login Required\"'}\n)\n\n\n@app.before_request\ndef before_request():\n    if app.config.get('need_auth', False):\n        if not login.current_user.is_active():\n            return app.login_response\n"
  },
  {
    "path": "pyspider/webui/result.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-19 16:23:55\n\nfrom __future__ import unicode_literals\n\nfrom flask import render_template, request, json\nfrom flask import Response\nfrom .app import app\nfrom pyspider.libs import result_dump\n\n\n@app.route('/results')\ndef result():\n    resultdb = app.config['resultdb']\n    project = request.args.get('project')\n    offset = int(request.args.get('offset', 0))\n    limit = int(request.args.get('limit', 20))\n\n    count = resultdb.count(project)\n    results = list(resultdb.select(project, offset=offset, limit=limit))\n\n    return render_template(\n        \"result.html\", count=count, results=results,\n        result_formater=result_dump.result_formater,\n        project=project, offset=offset, limit=limit, json=json\n    )\n\n\n@app.route('/results/dump/<project>.<_format>')\ndef dump_result(project, _format):\n    resultdb = app.config['resultdb']\n    # force update project list\n    resultdb.get(project, 'any')\n    if project not in resultdb.projects:\n        return \"no such project.\", 404\n\n    offset = int(request.args.get('offset', 0)) or None\n    limit = int(request.args.get('limit', 0)) or None\n    results = resultdb.select(project, offset=offset, limit=limit)\n\n    if _format == 'json':\n        valid = request.args.get('style', 'rows') == 'full'\n        return Response(result_dump.dump_as_json(results, valid),\n                        mimetype='application/json')\n    elif _format == 'txt':\n        return Response(result_dump.dump_as_txt(results),\n                        mimetype='text/plain')\n    elif _format == 'csv':\n        return Response(result_dump.dump_as_csv(results),\n                        mimetype='text/csv')\n"
  },
  {
    "path": "pyspider/webui/static/.babelrc",
    "content": "{\n  \"presets\": [\"es2015\"]\n}\n"
  },
  {
    "path": "pyspider/webui/static/package.json",
    "content": "{\n  \"name\": \"pyspider-webui\",\n  \"version\": \"0.3.9\",\n  \"description\": \"webui of pyspider\",\n  \"scripts\": {\n    \"build\": \"webpack --progress --colors --optimize-minimize\",\n    \"dev\": \"webpack --progress --colors --optimize-minimize --watch\"\n  },\n  \"keywords\": [\n    \"pyspider\"\n  ],\n  \"author\": \"binux\",\n  \"license\": \"MIT\",\n  \"devDependencies\": {\n    \"babel-core\": \"^6.14.0\",\n    \"babel-loader\": \"^6.2.5\",\n    \"babel-preset-es2015\": \"^6.14.0\",\n    \"css-loader\": \"^0.25.0\",\n    \"extract-text-webpack-plugin\": \"^1.0.1\",\n    \"less\": \"^2.7.1\",\n    \"less-loader\": \"^2.2.3\",\n    \"style-loader\": \"^0.13.1\",\n    \"webpack\": \"^1.13.2\"\n  }\n}\n"
  },
  {
    "path": "pyspider/webui/static/src/css_selector_helper.js",
    "content": "// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:\n// Author: Binux<i@binux.me>\n//         http://binux.me\n// Created on 2013-11-11 18:50:58\n\nimport EventEmitter from 'events'\n\nfunction arrayEquals(a, b) {\n  if (!a || !b)\n    return false;\n  if (a.length != b.length)\n    return false;\n\n  for (var i = 0, l = a.length; i < l; i++) {\n    if (a[i] !== b[i])\n      return false;\n  }\n  return true;\n}\n\nfunction getOffset(elem) {\n  var top = 0;\n  var left = 0;\n  do {\n    if ( !isNaN( elem.offsetLeft) ) left += elem.offsetLeft;\n    if ( !isNaN( elem.offsetTop) ) top += elem.offsetTop;\n  } while( elem = elem.offsetParent )\n  return {top: top, left: left};\n}\n\nfunction merge_name(features) {\n  var element_name = '';\n  features.forEach(function(f) {\n    if (f.selected)\n      element_name += f.name;\n  })\n  return element_name;\n}\n\nfunction merge_pattern(path, end) {\n  var pattern = '';\n  var prev = null;\n  path.forEach(function(p, i) {\n    if (end >= 0 && i > end) {\n      return;\n    }\n    if (p.invalid) {\n      prev = null;\n    } else if (p.selected) {\n      if (prev) {\n        pattern += ' >';\n      }\n      var element_pattern = '';\n      p.features.forEach(function(f) {\n        if (f.selected) {\n          element_pattern += f.pattern;\n        }\n      });\n      if (element_pattern === '') {\n        element_pattern = '*';\n      }\n      pattern += ' '+element_pattern;\n      prev = p;\n    } else {\n      prev = null;\n    }\n  })\n  if (pattern === '') {\n    pattern = '*';\n  }\n  return pattern;\n}\n\n\nfunction path_info(doc, element) {\n  var path = [];\n  do {\n    var features = [];\n    // tagName\n    features.push({\n      name: element.tagName.toLowerCase(),\n      pattern: element.tagName.toLowerCase(),\n      selected: true,\n    });\n    // id\n    if (element.getAttribute('id')) {\n      features.push({\n        name: '#'+element.getAttribute('id'),\n        pattern: 
'#'+element.getAttribute('id'),\n        selected: true,\n      });\n    }\n    // class\n    if (element.classList.length > 0) {\n      for (var i=0; i<element.classList.length; i++) {\n        var class_name = element.classList[i];\n        features.push({\n          name: '.'+class_name,\n          pattern: '.'+class_name,\n          selected: true,\n        });\n      }\n    }\n    // rel, property\n    var allowed_attr_names = ['rel', 'property', 'itemprop'];\n    for (var i=0, attrs = element.attributes; i < attrs.length; i++) {\n      if (allowed_attr_names.indexOf(attrs[i].nodeName) == -1) {\n        continue\n      }\n      features.push({\n        name: '['+attrs[i].nodeName+'='+JSON.stringify(attrs[i].nodeValue)+']',\n        pattern: '['+attrs[i].nodeName+'='+JSON.stringify(attrs[i].nodeValue)+']',\n        selected: true,\n      });\n    }\n\n    // get xpath\n    var siblings = element.parentNode.childNodes;\n    var xpath = element.tagName.toLowerCase();\n    for (var i=0, ix=0; siblings.length > 1 && i < siblings.length; i++) {\n      var sibling = siblings[i];\n      if (sibling === element) {\n        xpath += '['+(ix+1)+']';\n        break;\n      } else if (sibling.tagName == element.tagName) {\n        ix++;\n      }\n    }\n\n    // pack it up\n    path.push({\n      tag: element.tagName.toLowerCase(),\n      name: merge_name(features),\n      xpath: xpath,\n      selected: true,\n      invalid: element.tagName.toLowerCase() === 'tbody',\n      features: features,\n    });\n  } while (element = element.parentElement);\n\n  path.reverse();\n\n  // select elements\n  var selected_elements = doc.querySelectorAll(merge_pattern(path));\n  path.forEach(function(p, i) {\n    if (p.invalid)\n      return;\n    // select features\n    var feature_selected_elements = doc.querySelectorAll(merge_pattern(path, i));\n    p.features.forEach(function(f, fi) {\n      f.selected = false;\n      if (arrayEquals(feature_selected_elements,\n        
doc.querySelectorAll(merge_pattern(path, i)))) {\n          return;\n        }\n      f.selected = true;\n    });\n    if (p.features.every(function(f) {\n      return !f.selected;\n    })) {\n      p.features[0].selected = true;\n    }\n    p.name = merge_name(p.features);\n  });\n\n  path.forEach(function(p, i) {\n    p.selected = false;\n    if (arrayEquals(selected_elements,\n      doc.querySelectorAll(merge_pattern(path)))) {\n        p.name = p.tag;\n        return;\n      }\n    p.selected = true;\n  });\n\n  return path;\n}\n\nexport default class CSSSelectorHelperServer extends EventEmitter {\n  constructor(window) {\n    super();\n\n    this.window = window;\n    this.document = window.document;\n\n    this.document.addEventListener(\"mouseover\", (ev) => {\n      this.overlay(ev.target);\n    });\n\n    this.document.addEventListener(\"click\", (ev) => {\n      ev.preventDefault();\n      ev.stopPropagation();\n\n      this.emit('selector_helper_click', path_info(this.document, ev.target));\n    });\n  }\n\n  overlay(elements) {\n    if (typeof elements === 'string') {\n      elements = this.document.querySelectorAll(elements);\n    }\n    if (elements instanceof this.window.Element) {\n      elements = [elements];\n    }\n    [...this.document.querySelectorAll('.pyspider_overlay')].forEach((elem) => {\n      elem.remove();\n    });\n    [...elements].forEach((elem) => {\n      const offset = getOffset(elem);\n      const div = this.document.createElement(\"div\");\n      div.className = \"pyspider_overlay\";\n      div.setAttribute('style', 'z-index: 999999;background-color: rgba(255, 165, 0, 0.3);position: absolute;pointer-events: none;'\n        +'top: '+offset.top+'px;'\n          +'left:'+offset.left+'px;'\n          +'width: '+elem.offsetWidth+'px;'\n          +'height: '+elem.offsetHeight+'px;');\n      this.document.body.appendChild(div);\n    });\n  }\n\n  heightlight(elements) {\n    if (typeof elements === 'string') {\n      elements = 
this.document.querySelectorAll(elements);\n    }\n    console.log(elements);\n    if (elements instanceof this.window.Element) {\n      elements = [elements];\n    }\n    [...this.document.querySelectorAll('.pyspider_highlight')].forEach((elem) => {\n      elem.remove();\n    });\n    [...elements].forEach((elem) => {\n      const offset = getOffset(elem);\n      const div = this.document.createElement(\"div\");\n      div.className = \"pyspider_highlight\";\n      div.setAttribute('style', 'z-index: 888888;border: 2px solid #c00;position: absolute;pointer-events: none;'\n        +'top: '+(offset.top-2)+'px;'\n          +'left:'+(offset.left-2)+'px;'\n          +'width: '+elem.offsetWidth+'px;'\n          +'height: '+elem.offsetHeight+'px;');\n      this.document.body.appendChild(div);\n    });\n  }\n\n  getElementByXpath(path) {\n    return this.document.evaluate(path, this.document, null, this.window.XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n  }\n}\n\n"
  },
  {
    "path": "pyspider/webui/static/src/debug.js",
    "content": "// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:\n// Author: Binux<i@binux.me>\n//         http://binux.me\n// Created on 2014-02-23 15:19:19\n\nimport \"./debug.less\"\nimport \"./splitter\"\nimport CSSSelectorHelperServer from \"./css_selector_helper\"\n\nwindow.SelectorHelper = (function() {\n  var helper = $('#css-selector-helper');\n  var server = null;\n\n  function merge_name(p) {\n    var features = p.features;\n    var element_name = '';\n    features.forEach(function(f) {\n      if (f.selected)\n        element_name += f.name;\n    });\n    if (element_name === '') {\n      return p.tag;\n    }\n    return element_name;\n  }\n\n  function merge_pattern(path, end) {\n    var pattern = '';\n    var prev = null;\n    path.forEach(function(p, i) {\n      if (end >= 0 && i > end) {\n        return;\n      }\n      if (p.invalid) {\n        prev = null;\n      } else if (p.selected) {\n        if (prev) {\n          pattern += ' >';\n        }\n        var element_pattern = '';\n        p.features.forEach(function(f) {\n          if (f.selected) {\n            element_pattern += f.pattern;\n          }\n        });\n        if (element_pattern === '') {\n          element_pattern = '*';\n        }\n        pattern += ' '+element_pattern;\n        prev = p;\n      } else {\n        prev = null;\n      }\n    })\n    if (pattern === '') {\n      pattern = '*';\n    }\n    return pattern.trim();\n  }\n\n  var current_path = null;\n  function selector_changed(path) {\n    current_path = path;\n    server.heightlight(merge_pattern(path));\n  }\n  \n  function render_selector_helper(path) {\n    helper.find('.element').remove();\n    var elements = [];\n    $.each(path, function(i, p) {\n      var span = $('<span>').addClass('element').data('info', p);\n      $('<span class=\"element-name\">').text(p.name).appendTo(span);\n      if (p.selected) span.addClass('selected');\n      if (p.invalid) span.addClass('invalid');\n\n      var ul = $('<ul>');\n   
   $.each(p.features, function(i, f) {\n        var li = $('<li>').text(f.name).data('feature', f);\n        if (f.selected) li.addClass('selected');\n        li.appendTo(ul);\n        // feature on click\n        li.on('click', function(ev) {\n          ev.stopPropagation();\n          var $this = $(this);\n          var f = $this.data('feature');\n          if (f.selected) {\n            f.selected = false;\n            $this.removeClass('selected');\n          } else {\n            f.selected = true;\n            $this.addClass('selected');\n          }\n          var element = $this.parents('.element');\n          if (!p.selected) {\n            p.selected = true;\n            element.addClass('selected');\n          }\n          element.find('.element-name').text(merge_name(p));\n          selector_changed(path);\n        });\n      });\n      ul.appendTo(span);\n\n      span.on('mouseover', (ev) => {\n        var xpath = [];\n        $.each(path, function(i, _p) {\n          xpath.push(_p.xpath);\n          if (_p === p) {\n            return false;\n          }\n        });\n        server.overlay(server.getElementByXpath('/' + xpath.join('/')));\n      })\n      // path on click\n      span.on('click', function(ev) {\n        ev.stopPropagation();\n        var $this = $(this);\n        var p = $this.data('info');\n        if (p.selected) {\n          p.selected = false;\n          $this.removeClass('selected');\n        } else {\n          p.selected = true;\n          $this.addClass('selected');\n        }\n        $this.find('.element-name').text(merge_name($this.data('info')));\n        selector_changed(path);\n      });\n      elements.push(span);\n    });\n    helper.prepend(elements);\n\n    adjustHelper();\n    selector_changed(path);\n  }\n\n  function adjustHelper() {\n    while (helper[0].scrollWidth > helper.width()) {\n      var e = helper.find('.element:visible:first');\n      if (e.length == 0) {\n        return;\n      }\n      
e.addClass('invalid').data('info')['invalid'] = true;\n    }\n  }\n\n  var tab_web = $('#tab-web');\n  return {\n    init: function() {\n      var _this = this;\n      _this.clear();\n\n      $(\"#J-enable-css-selector-helper\").on('click', ev => {\n        this.clear();\n        server = new CSSSelectorHelperServer($(\"#tab-web iframe\")[0].contentWindow);\n        server.on('selector_helper_click', path => {\n          render_selector_helper(path);\n        })\n        this.enable();\n      });\n\n      $(\"#task-panel\").on(\"scroll\", function(ev) {\n        if (!helper.is(':visible')) {\n          return;\n        }\n        if ($(\"#debug-tabs\").position().top < 0) {\n          helper.addClass('fixed');\n          tab_web.addClass('fixed');\n        } else {\n          helper.removeClass('fixed');\n          tab_web.removeClass('fixed');\n        }\n      });\n\n      // copy button\n      var input = helper.find('.copy-selector-input');\n      input.on('focus', function(ev) {\n        $(this).select();\n      });\n      helper.find('.copy-selector').on('click', function(ev) {\n        if (!current_path) {\n          return;\n        }\n        if (input.is(':visible')) {\n          input.hide();\n          helper.find('.element').show();\n        } else {\n          helper.find('.element').hide();\n          input.val(merge_pattern(current_path)).show();\n        }\n      });\n \n      // add button\n      helper.find('.add-to-editor').on('click', function(ev) {\n        Debugger.python_editor_replace_selection(merge_pattern(current_path));\n      });\n    },\n    clear: function() {\n      current_path = null;\n      helper.hide();\n      helper.removeClass('fixed');\n      tab_web.removeClass('fixed');\n      helper.find('.element').remove();\n    },\n    enable: function() {\n      helper.show();\n      helper.find('.copy-selector-input').hide();\n      if ($(\"#debug-tabs\").position().top < 0) {\n        helper.addClass('fixed');\n        
tab_web.addClass('fixed');\n      } else {\n        helper.removeClass('fixed');\n        tab_web.removeClass('fixed');\n      }\n    },\n  }\n})();\n\nwindow.Debugger = (function() {\n  var tmp_div = $('<div>');\n  function escape(text) {\n    return tmp_div.text(text).html();\n  }\n\n  return {\n    init: function() {\n      //init resizer\n      this.splitter = $(\".debug-panel:not(:first)\").splitter().data('splitter')\n          .trigger('init')\n          .on('resize-start', function() {\n            $('#left-area .overlay').show();\n          })\n          .on('resize-end', function() {\n            $('#left-area .overlay').hide();\n          });\n\n      //codemirror\n      CodeMirror.keyMap.basic.Tab = 'indentMore';\n      this.init_python_editor($(\"#python-editor\"));\n      this.init_task_editor($(\"#task-editor\"));\n      this.bind_debug_tabs();\n      this.bind_run();\n      this.bind_save();\n      this.bind_others();\n\n      // css selector helper\n      SelectorHelper.init();\n    },\n\n    not_saved: false,\n    init_python_editor: function($el) {\n      var _this = this;\n      this.python_editor_elem = $el;\n      var cm = this.python_editor = CodeMirror($el[0], {\n        value: script_content,\n        mode: \"python\",\n        lineNumbers: true,\n        indentUnit: 4,\n        lineWrapping: true,\n        styleActiveLine: true,\n        autofocus: true\n      });\n      cm.on('focus', function() {\n        $el.addClass(\"focus\");\n      });\n      cm.on('blur', function() {\n        $el.removeClass(\"focus\");\n      });\n      cm.on('change', function() {\n        _this.not_saved = true;\n      });\n      window.addEventListener('beforeunload', function(e) {\n        if (_this.not_saved) {\n          var returnValue = \"You have not saved changes.\";\n          (e || window.event).returnValue = returnValue;\n          return returnValue;\n        }\n      });\n    },\n\n    python_editor_replace_selection: function(content) {\n      
this.python_editor.getDoc().replaceSelection(content);\n    },\n\n    auto_format: function(cm) {\n      var pos = cm.getCursor(true);\n      CodeMirror.commands.selectAll(cm);\n      cm.autoFormatRange(cm.getCursor(true), cm.getCursor(false));\n      cm.setCursor(pos);\n    },\n\n    format_string: function(value, mode) {\n      var div = document.createElement('div');\n      var cm = CodeMirror(div, {\n        value: value,\n        mode: mode\n      });\n      this.auto_format(cm);\n      return cm.getDoc().getValue();\n    },\n\n    init_task_editor: function($el) {\n      var cm = this.task_editor = CodeMirror($el[0], {\n        value: task_content,\n        mode: \"application/json\",\n        indentUnit: 2,\n        lineWrapping: true,\n        styleActiveLine: true,\n        lint: true\n      });\n      this.auto_format(cm);\n      cm.getDoc().clearHistory();\n      cm.on('focus', function() {\n        $el.addClass(\"focus\");\n      });\n      cm.on('blur', function() {\n        $el.removeClass(\"focus\");\n      });\n    },\n\n    bind_debug_tabs: function() {\n      var _this = this;\n      $('#tab-control > li[data-id]').on('click', function() {\n        $('#tab-control > li[data-id]').removeClass('active');\n        var name = $(this).addClass('active').data('id');\n        $('#debug-tabs .tab').hide();\n        $('#debug-tabs #'+name).show();\n      });\n      $(\"#tab-control li[data-id=tab-html]\").on('click', function() {\n        if (!!!$(\"#tab-html\").data(\"format\")) {\n          var html_styled = \"\";\n          CodeMirror.runMode(_this.format_string($(\"#tab-html pre\").text(), 'text/html'), 'text/html',\n                             function(text, classname) {\n                               if (classname)\n                                 html_styled += '<span class=\"cm-'+classname+'\">'+escape(text)+'</span>';\n                               else\n                                 html_styled += escape(text);\n                            
 });\n          $(\"#tab-html pre\").html(html_styled);\n          $(\"#tab-html\").data(\"format\", true);\n        }\n      });\n    },\n\n    bind_run: function() {\n      var _this = this;\n      $('#run-task-btn').on('click', function() {\n        _this.run();\n      });\n      $('#undo-btn').on('click', function(ev) {\n        _this.task_editor.execCommand('undo');\n      });\n      $('#redo-btn').on('click', function(ev) {\n        _this.task_editor.execCommand('redo');\n      });\n    },\n\n    bind_save: function() {\n      var _this = this;\n      $('#save-task-btn').on('click', function() {\n        var script = _this.python_editor.getDoc().getValue();\n        $('#right-area .overlay').show();\n        $.ajax({\n          type: \"POST\",\n          url: location.pathname+'/save',\n          data: {\n            script: script\n          },\n          success: function(data) {\n            console.log(data);\n            _this.python_log('');\n            _this.python_log(\"saved!\");\n            _this.not_saved = false;\n            $('#right-area .overlay').hide();\n          },\n          error: function(xhr, textStatus, errorThrown) {\n            console.log(xhr, textStatus, errorThrown);\n            _this.python_log(\"save error!\\n\"+xhr.responseText);\n            $('#right-area .overlay').hide();\n          }\n        });\n      });\n    },\n\n    bind_follows: function() {\n      var _this = this;\n      $('.newtask').on('click', function() {\n        if ($(this).next().hasClass(\"task-show\")) {\n          $(this).next().remove();\n          return;\n        }\n        var task = $(this).after('<div class=\"task-show\"><pre class=\"cm-s-default\"></pre></div>').data(\"task\");\n        task = JSON.stringify(window.newtasks[task], null, '  ');\n        CodeMirror.runMode(task, 'application/json', $(this).next().find('pre')[0]);\n      });\n      \n      $('.newtask .task-run').on('click', function(event) {\n        event.preventDefault();\n   
     event.stopPropagation();\n        let task_id = $(this).parents('.newtask').data(\"task\");\n        let task = window.newtasks[task_id];\n        _this.task_editor.setValue(JSON.stringify(task, null, '  '));\n        _this.task_updated(task);\n        _this.run();\n      });\n    },\n\n    task_updated: function task_updated(task) {\n      $('#history-wrap').hide();\n      if (task.project && task.taskid) {\n        $.ajax({\n          url: `/task/${task.project}:${task.taskid}.json`,\n          success: (data) => {\n            if (!data.code && !data.error) {\n              $('#history-link').attr('href', `/task/${task.project}:${task.taskid}`).text(`status: ${data.status_string}`);\n              $('#history-wrap').show();\n            }\n          }\n        })\n      }\n    },\n\n    bind_others: function() {\n      var _this = this;\n      $('#python-log-show').on('click', function() {\n        if ($('#python-log pre').is(\":visible\")) {\n          $('#python-log pre').hide();\n          $(this).height(8);\n        } else {\n          $('#python-log pre').show();\n          $(this).height(0);\n        }\n      });\n      $('.webdav-btn').on('click', function() {\n        _this.toggle_webdav_mode(this);\n      })\n    },\n\n    render_html: function(html, base_url, block_script=true, block_iframe=true) {\n      if (html === undefined) {\n        html = '';\n      }\n      let dom = (new DOMParser()).parseFromString(html, \"text/html\");\n\n      $(dom).find('base').remove();\n      $(dom).find('head').prepend('<base>');\n      $(dom).find('base').attr('href', base_url);\n\n      if (block_script) {\n        $(dom).find('script').attr('type', 'text/plain');\n      }\n      if (block_iframe) {\n        $(dom).find('iframe[src]').each((i, e) => {\n          e = $(e);\n          e.attr('__src', e.attr('src'))\n          e.attr('src', encodeURI('data:text/html;,<h1>iframe blocked</h1>'));\n        });\n      }\n\n      return dom.documentElement.innerHTML;\n 
   },\n\n    run: function() {\n      var script = this.python_editor.getDoc().getValue();\n      var task = this.task_editor.getDoc().getValue();\n      var _this = this;\n\n      // reset\n      SelectorHelper.clear();\n      $(\"#tab-web .iframe-box\").html('');\n      $(\"#tab-html pre\").html('');\n      $('#tab-follows').html('');\n      $(\"#tab-control li[data-id=tab-follows] .num\").hide();\n      $('#python-log').hide();\n      $('#left-area .overlay').show();\n\n      $.ajax({\n        type: \"POST\",\n        url: location.pathname+'/run',\n        data: {\n          webdav_mode: _this.webdav_mode,\n          script: _this.webdav_mode ? '' : script,\n          task: task\n        },\n        success: function(data) {\n          console.log(data);\n          $('#left-area .overlay').hide();\n\n          //web\n          $(\"#tab-web .iframe-box\").html('<iframe src=\"/blank.html\" sandbox=\"allow-same-origin allow-scripts\" height=\"50%\"></iframe>');\n          const iframe = $(\"#tab-web iframe\")[0];\n          const content_type = data.fetch_result.headers && data.fetch_result.headers['Content-Type'] && data.fetch_result.headers['Content-Type'] || \"text/plain\";\n\n          //html\n          $(\"#tab-html pre\").text(data.fetch_result.content);\n          $(\"#tab-html\").data(\"format\", true);\n\n          let iframe_content = null;\n          if (content_type.indexOf('application/json') == 0) {\n            try {\n              let content = JSON.parse(data.fetch_result.content);\n              content = JSON.stringify(content, null, '  ');\n              content = \"<html><pre>\"+content+\"</pre></html>\";\n              iframe_content = _this.render_html(content, data.fetch_result.url, true, true, false);\n            } catch (e) {\n              iframe_content = \"data:,Content-Type:\"+content_type+\" parse error.\";\n            }\n          } else if (content_type.indexOf(\"text/html\") == 0) {\n            $(\"#tab-html\").data(\"format\", 
false);\n            iframe_content = _this.render_html(data.fetch_result.content, data.fetch_result.url, true, true, false);\n          } else if (content_type.indexOf(\"text\") == 0) {\n            iframe_content = \"data:\"+content_type+\",\"+data.fetch_result.content;\n          } else if (data.fetch_result.dataurl) {\n            iframe_content = data.fetch_result.dataurl\n          } else {\n            iframe_content = \"data:,Content-Type:\"+content_type;\n          }\n\n          const doc = iframe.contentDocument;\n          doc.open(\"text/html\", \"replace\");\n          doc.write(iframe_content)\n          doc.close();\n          doc.onreadystatechange = () => {\n            if (doc.readyState === 'complete') {\n              $(\"#tab-web iframe\").height(doc.body.scrollHeight + 60);\n            }\n          };\n\n          //follows\n          $('#tab-follows').html('');\n          var elem = $(\"#tab-control li[data-id=tab-follows] .num\");\n\n          var newtask_template = '<div class=\"newtask\" data-task=\"__task__\"><span class=\"task-callback\">__callback__</span> &gt; <span class=\"task-url\">__url__</span><div class=\"task-run\"><i class=\"fa fa-play\"></i></div><div class=\"task-more\"> <i class=\"fa fa-ellipsis-h\"></i> </div></div>';\n          if (data.follows.length > 0) {\n            elem.text(data.follows.length).show();\n            var all_content = \"\";\n            window.newtasks = {};\n            $.each(data.follows, function(i, task) {\n              var callback = task.process;\n              callback = callback && callback.callback || '__call__';\n              var content = newtask_template.replace('__callback__', callback);\n              content = content.replace('__url__', task.url || '<span class=\"error\">no_url!</span>');\n              all_content += content.replace('__task__', i);\n              window.newtasks[i] = task;\n            });\n            $('#tab-follows').append(all_content);\n            
_this.bind_follows();\n          } else {\n            elem.hide();\n          }\n\n          //messages\n          $('#tab-messages pre').html('');\n          if (data.messages.length > 0) {\n            $(\"#tab-control li[data-id=tab-messages] .num\").text(data.messages.length).show();\n            var messages = JSON.stringify(data.messages, null, '  ');\n            CodeMirror.runMode(messages, 'application/json', $('#tab-messages pre')[0]);\n            $('#tab-messages')[0]\n          } else {\n            $(\"#tab-control li[data-id=tab-messages] .num\").hide();\n          }\n\n          $(\"#tab-control li.active\").click();\n\n          // logs\n          _this.python_log(data.logs);\n        },\n        error: function(xhr, textStatus, errorThrown) {\n          console.log(xhr, textStatus, errorThrown);\n          _this.python_log('error: '+textStatus);\n          $('#left-area .overlay').hide();\n        }\n      });\n    },\n\n    python_log: function(text) {\n      if (text) {\n        $('#python-log pre').text(text);\n        $('#python-log pre, #python-log').show();\n        $('#python-log-show').height(0);\n      } else {\n        $('#python-log pre, #python-log').hide();\n      }\n    },\n\n    webdav_mode: false,\n    toggle_webdav_mode: function(button) {\n      if (!this.webdav_mode) {\n        if (this.not_saved) {\n            if (!confirm(\"You have not saved changes. 
Ignore changes and switch to WebDav mode.\")) {\n            return;\n          }\n          this.not_saved = false;\n        }\n        this.python_editor_elem.hide();\n        this.splitter.trigger('fullsize', 'prev');\n        $(button).addClass('active');\n        this.webdav_mode = !this.webdav_mode;\n      } else {\n        // leaving webdav mode, reload script\n        var _this = this;\n        $.ajax({\n          type: \"GET\",\n          url: location.pathname + '/get',\n          success: function (data) {\n            _this.splitter.trigger('init');\n            _this.python_editor_elem.show();\n            _this.python_editor.setValue(data.script);\n            _this.not_saved = false;\n            $(button).removeClass('active');\n            _this.webdav_mode = !_this.webdav_mode;\n          },\n          error: function() {\n            alert('Error loading script from database. Script may be out-of-date.');\n            _this.python_editor_elem.show();\n            _this.splitter.trigger('init');\n            $(button).removeClass('active');\n            _this.webdav_mode = !_this.webdav_mode;\n          },\n        });\n      }\n    },\n  };\n})();\n\nDebugger.init();\n"
  },
  {
    "path": "pyspider/webui/static/src/debug.less",
    "content": "/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */\n/* Author: Binux<i@binux.me> */\n/*         http://binux.me */\n/* Created on 2014-02-23 00:28:30 */\n\n@import \"variable\";\n\nbody {\n  margin: 0;\n  padding: 0;\n  height: 100%;\n  overflow: hidden;\n}\n\n.warning {\n  color: @orange;\n}\n.error {\n  color: @red;\n}\n\n@control-height: 35px;\n#control {\n  z-index: 9999;\n  min-width: 760px;\n  width: 100%;\n  height: @control-height;\n  position: fixed;\n  left: 0;\n  right: 0;\n  background-color: @gray-lighter;\n  box-shadow: 0px 1px 2px @gray-light;\n\n  div {\n    line-height: 35px;\n    margin-left: 10px;\n    margin-right: 10px;\n  }\n\n  .webdav-btn {\n    position: relative;\n    float: right;\n    padding: 1px 7px 0 7px;\n    line-height: 21px;\n    border-radius: 5px;\n    border: solid 1px @blue;\n    background: white;\n    color: @blue;\n    cursor: pointer;\n    margin: 6px 0 0 10px;\n\n    &:hover {\n      background: lighten(@blue, 10%);\n      color: white;\n    }\n    &.active {\n      background: @blue;\n      color: white;\n    }\n  }\n}\n\n#editarea {\n  width: 100%;\n  position: fixed;\n  top: @control-height + 2px;\n  left: 0;\n  right: 0;\n  bottom: 0;\n  //debug\n}\n\n.debug-panel {\n  position: absolute;\n  top: 0;\n  left: 0;\n  right: 0;\n  bottom: 0;\n}\n\n.resize {\n  background-color: @gray;\n  cursor: ew-resize;\n  &:hover + .debug-panel {\n    border-left: dashed 1px @gray !important;\n }\n}\n\n.overlay {\n  position: absolute;\n  top: 0;\n  bottom: 0;\n  left: 0;\n  right: 0;\n  z-index: 9999;\n  background: rgba(0, 0, 0, 40%);\n}\n\n.focus .CodeMirror-activeline-background {\n  background: #e8f2ff !important;\n}\n.CodeMirror-activeline-background {\n  background: transparent !important;\n}\n\n#task-panel {\n  height: 100%;\n  overflow-x: auto;\n}\n\n.right-top-btn(@color: @green) {\n  z-index: 99;\n  position: absolute;\n  top: 0;\n  right: 0;\n  background: @color;\n  border-radius: 0 0 0 5px;\n  color: 
white;\n  margin: 0;\n  padding: 3px 7px 5px 10px;\n  cursor: pointer;\n  font-weight: bold;\n  line-height: 15px;\n\n  &:hover {\n    background: darken(@color, 10%);\n  }\n}\n\n#run-task-btn {\n  .right-top-btn(@color: @green);\n}\n#undo-redo-btn-group {\n  @color: lighten(@green, 15%);\n  .right-top-btn(@color: @color);\n\n  top: auto;\n  bottom: 0;\n  border-radius: 5px 0 0 0;\n  padding: 5px 0 3px 0;\n  /*box-shadow: 0px 0px 30px @color;*/\n  overflow: hidden;\n\n  &:hover {\n    background: @color;\n  }\n\n  a {\n    color: white;\n    text-decoration: none;\n    padding: 5px 7px 3px 10px;\n    &:hover {\n      background: darken(@color, 10%);\n    }\n  }\n}\n#save-task-btn {\n  .right-top-btn(@color: @blue);\n}\n\n#task-editor {\n  position: relative;\n\n  .CodeMirror {\n    height: auto;\n    padding-bottom: 3px;\n    background: lighten(@green, 30%);\n  }\n  .CodeMirror-scroll {\n    overflow-x: auto;\n    overflow-y: hidden;\n  }\n  &.focus .CodeMirror-activeline-background {\n    background: lighten(@green, 40%) !important;\n  }\n}\n\n#tab-control {\n  list-style-type: none;\n  position: absolute;\n  bottom: 0;\n  right: 0;\n  margin: 8px 20px;\n  padding: 0;\n\n  li {\n    position: relative;\n    float: right;\n    padding: 1px 7px 0 7px;\n    line-height: 21px;\n    margin-left: 10px;\n    border-radius: 5px;\n    border: solid 1px @blue;\n    background: white;\n    color: @blue;\n    cursor: pointer;\n\n    &:hover {\n      background: lighten(@blue, 10%);\n      color: white;\n    }\n    &.active {\n      background: @blue;\n      color: white;\n    }\n\n    span {\n      position: absolute;\n      top: -5px;\n      right: -10px;\n      background: @red;\n      color: white;\n      font-size: 80%;\n      font-weight: bold;\n      padding: 2px 5px 0 5px;\n      border-radius: 10px;\n    }\n  }\n}\n\n#debug-tabs {\n  margin-bottom: 45px;\n}\n\n#tab-web {\n  &.fixed {\n    padding-top: 24px;\n  }\n\n  iframe {\n    border-width: 0;\n    width: 100%;\n 
 }\n}\n\n#tab-html {\n  margin: 0;\n  padding: 7px 5px;\n\n  pre {\n    margin: 0;\n    padding: 0;\n  }\n}\n\n#tab-follows {\n  .newtask {\n    position: relative;\n    height: 30px;\n    line-height: 30px;\n    background: lighten(@orange, 30%);\n    border-bottom: solid 1px @orange;\n    border-top: solid 1px @orange;\n    margin-top: -1px;\n    padding-left: 5px;\n    padding-right: 70px;\n    overflow: hidden;\n    white-space: nowrap;\n    text-overflow: ellipsis;\n    cursor: pointer;\n\n    &:hover {\n      background: lighten(@orange, 20%);\n      .task-more {\n        background: lighten(@orange, 20%);\n      }\n    }\n    .task-callback {\n      color: darken(@orange, 10%);\n    }\n    .task-url {\n      font-size: 95%;\n      text-decoration: underline;\n      font-weight: lighter;\n      color: @blue;\n    }\n    .task-more {\n      position: absolute;\n      right: 33px;\n      top: 0px;\n      float: right;\n      color: @orange;\n      padding: 0 10px;\n      background: lighten(@orange, 30%);\n      border-radius: 10px;\n    }\n    .task-run {\n      position: absolute;\n      right: 0;\n      top: 0;\n      font-size: 80%;\n      padding: 0 10px 0 30px;\n      float: right;\n      border-bottom: solid 1px lighten(@green, 20%);\n      border-top: solid 1px lighten(@green, 20%);\n      background: lighten(@green, 10%);\n      color: white;\n      text-shadow: 0 0 10px white;\n      font-weight: bold;\n\n      &:hover {\n        background: @green;\n      }\n    }\n  }\n  .task-show {\n    pre {\n      margin: 5px 5px 10px 5px;\n    }\n  }\n}\n\n#python-editor {\n  position: absolute;\n  top: 0;\n  width: 100%;\n  bottom: 0;\n\n  .CodeMirror {\n    height: 100%;\n    padding-bottom: 20px;\n  }\n}\n\n#python-log {\n  width: 100%;\n  min-height: 10px;\n  max-height: 40%;\n  background: rgba(0, 0, 0, 60%);\n  overflow: auto;\n\n  #python-log-show {\n    z-index: 89;\n    width: auto;\n    padding-top: 5px;\n    background: @red;\n    box-shadow: 0 2px 
20px @red;\n    cursor: pointer;\n  }\n  pre {\n    margin: 0;\n    padding: 10px 10px;\n    color: white;\n  }\n}\n\n#css-selector-helper {\n  background-color: @gray-lighter;\n  padding: 0;\n  width: 100%;\n  height: 24px;\n  text-align: right;\n  white-space: nowrap;\n\n  &.fixed {\n    position: absolute;\n    top: 0;\n\n  }\n\n  button {\n    line-height: 16px;\n    vertical-align: 2px;\n  }\n}\n\nspan.element {\n  position: relative;\n  height: 24px;\n  display: inline-block;\n  padding: 0 0.2em;\n  cursor: pointer;\n  color: lighten(@gray, 35%);\n  z-index: 99999;\n\n  &.invalid {\n    display: none;\n  }\n  &.selected {\n    color: black;\n  }\n  &:hover {\n    background-color: darken(@gray-lighter, 15%);\n\n    & > ul {\n      display: block;\n    }\n  }\n\n  & > ul {\n    display: none;\n    margin: 0;\n    padding: 0;\n    position: absolute;\n    top: 24px;\n    left: 0;\n    background-color: @gray-lighter;\n    border: 1px solid black;\n    border-top-width: 0;\n    color: lighten(@gray, 35%);\n\n    & > li {\n      display: block;\n      text-align: left;\n      white-space: nowrap;\n      padding: 0 4px;\n\n      &.selected {\n        color: black;\n      }\n      &:hover {\n        background-color: darken(@gray-lighter, 15%);\n      }\n    }\n  }\n}\n\n.copy-selector-input {\n  height: 24px;\n  padding: 0;\n  border: 0;\n  margin: 0;\n  padding-right: 0.2em;\n  font-size: 1em;\n  text-align: right;\n  width: 100%;\n  margin-left: -100px;\n  background: @gray-lighter;\n}\n"
  },
  {
    "path": "pyspider/webui/static/src/index.js",
    "content": "// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:\n// Author: Binux<i@binux.me>\n//         http://binux.me\n// Created on 2014-03-02 17:53:23\n\nimport \"./index.less\";\n\n$(function() {\n  //$(\"input[name=start-urls]\").on('keydown', function(ev) {\n    //if (ev.keyCode == 13) {\n      //var value = $(this).val();\n      //var textarea = $('<textarea class=\"form-control\" rows=3 name=\"start-urls\"></textarea>').replaceAll(this);\n      //textarea.val(value).focus();\n    //}\n  //});\n\n  function init_editable(projects_app) {\n    $(\".project-group>span\").editable({\n      name: 'group',\n      pk: function(e) {\n        return $(this).parents('tr').data(\"name\");\n      },\n      emptytext: '[group]',\n      placement: 'right',\n      url: \"/update\",\n      success: function(response, value) {\n        var project_name = $(this).parents('tr').data(\"name\");\n        projects_app.projects[project_name].group = value;\n        $(this).attr('style', '');\n      }\n    });\n\n    $(\".project-status>span\").editable({\n      type: 'select',\n      name: 'status',\n      source: [\n        {value: 'TODO', text: 'TODO'},\n        {value: 'STOP', text: 'STOP'},\n        {value: 'CHECKING', text: 'CHECKING'},\n        {value: 'DEBUG', text: 'DEBUG'},\n        {value: 'RUNNING', text: 'RUNNING'}\n      ],\n      pk: function(e) {\n        return $(this).parents('tr').data(\"name\");\n      },\n      emptytext: '[status]',\n      placement: 'right',\n      url: \"/update\",\n      success: function(response, value) {\n        var project_name = $(this).parents('tr').data(\"name\");\n        projects_app.projects[project_name].status = value;\n        $(this).removeClass('status-'+$(this).attr('data-value')).addClass('status-'+value).attr('data-value', value).attr('style', '');\n      }\n    });\n\n    $(\".project-rate>span\").editable({\n      name: 'rate',\n      pk: function(e) {\n        return $(this).parents('tr').data(\"name\");\n      
},\n      validate: function(value) {\n        var s = value.split('/');\n        if (s.length != 2)\n          return \"format error: rate/burst\";\n        if (!$.isNumeric(s[0]) || !$.isNumeric(s[1]))\n          return \"format error: rate/burst\";\n      },\n      highlight: false,\n      emptytext: '0/0',\n      placement: 'right',\n      url: \"/update\",\n      success: function(response, value) {\n        var project_name = $(this).parents('tr').data(\"name\");\n        var s = value.split('/');\n        projects_app.projects[project_name].rate = parseFloat(s[0]);\n        projects_app.projects[project_name].burst = parseFloat(s[1]);\n        $(this).attr('style', '');\n      }\n    });\n  }\n\n  function init_sortable() {\n    // table sortable\n    Sortable.getColumnType = function(table, i) {\n      var type = $($(table).find('th').get(i)).data('type');\n      if (type == \"num\") {\n        return Sortable.types.numeric;\n      } else if (type == \"date\") {\n        return Sortable.types.date;\n      }\n      return Sortable.types.alpha;\n    };\n    $('table.projects').attr('data-sortable', true);\n    Sortable.init();\n  }\n\n  $(\"#create-project-modal form\").on('submit', function(ev) {\n    var $this = $(this);\n    var project_name = $this.find('[name=project-name]').val()\n    if (project_name.length == 0 || project_name.search(/[^\\w]/) != -1) {\n      $this.find('[name=project-name]').parents('.form-group').addClass('has-error');\n      $this.find('[name=project-name] ~ .help-block').show();\n      return false;\n    }\n    var mode = $this.find('[name=script-mode]:checked').val();\n    $this.attr('action', '/debug/'+project_name);\n    return true;\n  });\n\n  function update_counters() {\n    $.get('/counter', function(data) {\n      for (let project in data) {\n        var info = data[project];\n        if (projects_app.projects[project] === undefined)\n          continue;\n\n        // data inject\n        var types = 
\"5m,1h,1d,all\".split(',');\n        for (let type of types) {\n          var d = info[type];\n          if (d === undefined)\n            continue;\n          var pending = d.pending || 0,\n            success = d.success || 0,\n            retry = d.retry || 0,\n            failed = d.failed || 0,\n            sum = d.task || pending + success + retry + failed;\n          d.task = sum;\n          d.title = \"\"+type+\" of \"+sum+\" tasks:\\n\"\n            +(type == \"all\"\n              ? \"pending(\"+(pending/sum*100).toFixed(1)+\"%): \\t\"+pending+\"\\n\"\n              : \"new(\"+(pending/sum*100).toFixed(1)+\"%): \\t\\t\"+pending+\"\\n\")\n            +\"success(\"+(success/sum*100).toFixed(1)+\"%): \\t\"+success+\"\\n\"\n            +\"retry(\"+(retry/sum*100).toFixed(1)+\"%): \\t\"+retry+\"\\n\"\n            +\"failed(\"+(failed/sum*100).toFixed(1)+\"%): \\t\"+failed;\n        }\n\n        projects_app.projects[project].paused = info['paused'];\n        projects_app.projects[project].time = info['5m_time'];\n        projects_app.projects[project].progress = info;\n      }\n    });\n  }\n\n  function update_queues() {\n    $.get('/queues', function(data) {\n      //console.log(data);\n      $('.queue_value').each(function(i, e) {\n        var attr = $(e).attr('title');\n        if (data[attr] !== undefined) {\n          $(e).text(data[attr]);\n        } else {\n          $(e).text('???');\n        }\n      });\n    });\n  }\n\n  // projects vue\n  var projects_map = {};\n  projects.forEach(function(p) {\n    p.paused = false;\n    p.time = {};\n    p.progress = {};\n    projects_map[p.name] = p;\n  });\n  var projects_app = new Vue({\n    el: '.projects',\n    data: {\n      projects: projects_map\n    },\n    ready: function() {\n      init_editable(this);\n      init_sortable(this);\n      update_counters();\n      window.setInterval(update_counters, 15*1000);\n      update_queues();\n      window.setInterval(update_queues, 15*1000);\n    },\n    
methods: {\n      project_run: function(project, event) {\n        $(\"#need-set-status-alert\").hide();\n        if (project.status != \"RUNNING\" && project.status != \"DEBUG\") {\n          $(\"#need-set-status-alert\").show();\n        }\n        \n        var _this = event.target;\n        $(_this).addClass(\"btn-warning\");\n        $.ajax({\n          type: \"POST\",\n          url: '/run',\n          data: {\n            project: project.name\n          },\n          success: function(data) {\n            $(_this).removeClass(\"btn-warning\");\n            if (!data.result) {\n              $(_this).addClass(\"btn-danger\");\n            }\n          },\n          error: function() {\n            $(_this).removeClass(\"btn-warning\").addClass(\"btn-danger\");\n          }\n        });\n      }\n    }\n  });\n});\n"
  },
  {
    "path": "pyspider/webui/static/src/index.less",
    "content": "/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */\n/* Author: Binux<i@binux.me> */\n/*         http://binux.me */\n/* Created on 2014-02-23 00:28:30 */\n\n@import \"variable\";\n\nh1 {\n  margin-top: 5px;\n}\n\nheader .alert {\n  position: absolute;;\n  width: 50rem;\n  left: 50%;\n  margin-left: -25rem;\n}\n\n.queue-info {\n  th, td {\n    text-align: center;\n    border: 1px solid #ddd;\n  }\n}\n\n[v-cloak] {\n  display: none;\n}\n\n.projects {\n  min-width: 850px;\n  border-top: 1px solid #ddd;\n  border-bottom: 1px solid #ddd;\n\n  .project-group {\n    width: 80px;\n  }\n\n  .project-name {\n    font-weight: bold;\n  }\n\n  .project-status {\n    width: 100px;\n  }\n  .project-status-span(@color) {\n    border: solid 1px darken(@color, 10%);\n    padding: 1px 5px 0 5px;\n    background: @color;\n    color: white;\n  }\n  .project-status>span {\n    .project-status-span(@gray-light);\n  }\n  span.status-TODO {\n    .project-status-span(@orange);\n  }\n  span.status-STOP {\n    .project-status-span(@red);\n  }\n  span.status-CHECKING {\n    .project-status-span(darken(@yellow, 10%));\n  }\n  span.status-DEBUG {\n    .project-status-span(@blue);\n  }\n  span.status-RUNNING {\n    .project-status-span(@green);\n  }\n  span.status-PAUSED {\n    .project-status-span(@gray);\n  }\n\n  .project-rate {\n    width: 110px;\n  }\n\n  .project-time {\n    width: 110px;\n  }\n  \n  th.project-progress {\n    position: relative;\n    span {\n      position: absolute;\n    }\n  }\n\n  td.project-progress {\n    position: relative;\n    min-width: 5%;\n    &.progress-all {\n      min-width: 10%;\n    }\n\n    .progress {\n      position: relative;\n      margin: 0;\n      background-color: #aaa;\n      .progress-text {\n        width: 100%;\n        text-align: center;\n        position: absolute;\n        font-weight: bold;\n        color: #fff;\n        pointer-events: none;\n      }\n      .progress-bar {\n        -webkit-transition: none;\n        
transition: none;\n      }\n    }\n  }\n\n  .project-actions {\n    width: 200px;\n  }\n}\n\n.global-btn {\n  margin-top: -5px;\n  padding: 10px 10px 10px 10px;\n\n  .create-btn-div {\n    float: right;\n  }\n\n  .active-btn-div {\n    float: left;\n  }\n}\n\n"
  },
  {
    "path": "pyspider/webui/static/src/result.less",
    "content": "/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */\n/* Author: Binux<i@binux.me> */\n/*         http://binux.me */\n/* Created on 2014-10-22 22:38:45 */\n\n@import \"variable\";\n\n.top-bar {\n  padding: 10px 15px 2px 15px;\n  height: 46px;\n  background-color: #f5f5f5;\n  border-bottom: 1px solid #ddd;\n  position: relative;\n  \n  h1 {\n    margin: 0 0 10px 0;\n    font-size: 18px;\n  }\n\n  .btn-group {\n    margin: 8px 10px 0 0;\n    position: absolute;\n    right: 0;\n    top: 0;\n\n    a.btn {\n    }\n  }\n}\n\n.pagination-wrap {\n  text-align: right;\n  padding-right: 15px;\n}\n\ntable {\n  border-bottom: 1px solid #ddd;\n\n  td {\n    word-break: break-all;\n  }\n}\n"
  },
  {
    "path": "pyspider/webui/static/src/splitter.js",
    "content": "// vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8:\n// Author: Binux<i@binux.me>\n//         http://binux.me\n// Created on 2014-02-23 01:35:35\n// from: https://github.com/jsbin/jsbin\n\n$.fn.splitter = function (_type) {\n  var $document = $(document),\n  $blocker = $('<div class=\"block\"></div>'),\n  $body = $('body');\n  // blockiframe = $blocker.find('iframe')[0];\n\n  var splitterSettings = JSON.parse(localStorage.getItem('splitterSettings') || '[]');\n  return this.each(function () {\n    var $el = $(this),\n    $originalContainer = $(this),\n    guid = $.fn.splitter.guid++,\n    $parent = $el.parent(),\n    type = _type || 'x',\n    $prev = type === 'x' ? $el.prevAll(':visible:first') : $el.nextAll(':visible:first'),\n    $handle = $('<div class=\"resize\"></div>'),\n    dragging = false,\n    width = $parent.width(),\n    parentOffset = $parent.offset(),\n    left = parentOffset.left,\n    top = parentOffset.top, // usually zero :(\n    props = {\n      x: {\n        display: 'block',\n        currentPos: $parent.offset().left,\n        multiplier: 1,\n        cssProp: 'left',\n        otherCssProp: 'right',\n        size: $parent.width(),\n        sizeProp: 'width',\n        moveProp: 'pageX',\n        init: {\n          top: 0,\n          bottom: 0,\n          width: 8,\n          'margin-left': '-4px',\n          height: '100%',\n          left: 'auto',\n          right: 'auto',\n          opacity: 0,\n          position: 'absolute',\n          cursor: 'ew-resize',\n          // 'border-top': '0',\n          'border-left': '1px solid rgba(218, 218, 218, 0.5)',\n          'z-index': 99999\n        }\n      },\n      y: {\n        display: 'block',\n        currentPos: $parent.offset().top,\n        multiplier: -1,\n        size: $parent.height(),\n        cssProp: 'bottom',\n        otherCssProp: 'top',\n        sizeProp: 'height',\n        moveProp: 'pageY',\n        init: {\n          top: 'auto',\n          cursor: 'ns-resize',\n       
   bottom: 'auto',\n          height: 8,\n          width: '100%',\n          left: 0,\n          right: 0,\n          opacity: 0,\n          position: 'absolute',\n          border: 0,\n          // 'border-top': '1px solid rgba(218, 218, 218, 0.5)',\n          'z-index': 99999\n        }\n      }\n    },\n    refreshTimer = null,\n    settings = splitterSettings[guid] || {};\n\n    var tracker = {\n      down: { x: null, y: null },\n      delta: { x: null, y: null },\n      track: false,\n      timer: null\n    };\n    $handle.bind('mousedown', function (event) {\n      tracker.down.x = event.pageX;\n      tracker.down.y = event.pageY;\n      tracker.delta = { x: null, y: null };\n      tracker.target = $handle[type == 'x' ? 'height' : 'width']() * 0.25;\n    });\n\n    $document.bind('mousemove', function (event) {\n      if (dragging) {\n        tracker.delta.x = tracker.down.x - event.pageX;\n        tracker.delta.y = tracker.down.y - event.pageY;\n        clearTimeout(tracker.timer);\n        tracker.timer = setTimeout(function () {\n          tracker.down.x = event.pageX;\n          tracker.down.y = event.pageY;\n        }, 250);\n        //disable change to y\n        //var targetType = type == 'x' ? 
'y' : 'x';\n        //if (Math.abs(tracker.delta[targetType]) > tracker.target) {\n          //$handle.trigger('change', targetType, event[props[targetType].moveProp]);\n          //tracker.down.x = event.pageX;\n          //tracker.down.y = event.pageY;\n        //}\n      }\n    });\n\n    function moveSplitter(pos) {\n      if (type === 'y') {\n        pos -= top;\n      }\n      var v = pos - props[type].currentPos,\n      split = 100 / props[type].size * v,\n      delta = (pos - settings[type]) * props[type].multiplier,\n      prevSize = $prev[props[type].sizeProp](),\n      elSize = $el[props[type].sizeProp]();\n\n      if (type === 'y') {\n        split = 100 - split;\n      }\n\n      // if prev panel is too small and delta is negative, block\n      if (prevSize < 100 && delta < 0) {\n        // ignore\n      } else if (elSize < 100 && delta > 0) {\n        // ignore\n      } else {\n        // allow sizing to happen\n        $el.css(props[type].cssProp, split + '%');\n        $prev.css(props[type].otherCssProp, (100 - split) + '%');\n        var css = {};\n        css[props[type].cssProp] = split + '%';\n        $handle.css(css);\n        settings[type] = pos;\n        splitterSettings[guid] = settings;\n        localStorage.setItem('splitterSettings', JSON.stringify(splitterSettings));\n\n        // wait until animations have completed!\n        if (moveSplitter.timer) clearTimeout(moveSplitter.timer);\n        moveSplitter.timer = setTimeout(function () {\n          $document.trigger('sizeeditors');\n        }, 120);\n      }\n    }\n\n    function resetPrev() {\n      $prev = type === 'x' ? 
$handle.prevAll(':visible:first') : $handle.nextAll(':visible:first');\n    }\n\n    $document.bind('mouseup touchend', function () {\n      if (dragging) {\n        dragging = false;\n        $handle.trigger('resize-end');\n        $blocker.remove();\n        // $handle.css( 'opacity', '0');\n        $body.removeClass('dragging');\n      }\n    }).bind('mousemove touchmove', function (event) {\n      if (dragging) {\n        moveSplitter(event[props[type].moveProp] || event.originalEvent.touches[0][props[type].moveProp]);\n      }\n    });\n\n    $blocker.bind('mousemove touchmove', function (event) {\n      if (dragging) {\n        moveSplitter(event[props[type].moveProp] || event.originalEvent.touches[0][props[type].moveProp]);\n      }\n    });\n\n    $handle.bind('mousedown touchstart', function (e) {\n      dragging = true;\n      $handle.trigger('resize-start');\n      $body.append($blocker).addClass('dragging');\n      props[type].size = $parent[props[type].sizeProp]();\n      props[type].currentPos = 0; // is this really required then?\n\n      resetPrev();\n      e.preventDefault();\n    });\n\n    /*\n       .hover(function () {\n       $handle.css('opacity', '1');\n       }, function () {\n       if (!dragging) {\n       $handle.css('opacity', '0');\n       }\n       })\n       */\n\n    $handle.bind('fullsize', function(event, panel) {\n      if (panel === undefined) {\n        panel = 'prev';\n      }\n      var split = 0;\n      if (panel === 'prev') {\n        split = 100;\n      }\n      $el.css(props[type].cssProp, split + '%');\n      $prev.css(props[type].otherCssProp, (100 - split) + '%');\n      $handle.hide();\n    });\n\n    $handle.bind('init', function (event, x) {\n      $handle.css(props[type].init);\n      props[type].size = $parent[props[type].sizeProp]();\n      resetPrev();\n\n      // can only be read at init\n      top = $parent.offset().top;\n\n      $blocker.css('cursor', type == 'x' ? 
'ew-resize' : 'ns-resize');\n\n      if (type == 'y') {\n        $el.css('border-right', 0);\n        $prev.css('border-left', 0);\n        $prev.css('border-top', '2px solid #ccc');\n      } else {\n        // $el.css('border-right', '1px solid #ccc');\n        $el.css('border-top', 0);\n        // $prev.css('border-right', '2px solid #ccc');\n      }\n\n      if ($el.is(':hidden')) {\n        $handle.hide();\n      } else {\n        if ($prev.length) {\n          $el.css('border-' + props[type].cssProp, '1px solid #ccc');\n        } else {\n          $el.css('border-' + props[type].cssProp, '0');\n        }\n        moveSplitter(x !== undefined ? x : settings[type] || $el.offset()[props[type].cssProp]);\n      }\n    }); //.trigger('init', settings.x || $el.offset().left);\n\n    $handle.bind('change', function (event, toType, value) {\n      $el.css(props[type].cssProp, '0');\n      $prev.css(props[type].otherCssProp, '0');\n      $el.css('border-' + props[type].cssProp, '0');\n\n      if (toType === 'y') {\n        // 1. drop inside of a new div that encompases the elements\n        $el = $el.find('> *');\n        $handle.appendTo($prev);\n        $el.appendTo($prev);\n        $prev.css('height', '100%');\n        $originalContainer.hide();\n        $handle.css('margin-left', 0);\n        $handle.css('margin-top', 5);\n\n        $handle.addClass('vertical');\n\n        delete settings.x;\n\n        $originalContainer.nextAll(':visible:first').trigger('init');\n        // 2. 
change splitter to the right to point to new block div\n      } else {\n        $el = $prev;\n        $prev = $tmp;\n\n        $el.appendTo($originalContainer);\n        $handle.insertBefore($originalContainer);\n        $handle.removeClass('vertical');\n        $el.css('border-top', 0);\n        $el = $originalContainer;\n        $originalContainer.show();\n        $handle.css('margin-top', 0);\n        $handle.css('margin-left', -4);\n        delete settings.y;\n\n        setTimeout(function() {\n          $originalContainer.nextAll(':visible:first').trigger('init');\n        }, 0);\n      }\n\n      resetPrev();\n\n      type = toType;\n\n      // if (type == 'y') {\n      // FIXME $prev should check visible\n      var $tmp = $el;\n      $el = $prev;\n      $prev = $tmp;\n      // } else {\n\n      // }\n\n      $el.css(props[type].otherCssProp, '0');\n      $prev.css(props[type].cssProp, '0');\n      // TODO\n      // reset top/bottom positions\n      // reset left/right positions\n\n      if ($el.is(':visible')) {\n        // find all other handles and recalc their height\n        if (type === 'y') {\n          var otherhandles = $el.find('.resize');\n\n          otherhandles.each(function (i) {\n            // find the top of the\n            var $h = $(this);\n            if (this === $handle[0]) {\n              // ignore\n            } else {\n              // TODO change to real px :(\n              $h.trigger('init', 100 / (otherhandles - i - 1));\n            }\n          });\n        }\n        $handle.trigger('init', value || $el.offset()[props[type].cssProp] || props[type].size / 2);\n      }\n    });\n\n\n    $prev.css('width', 'auto');\n    $prev.css('height', 'auto');\n    $el.data('splitter', $handle);\n    $el.before($handle);\n\n    // if (settings.y) {\n    //   $handle.trigger('change', 'y');\n    // }\n  });\n};\n\n$.fn.splitter.guid = 0;\n\n"
  },
  {
    "path": "pyspider/webui/static/src/task.less",
    "content": "/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */\n/* Author: Binux<i@binux.me> */\n/*         http://binux.me */\n/* Created on 2014-07-16 19:20:30 */\n\n@import \"variable\";\n\n.base-info {\n  padding: 10px 15px 2px 15px;\n  background-color: #f5f5f5;\n  border-bottom: 1px solid #ddd;\n}\n\n.more-info {\n  padding: 10px 15px;\n}\n\n.more-info dd {\n  display: block;\n  font-family: monospace;\n  white-space: pre;\n  word-break: break-all;\n  word-wrap: break-word;\n  margin: 1em 0px;\n}\n\n.status_mix(@color: lighten(black, 50%)) {\n  border: solid 1px darken(@color, 10%);\n  padding: 1px 5px 0 5px;\n  background: @color;\n  color: white;\n}\n.status {\n  &-1 {\n    .status_mix(@blue);\n  }\n  &-2 {\n    .status_mix(@green);\n  }\n  &-3 {\n    .status_mix(@red);\n  }\n  &-4 {\n    .status_mix;\n  }\n}\n\n.url {\n  font-size: 120%;\n  text-decoration: underline;\n}\n\n.callback {\n  color: @orange;\n  font-weight: bold;\n\n  &:hover, &:focus {\n    color: darken(@orange, 10%);\n  }\n}\n\ndt .glyphicon-ok {\n  color: @green;\n}\ndt .glyphicon-remove {\n  color: @red;\n}\n"
  },
  {
    "path": "pyspider/webui/static/src/tasks.less",
    "content": "/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */\n/* Author: Binux<i@binux.me> */\n/*         http://binux.me */\n/* Created on 2014-07-18 23:20:46 */\n\n@import \"variable\";\n@import \"task\";\n\n.tasks {\n  margin: 0;\n  padding: 0;\n  list-style-type: none;\n\n  li {\n    .base-info;\n\n    &:nth-child(even) {\n      background-color: white;\n    }\n  }\n\n  .url {\n    display: inline-block;\n    vertical-align: bottom;\n    max-width: 40em;\n    overflow: hidden;\n    white-space: nowrap;\n    text-overflow: ellipsis;\n  }\n  \n  .update-time {\n    font-weight: bold;\n  }\n}\n"
  },
  {
    "path": "pyspider/webui/static/src/variable.less",
    "content": "/* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */\n/* Author: Binux<i@binux.me> */\n/*         http://binux.me */\n/* Created on 2014-07-16 19:18:30 */\n\n// colors\n@gray-darker:            lighten(#000, 13.5%); // #222\n@gray-dark:              lighten(#000, 20%);   // #333\n@gray:                   lighten(#000, 33.5%); // #555\n@gray-light:             lighten(#000, 60%);   // #999\n@gray-lighter:           lighten(#000, 93.5%); // #eee\n\n@blue: #428bca;\n@green: #5cb85c;\n@blue-light: #5bc0de;\n@orange: #f0ad4e;\n@yellow: #ffe543;\n@red: #d9534f;\n"
  },
  {
    "path": "pyspider/webui/static/webpack.config.js",
    "content": "var webpack = require(\"webpack\");\nvar ExtractTextPlugin = require(\"extract-text-webpack-plugin\");\n\nmodule.exports = {\n  entry: {\n    index: \"./src/index\",\n    debug: \"./src/debug\",\n    result: \"./src/result.less\",\n    task: \"./src/task.less\",\n    tasks: \"./src/tasks.less\",\n  },\n  output: {\n    //path: \"./dist\",\n    filename: \"[name].min.js\"\n  },\n  module: {\n    loaders: [\n      { test: /\\.js$/, loader: \"babel-loader\" },\n      { test: /\\.less$/, loader: ExtractTextPlugin.extract(\"style-loader\", \"css-loader?sourceMap!less-loader?sourceMap\") }\n    ]\n  },\n  devtool: 'source-map',\n  plugins: [\n    new ExtractTextPlugin(\"[name].min.css\"),\n    new webpack.optimize.UglifyJsPlugin({ compress: { warnings: false } }),\n  ]\n}\n"
  },
  {
    "path": "pyspider/webui/task.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-07-16 15:30:57\n\nimport socket\nfrom flask import abort, render_template, request, json\n\nfrom pyspider.libs import utils\nfrom .app import app\n\n\n@app.route('/task/<taskid>')\ndef task(taskid):\n    if ':' not in taskid:\n        abort(400)\n    project, taskid = taskid.split(':', 1)\n\n    taskdb = app.config['taskdb']\n    task = taskdb.get_task(project, taskid)\n\n    if not task:\n        abort(404)\n    resultdb = app.config['resultdb']\n    result = {}\n    if resultdb:\n        result = resultdb.get(project, taskid)\n\n    return render_template(\"task.html\", task=task, json=json, result=result,\n                           status_to_string=app.config['taskdb'].status_to_string)\n\n\n@app.route('/task/<taskid>.json')\ndef task_in_json(taskid):\n    if ':' not in taskid:\n        return json.jsonify({'code': 400, 'error': 'bad project:task_id format'})\n    project, taskid = taskid.split(':', 1)\n\n    taskdb = app.config['taskdb']\n    task = taskdb.get_task(project, taskid)\n\n    if not task:\n        return json.jsonify({'code': 404, 'error': 'not found'})\n    task['status_string'] = app.config['taskdb'].status_to_string(task['status'])\n    return json.jsonify(task)\n\n\n@app.route('/tasks')\ndef tasks():\n    rpc = app.config['scheduler_rpc']\n    taskdb = app.config['taskdb']\n    project = request.args.get('project', \"\")\n    limit = int(request.args.get('limit', 100))\n\n    try:\n        updatetime_tasks = rpc.get_active_tasks(project, limit)\n    except socket.error as e:\n        app.logger.warning('connect to scheduler rpc error: %r', e)\n        return 'connect to scheduler error', 502\n\n    tasks = {}\n    result = []\n    for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]):\n        key = '%(project)s:%(taskid)s' % task\n        
task['updatetime'] = updatetime\n        if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE:\n            result.append(tasks[key])\n        tasks[key] = task\n    result.extend(tasks.values())\n\n    return render_template(\n        \"tasks.html\",\n        tasks=result,\n        status_to_string=taskdb.status_to_string\n    )\n\n\n@app.route('/active_tasks')\ndef active_tasks():\n    rpc = app.config['scheduler_rpc']\n    taskdb = app.config['taskdb']\n    project = request.args.get('project', \"\")\n    limit = int(request.args.get('limit', 100))\n\n    try:\n        tasks = rpc.get_active_tasks(project, limit)\n    except socket.error as e:\n        app.logger.warning('connect to scheduler rpc error: %r', e)\n        return '{}', 502, {'Content-Type': 'application/json'}\n\n    result = []\n    for updatetime, task in tasks:\n        task['updatetime'] = updatetime\n        task['updatetime_text'] = utils.format_date(updatetime)\n        if 'status' in task:\n            task['status_text'] = taskdb.status_to_string(task['status'])\n        result.append(task)\n\n    return json.dumps(result), 200, {'Content-Type': 'application/json'}\n\napp.template_filter('format_date')(utils.format_date)\n"
  },
  {
    "path": "pyspider/webui/templates/debug.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"utf-8\">\n    <title>{{ project_name }} - Debugger - pyspider</title>\n    <!--[if lt IE 9]>\n      <script src=\"http://html5shim.googlecode.com/svn/trunk/html5.js\"></script>\n    <![endif]-->\n\n    <meta name=\"description\" content=\"pyspider - debugger - {{ project_name }}\">\n    <meta name=\"author\" content=\"binux\">\n\n    <link href=\"{{ url_for('cdn', path='codemirror/5.20.2/codemirror.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('cdn', path='font-awesome/4.0.3/css/font-awesome.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/dialog/dialog.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/lint/lint.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('static', filename='debug.min.css') }}\" rel=\"stylesheet\">\n\n    <script src=\"{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='jsonlint/1.6.0/jsonlint.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/codemirror.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/mode/xml/xml.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/mode/css/css.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/mode/javascript/javascript.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/mode/htmlmixed/htmlmixed.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/mode/python/python.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/search/search.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/search/searchcursor.min.js') }}\"></script>\n    <script src=\"{{ 
url_for('cdn', path='codemirror/5.20.2/addon/dialog/dialog.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/selection/active-line.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/runmode/runmode.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/lint/lint.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/5.20.2/addon/lint/json-lint.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='codemirror/2.36.0/formatting.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='URI.js/1.11.2/URI.min.js') }}\"></script>\n  </head>\n\n  <body>\n    <section id=\"control\">\n      <div class=\"title pull-left\"><a href=\"/\">pyspider</a> &gt; {{ project_name }}</div>\n      <div class=\"pull-right\">\n        <a href=\"http://docs.pyspider.org/\" target=\"_blank\">Documentation</a>\n        <span class=\"webdav-btn\">WebDAV Mode</span>\n      </div>\n    </section>\n    <section id=\"editarea\">\n      <div id=\"left-area\" class=\"debug-panel\" style=\"right: 50%\">\n        <div id=\"task-panel\">\n          <div id=\"task-editor\" class=\"editor\">\n            <div id=\"run-task-btn\">run</div>\n            <div id=\"undo-redo-btn-group\">\n              <a href=\"javascript:;\" id=\"undo-btn\"> &lt; </a>|<a href=\"javascript:;\" id=\"redo-btn\">&gt; </a>\n              <span id=\"history-wrap\" style=\"display: none;\">|<a target=_blank id=\"history-link\">history</a></span>\n            </div>\n          </div>\n          <div id=\"python-log\" style=\"display: none;\">\n            <pre style=\"display: none;\"></pre>\n            <div id=\"python-log-show\"></div>\n          </div>\n          <div id=\"debug-tabs\">\n            <div id=\"tab-web\" class=\"tab\" style=\"display: none;\">\n              <div id=\"css-selector-helper\">\n                <input class=\"copy-selector-input\" />\n      
          <button class=\"btn copy-selector\"><i class=\"fa fa-clipboard\" title=\"copy css selector\"></i></button>\n                <button class=\"btn add-to-editor\"><i class=\"fa fa-arrow-right\" title=\"add to editor\"></i></button>\n              </div>\n              <div class=\"iframe-box\"></div>\n            </div>\n            <div id=\"tab-html\" class=\"tab\" style=\"display: none;\"><pre class=\"cm-s-default\"></pre></div>\n            <div id=\"tab-follows\" class=\"tab\">\n              {# <div class=\"newtask\">\n                <span class=\"task-callback\">__callback__</span> &gt; <span class=\"task-url\">__url__</span>\n                <div class=\"task-run\"><i class=\"fa fa-play\"></i></div>\n                <div class=\"task-more\"> <i class=\"fa fa-ellipsis-h\"></i> </div>\n              </div> #}\n            </div>\n            <div id=\"tab-messages\" class=\"tab\" style=\"display: none;\">\n              <pre class=\"cm-s-default\"></pre>\n            </div>\n          </div>\n        </div>\n        <ul id=\"tab-control\">\n          <li data-id=\"tab-messages\">messages<span class=\"num\" style=\"display: none;\"></span></li>\n          <li data-id=\"tab-follows\">follows<span class=\"num\" style=\"display: none;\"></span></li>\n          <li data-id=\"tab-html\">html</li>\n          <li data-id=\"tab-web\" class=\"active\">web</li>\n          <li id=\"J-enable-css-selector-helper\">enable css selector helper</li>\n        </ul>\n        <div class=\"overlay\" style=\"display: none;\"></div>\n      </div>\n\n      <div id=\"right-area\" class=\"debug-panel\" style=\"left: 50%\">\n        <div id=\"python-editor\" class=\"editor focus\">\n          <div id=\"save-task-btn\">save</div>\n        </div>\n        <div class=\"overlay\" style=\"display: none;\"></div>\n      </div>\n    </section>\n\n    <script>\n      var task_content = {{ task | tojson | tojson | safe }};\n      var script_content = {{ script | tojson | safe }};\n    
</script>\n    <script src=\"{{ url_for('static', filename='debug.min.js') }}\"></script>\n  </body>\n</html>\n<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8 syntax=htmldjango: -->\n\n"
  },
  {
    "path": "pyspider/webui/templates/index.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"utf-8\">\n    <title>Dashboard - pyspider</title>\n    <!--[if lt IE 9]>\n      <script src=\"http://html5shim.googlecode.com/svn/trunk/html5.js\"></script>\n    <![endif]-->\n\n    <meta name=\"description\" content=\"pyspider dashboard\">\n    <meta name=\"author\" content=\"binux\">\n    <link href=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('cdn', path='x-editable/1.5.0/bootstrap3-editable/css/bootstrap-editable.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('cdn', path='sortable/0.6.0/css/sortable-theme-bootstrap.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('static', filename='index.min.css') }}\" rel=\"stylesheet\">\n\n    <script src=\"{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}\"></script>\n  </head>\n\n  <body>\n    <header>\n      <div id=\"need-set-status-alert\" class=\"alert alert-danger alert-dismissible\" style=\"display:none;\" role=\"alert\">\n        <button type=\"button\" class=\"close\" data-dismiss=\"alert\" aria-label=\"Close\"><span aria-hidden=\"true\">&times;</span></button>\n        Project is not started, please set status to RUNNING or DEBUG.\n      </div>\n      <h1>pyspider dashboard</h1>\n      <table class=\"table queue-info\">\n        <tr>\n          <th>scheduler</th>\n          <td class=\"queue_value\" title=\"scheduler2fetcher\">???</td>\n          <th>fetcher</th>\n          <td class=\"queue_value\" title=\"fetcher2processor\">???</td>\n          <th>processor</th>\n          <td class=\"queue_value\" title=\"processor2result\">???</td>\n          <th>result_worker</th>\n        </tr>\n        <tr>\n          <td style=\"border-width: 0px 1px;\"></td>\n          <td colspan=3></td>\n          <td style=\"border-width: 0px 1px;\"></td>\n          <td colspan=2 style=\"border-width: 0px;\"></td>\n        </tr>\n        
<tr>\n          <td style=\"border-width: 0px 0px 1px 1px\"></td>\n          <td colspan=3 style=\"border-width: 1px 0px;\">\n            <span class=\"queue_value\" title=\"newtask_queue\">???</span>\n            + <span class=\"queue_value\" title=\"status_queue\">???</span>\n          </td>\n          <td style=\"border-width: 0px 1px 1px 0px;\"></td>\n          <td colspan=2 style=\"border-width: 0px;\"></td>\n        </tr>\n      </table>\n    </header>\n    <section>\n      <div class=\"global-btn clearfix\">\n        <div class=\"create-btn-div\">\n          <button class=\"project-create btn btn-default btn-primary\" data-toggle=\"modal\" data-target=\"#create-project-modal\">Create</button>\n        </div>\n\n        <div class=\"active-btn-div\">\n          {% if config.scheduler_rpc is not none %}\n            <a class=\"btn btn-default btn-info\" href='/tasks' target=_blank>Recent Active Tasks</a>\n          {% endif %}\n        </div>\n\n        <div class=\"modal fade\" id=\"create-project-modal\">\n          <div class=\"modal-dialog\">\n            <div class=\"modal-content\">\n              <div class=\"modal-header\">\n                <button type=\"button\" class=\"close\" data-dismiss=\"modal\" aria-label=\"Close\"><span aria-hidden=\"true\">&times;</span></button>\n                <h4 class=\"modal-title\">Create New Project</h4>\n              </div>\n              <form class=\"form-horizontal\" method=\"POST\">\n                <div class=\"modal-body\">\n                  <div class=\"form-group\">\n                    <label class=\"col-sm-3 control-label\" for=\"project-name\">Project Name</label>\n                    <div class=\"col-sm-9\">\n                      <input class=\"form-control\" type=\"text\" name=\"project-name\" autocomplete=\"off\">\n                      <span class=\"help-block\" style=\"display: none;\">[a-zA-Z0-9_]+</span>\n                    </div>\n                  </div>\n                  <div 
class=\"form-group\">\n                    <label class=\"col-sm-3 control-label\" for=\"start-urls\">Start URL(s)</label>\n                    <div class=\"col-sm-9\">\n                      <input class=\"form-control\" type=\"text\" name=\"start-urls\">\n                    </div>\n                  </div>\n                  <div class=\"form-group\">\n                    <label class=\"col-sm-3 control-label\" for=\"script-mode\">Mode</label>\n                    <div class=\"col-sm-9\">\n                      <div class=\"btn-group\" data-toggle=\"buttons\">\n                        <label class=\"btn btn-default active\">\n                          <input type=\"radio\" name=\"script-mode\" id=\"mode-script\" autocomplete=\"off\" value=\"script\" checked> Script\n                        </label>\n                        <label class=\"btn btn-default\">\n                          <input type=\"radio\" name=\"script-mode\" id=\"mode-slime\" autocomplete=\"off\" value=\"slime\"> Slime\n                        </label>\n                      </div>\n                    </div>\n                  </div>\n                </div>\n                <div class=\"modal-footer\">\n                  <button type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Close</button>\n                  <button type=\"submit\" class=\"btn btn-primary\">Create</button>\n                </div>\n              </form>\n            </div>\n          </div>\n        </div>\n      </div>\n      <table class=\"table sortable-theme-bootstrap projects\">\n        <thead>\n          <tr>\n            <th>group</th>\n            <th>project name</th>\n            <th>status</th>\n            <th data-type=\"num\">rate/burst</th>\n            <th data-type=\"num\">avg time</th>\n            <th class=\"project-progress\" data-type=\"num\">&nbsp;<span>progress</span></th>\n            <th data-type=\"num\">&nbsp;</th>\n            <th data-type=\"num\">&nbsp;</th>\n            <th 
data-type=\"num\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</th>\n            <th data-type=\"num\">actions</th>\n          </tr>\n        </thead>\n        <tbody>\n        {% raw %}\n          <tr v-cloak v-for=\"project in projects\" data-name=\"{{* project.name }}\">\n            <td class=\"project-group\"><span>{{ project.group }}</span></td>\n            <td class=\"project-name\"><a href=\"/debug/{{* project.name }}\">{{* project.name }}</a></td>\n            <td class=\"project-status\">\n              <span class=\"status-{{ project.paused ? 'PAUSED' : project.status }}\" :data-value=\"project.paused ? 'PAUSED' : project.status\">\n                {{ project.paused ? 'PAUSED' : project.status }}\n              </span>\n            </td>\n            <td class=\"project-rate\" :data-value=\"project.rate\"><span>{{ project.rate }}/{{ project.burst }}</span></td>\n            <td class=\"project-time\" :data-value=\"project.time.fetch_time + project.time.process_time\">\n              <span v-show=\"project.time.fetch_time\">{{ (project.time.fetch_time * 1000).toFixed(1) }}+{{ (project.time.process_time * 1000).toFixed(2) }}</span>\n            </td>\n            <td v-for=\"type in '5m,1h,1d,all'.split(',')\"\n                class=\"project-progress progress-{{* type }}\"\n                :title=\"project.progress[type].title\"\n                :data-value=\"project.progress[type].task\">\n              <div class=\"progress\">\n                <div class=\"progress-text\">{{* type }}<span v-show=\"project.progress[type].task\">: {{ project.progress[type].task }}</span></div>\n                <div class=\"progress-bar progress-pending\"\n                     :style=\"{ width: project.progress[type].pending/project.progress[type].task*100 + '%' }\"></div>\n                <div class=\"progress-bar progress-bar-success progress-success\"\n                     :style=\"{ width: project.progress[type].success/project.progress[type].task*100 + '%' 
}\"></div>\n                <div class=\"progress-bar progress-bar-warning progress-retry\"\n                     :style=\"{ width: project.progress[type].retry/project.progress[type].task*100 + '%' }\"></div>\n                <div class=\"progress-bar progress-bar-danger progress-failed\"\n                     :style=\"{ width: project.progress[type].failed/project.progress[type].task*100 + '%' }\"\n                ></div>\n              </div>\n            </td>\n            {% endraw %}\n\n            {% raw %}\n            <td class=\"project-actions\" data-value=\"{{ project.updatetime }}\">\n              {% endraw %}\n              # if config.scheduler_rpc is not none:\n              {% raw %}\n              <button class=\"project-run btn btn-default btn-xs\" @click=\"project_run(project, $event)\">Run</button>\n              <a class=\"btn btn-default btn-xs\" href=\"/tasks?project={{ project.name }}\" target=_blank>Active Tasks</a>\n              {% endraw %}\n              # endif\n              # if config.resultdb:\n              {% raw %}\n              <a class=\"btn btn-default btn-xs\" href=\"/results?project={{ project.name }}\" target=_blank>Results</a>\n              {% endraw %}\n              # endif\n            </td>\n          </tr>\n        </tbody>\n      </table>\n    </section>\n    <script>\n      // json projects data for vue\n      var projects = {{ projects | tojson | safe }};\n    </script>\n    <script src=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='x-editable/1.5.0/bootstrap3-editable/js/bootstrap-editable.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='sortable/0.6.0/js/sortable.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='vue/1.0.26/vue.min.js') }}\"></script>\n    <script src=\"{{ url_for('static', filename='index.min.js') }}\"></script>\n  </body>\n</html>\n<!-- vim: set et sw=2 ts=2 sts=2 ff=unix 
fenc=utf8 syntax=htmldjango: -->\n\n"
  },
  {
    "path": "pyspider/webui/templates/result.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"utf-8\">\n    <title>Results - {{ project }} - pyspider</title>\n    <!--[if lt IE 9]>\n      <script src=\"http://html5shim.googlecode.com/svn/trunk/html5.js\"></script>\n    <![endif]-->\n\n    <meta name=\"description\" content=\"results of {{ project }}\">\n    <meta name=\"author\" content=\"binux\">\n    <link href=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('static', filename='result.min.css') }}\" rel=\"stylesheet\">\n\n    <script src=\"{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}\"></script>\n  </head>\n\n  <body>\n    <div class=\"top-bar\">\n      <h1>{{ project }} - Results</h1>\n      <div class=\"btn-group\">\n        <a href=\"/results/dump/{{ project }}.json\"\n          target=\"_blank\" class=\"btn btn-default btn-sm\">\n          <span class=\"glyphicon glyphicon-download-alt\"></span>\n          JSON</a>\n        <a href=\"/results/dump/{{ project }}.txt\"\n          target=\"_blank\" class=\"btn btn-default btn-sm\">URL-JSON</a>\n        <a href=\"/results/dump/{{ project }}.csv\"\n          target=\"_blank\" class=\"btn btn-default btn-sm\">CSV</a>\n      </div>\n    </div>\n    # set common_fields, results = result_formater(results)\n    <table class=\"table table-condensed table-striped\">\n      <thead>\n        <th>url</th>\n        <th></th>\n        # for field in common_fields|sort\n        <th>\n          {{ field }}\n        </th>\n        # endfor\n        <th>\n          ...\n        </th>\n      </thead>\n      <tbody>\n        # for result in results\n        <tr>\n          <td>\n            <a class=url href=\"/task/{{ project }}:{{ result.taskid }}\" target=_blank>{{ result.url }}</a>\n          </td>\n          <td>\n            <a class=open-url 
href=\"{{ result.url }}\" target=\"_blank\"><span class=\"glyphicon glyphicon-new-window\"></span></a>\n          </td>\n          # for field in common_fields|sort\n          <td>{{ json.dumps(result.result_formated[field], ensure_ascii=False) | truncate(100, True) }}</td>\n          # endfor\n          <td>\n            {{ json.dumps(result.others, ensure_ascii=False) | truncate(100, True) }}\n          </td>\n        # endfor\n      </tbody>\n    </table>\n\n    <div class=\"pagination-wrap\">\n      <ul class=\"pagination\">\n        # set current_page = int(offset/limit) + (1 if offset%limit else 0)\n        # set count = count if count is not none else 0\n        # set total_page = int(count/limit) + (1 if count%limit else 0)\n        <li class=\"{{ \"disabled\" if current_page - 1 <= 0 else \"\" }}\">\n          <a href=\"{% if current_page>1 %}/results?project={{ project }}&offset={{ (current_page-1)*limit }}&limit={{ limit }}{% endif %}\">&laquo;</a>\n        </li>\n        # set prev = 0\n        # for i in range(0, total_page):\n        # if abs(i-0) < 2 or abs(i-total_page) < 3 or -2 < i-current_page < 5:\n          # set prev = i\n          <li class=\"{% if i == current_page %}active{% endif %}\">\n            <a href=\"/results?project={{ project }}&offset={{ i*limit }}&limit={{ limit }}\">{{ i + 1 }}</a>\n          </li>\n        # elif prev == i-1:\n        <li class=\"disabled\"><a>…</a></li>\n        # endif\n        # endfor\n        <li class=\"{{ \"disabled\" if current_page + 1 >= total_page else \"\" }}\">\n          <a href=\"{% if current_page+1<total_page %}/results?project={{ project }}&offset={{ (current_page+1)*limit }}&limit={{ limit }}{% endif %}\">&raquo;</a>\n        </li>\n      </ul>\n    </div>\n  </body>\n</html>\n"
  },
  {
    "path": "pyspider/webui/templates/task.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"utf-8\">\n    <title>Task - {{ task.project }}:{{ task.taskid }} - pyspider</title>\n    <!--[if lt IE 9]>\n      <script src=\"http://html5shim.googlecode.com/svn/trunk/html5.js\"></script>\n    <![endif]-->\n\n    <meta name=\"description\" content=\"pyspider taskboard of {{ task.project }}:{{task.taskid }}\">\n    <meta name=\"author\" content=\"binux\">\n    <link href=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('static', filename='task.min.css') }}\" rel=\"stylesheet\">\n\n    <script src=\"{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}\"></script>\n  </head>\n\n  <body>\n      <div class=base-info>\n        <p>\n          <span class=\"status status-{{ task.status }}\">{{ status_to_string(task.status) }}</span>\n          <a class=callback href=\"/debug/{{ task.project }}?taskid={{ task.taskid }}\">{{ task.project }}.{{ task.process.callback }}</a>\n          &gt;\n          <a class=url href=\"{{ task.url }}\" target=_blank>{{ task.url }}</a>\n          {% if task.status in (2, 3, 4) %}\n          (<span class=last-crawl>{{ task.lastcrawltime | format_date }}</span> crawled )\n          {% else %}\n          (<span class=update-time>{{ task.updatetime | format_date }}</span> updated )\n          {% endif %}\n        </p>\n      </div>\n      <div class=more-info>\n        <dl>\n          <dt>taskid</dt>\n          <dd>{{ task.taskid }}</dd>\n          <dt>lastcrawltime</dt>\n          <dd>{{ task.lastcrawltime }} ({{ task.lastcrawltime | format_date }})</dd>\n          <dt>updatetime</dt>\n          <dd>{{ task.updatetime }} ({{ task.updatetime | format_date }})</dd>\n          # if task.schedule and task.schedule.exetime\n          <dt>exetime</dt>\n          <dd>{{ 
task.schedule.exetime }} ({{ task.schedule.exetime | format_date }})</dd>\n          # endif\n\n          # if task.track and task.track.fetch\n          <dt>\n            track.fetch\n            <span class=\"glyphicon glyphicon-{{ \"ok\" if task.track.fetch.ok else \"remove\" }}\"></span>\n            {{ (task.track.fetch.time * 1000) | round(2) }}ms\n          </dt>\n          <dd>{{ json.dumps(task.track.fetch, indent=2, ensure_ascii=False) }}</dd>\n          # endif\n\n          # if task.track and task.track.process\n          <dt>\n            track.process\n            <span class=\"glyphicon glyphicon-{{ \"ok\" if task.track.process.ok else \"remove\" }}\"></span>\n            {{ (task.track.process.time * 1000) | round(2) }}ms\n            # if task.track.process.follows\n              +{{ task.track.process.follows | int }}\n            # endif\n          </dt>\n          <dd>\n            #- if task.track.process.exception\n            {{- task.track.process.exception or '' }}\n            # endif\n            #- if task.track.process.logs\n              {{- task.track.process.logs or '' }}\n            # endif\n            {{- json.dumps(task.track.process, indent=2, ensure_ascii=False) -}}\n          </dd>\n          # endif\n        </dl>\n        <dl>\n          #- set not_shown_keys = ('status', 'url', 'project', 'taskid', 'lastcrawltime', 'updatetime', 'track', )\n          #- for key, value in task.items() if key not in not_shown_keys\n          <dt>{{ key }}</dt>\n          <dd>{{ json.dumps(value, indent=2, ensure_ascii=False) if value is mapping else value }}</dd>\n          #- endfor\n        </dl>\n        # if result and result.get('result'):\n        <dl>\n          <dt>result</dt>\n          <dd>{{ json.dumps(result['result'], indent=2, ensure_ascii=False) }}</dd>\n        </dl>\n        # endif\n      </div>\n  </body>\n</html>\n<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: -->\n\n"
  },
  {
    "path": "pyspider/webui/templates/tasks.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"utf-8\">\n    <title>Tasks - pyspider</title>\n    <!--[if lt IE 9]>\n      <script src=\"http://html5shim.googlecode.com/svn/trunk/html5.js\"></script>\n    <![endif]-->\n\n    <meta name=\"description\" content=\"last actived tasks\">\n    <meta name=\"author\" content=\"binux\">\n    <link href=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/css/bootstrap.min.css') }}\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('static', filename='tasks.min.css') }}\" rel=\"stylesheet\">\n\n    <script src=\"{{ url_for('cdn', path='jquery/1.11.0/jquery.min.js') }}\"></script>\n    <script src=\"{{ url_for('cdn', path='twitter-bootstrap/3.1.1/js/bootstrap.min.js') }}\"></script>\n  </head>\n\n  <body>\n    <ol class=tasks>\n      {% for task in tasks | sort(reverse=True, attribute='updatetime') %}\n      <li class=task>\n        {% if task.status %}\n          <span class=\"status status-{{ task.status }}\">{{ status_to_string(task.status) }}</span>\n        {% elif task.track %}\n        <span class=\"status status-3\">\n          {% set fetchok = task.track.fetch and task.track.fetch.ok %}\n          {% set processok = task.track.process and task.track.process.ok %}\n          {%- if not fetchok -%}\n          FETCH_ERROR\n          {%- elif not processok -%}\n          PROCESS_ERROR\n          {%- endif -%}\n        </span>\n        {% else %}\n          <span class=\"status status-4 }}\">ERROR</span>\n        {% endif %}\n\n        <a class=callback href=\"/debug/{{ task.project }}?taskid={{ task.taskid }}\" target=_blank>{{ task.project }}</a>\n        &gt;\n        <a class=url href=\"/task/{{ task.project }}:{{ task.taskid }}\" title=\"{{ task.url }}\" target=_blank>{{ task.url }}</a>\n\n        <span class=update-time>{{ task.updatetime | format_date }}</span>\n\n        {% if task.track and task.track.fetch %}\n        <span span=use-time>\n          {{- '%.1f' | 
format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms\n        </span>\n        {% endif %}\n\n        <span span=follows>\n        {% if task.track and task.track.process %}\n        +{{ task.track.process.follows | int }}\n        {% endif %}\n        </span>\n      </li>\n      {% endfor %}\n    </ol>\n  </body>\n</html>\n<!-- vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: -->\n\n"
  },
  {
    "path": "pyspider/webui/webdav.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-6-3 11:29\n\n\nimport os\nimport time\nimport base64\nimport six\nfrom six import BytesIO\nfrom wsgidav.wsgidav_app import DEFAULT_CONFIG, WsgiDAVApp\nfrom wsgidav.dav_provider import DAVProvider, DAVCollection, DAVNonCollection\nfrom wsgidav.dav_error import DAVError, HTTP_FORBIDDEN\nfrom pyspider.libs.utils import utf8, text\nfrom .app import app\n\n\ndef check_user(environ):\n    authheader = environ.get(\"HTTP_AUTHORIZATION\")\n    if not authheader:\n        return False\n    authheader = authheader[len(\"Basic \"):]\n    try:\n        username, password = text(base64.b64decode(authheader)).split(':', 1)\n    except Exception as e:\n        app.logger.error('wrong api key: %r, %r', authheader, e)\n        return False\n\n    if username == app.config['webui_username'] \\\n            and password == app.config['webui_password']:\n        return True\n    else:\n        return False\n\n\nclass ContentIO(BytesIO):\n    def close(self):\n        self.content = self.getvalue()\n        BytesIO.close(self) #old class\n\n\nclass ScriptResource(DAVNonCollection):\n    def __init__(self, path, environ, app, project=None):\n        super(ScriptResource, self).__init__(path, environ)\n\n        self.app = app\n        self.new_project = False\n        self._project = project\n        self.project_name = text(self.name)\n        self.writebuffer = None\n        if self.project_name.endswith('.py'):\n            self.project_name = self.project_name[:-len('.py')]\n\n    @property\n    def project(self):\n        if self._project:\n            return self._project\n        projectdb = self.app.config['projectdb']\n        if projectdb:\n            self._project = projectdb.get(self.project_name)\n        if not self._project:\n            if 
projectdb.verify_project_name(self.project_name) and text(self.name).endswith('.py'):\n                self.new_project = True\n                self._project = {\n                    'name': self.project_name,\n                    'script': '',\n                    'status': 'TODO',\n                    'rate': self.app.config.get('max_rate', 1),\n                    'burst': self.app.config.get('max_burst', 3),\n                    'updatetime': time.time(),\n                }\n            else:\n                raise DAVError(HTTP_FORBIDDEN)\n        return self._project\n\n    @property\n    def readonly(self):\n        projectdb = self.app.config['projectdb']\n        if not projectdb:\n            return True\n        if 'lock' in projectdb.split_group(self.project.get('group')) \\\n                and self.app.config.get('webui_username') \\\n                and self.app.config.get('webui_password'):\n            return not check_user(self.environ)\n        return False\n\n    def getContentLength(self):\n        return len(utf8(self.project['script']))\n\n    def getContentType(self):\n        return 'text/plain'\n\n    def getLastModified(self):\n        return self.project['updatetime']\n\n    def getContent(self):\n        return BytesIO(utf8(self.project['script']))\n\n    def beginWrite(self, contentType=None):\n        if self.readonly:\n            self.app.logger.error('webdav.beginWrite readonly')\n            return super(ScriptResource, self).beginWrite(contentType)\n        self.writebuffer = ContentIO()\n        return self.writebuffer\n\n    def endWrite(self, withErrors):\n        if withErrors:\n            self.app.logger.error('webdav.endWrite error: %r', withErrors)\n            return super(ScriptResource, self).endWrite(withErrors)\n        if not self.writebuffer:\n            return\n        projectdb = self.app.config['projectdb']\n        if not projectdb:\n            return\n\n        info = {\n            'script': 
text(getattr(self.writebuffer, 'content', ''))\n        }\n        if self.project.get('status') in ('DEBUG', 'RUNNING'):\n            info['status'] = 'CHECKING'\n\n        if self.new_project:\n            self.project.update(info)\n            self.new_project = False\n            return projectdb.insert(self.project_name, self.project)\n        else:\n            return projectdb.update(self.project_name, info)\n\n\nclass RootCollection(DAVCollection):\n    def __init__(self, path, environ, app):\n        super(RootCollection, self).__init__(path, environ)\n        self.app = app\n        self.projectdb = self.app.config['projectdb']\n\n    def getMemberList(self):\n        members = []\n        for project in self.projectdb.get_all():\n            project_name = project['name']\n            if not project_name.endswith('.py'):\n                project_name += '.py'\n            native_path = os.path.join(self.path, project_name)\n            native_path = text(native_path) if six.PY3 else utf8(native_path)\n            members.append(ScriptResource(\n                native_path,\n                self.environ,\n                self.app,\n                project\n            ))\n        return members\n\n    def getMemberNames(self):\n        members = []\n        for project in self.projectdb.get_all(fields=['name', ]):\n            project_name = project['name']\n            if not project_name.endswith('.py'):\n                project_name += '.py'\n            members.append(utf8(project_name))\n        return members\n\n\nclass ScriptProvider(DAVProvider):\n    def __init__(self, app):\n        super(ScriptProvider, self).__init__()\n        self.app = app\n\n    def __repr__(self):\n        return \"pyspiderScriptProvider\"\n\n    def getResourceInst(self, path, environ):\n        path = os.path.normpath(path).replace('\\\\', '/')\n        if path in ('/', '.', ''):\n            path = '/'\n            return RootCollection(path, environ, self.app)\n       
 else:\n            return ScriptResource(path, environ, self.app)\n\n\nclass NeedAuthController(object):\n    def __init__(self, app):\n        self.app = app\n\n    def getDomainRealm(self, inputRelativeURL, environ):\n        return 'need auth'\n\n    def requireAuthentication(self, realmname, environ):\n        return self.app.config.get('need_auth', False)\n\n    def isRealmUser(self, realmname, username, environ):\n        return username == self.app.config.get('webui_username')\n\n    def getRealmUserPassword(self, realmname, username, environ):\n        return self.app.config.get('webui_password')\n\n    def authDomainUser(self, realmname, username, password, environ):\n        return username == self.app.config.get('webui_username') \\\n            and password == self.app.config.get('webui_password')\n\n\nconfig = DEFAULT_CONFIG.copy()\nconfig.update({\n    'mount_path': '/dav',\n    'provider_mapping': {\n        '/': ScriptProvider(app)\n    },\n    'domaincontroller': NeedAuthController(app),\n    'verbose': 1 if app.debug else 0,\n    'dir_browser': {'davmount': False,\n                    'enable': True,\n                    'msmount': False,\n                    'response_trailer': ''},\n})\ndav_app = WsgiDAVApp(config)\n"
  },
  {
    "path": "requirements.txt",
    "content": "Flask==0.10\nJinja2==2.7\nchardet==3.0.4\ncssselect==0.9\nlxml==4.3.3\npycurl==7.43.0.3\npyquery==1.4.0\nrequests==2.24.0\ntornado==4.5.3\nmysql-connector-python==8.0.16\npika==1.1.0\npymongo==3.9.0\nFlask-Login==0.2.11\nu-msgpack-python==1.6\nclick==6.6\nSQLAlchemy==1.3.10\nsix==1.10.0\namqp==2.4.0\nredis==2.10.6\nredis-py-cluster==1.3.6\nkombu==4.4.0\npsycopg2==2.8.2\nelasticsearch==2.3.0\ntblib==1.4.0\n"
  },
  {
    "path": "run.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-11-24 23:11:49\n\nfrom pyspider.run import main\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "setup.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-11-24 22:27:45\n\n\nimport sys\nfrom setuptools import setup, find_packages\nfrom codecs import open\nfrom os import path\n\nhere = path.abspath(path.dirname(__file__))\nwith open(path.join(here, 'README.md'), encoding='utf-8') as f:\n    long_description = f.read()\n\nimport pyspider\n\ninstall_requires = [\n    'Flask==0.10',\n    'Jinja2==2.7',\n    'chardet==3.0.4',\n    'cssselect==0.9',\n    \"lxml==4.3.3\",\n    'pycurl==7.43.0.3',\n    'requests==2.24.0',\n    'Flask-Login==0.2.11',\n    'u-msgpack-python==1.6',\n    'click==3.3',\n    'six==1.10.0',\n    'tblib==1.4.0',\n    'wsgidav==2.3.0',\n    'tornado>=3.2,<=4.5.3',\n    'pyquery',\n]\n\nextras_require_all = [\n    'mysql-connector-python==8.0.16',\n    'pymongo==3.9.0',\n    'redis==2.10.6',\n    'redis-py-cluster==1.3.6',\n    'psycopg2==2.8.2',\n    'elasticsearch==2.3.0',\n    'kombu==4.4.0',\n    'amqp==2.4.0',\n    'SQLAlchemy==1.3.10',\n    'pika==1.1.0'\n]\n\nsetup(\n    name='pyspider',\n    version=pyspider.__version__,\n\n    description='A Powerful Spider System in Python',\n    long_description=long_description,\n\n    url='https://github.com/binux/pyspider',\n\n    author='Roy Binux',\n    author_email='roy@binux.me',\n\n    license='Apache License, Version 2.0',\n\n    classifiers=[\n        'Development Status :: 4 - Beta',\n        'Programming Language :: Python :: 3.5',\n        'Programming Language :: Python :: 3.6',\n        'Programming Language :: Python :: 3.7',\n\n        'License :: OSI Approved :: Apache Software License',\n\n        'Intended Audience :: Developers',\n        'Operating System :: OS Independent',\n        'Environment :: Web Environment',\n\n        'Topic :: Internet :: WWW/HTTP',\n        'Topic :: Software Development :: Libraries :: Application Frameworks',\n      
  'Topic :: Software Development :: Libraries :: Python Modules',\n    ],\n\n    keywords='scrapy crawler spider webui',\n\n    packages=find_packages(exclude=['data', 'tests*']),\n\n    install_requires=install_requires,\n\n    extras_require={\n        'all': extras_require_all,\n        'test': [\n            'coverage',\n            'Werkzeug==0.16.1',\n            'httpbin==0.7.0',\n            'pyproxy==0.1.6',\n            'easywebdav==1.2.0',\n        ]\n    },\n\n    package_data={\n        'pyspider': [\n            'logging.conf',\n            'fetcher/phantomjs_fetcher.js',\n            'fetcher/splash_fetcher.lua',\n            'webui/static/*.js',\n            'webui/static/*.css',\n            'webui/templates/*'\n        ],\n    },\n\n    entry_points={\n        'console_scripts': [\n            'pyspider=pyspider.run:main'\n        ]\n    },\n\n    test_suite='tests.all_suite',\n)\n"
  },
  {
    "path": "tests/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-09 10:53:19\n\nimport os\nimport unittest\n\nall_suite = unittest.TestLoader().discover(os.path.dirname(__file__), \"test_*.py\")\n"
  },
  {
    "path": "tests/data_fetcher_processor_handler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-18 14:12:55\n\nfrom pyspider.libs.base_handler import *\n\nclass Handler(BaseHandler):\n\n    @not_send_status\n    def not_send_status(self, response):\n        self.crawl('http://www.baidu.com/')\n        return response.text\n\n    def url_deduplicated(self, response):\n        self.crawl('http://www.baidu.com/')\n        self.crawl('http://www.google.com/')\n        self.crawl('http://www.baidu.com/')\n        self.crawl('http://www.google.com/')\n        self.crawl('http://www.google.com/')\n\n    @catch_status_code_error\n    def catch_http_error(self, response):\n        self.crawl('http://www.baidu.com/')\n        return response.status_code\n\n    def json(self, response):\n        return response.json\n\n    def html(self, response):\n        return response.doc('h1').text()\n\n    def links(self, response):\n        self.crawl([x.attr.href for x in response.doc('a').items()], callback=self.links)\n\n    def cookies(self, response):\n        return response.cookies\n\n    def get_save(self, response):\n        return response.save\n\n    def get_process_save(self, response):\n        return self.save\n\n    def set_process_save(self, response):\n        self.save['roy'] = 'binux'\n\nclass IgnoreHandler(BaseHandler):\n    pass\n\n__handler_cls__ = Handler\n"
  },
  {
    "path": "tests/data_handler.py",
    "content": "\n#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-22 14:02:21\n\nimport time\nfrom pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every\n\nclass IgnoreHandler(object):\n    pass\n\nclass TestHandler(BaseHandler):\n    retry_delay = {\n        1: 10,\n        '': -1\n    }\n\n    def hello(self):\n        return \"hello world!\"\n\n    def echo(self, response):\n        return response.content\n\n    def saved(self, response):\n        return response.save\n\n    def echo_task(self, response, task):\n        return task['project']\n\n    @catch_status_code_error\n    def catch_status_code(self, response):\n        return response.status_code\n\n    def raise_exception(self):\n        print('print')\n        logger.info(\"info\")\n        logger.warning(\"warning\")\n        logger.error(\"error\")\n        raise Exception('exception')\n\n    def add_task(self, response):\n        self.crawl('http://www.google.com', callback='echo', params={'wd': u'中文'})\n        self.send_message('some_project', {'some': 'message'})\n\n    @every\n    def on_cronjob1(self, response):\n        logger.info('on_cronjob1')\n\n    @every(seconds=10)\n    def on_cronjob2(self, response):\n        logger.info('on_cronjob2')\n\n    def generator(self, response):\n        yield \"a\"\n        yield \"b\"\n\n    def sleep(self, response):\n        time.sleep(response.save)\n\n"
  },
  {
    "path": "tests/data_sample_handler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# Created on __DATE__\n# Project: __PROJECT_NAME__\n\nfrom pyspider.libs.base_handler import *\n\n\nclass Handler(BaseHandler):\n    crawl_config = {\n    }\n\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl('http://127.0.0.1:14887/pyspider/test.html', callback=self.index_page)\n\n    @config(age=10 * 24 * 60 * 60)\n    def index_page(self, response):\n        for each in response.doc('a[href^=\"http\"]').items():\n            self.crawl(each.attr.href, callback=self.detail_page)\n\n    @config(priority=2)\n    def detail_page(self, response):\n        return {\n            \"url\": response.url,\n            \"title\": response.doc('title').text(),\n        }\n"
  },
  {
    "path": "tests/data_test_webpage.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-24 13:44:10\n\nfrom httpbin import app\n\n@app.route('/pyspider/test.html')\ndef test_page():\n    return '''\n<a href=\"/404\">404\n<a href=\"/links/10/0\">0\n<a href=\"/links/10/1\">1\n<a href=\"/links/10/2\">2\n<a href=\"/links/10/3\">3\n<a href=\"/links/10/4\">4\n<a href=\"/gzip\">gzip\n<a href=\"/get\">get\n<a href=\"/deflate\">deflate\n<a href=\"/html\">html\n<a href=\"/xml\">xml\n<a href=\"/robots.txt\">robots\n<a href=\"/cache\">cache\n<a href=\"/stream/20\">stream\n'''\n\n@app.route('/pyspider/ajax.html')\ndef test_ajax():\n    return '''\n<div class=status>loading...</div>\n<div class=ua></div>\n<div class=ip></div>\n<script>\nvar xhr = new XMLHttpRequest();\nxhr.onload = function() {\n  var data = JSON.parse(xhr.responseText);\n  document.querySelector('.status').innerHTML = 'done';\n  document.querySelector('.ua').innerHTML = data.headers['User-Agent'];\n  document.querySelector('.ip').innerHTML = data.origin;\n}\nxhr.open(\"get\", \"/get\", true);\nxhr.send();\n</script>\n'''\n\n@app.route('/pyspider/ajax_click.html')\ndef test_ajax_click():\n    return '''\n<div class=status>loading...</div>\n<div class=ua></div>\n<div class=ip></div>\n<a href=\"javascript:void(0)\" onclick=\"load()\">load</a>\n<script>\nfunction load() {\n    var xhr = new XMLHttpRequest();\n    xhr.onload = function() {\n      var data = JSON.parse(xhr.responseText);\n      document.querySelector('.status').innerHTML = 'done';\n      document.querySelector('.ua').innerHTML = data.headers['User-Agent'];\n      document.querySelector('.ip').innerHTML = data.origin;\n    }\n    xhr.open(\"get\", \"/get\", true);\n    xhr.send();\n}\n</script>\n'''\n"
  },
  {
    "path": "tests/test_base_handler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2017-02-26 10:35:23\n\nimport unittest\n\nfrom pyspider.libs.base_handler import BaseHandler\n\n\nclass TestBaseHandler(unittest.TestCase):\n    sample_task_http = {\n        'taskid': 'taskid',\n        'project': 'project',\n        'url': '',\n        'fetch': {\n            'method': 'GET',\n            'headers': {\n                'Cookie': 'a=b',\n                'a': 'b'\n            },\n            'cookies': {\n                'c': 'd',\n            },\n            'timeout': 60,\n            'save': 'abc',\n        },\n        'process': {\n            'callback': 'callback',\n            'save': [1, 2, 3],\n        },\n    }\n\n    def test_task_join_crawl_config(self):\n        task = dict(self.sample_task_http)\n        crawl_config = {\n            'taskid': 'xxxx',       # should not affect final task\n            'proxy': 'username:password@hostname:port',  # should add proxy\n            'headers': {            # should merge headers\n                'Cookie': 'abc',    # should not affect cookie\n                'c': 'd',           # should add header c\n            }\n        }\n        \n        ret = BaseHandler.task_join_crawl_config(task, crawl_config)\n        self.assertDictEqual(ret, {\n            'taskid': 'taskid',\n            'project': 'project',\n            'url': '',\n            'fetch': {\n                'method': 'GET',\n                'proxy': 'username:password@hostname:port',\n                'headers': {\n                    'Cookie': 'a=b',\n                    'a': 'b',\n                    'c': 'd'\n                },\n                'cookies': {\n                    'c': 'd',\n                },\n                'timeout': 60,\n                'save': 'abc',\n            },\n            'process': {\n                'callback': 'callback',\n                'save': [1, 2, 3],\n            },\n        });\n"
  },
  {
    "path": "tests/test_bench.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-12-10 01:34:09\n\nimport os\nimport sys\nimport time\nimport click\nimport shutil\nimport inspect\nimport unittest\n\nfrom pyspider import run\nfrom pyspider.libs import utils\n\nclass TestBench(unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/bench', ignore_errors=True)\n        os.makedirs('./data/bench')\n\n    @classmethod\n    def tearDownClass(self):\n        shutil.rmtree('./data/bench', ignore_errors=True)\n\n    def test_10_bench(self):\n        import subprocess\n        #cmd = [sys.executable]\n        cmd = ['coverage', 'run']\n        p = subprocess.Popen(cmd+[\n            inspect.getsourcefile(run),\n            '--queue-maxsize=0',\n            'bench',\n            '--total=500'\n        ], close_fds=True, stderr=subprocess.PIPE)\n\n        stdout, stderr = p.communicate()\n        stderr = utils.text(stderr)\n        print(stderr)\n\n        self.assertEqual(p.returncode, 0, stderr)\n        self.assertIn('Crawled', stderr)\n        self.assertIn('Fetched', stderr)\n        self.assertIn('Processed', stderr)\n        self.assertIn('Saved', stderr)\n"
  },
  {
    "path": "tests/test_counter.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-04-05 00:05:58\n\nimport sys\nimport time\nimport unittest\n\nfrom pyspider.libs import counter\n\nclass TestCounter(unittest.TestCase):\n    def test_010_TimebaseAverageEventCounter(self):\n        c = counter.TimebaseAverageEventCounter(2, 1)\n        for i in range(100):\n            time.sleep(0.1)\n            c.event(100+i)\n\n        self.assertEqual(c.sum, float(180+199)*20/2)\n        self.assertEqual(c.avg, float(180+199)/2)\n\n    def test_020_TotalCounter(self):\n        c = counter.TotalCounter()\n        for i in range(3):\n            c.event(i)\n        self.assertEqual(c.avg, 3)\n        self.assertEqual(c.sum, 3)\n\n    def test_030_AverageWindowCounter(self):\n        c = counter.AverageWindowCounter(10)\n        self.assertTrue(c.empty())\n\n        for i in range(20):\n            c.event(i)\n\n        self.assertFalse(c.empty())\n        self.assertEqual(c.avg, 14.5)\n        self.assertEqual(c.sum, 145)\n\n    def test_020_delete(self):\n        c = counter.CounterManager()\n        c.event(('a', 'b'), 1)\n        c.event(('a', 'c'), 1)\n        c.event(('b', 'c'), 1)\n        \n        self.assertIsNotNone(c['a'])\n        self.assertIsNotNone(c['b'])\n\n        del c['a']\n\n        self.assertNotIn('a', c)\n        self.assertIsNotNone(c['b'])\n"
  },
  {
    "path": "tests/test_database.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-08 22:37:13\n\nfrom __future__ import unicode_literals, division\n\nimport os\nimport six\nimport time\nimport unittest\n\nfrom pyspider import database\nfrom pyspider.database.base.taskdb import TaskDB\n\n\nclass TaskDBCase(object):\n    sample_task = {\n        'taskid': 'taskid',\n        'project': 'project',\n        'url': 'www.baidu.com/',\n        'status': TaskDB.FAILED,\n        'schedule': {\n            'priority': 1,\n            'retries': 3,\n            'exetime': 0,\n            'age': 3600,\n            'itag': 'itag',\n            'recrawl': 5,\n        },\n        'fetch': {\n            'method': 'GET',\n            'headers': {\n                'Cookie': 'a=b',\n            },\n            'data': 'a=b&c=d',\n            'timeout': 60,\n        },\n        'process': {\n            'callback': 'callback',\n            'save': [1, 2, 3],\n        },\n        'track': {\n            'fetch': {\n                'ok': True,\n                'time': 300,\n                'status_code': 200,\n                'headers': {\n                    'Content-Type': 'plain/html',\n                },\n                'encoding': 'utf8',\n                # 'content': 'asdfasdfasdfasdf',\n            },\n            'process': {\n                'ok': False,\n                'time': 10,\n                'follows': 3,\n                'outputs': 5,\n                'exception': u\"中文\",\n            },\n        },\n        'lastcrawltime': time.time(),\n        'updatetime': time.time(),\n    }\n\n    @classmethod\n    def setUpClass(self):\n        raise NotImplementedError\n\n    # this test not works for mongodb\n    # def test_10_create_project(self):\n        # with self.assertRaises(AssertionError):\n        # self.taskdb._create_project('abc.abc')\n        # 
self.taskdb._create_project('abc')\n        # self.taskdb._list_project()\n        # self.assertEqual(len(self.taskdb.projects), 1)\n        # self.assertIn('abc', self.taskdb.projects)\n\n    def test_20_insert(self):\n        self.taskdb.insert('project', 'taskid', self.sample_task)\n        self.taskdb.insert('project', 'taskid2', self.sample_task)\n\n    def test_25_get_task(self):\n        task = self.taskdb.get_task('project', 'taskid2')\n        self.assertIsNotNone(task)\n        self.assertEqual(task['taskid'], 'taskid2')\n        self.assertEqual(task['project'], self.sample_task['project'])\n        self.assertEqual(task['url'], self.sample_task['url'])\n        self.assertEqual(task['status'], self.taskdb.FAILED)\n        self.assertEqual(task['schedule'], self.sample_task['schedule'])\n        self.assertEqual(task['fetch'], self.sample_task['fetch'])\n        self.assertEqual(task['process'], self.sample_task['process'])\n        self.assertEqual(task['track'], self.sample_task['track'])\n\n        task = self.taskdb.get_task('project', 'taskid1', fields=['status'])\n        self.assertIsNone(task)\n\n        task = self.taskdb.get_task('project', 'taskid', fields=['taskid', 'track', ])\n        self.assertIn('track', task)\n        self.assertNotIn('project', task)\n\n    def test_30_status_count(self):\n        status = self.taskdb.status_count('abc')\n        self.assertEqual(status, {})\n        status = self.taskdb.status_count('project')\n        self.assertEqual(status, {self.taskdb.FAILED: 2})\n\n    def test_40_update_and_status_count(self):\n        self.taskdb.update('project', 'taskid', status=self.taskdb.ACTIVE)\n        status = self.taskdb.status_count('project')\n        self.assertEqual(status, {self.taskdb.ACTIVE: 1, self.taskdb.FAILED: 1})\n\n        self.taskdb.update('project', 'taskid', track={})\n        task = self.taskdb.get_task('project', 'taskid', fields=['taskid', 'track', ])\n        self.assertIn('track', task)\n        
self.assertEqual(task['track'], {})\n\n    def test_50_load_tasks(self):\n        tasks = list(self.taskdb.load_tasks(self.taskdb.ACTIVE))\n        self.assertEqual(len(tasks), 1)\n        task = tasks[0]\n        self.assertIn('taskid', task, task)\n        self.assertEqual(task['taskid'], 'taskid', task)\n        self.assertEqual(task['schedule'], self.sample_task['schedule'])\n        self.assertEqual(task['fetch'], self.sample_task['fetch'])\n        self.assertEqual(task['process'], self.sample_task['process'])\n        self.assertEqual(task['track'], {})\n\n        tasks = list(self.taskdb.load_tasks(self.taskdb.ACTIVE, project='project',\n                                            fields=['taskid']))\n        self.assertEqual(len(tasks), 1)\n        self.assertEqual(tasks[0]['taskid'], 'taskid')\n        self.assertNotIn('project', tasks[0])\n\n    def test_60_relist_projects(self):\n        if hasattr(self.taskdb, '_list_project'):\n            self.taskdb._list_project()\n            self.assertNotIn('system.indexes', self.taskdb.projects)\n\n    def test_z10_drop(self):\n        self.taskdb.insert('drop_project2', 'taskid', self.sample_task)\n        self.taskdb.insert('drop_project3', 'taskid', self.sample_task)\n        self.taskdb.drop('drop_project3')\n        self.assertIsNotNone(self.taskdb.get_task('drop_project2', 'taskid'), None)\n        self.assertIsNone(self.taskdb.get_task('drop_project3', 'taskid'), None)\n\n    def test_z20_update_projects(self):\n        saved = getattr(self.taskdb, 'UPDATE_PROJECTS_TIME', None)\n        self.taskdb.UPDATE_PROJECTS_TIME = 0.1\n        time.sleep(0.2)\n        self.assertIn('drop_project2', self.taskdb.projects)\n        self.assertNotIn('drop_project3', self.taskdb.projects)\n        self.taskdb.UPDATE_PROJECTS_TIME = saved\n\n\nclass ProjectDBCase(object):\n    sample_project = {\n        'name': 'name',\n        'script': 'import time\\nprint(time.time(), \"!@#$%^&*()\\';:<>?/|\")',\n        'status': 
'TODO',\n        'rate': 1.0,\n        'burst': 10.0,\n    }\n\n    @classmethod\n    def setUpClass(self):\n        raise NotImplemented\n\n    def test_10_insert(self):\n        self.projectdb.insert('abc', self.sample_project)\n        self.projectdb.insert(u'name中文', self.sample_project)\n        project = self.projectdb.get('abc')\n        self.assertIsNotNone(project)\n\n    def test_20_get_all(self):\n        projects = list(self.projectdb.get_all())\n        self.assertEqual(len(projects), 2)\n        for project in projects:\n            if project['name'] == 'abc':\n                break\n        for key in ('name', 'group', 'status', 'script', 'comments', 'rate', 'burst', 'updatetime'):\n            self.assertIn(key, project)\n\n        self.assertEqual(project['name'], u'abc')\n        self.assertEqual(project['status'], self.sample_project['status'])\n        self.assertEqual(project['script'], self.sample_project['script'])\n        self.assertEqual(project['rate'], self.sample_project['rate'])\n        self.assertEqual(type(project['rate']), float)\n        self.assertEqual(project['burst'], self.sample_project['burst'])\n        self.assertEqual(type(project['burst']), float)\n\n\n        projects = list(self.projectdb.get_all(fields=['name', 'script']))\n        self.assertEqual(len(projects), 2)\n        project = projects[1]\n        self.assertIn('name', project)\n        self.assertNotIn('gourp', project)\n\n    def test_30_update(self):\n        self.projectdb.update('not_found', status='RUNNING')\n        project = self.projectdb.get('not_found')\n        self.assertIsNone(project)\n\n    def test_40_check_update(self):\n        time.sleep(0.1)\n        now = time.time()\n        time.sleep(0.1)\n        self.projectdb.update('abc', status='RUNNING')\n\n        projects = list(self.projectdb.check_update(\n            now,\n            fields=['name', 'status', 'group', 'updatetime', ]\n        ))\n        self.assertEqual(len(projects), 1, 
repr(projects))\n        project = projects[0]\n        self.assertEqual(project['name'], 'abc')\n        self.assertEqual(project['status'], 'RUNNING')\n\n    def test_45_check_update_when_bootup(self):\n        projects = list(self.projectdb.check_update(0))\n        project = projects[0]\n        for key in ('name', 'group', 'status', 'script', 'comments', 'rate', 'burst', 'updatetime'):\n            self.assertIn(key, project)\n\n    def test_50_get(self):\n        project = self.projectdb.get('not_found')\n        self.assertIsNone(project)\n\n        project = self.projectdb.get('abc')\n        self.assertEqual(project['name'], 'abc')\n        self.assertEqual(project['status'], 'RUNNING')\n\n        project = self.projectdb.get(u'name中文', ['group', 'status', 'name'])\n        self.assertEqual(project['name'], u'name中文')\n        self.assertIn('status', project)\n        self.assertNotIn('gourp', project)\n\n    def test_z10_drop(self):\n        self.projectdb.insert(u'drop_project2', self.sample_project)\n        self.projectdb.insert(u'drop_project3', self.sample_project)\n        self.projectdb.drop('drop_project3')\n        self.assertIsNotNone(self.projectdb.get('drop_project2'))\n        self.assertIsNone(self.projectdb.get('drop_project3'))\n\n\nclass ResultDBCase(object):\n\n    @classmethod\n    def setUpClass(self):\n        raise NotImplemented\n\n    def test_10_save(self):\n        self.resultdb.save('test_project', 'test_taskid', 'test_url', 'result')\n        result = self.resultdb.get('test_project', 'test_taskid')\n        self.assertIsNotNone(result)\n        self.assertEqual(result['result'], 'result')\n\n        self.resultdb.save('test_project', 'test_taskid', 'test_url_updated', 'result_updated')\n        result = self.resultdb.get('test_project', 'test_taskid')\n        self.assertEqual(result['result'], 'result_updated')\n        self.assertEqual(result['url'], 'test_url_updated')\n\n    def test_20_get(self):\n        result = 
self.resultdb.get('test_project', 'not_exists')\n        self.assertIsNone(result)\n\n        result = self.resultdb.get('not_exists', 'test_taskid')\n        self.assertIsNone(result)\n\n        result = self.resultdb.get('test_project', 'test_taskid', fields=('url', ))\n        self.assertIsNotNone(result)\n        self.assertIn('url', result)\n        self.assertNotIn('result', result)\n\n        result = self.resultdb.get('test_project', 'test_taskid')\n        self.assertEqual(result['taskid'], 'test_taskid')\n        self.assertEqual(result['url'], 'test_url_updated')\n        self.assertEqual(result['result'], 'result_updated')\n        self.assertIn('updatetime', result)\n\n    def test_30_select(self):\n        for i in range(5):\n            self.resultdb.save('test_project', 'test_taskid-%d' % i,\n                               'test_url', 'result-%d' % i)\n        ret = list(self.resultdb.select('test_project'))\n        self.assertEqual(len(ret), 6)\n\n        ret = list(self.resultdb.select('test_project', limit=4))\n        self.assertEqual(len(ret), 4)\n\n        for ret in self.resultdb.select('test_project', fields=('url', ), limit=1):\n            self.assertIn('url', ret)\n            self.assertNotIn('result', ret)\n\n    def test_35_select_limit(self):\n        ret = list(self.resultdb.select('test_project', limit=None, offset=None))\n        self.assertEqual(len(ret), 6)\n\n        ret = list(self.resultdb.select('test_project', limit=None, offset=2))\n        self.assertEqual(len(ret), 4, ret)\n\n    def test_40_count(self):\n        self.assertEqual(self.resultdb.count('test_project'), 6)\n\n    def test_50_select_not_finished(self):\n        for i in self.resultdb.select('test_project'):\n            break\n        self.assertEqual(self.resultdb.count('test_project'), 6)\n\n    def test_60_relist_projects(self):\n        if hasattr(self.resultdb, '_list_project'):\n            self.resultdb._list_project()\n            
self.assertNotIn('system.indexes', self.resultdb.projects)\n\n    def test_z10_drop(self):\n        self.resultdb.save('drop_project2', 'test_taskid', 'test_url', 'result')\n        self.resultdb.save('drop_project3', 'test_taskid', 'test_url', 'result')\n        self.resultdb.drop('drop_project3')\n        self.assertIsNotNone(self.resultdb.get('drop_project2', 'test_taskid'))\n        self.assertIsNone(self.resultdb.get('drop_project3', 'test_taskid'))\n\n    def test_z20_update_projects(self):\n        saved = self.resultdb.UPDATE_PROJECTS_TIME\n        self.resultdb.UPDATE_PROJECTS_TIME = 0.1\n        time.sleep(0.2)\n        self.assertIn('drop_project2', self.resultdb.projects)\n        self.assertNotIn('drop_project3', self.resultdb.projects)\n        self.resultdb.UPDATE_PROJECTS_TIME = saved\n\n\nclass TestSqliteTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database('sqlite+taskdb://')\n        self.assertIsNotNone(self, self.taskdb)\n\n    @classmethod\n    def tearDownClass(self):\n        del self.taskdb\n\n\nclass TestSqliteProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database('sqlite+projectdb://')\n        self.assertIsNotNone(self, self.projectdb)\n\n    @classmethod\n    def tearDownClass(self):\n        del self.projectdb\n\n\nclass TestSqliteResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database('sqlite+resultdb://')\n        self.assertIsNotNone(self, self.resultdb)\n\n    @classmethod\n    def tearDownClass(self):\n        del self.resultdb\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\nclass TestMysqlTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = 
database.connect_database('mysql+taskdb://localhost/pyspider_test_taskdb')\n        self.assertIsNotNone(self, self.taskdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.taskdb._execute('DROP DATABASE pyspider_test_taskdb')\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\nclass TestMysqlProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database(\n            'mysql+projectdb://localhost/pyspider_test_projectdb'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.projectdb._execute('DROP DATABASE pyspider_test_projectdb')\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\nclass TestMysqlResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database(\n            'mysql+resultdb://localhost/pyspider_test_resultdb'\n        )\n        self.assertIsNotNone(self, self.resultdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.resultdb._execute('DROP DATABASE pyspider_test_resultdb')\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')\nclass TestMongoDBTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database(\n            'mongodb+taskdb://localhost:27017/pyspider_test_taskdb'\n        )\n        self.assertIsNotNone(self, self.taskdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.taskdb.conn.drop_database(self.taskdb.database.name)\n\n    def test_create_project(self):\n        self.assertNotIn('test_create_project', self.taskdb.projects)\n        self.taskdb._create_project('test_create_project')\n        
self.assertIn('test_create_project', self.taskdb.projects)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')\nclass TestMongoDBProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database(\n            'mongodb+projectdb://localhost/pyspider_test_projectdb'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.projectdb.conn.drop_database(self.projectdb.database.name)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')\nclass TestMongoDBResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database(\n            'mongodb+resultdb://localhost/pyspider_test_resultdb'\n        )\n        self.assertIsNotNone(self, self.resultdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.resultdb.conn.drop_database(self.resultdb.database.name)\n\n    def test_create_project(self):\n        self.assertNotIn('test_create_project', self.resultdb.projects)\n        self.resultdb._create_project('test_create_project')\n        self.assertIn('test_create_project', self.resultdb.projects)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\nclass TestSQLAlchemyMySQLTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database(\n            'sqlalchemy+mysql+mysqlconnector+taskdb://root@localhost/pyspider_test_taskdb'\n        )\n        self.assertIsNotNone(self, self.taskdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.taskdb.engine.execute('DROP DATABASE pyspider_test_taskdb')\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or 
os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\nclass TestSQLAlchemyMySQLProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database(\n            'sqlalchemy+mysql+mysqlconnector+projectdb://root@localhost/pyspider_test_projectdb'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.projectdb.engine.execute('DROP DATABASE pyspider_test_projectdb')\n\n\n@unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\nclass TestSQLAlchemyMySQLResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database(\n            'sqlalchemy+mysql+mysqlconnector+resultdb://root@localhost/pyspider_test_resultdb'\n        )\n        self.assertIsNotNone(self, self.resultdb)\n\n    @classmethod\n    def tearDownClass(self):\n        self.resultdb.engine.execute('DROP DATABASE pyspider_test_resultdb')\n\n\nclass TestSQLAlchemyTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database(\n            'sqlalchemy+sqlite+taskdb://'\n        )\n        self.assertIsNotNone(self, self.taskdb)\n\n    @classmethod\n    def tearDownClass(self):\n        del self.taskdb\n\n\nclass TestSQLAlchemyProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database(\n            'sqlalchemy+sqlite+projectdb://'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n\n    @classmethod\n    def tearDownClass(self):\n        del self.projectdb\n\n\nclass TestSQLAlchemyResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database(\n            'sqlalchemy+sqlite+resultdb://'\n 
       )\n        self.assertIsNotNone(self, self.resultdb)\n\n    @classmethod\n    def tearDownClass(self):\n        del self.resultdb\n\n\n@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')\nclass TestPGTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database(\n            'sqlalchemy+postgresql+taskdb://postgres@127.0.0.1:5432/pyspider_test_taskdb'\n        )\n        self.assertIsNotNone(self, self.taskdb)\n        self.tearDownClass()\n\n    @classmethod\n    def tearDownClass(self):\n        for project in self.taskdb.projects:\n            self.taskdb.drop(project)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')\nclass TestPGProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database(\n            'sqlalchemy+postgresql+projectdb://postgres@127.0.0.1:5432/pyspider_test_projectdb'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n        self.tearDownClass()\n\n    @classmethod\n    def tearDownClass(self):\n        for project in self.projectdb.get_all(fields=['name']):\n            self.projectdb.drop(project['name'])\n\n\n@unittest.skipIf(os.environ.get('IGNORE_POSTGRESQL') or os.environ.get('IGNORE_ALL'), 'no postgresql server for test.')\nclass TestPGResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database(\n                'sqlalchemy+postgresql+resultdb://postgres@127.0.0.1/pyspider_test_resultdb'\n        )\n        self.assertIsNotNone(self, self.resultdb)\n        self.tearDownClass()\n\n    @classmethod\n    def tearDownClass(self):\n        for project in self.resultdb.projects:\n            
self.resultdb.drop(project)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')\nclass TestRedisTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database('redis+taskdb://localhost:6379/15')\n        self.assertIsNotNone(self, self.taskdb)\n        self.taskdb.__prefix__ = 'testtaskdb_'\n\n    @classmethod\n    def tearDownClass(self):\n        for project in self.taskdb.projects:\n            self.taskdb.drop(project)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')\nclass TestESProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = database.connect_database(\n            'elasticsearch+projectdb://127.0.0.1:9200/?index=test_pyspider_projectdb'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n        assert self.projectdb.index == 'test_pyspider_projectdb'\n\n    @classmethod\n    def tearDownClass(self):\n        self.projectdb.es.indices.delete(index='test_pyspider_projectdb', ignore=[400, 404])\n\n\n@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')\nclass TestESResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.resultdb = database.connect_database(\n            'elasticsearch+resultdb://127.0.0.1:9200/?index=test_pyspider_resultdb'\n        )\n        self.assertIsNotNone(self, self.resultdb)\n        assert self.resultdb.index == 'test_pyspider_resultdb'\n\n    @classmethod\n    def tearDownClass(self):\n        self.resultdb.es.indices.delete(index='test_pyspider_resultdb', ignore=[400, 404])\n\n    def test_15_save(self):\n        self.resultdb.refresh()\n\n    def test_30_select(self):\n        for i in range(5):\n            
self.resultdb.save('test_project', 'test_taskid-%d' % i,\n                               'test_url', 'result-%d' % i)\n        self.resultdb.refresh()\n\n        ret = list(self.resultdb.select('test_project'))\n        self.assertEqual(len(ret), 6)\n\n        ret = list(self.resultdb.select('test_project', limit=4))\n        self.assertEqual(len(ret), 4)\n\n        for ret in self.resultdb.select('test_project', fields=('url', ), limit=1):\n            self.assertIn('url', ret)\n            self.assertNotIn('result', ret)\n\n    def test_35_select_limit(self):\n        pass\n\n    def test_z20_update_projects(self):\n        self.resultdb.refresh()\n        self.assertIn('drop_project2', self.resultdb.projects)\n        self.assertNotIn('drop_project3', self.resultdb.projects)\n\n@unittest.skipIf(os.environ.get('IGNORE_ELASTICSEARCH') or os.environ.get('IGNORE_ALL'), 'no elasticsearch server for test.')\nclass TestESTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.taskdb = database.connect_database(\n            'elasticsearch+taskdb://127.0.0.1:9200/?index=test_pyspider_taskdb'\n        )\n        self.assertIsNotNone(self, self.taskdb)\n        assert self.taskdb.index == 'test_pyspider_taskdb'\n\n    @classmethod\n    def tearDownClass(self):\n        self.taskdb.es.indices.delete(index='test_pyspider_taskdb', ignore=[400, 404])\n\n\n@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')\nclass TestCouchDBProjectDB(ProjectDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        # create a test admin user\n        self.projectdb = database.connect_database(\n            'couchdb+projectdb://localhost:5984/'\n        )\n        self.assertIsNotNone(self, self.projectdb)\n\n    @classmethod\n    def tearDownClass(self):\n        # remove the test admin user\n        
self.projectdb.drop_database()\n\n\n@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')\nclass TestCouchDBResultDB(ResultDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        # create a test admin user\n        self.resultdb = database.connect_database(\n            'couchdb+resultdb://localhost:5984/'\n        )\n        self.assertIsNotNone(self, self.resultdb)\n\n    @classmethod\n    def tearDownClass(self):\n        # remove the test admin user\n        self.resultdb.drop_database()\n\n    def test_create_project(self):\n        self.assertNotIn('test_create_project', self.resultdb.projects)\n        self.resultdb._create_project('test_create_project')\n        self.assertIn('test_create_project', self.resultdb.projects)\n\n\n@unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')\nclass TestCouchDBTaskDB(TaskDBCase, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        # create a test admin user\n        import requests\n        self.taskdb = database.connect_database(\n            'couchdb+taskdb://localhost:5984/'\n        )\n        self.assertIsNotNone(self, self.taskdb)\n\n    @classmethod\n    def tearDownClass(self):\n        # remove the test admin user\n        import requests\n        from requests.auth import HTTPBasicAuth\n        self.taskdb.drop_database()\n\n    def test_create_project(self):\n        self.assertNotIn('test_create_project', self.taskdb.projects)\n        self.taskdb._create_project('test_create_project')\n        self.assertIn('test_create_project', self.taskdb.projects)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "tests/test_fetcher.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-15 22:10:35\n\nimport os\nimport json\nimport copy\nimport time\nimport socket\nimport umsgpack\nimport subprocess\nimport unittest\n\nimport logging\nimport logging.config\nlogging.config.fileConfig(\"pyspider/logging.conf\")\n\ntry:\n    from six.moves import xmlrpc_client\nexcept ImportError:\n    import xmlrpclib as xmlrpc_client\nfrom pyspider.libs import utils\nfrom pyspider.libs.multiprocessing_queue import Queue\nfrom pyspider.libs.response import rebuild_response\nfrom pyspider.fetcher.tornado_fetcher import Fetcher\n\n\nclass TestFetcher(unittest.TestCase):\n    sample_task_http = {\n        'taskid': 'taskid',\n        'project': 'project',\n        'url': '',\n        'fetch': {\n            'method': 'GET',\n            'headers': {\n                'Cookie': 'a=b',\n                'a': 'b'\n            },\n            'cookies': {\n                'c': 'd',\n            },\n            'timeout': 60,\n            'save': 'abc',\n        },\n        'process': {\n            'callback': 'callback',\n            'save': [1, 2, 3],\n        },\n    }\n\n    @classmethod\n    def setUpClass(self):\n        import tests.data_test_webpage\n        import httpbin\n\n        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)\n        self.httpbin = 'http://127.0.0.1:14887'\n\n        self.inqueue = Queue(10)\n        self.outqueue = Queue(10)\n        self.fetcher = Fetcher(self.inqueue, self.outqueue)\n        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'\n        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)\n        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)\n        self.thread = utils.run_in_thread(self.fetcher.run)\n        self.proxy_thread = 
subprocess.Popen(['pyproxy', '--username=binux',\n                                              '--password=123456', '--port=14830',\n                                              '--debug'], close_fds=True)\n        self.proxy = '127.0.0.1:14830'\n        try:\n            self.phantomjs = subprocess.Popen(['phantomjs',\n                os.path.join(os.path.dirname(__file__),\n                    '../pyspider/fetcher/phantomjs_fetcher.js'),\n                '25555'])\n        except OSError:\n            self.phantomjs = None\n        time.sleep(0.5)\n\n    @classmethod\n    def tearDownClass(self):\n        self.proxy_thread.terminate()\n        self.proxy_thread.wait()\n        self.httpbin_thread.terminate()\n        self.httpbin_thread.join()\n\n        if self.phantomjs:\n            self.phantomjs.kill()\n            self.phantomjs.wait()\n        self.rpc._quit()\n        self.thread.join()\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n        assert not utils.check_port_open(14887)\n\n        time.sleep(1)\n\n    def test_10_http_get(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n        self.assertIsNotNone(response.json, response.content)\n        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)\n        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)\n        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)\n\n    def test_15_http_post(self):\n        request = 
copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/post'\n        request['fetch']['method'] = 'POST'\n        request['fetch']['data'] = 'binux'\n        request['fetch']['cookies'] = {'c': 'd'}\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n        self.assertIsNotNone(response.json, response.content)\n\n        self.assertEqual(response.json['form'].get('binux'), '')\n        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)\n        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)\n        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)\n\n    def test_20_dataurl_get(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = 'data:,hello'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200)\n        self.assertEqual(response.text, 'hello')\n\n    def test_30_with_queue(self):\n        request= copy.deepcopy(self.sample_task_http)\n        request['url'] = 'data:,hello'\n        self.inqueue.put(request)\n        task, result = self.outqueue.get()\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200)\n        self.assertEqual(response.text, 'hello')\n\n    def test_40_with_rpc(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = 'data:,hello'\n        result = umsgpack.unpackb(self.rpc.fetch(request).data)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200)\n        self.assertEqual(response.text, 'hello')\n\n    def test_50_base64_data(self):\n        request = 
copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/post'\n        request['fetch']['method'] = 'POST'\n        # utf8 encoding 中文\n        request['fetch']['data'] = \"[BASE64-DATA]5Lit5paH[/BASE64-DATA]\"\n        self.inqueue.put(request)\n        task, result = self.outqueue.get()\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, response.error)\n        self.assertIsNotNone(response.json, response.content)\n        self.assertIn(u'中文', response.json['form'], response.json)\n\n    def test_55_base64_data(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/post'\n        request['fetch']['method'] = 'POST'\n        # gbk encoding 中文\n        request['fetch']['data'] = \"[BASE64-DATA]1tDOxA==[/BASE64-DATA]\"\n        self.inqueue.put(request)\n        task, result = self.outqueue.get()\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, response.error)\n        self.assertIsNotNone(response.json, response.content)\n\n    def test_60_timeout(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/delay/5'\n        request['fetch']['timeout'] = 3\n        start_time = time.time()\n        self.inqueue.put(request)\n        task, result = self.outqueue.get()\n        end_time = time.time()\n        self.assertGreater(end_time - start_time, 1.5)\n        self.assertLess(end_time - start_time, 4.5)\n\n        response = rebuild_response(result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n\n    def test_65_418(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/status/418'\n        self.inqueue.put(request)\n        task, result = self.outqueue.get()\n        response = rebuild_response(result)\n\n        
self.assertEqual(response.status_code, 418)\n        self.assertIn('teapot', response.text)\n\n    def test_69_no_phantomjs(self):\n        phantomjs_proxy = self.fetcher.phantomjs_proxy\n        self.fetcher.phantomjs_proxy = None\n\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin + '/get'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 501, result)\n\n        self.fetcher.phantomjs_proxy = phantomjs_proxy\n\n    def test_70_phantomjs_url(self):\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin + '/get'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n        data = json.loads(response.doc('pre').text())\n        self.assertEqual(data['headers'].get('A'), 'b', response.content)\n        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)\n        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)\n\n    def test_75_phantomjs_robots(self):\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin + '/deny'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        request['fetch']['robots_txt'] = True\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        
self.assertEqual(response.status_code, 403, result)\n\n    def test_80_phantomjs_timeout(self):\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/delay/5'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        request['fetch']['timeout'] = 3\n        start_time = time.time()\n        result = self.fetcher.sync_fetch(request)\n        end_time = time.time()\n        self.assertGreater(end_time - start_time, 2)\n        self.assertLess(end_time - start_time, 5)\n        self.assertEqual(result['status_code'], 599)\n        self.assertIn('js_script_result', result)\n\n    def test_90_phantomjs_js_script(self):\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin + '/html'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        request['fetch']['js_script'] = 'function() { document.write(\"binux\") }'\n        result = self.fetcher.sync_fetch(request)\n        self.assertEqual(result['status_code'], 200)\n        self.assertIn('binux', result['content'])\n\n    def test_a100_phantomjs_sharp_url(self):\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/pyspider/ajax.html'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        request['fetch']['headers']['User-Agent'] = 'pyspider-test'\n        result = self.fetcher.sync_fetch(request)\n        self.assertEqual(result['status_code'], 200)\n        self.assertNotIn('loading', result['content'])\n        self.assertIn('done', result['content'])\n        self.assertIn('pyspider-test', result['content'])\n\n    def test_a110_dns_error(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] 
= 'http://www.not-exists-site-binux.com/'\n        result = self.fetcher.sync_fetch(request)\n        self.assertEqual(result['status_code'], 599)\n        self.assertIn('error', result)\n        self.assertIn('resolve', result['error'])\n\n        self.inqueue.put(request)\n        task, result = self.outqueue.get()\n        self.assertEqual(result['status_code'], 599)\n        self.assertIn('error', result)\n        self.assertIn('resolve', result['error'])\n\n    def test_a120_http_get_with_proxy_fail(self):\n        self.fetcher.proxy = self.proxy\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 403, result)\n        self.fetcher.proxy = None\n\n    def test_a130_http_get_with_proxy_ok(self):\n        self.fetcher.proxy = self.proxy\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get?username=binux&password=123456'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n        self.assertIsNotNone(response.json, response.content)\n        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)\n        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)\n        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)\n        self.fetcher.proxy = None\n\n    def test_a140_redirect(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/redirect-to?url=/get'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        
self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.url, self.httpbin+'/get')\n\n    def test_a150_too_much_redirect(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/redirect/10'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 599, result)\n        self.assertIn('redirects followed', response.error)\n\n    def test_a160_cookie(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.cookies, {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}, result)\n\n    def test_a170_validate_cert(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['fetch']['validate_cert'] = False\n        request['url'] = self.httpbin+'/get'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n\n    def test_a180_max_redirects(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['fetch']['max_redirects'] = 10\n        request['url'] = self.httpbin+'/redirect/10'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n\n    def test_a200_robots_txt(self):\n        request = copy.deepcopy(self.sample_task_http)\n        request['fetch']['robots_txt'] = False\n        request['url'] = self.httpbin+'/deny'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        
self.assertEqual(response.status_code, 200, result)\n\n        request['fetch']['robots_txt'] = True\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 403, result)\n\n    def test_zzzz_issue375(self):\n        phantomjs_proxy = self.fetcher.phantomjs_proxy\n        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'\n\n        if not self.phantomjs:\n            raise unittest.SkipTest('no phantomjs')\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin + '/get'\n        request['fetch']['fetch_type'] = 'phantomjs'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 599, result)\n\n        self.fetcher.phantomjs_proxy = phantomjs_proxy\n\n@unittest.skipIf(os.environ.get('IGNORE_SPLASH') or os.environ.get('IGNORE_ALL'), 'no splash server for test.')\nclass TestSplashFetcher(unittest.TestCase):\n    @property\n    def sample_task_http(self):\n        return {\n            'taskid': 'taskid',\n            'project': 'project',\n            'url': '',\n            'fetch': {\n                'method': 'GET',\n                'headers': {\n                    'Cookie': 'a=b',\n                    'a': 'b'\n                },\n                'cookies': {\n                    'c': 'd',\n                },\n                'timeout': 60,\n                'save': 'abc',\n            },\n            'process': {\n                'callback': 'callback',\n                'save': [1, 2, 3],\n            },\n        }\n\n    @classmethod\n    def setUpClass(self):\n        import tests.data_test_webpage\n        import httpbin\n\n        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)\n        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'\n\n        
self.inqueue = Queue(10)\n        self.outqueue = Queue(10)\n        self.fetcher = Fetcher(self.inqueue, self.outqueue)\n        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'\n        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)\n        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)\n        self.thread = utils.run_in_thread(self.fetcher.run)\n        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux', '--bind=0.0.0.0',\n                                              '--password=123456', '--port=14830',\n                                              '--debug'], close_fds=True)\n        self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830'\n        \n    @classmethod\n    def tearDownClass(self):\n        self.rpc(\"close\")()\n        self.proxy_thread.terminate()\n        self.proxy_thread.wait()\n        self.httpbin_thread.terminate()\n        self.httpbin_thread.join()\n\n        self.rpc._quit()\n        self.thread.join()\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n        assert not utils.check_port_open(14887)\n\n        time.sleep(1)\n\n    def test_69_no_splash(self):\n        splash_endpoint = self.fetcher.splash_endpoint\n        self.fetcher.splash_endpoint = None\n\n        request = self.sample_task_http\n        request['url'] = self.httpbin + '/get'\n        request['fetch']['fetch_type'] = 'splash'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 501, result)\n\n        self.fetcher.splash_endpoint = splash_endpoint\n\n    def test_70_splash_url(self):\n        request = self.sample_task_http\n        request['url'] = self.httpbin + '/get'\n        request['fetch']['fetch_type'] = 'splash'\n        
result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n\n        data = json.loads(response.doc('pre').text())\n        self.assertEqual(data['headers'].get('A'), 'b', response.content)\n        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)\n        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)\n\n    def test_75_splash_robots(self):\n        request = self.sample_task_http\n        request['url'] = self.httpbin + '/deny'\n        request['fetch']['fetch_type'] = 'splash'\n        request['fetch']['robots_txt'] = True\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 403, result)\n\n    def test_80_splash_timeout(self):\n        request = self.sample_task_http\n        request['url'] = self.httpbin+'/delay/5'\n        request['fetch']['fetch_type'] = 'splash'\n        request['fetch']['timeout'] = 3\n        start_time = time.time()\n        result = self.fetcher.sync_fetch(request)\n        end_time = time.time()\n        self.assertGreater(end_time - start_time, 2)\n        self.assertLess(end_time - start_time, 5)\n        self.assertEqual(result['status_code'], 599)\n        # self.assertIn('js_script_result', result) TODO: lua nil is not exists\n\n    def test_90_splash_js_script(self):\n        request = self.sample_task_http\n        request['url'] = self.httpbin + '/html'\n        request['fetch']['fetch_type'] = 'splash'\n        request['fetch']['js_script'] = 'function() { document.write(\"binux\") }'\n        result = self.fetcher.sync_fetch(request)\n        self.assertEqual(result['status_code'], 200)\n        self.assertIn('binux', result['content'])\n\n    def 
test_95_splash_js_script_2(self):\n        request = self.sample_task_http\n        request['url'] = self.httpbin + '/pyspider/ajax_click.html'\n        request['fetch']['fetch_type'] = 'splash'\n        request['fetch']['headers']['User-Agent'] = 'pyspider-test'\n        request['fetch']['js_script'] = 'function() { document.querySelector(\"a\").click(); return \"abc\" }'\n        result = self.fetcher.sync_fetch(request)\n        self.assertEqual(result['status_code'], 200)\n        self.assertNotIn('loading', result['content'])\n        self.assertIn('done', result['content'])\n        self.assertIn('pyspider-test', result['content'])\n        self.assertIn('abc', result['js_script_result'])\n\n    def test_a100_splash_sharp_url(self):\n        request = self.sample_task_http\n        request['url'] = self.httpbin+'/pyspider/ajax.html'\n        request['fetch']['fetch_type'] = 'splash'\n        request['fetch']['headers']['User-Agent'] = 'pyspider-test'\n        result = self.fetcher.sync_fetch(request)\n        self.assertEqual(result['status_code'], 200)\n        self.assertNotIn('loading', result['content'])\n        self.assertIn('done', result['content'])\n        self.assertIn('pyspider-test', result['content'])\n\n    def test_a120_http_get_with_proxy_fail_1(self):\n        self.fetcher.proxy = self.proxy\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 403, result)\n        self.fetcher.proxy = None\n\n    def test_a120_http_get_with_proxy_fail(self):\n        self.fetcher.proxy = self.proxy\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get'\n        request['fetch']['fetch_type'] = 'splash'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        
self.assertEqual(response.status_code, 403, result)\n        self.fetcher.proxy = None\n\n    def test_a130_http_get_with_proxy_ok_1(self):\n        self.fetcher.proxy = 'http://binux:123456@%s/' % self.proxy\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n        self.assertIsNotNone(response.json, response.content)\n        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)\n        self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)\n        self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)\n        self.fetcher.proxy = None\n\n    def test_a130_http_get_with_proxy_ok(self):\n        self.fetcher.proxy = 'http://binux:123456@%s/' % self.proxy\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = self.httpbin+'/get'\n        request['fetch']['fetch_type'] = 'splash'\n        result = self.fetcher.sync_fetch(request)\n        response = rebuild_response(result)\n\n        self.assertEqual(response.status_code, 200, result)\n        self.assertEqual(response.orig_url, request['url'])\n        self.assertEqual(response.save, request['fetch']['save'])\n\n        data = json.loads(response.doc('pre').text())\n        self.assertEqual(data['headers'].get('A'), 'b', response.content)\n        self.assertIn('c=d', data['headers'].get('Cookie'), response.content)\n        self.assertIn('a=b', data['headers'].get('Cookie'), response.content)\n        self.fetcher.proxy = None\n"
  },
  {
    "path": "tests/test_fetcher_processor.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-18 14:09:41\n\nimport os\nimport time\nimport httpbin\nimport subprocess\nimport unittest\n\nfrom pyspider.database.local.projectdb import ProjectDB\nfrom pyspider.fetcher import Fetcher\nfrom pyspider.processor import Processor\nfrom pyspider.libs import utils, dataurl\nfrom six.moves.queue import Queue\nfrom tests.data_fetcher_processor_handler import Handler\n\n\nclass TestFetcherProcessor(Handler, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')])\n        self.fetcher = Fetcher(None, None, async_mode=False)\n        self.status_queue = Queue()\n        self.newtask_queue = Queue()\n        self.result_queue = Queue()\n        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)\n        self.httpbin = 'http://127.0.0.1:14887'\n        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',\n                                              '--password=123456', '--port=14830',\n                                              '--debug'], close_fds=True)\n        self.proxy = '127.0.0.1:14830'\n        self.processor = Processor(projectdb=self.projectdb,\n                                   inqueue=None,\n                                   status_queue=self.status_queue,\n                                   newtask_queue=self.newtask_queue,\n                                   result_queue=self.result_queue)\n        self.project_name = 'data_fetcher_processor_handler'\n        time.sleep(0.5)\n\n    @classmethod\n    def tearDownClass(self):\n        self.proxy_thread.terminate()\n        self.proxy_thread.wait()\n        self.httpbin_thread.terminate()\n        
self.httpbin_thread.join()\n\n    @classmethod\n    def crawl(self, url=None, track=None, **kwargs):\n        if url is None and kwargs.get('callback'):\n            url = dataurl.encode(utils.text(kwargs.get('callback')))\n\n        project_data = self.processor.project_manager.get(self.project_name)\n        assert project_data, \"can't find project: %s\" % self.project_name\n        instance = project_data['instance']\n        instance._reset()\n        task = instance.crawl(url, **kwargs)\n        if isinstance(task, list):\n            task = task[0]\n        task['track'] = track\n        result = self.fetcher.fetch(task)\n        self.processor.on_task(task, result)\n\n        status = None\n        while not self.status_queue.empty():\n            status = self.status_queue.get()\n        newtasks = []\n        while not self.newtask_queue.empty():\n            newtasks = self.newtask_queue.get()\n        result = None\n        while not self.result_queue.empty():\n            _, result = self.result_queue.get()\n        return status, newtasks, result\n\n    @classmethod\n    def assertStatusOk(self, status):\n        self.assertTrue(self.status_ok(status, 'fetch'), status.get('track', {}).get('fetch'))\n        self.assertTrue(self.status_ok(status, 'process'), status.get('track', {}).get('process'))\n\n    @classmethod\n    def status_ok(self, status, type):\n        if not status:\n            return False\n        return status.get('track', {}).get(type, {}).get('ok', False)\n\n    def test_10_not_status(self):\n        status, newtasks, result = self.crawl(callback=self.not_send_status.__name__)\n\n        self.assertIsNone(status)\n        self.assertEqual(len(newtasks), 1, newtasks)\n        self.assertEqual(result, 'not_send_status')\n\n    def test_20_url_deduplicated(self):\n        status, newtasks, result = self.crawl(callback=self.url_deduplicated)\n\n        self.assertStatusOk(status)\n        
self.assertIsNone(status['track']['fetch']['error'])\n        self.assertIsNone(status['track']['fetch']['content'])\n        self.assertFalse(status['track']['fetch']['headers'])\n        self.assertFalse(status['track']['process']['logs'])\n        self.assertEqual(len(newtasks), 2, newtasks)\n        self.assertIsNone(result)\n\n    def test_30_catch_status_code_error(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/status/418', callback=self.json)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertFalse(self.status_ok(status, 'process'))\n        self.assertIn('HTTP 418', status['track']['fetch']['error'])\n        self.assertTrue(status['track']['fetch']['content'], '')\n        self.assertTrue(status['track']['fetch']['headers'])\n        self.assertTrue(status['track']['process']['logs'])\n        self.assertIn('HTTPError: HTTP 418', status['track']['process']['logs'])\n        self.assertFalse(newtasks)\n\n        status, newtasks, result = self.crawl(self.httpbin + '/status/400', callback=self.catch_http_error)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertTrue(self.status_ok(status, 'process'))\n        self.assertEqual(len(newtasks), 1, newtasks)\n        self.assertEqual(result, 400)\n\n        status, newtasks, result = self.crawl(self.httpbin + '/status/500', callback=self.catch_http_error)\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertTrue(self.status_ok(status, 'process'))\n        self.assertEqual(len(newtasks), 1, newtasks)\n        self.assertEqual(result, 500)\n\n        status, newtasks, result = self.crawl(self.httpbin + '/status/302',\n                                              allow_redirects=False,\n                                              callback=self.catch_http_error)\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertTrue(self.status_ok(status, 'process'))\n        
self.assertEqual(len(newtasks), 1, newtasks)\n        self.assertEqual(result, 302)\n\n    def test_40_method(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/delete', method='DELETE', callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n\n        status, newtasks, result = self.crawl(self.httpbin + '/get', method='DELETE', callback=self.catch_http_error)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertTrue(self.status_ok(status, 'process'))\n        self.assertTrue(newtasks)\n        self.assertEqual(result, 405)\n\n    def test_50_params(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/get', params={\n            'roy': 'binux',\n            u'中文': '.',\n        }, callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})\n\n    def test_60_data(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/post', data={\n            'roy': 'binux',\n            u'中文': '.',\n        }, callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})\n\n    def test_70_redirect(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/redirect-to?url=/get', callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertEqual(status['track']['fetch']['redirect_url'], self.httpbin + '/get')\n        self.assertFalse(newtasks)\n\n    def test_80_redirect_too_many(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/redirect/10', callback=self.json)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertFalse(self.status_ok(status, 'process'))\n        self.assertFalse(newtasks)\n        self.assertEqual(status['track']['fetch']['status_code'], 599)\n        
self.assertIn('redirects followed', status['track']['fetch']['error'])\n\n    def test_90_files(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT',\n                                              files={os.path.basename(__file__): open(__file__).read()},\n                                              callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertIn(os.path.basename(__file__), result['files'])\n\n    def test_a100_files_with_data(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/put', method='PUT',\n                                              files={os.path.basename(__file__): open(__file__).read()},\n                                              data={\n                                                  'roy': 'binux',\n                                                  # '中文': '.', # FIXME: not work\n                                              },\n                                              callback=self.json)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result['form'], {'roy': 'binux'})\n        self.assertIn(os.path.basename(__file__), result['files'])\n\n    def test_a110_headers(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              headers={\n                                                  'a': 'b',\n                                                  'C-d': 'e-F',\n                                              }, callback=self.json)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result['headers'].get('A'), 'b')\n        self.assertEqual(result['headers'].get('C-D'), 'e-F')\n\n    def test_a115_user_agent(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              user_agent='binux', 
callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result['headers'].get('User-Agent'), 'binux')\n\n    def test_a120_cookies(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              cookies={\n                                                  'a': 'b',\n                                                  'C-d': 'e-F'\n                                              }, callback=self.json)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertIn('a=b', result['headers'].get('Cookie'))\n        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))\n\n    def test_a130_cookies_with_headers(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              headers={\n                                                  'Cookie': 'g=h; I=j',\n                                              },\n                                              cookies={\n                                                  'a': 'b',\n                                                  'C-d': 'e-F'\n                                              }, callback=self.json)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertIn('g=h', result['headers'].get('Cookie'))\n        self.assertIn('I=j', result['headers'].get('Cookie'))\n        self.assertIn('a=b', result['headers'].get('Cookie'))\n        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))\n\n    def test_a140_response_cookie(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/cookies/set?k1=v1&k2=v2',\n                                              callback=self.cookies)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})\n\n    def test_a145_redirect_cookie(self):\n        
status, newtasks, result = self.crawl(self.httpbin + '/cookies/set?k1=v1&k2=v2',\n                                              callback=self.json)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result['cookies'], {'k1': 'v1', 'k2': 'v2'})\n\n    def test_a150_timeout(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/delay/2', timeout=1, callback=self.json)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertFalse(self.status_ok(status, 'process'))\n        self.assertFalse(newtasks)\n        self.assertEqual(int(status['track']['fetch']['time']), 1)\n\n    def test_a160_etag(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/cache', etag='abc', callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertFalse(result)\n\n    def test_a170_last_modified(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/cache', last_modified='0', callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertFalse(result)\n\n    def test_a180_save(self):\n        status, newtasks, result = self.crawl(callback=self.get_save,\n                                              save={'roy': 'binux', u'中文': 'value'})\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})\n\n    def test_a190_taskid(self):\n        status, newtasks, result = self.crawl(callback=self.get_save,\n                                              taskid='binux-taskid')\n\n        self.assertStatusOk(status)\n        self.assertEqual(status['taskid'], 'binux-taskid')\n        self.assertFalse(newtasks)\n        self.assertFalse(result)\n\n    def test_a200_no_proxy(self):\n        old_proxy = self.fetcher.proxy\n        self.fetcher.proxy = self.proxy\n        status, newtasks, result = 
self.crawl(self.httpbin + '/get',\n                                              params={\n                                                  'test': 'a200'\n                                              }, proxy=False, callback=self.json)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.fetcher.proxy = old_proxy\n\n    def test_a210_proxy_failed(self):\n        old_proxy = self.fetcher.proxy\n        self.fetcher.proxy = self.proxy\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              params={\n                                                  'test': 'a210'\n                                              }, callback=self.catch_http_error)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertTrue(self.status_ok(status, 'process'))\n        self.assertEqual(len(newtasks), 1, newtasks)\n        self.assertEqual(result, 403)\n        self.fetcher.proxy = old_proxy\n\n    def test_a220_proxy_ok(self):\n        old_proxy = self.fetcher.proxy\n        self.fetcher.proxy = self.proxy\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              params={\n                                                  'test': 'a220',\n                                                  'username': 'binux',\n                                                  'password': '123456',\n                                              }, callback=self.catch_http_error)\n\n        self.assertStatusOk(status)\n        self.assertEqual(result, 200)\n        self.fetcher.proxy = old_proxy\n\n    def test_a230_proxy_parameter_fail(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/get',\n                                              params={\n                                                  'test': 'a230',\n                                              }, proxy=self.proxy,\n                             
                 callback=self.catch_http_error)\n\n        self.assertFalse(self.status_ok(status, 'fetch'))\n        self.assertTrue(self.status_ok(status, 'process'))\n        self.assertEqual(result, 403)\n\n    def test_a240_proxy_parameter_ok(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/post',\n                                              method='POST',\n                                              data={\n                                                  'test': 'a240',\n                                                  'username': 'binux',\n                                                  'password': '123456',\n                                              }, proxy=self.proxy,\n                                              callback=self.catch_http_error)\n\n        self.assertStatusOk(status)\n        self.assertEqual(result, 200)\n\n    def test_a250_proxy_userpass(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/post',\n                                              method='POST',\n                                              data={\n                                                  'test': 'a250',\n                                              }, proxy='binux:123456@' + self.proxy,\n                                              callback=self.catch_http_error)\n\n        self.assertStatusOk(status)\n        self.assertEqual(result, 200)\n\n    def test_a260_process_save(self):\n        status, newtasks, result = self.crawl(callback=self.set_process_save)\n\n        self.assertStatusOk(status)\n        self.assertIn('roy', status['track']['save'])\n        self.assertEqual(status['track']['save']['roy'], 'binux')\n\n        status, newtasks, result = self.crawl(callback=self.get_process_save,\n                                              track=status['track'])\n\n        self.assertStatusOk(status)\n        self.assertIn('roy', result)\n        self.assertEqual(result['roy'], 'binux')\n\n    def 
test_zzz_links(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/links/10/0', callback=self.links)\n\n        self.assertStatusOk(status)\n        self.assertEqual(len(newtasks), 9, newtasks)\n        self.assertFalse(result)\n\n    def test_zzz_html(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/html', callback=self.html)\n\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertEqual(result, 'Herman Melville - Moby-Dick')\n\n    def test_zzz_etag_enabled(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/cache', callback=self.json)\n        self.assertStatusOk(status)\n        self.assertTrue(result)\n\n        status, newtasks, result = self.crawl(self.httpbin + '/cache',\n                                              track=status['track'], callback=self.json)\n        self.assertStatusOk(status)\n        self.assertFalse(newtasks)\n        self.assertFalse(result)\n\n    def test_zzz_etag_not_working(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/cache', callback=self.json)\n        self.assertStatusOk(status)\n        self.assertTrue(result)\n\n        status['track']['process']['ok'] = False\n        status, newtasks, result = self.crawl(self.httpbin + '/cache',\n                                              track=status['track'], callback=self.json)\n        self.assertStatusOk(status)\n        self.assertTrue(result)\n\n    def test_zzz_unexpected_crawl_argument(self):\n        with self.assertRaisesRegexp(TypeError, \"unexpected keyword argument\"):\n            self.crawl(self.httpbin + '/cache', cookie={}, callback=self.json)\n\n    def test_zzz_curl_get(self):\n        status, newtasks, result = self.crawl(\n            \"curl '\" + self.httpbin + '''/get' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Binux-Header: Binux-Value' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' --compressed''',\n            callback=self.json)\n        self.assertStatusOk(status)\n        self.assertTrue(result)\n\n        self.assertTrue(result['headers'].get('Binux-Header'), 'Binux-Value')\n\n    def test_zzz_curl_post(self):\n        status, newtasks, result = self.crawl(\n            \"curl '\" + self.httpbin + '''/post' -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'Binux-Key=%E4%B8%AD%E6%96%87+value' --compressed''',\n            callback=self.json)\n        self.assertStatusOk(status)\n        self.assertTrue(result)\n\n        self.assertTrue(result['form'].get('Binux-Key'), '中文 value')\n\n    def test_zzz_curl_put(self):\n        status, newtasks, result = self.crawl(\n            \"curl '\" + self.httpbin + '''/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryYlkgyaA7SRGOQYUG' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary 
$'------WebKitFormBoundaryYlkgyaA7SRGOQYUG\\r\\nContent-Disposition: form-data; name=\"Binux-Key\"\\r\\n\\r\\n%E4%B8%AD%E6%96%87+value\\r\\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG\\r\\nContent-Disposition: form-data; name=\"fileUpload1\"; filename=\"1\"\\r\\nContent-Type: application/octet-stream\\r\\n\\r\\n\\r\\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG--\\r\\n' --compressed''',\n            callback=self.json)\n        self.assertStatusOk(status)\n        self.assertTrue(result)\n\n        self.assertIn('fileUpload1', result['files'], result)\n\n    def test_zzz_curl_no_url(self):\n        with self.assertRaisesRegexp(TypeError, 'no URL'):\n            status, newtasks, result = self.crawl(\n                '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''',\n                callback=self.json)\n\n    def test_zzz_curl_bad_option(self):\n        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):\n            status, newtasks, result = self.crawl(\n                '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin,\n                callback=self.json)\n\n        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):\n            status, newtasks, result = self.crawl(\n                '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin,\n                callback=self.json)\n\n    def test_zzz_robots_txt(self):\n        status, newtasks, result = self.crawl(self.httpbin + '/deny', robots_txt=True, callback=self.catch_http_error)\n\n        self.assertEqual(result, 403)\n\n    def test_zzz_connect_timeout(self):\n        start_time = time.time()\n        status, newtasks, result = self.crawl('http://240.0.0.1/', connect_timeout=5, callback=self.catch_http_error)\n        end_time = time.time()\n        self.assertTrue(5 <= end_time - start_time <= 6)"
  },
  {
    "path": "tests/test_message_queue.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-10-07 10:33:38\n\nimport os\nimport six\nimport time\nimport unittest\n\nfrom pyspider.libs import utils\nfrom six.moves import queue as Queue\n\n\nclass TestMessageQueue(object):\n\n    @classmethod\n    def setUpClass(self):\n        raise NotImplementedError\n\n    def test_10_put(self):\n        self.assertEqual(self.q1.qsize(), 0)\n        self.assertEqual(self.q2.qsize(), 0)\n        self.q1.put('TEST_DATA1', timeout=3)\n        self.q1.put('TEST_DATA2_中文', timeout=3)\n        time.sleep(0.01)\n        self.assertEqual(self.q1.qsize(), 2)\n        self.assertEqual(self.q2.qsize(), 2)\n\n    def test_20_get(self):\n        self.assertEqual(self.q1.get(timeout=0.01), 'TEST_DATA1')\n        self.assertEqual(self.q2.get_nowait(), 'TEST_DATA2_中文')\n        with self.assertRaises(Queue.Empty):\n            self.q2.get(timeout=0.01)\n        with self.assertRaises(Queue.Empty):\n            self.q2.get_nowait()\n\n    def test_30_full(self):\n        self.assertEqual(self.q1.qsize(), 0)\n        self.assertEqual(self.q2.qsize(), 0)\n        for i in range(2):\n            self.q1.put_nowait('TEST_DATA%d' % i)\n        for i in range(3):\n            self.q2.put('TEST_DATA%d' % i)\n\n        with self.assertRaises(Queue.Full):\n            self.q1.put('TEST_DATA6', timeout=0.01)\n        with self.assertRaises(Queue.Full):\n            self.q1.put_nowait('TEST_DATA6')\n\n    def test_40_multiple_threading_error(self):\n        def put(q):\n            for i in range(100):\n                q.put(\"DATA_%d\" % i)\n\n        def get(q):\n            for i in range(100):\n                q.get()\n\n        t = utils.run_in_thread(put, self.q3)\n        get(self.q3)\n        t.join()\n\n\nclass BuiltinQueue(TestMessageQueue, unittest.TestCase):\n    @classmethod\n    def 
setUpClass(self):\n        from pyspider.message_queue import connect_message_queue\n        with utils.timeout(3):\n            self.q1 = self.q2 = connect_message_queue('test_queue', maxsize=5)\n            self.q3 = connect_message_queue('test_queue_for_threading_test')\n\n\n#@unittest.skipIf(six.PY3, 'pika not suport python 3')\n@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')\nclass TestPikaRabbitMQ(TestMessageQueue, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        from pyspider.message_queue import rabbitmq\n        with utils.timeout(3):\n            self.q1 = rabbitmq.PikaQueue('test_queue', maxsize=5, lazy_limit=False)\n            self.q2 = rabbitmq.PikaQueue('test_queue', amqp_url='amqp://localhost:5672/%2F', maxsize=5, lazy_limit=False)\n            self.q3 = rabbitmq.PikaQueue('test_queue_for_threading_test', amqp_url='amqp://guest:guest@localhost:5672/', lazy_limit=False)\n        self.q2.delete()\n        self.q2.reconnect()\n        self.q3.delete()\n        self.q3.reconnect()\n\n    @classmethod\n    def tearDownClass(self):\n        self.q2.delete()\n        self.q3.delete()\n        del self.q1\n        del self.q2\n        del self.q3\n\n    def test_30_full(self):\n        self.assertEqual(self.q1.qsize(), 0)\n        self.assertEqual(self.q2.qsize(), 0)\n        for i in range(2):\n            self.q1.put_nowait('TEST_DATA%d' % i)\n        for i in range(3):\n            self.q2.put('TEST_DATA%d' % i)\n\n        print(self.q1.__dict__)\n        print(self.q1.qsize())\n        with self.assertRaises(Queue.Full):\n            self.q1.put_nowait('TEST_DATA6')\n        print(self.q1.__dict__)\n        print(self.q1.qsize())\n        with self.assertRaises(Queue.Full):\n            self.q1.put('TEST_DATA6', timeout=0.01)\n\n\n@unittest.skipIf(six.PY3, 'Python 3 now using Pika')\n@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or 
os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')\nclass TestAmqpRabbitMQ(TestMessageQueue, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        from pyspider.message_queue import connect_message_queue\n        with utils.timeout(3):\n            self.q1 = connect_message_queue('test_queue', 'amqp://localhost:5672/',\n                                            maxsize=5, lazy_limit=False)\n            self.q2 = connect_message_queue('test_queue', 'amqp://localhost:5672/%2F',\n                                            maxsize=5, lazy_limit=False)\n            self.q3 = connect_message_queue('test_queue_for_threading_test',\n                                            'amqp://guest:guest@localhost:5672/', lazy_limit=False)\n        self.q2.delete()\n        self.q2.reconnect()\n        self.q3.delete()\n        self.q3.reconnect()\n\n    @classmethod\n    def tearDownClass(self):\n        self.q2.delete()\n        self.q3.delete()\n        del self.q1\n        del self.q2\n        del self.q3\n\n    def test_30_full(self):\n        self.assertEqual(self.q1.qsize(), 0)\n        self.assertEqual(self.q2.qsize(), 0)\n        for i in range(2):\n            self.q1.put_nowait('TEST_DATA%d' % i)\n        for i in range(3):\n            self.q2.put('TEST_DATA%d' % i)\n\n        print(self.q1.__dict__)\n        print(self.q1.qsize())\n        with self.assertRaises(Queue.Full):\n            self.q1.put('TEST_DATA6', timeout=0.01)\n        print(self.q1.__dict__)\n        print(self.q1.qsize())\n        with self.assertRaises(Queue.Full):\n            self.q1.put_nowait('TEST_DATA6')\n\n\n@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')\nclass TestRedisQueue(TestMessageQueue, unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        from pyspider.message_queue import connect_message_queue\n        from pyspider.message_queue import redis_queue\n        with 
utils.timeout(3):\n            self.q1 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False)\n            self.q2 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False)\n            self.q3 = connect_message_queue('test_queue_for_threading_test',\n                                            'redis://localhost:6379/')\n            while not self.q1.empty():\n                self.q1.get()\n            while not self.q2.empty():\n                self.q2.get()\n            while not self.q3.empty():\n                self.q3.get()\n\n    @classmethod\n    def tearDownClass(self):\n        while not self.q1.empty():\n            self.q1.get()\n        while not self.q2.empty():\n            self.q2.get()\n        while not self.q3.empty():\n            self.q3.get()\n\nclass TestKombuQueue(TestMessageQueue, unittest.TestCase):\n    kombu_url = 'kombu+memory://'\n\n    @classmethod\n    def setUpClass(self):\n        from pyspider.message_queue import connect_message_queue\n        with utils.timeout(3):\n            self.q1 = connect_message_queue('test_queue', self.kombu_url, maxsize=5, lazy_limit=False)\n            self.q2 = connect_message_queue('test_queue', self.kombu_url, maxsize=5, lazy_limit=False)\n            self.q3 = connect_message_queue('test_queue_for_threading_test', self.kombu_url, lazy_limit=False)\n            while not self.q1.empty():\n                self.q1.get()\n            while not self.q2.empty():\n                self.q2.get()\n            while not self.q3.empty():\n                self.q3.get()\n\n    @classmethod\n    def tearDownClass(self):\n        while not self.q1.empty():\n            self.q1.get()\n        self.q1.delete()\n        while not self.q2.empty():\n            self.q2.get()\n        self.q2.delete()\n        while not self.q3.empty():\n            self.q3.get()\n        self.q3.delete()\n\n@unittest.skip('test cannot pass, get is buffered')\n@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or 
os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')\nclass TestKombuAmpqQueue(TestKombuQueue):\n    kombu_url = 'kombu+amqp://'\n\n@unittest.skip('test cannot pass, put is buffered')\n@unittest.skipIf(os.environ.get('IGNORE_REDIS') or os.environ.get('IGNORE_ALL'), 'no redis server for test.')\nclass TestKombuRedisQueue(TestKombuQueue):\n    kombu_url = 'kombu+redis://'\n\n@unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')\nclass TestKombuMongoDBQueue(TestKombuQueue):\n    kombu_url = 'kombu+mongodb://'\n"
  },
  {
    "path": "tests/test_processor.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-22 14:00:05\n\nimport os\nimport six\nimport copy\nimport time\nimport unittest\nimport logging.config\nlogging.config.fileConfig(\"pyspider/logging.conf\")\n\nfrom pyspider.libs import utils\nfrom pyspider.processor.project_module import ProjectManager\n\n\nclass TestProjectModule(unittest.TestCase):\n\n    @property\n    def base_task(self):\n        return {\n            'taskid': 'taskid',\n            'project': 'test.project',\n            'url': 'www.baidu.com/',\n            'schedule': {\n                'priority': 1,\n                'retries': 3,\n                'exetime': 0,\n                'age': 3600,\n                'itag': 'itag',\n                'recrawl': 5,\n            },\n            'fetch': {\n                'method': 'GET',\n                'headers': {\n                    'Cookie': 'a=b',\n                },\n                'data': 'a=b&c=d',\n                'timeout': 60,\n                'save': [1, 2, 3],\n            },\n            'process': {\n                'callback': 'callback',\n            },\n        }\n\n    @property\n    def fetch_result(self):\n        return {\n            'status_code': 200,\n            'orig_url': 'www.baidu.com/',\n            'url': 'http://www.baidu.com/',\n            'headers': {\n                'cookie': 'abc',\n            },\n            'content': 'test data',\n            'cookies': {\n                'a': 'b',\n            },\n            'save': [1, 2, 3],\n        }\n\n    def setUp(self):\n        self.project = \"test.project\"\n        self.script = open(os.path.join(os.path.dirname(__file__), 'data_handler.py')).read()\n        self.env = {\n            'test': True,\n        }\n        self.project_info = {\n            'name': self.project,\n            'status': 'DEBUG',\n        }\n 
       data = ProjectManager.build_module({\n            'name': self.project,\n            'script': self.script\n        }, {'test': True})\n        self.module = data['module']\n        self.instance = data['instance']\n\n    def test_2_hello(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'hello'\n        ret = self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNone(ret.exception)\n        self.assertEqual(ret.result, \"hello world!\")\n\n    def test_3_echo(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'echo'\n        ret = self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNone(ret.exception)\n        self.assertEqual(ret.result, \"test data\")\n\n    def test_4_saved(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'saved'\n        ret = self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNone(ret.exception)\n        self.assertEqual(ret.result, base_task['fetch']['save'])\n\n    def test_5_echo_task(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'echo_task'\n        ret = self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNone(ret.exception)\n        self.assertEqual(ret.result, self.project)\n\n    def test_6_catch_status_code(self):\n        base_task = self.base_task\n        fetch_result = self.fetch_result\n        fetch_result['status_code'] = 403\n        base_task['process']['callback'] = 'catch_status_code'\n        ret = self.instance.run_task(self.module, base_task, fetch_result)\n        self.assertIsNone(ret.exception)\n        self.assertEqual(ret.result, 403)\n\n    def test_7_raise_exception(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'raise_exception'\n        ret = 
self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNotNone(ret.exception)\n        logstr = ret.logstr()\n        self.assertIn('info', logstr)\n        self.assertIn('warning', logstr)\n        self.assertIn('error', logstr)\n\n    def test_8_add_task(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'add_task'\n        ret = self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNone(ret.exception, ret.logstr())\n        self.assertEqual(len(ret.follows), 1)\n        self.assertEqual(len(ret.messages), 1)\n\n    def test_10_cronjob(self):\n        task = {\n            'taskid': '_on_cronjob',\n            'project': self.project,\n            'url': 'data:,_on_cronjob',\n            'fetch': {\n                'save': {\n                    'tick': 11,\n                },\n            },\n            'process': {\n                'callback': '_on_cronjob',\n            },\n        }\n        fetch_result = self.fetch_result\n        fetch_result['save'] = {\n            'tick': 11,\n        }\n        ret = self.instance.run_task(self.module, task, fetch_result)\n        logstr = ret.logstr()\n        self.assertNotIn('on_cronjob1', logstr)\n        self.assertNotIn('on_cronjob2', logstr)\n\n        task['fetch']['save']['tick'] = 10\n        fetch_result['save'] = task['fetch']['save']\n        ret = self.instance.run_task(self.module, task, fetch_result)\n        logstr = ret.logstr()\n        self.assertNotIn('on_cronjob1', logstr)\n        self.assertIn('on_cronjob2', logstr)\n\n        task['fetch']['save']['tick'] = 60\n        fetch_result['save'] = task['fetch']['save']\n        ret = self.instance.run_task(self.module, task, fetch_result)\n        logstr = ret.logstr()\n        self.assertIn('on_cronjob1', logstr)\n        self.assertIn('on_cronjob2', logstr)\n\n    def test_20_get_info(self):\n        task = {\n            'taskid': 
'_on_get_info',\n            'project': self.project,\n            'url': 'data:,_on_get_info',\n            'fetch': {\n                'save': ['min_tick', 'retry_delay'],\n            },\n            'process': {\n                'callback': '_on_get_info',\n            },\n        }\n        fetch_result = self.fetch_result\n        fetch_result['save'] = task['fetch']['save']\n\n        ret = self.instance.run_task(self.module, task, fetch_result)\n        self.assertEqual(len(ret.save), 2, ret.logstr())\n        for each in ret.follows:\n            self.assertEqual(each['url'], 'data:,on_get_info')\n            self.assertEqual(each['fetch']['save']['min_tick'], 10)\n            self.assertEqual(each['fetch']['save']['retry_delay'], {})\n\n    def test_30_generator(self):\n        base_task = self.base_task\n        base_task['process']['callback'] = 'generator'\n        ret = self.instance.run_task(self.module, base_task, self.fetch_result)\n        self.assertIsNone(ret.exception)\n        self.assertIn('generator object', repr(ret.result))\n\n    def test_40_sleep(self):\n        base_task = self.base_task\n        fetch_result = self.fetch_result\n        base_task['process']['callback'] = 'sleep'\n        fetch_result['save'] = 1\n\n        start_time = time.time()\n        ret = self.instance.run_task(self.module, base_task, fetch_result)\n        self.assertGreaterEqual(time.time() - start_time, 1)\n\n    def test_50_timeout(self):\n        base_task = self.base_task\n        fetch_result = self.fetch_result\n        base_task['process']['callback'] = 'sleep'\n        base_task['process']['process_time_limit'] = 0.5\n        fetch_result['save'] = 2\n\n        start_time = time.time()\n\n        ret = self.instance.run_task(self.module, base_task, fetch_result)\n        self.assertIsNotNone(ret.exception)\n        logstr = ret.logstr()\n        self.assertIn('TimeoutError: process timeout', logstr)\n\n        self.assertGreaterEqual(time.time() - 
start_time, 1)\n        self.assertLess(time.time() - start_time, 2)\n\n    def test_60_timeout_in_thread(self):\n        base_task = self.base_task\n        fetch_result = self.fetch_result\n        base_task['process']['callback'] = 'sleep'\n        base_task['process']['process_time_limit'] = 0.5\n        fetch_result['save'] = 2\n\n        start_time = time.time()\n        thread = utils.run_in_thread(lambda self=self: self.instance.run_task(self.module, base_task, fetch_result))\n        thread.join()\n        self.assertGreaterEqual(time.time() - start_time, 2)\n\n\nimport shutil\nimport inspect\nfrom pyspider.database.sqlite import projectdb\nfrom pyspider.processor.processor import Processor\nfrom pyspider.libs.multiprocessing_queue import Queue\nfrom pyspider.libs.utils import run_in_thread\nfrom pyspider.libs import sample_handler\n\n\nclass TestProcessor(unittest.TestCase):\n    projectdb_path = './data/tests/project.db'\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/tests/', ignore_errors=True)\n        os.makedirs('./data/tests/')\n\n        def get_projectdb():\n            return projectdb.ProjectDB(self.projectdb_path)\n        self.projectdb = get_projectdb()\n        self.in_queue = Queue(10)\n        self.status_queue = Queue(10)\n        self.newtask_queue = Queue(10)\n        self.result_queue = Queue(10)\n\n        def run_processor():\n            self.processor = Processor(get_projectdb(), self.in_queue,\n                                       self.status_queue, self.newtask_queue, self.result_queue)\n            self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1\n            self.processor.run()\n        self.process = run_in_thread(run_processor)\n        time.sleep(1)\n\n    @classmethod\n    def tearDownClass(self):\n        if self.process.is_alive():\n            self.processor.quit()\n            self.process.join(2)\n        assert not self.process.is_alive()\n        
shutil.rmtree('./data/tests/', ignore_errors=True)\n\n    def test_10_update_project(self):\n        self.assertIsNone(self.processor.project_manager.get('test_project'))\n        self.projectdb.insert('test_project', {\n            'name': 'test_project',\n            'group': 'group',\n            'status': 'TODO',\n            'script': inspect.getsource(sample_handler),\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n        })\n        self.assertIsNone(self.processor.project_manager.get('not_exists'))\n        self.assertIsNotNone(self.processor.project_manager.get('test_project'))\n\n        task = {\n            \"process\": {\n                \"callback\": \"on_start\"\n            },\n            \"project\": \"not_exists\",\n            \"taskid\": \"data:,on_start\",\n            \"url\": \"data:,on_start\"\n        }\n        self.in_queue.put((task, {}))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        while not self.status_queue.empty():\n            status = self.status_queue.get()\n        self.assertEqual(status['track']['process']['ok'], False)\n        self.assertIsNone(self.processor.project_manager.get('not_exists'))\n\n    def test_20_broken_project(self):\n        self.assertIsNone(self.processor.project_manager.get('test_broken_project'))\n        self.projectdb.insert('test_broken_project', {\n            'name': 'test_broken_project',\n            'group': 'group',\n            'status': 'DEBUG',\n            'script': inspect.getsource(sample_handler)[:10],\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n        })\n        self.assertIsNone(self.processor.project_manager.get('not_exists'))\n        self.assertIsNotNone(self.processor.project_manager.get('test_broken_project'))\n        project_data = self.processor.project_manager.get('test_broken_project')\n        
self.assertIsNotNone(project_data.get('exception'))\n\n    def test_30_new_task(self):\n        self.assertTrue(self.status_queue.empty())\n        self.assertTrue(self.newtask_queue.empty())\n        task = {\n            \"process\": {\n                \"callback\": \"on_start\"\n            },\n            \"project\": \"test_project\",\n            \"taskid\": \"data:,on_start\",\n            \"url\": \"data:,on_start\"\n        }\n        fetch_result = {\n            \"orig_url\": \"data:,on_start\",\n            \"content\": \"on_start\",\n            \"headers\": {},\n            \"status_code\": 200,\n            \"url\": \"data:,on_start\",\n            \"time\": 0,\n        }\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        while not self.status_queue.empty():\n            self.status_queue.get()\n        self.assertFalse(self.newtask_queue.empty())\n\n    def test_40_index_page(self):\n        task = None\n        while not self.newtask_queue.empty():\n            task = self.newtask_queue.get()[0]\n        self.assertIsNotNone(task)\n\n        fetch_result = {\n            \"orig_url\": task['url'],\n            \"content\": (\n                \"<html><body>\"\n                \"<a href='http://binux.me'>binux</a>\"\n                \"<a href='http://binux.me/中文'>binux</a>\"\n                \"<a href='http://binux.me/1'>1</a>\"\n                \"<a href='http://binux.me/1'>2</a>\"\n                \"</body></html>\"\n            ),\n            \"headers\": {'a': 'b', 'etag': 'tag'},\n            \"status_code\": 200,\n            \"url\": task['url'],\n            \"time\": 0,\n        }\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        self.assertFalse(self.newtask_queue.empty())\n\n        status = self.status_queue.get()\n        self.assertEqual(status['track']['fetch']['ok'], 
True)\n        self.assertEqual(status['track']['fetch']['time'], 0)\n        self.assertEqual(status['track']['fetch']['status_code'], 200)\n        self.assertEqual('tag', status['track']['fetch']['headers']['etag'])\n        self.assertIsNone(status['track']['fetch']['content'])\n        self.assertEqual(status['track']['process']['ok'], True)\n        self.assertGreater(status['track']['process']['time'], 0)\n        self.assertEqual(status['track']['process']['follows'], 3)\n        self.assertIsNone(status['track']['process']['result'])\n        self.assertEqual(status['track']['process']['logs'], '')\n        self.assertIsNone(status['track']['process']['exception'])\n\n        tasks = self.newtask_queue.get()\n        self.assertEqual(len(tasks), 3)\n        self.assertEqual(tasks[0]['url'], 'http://binux.me/')\n        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'), task['url'])\n\n    def test_50_fetch_error(self):\n        # clear new task queue\n        while not self.newtask_queue.empty():\n            self.newtask_queue.get()\n        # clear status queue\n        while not self.status_queue.empty():\n            self.status_queue.get()\n\n        task = {\n            \"process\": {\n                \"callback\": \"index_page\"\n            },\n            \"project\": \"test_project\",\n            \"taskid\": \"data:,test_fetch_error\",\n            \"url\": \"data:,test_fetch_error\"\n        }\n\n        fetch_result = {\n            \"orig_url\": task['url'],\n            \"content\": \"test_fetch_error\",\n            \"error\": \"test_fetch_error\",\n            \"headers\": {'a': 'b', 'last-modified': '123'},\n            \"status_code\": 598,\n            \"url\": task['url'],\n            \"time\": 0,\n        }\n\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        self.assertTrue(self.newtask_queue.empty())\n\n        status = 
self.status_queue.get()\n        self.assertEqual(status['track']['fetch']['ok'], False)\n        self.assertEqual(status['track']['fetch']['time'], 0)\n        self.assertEqual(status['track']['fetch']['status_code'], 598)\n        self.assertEqual('123', status['track']['fetch']['headers']['last-modified'])\n        self.assertIsNotNone(status['track']['fetch']['content'])\n        self.assertEqual(status['track']['process']['ok'], False)\n        self.assertGreater(status['track']['process']['time'], 0)\n        self.assertEqual(status['track']['process']['follows'], 0)\n        self.assertIsNone(status['track']['process']['result'])\n        self.assertGreater(len(status['track']['process']['logs']), 0)\n        self.assertIsNotNone(status['track']['process']['exception'])\n\n    def test_60_call_broken_project(self):\n        # clear new task queue\n        while not self.newtask_queue.empty():\n            self.newtask_queue.get()\n        # clear status queue\n        while not self.status_queue.empty():\n            self.status_queue.get()\n\n        task = {\n            \"process\": {\n                \"callback\": \"on_start\"\n            },\n            \"project\": \"test_broken_project\",\n            \"taskid\": \"data:,on_start\",\n            \"url\": \"data:,on_start\",\n        }\n        fetch_result = {\n            \"orig_url\": \"data:,on_start\",\n            \"content\": \"on_start\",\n            \"headers\": {},\n            \"status_code\": 200,\n            \"url\": \"data:,on_start\",\n            \"time\": 0,\n        }\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        while not self.status_queue.empty():\n            status = self.status_queue.get()\n        self.assertEqual(status['track']['fetch']['ok'], True)\n        self.assertEqual(status['track']['process']['ok'], False)\n        self.assertGreater(len(status['track']['process']['logs']), 0)\n 
       self.assertIsNotNone(status['track']['process']['exception'])\n        self.assertTrue(self.newtask_queue.empty())\n\n    def test_70_update_project(self):\n        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 1000000\n        self.processor.project_manager._check_projects()\n        self.assertIsNotNone(self.processor.project_manager.get('test_broken_project'))\n        # clear new task queue\n        while not self.newtask_queue.empty():\n            self.newtask_queue.get()\n        # clear status queue\n        while not self.status_queue.empty():\n            self.status_queue.get()\n\n        task = {\n            \"process\": {\n                \"callback\": \"on_start\"\n            },\n            \"project\": \"test_broken_project\",\n            \"taskid\": \"data:,on_start\",\n            \"url\": \"data:,on_start\"\n        }\n        fetch_result = {\n            \"orig_url\": \"data:,on_start\",\n            \"content\": \"on_start\",\n            \"headers\": {},\n            \"status_code\": 200,\n            \"url\": \"data:,on_start\",\n            \"time\": 0,\n        }\n\n        self.projectdb.update('test_broken_project', {\n            'script': inspect.getsource(sample_handler),\n        })\n\n        # not update\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        while not self.status_queue.empty():\n            status = self.status_queue.get()\n        self.assertEqual(status['track']['fetch']['ok'], True)\n        self.assertEqual(status['track']['process']['ok'], False)\n\n        # updated\n        task['project_updatetime'] = time.time()\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        while not self.status_queue.empty():\n            status = self.status_queue.get()\n        self.assertEqual(status['track']['fetch']['ok'], True)\n        
self.assertEqual(status['track']['process']['ok'], True)\n\n        self.projectdb.update('test_broken_project', {\n            'script': inspect.getsource(sample_handler)[:10],\n        })\n\n        # update with md5\n        task['project_md5sum'] = 'testmd5'\n        del task['project_updatetime']\n        self.in_queue.put((task, fetch_result))\n        time.sleep(1)\n        self.assertFalse(self.status_queue.empty())\n        while not self.status_queue.empty():\n            status = self.status_queue.get()\n        self.assertEqual(status['track']['fetch']['ok'], True)\n        self.assertEqual(status['track']['process']['ok'], False)\n\n        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1\n\n    def test_80_import_project(self):\n        self.projectdb.insert('test_project2', {\n            'name': 'test_project',\n            'group': 'group',\n            'status': 'TODO',\n            'script': inspect.getsource(sample_handler),\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n        })\n        self.projectdb.insert('test_project3', {\n            'name': 'test_project',\n            'group': 'group',\n            'status': 'TODO',\n            'script': inspect.getsource(sample_handler),\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n        })\n\n        from projects import test_project\n        self.assertIsNotNone(test_project)\n        self.assertIsNotNone(test_project.Handler)\n\n        from projects.test_project2 import Handler\n        self.assertIsNotNone(Handler)\n\n        import projects.test_project3\n        self.assertIsNotNone(projects.test_project3.Handler)\n"
  },
  {
    "path": "tests/test_response.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-18 11:10:27\n\n\nimport os\nimport copy\nimport time\nimport httpbin\nimport unittest\n\nimport logging\nimport logging.config\nlogging.config.fileConfig(\"pyspider/logging.conf\")\n\nfrom pyspider.libs import utils\nfrom pyspider.libs.response import rebuild_response\nfrom pyspider.fetcher.tornado_fetcher import Fetcher\n\nclass TestResponse(unittest.TestCase):\n    sample_task_http = {\n        'taskid': 'taskid',\n        'project': 'project',\n        'url': '',\n    }\n\n    @classmethod\n    def setUpClass(self):\n        self.fetcher = Fetcher(None, None, async_mode=False)\n        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)\n        self.httpbin = 'http://127.0.0.1:14887'\n        time.sleep(0.5)\n\n    @classmethod\n    def tearDownClass(self):\n        self.httpbin_thread.terminate()\n\n    def get(self, url, **kwargs):\n        if not url.startswith('http://'):\n            url = self.httpbin + url\n        request = copy.deepcopy(self.sample_task_http)\n        request['url'] = url\n        request.update(kwargs)\n        result = self.fetcher.fetch(request)\n        response = rebuild_response(result)\n        return response\n\n    def test_10_html(self):\n        response = self.get('/html')\n        self.assertEqual(response.status_code, 200)\n        self.assertIsNotNone(response.doc('h1'))\n\n    def test_20_xml(self):\n        response = self.get('/xml')\n        self.assertEqual(response.status_code, 200)\n        self.assertIsNotNone(response.doc('item'))\n\n    def test_30_gzip(self):\n        response = self.get('/gzip')\n        self.assertEqual(response.status_code, 200)\n        self.assertIn('gzipped', response.text)\n\n    def test_40_deflate(self):\n        response = 
self.get('/deflate')\n        self.assertEqual(response.status_code, 200)\n        self.assertIn('deflated', response.text)\n\n    def test_50_ok(self):\n        response = self.get('/status/200')\n        self.assertTrue(response.ok)\n        self.assertTrue(response)\n        response = self.get('/status/302')\n        self.assertTrue(response.ok)\n        self.assertTrue(response)\n        with self.assertRaises(Exception):\n            self.raise_for_status(allow_redirects=False)\n\n    def test_60_not_ok(self):\n        response = self.get('/status/400')\n        self.assertFalse(response.ok)\n        self.assertFalse(response)\n        response = self.get('/status/500')\n        self.assertFalse(response.ok)\n        self.assertFalse(response)\n        response = self.get('/status/600')\n        self.assertFalse(response.ok)\n        self.assertFalse(response)\n\n    def test_70_reraise_exception(self):\n        response = self.get('file://abc')\n        with self.assertRaisesRegex(Exception, 'HTTP 599'):\n            response.raise_for_status()\n"
  },
  {
    "path": "tests/test_result_dump.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-10-12 22:17:57\n\nfrom __future__ import unicode_literals, division\n\nimport six\nimport csv\nimport time\nimport json\nimport unittest\nfrom six import StringIO\n\nfrom pyspider.libs import result_dump\n\nresults1 = [\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),\n     'result': {'a': 1, 'b': 2} },\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),\n     'result': {'a': 1, 'b': 2, 'c': 3} },\n]\n\nresults2 = results1 + [\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),\n     'result': [1, 2, '中文', u'中文'] },\n]\n\nresults_error = results2 + [\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),\n     'result': None},\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time() },\n    {'taskid': 'taskid1', 'pdatetime': time.time() },\n]\n\nresult_list_error = [\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),\n     'result': [{\"rate\": \"8.2\", \"title\": '1'}, {\"rate\": \"8.2\", \"title\": '1'}]},\n    {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(),\n     'result': [{\"rate\": \"8.2\", \"title\": '1'}, {\"rate\": \"8.2\", \"title\": '1'}]},\n]\n\nclass TestResultDump(unittest.TestCase):\n    def test_result_formater_1(self):\n        common_fields, results = result_dump.result_formater(results1)\n        self.assertEqual(common_fields, set(('a', 'b')))\n\n    def test_result_formater_2(self):\n        common_fields, results = result_dump.result_formater(results2)\n        self.assertEqual(common_fields, set())\n\n    def test_result_formater_error(self):\n        common_fields, results = result_dump.result_formater(results_error)\n 
       self.assertEqual(common_fields, set())\n\n    def test_dump_as_json(self):\n        for i, line in enumerate((''.join(\n                result_dump.dump_as_json(results2))).splitlines()):\n            self.assertDictEqual(results2[i], json.loads(line))\n\n    def test_dump_as_json_valid(self):\n        ret = json.loads(''.join(result_dump.dump_as_json(results2, True)))\n        for i, j in zip(results2, ret):\n            self.assertDictEqual(i, j)\n\n    def test_dump_as_txt(self):\n        for i, line in enumerate((''.join(\n                result_dump.dump_as_txt(results2))).splitlines()):\n            url, json_data = line.split('\\t', 2)\n            self.assertEqual(results2[i]['result'], json.loads(json_data))\n\n    def test_dump_as_csv(self):\n        reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(results1))))\n        for row in reader:\n            self.assertEqual(len(row), 4)\n\n    def test_dump_as_csv_case_1(self):\n        reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(result_list_error))))\n        for row in reader:\n            self.assertEqual(len(row), 2)\n"
  },
  {
    "path": "tests/test_result_worker.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-11-11 20:52:53\n\nimport os\nimport time\nimport unittest\nimport logging.config\nlogging.config.fileConfig(\"pyspider/logging.conf\")\n\nimport shutil\nfrom pyspider.database.sqlite import resultdb\nfrom pyspider.result.result_worker import ResultWorker\nfrom pyspider.libs.multiprocessing_queue import Queue\nfrom pyspider.libs.utils import run_in_thread\n\n\nclass TestProcessor(unittest.TestCase):\n    resultdb_path = './data/tests/result.db'\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/tests/', ignore_errors=True)\n        os.makedirs('./data/tests/')\n\n        def get_resultdb():\n            return resultdb.ResultDB(self.resultdb_path)\n        self.resultdb = get_resultdb()\n        self.inqueue = Queue(10)\n\n        def run_result_worker():\n            self.result_worker = ResultWorker(get_resultdb(), self.inqueue)\n            self.result_worker.run()\n        self.process = run_in_thread(run_result_worker)\n        time.sleep(1)\n\n    @classmethod\n    def tearDownClass(self):\n        if self.process.is_alive():\n            self.result_worker.quit()\n            self.process.join(2)\n        assert not self.process.is_alive()\n        shutil.rmtree('./data/tests/', ignore_errors=True)\n\n    def test_10_bad_result(self):\n        self.inqueue.put(({'project': 'test_project'}, {}))\n        self.resultdb._list_project()\n        self.assertEqual(len(self.resultdb.projects), 0)\n        self.assertEqual(self.resultdb.count('test_project'), 0)\n\n    def test_10_bad_result_2(self):\n        self.inqueue.put(({'project': 'test_project'}, {'a': 'b'}))\n        self.resultdb._list_project()\n        self.assertEqual(len(self.resultdb.projects), 0)\n        self.assertEqual(self.resultdb.count('test_project'), 0)\n\n    def 
test_20_insert_result(self):\n        data = {\n            'a': 'b'\n        }\n        self.inqueue.put(({\n            'project': 'test_project',\n            'taskid': 'id1',\n            'url': 'url1'\n        }, data))\n        time.sleep(0.5)\n        self.resultdb._list_project()\n        self.assertEqual(len(self.resultdb.projects), 1)\n        self.assertEqual(self.resultdb.count('test_project'), 1)\n\n        result = self.resultdb.get('test_project', 'id1')\n        self.assertEqual(result['result'], data)\n\n    def test_30_overwrite(self):\n        self.inqueue.put(({\n            'project': 'test_project',\n            'taskid': 'id1',\n            'url': 'url1'\n        }, \"abc\"))\n        time.sleep(0.1)\n        result = self.resultdb.get('test_project', 'id1')\n        self.assertEqual(result['result'], \"abc\")\n\n    def test_40_insert_list(self):\n        self.inqueue.put(({\n            'project': 'test_project',\n            'taskid': 'id2',\n            'url': 'url1'\n        }, ['a', 'b']))\n        time.sleep(0.1)\n        result = self.resultdb.get('test_project', 'id2')\n        self.assertEqual(result['result'], ['a', 'b'])\n"
  },
  {
    "path": "tests/test_run.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-11-21 22:32:35\n\nfrom __future__ import print_function\n\nimport os\nimport sys\nimport six\nimport time\nimport json\nimport signal\nimport shutil\nimport inspect\nimport requests\nimport unittest\n\nfrom pyspider import run\nfrom pyspider.libs import utils\nfrom tests import data_sample_handler\n\nclass TestRun(unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        os.makedirs('./data/tests')\n\n        import tests.data_test_webpage\n        import httpbin\n        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)\n        self.httpbin = 'http://127.0.0.1:14887'\n\n    @classmethod\n    def tearDownClass(self):\n        self.httpbin_thread.terminate()\n        self.httpbin_thread.join()\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n        assert not utils.check_port_open(14887)\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n\n    def test_10_cli(self):\n        ctx = run.cli.make_context('test', [], None, obj=dict(testing_mode=True))\n        ctx = run.cli.invoke(ctx)\n        self.assertEqual(ctx.obj.debug, False)\n        for db in ('taskdb', 'projectdb', 'resultdb'):\n            self.assertIsNotNone(getattr(ctx.obj, db))\n        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',\n                     'fetcher2processor', 'processor2result'):\n            self.assertIsNotNone(getattr(ctx.obj, name))\n        self.assertEqual(len(ctx.obj.instances), 0)\n\n    def test_20_cli_config(self):\n        with open('./data/tests/config.json', 'w') as fp:\n            
json.dump({\n                'debug': True,\n                'taskdb': 'mysql+taskdb://localhost:23456/taskdb',\n                'amqp-url': 'amqp://guest:guest@localhost:23456/%%2F'\n            }, fp)\n        ctx = run.cli.make_context('test',\n                                   ['--config', './data/tests/config.json'],\n                                   None, obj=dict(testing_mode=True))\n        ctx = run.cli.invoke(ctx)\n        self.assertEqual(ctx.obj.debug, True)\n\n        import mysql.connector\n        with self.assertRaises(mysql.connector.Error):\n            ctx.obj.taskdb\n\n        with self.assertRaises(Exception):\n            ctx.obj.newtask_queue\n\n    def test_30_cli_command_line(self):\n        ctx = run.cli.make_context(\n            'test',\n            ['--projectdb', 'mongodb+projectdb://localhost:23456/projectdb'],\n            None,\n            obj=dict(testing_mode=True)\n        )\n        ctx = run.cli.invoke(ctx)\n\n        from pymongo.errors import ConnectionFailure\n        with self.assertRaises(ConnectionFailure):\n            ctx.obj.projectdb\n\n    def test_30a_cli_command_line(self):\n        ctx = run.cli.make_context(\n            'test',\n            ['--projectdb', 'couchdb+projectdb://localhost:5984/projectdb'],\n            None,\n            obj=dict(testing_mode=True)\n        )\n        ctx = run.cli.invoke(ctx)\n\n        with self.assertRaises(Exception):\n            # TODO: MORE SPECIFIC\n            ctx.obj.projectdb\n\n    def test_40_cli_env(self):\n        try:\n            os.environ['RESULTDB'] = 'sqlite+resultdb://'\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n\n            from pyspider.database.sqlite import resultdb\n            self.assertIsInstance(ctx.obj.resultdb, resultdb.ResultDB)\n        finally:\n            del os.environ['RESULTDB']\n\n    
@unittest.skipIf(os.environ.get('IGNORE_RABBITMQ') or os.environ.get('IGNORE_ALL'), 'no rabbitmq server for test.')\n    def test_50_docker_rabbitmq(self):\n        try:\n            os.environ['RABBITMQ_NAME'] = 'rabbitmq'\n            os.environ['RABBITMQ_PORT_5672_TCP_ADDR'] = 'localhost'\n            os.environ['RABBITMQ_PORT_5672_TCP_PORT'] = '5672'\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n            queue = ctx.obj.newtask_queue\n            queue.put('abc')\n            queue.delete()\n        except Exception as e:\n            self.assertIsNone(e)\n        finally:\n            del os.environ['RABBITMQ_NAME']\n            del os.environ['RABBITMQ_PORT_5672_TCP_ADDR']\n            del os.environ['RABBITMQ_PORT_5672_TCP_PORT']\n\n    @unittest.skipIf(os.environ.get('IGNORE_MONGODB') or os.environ.get('IGNORE_ALL'), 'no mongodb server for test.')\n    def test_60_docker_mongodb(self):\n        try:\n            os.environ['MONGODB_NAME'] = 'mongodb'\n            os.environ['MONGODB_PORT_27017_TCP_ADDR'] = 'localhost'\n            os.environ['MONGODB_PORT_27017_TCP_PORT'] = '27017'\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n            ctx.obj.resultdb\n        except Exception as e:\n            self.assertIsNone(e)\n        finally:\n            del os.environ['MONGODB_NAME']\n            del os.environ['MONGODB_PORT_27017_TCP_ADDR']\n            del os.environ['MONGODB_PORT_27017_TCP_PORT']\n\n    @unittest.skipIf(os.environ.get('IGNORE_COUCHDB') or os.environ.get('IGNORE_ALL'), 'no couchdb server for test.')\n    def test_60a_docker_couchdb(self):\n        try:\n            # create a test admin user\n            os.environ['COUCHDB_NAME'] = 'couchdb'\n            os.environ['COUCHDB_PORT_5984_TCP_ADDR'] = 
'localhost'\n            os.environ['COUCHDB_PORT_5984_TCP_PORT'] = '5984'\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n            ctx.obj.resultdb\n        except Exception as e:\n            self.assertIsNone(e)\n        finally:\n            # remove the test admin user\n            del os.environ['COUCHDB_NAME']\n            del os.environ['COUCHDB_PORT_5984_TCP_ADDR']\n            del os.environ['COUCHDB_PORT_5984_TCP_PORT']\n\n    @unittest.skip('only available in docker')\n    @unittest.skipIf(os.environ.get('IGNORE_MYSQL') or os.environ.get('IGNORE_ALL'), 'no mysql server for test.')\n    def test_70_docker_mysql(self):\n        try:\n            os.environ['MYSQL_NAME'] = 'mysql'\n            os.environ['MYSQL_PORT_3306_TCP_ADDR'] = 'localhost'\n            os.environ['MYSQL_PORT_3306_TCP_PORT'] = '3306'\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n            ctx.obj.resultdb\n        except Exception as e:\n            self.assertIsNone(e)\n        finally:\n            del os.environ['MYSQL_NAME']\n            del os.environ['MYSQL_PORT_3306_TCP_ADDR']\n            del os.environ['MYSQL_PORT_3306_TCP_PORT']\n\n    def test_80_docker_phantomjs(self):\n        try:\n            os.environ['PHANTOMJS_NAME'] = 'phantomjs'\n            os.environ['PHANTOMJS_PORT_25555_TCP'] = 'tpc://binux:25678'\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n            self.assertEqual(ctx.obj.phantomjs_proxy, 'binux:25678')\n        except Exception as e:\n            self.assertIsNone(e)\n        finally:\n            del os.environ['PHANTOMJS_NAME']\n            del os.environ['PHANTOMJS_PORT_25555_TCP']\n\n    
def test_90_docker_scheduler(self):\n        try:\n            os.environ['SCHEDULER_PORT_23333_TCP_ADDR'] = 'scheduler'\n            os.environ['SCHEDULER_PORT_23333_TCP_PORT'] = '23333'\n\n            ctx = run.cli.make_context('test', [], None,\n                                       obj=dict(testing_mode=True))\n            ctx = run.cli.invoke(ctx)\n            webui = run.cli.get_command(ctx, 'webui')\n            webui_ctx = webui.make_context('webui', [], ctx)\n            app = webui.invoke(webui_ctx)\n            rpc = app.config['scheduler_rpc']\n            self.assertEqual(rpc._ServerProxy__host, '{}:{}'.format(os.environ['SCHEDULER_PORT_23333_TCP_ADDR'],\n                                                                    os.environ['SCHEDULER_PORT_23333_TCP_PORT']))\n        except Exception as e:\n            self.assertIsNone(e)\n        finally:\n            del os.environ['SCHEDULER_PORT_23333_TCP_ADDR']\n            del os.environ['SCHEDULER_PORT_23333_TCP_PORT']\n\n    def test_a100_all(self):\n        import subprocess\n        #cmd = [sys.executable]\n        cmd = ['coverage', 'run']\n        p = subprocess.Popen(cmd+[\n            inspect.getsourcefile(run),\n            '--taskdb', 'sqlite+taskdb:///data/tests/all_test_task.db',\n            '--resultdb', 'sqlite+resultdb:///data/tests/all_test_result.db',\n            '--projectdb', 'local+projectdb://'+inspect.getsourcefile(data_sample_handler),\n            'all',\n        ], close_fds=True, preexec_fn=os.setsid)\n\n        try:\n            limit = 30\n            while limit >= 0:\n                time.sleep(3)\n                # click run\n                try:\n                    requests.post('http://localhost:5000/run', data={\n                        'project': 'data_sample_handler',\n                    })\n                except requests.exceptions.ConnectionError:\n                    limit -= 1\n                    continue\n                break\n\n            limit = 30\n   
         data = requests.get('http://localhost:5000/counter')\n            self.assertEqual(data.status_code, 200)\n            while data.json().get('data_sample_handler', {}).get('5m', {}).get('success', 0) < 5:\n                time.sleep(1)\n                data = requests.get('http://localhost:5000/counter')\n                limit -= 1\n                if limit <= 0:\n                    break\n\n            self.assertGreater(limit, 0)\n            rv = requests.get('http://localhost:5000/results?project=data_sample_handler')\n            self.assertIn('<th>url</th>', rv.text)\n            self.assertIn('class=url', rv.text)\n        except:\n            raise\n        finally:\n            time.sleep(1)\n            os.killpg(p.pid, signal.SIGTERM)\n            p.wait()\n\n    def test_a110_one(self):\n        pid, fd = os.forkpty()\n        #cmd = [sys.executable]\n        cmd = ['coverage', 'run']\n        cmd += [\n            inspect.getsourcefile(run),\n            'one',\n            '-i',\n            inspect.getsourcefile(data_sample_handler)\n        ]\n\n        if pid == 0:\n            # child\n            os.execvp(cmd[0], cmd)\n        else:\n            # parent\n            def wait_text(timeout=1):\n                import select\n                text = []\n                while True:\n                    rl, wl, xl = select.select([fd], [], [], timeout)\n                    if not rl:\n                        break\n                    try:\n                        t = os.read(fd, 1024)\n                    except OSError:\n                        break\n                    if not t:\n                        break\n                    t = utils.text(t)\n                    text.append(t)\n                    print(t, end='')\n                return ''.join(text)\n\n            text = wait_text(3)\n            self.assertIn('new task data_sample_handler:on_start', text)\n            self.assertIn('pyspider shell', text)\n\n            
os.write(fd, utils.utf8('run()\\n'))\n            text = wait_text()\n            self.assertIn('task done data_sample_handler:on_start', text)\n\n            os.write(fd, utils.utf8('crawl(\"%s/pyspider/test.html\")\\n' % self.httpbin))\n            text = wait_text()\n            self.assertIn('/robots.txt', text)\n\n            os.write(fd, utils.utf8('crawl(\"%s/links/10/0\")\\n' % self.httpbin))\n            text = wait_text()\n            if '\"title\": \"Links\"' not in text:\n                os.write(fd, utils.utf8('crawl(\"%s/links/10/1\")\\n' % self.httpbin))\n                text = wait_text()\n                self.assertIn('\"title\": \"Links\"', text)\n\n            os.write(fd, utils.utf8('crawl(\"%s/404\")\\n' % self.httpbin))\n            text = wait_text()\n            self.assertIn('task retry', text)\n\n            os.write(fd, b'quit_pyspider()\\n')\n            text = wait_text()\n            self.assertIn('scheduler exiting...', text)\n            os.close(fd)\n            os.kill(pid, signal.SIGINT)\n\nclass TestSendMessage(unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        os.makedirs('./data/tests')\n\n        ctx = run.cli.make_context('test', [\n            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',\n            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',\n            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',\n        ], None, obj=dict(testing_mode=True))\n        self.ctx = run.cli.invoke(ctx)\n\n        ctx = run.scheduler.make_context('scheduler', [], self.ctx)\n        scheduler = run.scheduler.invoke(ctx)\n        self.xmlrpc_thread = utils.run_in_thread(scheduler.xmlrpc_run)\n        self.scheduler_thread = utils.run_in_thread(scheduler.run)\n\n        time.sleep(1)\n\n    @classmethod\n    def tearDownClass(self):\n        for each in self.ctx.obj.instances:\n            each.quit()\n        
self.xmlrpc_thread.join()\n        self.scheduler_thread.join()\n        time.sleep(1)\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n\n    def test_10_send_message(self):\n        ctx = run.send_message.make_context('send_message', [\n            'test_project', 'test_message'\n        ], self.ctx)\n        self.assertTrue(run.send_message.invoke(ctx))\n        while True:\n            task = self.ctx.obj.scheduler2fetcher.get(timeout=1)\n            if task['url'] == 'data:,on_message':\n                break\n        self.assertEqual(task['process']['callback'], '_on_message')\n\n"
  },
  {
    "path": "tests/test_scheduler.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<i@binux.me>\n#         http://binux.me\n# Created on 2014-02-08 22:37:13\n\nimport os\nimport time\nimport shutil\nimport unittest\nimport logging\nimport logging.config\nlogging.config.fileConfig(\"pyspider/logging.conf\")\n\nfrom pyspider.scheduler.task_queue import TaskQueue\nfrom pyspider.libs import utils\n\n\nclass TestTaskQueue(unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        self.task_queue = TaskQueue()\n        self.task_queue.rate = 100000\n        self.task_queue.burst = 100000\n        self.task_queue.processing_timeout = 0.5\n\n    def test_10_put(self):\n        self.task_queue.put('a3', 0, time.time() + 0.5)\n        self.task_queue.put('a4', 3, time.time() + 0.2)\n        self.task_queue.put('a2', 0)\n        self.task_queue.put('a1', 1)\n        self.assertEqual(self.task_queue.size(), 4)\n\n    def test_20_update(self):\n        self.task_queue.put('a2', 4)\n        self.assertEqual(self.task_queue.size(), 4)\n        self.task_queue.put('a3', 2, 0)\n        self.assertEqual(self.task_queue.size(), 4)\n\n    def test_30_get_from_priority_queue(self):\n        self.assertEqual(self.task_queue.get(), 'a2')\n        self.assertEqual(self.task_queue.size(), 4)\n\n    def test_40_time_queue_1(self):\n        self.task_queue.check_update()\n        self.assertEqual(self.task_queue.get(), 'a3')\n        self.assertEqual(self.task_queue.size(), 4)\n\n    def test_50_time_queue_2(self):\n        time.sleep(0.3)\n        self.task_queue.check_update()\n        self.assertEqual(self.task_queue.get(), 'a4')\n        self.assertEqual(self.task_queue.get(), 'a1')\n        self.assertEqual(self.task_queue.size(), 4)\n\n    def test_60_processing_queue(self):\n        time.sleep(0.5)\n        self.task_queue.check_update()\n        self.assertEqual(self.task_queue.get(), 'a2')\n        
self.assertEqual(len(self.task_queue), 4)\n        self.assertEqual(self.task_queue.get(), 'a4')\n        self.assertEqual(self.task_queue.get(), 'a3')\n        self.assertEqual(self.task_queue.get(), 'a1')\n        self.assertEqual(len(self.task_queue), 4)\n\n    def test_70_done(self):\n        self.assertTrue(self.task_queue.done('a2'))\n        self.assertTrue(self.task_queue.done('a1'))\n        self.assertEqual(len(self.task_queue), 2)\n        self.assertTrue(self.task_queue.done('a4'))\n        self.assertTrue(self.task_queue.done('a3'))\n        self.assertEqual(len(self.task_queue), 0)\n\n\nfrom pyspider.scheduler.token_bucket import Bucket\n\n\nclass TestBucket(unittest.TestCase):\n\n    def test_bucket(self):\n        bucket = Bucket(100, 1000)\n        self.assertEqual(bucket.get(), 1000)\n        time.sleep(0.1)\n        self.assertEqual(bucket.get(), 1000)\n        bucket.desc(100)\n        self.assertEqual(bucket.get(), 900)\n        time.sleep(0.1)\n        self.assertAlmostEqual(bucket.get(), 910, delta=2)\n        time.sleep(0.1)\n        self.assertAlmostEqual(bucket.get(), 920, delta=2)\n\n\ntry:\n    from six.moves import xmlrpc_client\nexcept ImportError:\n    import xmlrpclib as xmlrpc_client\nfrom pyspider.scheduler.scheduler import Scheduler\nfrom pyspider.database.sqlite import taskdb, projectdb, resultdb\nfrom pyspider.libs.multiprocessing_queue import Queue\nfrom pyspider.libs.utils import run_in_thread\n\n\nclass TestScheduler(unittest.TestCase):\n    taskdb_path = './data/tests/task.db'\n    projectdb_path = './data/tests/project.db'\n    resultdb_path = './data/tests/result.db'\n    check_project_time = 1\n    scheduler_xmlrpc_port = 23333\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        os.makedirs('./data/tests')\n\n        def get_taskdb():\n            return taskdb.TaskDB(self.taskdb_path)\n        self.taskdb = get_taskdb()\n\n        def get_projectdb():\n       
     return projectdb.ProjectDB(self.projectdb_path)\n        self.projectdb = get_projectdb()\n\n        def get_resultdb():\n            return resultdb.ResultDB(self.resultdb_path)\n        self.resultdb = get_resultdb()\n\n        self.newtask_queue = Queue(10)\n        self.status_queue = Queue(10)\n        self.scheduler2fetcher = Queue(10)\n        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % self.scheduler_xmlrpc_port)\n\n        def run_scheduler():\n            scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(),\n                                  newtask_queue=self.newtask_queue, status_queue=self.status_queue,\n                                  out_queue=self.scheduler2fetcher, data_path=\"./data/tests/\",\n                                  resultdb=get_resultdb())\n            scheduler.UPDATE_PROJECT_INTERVAL = 0.1\n            scheduler.LOOP_INTERVAL = 0.1\n            scheduler.INQUEUE_LIMIT = 10\n            scheduler.DELETE_TIME = 0\n            scheduler.DEFAULT_RETRY_DELAY = {'': 5}\n            scheduler._last_tick = int(time.time())  # not dispatch cronjob\n            self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)\n            scheduler.run()\n\n        self.process = run_in_thread(run_scheduler)\n        time.sleep(1)\n\n    @classmethod\n    def tearDownClass(self):\n        if self.process.is_alive():\n            self.rpc._quit()\n            self.process.join(5)\n        self.xmlrpc_thread.join()\n        assert not self.process.is_alive()\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        time.sleep(1)\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(self.scheduler_xmlrpc_port)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n\n    def test_10_new_task_ignore(self):\n        '''\n        task_queue = [ ]\n        '''\n        self.newtask_queue.put({\n   
         'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url'\n        })  # unknown project: test_project\n        self.assertEqual(self.rpc.size(), 0)\n        self.assertEqual(len(self.rpc.get_active_tasks()), 0)\n\n    def test_20_new_project(self):\n        '''\n        task_queue = [ ]\n        '''\n        self.projectdb.insert('test_project', {\n            'name': 'test_project',\n            'group': 'group',\n            'status': 'TODO',\n            'script': 'import time\\nprint(time.time())',\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n        })\n\n    def test_30_update_project(self):\n        '''\n        task_queue = [ ]\n        '''\n        from six.moves import queue as Queue\n        with self.assertRaises(Queue.Empty):\n            task = self.scheduler2fetcher.get(timeout=1)\n        self.projectdb.update('test_project', status=\"DEBUG\")\n        time.sleep(0.1)\n        self.rpc.update_project()\n\n        task = self.scheduler2fetcher.get(timeout=10)\n        self.assertIsNotNone(task)\n        self.assertEqual(task['taskid'], '_on_get_info')  # select test_project:_on_get_info data:,_on_get_info\n\n    def test_32_get_info(self):\n        self.status_queue.put({\n            'taskid': '_on_get_info',\n            'project': 'test_project',\n            'track': {\n                'save': {\n                    }\n                }\n            })\n        # test_project on_get_info {}\n\n    def test_34_new_not_used_project(self):\n        '''\n        task_queue = []\n        '''\n        self.projectdb.insert('test_project_not_started', {\n            'name': 'test_project_not_started',\n            'group': 'group',\n            'status': 'RUNNING',\n            'script': 'import time\\nprint(time.time())',\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n        })\n        task = 
self.scheduler2fetcher.get(timeout=5)  # select test_project_not_started:_on_get_info data:,_on_get_info\n        self.assertEqual(task['taskid'], '_on_get_info')\n\n    def test_35_new_task(self):\n        '''\n        task_queue = [ ]\n        '''\n        time.sleep(0.2)\n        self.newtask_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'age': 0,\n            },\n        })  # new task test_project:taskid url\n        # task_queue = [ test_project:taskid ]\n\n        time.sleep(0.5)\n        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid\n        self.assertGreater(len(self.rpc.get_active_tasks()), 0)\n        self.assertIsNotNone(task)\n        self.assertEqual(task['taskid'], 'taskid')\n        self.assertEqual(task['project'], 'test_project')\n        self.assertIn('schedule', task)\n        self.assertIn('fetch', task)\n        self.assertIn('process', task)\n        self.assertIn('track', task)\n        self.assertEqual(task['fetch']['data'], 'abc')\n\n    def test_37_force_update_processing_task(self):\n        '''\n        processing = [ test_project:taskid ]\n        '''\n        self.newtask_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url_force_update',\n            'schedule': {\n                'age': 10,\n                'force_update': True,\n            },\n        })  # restart task test_project:taskid url_force_update\n        time.sleep(0.2)\n        # it should not block next\n\n    def test_40_taskdone_error_no_project(self):\n        '''\n        processing = [ test_project:taskid ]\n        '''\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'no_project',\n   
         'url': 'url'\n        })  # unknown project: no_project\n        time.sleep(0.1)\n        self.assertEqual(self.rpc.size(), 1)\n\n    def test_50_taskdone_error_no_track(self):\n        '''\n        processing = [ test_project:taskid ]\n        '''\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url'\n        })  # Bad status pack: 'track'\n        time.sleep(0.1)\n        self.assertEqual(self.rpc.size(), 1)\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'track': {}\n        })  # Bad status pack: 'process'\n        time.sleep(0.1)\n        self.assertEqual(self.rpc.size(), 1)\n\n    def test_60_taskdone_failed_retry(self):\n        '''\n        processing = [ test_project:taskid ]\n        '''\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'track': {\n                'fetch': {\n                    'ok': True\n                },\n                'process': {\n                    'ok': False\n                },\n            }\n        })  # task retry 0/3 test_project:taskid url\n        from six.moves import queue as Queue\n        # with self.assertRaises(Queue.Empty):\n            # task = self.scheduler2fetcher.get(timeout=4)\n        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url\n        self.assertIsNotNone(task)\n\n    def test_70_taskdone_ok(self):\n        '''\n        processing = [ test_project:taskid ]\n        '''\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'track': {\n                'fetch': {\n                    'ok': True\n                },\n                'process': {\n                    'ok': True\n                },\n           
 }\n        })  # task done test_project:taskid url\n        time.sleep(0.2)\n        self.assertEqual(self.rpc.size(), 0)\n\n    def test_75_on_finished_msg(self):\n        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:on_finished data:,on_finished\n\n        self.assertEqual(task['taskid'], 'on_finished')\n\n        self.status_queue.put({\n            'taskid': 'on_finished',\n            'project': 'test_project',\n            'url': 'url',\n            'track': {\n                'fetch': {\n                    'ok': True\n                },\n                'process': {\n                    'ok': True\n                },\n            }\n        })  # task done test_project:on_finished url\n        time.sleep(0.2)\n        self.assertEqual(self.rpc.size(), 0)\n\n    def test_80_newtask_age_ignore(self):\n        '''\n        processing = [ ]\n        '''\n        self.newtask_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'age': 30,\n            },\n        })\n        time.sleep(0.1)\n        self.assertEqual(self.rpc.size(), 0)\n\n    def test_82_newtask_via_rpc(self):\n        '''\n        processing = [ ]\n        '''\n        self.rpc.newtask({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'age': 30,\n            },\n        })\n        time.sleep(0.1)\n        self.assertEqual(self.rpc.size(), 0)\n\n    def test_90_newtask_with_itag(self):\n        '''\n        task_queue = [ ]\n        processing = [ ]\n        '''\n        time.sleep(0.1)\n  
      self.newtask_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'itag': \"abc\",\n                'retries': 1\n            },\n        })  # restart task test_project:taskid url\n\n        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url\n        self.assertIsNotNone(task)\n        self.assertEqual(task['taskid'], 'taskid')\n\n        self.test_70_taskdone_ok()  # task done test_project:taskid url\n        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished\n\n    def test_a10_newtask_restart_by_age(self):\n        self.newtask_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'age': 0,\n                'retries': 1\n            },\n        })  # restart task test_project:taskid url\n        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url\n        self.assertIsNotNone(task)\n        self.assertEqual(task['taskid'], 'taskid')\n\n    def test_a20_failed_retry(self):\n        '''\n        processing: [ test_project:taskid ]\n        '''\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'track': {\n                'fetch': {\n                    'ok': True\n                },\n                'process': {\n                    'ok': False\n                },\n            }\n        })  # task retry 0/1 test_project:taskid url\n        task = self.scheduler2fetcher.get(timeout=5)  # 
select test_project:taskid url\n        self.assertIsNotNone(task)\n        self.assertEqual(task['taskid'], 'taskid')\n\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'track': {\n                'fetch': {\n                    'ok': False\n                },\n                'process': {\n                    'ok': False\n                },\n            }\n        })  # task failed test_project:taskid url\n\n        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished\n\n        from six.moves import queue as Queue\n        with self.assertRaises(Queue.Empty):\n            self.scheduler2fetcher.get(timeout=5)\n\n    def test_a30_task_verify(self):\n        self.assertFalse(self.rpc.newtask({\n            #'taskid': 'taskid#',\n            'project': 'test_project',\n            'url': 'url',\n        }))  # taskid not in task: {'project': 'test_project', 'url': 'url'}\n        self.assertFalse(self.rpc.newtask({\n            'taskid': 'taskid#',\n            #'project': 'test_project',\n            'url': 'url',\n        }))  # project not in task: {'url': 'url', 'taskid': 'taskid#'}\n        self.assertFalse(self.rpc.newtask({\n            'taskid': 'taskid#',\n            'project': 'test_project',\n            #'url': 'url',\n        }))  # url not in task: {'project': 'test_project', 'taskid': 'taskid#'}\n        self.assertFalse(self.rpc.newtask({\n            'taskid': 'taskid#',\n            'project': 'not_exist_project',\n            'url': 'url',\n        }))  # unknown project: not_exist_project\n        self.assertTrue(self.rpc.newtask({\n            'taskid': 'taskid#',\n            'project': 'test_project',\n            'url': 'url',\n        }))  # new task test_project:taskid# url\n\n    def test_a40_success_recrawl(self):\n        '''\n        task_queue = [ test_project:taskid# ]\n        '''\n        
self.newtask_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'age': 0,\n                'retries': 1,\n                'auto_recrawl': True,\n            },\n        })  # restart task test_project:taskid url\n        task1 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid# url\n        task2 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url\n        self.assertIsNotNone(task1)\n        self.assertIsNotNone(task2)\n        self.assertTrue(task1['taskid'] == 'taskid#' or task2['taskid'] == 'taskid#')\n\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'schedule': {\n                'age': 0,\n                'retries': 1,\n                'auto_recrawl': True,\n            },\n            'track': {\n                'fetch': {\n                    'ok': True\n                },\n                'process': {\n                    'ok': True\n                },\n            }\n        })  # task done test_project:taskid url\n        task = self.scheduler2fetcher.get(timeout=10)\n        self.assertIsNotNone(task)\n\n    def test_a50_failed_recrawl(self):\n        '''\n        time_queue = [ test_project:taskid ]\n        scheduler2fetcher = [ test_project:taskid# ]\n        processing = [ test_project:taskid# ]\n        '''\n        for i in range(3):\n            self.status_queue.put({\n                'taskid': 'taskid',\n                'project': 'test_project',\n                'url': 'url',\n                'schedule': {\n                    'age': 0,\n                    'retries': 1,\n                    'auto_recrawl': True,\n                },\n                'track': 
{\n                    'fetch': {\n                        'ok': True\n                    },\n                    'process': {\n                        'ok': False\n                    },\n                }\n            })\n            # not processing pack: test_project:taskid url\n            # select test_project:taskid url\n            # task retry 0/1 test_project:taskid url\n            # select test_project:taskid url\n            # task retry 0/1 test_project:taskid url\n            # select test_project:taskid url\n            task = self.scheduler2fetcher.get(timeout=10)\n            self.assertIsNotNone(task)\n            self.assertEqual(task['taskid'], 'taskid')\n\n    def test_a60_disable_recrawl(self):\n        '''\n        time_queue = [ test_project:taskid ]\n        scheduler2fetcher = [ test_project:taskid# ]\n        processing = [ test_project:taskid# ]\n        '''\n        self.status_queue.put({\n            'taskid': 'taskid',\n            'project': 'test_project',\n            'url': 'url',\n            'schedule': {\n                'age': 0,\n                'retries': 1,\n            },\n            'track': {\n                'fetch': {\n                    'ok': True\n                },\n                'process': {\n                    'ok': True\n                },\n            }\n        })  # task done test_project:taskid url\n\n        from six.moves import queue as Queue\n        with self.assertRaises(Queue.Empty):\n            self.scheduler2fetcher.get(timeout=5)\n\n    def test_38_cancel_task(self):\n        current_size = self.rpc.size()\n        self.newtask_queue.put({\n            'taskid': 'taskid_to_cancel',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'age': 0,\n                'exetime': time.time() + 30\n  
          },\n        })  # new task test_project:taskid_to_cancel url\n        # task_queue = [ test_project:taskid_to_cancel ]\n\n        time.sleep(0.2)\n        self.assertEqual(self.rpc.size(), current_size+1)\n\n        self.newtask_queue.put({\n            'taskid': 'taskid_to_cancel',\n            'project': 'test_project',\n            'url': 'url',\n            'fetch': {\n                'data': 'abc',\n            },\n            'process': {\n                'data': 'abc',\n            },\n            'schedule': {\n                'force_update': True,\n                'age': 0,\n                'cancel': True\n            },\n        })  # new cancel test_project:taskid_to_cancel url\n        # task_queue = [ ]\n\n        time.sleep(0.2)\n        self.assertEqual(self.rpc.size(), current_size)\n\n    def test_x10_inqueue_limit(self):\n        self.projectdb.insert('test_inqueue_project', {\n            'name': 'test_inqueue_project',\n            'group': 'group',\n            'status': 'DEBUG',\n            'script': 'import time\\nprint(time.time())',\n            'comments': 'test project',\n            'rate': 0,\n            'burst': 0,\n        })\n        time.sleep(0.1)\n        pre_size = self.rpc.size()\n        for i in range(20):\n            self.newtask_queue.put({\n                'taskid': 'taskid%d' % i,\n                'project': 'test_inqueue_project',\n                'url': 'url',\n                'schedule': {\n                    'age': 3000,\n                    'force_update': True,\n                },\n            })\n        time.sleep(1)\n        self.assertEqual(self.rpc.size() - pre_size, 10)\n\n    def test_x20_delete_project(self):\n        self.assertIsNotNone(self.projectdb.get('test_inqueue_project'))\n        #self.assertIsNotNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))\n        self.projectdb.update('test_inqueue_project', status=\"STOP\", group=\"lock,delete\")\n        time.sleep(1)\n        
self.assertIsNone(self.projectdb.get('test_inqueue_project'))\n        self.taskdb._list_project()\n        self.assertIsNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))\n        self.assertNotIn('test_inqueue_project', self.rpc.counter('5m', 'sum'))\n\n    def test_z10_startup(self):\n        self.assertTrue(self.process.is_alive())\n\n    def test_z20_quit(self):\n        self.rpc._quit()\n        time.sleep(0.2)\n        self.assertFalse(self.process.is_alive())\n        self.assertEqual(\n            self.taskdb.get_task('test_project', 'taskid')['status'],\n            self.taskdb.SUCCESS\n        )\n\n\nfrom pyspider.scheduler.scheduler import Project\n\nclass TestProject(unittest.TestCase):\n    task_pack = {\n        'type': Scheduler.TASK_PACK,\n        'taskid': 'taskid',\n        'project': 'test_project',\n        'url': 'url',\n        'fetch': {\n            'data': 'abc',\n        },\n        'process': {\n            'data': 'abc',\n        },\n        'schedule': {\n            'age': 0,\n        },\n    }\n\n    status_ok_pack = {\n        'taskid': 'taskid',\n        'project': 'test_project',\n        'url': 'url',\n        'schedule': {\n            'age': 0,\n            'retries': 1,\n        },\n        'track': {\n            'fetch': {\n                'ok': True\n            },\n            'process': {\n                'ok': True\n            },\n        }\n    }\n\n    status_fail_pack = {\n        'taskid': 'taskid',\n        'project': 'test_project',\n        'url': 'url',\n        'schedule': {\n            'age': 0,\n            'retries': 1,\n        },\n        'track': {\n            'fetch': {\n                'ok': False\n            },\n            'process': {\n                'ok': False\n            },\n        }\n    }\n\n    @classmethod\n    def setUpClass(self):\n        self.scheduler = Scheduler(taskdb=None, projectdb=None, newtask_queue=None, status_queue=None, out_queue=None)\n        
self.scheduler.PAUSE_TIME = 2\n        self.project = Project(self.scheduler, {\n            'name': 'test_project_not_started',\n            'group': 'group',\n            'status': 'RUNNING',\n            'script': 'import time\\nprint(time.time())',\n            'comments': 'test project',\n            'rate': 1.0,\n            'burst': 10,\n            'updatetime': time.time(),\n        })\n\n    def test_pause_10_unpaused(self):\n        self.assertFalse(self.project.paused)\n\n    def test_pause_20_no_enough_fail_tasks(self):\n        for i in range(3):\n            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))\n        self.assertFalse(self.project.paused)\n\n        for i in range(1):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_ok_pack)))\n        for i in range(self.scheduler.FAIL_PAUSE_NUM - 5):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))\n        self.assertFalse(self.project.paused)\n\n        for i in range(5):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))\n        for i in range(1):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_ok_pack)))\n        self.assertFalse(self.project.paused)\n\n        for i in range(self.scheduler.FAIL_PAUSE_NUM):\n            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))\n        self.assertFalse(self.project.paused)\n\n    def test_pause_30_paused(self):\n        for i in range(self.scheduler.FAIL_PAUSE_NUM):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))\n        for i in range(self.scheduler.FAIL_PAUSE_NUM):\n            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))\n        self.assertTrue(self.project.paused)\n\n    def test_pause_40_unpause_checking(self):\n        time.sleep(3)\n        
self.assertFalse(self.project.paused)\n\n    def test_pause_50_paused_again(self):\n        for i in range(self.scheduler.UNPAUSE_CHECK_NUM):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))\n        self.assertTrue(self.project.paused)\n\n    def test_pause_60_unpause_checking(self):\n        time.sleep(3)\n        self.assertFalse(self.project.paused)\n\n    def test_pause_70_unpaused(self):\n        for i in range(1):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_ok_pack)))\n        for i in range(self.scheduler.UNPAUSE_CHECK_NUM):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))\n        for i in range(self.scheduler.FAIL_PAUSE_NUM):\n            self.project.active_tasks.appendleft((time.time(), dict(self.task_pack)))\n        self.assertFalse(self.project.paused)\n        self.assertFalse(self.project._paused)\n\n    def test_pause_x_disable_auto_pause(self):\n        fail_pause_num = self.scheduler.FAIL_PAUSE_NUM\n        self.scheduler.FAIL_PAUSE_NUM = 0\n        for i in range(100):\n            self.project.active_tasks.appendleft((time.time(), dict(self.status_fail_pack)))\n        self.assertFalse(self.project.paused)\n        self.scheduler.FAIL_PAUSE_NUM = fail_pause_num\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "tests/test_task_queue.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\nimport time\nimport unittest\n\nimport six\nfrom six.moves import queue as Queue\n\nfrom pyspider.scheduler.task_queue import InQueueTask, TaskQueue\n\n\nclass TestTaskQueue(unittest.TestCase):\n    \"\"\"\n        TestTaskQueue\n    \"\"\"\n\n    def test_task_queue_in_time_order(self):\n        tq = TaskQueue(rate=300, burst=1000)\n\n        queues = dict()\n        tasks = dict()\n\n        for i in range(0, 100):\n            it = InQueueTask(str(i), priority=int(i // 10), exetime=0)\n            tq.put(it.taskid, it.priority, it.exetime)\n\n            if it.priority not in queues:\n                queues[it.priority] = Queue.Queue()\n\n            q = queues[it.priority]  # type:Queue.Queue\n            q.put(it)\n            tasks[it.taskid] = it\n            # six.print_('put, taskid=', it.taskid, 'priority=', it.priority, 'exetime=', it.exetime)\n        for i in range(0, 100):\n            task_id = tq.get()\n            task = tasks[task_id]\n            q = queues[task.priority]  # type: Queue.Queue\n            expect_task = q.get()\n            self.assertEqual(task_id, expect_task.taskid)\n            self.assertEqual(task.priority, int(9 - i // 10))\n            # six.print_('get, taskid=', task.taskid, 'priority=', task.priority, 'exetime=', task.exetime)\n\n        self.assertEqual(tq.size(), 100)\n        self.assertEqual(tq.priority_queue.qsize(), 0)\n        self.assertEqual(tq.processing.qsize(), 100)\n        for q in six.itervalues(queues):  # type:Queue.Queue\n            self.assertEqual(q.qsize(), 0)\n        pass\n\n    pass\n\n\nclass TestTimeQueue(unittest.TestCase):\n    def test_time_queue(self):\n\n        # six.print_('Test time queue order by time only')\n\n        tq = TaskQueue(rate=300, burst=1000)\n\n        fifo_queue = Queue.Queue()\n\n        interval = 5.0 / 1000\n\n        for i in range(0, 20):\n            it = InQueueTask(str(i), priority=int(i // 10), 
exetime=time.time() + (i + 1) * interval)\n            tq.put(it.taskid, it.priority, it.exetime)\n            fifo_queue.put(it)\n            # six.print_('put, taskid=', it.taskid, 'priority=', it.priority, 'exetime=', it.exetime)\n\n        self.assertEqual(tq.priority_queue.qsize(), 0)\n        self.assertEqual(tq.processing.qsize(), 0)\n        self.assertEqual(tq.time_queue.qsize(), 20)\n\n        for i in range(0, 20):\n            t1 = fifo_queue.get()\n            t2 = tq.time_queue.get()\n            self.assertEqual(t1.taskid, t2.taskid)\n            # six.print_('get, taskid=', t2.taskid, 'priority=', t2.priority, 'exetime=', t2.exetime)\n        self.assertEqual(tq.priority_queue.qsize(), 0)\n        self.assertEqual(tq.processing.qsize(), 0)\n        self.assertEqual(tq.time_queue.qsize(), 0)\n\n        queues = dict()\n        tasks = dict()\n        for i in range(0, 20):\n            priority = int(i // 10)\n            it = InQueueTask(str(i), priority=priority, exetime=time.time() + (i + 1) * interval)\n            tq.put(it.taskid, it.priority, it.exetime)\n            tasks[it.taskid] = it\n\n            if priority not in queues:\n                queues[priority] = Queue.Queue()\n            q = queues[priority]\n            q.put(it)\n            pass\n\n        self.assertEqual(tq.priority_queue.qsize(), 0)\n        self.assertEqual(tq.processing.qsize(), 0)\n        self.assertEqual(tq.time_queue.qsize(), 20)\n\n        time.sleep(20 * interval)\n        tq.check_update()\n        self.assertEqual(tq.priority_queue.qsize(), 20)\n        self.assertEqual(tq.processing.qsize(), 0)\n        self.assertEqual(tq.time_queue.qsize(), 0)\n        for i in range(0, 20):\n            taskid = tq.get()\n            t1 = tasks[taskid]\n            t2 = queues[t1.priority].get()\n            self.assertEqual(t1.taskid, t2.taskid)\n\n        self.assertEqual(tq.priority_queue.qsize(), 0)\n        self.assertEqual(tq.processing.qsize(), 20)\n        
self.assertEqual(tq.time_queue.qsize(), 0)\n\n        pass\n\n    pass\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "tests/test_utils.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-01-18 16:53:49\n\nimport sys\nimport time\nimport unittest\n\nfrom pyspider.libs import utils\n\nclass TestFetcher(unittest.TestCase):\n    def test_readonlydict(self):\n        data = dict(a='a', b=123)\n        data['c'] = self\n        data = utils.ReadOnlyDict(data)\n\n        with self.assertRaises(Exception):\n            data['d'] = 9\n\n    def test_getitem(self):\n        l = [1, 2]\n        self.assertEqual(utils.getitem(l, 0), 1)\n        self.assertEqual(utils.getitem(l, 1), 2)\n        self.assertEqual(utils.getitem(l, 3), None)\n        self.assertEqual(utils.getitem(l, 3, 9), 9)\n        self.assertEqual(utils.getitem(l, 'key'), None)\n        self.assertEqual(utils.getitem(l, 'key', 8), 8)\n        data = dict(a='a', b=123)\n        self.assertEqual(utils.getitem(data, 'a'), 'a')\n        self.assertEqual(utils.getitem(data, 'b'), 123)\n        self.assertEqual(utils.getitem(data, 'c'), None)\n        self.assertEqual(utils.getitem(data, 'c', 9), 9)\n\n    def test_format_data(self):\n        now = time.time()\n        self.assertEqual(utils.format_date(now - 30), '30 seconds ago')\n        self.assertEqual(utils.format_date(now - 60), '1 minute ago')\n        self.assertEqual(utils.format_date(now - 2*60), '2 minutes ago')\n        self.assertEqual(utils.format_date(now - 30*60), '30 minutes ago')\n        self.assertEqual(utils.format_date(now - 60*60), '1 hour ago')\n        self.assertEqual(utils.format_date(1963475336), 'Mar 21, 2032 at 9:48')\n        self.assertEqual(utils.format_date(now - 12*60*60), '12 hours ago')\n        self.assertRegex(utils.format_date(now - 24*60*60), r'^yesterday at \\d{1,2}:\\d{2}$')\n        self.assertRegex(utils.format_date(now - 2*24*60*60), r'^[A-Z][a-z]+ at \\d{1,2}:\\d{2}$')\n        
self.assertRegex(utils.format_date(now - 3*24*60*60), r'^[A-Z][a-z]+ at \\d{1,2}:\\d{2}$')\n        self.assertRegex(utils.format_date(now - 4*24*60*60), r'^[A-Z][a-z]+ at \\d{1,2}:\\d{2}$')\n        self.assertRegex(utils.format_date(now - 5*24*60*60), r'^\\d{1,2}-\\d{1,2} at \\d{1,2}:\\d{2}$')\n        self.assertRegex(utils.format_date(now - 333*24*60*60), r'^\\d{1,2}-\\d{1,2} at \\d{1,2}:\\d{2}$')\n        self.assertRegex(utils.format_date(now - 334*24*60*60), r'^[A-Z][a-z]+ \\d{1,2}, \\d{4} at \\d{1,2}:\\d{2}$')\n"
  },
  {
    "path": "tests/test_webdav.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-06-03 21:15\n\nimport os\nimport sys\nimport six\nimport time\nimport shutil\nimport inspect\nimport unittest\n\nfrom six import BytesIO\nfrom pyspider import run\nfrom pyspider.libs import utils\nfrom tests import data_sample_handler, data_handler\n\n@unittest.skipIf(sys.version_info >= (3, 6), \"easywebdav doesn't support python 3.6\")\nclass TestWebDav(unittest.TestCase):\n    @classmethod\n    def setUpClass(self):\n        import easywebdav\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        os.makedirs('./data/tests')\n\n        ctx = run.cli.make_context('test', [\n            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',\n            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',\n            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',\n        ], None, obj=utils.ObjectDict(testing_mode=True))\n        self.ctx = run.cli.invoke(ctx)\n\n        ctx = run.webui.make_context('webui', [\n            '--username', 'binux',\n            '--password', '4321',\n        ], self.ctx)\n        self.app = run.webui.invoke(ctx)\n        self.app_thread = utils.run_in_thread(self.app.run)\n        time.sleep(5)\n\n        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')\n        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',\n                                            username='binux', password='4321')\n\n    @classmethod\n    def tearDownClass(self):\n        for each in self.ctx.obj.instances:\n            each.quit()\n        self.app_thread.join()\n        time.sleep(1)\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n        assert 
not utils.check_port_open(14887)\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n\n    def test_10_ls(self):\n        self.assertEqual(len(self.webdav.ls()), 1)\n\n    def test_20_create_error(self):\n        import easywebdav\n        with self.assertRaises(easywebdav.OperationFailed):\n            self.webdav.upload(inspect.getsourcefile(data_sample_handler),\n                               'bad_file_name')\n        with self.assertRaises(easywebdav.OperationFailed):\n            self.webdav.upload(inspect.getsourcefile(data_sample_handler),\n                               'bad.file.name')\n\n    def test_30_create_ok(self):\n        self.webdav.upload(inspect.getsourcefile(data_handler), 'handler.py')\n        self.webdav.upload(inspect.getsourcefile(data_sample_handler), 'sample_handler.py')\n        self.assertEqual(len(self.webdav.ls()), 3)\n\n    def test_40_get_404(self):\n        io = BytesIO()\n        import easywebdav\n        with self.assertRaises(easywebdav.OperationFailed):\n            self.webdav.download('not_exitst', io)\n        io.close()\n\n    def test_50_get(self):\n        io = BytesIO()\n        self.webdav.download('handler.py', io)\n        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))\n        io.close()\n\n        io = BytesIO()\n        self.webdav.download('sample_handler.py', io)\n        self.assertEqual(utils.text(inspect.getsource(data_sample_handler)), utils.text(io.getvalue()))\n        io.close()\n\n    def test_60_edit(self):\n        self.webdav.upload(inspect.getsourcefile(data_handler), 'sample_handler.py')\n\n    def test_70_get(self):\n        io = BytesIO()\n        self.webdav.download('sample_handler.py', io)\n        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))\n        io.close()\n\n    def test_80_password(self):\n        import requests\n        rv = requests.post('http://localhost:5000/update', data={\n           
 'name': 'group',\n            'value': 'lock',\n            'pk': 'sample_handler',\n        })\n        self.assertEqual(rv.status_code, 200)\n\n        import easywebdav\n        with self.assertRaises(easywebdav.OperationFailed):\n            self.webdav.upload(inspect.getsourcefile(data_sample_handler), 'sample_handler.py')\n        self.webdav_up.upload(inspect.getsourcefile(data_sample_handler), 'sample_handler.py')\n\n\n@unittest.skipIf(sys.version_info >= (3, 6), \"easywebdav doesn't support python 3.6\")\nclass TestWebDavNeedAuth(unittest.TestCase):\n    @classmethod\n    def setUpClass(self):\n        import easywebdav\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        os.makedirs('./data/tests')\n\n        ctx = run.cli.make_context('test', [\n            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',\n            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',\n            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',\n        ], None, obj=utils.ObjectDict(testing_mode=True))\n        self.ctx = run.cli.invoke(ctx)\n\n        ctx = run.webui.make_context('webui', [\n            '--username', 'binux',\n            '--password', '4321',\n            '--need-auth',\n        ], self.ctx)\n        self.app = run.webui.invoke(ctx)\n        self.app_thread = utils.run_in_thread(self.app.run)\n        time.sleep(5)\n\n        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')\n        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',\n                                            username='binux', password='4321')\n\n    @classmethod\n    def tearDownClass(self):\n        for each in self.ctx.obj.instances:\n            each.quit()\n        self.app_thread.join()\n        time.sleep(1)\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not 
utils.check_port_open(25555)\n        assert not utils.check_port_open(14887)\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n\n    def test_10_ls(self):\n        import easywebdav\n        with self.assertRaises(easywebdav.OperationFailed):\n            self.assertEqual(len(self.webdav.ls()), 1)\n        self.assertEqual(len(self.webdav_up.ls()), 1)\n\n    def test_30_create_ok(self):\n        self.webdav_up.upload(inspect.getsourcefile(data_handler), 'handler.py')\n        self.assertEqual(len(self.webdav_up.ls()), 2)\n\n    def test_50_get(self):\n        import easywebdav\n        with self.assertRaises(easywebdav.OperationFailed):\n            io = BytesIO()\n            self.webdav.download('handler.py', io)\n            io.close()\n\n        io = BytesIO()\n        self.webdav_up.download('handler.py', io)\n        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))\n        io.close()\n"
  },
  {
    "path": "tests/test_webui.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2014-11-18 21:03:22\n\nimport os\nimport re\nimport time\nimport json\nimport shutil\nimport unittest\n\nfrom pyspider import run\nfrom pyspider.libs import utils\nfrom pyspider.libs.utils import run_in_thread, ObjectDict\n\n\nclass TestWebUI(unittest.TestCase):\n\n    @classmethod\n    def setUpClass(self):\n        shutil.rmtree('./data/tests', ignore_errors=True)\n        os.makedirs('./data/tests')\n\n        import tests.data_test_webpage\n        import httpbin\n        from pyspider.webui import bench_test  # flake8: noqa\n        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)\n        self.httpbin = 'http://127.0.0.1:14887'\n\n        ctx = run.cli.make_context('test', [\n            '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db',\n            '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db',\n            '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db',\n        ], None, obj=ObjectDict(testing_mode=True))\n        self.ctx = run.cli.invoke(ctx)\n\n        self.threads = []\n\n        ctx = run.scheduler.make_context('scheduler', [], self.ctx)\n        self.scheduler = scheduler = run.scheduler.invoke(ctx)\n        self.threads.append(run_in_thread(scheduler.xmlrpc_run))\n        self.threads.append(run_in_thread(scheduler.run))\n\n        ctx = run.fetcher.make_context('fetcher', [\n            '--xmlrpc-port', '24444',\n        ], self.ctx)\n        fetcher = run.fetcher.invoke(ctx)\n        self.threads.append(run_in_thread(fetcher.xmlrpc_run))\n        self.threads.append(run_in_thread(fetcher.run))\n\n        ctx = run.processor.make_context('processor', [], self.ctx)\n        processor = run.processor.invoke(ctx)\n        
self.threads.append(run_in_thread(processor.run))\n\n        ctx = run.result_worker.make_context('result_worker', [], self.ctx)\n        result_worker = run.result_worker.invoke(ctx)\n        self.threads.append(run_in_thread(result_worker.run))\n\n        ctx = run.webui.make_context('webui', [\n            '--scheduler-rpc', 'http://localhost:23333/'\n        ], self.ctx)\n        app = run.webui.invoke(ctx)\n        app.debug = True\n        self.app = app.test_client()\n        self.rpc = app.config['scheduler_rpc']\n\n        time.sleep(1)\n\n    @classmethod\n    def tearDownClass(self):\n        for each in self.ctx.obj.instances:\n            each.quit()\n        time.sleep(1)\n\n        for thread in self.threads:\n            thread.join()\n\n        self.httpbin_thread.terminate()\n        self.httpbin_thread.join()\n\n        assert not utils.check_port_open(5000)\n        assert not utils.check_port_open(23333)\n        assert not utils.check_port_open(24444)\n        assert not utils.check_port_open(25555)\n        assert not utils.check_port_open(14887)\n\n        shutil.rmtree('./data/tests', ignore_errors=True)\n\n    def test_10_index_page(self):\n        rv = self.app.get('/')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'dashboard', rv.data)\n\n    def test_20_debug(self):\n        rv = self.app.get('/debug/test_project')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'debugger', rv.data)\n        self.assertIn(b'var task_content = ', rv.data)\n        self.assertIn(b'var script_content = ', rv.data)\n\n        m = re.search(r'var task_content = (.*);\\n', utils.text(rv.data))\n        self.assertIsNotNone(m)\n        self.assertIn('test_project', json.loads(m.group(1)))\n\n        m = re.search(r'var script_content = (.*);\\n', utils.text(rv.data))\n        self.assertIsNotNone(m)\n        self.assertIn('__START_URL__', json.loads(m.group(1)))\n\n    def test_25_debug_post(self):\n        rv = 
self.app.post('/debug/test_project', data={\n            'project-name': 'other_project',\n            'start-urls': 'http://127.0.0.1:14887/pyspider/test.html',\n            'script-mode': 'script',\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'debugger', rv.data)\n        self.assertIn(b'var task_content = ', rv.data)\n        self.assertIn(b'var script_content = ', rv.data)\n\n        m = re.search(r'var task_content = (.*);\\n', utils.text(rv.data))\n        self.assertIsNotNone(m)\n        self.assertIn('test_project', m.group(1))\n        self.__class__.task_content = json.loads(m.group(1))\n\n        m = re.search(r'var script_content = (.*);\\n', utils.text(rv.data))\n        self.assertIsNotNone(m)\n        self.assertIn('127.0.0.1:14887', m.group(1))\n        self.__class__.script_content = json.loads(m.group(1))\n\n    def test_30_run(self):\n        rv = self.app.post('/debug/test_project/run', data={\n            'script': self.script_content,\n            'task': self.task_content\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertIn(b'follows', rv.data)\n        self.assertGreater(len(data['follows']), 0)\n        self.__class__.task_content2 = data['follows'][0]\n\n    def test_32_run_bad_task(self):\n        rv = self.app.post('/debug/test_project/run', data={\n            'script': self.script_content,\n            'task': self.task_content+'asdfasdf312!@#'\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertGreater(len(data['logs']), 0)\n        self.assertEqual(len(data['follows']), 0)\n\n    def test_33_run_bad_script(self):\n        rv = self.app.post('/debug/test_project/run', data={\n            'script': self.script_content+'adfasfasdf',\n            'task': self.task_content\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = 
json.loads(utils.text(rv.data))\n        self.assertGreater(len(data['logs']), 0)\n        self.assertEqual(len(data['follows']), 0)\n\n    def test_35_run_http_task(self):\n        rv = self.app.post('/debug/test_project/run', data={\n            'script': self.script_content,\n            'task': json.dumps(self.task_content2)\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertIn('follows', data)\n\n    def test_40_save(self):\n        rv = self.app.post('/debug/test_project/save', data={\n            'script': self.script_content,\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n    def test_42_get(self):\n        rv = self.app.get('/debug/test_project/get')\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertIn('script', data)\n        self.assertEqual(data['script'], self.script_content)\n\n    def test_45_run_with_saved_script(self):\n        rv = self.app.post('/debug/test_project/run', data={\n            'webdav_mode': 'true',\n            'script': '',\n            'task': self.task_content\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertIn(b'follows', rv.data)\n        self.assertGreater(len(data['follows']), 0)\n        self.__class__.task_content2 = data['follows'][0]\n\n    def test_50_index_page_list(self):\n        rv = self.app.get('/')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'\"test_project\"', rv.data)\n\n    def test_52_change_status(self):\n        rv = self.app.post('/update', data={\n            'name': 'status',\n            'value': 'RUNNING',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n    def test_55_reopen(self):\n        rv = 
self.app.get('/debug/test_project')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'debugger', rv.data)\n\n    def test_57_resave(self):\n        rv = self.app.post('/debug/test_project/save', data={\n            'script': self.script_content,\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n    def test_58_index_page_list(self):\n        rv = self.app.get('/')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'CHECKING', rv.data)\n\n    def test_60_change_rate(self):\n        rv = self.app.post('/update', data={\n            'name': 'rate',\n            'value': '1/4',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n    def test_70_change_status(self):\n        rv = self.app.post('/update', data={\n            'name': 'status',\n            'value': 'RUNNING',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n    def test_80_change_group(self):\n        rv = self.app.post('/update', data={\n            'name': 'group',\n            'value': 'test_binux',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n        rv = self.app.get('/')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'test_binux', rv.data)\n\n    def test_90_run(self):\n        time.sleep(0.5)\n        rv = self.app.post('/run', data={\n            'project': 'test_project',\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertEqual(json.loads(utils.text(rv.data))['result'], True)\n\n    def test_a10_counter(self):\n        for i in range(30):\n            time.sleep(1)\n            if self.rpc.counter('5m', 'sum')\\\n                    .get('test_project', {}).get('success', 0) > 5:\n                
break\n\n        rv = self.app.get('/counter')\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertGreater(len(data), 0)\n        self.assertGreater(data['test_project']['5m']['success'], 3)\n        self.assertGreater(data['test_project']['1h']['success'], 3)\n        self.assertGreater(data['test_project']['1d']['success'], 3)\n        self.assertGreater(data['test_project']['all']['success'], 3)\n\n    def test_a15_queues(self):\n        rv = self.app.get('/queues')\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertGreater(len(data), 0)\n        self.assertIn('scheduler2fetcher', data)\n        self.assertIn('fetcher2processor', data)\n        self.assertIn('processor2result', data)\n        self.assertIn('newtask_queue', data)\n        self.assertIn('status_queue', data)\n\n    def test_a20_tasks(self):\n        rv = self.app.get('/tasks')\n        self.assertEqual(rv.status_code, 200, rv.data)\n        self.assertIn(b'SUCCESS</span>', rv.data)\n        self.assertNotIn(b'>ERROR</span>', rv.data)\n        m = re.search(r'/task/test_project:[^\"]+', utils.text(rv.data))\n        self.assertIsNotNone(m)\n        self.__class__.task_url = m.group(0)\n        self.assertIsNotNone(self.task_url)\n        m = re.search(r'/debug/test_project[^\"]+', utils.text(rv.data))\n        self.assertIsNotNone(m)\n        self.__class__.debug_task_url = m.group(0)\n        self.assertIsNotNone(self.debug_task_url)\n\n        rv = self.app.get('/tasks?project=test_project')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'SUCCESS</span>', rv.data)\n        self.assertNotIn(b'>ERROR</span>', rv.data)\n\n    def test_a22_active_tasks(self):\n        rv = self.app.get('/active_tasks')\n        data = json.loads(utils.text(rv.data))\n        track = False\n        self.assertGreater(len(data), 0)\n        for task in data:\n            
for k in ('taskid', 'project', 'url', 'updatetime'):\n                self.assertIn(k, task)\n            if task.get('track'):\n                track = True\n                self.assertIn('fetch', task['track'])\n                self.assertIn('ok', task['track']['fetch'])\n                self.assertIn('time', task['track']['fetch'])\n                self.assertIn('process', task['track'])\n                self.assertIn('ok', task['track']['process'])\n                self.assertIn('time', task['track']['process'])\n        self.assertTrue(track)\n                    \n\n    def test_a24_task(self):\n        rv = self.app.get(self.task_url)\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'lastcrawltime', rv.data)\n\n    def test_a25_task_json(self):\n        rv = self.app.get(self.task_url + '.json')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn('status_string', json.loads(utils.text(rv.data)))\n\n    def test_a26_debug_task(self):\n        rv = self.app.get(self.debug_task_url)\n        self.assertEqual(rv.status_code, 200)\n\n    def test_a30_results(self):\n        rv = self.app.get('/results?project=test_project')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'<th>url</th>', rv.data)\n        self.assertIn(b'open-url', rv.data)\n\n    def test_a30_export_json(self):\n        rv = self.app.get('/results/dump/test_project.json')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'\"taskid\":', rv.data)\n\n    def test_a32_export_json_style_full(self):\n        rv = self.app.get('/results/dump/test_project.json?style=full')\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(rv.data.decode('utf8'))\n        self.assertGreater(len(data), 1)\n\n    def test_a34_export_json_style_full_limit_1(self):\n        rv = self.app.get('/results/dump/test_project.json?style=full&limit=1&offset=1')\n        self.assertEqual(rv.status_code, 200)\n        data = 
json.loads(rv.data.decode('utf8'))\n        self.assertEqual(len(data), 1)\n\n    def test_a40_export_url_json(self):\n        rv = self.app.get('/results/dump/test_project.txt')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'\"url\":', rv.data)\n\n    def test_a50_export_csv(self):\n        rv = self.app.get('/results/dump/test_project.csv')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'url,title,url', rv.data)\n\n    def test_a60_fetch_via_cannot_connect_fetcher(self):\n        ctx = run.webui.make_context('webui', [\n            '--fetcher-rpc', 'http://localhost:20000/',\n        ], self.ctx)\n        app = run.webui.invoke(ctx)\n        app = app.test_client()\n        rv = app.post('/debug/test_project/run', data={\n            'script': self.script_content,\n            'task': self.task_content\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertGreater(len(data['logs']), 0)\n        self.assertEqual(len(data['follows']), 0)\n\n    def test_a70_fetch_via_fetcher(self):\n        ctx = run.webui.make_context('webui', [\n            '--fetcher-rpc', 'http://localhost:24444/',\n        ], self.ctx)\n        app = run.webui.invoke(ctx)\n        app = app.test_client()\n        rv = app.post('/debug/test_project/run', data={\n            'script': self.script_content,\n            'task': self.task_content\n        })\n        self.assertEqual(rv.status_code, 200)\n        data = json.loads(utils.text(rv.data))\n        self.assertEqual(len(data['logs']), 0, data['logs'])\n        self.assertIn(b'follows', rv.data)\n        self.assertGreater(len(data['follows']), 0)\n\n    def test_h000_auth(self):\n        ctx = run.webui.make_context('webui', [\n            '--scheduler-rpc', 'http://localhost:23333/',\n            '--username', 'binux',\n            '--password', '4321',\n        ], self.ctx)\n        app = run.webui.invoke(ctx)\n      
  self.__class__.app = app.test_client()\n        self.__class__.rpc = app.config['scheduler_rpc']\n\n    def test_h005_no_such_project(self):\n        rv = self.app.post('/update', data={\n            'name': 'group',\n            'value': 'lock',\n            'pk': 'not_exist_project'\n        })\n        self.assertEqual(rv.status_code, 404)\n\n    def test_h005_unknown_field(self):\n        rv = self.app.post('/update', data={\n            'name': 'unknown_field',\n            'value': 'lock',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 400)\n\n    def test_h005_rate_wrong_format(self):\n        rv = self.app.post('/update', data={\n            'name': 'rate',\n            'value': 'xxx',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 400)\n\n    def test_h010_change_group(self):\n        rv = self.app.post('/update', data={\n            'name': 'group',\n            'value': 'lock',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ok', rv.data)\n\n        rv = self.app.get('/')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'lock', rv.data)\n\n    def test_h020_change_group_lock_failed(self):\n        rv = self.app.post('/update', data={\n            'name': 'group',\n            'value': '',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 401)\n\n    def test_h020_change_group_lock_ok(self):\n        rv = self.app.post('/update', data={\n            'name': 'group',\n            'value': 'test_binux',\n            'pk': 'test_project'\n        }, headers={\n            'Authorization': 'Basic YmludXg6NDMyMQ=='\n        })\n        self.assertEqual(rv.status_code, 200)\n\n    def test_h030_need_auth(self):\n        ctx = run.webui.make_context('webui', [\n            '--scheduler-rpc', 'http://localhost:23333/',\n            '--username', 
'binux',\n            '--password', '4321',\n            '--need-auth',\n        ], self.ctx)\n        app = run.webui.invoke(ctx)\n        self.__class__.app = app.test_client()\n        self.__class__.rpc = app.config['scheduler_rpc']\n\n    def test_h040_auth_fail(self):\n        rv = self.app.get('/')\n        self.assertEqual(rv.status_code, 401)\n\n    def test_h050_auth_fail2(self):\n        rv = self.app.get('/', headers={\n            'Authorization': 'Basic Ymlasdfsd'\n        })\n        self.assertEqual(rv.status_code, 401)\n\n    def test_h060_auth_fail3(self):\n        rv = self.app.get('/', headers={\n            'Authorization': 'Basic YmludXg6MQ=='\n        })\n        self.assertEqual(rv.status_code, 401)\n\n    def test_h070_auth_ok(self):\n        rv = self.app.get('/', headers={\n            'Authorization': 'Basic YmludXg6NDMyMQ=='\n        })\n        self.assertEqual(rv.status_code, 200)\n\n    def test_x0_disconnected_scheduler(self):\n        ctx = run.webui.make_context('webui', [\n            '--scheduler-rpc', 'http://localhost:23458/'\n        ], self.ctx)\n        app = run.webui.invoke(ctx)\n        self.__class__.app = app.test_client()\n        self.__class__.rpc = app.config['scheduler_rpc']\n\n    def test_x10_project_update(self):\n        rv = self.app.post('/update', data={\n            'name': 'status',\n            'value': 'RUNNING',\n            'pk': 'test_project'\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertNotIn(b'ok', rv.data)\n\n    def test_x20_counter(self):\n        rv = self.app.get('/counter?time=5m&type=sum')\n        self.assertEqual(rv.status_code, 200)\n        self.assertEqual(json.loads(utils.text(rv.data)), {})\n\n    def test_x30_run_not_exists_project(self):\n        rv = self.app.post('/run', data={\n            'project': 'not_exist_project',\n        })\n        self.assertEqual(rv.status_code, 404)\n\n    def test_x30_run(self):\n        rv = self.app.post('/run', 
data={\n            'project': 'test_project',\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertEqual(json.loads(utils.text(rv.data))['result'], False)\n\n    def test_x40_debug_save(self):\n        rv = self.app.post('/debug/test_project/save', data={\n            'script': self.script_content,\n        })\n        self.assertEqual(rv.status_code, 200)\n        self.assertNotIn(b'ok', rv.data)\n\n    def test_x50_tasks(self):\n        rv = self.app.get('/tasks')\n        self.assertEqual(rv.status_code, 502)\n\n    def test_x60_robots(self):\n        rv = self.app.get('/robots.txt')\n        self.assertEqual(rv.status_code, 200)\n        self.assertIn(b'ser-agent', rv.data)\n\n    def test_x70_bench(self):\n        rv = self.app.get('/bench?total=10&show=5')\n        self.assertEqual(rv.status_code, 200)\n"
  },
  {
    "path": "tests/test_xmlrpc.py",
    "content": "#   Copyright (c) 2006-2007 Open Source Applications Foundation\n#\n#   Licensed under the Apache License, Version 2.0 (the \"License\");\n#   you may not use this file except in compliance with the License.\n#   You may obtain a copy of the License at\n#\n#       http://www.apache.org/licenses/LICENSE-2.0\n#\n#   Unless required by applicable law or agreed to in writing, software\n#   distributed under the License is distributed on an \"AS IS\" BASIS,\n#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n#   See the License for the specific language governing permissions and\n#   limitations under the License.\n#\n#   Origin: https://code.google.com/p/wsgi-xmlrpc/\n\nimport unittest\nimport tornado.wsgi\nimport tornado.ioloop\nimport tornado.httpserver\nfrom pyspider.libs import utils\n\nclass TestXMLRPCServer(unittest.TestCase):\n    @classmethod\n    def setUpClass(self):\n        from pyspider.libs import wsgi_xmlrpc\n        \n        def test_1():\n            return 'test_1'\n            \n        class Test2(object):\n            def test_3(self, obj):\n                return obj\n                \n        test = Test2()\n        \n        application = wsgi_xmlrpc.WSGIXMLRPCApplication()\n        application.register_instance(Test2())\n        application.register_function(test_1)\n\n        container = tornado.wsgi.WSGIContainer(application)\n        self.io_loop = tornado.ioloop.IOLoop.current()\n        http_server = tornado.httpserver.HTTPServer(container, io_loop=self.io_loop)\n        http_server.listen(3423)\n        self.thread = utils.run_in_thread(self.io_loop.start)\n\n    @classmethod\n    def tearDownClass(self):\n        self.io_loop.add_callback(self.io_loop.stop)\n        self.thread.join()\n    \n    def test_xmlrpc_server(self, uri='http://127.0.0.1:3423'):\n        from six.moves.xmlrpc_client import ServerProxy\n        \n        client = ServerProxy(uri)\n        \n        assert client.test_1() 
== 'test_1'\n        assert client.test_3({'asdf':4}) == {'asdf':4}\n"
  },
  {
    "path": "tools/migrate.py",
    "content": "#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:\n# Author: Binux<roy@binux.me>\n#         http://binux.me\n# Created on 2015-09-30 23:22:46\n\nimport click\nimport logging\nfrom pyspider.database.base.projectdb import ProjectDB\nfrom pyspider.database.base.taskdb import TaskDB\nfrom pyspider.database.base.resultdb import ResultDB\nfrom pyspider.database import connect_database\nfrom pyspider.libs.utils import unicode_obj\nfrom multiprocessing.pool import ThreadPool as Pool\n\nlogging.getLogger().setLevel(logging.INFO)\n\n\ndef taskdb_migrating(project, from_connection, to_connection):\n    logging.info(\"taskdb: %s\", project)\n    f = connect_database(from_connection)\n    t = connect_database(to_connection)\n    t.drop(project)\n    for status in range(1, 5):\n        for task in f.load_tasks(status, project=project):\n            t.insert(project, task['taskid'], task)\n\n\ndef resultdb_migrating(project, from_connection, to_connection):\n    logging.info(\"resultdb: %s\", project)\n    f = connect_database(from_connection)\n    t = connect_database(to_connection)\n    t.drop(project)\n    for result in f.select(project):\n        t.save(project, result['taskid'], result['url'], result['result'])\n\n\n@click.command()\n@click.option('--pool', default=10, help='cocurrent worker size.')\n@click.argument('from_connection', required=1)\n@click.argument('to_connection', required=1)\ndef migrate(pool, from_connection, to_connection):\n    \"\"\"\n    Migrate tool for pyspider\n    \"\"\"\n    f = connect_database(from_connection)\n    t = connect_database(to_connection)\n\n    if isinstance(f, ProjectDB):\n        for each in f.get_all():\n            each = unicode_obj(each)\n            logging.info(\"projectdb: %s\", each['name'])\n            t.drop(each['name'])\n            t.insert(each['name'], each)\n    elif isinstance(f, TaskDB):\n        pool = Pool(pool)\n        pool.map(\n            
lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),\n            f.projects)\n    elif isinstance(f, ResultDB):\n        pool = Pool(pool)\n        pool.map(\n            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),\n            f.projects)\n\n\nif __name__ == '__main__':\n    migrate()\n"
  },
  {
    "path": "tox.ini",
    "content": "[tox]\nenvlist = py35,py36,py37,py38\n[testenv]\ninstall_command = \n    pip install --allow-all-external 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1'  {opts} -e .[all,test] {packages}\ncommands =\n    python setup.py test []\n"
  }
]