Showing preview only (6,201K chars total). Download the full file or copy to clipboard to get everything.
Repository: scrapinghub/portia
Branch: master
Commit: 606467d278ea
Files: 729
Total size: 5.8 MB
Directory structure:
gitextract_l8nvd49y/
├── .dockerignore
├── .drone.yml
├── .editorconfig
├── .gitattributes
├── .gitignore
├── .jshintrc
├── .travis.yml
├── CHANGES
├── Dockerfile
├── LICENSE
├── README.md
├── VERSION
├── Vagrantfile
├── bin/
│ └── bump_version.py
├── docker/
│ ├── compile-assets.sh
│ ├── entry
│ ├── nginx/
│ │ ├── nginx.conf
│ │ ├── proxy_portia_server.conf
│ │ └── proxy_slyd.conf
│ ├── portia.conf
│ ├── provision.sh
│ ├── qt_install.qs
│ ├── restore-mtime.sh
│ └── run-tests.sh
├── docker-compose.yml
├── docs/
│ ├── Makefile
│ ├── conf.py
│ ├── examples.rst
│ ├── faq.rst
│ ├── getting-started.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── items.rst
│ ├── make.bat
│ ├── projects.rst
│ ├── samples.rst
│ └── spiders.rst
├── portia_server/
│ ├── db_repo/
│ │ ├── __init__.py
│ │ ├── apps.py
│ │ ├── migrations/
│ │ │ ├── 0001_initial.py
│ │ │ ├── __init__.py
│ │ │ └── slyd_to_django.sql
│ │ ├── models.py
│ │ └── repo.py
│ ├── manage.py
│ ├── portia_api/
│ │ ├── __init__.py
│ │ ├── apps.py
│ │ ├── errors.py
│ │ ├── jsonapi/
│ │ │ ├── __init__.py
│ │ │ ├── exceptions.py
│ │ │ ├── parsers.py
│ │ │ ├── registry.py
│ │ │ ├── relationships.py
│ │ │ ├── renderers.py
│ │ │ ├── response.py
│ │ │ ├── serializers.py
│ │ │ └── utils.py
│ │ ├── resources/
│ │ │ ├── __init__.py
│ │ │ ├── annotations.py
│ │ │ ├── extractors.py
│ │ │ ├── fields.py
│ │ │ ├── items.py
│ │ │ ├── models.py
│ │ │ ├── projects.py
│ │ │ ├── response.py
│ │ │ ├── route.py
│ │ │ ├── samples.py
│ │ │ ├── schemas.py
│ │ │ ├── serializers.py
│ │ │ └── spiders.py
│ │ ├── routers.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_routes.py
│ │ ├── urls.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── annotations.py
│ │ ├── copy.py
│ │ ├── deploy/
│ │ │ ├── base.py
│ │ │ ├── package.py
│ │ │ ├── scrapinghub.py
│ │ │ └── scrapyd.py
│ │ ├── download.py
│ │ ├── extract.py
│ │ ├── projects.py
│ │ └── spiders.py
│ ├── portia_orm/
│ │ ├── __init__.py
│ │ ├── apps.py
│ │ ├── base.py
│ │ ├── collection.py
│ │ ├── datastore.py
│ │ ├── decorators.py
│ │ ├── deletion.py
│ │ ├── exceptions.py
│ │ ├── fields.py
│ │ ├── middleware.py
│ │ ├── models.py
│ │ ├── registry.py
│ │ ├── relationships.py
│ │ ├── serializers.py
│ │ ├── snapshots.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── models.py
│ │ │ ├── test_basic.py
│ │ │ ├── test_collection.py
│ │ │ ├── test_model.py
│ │ │ ├── test_relationship.py
│ │ │ └── utils.py
│ │ ├── utils.py
│ │ └── validators.py
│ ├── portia_server/
│ │ ├── __init__.py
│ │ ├── backends.py
│ │ ├── models.py
│ │ ├── settings.py
│ │ ├── urls.py
│ │ ├── views.py
│ │ └── wsgi.py
│ ├── requirements.txt
│ └── storage/
│ ├── __init__.py
│ ├── apps.py
│ ├── backends.py
│ ├── jsondiff.py
│ ├── projecttemplates.py
│ └── repoman.py
├── portiaui/
│ ├── .bowerrc
│ ├── .editorconfig
│ ├── .ember-cli
│ ├── .gitignore
│ ├── .jshintrc
│ ├── .watchmanconfig
│ ├── app/
│ │ ├── adapters/
│ │ │ ├── application.js
│ │ │ └── project.js
│ │ ├── app.js
│ │ ├── components/
│ │ │ ├── .gitkeep
│ │ │ ├── add-start-url-button.js
│ │ │ ├── animation-container.js
│ │ │ ├── annotation-options.js
│ │ │ ├── browser-iframe.js
│ │ │ ├── browser-url-blocked.js
│ │ │ ├── browser-url-failing.js
│ │ │ ├── browser-view-port.js
│ │ │ ├── buffered-input.js
│ │ │ ├── colored-badge.js
│ │ │ ├── colored-span.js
│ │ │ ├── combo-box.js
│ │ │ ├── create-project-button.js
│ │ │ ├── create-spider-button.js
│ │ │ ├── data-structure-annotations.js
│ │ │ ├── data-structure-listing.js
│ │ │ ├── dropdown-delete.js
│ │ │ ├── dropdown-divider.js
│ │ │ ├── dropdown-header.js
│ │ │ ├── dropdown-item.js
│ │ │ ├── dropdown-menu.js
│ │ │ ├── dropdown-widget.js
│ │ │ ├── edit-sample-button.js
│ │ │ ├── element-overlay.js
│ │ │ ├── element-rect-overlay.js
│ │ │ ├── extracted-item-table.js
│ │ │ ├── extracted-items-group.js
│ │ │ ├── extracted-items-json-panel.js
│ │ │ ├── extracted-items-json-value.js
│ │ │ ├── extracted-items-json.js
│ │ │ ├── extracted-items-panel.js
│ │ │ ├── extracted-items-status.js
│ │ │ ├── extracted-items-tab.js
│ │ │ ├── extractor-options.js
│ │ │ ├── feed-url-options.js
│ │ │ ├── field-options.js
│ │ │ ├── fragment-options.js
│ │ │ ├── generated-url-options.js
│ │ │ ├── help-icon.js
│ │ │ ├── icon-button.js
│ │ │ ├── indentation-spacer.js
│ │ │ ├── input-with-clear.js
│ │ │ ├── inspector-panel.js
│ │ │ ├── link-crawling-options.js
│ │ │ ├── list-item-add-annotation-menu.js
│ │ │ ├── list-item-annotation-field.js
│ │ │ ├── list-item-badge.js
│ │ │ ├── list-item-combo.js
│ │ │ ├── list-item-editable.js
│ │ │ ├── list-item-field-type.js
│ │ │ ├── list-item-icon-menu.js
│ │ │ ├── list-item-icon.js
│ │ │ ├── list-item-item-schema.js
│ │ │ ├── list-item-link-crawling.js
│ │ │ ├── list-item-relation-manager.js
│ │ │ ├── list-item-selectable.js
│ │ │ ├── list-item-text.js
│ │ │ ├── notification-container.js
│ │ │ ├── notification-message.js
│ │ │ ├── page-actions-editor.js
│ │ │ ├── project-list.js
│ │ │ ├── project-listing.js
│ │ │ ├── project-structure-listing.js
│ │ │ ├── project-structure-spider-feed-url.js
│ │ │ ├── project-structure-spider-generated-url.js
│ │ │ ├── project-structure-spider-url.js
│ │ │ ├── regex-pattern-list.js
│ │ │ ├── reorder-handler.js
│ │ │ ├── save-status.js
│ │ │ ├── schema-structure-listing.js
│ │ │ ├── scrapinghub-links.js
│ │ │ ├── select-box.js
│ │ │ ├── show-links-button.js
│ │ │ ├── show-links-legend.js
│ │ │ ├── sliding-main.js
│ │ │ ├── spider-indentation.js
│ │ │ ├── spider-message.js
│ │ │ ├── spider-options.js
│ │ │ ├── spider-row.js
│ │ │ ├── spider-structure-listing.js
│ │ │ ├── start-url-options.js
│ │ │ ├── tool-group.js
│ │ │ ├── tool-panel.js
│ │ │ ├── tool-tab.js
│ │ │ ├── tooltip-container.js
│ │ │ ├── tooltip-icon.js
│ │ │ ├── tree-list-item-row.js
│ │ │ ├── tree-list-item.js
│ │ │ ├── tree-list.js
│ │ │ └── url-bar.js
│ │ ├── controllers/
│ │ │ ├── .gitkeep
│ │ │ └── projects/
│ │ │ ├── project/
│ │ │ │ ├── conflicts/
│ │ │ │ │ └── conflict.js
│ │ │ │ ├── conflicts.js
│ │ │ │ ├── schema/
│ │ │ │ │ └── field/
│ │ │ │ │ └── options.js
│ │ │ │ ├── spider/
│ │ │ │ │ ├── link-options.js
│ │ │ │ │ ├── options.js
│ │ │ │ │ └── sample/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ └── annotation/
│ │ │ │ │ │ └── options.js
│ │ │ │ │ └── data.js
│ │ │ │ └── spider.js
│ │ │ └── project.js
│ │ ├── helpers/
│ │ │ ├── .gitkeep
│ │ │ ├── array-get.js
│ │ │ ├── attribute-annotation.js
│ │ │ ├── chain-actions.js
│ │ │ ├── guid.js
│ │ │ ├── includes.js
│ │ │ ├── indexed-object.js
│ │ │ ├── is-empty-object.js
│ │ │ ├── is-object-or-array.js
│ │ │ └── is-object.js
│ │ ├── index.html
│ │ ├── initializers/
│ │ │ └── ui-state.js
│ │ ├── instance-initializers/
│ │ │ └── error-handler.js
│ │ ├── mixins/
│ │ │ ├── options-route.js
│ │ │ └── save-spider-mixin.js
│ │ ├── models/
│ │ │ ├── .gitkeep
│ │ │ ├── annotation.js
│ │ │ ├── base-annotation.js
│ │ │ ├── base.js
│ │ │ ├── extractor.js
│ │ │ ├── field.js
│ │ │ ├── item.js
│ │ │ ├── project.js
│ │ │ ├── sample.js
│ │ │ ├── schema.js
│ │ │ ├── spider.js
│ │ │ └── start-url.js
│ │ ├── resolver.js
│ │ ├── router.js
│ │ ├── routes/
│ │ │ ├── .gitkeep
│ │ │ ├── application.js
│ │ │ ├── browsers.js
│ │ │ ├── index.js
│ │ │ ├── projects/
│ │ │ │ ├── project/
│ │ │ │ │ ├── compatibility.js
│ │ │ │ │ ├── conflicts/
│ │ │ │ │ │ └── conflict.js
│ │ │ │ │ ├── conflicts.js
│ │ │ │ │ ├── schema/
│ │ │ │ │ │ ├── field/
│ │ │ │ │ │ │ └── options.js
│ │ │ │ │ │ └── field.js
│ │ │ │ │ ├── schema.js
│ │ │ │ │ ├── spider/
│ │ │ │ │ │ ├── link-options.js
│ │ │ │ │ │ ├── options.js
│ │ │ │ │ │ ├── sample/
│ │ │ │ │ │ │ ├── data/
│ │ │ │ │ │ │ │ ├── annotation/
│ │ │ │ │ │ │ │ │ └── options.js
│ │ │ │ │ │ │ │ ├── annotation.js
│ │ │ │ │ │ │ │ └── item.js
│ │ │ │ │ │ │ ├── data.js
│ │ │ │ │ │ │ └── index.js
│ │ │ │ │ │ ├── sample.js
│ │ │ │ │ │ ├── start-url/
│ │ │ │ │ │ │ └── options.js
│ │ │ │ │ │ └── start-url.js
│ │ │ │ │ └── spider.js
│ │ │ │ └── project.js
│ │ │ └── projects.js
│ │ ├── serializers/
│ │ │ └── application.js
│ │ ├── services/
│ │ │ ├── annotation-structure.js
│ │ │ ├── browser.js
│ │ │ ├── capabilities.js
│ │ │ ├── changes.js
│ │ │ ├── clock.js
│ │ │ ├── dispatcher.js
│ │ │ ├── extracted-items.js
│ │ │ ├── notification-manager.js
│ │ │ ├── overlays.js
│ │ │ ├── position-monitor.js
│ │ │ ├── saving-notification.js
│ │ │ ├── selector-matcher.js
│ │ │ ├── store.js
│ │ │ ├── ui-state.js
│ │ │ └── web-socket.js
│ │ ├── storages/
│ │ │ ├── cookies.js
│ │ │ ├── page-loads.js
│ │ │ ├── ui-state-collapsed-panels.js
│ │ │ └── ui-state-selected-tools.js
│ │ ├── styles/
│ │ │ ├── _animations.scss
│ │ │ ├── _bootstrap_overrides.scss
│ │ │ ├── _icons.scss
│ │ │ ├── _lib_config.scss
│ │ │ ├── _variables.scss
│ │ │ ├── app.scss
│ │ │ ├── components/
│ │ │ │ ├── animation-container.scss
│ │ │ │ ├── browser-iframe.scss
│ │ │ │ ├── browser-view-port.scss
│ │ │ │ ├── combo-box.scss
│ │ │ │ ├── conflicts.scss
│ │ │ │ ├── dropdown-delete.scss
│ │ │ │ ├── dropdown-menu.scss
│ │ │ │ ├── dropdown-widget.scss
│ │ │ │ ├── extracted-item-table.scss
│ │ │ │ ├── extracted-items-json-panel.scss
│ │ │ │ ├── extractor-options.scss
│ │ │ │ ├── fragment-options.scss
│ │ │ │ ├── help-icon.scss
│ │ │ │ ├── icon-button.scss
│ │ │ │ ├── indentation-spacer.scss
│ │ │ │ ├── input-with-clear.scss
│ │ │ │ ├── inspector-panel.scss
│ │ │ │ ├── list-item-badge.scss
│ │ │ │ ├── list-item-combo.scss
│ │ │ │ ├── list-item-editable.scss
│ │ │ │ ├── list-item-icon.scss
│ │ │ │ ├── list-item-selectable.scss
│ │ │ │ ├── list-item-text.scss
│ │ │ │ ├── notifications.scss
│ │ │ │ ├── page-actions.scss
│ │ │ │ ├── project-structure-spider-generation-url.scss
│ │ │ │ ├── regex-pattern-list.scss
│ │ │ │ ├── save-status.scss
│ │ │ │ ├── select-box.scss
│ │ │ │ ├── show-links-legend.scss
│ │ │ │ ├── side-bar.scss
│ │ │ │ ├── sliding-main.scss
│ │ │ │ ├── start-url-options.scss
│ │ │ │ ├── tool-group.scss
│ │ │ │ ├── tool-panel.scss
│ │ │ │ ├── tooltip-container.scss
│ │ │ │ ├── top-bar.scss
│ │ │ │ ├── tree-list.scss
│ │ │ │ └── url-bar.scss
│ │ │ ├── document.scss
│ │ │ ├── droplet.scss
│ │ │ ├── generic.scss
│ │ │ ├── layout/
│ │ │ │ ├── _clickable.scss
│ │ │ │ ├── _forms.scss
│ │ │ │ └── _full-page-content.scss
│ │ │ └── templates/
│ │ │ ├── application.scss
│ │ │ ├── browsers.scss
│ │ │ └── projects.scss
│ │ ├── templates/
│ │ │ ├── application.hbs
│ │ │ ├── branding.hbs
│ │ │ ├── browsers.hbs
│ │ │ ├── components/
│ │ │ │ ├── .gitkeep
│ │ │ │ ├── add-start-url-button.hbs
│ │ │ │ ├── animation-container.hbs
│ │ │ │ ├── annotation-options.hbs
│ │ │ │ ├── browser-iframe.hbs
│ │ │ │ ├── browser-list.hbs
│ │ │ │ ├── browser-url-blocked.hbs
│ │ │ │ ├── browser-url-failing.hbs
│ │ │ │ ├── browser-view-port.hbs
│ │ │ │ ├── buffered-input.hbs
│ │ │ │ ├── colored-badge.hbs
│ │ │ │ ├── colored-span.hbs
│ │ │ │ ├── combo-box.hbs
│ │ │ │ ├── create-project-button.hbs
│ │ │ │ ├── create-spider-button.hbs
│ │ │ │ ├── data-structure-annotations.hbs
│ │ │ │ ├── data-structure-listing.hbs
│ │ │ │ ├── dropdown-delete.hbs
│ │ │ │ ├── dropdown-divider.hbs
│ │ │ │ ├── dropdown-header.hbs
│ │ │ │ ├── dropdown-item.hbs
│ │ │ │ ├── dropdown-menu.hbs
│ │ │ │ ├── dropdown-widget.hbs
│ │ │ │ ├── edit-sample-button.hbs
│ │ │ │ ├── element-overlay.hbs
│ │ │ │ ├── element-rect-overlay.hbs
│ │ │ │ ├── extracted-item-table.hbs
│ │ │ │ ├── extracted-items-group.hbs
│ │ │ │ ├── extracted-items-json-panel.hbs
│ │ │ │ ├── extracted-items-json-value.hbs
│ │ │ │ ├── extracted-items-json.hbs
│ │ │ │ ├── extracted-items-panel.hbs
│ │ │ │ ├── extracted-items-status.hbs
│ │ │ │ ├── extracted-items-tab.hbs
│ │ │ │ ├── extractor-options.hbs
│ │ │ │ ├── feed-url-options.hbs
│ │ │ │ ├── field-options.hbs
│ │ │ │ ├── fragment-options.hbs
│ │ │ │ ├── generated-url-options.hbs
│ │ │ │ ├── help-icon.hbs
│ │ │ │ ├── icon-button.hbs
│ │ │ │ ├── input-with-clear.hbs
│ │ │ │ ├── inspector-panel.hbs
│ │ │ │ ├── json-file-compare.hbs
│ │ │ │ ├── link-crawling-options.hbs
│ │ │ │ ├── list-item-add-annotation-menu.hbs
│ │ │ │ ├── list-item-annotation-field.hbs
│ │ │ │ ├── list-item-badge.hbs
│ │ │ │ ├── list-item-combo.hbs
│ │ │ │ ├── list-item-editable.hbs
│ │ │ │ ├── list-item-field-type.hbs
│ │ │ │ ├── list-item-icon-menu.hbs
│ │ │ │ ├── list-item-icon.hbs
│ │ │ │ ├── list-item-item-schema.hbs
│ │ │ │ ├── list-item-link-crawling.hbs
│ │ │ │ ├── list-item-relation-manager.hbs
│ │ │ │ ├── list-item-selectable.hbs
│ │ │ │ ├── list-item-text.hbs
│ │ │ │ ├── notification-container.hbs
│ │ │ │ ├── notification-message.hbs
│ │ │ │ ├── page-actions-editor.hbs
│ │ │ │ ├── project-list.hbs
│ │ │ │ ├── project-listing.hbs
│ │ │ │ ├── project-structure-listing.hbs
│ │ │ │ ├── project-structure-spider-feed-url.hbs
│ │ │ │ ├── project-structure-spider-generated-url.hbs
│ │ │ │ ├── project-structure-spider-url.hbs
│ │ │ │ ├── regex-pattern-list.hbs
│ │ │ │ ├── save-status.hbs
│ │ │ │ ├── schema-structure-listing.hbs
│ │ │ │ ├── scrapinghub-links.hbs
│ │ │ │ ├── select-box.hbs
│ │ │ │ ├── show-links-button.hbs
│ │ │ │ ├── show-links-legend.hbs
│ │ │ │ ├── sliding-main.hbs
│ │ │ │ ├── spider-indentation.hbs
│ │ │ │ ├── spider-message.hbs
│ │ │ │ ├── spider-options.hbs
│ │ │ │ ├── spider-row.hbs
│ │ │ │ ├── spider-structure-listing.hbs
│ │ │ │ ├── start-url-options.hbs
│ │ │ │ ├── tool-group.hbs
│ │ │ │ ├── tool-panel.hbs
│ │ │ │ ├── tool-tab.hbs
│ │ │ │ ├── tooltip-container.hbs
│ │ │ │ ├── tooltip-icon.hbs
│ │ │ │ ├── tree-list-item-row.hbs
│ │ │ │ ├── tree-list-item.hbs
│ │ │ │ ├── tree-list.hbs
│ │ │ │ └── url-bar.hbs
│ │ │ ├── options-panels.hbs
│ │ │ ├── projects/
│ │ │ │ ├── project/
│ │ │ │ │ ├── conflicts/
│ │ │ │ │ │ ├── file-selector.hbs
│ │ │ │ │ │ ├── help.hbs
│ │ │ │ │ │ ├── resolver.hbs
│ │ │ │ │ │ └── topbar.hbs
│ │ │ │ │ ├── schema/
│ │ │ │ │ │ ├── field/
│ │ │ │ │ │ │ └── options.hbs
│ │ │ │ │ │ ├── field.hbs
│ │ │ │ │ │ └── structure.hbs
│ │ │ │ │ ├── schema.hbs
│ │ │ │ │ ├── spider/
│ │ │ │ │ │ ├── link-options.hbs
│ │ │ │ │ │ ├── options.hbs
│ │ │ │ │ │ ├── overlays.hbs
│ │ │ │ │ │ ├── sample/
│ │ │ │ │ │ │ ├── annotation/
│ │ │ │ │ │ │ │ └── selection.hbs
│ │ │ │ │ │ │ ├── data/
│ │ │ │ │ │ │ │ ├── annotation/
│ │ │ │ │ │ │ │ │ └── options.hbs
│ │ │ │ │ │ │ │ ├── annotation.hbs
│ │ │ │ │ │ │ │ ├── item.hbs
│ │ │ │ │ │ │ │ ├── overlays.hbs
│ │ │ │ │ │ │ │ ├── structure.hbs
│ │ │ │ │ │ │ │ ├── toolbar.hbs
│ │ │ │ │ │ │ │ └── tools.hbs
│ │ │ │ │ │ │ ├── data.hbs
│ │ │ │ │ │ │ ├── item.hbs
│ │ │ │ │ │ │ ├── structure.hbs
│ │ │ │ │ │ │ └── toolbar.hbs
│ │ │ │ │ │ ├── sample.hbs
│ │ │ │ │ │ ├── start-url/
│ │ │ │ │ │ │ └── options.hbs
│ │ │ │ │ │ ├── structure.hbs
│ │ │ │ │ │ ├── toolbar.hbs
│ │ │ │ │ │ └── tools.hbs
│ │ │ │ │ ├── spider.hbs
│ │ │ │ │ ├── structure.hbs
│ │ │ │ │ └── toolbar.hbs
│ │ │ │ └── project.hbs
│ │ │ ├── projects.hbs
│ │ │ └── tool-panels.hbs
│ │ ├── transforms/
│ │ │ ├── array.js
│ │ │ ├── json.js
│ │ │ └── start-url.js
│ │ ├── utils/
│ │ │ ├── attrs.js
│ │ │ ├── browser-features.js
│ │ │ ├── colors.js
│ │ │ ├── computed.js
│ │ │ ├── ensure-promise.js
│ │ │ ├── interaction-event.js
│ │ │ ├── promises.js
│ │ │ ├── selectors.js
│ │ │ ├── start-urls.js
│ │ │ ├── tree-mirror-delegate.js
│ │ │ ├── types.js
│ │ │ └── utils.js
│ │ ├── validations/
│ │ │ ├── fixed-fragment.js
│ │ │ ├── list-fragment.js
│ │ │ └── range-fragment.js
│ │ └── validators/
│ │ ├── range.js
│ │ └── whitespace.js
│ ├── bower.json
│ ├── config/
│ │ ├── deprecation-workflow.js
│ │ ├── environment-development.js
│ │ ├── environment-production.js
│ │ ├── environment-test.js
│ │ └── environment.js
│ ├── ember-cli-build.js
│ ├── package.json
│ ├── public/
│ │ ├── crossdomain.xml
│ │ ├── empty-frame.html
│ │ ├── frames-not-supported.html
│ │ └── robots.txt
│ ├── testem.js
│ ├── tests/
│ │ ├── .jshintrc
│ │ ├── helpers/
│ │ │ ├── destroy-app.js
│ │ │ ├── module-for-acceptance.js
│ │ │ ├── resolver.js
│ │ │ └── start-app.js
│ │ ├── index.html
│ │ ├── test-helper.js
│ │ └── unit/
│ │ ├── .gitkeep
│ │ ├── models/
│ │ │ └── start-url-test.js
│ │ ├── utils/
│ │ │ ├── selectors-test.js
│ │ │ └── start-urls-test.js
│ │ └── validators/
│ │ ├── range-test.js
│ │ └── whitespace-test.js
│ └── vendor/
│ ├── .gitkeep
│ ├── modernizr.js
│ ├── mutation-summary.js
│ └── tree-mirror.js
├── slybot/
│ ├── .gitignore
│ ├── CHANGES
│ ├── MANIFEST.in
│ ├── Makefile.buildbot
│ ├── README.rst
│ ├── bin/
│ │ ├── makedeb
│ │ ├── portiacrawl
│ │ └── slybot
│ ├── debian/
│ │ ├── changelog
│ │ ├── compat
│ │ ├── control
│ │ ├── copyright
│ │ ├── pyversions
│ │ └── rules
│ ├── docs/
│ │ ├── Makefile
│ │ ├── conf.py
│ │ ├── index.rst
│ │ ├── make.bat
│ │ ├── project.rst
│ │ └── spiderlets.rst
│ ├── requirements-clustering.txt
│ ├── requirements-test.txt
│ ├── requirements.txt
│ ├── scrapy.cfg
│ ├── setup.py
│ ├── slybot/
│ │ ├── __init__.py
│ │ ├── baseurl.py
│ │ ├── closespider.py
│ │ ├── clustering.py
│ │ ├── dupefilter.py
│ │ ├── exporter.py
│ │ ├── extractors.py
│ │ ├── fieldtypes/
│ │ │ ├── __init__.py
│ │ │ ├── date.py
│ │ │ ├── images.py
│ │ │ ├── number.py
│ │ │ ├── point.py
│ │ │ ├── price.py
│ │ │ ├── text.py
│ │ │ └── url.py
│ │ ├── generic_form.py
│ │ ├── item.py
│ │ ├── linkextractor/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── ecsv.py
│ │ │ ├── html.py
│ │ │ ├── pagination.py
│ │ │ ├── regex.py
│ │ │ └── xml.py
│ │ ├── meta.py
│ │ ├── pageactions.py
│ │ ├── plugins/
│ │ │ ├── __init__.py
│ │ │ ├── scrapely_annotations/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── annotations.py
│ │ │ │ ├── builder.py
│ │ │ │ ├── exceptions.py
│ │ │ │ ├── extraction/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── container_extractors.py
│ │ │ │ │ ├── extractors.py
│ │ │ │ │ ├── pageparsing.py
│ │ │ │ │ ├── region_extractors.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── migration.py
│ │ │ │ ├── processors.py
│ │ │ │ └── utils.py
│ │ │ └── selectors/
│ │ │ └── __init__.py
│ │ ├── settings.py
│ │ ├── spider.py
│ │ ├── spiderlets.py
│ │ ├── spidermanager.py
│ │ ├── splash.py
│ │ ├── starturls/
│ │ │ ├── __init__.py
│ │ │ ├── feed_generator.py
│ │ │ ├── fragment_generator.py
│ │ │ ├── generated_url.py
│ │ │ └── generator.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── data/
│ │ │ │ ├── SampleProject/
│ │ │ │ │ ├── extractors.json
│ │ │ │ │ ├── items.json
│ │ │ │ │ ├── project.json
│ │ │ │ │ └── spiders/
│ │ │ │ │ ├── allowed_domains.json
│ │ │ │ │ ├── any_allowed_domains.json
│ │ │ │ │ ├── books.toscrape.com/
│ │ │ │ │ │ ├── 3617-44af-a2f0/
│ │ │ │ │ │ │ └── original_body.html
│ │ │ │ │ │ ├── 3617-44af-a2f0.json
│ │ │ │ │ │ ├── 3652-4fa1-a912.json
│ │ │ │ │ │ ├── 4583-41b4-9edb/
│ │ │ │ │ │ │ └── original_body.html
│ │ │ │ │ │ └── 4583-41b4-9edb.json
│ │ │ │ │ ├── books.toscrape.com.json
│ │ │ │ │ ├── books.toscrape.com_1.json
│ │ │ │ │ ├── cargurus.json
│ │ │ │ │ ├── ebay.json
│ │ │ │ │ ├── ebay2.json
│ │ │ │ │ ├── ebay3.json
│ │ │ │ │ ├── ebay4.json
│ │ │ │ │ ├── example.com.json
│ │ │ │ │ ├── example2.com.json
│ │ │ │ │ ├── example3.com.json
│ │ │ │ │ ├── example4.com.json
│ │ │ │ │ ├── networkhealth.com/
│ │ │ │ │ │ ├── networkhealthtemplate/
│ │ │ │ │ │ │ ├── annotated_body.html
│ │ │ │ │ │ │ └── original_body.html
│ │ │ │ │ │ └── networkhealthtemplate.json
│ │ │ │ │ ├── networkhealth.com.json
│ │ │ │ │ ├── pinterest.com.json
│ │ │ │ │ ├── seedsofchange.com.json
│ │ │ │ │ ├── seedsofchange.json
│ │ │ │ │ ├── seedsofchange2.json
│ │ │ │ │ └── sitemaps.json
│ │ │ │ ├── atom_sample.xml
│ │ │ │ ├── ebay_advanced_search.html
│ │ │ │ ├── pinterest.html
│ │ │ │ ├── rss_sample.xml
│ │ │ │ ├── sitemap_sample.xml
│ │ │ │ ├── templates/
│ │ │ │ │ ├── 411_list.json
│ │ │ │ │ ├── autoevolution.html
│ │ │ │ │ ├── autoevolution.json
│ │ │ │ │ ├── autoevolution2.json
│ │ │ │ │ ├── cars.com.json
│ │ │ │ │ ├── cars.com_nested.json
│ │ │ │ │ ├── cs-cart.json
│ │ │ │ │ ├── daft_ie.html
│ │ │ │ │ ├── daft_list.json
│ │ │ │ │ ├── firmen.wko.at.html
│ │ │ │ │ ├── firmen.wko.at.json
│ │ │ │ │ ├── hn.html
│ │ │ │ │ ├── patchofland.html
│ │ │ │ │ ├── so_annotations.json
│ │ │ │ │ ├── stack_overflow.html
│ │ │ │ │ ├── stips.co.il.html
│ │ │ │ │ ├── stips.co.il.json
│ │ │ │ │ └── xceed.json
│ │ │ │ └── test_params.txt
│ │ │ ├── test_baseurl.py
│ │ │ ├── test_dropmeta.py
│ │ │ ├── test_dupefilter.py
│ │ │ ├── test_extraction_speed.py
│ │ │ ├── test_extractors.py
│ │ │ ├── test_fieldtypes.py
│ │ │ ├── test_fragment_generator.py
│ │ │ ├── test_generic_form.py
│ │ │ ├── test_linkextractors.py
│ │ │ ├── test_migration.py
│ │ │ ├── test_multiple_item_extraction.py
│ │ │ ├── test_page_actions.py
│ │ │ ├── test_schema_validation.py
│ │ │ ├── test_selectors.py
│ │ │ ├── test_spider.py
│ │ │ ├── test_starturls.py
│ │ │ ├── test_starturls_generator.py
│ │ │ └── utils.py
│ │ ├── utils.py
│ │ └── validation/
│ │ ├── __init__.py
│ │ ├── schema.py
│ │ └── schemas.json
│ └── tox.ini
├── slyd/
│ ├── .gitignore
│ ├── .jshintrc
│ ├── README.md
│ ├── bin/
│ │ ├── init_mysql_db
│ │ ├── sh2sly
│ │ └── slyd
│ ├── requirements.txt
│ ├── setup.py
│ ├── slyd/
│ │ ├── __init__.py
│ │ ├── authmanager.py
│ │ ├── dummyauth.py
│ │ ├── errors.py
│ │ ├── gitstorage/
│ │ │ ├── __init__.py
│ │ │ ├── jsondiff.py
│ │ │ ├── projects.py
│ │ │ └── projectspec.py
│ │ ├── html_utils.py
│ │ ├── projects.py
│ │ ├── projectspec.py
│ │ ├── resource.py
│ │ ├── server.py
│ │ ├── settings/
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ ├── specmanager.py
│ │ ├── splash/
│ │ │ ├── __init__.py
│ │ │ ├── commands.py
│ │ │ ├── cookies.py
│ │ │ ├── css_utils.py
│ │ │ ├── ferry.py
│ │ │ ├── proxy.py
│ │ │ ├── qtutils.py
│ │ │ └── utils.py
│ │ └── tap.py
│ └── twisted/
│ └── plugins/
│ └── slyd_plugin.py
└── splash_utils/
├── compile_slybot.sh
├── filters/
│ └── easylist.txt
├── perform_actions.js
├── waitAsync.js
└── z_inject_this.js
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
.git
.vagrant
docs
*/node_modules
*/bower_components
*/tests
*/tmp
*/db.sqlite3
*/.tox
*/.pyc
*/__pycache__
================================================
FILE: .drone.yml
================================================
image: scrapinghub
script:
- echo "Portia is at:"`git show -s --pretty=%d HEAD`
- git restore-mtime
- shopt -s extglob
- nvm install 10.16.0
- nvm use 10.16.0
- sudo mkdir -p ~/.npm ~/.node-gyp ~/.cache
- sudo chown -R ubuntu ~/.npm ~/.node-gyp ~/.cache
- npm install -g bower ember-cli@2.6.3 --cache-min 999999
- docker/compile-assets.sh
- build_docker_image
- publish_to_dockerhub
cache:
- /home/ubuntu/.npm
- /home/ubuntu/.node-gyp
- /home/ubuntu/.cache
================================================
FILE: .editorconfig
================================================
# EditorConfig helps developers define and maintain consistent
# coding styles between different editors and IDEs
# editorconfig.org
root = true
[*]
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
indent_style = space
indent_size = 2
[*.js]
indent_style = space
indent_size = 2
[*.hbs]
indent_style = space
indent_size = 2
[*.css]
indent_style = space
indent_size = 2
[*.html]
indent_style = space
indent_size = 2
[*.{diff,md}]
trim_trailing_whitespace = false
================================================
FILE: .gitattributes
================================================
*.sh eol=lf
*.bat eol=crlf
*.js text
*.py text
*.css text
*.hbs text
*.json text
*.html text
*.xml text
*.yml text
*.txt text
*.rst text
*.md text
*.cfg text
*.conf text
Makefile* text
*.png binary
*.swf binary
*.ttf binary
*.woff binary
*.woff2 binary
================================================
FILE: .gitignore
================================================
# Python compiled files
*__pycache__/*
*.pyc
# Vagrant files
.vagrant/
/.idea/
# Python build files
*.egg-info
slybot/dist
slybot/build
slyd/slyd/dist
slyd/slyd/build
# npm files
node_modules/*
slyd/bower_components/*
slyd/tmp/*
npm-debug.log
slybot/slybot/splash-script-combined.js
# Local Settings
slyd/slyd/local_settings.py
slybot/slybot/local_slybot_settings.py
# Testing
slybot/.tox
# Docs build directory
docs/_build
# Development Databases
*.sqlite*
# Default Portia data directory
slyd/slyd/data
/data/
================================================
FILE: .jshintrc
================================================
{
"predef": [
"document",
"window",
"-Promise"
],
"browser": true,
"boss": true,
"curly": true,
"debug": false,
"devel": true,
"eqeqeq": true,
"evil": true,
"forin": false,
"immed": false,
"laxbreak": false,
"newcap": true,
"noarg": true,
"noempty": false,
"nonew": false,
"nomen": false,
"onevar": false,
"plusplus": false,
"regexp": false,
"undef": true,
"sub": true,
"strict": false,
"white": false,
"eqnull": true,
"esnext": true,
"unused": true
}
================================================
FILE: .travis.yml
================================================
language: python
python: 3.7
dist: bionic
services:
- docker
env:
- WHEELHOUSE=$HOME/.cache/wheelhouse PIP_FIND_LINKS=file://$WHEELHOUSE PIP_WHEEL_DIR=$WHEELHOUSE
cache:
directories:
- "$HOME/.cache/pip"
- "$HOME/.cache/wheelhouse"
- portiaui/node_modules
- portiaui/bower_components
before_install:
- docker build -t scrapinghub/portia .
- docker ps -a
install:
- docker run scrapinghub/portia /app/docker/run-tests.sh
- pushd portiaui
- nvm install 10.16.0
- nvm use 10.16.0
- npm install -g bower ember-cli@2.6.3
- npm install
- bower install
- popd
script:
- pushd portiaui
- npm rebuild node-sass
- npm test
- popd
before_deploy:
- cd slybot
- pip install twine
- sudo chown -R $USER .
deploy:
provider: pypi
distributions: sdist bdist_wheel
user: scrapinghub
password:
secure: S5hZT2YBncUSkPTyR5RUQnACfTsW2ZtpHeQucIamKWN+xkE8KK9O0cWUMuKQ0q3U5ShFkZdhO4PnBjvtP54Dq9IogJAudkDJCylctf4qGoIlWu01mAoJzcUfrS5KW+VolF/opBJObwG38EIOOsVy9UYq7DeQcryAAG1RuMjONAk=
on:
all_branches: true
tags: true
repo: scrapinghub/portia
condition: "$TRAVIS_TAG =~ ^slybot-[0-9][.][0-9]*"
================================================
FILE: CHANGES
================================================
2.0.8 - 2017-04-20
Limit project and spider id length to avoid causing issues in windows
Only use auto annotations when calculating container selectors
Update portia2code to 0.0.12 - Handle malformed schemas
Convert splash url to unicode instead of bytes
Enable item nesting in samples from ember config
Add logic for keeping track of, and blocking pages that fail to load in Portia
Fix 404 error when downloading projects from git backend as a new user
2.0.7 - 2017-03-28
Pin ember data to 2.11.x
Do not initialize tree mirror in web page until after initial page load
Change order of compilation for injected JS files in splash
Set text content correctly for html elements with a content attribute
2.0.6 - 2017-03-07
Add option to have default data format for project
Do not show any version control info if it is not enabled
Fix Extractor overflow bug
Fix error when loading broken samples in UI
Fix splash browser tab storing wrong html
Fix bug with repr for tab
2.0.5 - 2017-02-27
Add data directory for storing spider data and ignore all new data in it
Update install instructions and scripts
Update Docs
Throw KeyError when trying to get non existent model from collection
Fix loading html as raw from the tab in the socket for extraction
Fix missing objects in branch by deleting the branch
Fix UnicodeDecodeError for downloads and css parsing
Fix bytes/unicode issue with slybot extractors
2.0.4 - 2017-02-23
Fix error when trying to load assets by proxy from an invalid tab
Fix value resolved to `None` when merging lists
Fix unicode error in regex for removing XSS from CSS assets
Fix sample loading for old spiders
Fix for running legacy spiders
Fix storage being cleared by another message while in use
Fix error caused by downloading invalid project name
Disable error logging for missing websocket command, just log debug message
Cache selector searches for container to increase sample build speed
Do not throw error for missing annotation data in sample when finding schema
Do not log model error when operating on deleted model
2.0.3 - 2017-02-21
Add download option for downloading a Portia project for use with slybot
Add download option for downloading a Portia project as Python code
Add copy for moving spiders from one project to another
Add loading slider when changing page or loading models
Add droplet to inform users that they have changes
Add better message when websocket is reconnecting
Add help icon describing what crawl rules do
Add message informing user when they have an unused required field in their schema
Add automatic selection of new field type in some cases
Add dropdown for projects - publish, discard, download
Add option to create new projects from UI (open source only)
Add dropdown for spiders - Copy, Download and delete
Add dropdown for schemas - delete
Fix bug with using master when scheduling spider
Fix error when errors with numeric ids are returned by server
Fix spider listing loading in UI
Fix so that project changes show up whenever there is a change
Fix model not being loaded when changing field or schema
Fix handling malformed extractor objects
Fix merging html data during a conflict - take mine
Fix when changing item schema in UI
Fix extractors were not shown in UI
Fix spiders would not run after a rename
Fix deletes where file may be requested for delete more than once
Fix with missing node when mirroring page from server
Fix incorrect node data when mirroring page from server
Fix loading extractors during migration
Fix RuntimeError when loading tab url after tab has been closed
Fix deletion of html files during cascade delete
Fix loading extractors when reading annotations
Fix loading html into sample in websocket for extraction
Fix logged error due to missing `save_html` callback
Fix html not being saved while creating sample
Fix loading samples from `template_names` field instead of spider directory
2.0.2 - 2017-01-20
Gracefully handle missing objects in DB
2.0.1 - 2017-01-19
Limit number of spiders shown at once in UI to 15
Fix scheduling when spider has been run from Portia
Display helpful error messages to users
Replace guid spider ids with spider name
Add spider id if it is missing
Add a schema for newly created items
Fix PathResolutionError for unmigrated samples
Better migrate samples that extract from tables
2.0.0 - 2017-01-02
Change backend to use Django instead of Twisted web
Created a new JSON API and ORM to handle all Portia objects for greater consistency + efficiency
Automatically detect if a user should enable or disable JS for better extraction
Added support for generating urls
Added support for using a feed as a start url
Added support for extracting multiple values to a single field
Added support for a spinner showing the extraction progress for a better UX
Many bug fixes and stability improvements
16.07.2 - 2016-07-26
Fix bug with `project_filename` method missing
Fix bug when initializing project
16.05.1 - 2016-05-17
New transactional request handling
Inline element overlays (correctly show tags that may wrap around the page)
New download endpoint for projects/spiders `GET portia/api/projects/<PROJECT_ID>/download[/<SPIDER_ID>]`
Save selection_mode, pre_text and post_text for annotations
Add toggle CSS button
16.04.1 - 2016-04-05
Add link to docs
Add auto pagination, learns from start_urls
Add git status endpoint for projects
Load page when it is specified as url param `url`
Save selection mode for annotation when using xpath/css selector
Improve Portia on smaller screens
Notify users of unpublished changes
Reject annotations with elements that share a container with the hovered element
Fix issue with extracting items with more than one sibling
Fix bug where clicking on help icon toggled checkbox
16.02.2 - 2016-02-16
Fix incompatibility with latest splash
Fix error with next page link following
Log traceback if error occurs in websocket
16.02.1 - 2016-02-09
Add automatic next page link extractor to slybot
Show errors to user instead of event id
Fix bug where items are not initialized correctly
Log websocket errors
Fix regex validation
Fix conflicts resolution errors
16.01.1 - 2016-01-25
Fix Unicode error when creating spiders
Fix install packages
Extract data from a list of urls `POST portia/projects/<PROJECT>/spec/extract/<SPIDER> {"urls": ["http://example.com"]}`
Allow `.` to be used in spider and sample names
Correctly handle Atom, RSS and XML sitemaps
Automatically dismiss suggested annotations if user doesn't use them
Correctly place annotations when page contains `<ins>` tags
Use Qt5 for internal splash instance
Fix scrolling page action
Fix URL validation
15.12.2 - 2015-12-30
Fix issue when copying spiders that reference deleted extractors
Fix issue with using srcdoc in IE
Enable annotation suggestions and page actions by default
15.12.1 - 2015-12-03
Fix issue with overwritten JSON library in splash
Fix error when merging samples modified in one or more branches
Fix event passing in Safari
================================================
FILE: Dockerfile
================================================
# Image that runs Portia (nginx, slyd and portia_server; see docker/entry).
FROM ubuntu:16.04
WORKDIR /app/slyd
# Expose the binaries of the Qt installation under /opt/qt59 on the PATH.
ENV PATH="/opt/qt59/5.9.1/gcc_64/bin:${PATH}"
ENV DEBIAN_FRONTEND noninteractive
# URL of the Qt installer fetched by `provision.sh download_official_qt`.
ENV QT_MIRROR http://ftp.fau.de/qtproject/official_releases/qt/5.9/5.9.1/qt-opensource-linux-x64-5.9.1.run
# Copy only the inputs of the provisioning step: the upstart config, the Qt
# installer control script, the provisioning script and the requirements files.
# NOTE(review): presumably kept separate from `ADD . /app` so the expensive
# provisioning layer is not rebuilt on every source change -- confirm.
COPY docker/portia.conf /app/portia.conf
COPY docker/qt_install.qs /app/script.qs
COPY docker/provision.sh /app/provision.sh
COPY slybot/requirements.txt /app/slybot/requirements.txt
COPY slyd/requirements.txt /app/slyd/requirements.txt
COPY portia_server/requirements.txt /app/portia_server/requirements.txt
# Run all provisioning commands in one layer; the && chain stops at the first
# command that fails.
RUN /app/provision.sh prepare_install && \
/app/provision.sh install_deps && \
/app/provision.sh install_qtwebkit_deps && \
/app/provision.sh download_official_qt && \
/app/provision.sh install_official_qt && \
/app/provision.sh install_qtwebkit && \
/app/provision.sh install_pyqt5 && \
/app/provision.sh install_python_deps && \
/app/provision.sh install_flash && \
/app/provision.sh install_msfonts && \
/app/provision.sh install_extra_fonts && \
/app/provision.sh remove_builddeps && \
/app/provision.sh remove_extra
# Install the nginx configuration, then add the full source tree.
ADD docker/nginx /etc/nginx
ADD . /app
# Install slyd and slybot in editable mode from the added sources.
RUN pip install -e /app/slyd && \
pip install -e /app/slybot
# Apply the portia_server Django migrations at build time.
RUN python3 /app/portia_server/manage.py migrate
# Port nginx listens on (docker/nginx/nginx.conf).
EXPOSE 9001
ENTRYPOINT ["/app/docker/entry"]
================================================
FILE: LICENSE
================================================
Copyright (c) Scrapinghub.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of Portia nor the names of its contributors may be used
to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: README.md
================================================
Portia
======
Portia is a tool that allows you to visually scrape websites without any programming knowledge required. With Portia you can annotate a web page to identify the data you wish to extract, and Portia will understand based on these annotations how to scrape data from similar pages.
# Running Portia
The easiest way to run Portia is using [Docker]:
You can run Portia using Docker and the official Portia image by running:
docker run -v ~/portia_projects:/app/data/projects:rw -p 9001:9001 scrapinghub/portia
You can also set up a local instance with [Docker-compose] by cloning this repo & running from the root of the folder:
docker-compose up
For more detailed instructions, and alternatives to using Docker, see the [Installation] docs.
# Documentation
Documentation can be found on [Read the docs]. Source files can be found in the ``docs`` directory.
[Docker]: https://www.docker.com/
[Docker-compose]:https://docs.docker.com/compose
[Installation]: http://portia.readthedocs.org/en/latest/installation.html
[Read the docs]: http://portia.readthedocs.org/en/latest/index.html
[Scrapinghub]: https://portia.scrapinghub.com/
================================================
FILE: VERSION
================================================
2.0.8
================================================
FILE: Vagrantfile
================================================
# vim:ft=ruby
# Development VM: boots the "ubuntu/trusty64" box under the host name "portia"
# and provisions it with docker/provision.sh.
Vagrant.configure("2") do |config|
config.vm.box = "ubuntu/trusty64"
config.vm.host_name = "portia"
# provision.sh executes the commands below one after another, in the order given.
config.vm.provision :shell, :path => 'docker/provision.sh', :args => [
"install_deps", "install_splash", "install_python_deps", "configure_nginx", "configure_initctl", "migrate_django_db", "start_portia"
]
config.vm.network "private_network", ip: "33.33.33.10"
# Port 9001 is the port nginx listens on (docker/nginx/nginx.conf).
config.vm.network "forwarded_port", guest: 9001, host: 9001
config.vm.provider "virtualbox" do |v|
v.memory = 2048
v.cpus = 2
end
end
================================================
FILE: bin/bump_version.py
================================================
#!/usr/bin/env python3
import os
from datetime import datetime
_BASE_PATH = os.path.abspath(os.path.dirname(__file__))
VERSION_FILE = os.path.abspath(os.path.join(_BASE_PATH, '../VERSION'))
def next_version(version_file):
    """Compute the next ``YY.MM.N`` version string from a version file.

    The file is expected to contain a version of the form ``YY.MM.N``.  If
    the stored release month equals the current month the release number
    ``N`` is incremented; otherwise the numbering restarts at 1 for the
    current month.

    :param version_file: path of the file holding the current version.
    :returns: the next version as text, e.g. ``'17.01.2'``.
    :raises ValueError: if the stored version does not parse as
        ``%y.%m`` followed by an integer release number.
    """
    now = datetime.now()
    this_month = datetime(now.year, now.month, 1)
    with open(version_file, 'r') as f:
        version = f.read().strip().split('.')
    release_month = datetime.strptime('.'.join(version[:-1]), '%y.%m')
    release_number = int(version[-1]) + 1
    if this_month != release_month:
        release_number = 1
    # Guard against a stored release number below zero.
    release_number = max(1, release_number)
    # str.format() already returns text.  The previous ``.decode('utf-8')``
    # raised AttributeError on Python 3, where str has no decode() method.
    return '{:%y.%m}.{}'.format(this_month, release_number)
def bump_version_file(filename=None):
    """Overwrite a version file with its next version string.

    :param filename: path of the version file; ``None`` selects the
        module-level ``VERSION_FILE``.
    """
    target = VERSION_FILE if filename is None else filename
    # Compute the new version before the file is reopened for writing.
    bumped = next_version(target)
    with open(target, 'w') as out:
        out.write(bumped)
if __name__ == '__main__':
bump_version_file()
================================================
FILE: docker/compile-assets.sh
================================================
#!/bin/bash
# Build the Portia UI assets: install the npm dependencies of portiaui and run
# its build script.  Must be started from the repository root.

# Abort if the UI directory is missing so npm never runs in the wrong directory.
cd portiaui || exit 1
npm install
npm run build
================================================
FILE: docker/entry
================================================
#!/bin/bash
set -x
action=$1
shift
# Start the whole stack: nginx, slyd (in the background) and the Django
# server of portia_server (in the foreground).
_run() {
service nginx start
# PYTHONPATH must be exported before the Python services are launched.
_set_env
echo $PYTHONPATH
# slyd is backgrounded on port 9002, the upstream used by
# docker/nginx/proxy_slyd.conf.
# NOTE(review): -r presumably points slyd at the built UI directory -- confirm.
/app/slyd/bin/slyd -p 9002 -r /app/portiaui/dist &
# runserver is the last command, so the function blocks for as long as it runs.
/app/portia_server/manage.py runserver
}
# Export the PYTHONPATH covering portia_server, slyd and slybot.
_set_env() {
    path='/app/portia_server:/app/slyd:/app/slybot'
    PYTHONPATH="$path"
    export PYTHONPATH
}
# Dispatch on the first argument: with no argument, or with start-dev or
# start-prod, run the full stack; any other value is exec'd as a command with
# the remaining arguments.
if [ -z "$action" ]; then
_run
else
case $action in
start-dev|start-prod)
_run
;;
start-webshell)
# NOTE(review): _run_webshell is not defined anywhere in this script, so
# unless the environment provides it this branch fails with "command not
# found" -- confirm whether it should be implemented or removed.
_run_webshell "$@"
;;
*)
# Replace the shell with the requested command.
exec $action "$@"
;;
esac
fi
================================================
FILE: docker/nginx/nginx.conf
================================================
worker_processes 1;
events { worker_connections 1024; }
http {
include mime.types;
sendfile on;
client_max_body_size 0;
gzip on;
gzip_static on;
gzip_http_version 1.0;
gzip_proxied any;
gzip_min_length 500;
gzip_disable "MSIE [1-6]\.";
gzip_types text/plain text/xml text/css
text/comma-separated-values
text/javascript
application/json
application/javascript
application/x-javascript
application/atom+xml;
# Configuration for the server
server {
# Running port
listen 9001;
root /app/portiaui/dist;
location ~ \.map$ {
return 404;
}
location = /index.html {
rewrite /index.html /;
}
location /static {
alias /app/portiaui/dist;
}
location / {
try_files $uri @backend;
}
location /api {
include proxy_portia_server.conf;
}
location /server_capabilities {
include proxy_portia_server.conf;
}
location @backend {
include proxy_slyd.conf;
}
location /ws {
proxy_http_version 1.1;
proxy_buffering off;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "Upgrade";
# proxy_set_header Host 127.0.0.1:9002;
include proxy_slyd.conf;
}
}
}
================================================
FILE: docker/nginx/proxy_portia_server.conf
================================================
proxy_pass http://127.0.0.1:8000;
proxy_redirect off;
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Host $server_name;
================================================
FILE: docker/nginx/proxy_slyd.conf
================================================
proxy_pass http://127.0.0.1:9002;
proxy_redirect off;
proxy_set_header Host $host:9002;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Host $server_name;
================================================
FILE: docker/portia.conf
================================================
description "portia server"
start on vagrant-mounted or filesystem
stop on runlevel [!2345]
script
export PYTHONPATH='/vagrant/portia_server:/vagrant/slyd:/vagrant/slybot'
/vagrant/slyd/bin/slyd -p 9002 -r /vagrant/portiaui/dist &
/vagrant/portia_server/manage.py runserver
end script
respawn
================================================
FILE: docker/provision.sh
================================================
#!/bin/bash
set -e
# Locate the application root (the first candidate directory that contains a
# "slyd" sub-directory) unless the caller already exported APP_ROOT.
if [ "x$APP_ROOT" = x ]
then
    # Candidates, in order: the directory of this script, /app, /vagrant and
    # the current working directory.  "$(pwd)" is quoted so that a working
    # directory containing whitespace stays a single candidate instead of
    # being word-split.
    for dir in "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" /app /vagrant "$(pwd)"
    do
        if [ -d "$dir" ] && [ -d "$dir/slyd" ]
        then
            APP_ROOT="$dir"
            break
        fi
    done
fi
# Nothing matched: give up instead of provisioning against an unknown tree.
if [ "x$APP_ROOT" = x ]
then
    echo "Could not determine app directory"
    exit 1
fi
echo "APP_ROOT=$APP_ROOT"
# Print the help text listing every provisioning command.
# Each listed command is a shell function of the same name in this script;
# commands are executed in the order given on the command line.
usage() {
    cat <<EOF
Portia provisioner.
Usage: $0 COMMAND [ COMMAND ... ]
Available commands:
usage -- print this message
prepare_install -- prepare image for installation
install_deps -- install general system-level dependencies
install_qtwebkit_deps -- install Qt and WebKit dependencies
download_official_qt -- download the official Qt installer
install_official_qt -- install Qt using official installer
install_qtwebkit -- install updated WebKit for QT
install_pyqt5 -- install PyQT5 from sources
install_python_deps -- install python packages
install_msfonts -- agree with EULA and install Microsoft fonts
install_extra_fonts -- install extra fonts
install_flash -- install flash plugin
install_splash -- download Splash sources and install Splash with its dependencies
install_frontend_deps -- install bower and ember-cli globally
build_assets -- build the portiaui assets
remove_builddeps -- WARNING: only for Docker! Remove build-dependencies.
remove_extra -- WARNING: only for Docker! Remove files that are not necessary to run Splash.
configure_initctl -- installs initctl configuration
configure_nginx -- installs nginx configuration
migrate_django_db -- run the Django migrations of portia_server
start_portia -- start nginx and the portia service
EOF
}
SPLASH_SIP_VERSION=${SPLASH_SIP_VERSION:-"4.19.3"}
SPLASH_PYQT_VERSION=${SPLASH_PYQT_VERSION:-"5.9"}
SPLASH_BUILD_PARALLEL_JOBS=${SPLASH_BUILD_PARALLEL_JOBS:-"2"}
QT_MIRROR=${QT_MIRROR:-"http://ftp.fau.de/qtproject/official_releases/qt/5.9/5.9.1/qt-opensource-linux-x64-5.9.1.run"}
export PATH=/opt/qt59/5.9.1/gcc_64/bin:$PATH
# '2' is not supported by this script; allowed values are "3" and "venv" (?).
SPLASH_PYTHON_VERSION=${SPLASH_PYTHON_VERSION:-"3"}
if [[ ${SPLASH_PYTHON_VERSION} == "venv" ]]; then
_PYTHON=python
else
_PYTHON=python${SPLASH_PYTHON_VERSION}
fi
# Activate the virtualenv at $VIRTUAL_ENV when SPLASH_PYTHON_VERSION is "venv";
# in every other mode this is a successful no-op.
_activate_venv () {
    [[ ${SPLASH_PYTHON_VERSION} != "venv" ]] && return 0
    source ${VIRTUAL_ENV}/bin/activate
}
prepare_install () {
# Prepare docker image for installation of packages, docker images are
# usually stripped and apt-get doesn't work immediately.
#
# The sed call appends "universe" to every sources.list line ending in "main".
# software-properties-common and python3-software-properties are installed
# here; presumably they provide the add-apt-repository command that
# install_deps relies on -- confirm.
sed 's/main$/main universe/' -i /etc/apt/sources.list
apt-get update
apt-get install -y --no-install-recommends \
curl \
software-properties-common \
apt-transport-https \
python3-software-properties
}
install_deps () {
# Register the nginx.org apt repository and import a signing key.
# NOTE(review): the repository line targets "trusty" while the Dockerfile is
# based on ubuntu:16.04 -- confirm this is intended.
echo deb http://nginx.org/packages/ubuntu/ trusty nginx > /etc/apt/sources.list.d/nginx.list
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys ABF5BD827BD9BF62
# Get more recent node install (NodeSource setup script for Node.js 8.x).
# NOTE(review): prepare_install installs curl but not wget; the pipeline's
# exit status is that of `bash -`, so a missing wget would not abort the
# script -- confirm wget is available on every target image.
wget -O - https://deb.nodesource.com/setup_8.x | bash -
# Install system dependencies for Qt, Python packages, etc.
# ppa:pi-rho/security is a repo for libre2-dev
add-apt-repository -y ppa:pi-rho/security && \
apt-get update -q && \
apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
build-essential \
libre2-dev \
liblua5.2-dev \
libsqlite3-dev \
zlib1g \
zlib1g-dev \
netbase \
ca-certificates \
pkg-config \
nodejs \
libmysqlclient-dev \
python-mysql.connector \
python-numpy \
python-openssl \
python-pip \
nginx
}
install_qtwebkit_deps () {
apt-get install -y --no-install-recommends \
xvfb \
libjpeg-turbo8-dev \
libgl1-mesa-dev \
libglu1-mesa-dev \
mesa-common-dev \
libfontconfig1-dev \
libicu-dev \
libpng12-dev \
libxslt1-dev \
libxml2-dev \
libhyphen-dev \
libgbm1 \
libxcb-image0 \
libxcb-icccm4 \
libxcb-keysyms1 \
libxcb-render-util0 \
libxi6 \
libxcomposite-dev \
libxrender-dev \
libgstreamer1.0-dev \
libgstreamer-plugins-base1.0-dev \
libgstreamer-plugins-good1.0-dev \
gstreamer1.0-plugins-good \
gstreamer1.0-x \
gstreamer1.0-libav \
webp \
rsync
}
_ensure_folders () {
mkdir -p /downloads && \
mkdir -p /builds && \
chmod a+rw /downloads && \
chmod a+rw /builds
}
download_official_qt() {
_ensure_folders && \
curl -L -o /downloads/qt-installer.run \
$QT_MIRROR
}
# Run the downloaded Qt installer unattended under xvfb-run, driven by the
# control script passed via --script, then list what was installed.
install_official_qt () {
# XXX: if qt version is changed, Dockerfile should be updated,
# as well as the installer control script (/app/script.qs, which the
# Dockerfile copies from docker/qt_install.qs).
# The egrep filter drops installer output lines that match the
# "Warning: Unsupported screen format" pattern or mention QPainter/QWidget.
chmod +x /downloads/qt-installer.run && \
xvfb-run /downloads/qt-installer.run \
--script /app/script.qs \
| egrep -v '\[[0-9]+\] Warning: (Unsupported screen format)|((QPainter|QWidget))' && \
ls /opt/qt59/ && \
# cat /opt/qt59/InstallationLog.txt && \
cat /opt/qt59/components.xml
}
install_qtwebkit () {
# Install webkit from https://github.com/annulen/webkit
_ensure_folders && \
curl -L -o /downloads/qtwebkit.tar.xz https://github.com/annulen/webkit/releases/download/qtwebkit-5.212.0-alpha2/qtwebkit-5.212.0_alpha2-qt59-linux-x64.tar.xz && \
pushd /builds && \
tar xvfJ /downloads/qtwebkit.tar.xz --keep-newer-files && \
rsync -aP /builds/qtwebkit-5.212.0_alpha2-qt59-linux-x64/* `qmake -query QT_INSTALL_PREFIX`
}
install_pyqt5 () {
_ensure_folders && \
_activate_venv && \
${_PYTHON} --version && \
curl -L -o /downloads/sip.tar.gz https://sourceforge.net/projects/pyqt/files/sip/sip-${SPLASH_SIP_VERSION}/sip-${SPLASH_SIP_VERSION}.tar.gz && \
curl -L -o /downloads/pyqt5.tar.gz https://sourceforge.net/projects/pyqt/files/PyQt5/PyQt-${SPLASH_PYQT_VERSION}/PyQt5_gpl-${SPLASH_PYQT_VERSION}.tar.gz && \
# curl -L -o /downloads/sip.tar.gz https://www.riverbankcomputing.com/static/Downloads/sip/sip-${SPLASH_SIP_VERSION}.tar.gz && \
# curl -L -o /downloads/pyqt5.tar.gz https://www.riverbankcomputing.com/static/Downloads/PyQt5/PyQt5_gpl-${SPLASH_PYQT_VERSION}.tar.gz && \
ls -lh /downloads && \
# TODO: check downloads
pushd /builds && \
# SIP
tar xzf /downloads/sip.tar.gz --keep-newer-files && \
pushd sip-${SPLASH_SIP_VERSION} && \
${_PYTHON} configure.py && \
make -j ${SPLASH_BUILD_PARALLEL_JOBS} && \
make install && \
popd && \
# PyQt5
tar xzf /downloads/pyqt5.tar.gz --keep-newer-files && \
pushd PyQt5_gpl-${SPLASH_PYQT_VERSION} && \
# --qmake "${SPLASH_QT_PATH}/bin/qmake" \
${_PYTHON} configure.py -c \
--verbose \
--confirm-license \
--no-designer-plugin \
--no-qml-plugin \
--no-python-dbus \
-e QtCore \
-e QtGui \
-e QtWidgets \
-e QtNetwork \
-e QtWebKit \
-e QtWebKitWidgets \
-e QtSvg \
-e QtPrintSupport && \
make -j ${SPLASH_BUILD_PARALLEL_JOBS} && \
make install && \
popd && \
# Builds Complete
popd
}
install_python_deps(){
    # Install python-level dependencies: packaging tools, the pinned packages
    # listed below, the pyre2 snapshot, and finally the requirements files of
    # slybot, slyd and portia_server.
    #
    # Every step is joined with && so the function returns the status of the
    # first failing step.  With `set -e`, a failure in a non-final member of
    # an && list does not abort the script, so when the chain ended before the
    # requirements installs an early pip failure was masked by the steps that
    # followed.
    _activate_venv && \
    ${_PYTHON} -m pip install -U pip setuptools six && \
    ${_PYTHON} -m pip install \
        qt5reactor==0.4 \
        psutil==5.0.0 \
        Twisted==16.1.1 \
        adblockparser==0.7 \
        xvfbwrapper==0.2.9 \
        funcparserlib==0.3.6 \
        Pillow==3.4.2 \
        lupa==1.3 && \
    ${_PYTHON} -m pip install https://github.com/sunu/pyre2/archive/c610be52c3b5379b257d56fc0669d022fd70082a.zip#egg=re2 && \
    ${_PYTHON} -m pip install -r "$APP_ROOT/slybot/requirements.txt" && \
    ${_PYTHON} -m pip install -r "$APP_ROOT/slyd/requirements.txt" && \
    ${_PYTHON} -m pip install -r "$APP_ROOT/portia_server/requirements.txt"
}
install_msfonts() {
# Agree with EULA and install Microsoft fonts
# apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu xenial multiverse" && \
# apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu xenial-updates multiverse" && \
# apt-get update && \
echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections && \
apt-get install --no-install-recommends -y ttf-mscorefonts-installer
}
install_extra_fonts() {
# Install extra fonts (Chinese and other)
apt-get install --no-install-recommends -y \
fonts-liberation \
ttf-wqy-zenhei \
fonts-arphic-gbsn00lp \
fonts-arphic-bsmi00lp \
fonts-arphic-gkai00mp \
fonts-arphic-bkai00mp \
fonts-beng
}
install_flash () {
apt-add-repository -y "deb http://archive.ubuntu.com/ubuntu trusty multiverse" && \
apt-get update && \
apt-get install -y flashplugin-installer
}
remove_builddeps () {
# WARNING: only for Docker, don't run blindly!
# Uninstall build dependencies.
apt-get remove -y --purge \
python3-dev \
libpython3.5-dev \
libpython3.5 \
libpython3.5-dev \
build-essential \
libre2-dev \
liblua5.2-dev \
zlib1g-dev \
libc-dev \
libjpeg-turbo8-dev \
libcurl3 \
gcc cpp cpp-5 binutils perl rsync && \
apt-get clean -y
}
remove_extra () {
# WARNING: only for Docker, don't run blindly!
# Remove unnecessary files.
rm -rf \
/builds \
/downloads \
/opt/qt59/Docs \
/opt/qt59/Tools \
/opt/qt59/Examples \
/app/.git \
/usr/share/man \
/usr/share/info \
/usr/share/doc \
/var/lib/apt/lists/*
}
install_splash(){
cd /tmp
curl -L -o splash.tar.gz 'https://github.com/scrapinghub/splash/archive/3.2.x.tar.gz'
tar -xvf splash.tar.gz --keep-newer-files
cd splash-*
_activate_venv
prepare_install
install_deps
install_qtwebkit_deps
download_official_qt
install_official_qt
install_qtwebkit
install_pyqt5
install_python_deps
pip install .
}
# Install the nginx configuration and point it at the detected application root.
configure_nginx(){
# NOTE(review): the repository keeps its nginx files under docker/nginx;
# confirm that $APP_ROOT/nginx exists on the machines this command runs on.
cp -r $APP_ROOT/nginx/* /etc/nginx
# Rewrite every "/app/" in nginx.conf to "$APP_ROOT/".  The parameter
# expansion escapes each "/" in APP_ROOT so the value can be embedded in the
# sed expression.
sed 's/\/app\//'""${APP_ROOT//\//\\\/}""'\//g' -i /etc/nginx/nginx.conf
}
configure_initctl(){
cp "$APP_ROOT/portia.conf" /etc/init
}
# Apply the Django migrations of portia_server.
migrate_django_db(){
    # Resolve manage.py through APP_ROOT, as the other commands in this script
    # do, instead of a hard-coded /vagrant path.  On the Vagrant box APP_ROOT
    # resolves to /vagrant, so the command there is unchanged.
    python "$APP_ROOT/portia_server/manage.py" migrate
}
# Start nginx through its init script, then start the "portia" job
# (the job definition is installed by configure_initctl).
start_portia(){
    echo "Starting Nginx"
    echo "=============="
    /etc/init.d/nginx start
    # This banner used to repeat "Starting Nginx" although the command that
    # follows starts the portia service.
    echo "Starting Portia"
    echo "==============="
    start portia
}
install_frontend_deps() {
npm install -g bower ember-cli
}
build_assets() {
cd "$APP_ROOT/portiaui"
npm install && npm run build
}
# Command-line driver: show the help text when called without arguments or
# with -h/--help, reject any argument that is not a function defined in this
# script, then run the requested commands in the order given.
if [ $# -eq 0 ] || [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
    usage
    exit 1
fi

# Validate every command first so that nothing runs when one name is wrong.
UNKNOWN=0
for cmd; do
    case "$(type -t -- "$cmd")" in
        function)
            ;;
        *)
            echo "Unknown command: $cmd"
            UNKNOWN=1
            ;;
    esac
done
if [ $UNKNOWN -eq 1 ]; then
    echo "Unknown commands encountered, exiting..."
    exit 1
fi

# Execute the commands one by one, consuming the argument list.
until [ $# -eq 0 ]; do
    echo "Executing command: $1"
    "$1"
    shift
done
================================================
FILE: docker/qt_install.qs
================================================
// Emacs mode hint: -*- mode: JavaScript -*-
// https://bitbucket.org/xiannox/trusty-qt5.7-beta-x64/raw/HEAD/qt-installer-noninteractive.qs
// https://bitbucket.org/xiannox/trusty-qt5.7-beta-x64
/**
 * Advance the installer wizard by pressing its "Next" button.
 */
function clickNext() {
    gui.clickButton(buttons.NextButton);
}

/**
 * Controller of the unattended Qt installation.
 *
 * Message boxes are rejected automatically, and once the installation has
 * finished the wizard is advanced to the next page.
 */
function Controller() {
    installer.autoRejectMessageBoxes();
    installer.installationFinished.connect(function () {
        clickNext();
    });
}

/** Welcome page: nothing to fill in, continue. */
Controller.prototype.WelcomePageCallback = function () {
    clickNext();
};

/** Credentials page: continue without entering anything. */
Controller.prototype.CredentialsPageCallback = function () {
    clickNext();
};

/** Introduction page: continue. */
Controller.prototype.IntroductionPageCallback = function () {
    clickNext();
};

/** Target directory page: install into /opt/qt59. */
Controller.prototype.TargetDirectoryPageCallback = function () {
    var page = gui.currentPageWidget();
    page.TargetDirectoryLineEdit.setText("/opt/qt59");
    clickNext();
};

/**
 * Component selection page: select exactly the components listed below.
 *
 * To get component names, comment out the deselectAll() call to install
 * everything default, then check the components.xml file.
 */
Controller.prototype.ComponentSelectionPageCallback = function () {
    var wanted = ["qt.591.gcc_64", "qt.591.qtwebengine.gcc_64"];
    var page = gui.currentPageWidget();
    var i;
    page.deselectAll();
    for (i = 0; i < wanted.length; i += 1) {
        page.selectComponent(wanted[i]);
    }
    clickNext();
};

/** License page: accept the license and continue. */
Controller.prototype.LicenseAgreementPageCallback = function () {
    var page = gui.currentPageWidget();
    page.AcceptLicenseRadioButton.setChecked(true);
    clickNext();
};

/** Start menu directory page: keep the defaults and continue. */
Controller.prototype.StartMenuDirectoryPageCallback = function () {
    clickNext();
};

/** Ready-for-installation page: start the installation. */
Controller.prototype.ReadyForInstallationPageCallback = function () {
    clickNext();
};

/**
 * Finished page: untick the "launch Qt Creator" check box when the page
 * offers one, then press "Finish".
 */
Controller.prototype.FinishedPageCallback = function () {
    var form = gui.currentPageWidget().LaunchQtCreatorCheckBoxForm;
    var launchBox = form ? form.launchQtCreatorCheckBox : null;
    if (launchBox) {
        launchBox.checked = false;
    }
    gui.clickButton(buttons.FinishButton);
};
================================================
FILE: docker/restore-mtime.sh
================================================
#!/bin/bash
commit=$(git rev-list -n 1 HEAD requirements.txt)
mtime=$(git show --pretty=format:%ai --abbrev-commit $commit |head -n1)
touch -d "$mtime" requirements.txt
================================================
FILE: docker/run-tests.sh
================================================
#!/bin/bash
export PYTHONPATH=`pwd`/slybot:`pwd`/slyd
pip install tox
cd /app/slyd
python2.7 tests/testserver/server.py 2>&1 | grep -v 'HTTP/1.1" 200' &
sleep 3
cd /app/slybot
tox
cd /app/portia_server
./manage.py test portia_orm.tests
./manage.py test portia_api.tests
================================================
FILE: docker-compose.yml
================================================
version: '3'
services:
app:
build: .
command: /app/docker/entry start-dev
volumes:
- ./data/projects:/app/data/projects:rw
- ./portiaui/dist:/app/portiaui/dist
- ./slyd:/app/slyd
- ./portia_server:/app/portia_server
- ./slybot:/app/slybot
ports:
- 9001:9001
restart: always
================================================
FILE: docs/Makefile
================================================
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# Every target in this Makefile is a command, not a file; latexpdfja, texinfo,
# info, xml and pseudoxml were missing from the list.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext xml pseudoxml
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Portia.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Portia.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/Portia"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Portia"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
# Build the Texinfo sources, then run them through makeinfo.
# The sub-make is invoked through $(MAKE), as the latexpdf target does, so
# that command-line flags and the jobserver are passed on to it.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
================================================
FILE: docs/conf.py
================================================
# -*- coding: utf-8 -*-
#
# Portia documentation build configuration file, created by
# sphinx-quickstart on Tue Aug 25 13:51:18 2015.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
from datetime import datetime
from os import path
VERSION_FILE = path.abspath(path.join(path.dirname(__file__), '..', 'VERSION'))
YEAR = datetime.now().year
with open(VERSION_FILE, 'r') as f:
RELEASE = f.read().strip()
VERSION = RELEASE.rsplit('.', 1)[0]
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = []
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'Portia'
copyright = u'{}, Scrapinghub'.format(YEAR)
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = VERSION
# The full version, including alpha/beta/rc tags.
release = RELEASE
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# NOTE: this value is replaced with 'sphinx_rtd_theme' at the end of this file
# whenever the READTHEDOCS environment variable is not the string 'True'.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'Portiadoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'Portia.tex', u'Portia Documentation',
u'Scrapinghub', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'portia', u'Portia Documentation',
[u'Scrapinghub'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
# The sixth element is the description shown in the Texinfo directory entry;
# it replaces the sphinx-quickstart placeholder text that was left in place.
texinfo_documents = [
    ('index', 'Portia', u'Portia Documentation',
     u'Scrapinghub', 'Portia', u'Visual web scraping built on Scrapy.',
     'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
### Following is taken from https://github.com/snide/sphinx_rtd_theme#using-this-theme-locally-then-building-on-read-the-docs
# on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org
# It is True only when the READTHEDOCS environment variable is exactly the
# string 'True'; an unset variable or any other value counts as a local build.
on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
if not on_rtd: # only import and set the theme if we're building docs locally
import sphinx_rtd_theme
# Overrides the 'default' html_theme assigned earlier in this file.
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# otherwise, readthedocs.org uses their theme by default, so no need to specify it
### end
================================================
FILE: docs/examples.rst
================================================
.. _examples:
========
Examples
========
Crawling paginated listings
===========================
Most e-commerce sites use pagination to spread results across multiple pages.
When crawling these sites with Portia, there are some best practices you should follow:
* Use the target categories as start pages.
* Use URL follow patterns to limit Portia to only visit category and article pages.
This will prevent Portia from visiting unnecessary pages so you can crawl the items a lot faster.
Let's use `timberlandonline.co.uk <http://www.timberlandonline.co.uk>`_ as an example. Say you want to only scrape products from the `boots <http://www.timberlandonline.co.uk/en/men-footwear-boots>`_ and `shoes <http://www.timberlandonline.co.uk/en/men-footwear-shoes>`_ categories. You can :ref:`create a spider <getting-started>` and add the categories to its start URLs:
.. image:: _static/portia-start-urls.png
:target: _static/portia-start-urls.png
:alt: Start URLs
To ensure the spider only visits relevant pages, you'll need to limit crawling to the target categories and product pages. You can accomplish this by defining URL follow patterns in the Link Crawling configuration of your spider:
.. image:: _static/portia-follow-patterns.png
:target: _static/portia-follow-patterns.png
:alt: Follow patterns
You can use follow patterns to filter URLs with `regular expressions <https://en.wikipedia.org/wiki/Regular_expression>`_. You can see which links will be followed by clicking the |icon-toggle-links| button (toggle highlighting) to the right of Portia's URL bar. Followed links will be highlighted in green and excluded links in red.
.. |icon-toggle-links| image:: _static/portia-icon-toggle-links.png
:width: 16px
:height: 16px
As you can see above, the spider will now only visit the boots and shoes category pages and their product listings. To ensure that only products belonging to the target categories are visited, we filter against the ``catID`` parameter value in the URL.
Crawling listings in this manner is much more efficient. You avoid visiting tons of unwanted pages on the site and instead crawl only those you need.
Selecting elements with CSS and XPath
=====================================
You can select elements with CSS or XPath by changing the selection mode of an annotation. You can do this by clicking the |cog-symbol| symbol to the right of the annotation name in the ``ITEMS`` section of the left sidebar.
.. image:: _static/portia-change-selection-mode.png
:alt: Changing the selection mode of an annotation.
This way, you can tweak your selections, making them more or less specific, for example.
.. |cog-symbol| unicode:: 0x2699
Extracting a single attribute to multiple fields
================================================
Portia supports multiple annotations for the same attribute. You can take advantage of this to extract an attribute to multiple fields by simply creating an annotation for each field.
Imagine you want to extract the username and the date from blog posts and this information is represented like this:
.. code-block:: html
<div class="details">
By johndoe on March 3th
</div>
To extract this information separately, you have to annotate the element, click the gear icon right after the field name and add an extractor with a regular expression that captures only the username: ``By (\w+).*``.
After that, you have to go back to annotation mode, click the |icon-add| button in the toolbar and then annotate the same element again. Now, you have to create another extractor to capture only the date from the element: ``By \w+ on (.*)``.
.. |icon-add| image:: _static/portia-icon-add.png
:width: 16px
:height: 16px
Scraping multiple items from a single page
==========================================
You'll often need to retrieve several items from a single page. You can do this using either the repeating element tool |portia-icon-add-repeat| or with the wand |portia-icon-wand| by annotating the first item's element and then clicking the second item's element. Portia will detect all similar items on the page and create annotations for each of them.
.. |portia-icon-add-repeat| image:: _static/portia-icon-add-repeat.png
:width: 16px
:height: 16px
.. |portia-icon-wand| image:: _static/portia-icon-wand.png
:width: 16px
:height: 16px
Let's revisit the `timberlandonline.co.uk <http://www.timberlandonline.co.uk>`_ spider and demonstrate this process by annotating a couple of pairs of shoes.
Click the tiles icon to select the repeating element tool and then click an element, and Portia will find all similar elements and link them to the same field:
.. image:: _static/portia-multi-preview.png
:target: _static/portia-multi-preview.png
:alt: Start URLs
Now you just need to do the same for the other fields, and you're done!
.. _multiple-samples-example:
Using Multiple Samples to Deal with Different Layouts
=====================================================
Some websites use different layouts to display the same kind of information. E-commerce websites usually create special pages for some products on Black Friday, for example. Sometimes, the problem is that some pages might not have all the data you need.
You can create multiple samples, even if you are extracting only one item type, to make sure your spider can handle these variations.
**Consider this example:** our spider has an item type with the fields ``name``, ``price``, ``description`` and ``manufacturer``, where ``name`` and ``price`` are required fields. We have created a sample with annotations for each of those fields. Upon running the spider, many items are correctly scraped; however, there are a large number of scraped items where the ``manufacturer`` field contains what should be the ``description``, and the ``description`` field is empty. This has been caused by some pages having a different layout:
Layout A:
.. code-block:: html
<table>
<tbody>
<tr>
<td>name</td>
<td>price</td>
</tr>
<tr>
<td colspan="2">manufacturer</td>
</tr>
<tr>
<td colspan="2">description</td>
</tr>
</tbody>
</table>
Layout B:
.. code-block:: html
<table>
<tbody>
<tr>
<td>name</td>
<td>price</td>
</tr>
<tr>
<td colspan="2">description</td>
</tr>
</tbody>
</table>
As you can see, the problem lies with the fact that in layout B the description is where manufacturer would be, and with ``description`` not being a required field it means that the sample created for layout A will match layout B. Creating a new sample for layout B won't be enough to fix the problem, as layout A's sample :ref:`would contain more annotations and be matched against first <multiple-samples>`.
Instead we need to modify layout A's sample, and mark the ``description`` annotation as **Required**. With this added constraint, items displayed with layout B will not be matched against layout A's sample due to the missing ``description`` field, so the spider will proceed onto layout B's sample which will extract the data successfully.
:ref:`Click here to learn more about Multiple Samples <multiple-samples>`.
================================================
FILE: docs/faq.rst
================================================
.. _faq:
FAQ
===
How do I use Crawlera with Portia?
----------------------------------
Portia spiders are standard Scrapy spiders, so you can enable the `middleware <https://github.com/scrapy-plugins/scrapy-crawlera>`_ in your project's ``settings.py``.
Does Portia support AJAX based websites?
----------------------------------------
Yes.
Does Portia work with large JavaScript frameworks like Ember?
-------------------------------------------------------------
Backbone, Angular, and Ember have all been thoroughly tested using Portia, and in most cases should work fine. React based websites aren't supported yet but we're working on it.
Does Portia support sites that require you to log in?
-----------------------------------------------------
Yes, you can set credentials in your spider's crawling configuration.
Does Portia support content behind search forms?
------------------------------------------------
No, but we plan on adding support in the near future.
================================================
FILE: docs/getting-started.rst
================================================
.. _getting-started:
===============
Getting Started
===============
.. note:: If you don't have Portia running yet, please read the :ref:`Installation guide <installation>` first.
This tutorial will briefly cover how to begin extracting data with Portia.
Creating a spider
=================
Let's start by creating a project. Enter a URL and Portia will render it like below:
.. This tutorial will briefly cover how to retrieve products from Amazon.com_ using Portia.
.. .. _amazon.com: http://amazon.com/
.. First, create a Portia project and enter a URL. Portia will render it like below:
.. image:: _static/portia-main-page.png
:alt: Portia main page
Click the ``New spider`` button to create a new spider. Portia will add the page's URL as a start page automatically. Start pages are used to seed the crawl and Portia will visit them when you start the spider to find more links.
Creating a sample
=================
A sample describes how data should be extracted from the page. Portia will use your samples to extract data from other pages with a similar structure.
Portia works like a web browser, so you can navigate between pages as you would normally. Navigate to a page you want to scrape and then click the ``New sample`` button to create a :ref:`sample <samples>` of the page.
.. image:: _static/portia-new-spider.png
:alt: Newly created sample
Now that you've created the sample, you can begin :ref:`annotating <what-are-annotations>` the page. Annotations link a piece of data in the page to an item field. You'll notice that you can highlight elements on the page; if you click on one, Portia will create a new field to which the element will be extracted.
Portia will create an :ref:`item <items>` schema from the elements that you annotated and will use it as the data format for the scraped :ref:`items <items>`.
.. image:: _static/portia-annotation.png
:alt: Annotating a page
You can see a preview of the items your sample will extract on the right. Once you've annotated all the data you wish to extract, close the sample. Your spider is :ref:`ready to run <running-spider>`, but you may want to configure it further in which case you should continue reading.
Configuring your crawler
========================
To start crawling a website, Portia needs one or more URLs to visit first so it can gather further links to crawl. You can define these URLs on the left under ``START PAGES``.
.. image:: _static/portia-add-start-pages.png
:alt: Adding start pages
Portia follows all in-domain URLs by default. In many cases you'll want to limit the pages Portia will visit so requests aren't wasted on irrelevant pages.
To do this, you can set follow and exclude patterns that whitelist and blacklist URLs respectively. These can be configured by changing the crawling policy to ``Configure URL patterns``.
For example, Amazon products' URLs contain ``/gp/``, so you can add this as a follow pattern and Portia will know to only follow such URLs.
.. image:: _static/portia-configuring-crawling.png
:alt: Configuring the crawling
What's next?
============
Once you've created your samples and configured crawling behavior, it's time to :ref:`run <running-spider>` your spider.
Check out the :ref:`examples` to learn a few tips to be more productive with Portia.
================================================
FILE: docs/index.rst
================================================
Welcome to Portia's documentation!
==================================
Contents:
.. toctree::
:maxdepth: 2
installation
getting-started
examples
projects
spiders
samples
items
faq
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/installation.rst
================================================
.. _installation:
Installation
============
Docker (recommended)
--------------------
If you are on a Linux machine you will need `Docker <https://docs.docker.com/installation/>`_ installed or if you are using a `Windows <https://docs.docker.com/installation/windows/>`_ or `Mac OS X <https://docs.docker.com/installation/mac/>`_ machine you will need `boot2docker <http://boot2docker.io/>`_.
You can run Portia with the command below::
docker run -i -t --rm -v <PROJECTS_FOLDER>:/app/data/projects:rw -p 9001:9001 scrapinghub/portia
Or with docker-compose by running::
docker compose up
Portia will now be running on port 9001 and you can access it at ``http://localhost:9001``.
Projects will be stored in the project folder that you mount to docker.
To extract data using portia you can run your spider with::
docker run -i -t --rm -v <PROJECTS_FOLDER>:/app/data/projects:rw -v <OUTPUT_FOLDER>:/mnt:rw -p 9001:9001 scrapinghub/portia \
portiacrawl /app/data/projects/PROJECT_NAME SPIDER_NAME -o /mnt/SPIDER_NAME.jl
After the crawl finishes you will find your extracted data in the *<OUTPUT_FOLDER>*.
.. note:: *<PROJECTS_FOLDER>* and *<OUTPUT_FOLDER>* are just paths on your system where your projects and extracted data are stored.
.. warning:: For Windows the *<PROJECTS_FOLDER>* path must be of the form */<DRIVE_LETTER>/<PATH>*. For example: */C/Users/UserName/Documents/PortiaProjects*
Vagrant
-------
Checkout the repository::
$ git clone https://github.com/scrapinghub/portia
You will need `Vagrant <http://www.vagrantup.com/downloads.html>`_, `VirtualBox <https://www.virtualbox.org/wiki/Downloads>`_, `Node.js <https://nodejs.org/en/download/package-manager/>`_, `Bower <https://bower.io/#install-bower>`_ and `ember-cli <https://ember-cli.com/>`_ installed.
Run the following in Portia's directory::
docker/compile-assets.sh
vagrant up
This will launch an Ubuntu virtual machine, build Portia and start the ``portia`` server. You'll then be able to access Portia at ``http://localhost:9001``. You can stop the ``portia`` server using ``vagrant suspend`` or ``vagrant halt``. To run ``portiacrawl`` you will need to SSH into the virtual machine by running ``vagrant ssh``.
Ubuntu
------
Running Portia Locally
^^^^^^^^^^^^^^^^^^^^^^
**These instructions are only valid for an Ubuntu based OS**
Install the following dependencies::
sudo ./provision.sh install_deps
If you would like to run Portia locally you should create an environment with virtualenv::
virtualenv YOUR_ENV_NAME --no-site-packages
source YOUR_ENV_NAME/bin/activate
cd YOUR_ENV_NAME
Now clone this repository into that env::
git clone https://github.com/scrapinghub/portia.git
cd portia
Install splash and the required packages::
sudo ./provision.sh install_deps install_splash install_python_deps
To run Portia start slyd and portia_server::
PYTHONPATH='/vagrant/portia_server:/vagrant/slyd:/vagrant/slybot'
slyd/bin/slyd -p 9002 -r portiaui/dist &
portia_server/manage.py runserver
Portia should now be running on port 9001 and you can access it at ``http://localhost:9001``.
Developing Portia using Docker
------------------------------
To develop Portia using docker you will need `Node.js <https://nodejs.org/en/download/package-manager/>`_, `Bower <https://bower.io/#install-bower>`_ and `ember-cli <https://ember-cli.com/>`_ installed.
To set up Portia for development use the commands below::
mkdir ~/data
git clone git@github.com:scrapinghub/portia.git
cd portia/portiaui
npm install && bower install
cd node_modules/ember-cli && npm install && cd ../../
ember build
cd ..
docker build . -t portia
You can run it using::
docker run -i -t --rm -p 9001:9001 \
-v ~/data:/app/data/projects:rw \
-v ~/portia/portiaui/dist:/app/portiaui/dist \
-v ~/portia/slyd:/app/slyd \
-v ~/portia/portia_server:/app/portia_server \
portia
This sets up the ``portia_server`` to restart with every change you make and if you run
``cd ~/portia/portiaui && ember build -w`` in another shell you can rebuild the Portia assets with every change too.
================================================
FILE: docs/items.rst
================================================
.. _items:
=====
Items
=====
An item refers to a single item of data scraped from the target website. A common example of an item would be a product for sale on an e-commerce website. It's important to differentiate **item** and **item definition**. In Portia, an item definition or item type refers to the schema of an item rather than the item itself. For example, ``book`` would be an item definition, and a specific book scraped from the website would be an item. An item definition consists of multiple fields, so using the example of a product you might have fields named ``name``, ``price``, ``manufacturer`` and so on. We use annotations to extract data from the page into each of these fields.
To ensure certain fields are extracted, you can set the **Required** flag on each required field. Portia will discard an item if any required fields are missing. Portia will also remove any duplicate items by default.
In some cases you may have fields where the value can vary despite being the same item, in which case you can mark them as **Vary**. This will ignore the field when checking for duplicates. It’s important to only use **Vary** when necessary, as misuse could easily lead to duplicate items being stored. The ``url`` field is a good example of where **Vary** is useful, as the same item may have multiple URLs. If the ``url`` field wasn’t marked as **Vary**, each duplicate item would be seen as unique because its URL would be different.
Field types
===========
You can set a field's type to ensure it will only match that kind of data. The following field types are available:
=========  ===========
type       description
=========  ===========
text       Plain text. Any markup is stripped and text within nested elements is also extracted.
number     A numeric value e.g. 7, 9.59.
image      An image URL. In most cases you will want to map an ``img`` element's ``src`` attribute.
price      The same as ``number``, a numeric value.
raw html   Non-sanitized HTML.
safe html  Sanitized HTML. See below for more details.
geopoint   The same as ``text``.
url        A URL.
date       A date value parsed by `dateparser <https://github.com/scrapinghub/dateparser>`_. Won't work if the annotated element has non-date text.
=========  ===========
The ``safe html`` field type keeps the following elements: ``br``, ``p``, ``big``, ``em``, ``small``, ``strong``, ``sub``, ``sup``, ``ins``, ``del``, ``code``, ``kbd``, ``samp``, ``tt``, ``var``, ``pre``, ``listing``, ``plaintext``, ``abbr``, ``acronym``, ``address``, ``bdo``, ``blockquote``, ``q``, ``cite``, ``dfn``, ``table``, ``tr``, ``th``, ``td``, ``tbody``, ``ul``, ``ol``, ``li``, ``dl``, ``dd``, ``dt``.
All other elements are discarded, with the exception of header tags (``h1``, ``h2`` ... ``h6``) and ``b`` which are replaced with ``strong``, and ``i`` which is replaced with ``em``. Whitelisted elements contained within non-whitelisted elements will still be retained, with the exception of elements contained within a ``script``, ``img`` or ``input`` element. For example, ``<div><code>example</code></div>`` would extract to ``<code>example</code>``, whereas ``<script><code>example</code></script>`` would be discarded completely.
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Command file for Sphinx documentation
REM The first argument selects the build target; with no argument the help
REM text is printed.
REM Use the sphinx-build found on PATH unless the caller already set
REM SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
REM Output directory and the option strings shared by the targets below.
REM SPHINXOPTS is taken from the environment and passed through as is.
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
REM When PAPER is set, forward it to Sphinx as the latex_paper_size setting.
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
REM Print the list of supported targets. Reached either through the explicit
REM help target or through the goto issued when no argument is given.
REM The latexpdf and latexpdfja targets implemented further down are listed
REM here as well so the help text matches what the script accepts.
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. latexpdf to make LaTeX files and build PDF files from them
echo. latexpdfja to make LaTeX files and build PDF files from them for Japanese
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
REM clean: remove every subdirectory of the build directory, then delete the
REM files left inside it.
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
REM Probe for sphinx-build before running any real target. An errorlevel of
REM 9009 or higher after the probe is reported as the command not being found
REM and the script exits with code 1.
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM Every target in this group follows the same pattern: run sphinx-build with
REM the matching builder, exit with code 1 if it reports an error, otherwise
REM print a short message and jump to the end label.
REM html: output goes to the html subdirectory of the build directory.
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
REM dirhtml: output goes to the dirhtml subdirectory.
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
REM singlehtml: output goes to the singlehtml subdirectory.
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
REM pickle: output goes to the pickle subdirectory.
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
REM json: output goes to the json subdirectory.
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
REM htmlhelp: output goes to the htmlhelp subdirectory; the message points at
REM the generated .hhp project file.
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
REM qthelp: build the Qt help sources into the qthelp subdirectory and print
REM the follow-up commands. The collection file passed to assistant is the
REM .qhc file; the message previously named a non-existent .ghc file.
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Portia.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Portia.qhc
goto end
)
REM devhelp: run the devhelp builder into the devhelp subdirectory; exits with
REM code 1 if sphinx-build reports an error.
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
REM epub: run the epub builder into the epub subdirectory.
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
REM latex: generate the LaTeX sources only, into the latex subdirectory.
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
REM latexpdf: generate the LaTeX sources, then run make all-pdf inside the
REM latex output directory.
REM pushd and popd are used so the script always returns to the directory it
REM started in; the earlier relative cd back was evaluated from inside the
REM latex directory and therefore never reached the documentation root.
REM The build is aborted with exit code 1 when sphinx-build fails, matching
REM the other targets.
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
pushd %BUILDDIR%\latex
make all-pdf
popd
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
REM latexpdfja: generate the LaTeX sources, then run make all-pdf-ja inside
REM the latex output directory.
REM pushd and popd are used so the script always returns to the directory it
REM started in; the earlier relative cd back was evaluated from inside the
REM latex directory and therefore never reached the documentation root.
REM The build is aborted with exit code 1 when sphinx-build fails, matching
REM the other targets.
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
pushd %BUILDDIR%\latex
make all-pdf-ja
popd
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
REM The remaining targets share one pattern: run sphinx-build with the
REM matching builder, exit with code 1 if it reports an error, otherwise print
REM a short message and jump to the end label.
REM text: output goes to the text subdirectory of the build directory.
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
REM man: output goes to the man subdirectory.
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
REM texinfo: output goes to the texinfo subdirectory.
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
REM gettext: the only target that uses the I18N option string instead of the
REM common one; output goes to the locale subdirectory.
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
REM changes: output goes to the changes subdirectory.
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
REM linkcheck: results are written to output.txt in the linkcheck
REM subdirectory.
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
REM doctest: results are written to output.txt in the doctest subdirectory.
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
REM xml: output goes to the xml subdirectory.
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
REM pseudoxml: output goes to the pseudoxml subdirectory.
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
REM Common exit point that every target jumps to.
:end
================================================
FILE: docs/projects.rst
================================================
.. _projects:
========
Projects
========
A project in Portia consists of one or more :ref:`spiders <spiders>` and can be deployed to any `scrapyd`_ instance.
Versioning
==========
.. _project-deployment:
Portia provides project versioning via Git, but this isn't enabled by default.
Git versioning can be enabled by creating a ``local_settings.py`` file in the ``slyd/slyd`` directory and adding the following:
.. code-block:: python
import os
SPEC_FACTORY = {
'PROJECT_SPEC': 'slyd.gitstorage.projectspec.ProjectSpec',
'PROJECT_MANAGER': 'slyd.gitstorage.projects.ProjectsManager',
'PARAMS': {
'storage_backend': 'dulwich.repo.Repo',
'location': os.environ.get('PORTIA_DATA_DIR', SPEC_DATA_DIR)
},
'CAPABILITIES': {
'version_control': True,
'create_projects': True,
'delete_projects': True,
'rename_projects': True
}
}
You can also use MySQL to store your project files in combination with Git:
.. code-block:: python
import os
SPEC_FACTORY = {
'PROJECT_SPEC': 'slyd.gitstorage.projectspec.ProjectSpec',
'PROJECT_MANAGER': 'slyd.gitstorage.projects.ProjectsManager',
'PARAMS': {
'storage_backend': 'slyd.gitstorage.repo.MysqlRepo',
'location': os.environ.get('DB_URL'),
},
'CAPABILITIES': {
'version_control': True,
'create_projects': True,
'delete_projects': True,
'rename_projects': True
}
}
This will store versioned projects as blobs within the MySQL database that you specify by setting the environment variable below::
DB_URL = mysql://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DB>
Once this environment variable is set, the database can be initialized by running the ``bin/init_mysqldb`` script.
.. note:: The MySQL backend only stores project data. Data generated during crawl is still stored locally.
Deployment
==========
You can deploy your Portia projects using `scrapyd`_. Change directory into ``slyd/data/projects/PROJECT_NAME`` and add your target to ``scrapy.cfg``. You'll then be able to run ``scrapyd-deploy`` which will deploy your project using the default deploy target. Alternatively, you can specify a target and project using the following::
scrapyd-deploy your_scrapyd_target -p project_name
Once your spider is deployed, you can schedule your spider via ``schedule.json``::
curl http://your_scrapyd_host:6800/schedule.json -d project=your_project_name -d spider=your_spider_name
.. warning:: Running scrapyd from your project directory will cause deployment to fail.
.. _scrapyd: https://scrapyd.readthedocs.org/en/latest/
================================================
FILE: docs/samples.rst
================================================
.. _samples:
=======
Samples
=======
What are samples?
=================
When the crawler visits a page, it matches the page against each sample. Samples with more annotations take precedence over those with fewer. If the page matches a sample, it will use the sample's annotations to extract data. Assuming all required fields are filled, it will yield an item. Spiders consist of one or more samples and each sample is made up of annotations that define the elements you wish to extract. Within the sample you define the item you want to extract and mark required fields for that item.
.. _what-are-annotations:
What are annotations?
=====================
An annotation defines the location of a piece of data on the web page and how it should be used by the spider. Typically an annotation maps some element on the page to a particular field of an item, but there is also the option to mark the data as being required without storing the data in an item. It's possible to map attributes of a particular element instead of the content if this is required, for example you can map the ``href`` attribute of an anchor link rather than the text.
Annotations
===========
Creating annotations
--------------------
You can create annotations by clicking an element on the page with the appropriate tool selected. You should use the wand (|icon-wand|) most of the time as it will select the appropriate tool automatically. The following tools are available:
* |icon-wand| - Select the most appropriate tool when clicking on an element
* |icon-select| - Select an element
* |icon-add| - Add an element
* |icon-sub| - Remove an element
* |icon-add-repeat| - Add repeating element
.. |icon-wand| image:: _static/portia-icon-wand.png
:width: 16px
:height: 16px
.. |icon-select| image:: _static/portia-icon-pointer.png
:width: 16px
:height: 16px
.. |icon-add| image:: _static/portia-icon-add.png
:width: 16px
:height: 16px
.. |icon-sub| image:: _static/portia-icon-sub.png
:width: 16px
:height: 16px
.. |icon-add-repeat| image:: _static/portia-icon-add-repeat.png
:width: 16px
:height: 16px
Extractors
----------
You can also add extractors to annotations. Extractors let you use regular expressions or a pre-defined type to further refine data extracted from a page.
For example, assume there's an element that contains a phone number, but it has additional text that you don't need. In this scenario you could add an extractor to retrieve only the phone number instead of the full text.
You can define the extractor for a particular field by clicking the gear icon right after the field type:
.. image:: _static/portia-goto-extractors.png
:alt: Field extractors
You can then select any of the built-in extractors or create your own extractor via regular expressions:
.. image:: _static/portia-extractors.png
:alt: Field extractors
Multiple fields
---------------
It's possible to extract multiple fields using a single annotation if there are several properties you want to extract from an element. For example, if there was an image you wanted, you could map the ``src`` attribute that contains the image URL to one field, and the ``alt`` attribute to another.
You can do it in the ``Inspector`` panel in the top left of the screen:
.. image:: _static/portia-sample-multiple-fields.png
:alt: Multiple fields from one element
Just click the ``+`` button right after an attribute to add a new field based on the same annotation.
.. _multiple-samples:
Multiple samples
================
It's often necessary to use multiple samples within one spider, even if you're only extracting one item type. Some pages containing the same item type may have a different layout or fields missing, and you will need to accommodate those pages by creating a sample for each variation in layout.
Sample precedence
-----------------
The more annotations a sample has, the more specific the data being extracted and therefore the lower the chance of a false positive. For this reason, samples with more annotations take precedence over those with fewer annotations. If a subset of samples contains an equal number of annotations per sample, then within that subset samples will be tried in the order they were created, from first to last. In other words, samples are tried sequentially in order of number of annotations first, and age second.
If you are working with a large number of samples, it may be difficult to ensure the correct sample is applied to the right page. It's best to keep samples as strict as possible to avoid any false matches. It's useful to take advantage of the ``Required`` option from item fields and annotate elements that will always appear on matching pages to reduce the number of false positives.
**Check this example** to learn how to do it: :ref:`multiple-samples-example`.
================================================
FILE: docs/spiders.rst
================================================
.. _spiders:
=======
Spiders
=======
Spiders are web crawlers that use :ref:`samples <samples>` to extract data from the pages they visit.
.. _spider-properties:
Spider properties
=================
You can access your spider's properties by clicking the gear icon located right of your spider in the list on the left.
.. image:: _static/portia-spider-properties.png
:alt: Spider properties
Configuring login details
-------------------------
If you need to log into a site, you can configure login details by ticking 'Perform login' in the :ref:`spider properties <spider-properties>` menu. Here you can set the login URL, username and password.
Enabling JavaScript
-------------------
You can enable JavaScript in your spider by ticking ``Enable JavaScript`` in the :ref:`spider properties <spider-properties>` menu. Note that you'll need to set the ``SPLASH_URL`` Scrapy setting to your `Splash <https://github.com/scrapinghub/splash>`_ endpoint URL for JavaScript to work during the crawl.
Start pages and link crawling
=============================
Start pages are the initial URLs that Portia will visit to start the crawl. You can add and remove start pages on the left menu.
You can choose how Portia will follow links under ``LINK CRAWLING``.
.. image:: _static/portia-spider-link-crawling.png
:alt: Link crawling properties
* Follow all in-domain links - follow all links under the same domain and subdomain.
* Don't follow links - only visit start URLs.
* Configure url patterns - use regular expressions to choose which URLs to follow.
The ``Configure url patterns`` option lets you set follow and exclude patterns as well as choose whether to respect the ``nofollow`` attribute. Click the gear icon to show the link crawling options where you can set the follow/exclude patterns.
.. _running-spider:
Running a spider
================
Portia will save your projects in ``slyd/data/projects``. You can use ``portiacrawl`` to run a spider::
portiacrawl PROJECT_PATH SPIDER_NAME
where ``PROJECT_PATH`` is the path of the project and ``SPIDER_NAME`` is a spider that exists within that project. You can list the spiders for a project with the following::
portiacrawl PROJECT_PATH
Portia spiders are ultimately `Scrapy <http://scrapy.org>`_ spiders. You can pass Scrapy arguments when running with ``portiacrawl`` using the ``-a`` option. You can also specify a custom settings module using the ``--settings`` option. The `Scrapy documentation <http://doc.scrapy.org/en/latest>`_ contains full details on available options and settings.
Minimum items threshold
=======================
To avoid infinite crawling loops, Portia spiders check to see if the number of scraped items meets a minimum threshold over a given period of time. If not, the job is closed with ``slybot_fewitems_scraped`` outcome.
By default, the period of time is 3600 seconds and the threshold is 200 items scraped. This means if fewer than 200 items were scraped in the last 3600 seconds, the job will close.
You can set the period in seconds with the ``SLYCLOSE_SPIDER_CHECK_PERIOD`` setting, and the threshold number of items with the ``SLYCLOSE_SPIDER_PERIOD_ITEMS`` setting.
================================================
FILE: portia_server/db_repo/__init__.py
================================================
================================================
FILE: portia_server/db_repo/apps.py
================================================
from __future__ import unicode_literals
from django.apps import AppConfig
class DbRepoConfig(AppConfig):
    """Django application configuration for the ``db_repo`` app."""
    name = 'db_repo'
================================================
FILE: portia_server/db_repo/migrations/0001_initial.py
================================================
# -*- coding: utf-8 -*-
# Generated by Django 1.10 on 2016-10-04 06:54
from __future__ import unicode_literals
import db_repo.models
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration for ``db_repo``: creates the ``objs`` and ``refs`` tables."""

    initial = True

    dependencies = [
    ]

    operations = [
        # ``objs`` table: one row per stored object of a repository.
        migrations.CreateModel(
            name='Objs',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('oid', db_repo.models.RealBinaryField(default='\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', max_length=40)),
                ('repo', models.CharField(max_length=64)),
                ('type', db_repo.models.PositiveTinyIntegerField(db_index=True)),
                ('size', db_repo.models.PositiveBigIntegerField(db_index=True)),
                ('data', db_repo.models.CompressedBinaryField()),
            ],
            options={
                'db_table': 'objs',
            },
        ),
        # ``refs`` table: one row per named reference of a repository.
        migrations.CreateModel(
            name='Refs',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('ref', models.CharField(default='', max_length=100)),
                ('repo', models.CharField(max_length=64)),
                ('value', db_repo.models.RealBinaryField(db_index=True, max_length=40)),
            ],
            options={
                'db_table': 'refs',
            },
        ),
        # A reference name may appear only once per repository.
        migrations.AlterUniqueTogether(
            name='refs',
            unique_together=set([('ref', 'repo')]),
        ),
        # An object id may appear only once per repository.
        migrations.AlterUniqueTogether(
            name='objs',
            unique_together=set([('oid', 'repo')]),
        ),
    ]
================================================
FILE: portia_server/db_repo/migrations/__init__.py
================================================
================================================
FILE: portia_server/db_repo/migrations/slyd_to_django.sql
================================================
-- Reshapes existing `objs` and `refs` tables: adds a surrogate `id` primary
-- key, (re)declares the per-repository unique constraints and recreates the
-- secondary indexes under new names.
-- NOTE(review): presumably run once against a pre-Django (slyd) database so it
-- matches the db_repo models instead of applying 0001_initial; confirm.

-- objs: swap the old primary key for an auto-increment `id`, keep
-- (`oid`, `repo`) unique, convert to utf8mb4 and drop the default on `oid`.
ALTER TABLE `objs` DROP PRIMARY KEY,
ADD COLUMN `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
ADD CONSTRAINT `objs_oid_feda89ac_uniq` UNIQUE (`oid`, `repo`),
CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
ALTER COLUMN `oid` DROP DEFAULT;
-- objs: replace the `type` and `size` indexes with renamed equivalents.
DROP INDEX `type` ON `objs`;
DROP INDEX `size` ON `objs`;
CREATE INDEX `objs_599dcce2` ON `objs` (`type`);
CREATE INDEX `objs_f7bd60b7` ON `objs` (`size`);
-- refs: same treatment, with (`ref`, `repo`) as the unique pair.
ALTER TABLE `refs` DROP PRIMARY KEY,
ADD COLUMN `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
ADD CONSTRAINT `refs_ref_4a751775_uniq` UNIQUE (`ref`, `repo`),
CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
ALTER COLUMN `ref` DROP DEFAULT;
-- refs: replace the `value` index with a renamed equivalent.
DROP INDEX `value` ON `refs`;
CREATE INDEX `refs_2063c160` ON `refs` (`value`);
================================================
FILE: portia_server/db_repo/models.py
================================================
from __future__ import unicode_literals
from django.db.models import (Model, BinaryField, BigIntegerField,
PositiveSmallIntegerField, CharField)
from django.db.models.expressions import Func, Value
class PositiveTinyIntegerField(PositiveSmallIntegerField):
    """Positive small integer that maps to an unsigned ``tinyint`` on MySQL."""

    def db_type(self, connection):
        """Return the column type for ``connection``.

        MySQL gets ``tinyint(1) unsigned``; every other backend keeps the
        type chosen by the parent field.
        """
        if connection.vendor != 'mysql':
            return super(PositiveTinyIntegerField, self).db_type(connection)
        return "tinyint(1) unsigned"
class PositiveBigIntegerField(BigIntegerField):
    """Big integer that maps to an unsigned ``bigint`` on MySQL."""

    def db_type(self, connection):
        """Return the column type for ``connection``.

        MySQL gets ``bigint(20) unsigned``; every other backend keeps the
        type chosen by the parent field.
        """
        if connection.vendor != 'mysql':
            return super(PositiveBigIntegerField, self).db_type(connection)
        return "bigint(20) unsigned"
class RealBinaryField(BinaryField):
    """Binary field stored as a fixed-width ``binary(n)`` column on MySQL."""

    def db_type(self, connection):
        """Return the column type for ``connection``.

        On MySQL the width is taken from ``max_length``; every other backend
        keeps the type chosen by the parent field.
        """
        if connection.vendor != 'mysql':
            return super(RealBinaryField, self).db_type(connection)
        return "binary({})".format(self.max_length)
class CompressedBinaryField(BinaryField):
    """Binary field wrapped in ``COMPRESS``/``UNCOMPRESS`` calls on MySQL."""

    def get_db_prep_save(self, value, connection):
        """Prepare ``value`` for saving; wrap it in ``COMPRESS(...)`` on MySQL."""
        prepared = super(CompressedBinaryField, self).get_db_prep_save(
            value, connection)
        if connection.vendor != 'mysql':
            return prepared
        return Func(Value(prepared), function='COMPRESS')

    def select_format(self, compiler, sql, params):
        """Return the SELECT expression; wrap it in ``UNCOMPRESS(...)`` on MySQL."""
        base_sql, base_params = super(
            CompressedBinaryField, self).select_format(compiler, sql, params)
        if compiler.connection.vendor != 'mysql':
            return base_sql, base_params
        return 'UNCOMPRESS({})'.format(base_sql), base_params
class Objs(Model):
    """A stored object of a repository, kept in the ``objs`` table.

    The pair ``(oid, repo)`` is unique.
    """
    # Object id, 40 bytes wide; defaults to 40 NUL characters.
    oid = RealBinaryField(max_length=40, default='\0' * 40, null=False)
    # Name of the repository the object belongs to.
    repo = CharField(max_length=64, null=False)
    # Numeric object type (indexed).
    type = PositiveTinyIntegerField(null=False, db_index=True)
    # Size of the object payload (indexed).
    size = PositiveBigIntegerField(null=False, db_index=True)
    # Object payload; CompressedBinaryField compresses it on MySQL.
    data = CompressedBinaryField(null=False)

    class Meta(object):
        unique_together = (('oid', 'repo'),)
        db_table = 'objs'
class Refs(Model):
    """A named reference of a repository, kept in the ``refs`` table.

    The pair ``(ref, repo)`` is unique.
    """
    # Reference name.
    ref = CharField(max_length=100, default='', null=False)
    # Name of the repository the reference belongs to.
    repo = CharField(max_length=64, null=False)
    # Value the reference points at, 40 bytes wide (indexed).
    value = RealBinaryField(max_length=40, null=False, db_index=True)

    class Meta(object):
        unique_together = (('ref', 'repo'),)
        db_table = 'refs'
================================================
FILE: portia_server/db_repo/repo.py
================================================
from django.db.transaction import get_autocommit
from django.db.utils import IntegrityError
from dulwich.errors import ObjectMissing
from dulwich.object_store import BaseObjectStore, MemoryObjectStore
from dulwich.objects import sha_to_hex
from dulwich.repo import BaseRepo, MemoryRepo
from dulwich.refs import DictRefsContainer, RefsContainer, SYMREF
from six import get_unbound_function
from .models import Objs, Refs
class MysqlObjectStore(BaseObjectStore):
    """Object store that keeps all objects in a mysql database."""

    def __init__(self, repo):
        """Create a store for the repository named ``repo``."""
        super(MysqlObjectStore, self).__init__()
        self._repo = repo

    # Implementations borrowed unchanged from dulwich's MemoryObjectStore.
    add_objects = get_unbound_function(MemoryObjectStore.add_objects)
    add_thin_pack = get_unbound_function(MemoryObjectStore.add_thin_pack)
    contains_packed = get_unbound_function(MemoryObjectStore.contains_packed)
    packs = MemoryObjectStore.packs
    _complete_thin_pack = get_unbound_function(
        MemoryObjectStore._complete_thin_pack)

    def _to_hexsha(self, sha):
        """Return ``sha`` in 40-character hex form.

        :param sha: a 40-character hex sha or a 20-byte binary sha.
        :raise ValueError: if ``sha`` has any other length.
        """
        if len(sha) == 40:
            return sha
        elif len(sha) == 20:
            return sha_to_hex(sha)
        else:
            raise ValueError("Invalid sha %r" % (sha,))

    def _has_sha(self, sha):
        """Look for the sha in the database."""
        return Objs.objects.filter(repo=self._repo, oid=sha).exists()

    def _all_shas(self):
        """Return all db sha keys."""
        for obj in Objs.objects.filter(repo=self._repo).only('oid').iterator():
            yield obj.oid

    def contains_loose(self, sha):
        """Check if a particular object is present by SHA1 and is loose."""
        return self._has_sha(self._to_hexsha(sha))

    def __iter__(self):
        """Iterate over the SHAs that are present in this store."""
        return self._all_shas()

    def get_raw(self, name):
        """Obtain the raw text for an object.

        :param name: sha for the object.
        :return: tuple with numeric type and object contents.
        :raise ObjectMissing: if no row exists for ``name`` in this repo.
        """
        try:
            obj = Objs.objects.only('type', 'data')\
                .get(repo=self._repo, oid=self._to_hexsha(name))
        except Objs.DoesNotExist:
            # last resort fallback, this exception will cause a retry
            raise ObjectMissing(name)
        else:
            return obj.type, obj.data

    def add_object(self, obj):
        """Store ``obj`` unless a row with its id already exists in this repo.

        The row is looked up by ``(repo, oid)`` only and the remaining columns
        are passed as ``defaults``. The previous code passed every column,
        including the payload, as a lookup term, so the existence check also
        compared the payload and an already-stored object could end up as a
        failed INSERT whose IntegrityError was swallowed.
        """
        data = obj.as_raw_string()
        try:
            Objs.objects.get_or_create(
                repo=self._repo, oid=obj.id,
                defaults={
                    'type': obj.get_type(),
                    'size': len(data),
                    'data': data,
                })
        except IntegrityError:
            # Another writer inserted the same (oid, repo) concurrently; the
            # object is stored, so there is nothing left to do.
            pass

    def delete_objects(self, object_ids):
        """Delete the rows of this repo whose oid is in ``object_ids``."""
        Objs.objects.filter(repo=self._repo, oid__in=object_ids).delete()
class MysqlRefsContainer(RefsContainer):
    """RefsContainer backed by MySql.

    This container does not support packed references.
    """

    def __init__(self, repo):
        """Create a container for the references of repository ``repo``."""
        super(MysqlRefsContainer, self).__init__()
        self._repo = repo

    # Implementation borrowed unchanged from dulwich's DictRefsContainer.
    get_packed_refs = get_unbound_function(DictRefsContainer.get_packed_refs)

    def allkeys(self):
        """Yield the name of every reference stored for this repository."""
        rows = Refs.objects.filter(repo=self._repo).only('ref')
        for row in rows.iterator():
            yield row.ref

    def read_loose_ref(self, name):
        """Return the stored value of ``name``, or None when it is absent.

        Outside autocommit mode the row is fetched with ``select_for_update``.
        """
        queryset = Refs.objects.only('value')
        if not get_autocommit(using=queryset._db):
            queryset = queryset.select_for_update()
        try:
            row = queryset.get(repo=self._repo, ref=name)
        except Refs.DoesNotExist:
            return None
        return row.value

    def set_symbolic_ref(self, name, other):
        """Make ``name`` a symbolic reference to ``other``."""
        self._update_ref(name, SYMREF + other)

    def set_if_equals(self, name, old_ref, new_ref):
        """Set ``name`` to ``new_ref`` if it currently equals ``old_ref``.

        The comparison is skipped when ``old_ref`` is None. Every name
        returned by ``follow(name)`` is validated and updated.

        :return: False if the comparison failed, True otherwise.
        """
        if old_ref is not None:
            if self.read_loose_ref(name) != old_ref:
                return False
        realnames, _ = self.follow(name)
        for target in realnames:
            self._check_refname(target)
            self._update_ref(target, new_ref)
        return True

    def add_if_new(self, name, ref):
        """Store ``ref`` under ``name`` unless ``name`` already has a value.

        :return: True if the reference was written, False otherwise.
        """
        current = self.read_loose_ref(name)
        if current:
            return False
        self._update_ref(name, ref)
        return True

    def remove_if_equals(self, name, old_ref):
        """Remove ``name`` if it currently equals ``old_ref``.

        The comparison is skipped when ``old_ref`` is None.

        :return: False if the comparison failed, True otherwise.
        """
        if old_ref is not None:
            if self.read_loose_ref(name) != old_ref:
                return False
        self._remove_ref(name)
        return True

    def _update_ref(self, name, value):
        """Insert or update the row for ``name`` with ``value``."""
        Refs.objects.update_or_create(
            repo=self._repo, ref=name, defaults={'value': value})

    def _remove_ref(self, name):
        """Delete the row for ``name`` in this repository."""
        Refs.objects.filter(repo=self._repo, ref=name).delete()
class MysqlRepo(BaseRepo):
    """Repo that stores refs, objects, and named files in MySql.

    MySql repos are always bare: they have no working tree and no index, since
    those have a stronger dependency on the filesystem.
    """

    def __init__(self, name):
        """Bind the repo to ``name`` and to its database-backed stores."""
        self._name = name
        object_store = MysqlObjectStore(name)
        refs = MysqlRefsContainer(name)
        BaseRepo.__init__(self, object_store, refs)
        self.bare = True

    # Implementation borrowed unchanged from dulwich's MemoryRepo.
    open_index = get_unbound_function(MemoryRepo.open_index)

    def head(self):
        """Return the SHA1 pointed at by HEAD."""
        return self.refs['refs/heads/master']

    @classmethod
    def init_bare(cls, name):
        """Create a new bare repository."""
        return cls(name)

    @classmethod
    def open(cls, name):
        """Open an existing repository."""
        return cls(name)

    @classmethod
    def repo_exists(cls, name):
        """Check if a repository exists."""
        stored_objects = Objs.objects.filter(repo=name)
        return stored_objects.exists()

    @classmethod
    def list_repos(cls):
        """List all repository names."""
        distinct_rows = Objs.objects.distinct()
        return distinct_rows.values_list('repo', flat=True)

    @classmethod
    def delete_repo(cls, name):
        """Delete a repository."""
        for table in (Objs, Refs):
            table.objects.filter(repo=name).delete()
================================================
FILE: portia_server/manage.py
================================================
#!/usr/bin/env python3
"""Command-line entry point for Django management commands."""
import os
import sys

if __name__ == "__main__":
    # Only sets the settings module when the environment has not chosen one.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "portia_server.settings")
    # Imported after the settings module has been selected above.
    from django.core.management import execute_from_command_line
    execute_from_command_line(sys.argv)
================================================
FILE: portia_server/portia_api/__init__.py
================================================
================================================
FILE: portia_server/portia_api/apps.py
================================================
from __future__ import unicode_literals
from django.apps import AppConfig
class PortiaApiConfig(AppConfig):
    """Django application configuration for the ``portia_api`` app."""
    name = 'portia_api'
================================================
FILE: portia_server/portia_api/errors.py
================================================
class BaseError(Exception):
    """Base error carrying a status, a title and an optional body.

    :param status: status value exposed through ``status``.
    :param title: short description exposed through ``title``.
    :param body: longer description exposed through ``body``; defaults to ``''``.
    """

    def __init__(self, status, title, body=''):
        self._status, self._title, self._body = status, title, body

    @property
    def status(self):
        """Status value given at construction."""
        return self._status

    @property
    def title(self):
        """Title given at construction."""
        return self._title

    @property
    def body(self):
        """Body given at construction."""
        return self._body

    def __str__(self):
        """Format the error as ``"<status>: <title>"``."""
        parts = (self.status, self.title)
        return '%s: %s' % parts

    def __repr__(self):
        """Format the error as ``"<ClassName>(<status>: <title>)"``."""
        class_name = type(self).__name__
        return '%s(%s)' % (class_name, str(self))
class BaseHTTPError(BaseError):
    """Error whose status comes from the class attribute ``_status``.

    Subclasses override ``_status``; callers pass only the title and body.
    """
    # Placeholder status; each concrete subclass overrides it.
    _status = 999

    def __init__(self, title, body=''):
        super(BaseHTTPError, self).__init__(self._status, title, body)
class BadRequest(BaseHTTPError):
    """Error with status 400."""
    _status = 400


class Forbidden(BaseHTTPError):
    """Error with status 403."""
    _status = 403


class NotFound(BaseHTTPError):
    """Error with status 404."""
    _status = 404


class InternalServerError(BaseHTTPError):
    """Error with status 500."""
    _status = 500
================================================
FILE: portia_server/portia_api/jsonapi/__init__.py
================================================
from .response import JSONResponse
================================================
FILE: portia_server/portia_api/jsonapi/exceptions.py
================================================
from collections import OrderedDict
from uuid import uuid4
from rest_framework.exceptions import APIException, ValidationError
from rest_framework.status import (HTTP_400_BAD_REQUEST, HTTP_409_CONFLICT,
HTTP_404_NOT_FOUND)
from rest_framework.views import exception_handler
from .utils import get_status_title
class JsonApiValidationError(ValidationError):
    """Validation error whose detail is a JSON API ``errors`` document.

    :param detail: mapping whose optional ``'errors'`` list holds mappings
        with ``'detail'`` and ``'source'`` keys.
    """

    def __init__(self, detail):
        rendered = []
        for error in detail.get('errors', []):
            entry = OrderedDict()
            entry['status'] = self.status_code
            entry['title'] = get_status_title(self.status_code)
            entry['detail'] = error['detail']
            entry['source'] = error['source']
            rendered.append(entry)
        super(JsonApiValidationError, self).__init__({'errors': rendered})
def render_exception(status_code, detail):
    """Build a JSON API error document holding a single error object.

    :param status_code: status stored under ``status`` and used for the title.
    :param detail: value stored under ``detail``.
    :return: ``{'errors': [error]}`` where ``error`` is an ordered mapping
        with the keys ``id`` (a fresh UUID string), ``status``, ``title``
        and ``detail``.
    """
    error = OrderedDict()
    error['id'] = str(uuid4())
    error['status'] = status_code
    error['title'] = get_status_title(status_code)
    error['detail'] = detail
    return {'errors': [error]}
class JsonApiBadRequestError(APIException):
    """API exception reported with HTTP status 400."""
    status_code = HTTP_400_BAD_REQUEST
    default_detail = (u"The server cannot process the request due to invalid "
                      u"data.")


class JsonApiNotFoundError(APIException):
    """API exception reported with HTTP status 404."""
    status_code = HTTP_404_NOT_FOUND
    default_detail = u"Could not find the resource specified"


class JsonApiConflictError(APIException):
    """API exception reported with HTTP status 409."""
    status_code = HTTP_409_CONFLICT
    default_detail = u"The server cannot process the request due to a conflict."


class JsonApiFeatureNotAvailableError(JsonApiBadRequestError):
    """Bad-request error with a 'feature not available' default message."""
    default_detail = u"This feature is not available for your project."
class JsonApiGeneralException(APIException):
    """API exception whose status code is chosen per instance.

    :param detail: detail passed on to ``APIException``.
    :param status_code: status code for this instance; must not be None.
    """

    def __init__(self, detail=None, status_code=None):
        # NOTE(review): this check is an ``assert``, so it is skipped when
        # Python runs with assertions disabled (``-O``).
        assert status_code is not None
        self.status_code = status_code
        super(JsonApiGeneralException, self).__init__(detail)
def jsonapi_exception_handler(exc, context):
    """Exception handler that renders errors as JSON API documents.

    When the request was negotiated as ``application/vnd.api+json`` the
    exception's ``detail`` is replaced by the document built by
    ``render_exception``; the exception is then passed to the default
    ``exception_handler``.

    :param exc: the exception being handled.
    :param context: handler context; its ``'request'`` entry is inspected.
    :return: whatever ``exception_handler`` returns.
    """
    # ``accepted_media_type`` appears to be set on the request only after
    # content negotiation succeeds, so it can be missing when the error being
    # handled happened earlier (for example a failed negotiation). Reading it
    # with a default keeps the handler from raising AttributeError itself.
    request = context.get('request')
    accepts = getattr(request, 'accepted_media_type', None) or ''
    if accepts.startswith('application/vnd.api+json'):
        try:
            exc.detail = render_exception(exc.status_code, exc.detail)
        except AttributeError:
            pass  # Ignore django exceptions
    return exception_handler(exc, context)
================================================
FILE: portia_server/portia_api/jsonapi/parsers.py
================================================
from __future__ import unicode_literals
from rest_framework.parsers import JSONParser
class JSONApiParser(JSONParser):
    """JSON parser registered for the ``application/vnd.api+json`` media type."""
    media_type = 'application/vnd.api+json'
================================================
FILE: portia_server/portia_api/jsonapi/registry.py
================================================
from portia_orm.exceptions import ImproperlyConfigured
# ``__all__`` previously listed 'schema', a name this module never defines,
# which made ``from ... import *`` fail with AttributeError. Export the names
# that actually exist.
__all__ = [
    'schemas',
    'get_schema',
]

# Registry of schema classes keyed by schema type.
schemas = {}


def get_schema(schema_type):
    """Return the schema registered under ``schema_type``.

    :param schema_type: key to look up in ``schemas``.
    :raise ImproperlyConfigured: if no schema is registered under that key.
    """
    try:
        return schemas[schema_type]
    except KeyError:
        raise ImproperlyConfigured(
            u"No schema for type '{}' exists".format(schema_type))
================================================
FILE: portia_server/portia_api/jsonapi/relationships.py
================================================
from marshmallow_jsonapi.fields import Relationship as BaseRelationship
from portia_api.jsonapi.registry import get_schema
from portia_api.jsonapi.utils import (
TOP_LEVEL_OBJECT_ORDER, LINKS_OBJECT_ORDER, type_from_model_name,
order_dict, cached_property, cached_property_ignore_set)
class Relationship(BaseRelationship):
    """JSON API relationship field with ordered output and lazy schema lookup.

    Accepts an optional ``serializer`` keyword naming the schema class to use
    for the related resource instead of the one registered for ``type_``.
    """

    def __init__(self, **kwargs):
        self._serializer = kwargs.get('serializer')
        super(Relationship, self).__init__(**kwargs)

    @cached_property
    def schema(self):
        """Schema instance for the related resource, built from the root's maps."""
        schema_class = self._serializer or get_schema(self.type_)
        root = self.root
        return schema_class(
            fields_map=root.fields_map,
            exclude_map=root.exclude_map,
            include_data=root.include_map.get(self.name, []),
            include_data_map=root.include_data_map)

    @cached_property_ignore_set
    def include_resource_linkage(self):
        """True when this field's name is in the root's ``relationship_set``."""
        return self.name in self.root.relationship_set

    def get_related_url(self, obj):
        """Return ``related_url`` formatted with ``self=obj``, or None.

        None is returned when no template is set or when formatting raises
        AttributeError.
        """
        template = self.related_url
        if not template:
            return None
        try:
            return template.format(self=obj)
        except AttributeError:
            return None

    def get_self_url(self, obj):
        """Return ``self_url`` formatted with ``self=obj``, or None.

        None is returned when no template is set or when formatting raises
        AttributeError.
        """
        template = self.self_url
        if not template:
            return None
        try:
            return template.format(self=obj)
        except AttributeError:
            return None

    def get_resource_linkage(self, value):
        """Return the parent's resource linkage with keys in top-level order."""
        linkage = super(Relationship, self).get_resource_linkage(value)
        if not self.many:
            return order_dict(linkage, TOP_LEVEL_OBJECT_ORDER)
        return [order_dict(entry, TOP_LEVEL_OBJECT_ORDER)
                for entry in linkage]

    def _serialize(self, value, attr, obj):
        """Serialize via the parent, then order ``links`` and the top level."""
        serialized = super(Relationship, self)._serialize(value, attr, obj)
        if 'links' in serialized:
            serialized['links'] = order_dict(
                serialized['links'], LINKS_OBJECT_ORDER)
        return order_dict(serialized, TOP_LEVEL_OBJECT_ORDER)
class PolymorphicRelationship(Relationship):
    """Relationship whose related objects may be of different model classes.

    Each related object is serialized through a temporary ``Relationship``
    whose type is derived from that object's class name.
    """

    def __init__(self, **kwargs):
        super(PolymorphicRelationship, self).__init__(**kwargs)

    def _serialize(self, value, attr, obj):
        """Serialize ``value`` into a dict with optional ``links`` and ``data``.

        ``data`` is a list for to-many relationships and a single entry
        otherwise; it is only included when resource linkage or included data
        is requested.
        """
        # Treat the to-one case as a one-element list so both cases share the
        # loop below; the result is unwrapped again further down.
        if not self.many:
            value = [value]
        links = None
        result = []
        if value:
            for instance in value:
                # Build a throwaway to-one field typed after this instance.
                field = Relationship(
                    type_=type_from_model_name(instance.__class__.__name__),
                    id_field='pk',
                    self_url=self.self_url,
                    related_url=self.related_url,
                    many=False)
                field._add_to_schema(self.name, self.parent)
                field.include_data = self.include_data
                data = field._serialize(instance, attr, obj)
                # Links are taken from the first entry that provides them.
                if links is None and 'links' in data:
                    links = data['links']
                result.append(data.get('data'))
        else:
            # No related objects: serialize None through the parent so any
            # links it produces are still picked up.
            data = super(PolymorphicRelationship, self)._serialize(
                None, attr, obj)
            if links is None and 'links' in data:
                links = data['links']
        if not self.many:
            result = result[0]
        data = {}
        if links is not None:
            data['links'] = order_dict(links, LINKS_OBJECT_ORDER)
        if self.include_resource_linkage or self.include_data:
            data['data'] = result
        return order_dict(data, TOP_LEVEL_OBJECT_ORDER)

    def _deserialize(self, value, attr, data):
        """Return ``value`` unchanged."""
        return value
================================================
FILE: portia_server/portia_api/jsonapi/renderers.py
================================================
from __future__ import unicode_literals
from rest_framework.renderers import JSONRenderer as BaseJSONRenderer
class JSONRenderer(BaseJSONRenderer):
    """JSON renderer that indents its output by default."""

    # Indent used when the parent renderer does not select one.
    default_indent = 2

    def get_indent(self, accepted_media_type, renderer_context):
        """Return the parent's indent, or ``default_indent`` if it is None."""
        chosen = super(JSONRenderer, self).get_indent(
            accepted_media_type, renderer_context)
        return self.default_indent if chosen is None else chosen
class JSONApiRenderer(JSONRenderer):
    """Renderer for ``application/vnd.api+json`` responses."""

    media_type = 'application/vnd.api+json'

    def render(self, data, accepted_media_type=None, renderer_context=None):
        """Render ``data`` and set the response's ``Content-Type`` header.

        When ``data['links']['profile']`` is present and non-empty, its
        entries are appended to the header as a space-separated ``profile``
        parameter.
        """
        rendered = super(JSONApiRenderer, self).render(
            data, accepted_media_type, renderer_context)
        response = renderer_context['response']
        try:
            profiles = data['links']['profile']
        except (TypeError, KeyError):
            # ``data`` is not subscriptable by key, or carries no profiles.
            profiles = []
        header = self.media_type
        if profiles:
            header = '{}; profile="{}"'.format(header, ' '.join(profiles))
        response['Content-Type'] = header
        return rendered
================================================
FILE: portia_server/portia_api/jsonapi/response.py
================================================
from django.http import HttpResponse
from rest_framework.renderers import JSONRenderer
class JSONResponse(HttpResponse):
    """
    An HttpResponse that renders its content into JSON.
    """

    def __init__(self, data, **kwargs):
        """Render ``data`` as JSON and force the ``application/json`` type.

        Any ``content_type`` supplied by the caller is overridden.
        """
        body = JSONRenderer().render(data)
        kwargs.update(content_type='application/json')
        super(JSONResponse, self).__init__(body, **kwargs)
================================================
FILE: portia_server/portia_api/jsonapi/serializers.py
================================================
from collections import defaultdict, OrderedDict
from functools import partial
from itertools import chain
from operator import itemgetter
from marshmallow import pre_dump, post_dump, ValidationError
from marshmallow.fields import Method
from marshmallow.schema import SchemaMeta
from marshmallow_jsonapi import Schema as BaseSchema, SchemaOpts
from marshmallow_jsonapi.exceptions import IncorrectTypeError
from six import iteritems, iterkeys, string_types, with_metaclass
from six.moves import map, zip
from portia_api.jsonapi.registry import schemas, get_schema
from portia_api.jsonapi.relationships import (
Relationship, PolymorphicRelationship)
from portia_api.jsonapi.utils import (
RESOURCE_OBJECT_ORDER, TOP_LEVEL_OBJECT_ORDER, cached_property,
camel_case_to_dashes, deep_getattr, dasherize, order_dict,
should_include_field, type_from_model_name)
from portia_orm.base import AUTO_PK, Model
from portia_orm.exceptions import ImproperlyConfigured
from portia_orm.fields import Field as OrmField
from portia_orm.relationships import BaseRelationship, HasMany
# Names exported by ``from ... import *``.
__all__ = [
    'JsonApiSerializer',
    'JsonApiPolymorphicSerializer',
]

# Profile URLs for the "deleted" and "updates" JSON API extensions.
DELETED_PROFILE = 'https://portia.scrapinghub.com/jsonapi/extensions/deleted'
UPDATES_PROFILE = 'https://portia.scrapinghub.com/jsonapi/extensions/updates'
# Short aliases for the two profiles above.
DELETED_PROFILE_ALIAS = 'deleted'
UPDATES_PROFILE_ALIAS = 'updates'
class JsonApiSerializerMeta(SchemaMeta):
    """Meta class for JSON API schemas.

    For every class that has a ``JsonApiSerializerMeta`` parent, schema fields
    are generated from ``Meta.model`` and the resulting class is added to the
    ``schemas`` registry.
    """

    def __new__(mcs, name, bases, attrs):
        # The base serializer itself (no serializer parents) is created as-is.
        parents = [b for b in bases if isinstance(b, JsonApiSerializerMeta)]
        if not parents:
            return super(JsonApiSerializerMeta, mcs).__new__(
                mcs, name, bases, attrs)

        meta = attrs.pop('Meta', None)
        try:
            model = meta.model
        except AttributeError:
            # Also reached when no ``Meta`` was declared (``meta`` is None).
            raise TypeError(
                u"Class '{}' is missing the 'Meta.model' attribute.".format(
                    name))
        schema_type = type_from_model_name(model.__name__)
        # Rebuild ``Meta`` on top of the declared one, adding the schema type
        # and the key inflection function.
        meta_bases = (meta, object) if meta else (object,)
        schema_attrs = {
            'Meta': type('Meta', meta_bases, {
                'type_': schema_type,
                'inflect': dasherize,
            })
        }
        # Optional per-relationship link templates declared on ``Meta``.
        links = getattr(meta, 'links', {})
        for attrname, field in iteritems(model._fields):
            if isinstance(field, OrmField):
                # Shallow-copy the ORM field without running its __init__ and
                # clear the load/dump key overrides on the copy.
                field_copy = object.__new__(field.__class__)
                field_copy.__dict__ = dict(field.__dict__)
                field_copy.load_from = None
                field_copy.dump_to = None
                schema_attrs[attrname] = field_copy
            elif isinstance(field, BaseRelationship):
                rel_links = links.get(attrname, {})
                rel_many = isinstance(field, HasMany)
                rel_options = {
                    'self_url': rel_links.get('self', ''),
                    'related_url': rel_links.get('related', ''),
                    'many': rel_many
                }
                # To-one relationships may be empty.
                if not rel_many:
                    rel_options['allow_none'] = True
                if field.polymorphic:
                    schema_attrs[attrname] = PolymorphicRelationship(
                        **rel_options)
                else:
                    schema_attrs[attrname] = Relationship(
                        type_=type_from_model_name(field.model.__name__),
                        id_field='pk',
                        serializer=rel_links.get('serializer'),
                        **rel_options)
        # Expose the model's primary key as ``id`` when the model has no
        # field of that name.
        if 'id' not in schema_attrs:
            pk_field = model._fields[model._pk_field]
            schema_attrs['id'] = type(pk_field)(attribute=model._pk_field)
        # we need to access the serialized object to generate the url, but
        # get_resource_links takes the serialized item, so we add a method field
        # to do the work
        schema_attrs['_url'] = Method('get_url')
        attrs.update(schema_attrs)
        cls = super(JsonApiSerializerMeta, mcs).__new__(mcs, name, bases, attrs)
        # add new schema to registry by type
        # A class not named "<Model>Serializer" is registered under its own
        # dashed class name instead of the model's schema type.
        is_custom = name.replace('Serializer', '') != model.__name__
        key = camel_case_to_dashes(name) if is_custom else schema_type
        schemas[key] = cls
        return cls
class JsonApiSerializerOpts(SchemaOpts):
    """Schema options that validate the serializer's ``Meta`` settings.

    Reads ``model``, ``url`` and ``default_kwargs`` from ``meta`` and forces
    ``strict`` on. The base schema's own ``Meta`` is left untouched.
    """

    def __init__(self, meta):
        """Validate and store the options declared on ``meta``.

        :raises ValueError: if ``model`` is not an orm ``Model`` subclass,
            ``url`` is not a string or ``default_kwargs`` is not a dict.
        """
        super(JsonApiSerializerOpts, self).__init__(meta)
        if meta is BaseSchema.Meta:
            return
        self.strict = True
        # the model from which the Schema was created, required
        self.model = getattr(meta, 'model', None)
        try:
            is_model = issubclass(self.model, Model)
        except TypeError:
            # issubclass() rejects non-classes (e.g. a missing model, None);
            # report that through the same ValueError as any other bad value
            is_model = False
        if not is_model:
            raise ValueError("'model' option must be a orm.Model.")
        # url for an object instance
        self.url = getattr(meta, 'url', None)
        if not isinstance(self.url, string_types):
            raise ValueError("'url' option must be a string.")
        # default context for serialization
        self.default_kwargs = getattr(meta, 'default_kwargs', {})
        if not isinstance(self.default_kwargs, dict):
            raise ValueError("'default_kwargs' option must be a dictionary.")
class JsonApiSerializer(with_metaclass(JsonApiSerializerMeta, BaseSchema)):
OPTIONS_CLASS = JsonApiSerializerOpts
def __init__(self, instance=None, data=None, storage=None, only=(),
             **kwargs):
    """Set up field selection, includes and storage for one serializer.

    :param instance: object (or collection) to serialize or update.
    :param data: incoming document to validate, kept as ``initial_data``.
    :param storage: storage to use; defaults to ``instance.storage`` when
        ``instance`` is a ``Model``.
    :param only: explicit field whitelist; when empty it is computed from
        the ``fields_map``/``exclude_map`` entries for this type.
    :param kwargs: merged over ``Meta.default_kwargs``; ``fields_map``,
        ``exclude_map``, ``include_data_map``, ``current_url``,
        ``include_data`` and ``ordering`` are consumed here, the rest is
        passed to the parent schema.
    """
    default_kwargs = dict(self.opts.default_kwargs)
    # per-type maps: caller-supplied entries override the Meta defaults
    fields_map = dict(default_kwargs.pop('fields_map', {}),
                      **kwargs.pop('fields_map', {}))
    exclude_map = dict(default_kwargs.pop('exclude_map', {}),
                       **kwargs.pop('exclude_map', {}))
    include_data_map = dict(default_kwargs.pop('include_data_map', {}),
                            **kwargs.pop('include_data_map', {}))
    kwargs = dict(default_kwargs, **kwargs)
    self.current_url = kwargs.pop('current_url', None)
    self.include = kwargs.pop('include_data', [])
    self.ordering = kwargs.pop('ordering', [])
    self.fields_map = fields_map
    self.exclude_map = exclude_map
    self.include_data_map = include_data_map
    type_ = self.opts.type_
    model = self.opts.model
    # Split dotted include paths: the first segment is a relationship of
    # this type, the remainder (if any) is recorded under that relationship.
    self.include_map = include_map = defaultdict(list)
    for include in chain(self.include,
                         self.include_data_map.get(type_, [])):
        parts = include.split('.', 1)
        prefix_map = include_map[parts[0]]
        if len(parts) == 2:
            prefix_map.append(parts[1])
    field_names = model._field_names
    relationship_names = model._relationship_names
    fields = fields_map.get(type_)
    exclude = exclude_map.get(type_)
    if not only:
        # `x and set(x)` keeps None (no filter) distinct from an empty
        # selection, which should_include_field treats as "nothing"
        f_set = fields and set(fields)
        e_set = exclude and set(exclude)
        self.field_set = field_set = set()
        self.relationship_set = relationship_set = set()
        for name in field_names:
            if should_include_field(self.inflect(name),
                                    include=f_set, exclude=e_set):
                field_set.add(name)
        for name in relationship_names:
            if should_include_field(self.inflect(name),
                                    include=f_set, exclude=e_set):
                relationship_set.add(name)
        only = list(field_set | {'id'})
        for name in relationship_names:
            relationship = self._declared_fields[name]
            # relationships that have a url are kept even when filtered out
            if (name in relationship_set or relationship.related_url or
                    relationship.self_url):
                only.append(name)
        only.append('_url')
    # key orders later passed to order_dict by format_item: requested
    # fields first, then the model's declared order
    self.field_order = list(
        chain(fields or [], map(self.inflect, field_names)))
    self.relationship_order = list(
        chain(fields or [], map(self.inflect, relationship_names)))
    only_set = set(only)
    # only include data for relationships that are actually serialized
    kwargs['include_data'] = tuple(
        k for k in iterkeys(self.include_map) if k in only_set)
    super(JsonApiSerializer, self).__init__(only=only, **kwargs)
    self.instance = instance
    self.initial_data = data
    if storage:
        self.storage = storage
    elif isinstance(instance, Model):
        self.storage = instance.storage
    else:
        self.storage = None
    # instances removed / changed while saving; filled in by save/delete
    # and read by format_json_api_response
    self.deleted = []
    self.updated = []
@property
def data(self):
    """Return the ``data`` attribute of dumping ``self.instance``."""
    dumped = self.dump(self.instance)
    return dumped.data
@cached_property
def errors(self):
    """Validation errors, running ``is_valid()`` first if it has not run."""
    already_validated = hasattr(self, '_errors')
    if not already_validated:
        self.is_valid()
    return self._errors
@cached_property
def validated_data(self):
    """Validated payload, running ``is_valid()`` first if it has not run."""
    already_validated = hasattr(self, '_errors')
    if not already_validated:
        self.is_valid()
    return self._validated_data
@cached_property
def validated_profile_updates_data(self):
    """Validated nested updates, running ``is_valid()`` first if needed."""
    already_validated = hasattr(self, '_errors')
    if not already_validated:
        self.is_valid()
    return self._validated_profile_updates_data
def deserialize_related_model(self, model, id_):
    """Build a ``model`` reference for primary key ``id_``.

    Returns ``None`` when ``id_`` is ``None``; otherwise instantiates
    ``model`` on this serializer's storage with only its primary key set.
    """
    if id_ is not None:
        pk_kwargs = {model._pk_field: id_}
        return model(self.storage, **pk_kwargs)
    return None
def update(self, instance, validated_data):
    """Apply ``validated_data`` to ``instance`` and save the changed fields.

    Attributes are processed in the model's ``_ordered_fields`` order; keys
    that are neither fields nor relationships of the model are ignored.
    ``ValidationError``s raised while assigning are collected per attribute
    and re-raised together before anything is saved.
    """
    model = instance.__class__
    errors = {}
    changed = []
    for attrname in model._ordered_fields:
        if attrname not in validated_data:
            continue
        try:
            value = validated_data[attrname]
            if attrname in model._field_names:
                new_value = value
            elif attrname in model._relationship_names:
                # read in existing value to populate data stores
                getattr(instance, attrname)
                related_model = model._fields[attrname].model
                if isinstance(value, list):
                    new_value = [
                        self.deserialize_related_model(related_model, v)
                        for v in value]
                else:
                    new_value = self.deserialize_related_model(
                        related_model, value)
            else:
                continue
            setattr(instance, attrname, new_value)
            changed.append(attrname)
        except ValidationError as err:
            errors[attrname] = err.messages
    if errors:
        raise ValidationError(errors)
    instance.save(only=changed)
    return instance
def create(self, validated_data):
    """Create and save a new model instance from ``validated_data``.

    Relationship values are replaced by related-model references; every
    other key is passed to the model constructor unchanged. The primary key
    is set to ``AUTO_PK``.
    """
    model = self.opts.model
    attributes = {model._pk_field: AUTO_PK}
    for attrname, value in iteritems(validated_data):
        if attrname in model._relationship_names:
            relationship = model._fields[attrname]
            related_model = relationship.model
            related_name = relationship.related_name
            if isinstance(value, list):
                value = [self.deserialize_related_model(related_model, v)
                         for v in value]
            else:
                value = self.deserialize_related_model(related_model, value)
            # read in existing values to populate data stores, for unique
            # key generation
            if value is not None:
                related = value if isinstance(value, list) else [value]
                for v in related:
                    getattr(v, related_name)
        attributes[attrname] = value
    instance = model(self.storage, **attributes)
    instance.save()
    return instance
def apply_profile_updates(self, validated_data, serializers):
    """Save each nested update through its own serializer.

    ``validated_data`` and ``serializers`` are parallel sequences. Each
    successfully saved instance is recorded in ``self.updated``, together
    with whatever the nested serializer updated or deleted. Validation
    failures are merged and raised once all updates have been attempted.
    """
    failures = {}
    for payload, serializer in zip(validated_data, serializers):
        id_ = payload['id']
        target = self.deserialize_related_model(serializer.opts.model, id_)
        serializer.instance = target
        try:
            serializer.save()
        except ValidationError as err:
            failures.update(err.messages)
        else:
            self.updated.append(target)
            self.updated.extend(serializer.updated)
            self.deleted.extend(serializer.deleted)
    if failures:
        raise ValidationError(failures)
def save(self):
    """Validate if needed, then create or update the instance.

    Nested profile updates are applied even when the main create/update
    failed validation; all collected errors are formatted and raised as a
    single ``ValidationError``. Returns the saved instance.
    """
    if not hasattr(self, '_errors'):
        self.is_valid(raise_exception=True)
    payload = self.validated_data
    profile_updates = self.validated_profile_updates_data
    failures = {}
    try:
        if self.instance is None:
            self.instance = self.create(payload)
        else:
            self.instance = self.update(self.instance, payload)
    except ValidationError as err:
        failures.update(err.messages)
    if profile_updates:
        try:
            self.apply_profile_updates(profile_updates,
                                       self._profile_updates_serializers)
        except ValidationError as err:
            failures.update(err.messages)
    if failures:
        raise ValidationError(
            self.format_errors(failures, many=self.many))
    return self.instance
def delete(self):
    """Apply any nested profile updates, then delete the instance.

    Whatever ``instance.delete()`` returns is recorded in ``self.deleted``
    and ``self.instance`` is replaced by its 'working' snapshot view.
    """
    updates, serializers = self.load_profile_updates()
    self._validated_profile_updates_data = updates
    self._profile_updates_serializers = serializers
    if updates:
        self.apply_profile_updates(updates, serializers)
    self.deleted.extend(self.instance.delete())
    self.instance = self.instance.with_snapshots(('working',))
def is_valid(self, raise_exception=False):
    """Validate ``initial_data`` and any nested profile updates.

    On success stores the validated payloads, sets ``_errors`` to ``{}``
    and returns True. On failure stores the error list in ``_errors``,
    resets the validated payloads and either returns False or, when
    ``raise_exception`` is true, raises a ``ValidationError`` whose
    ``messages`` is ``{'errors': [...]}``.
    """
    collected = []
    try:
        loaded = self.load(self.initial_data)
        self._validated_data = loaded.data
    except (ValidationError, IncorrectTypeError) as err:
        collected.extend(err.messages.get('errors', []))
    try:
        updates, serializers = self.load_profile_updates()
        self._validated_profile_updates_data = updates
        self._profile_updates_serializers = serializers
    except (ValidationError, IncorrectTypeError) as err:
        collected.extend(err.messages.get('errors', []))
    if not collected:
        self._errors = {}
        return True
    self._validated_data = {}
    self._validated_profile_updates_data = []
    self._errors = collected
    if not raise_exception:
        return False
    failure = ValidationError(u'Invalid data.')
    failure.messages = {
        'errors': collected
    }
    raise failure
def load_profile_updates(self):
    """Validate the nested updates carried in the document's ``meta``.

    Nested updates are only read when ``UPDATES_PROFILE`` is listed under
    ``links.profile`` and an entry in ``aliases`` maps to it; the updates
    are then taken from ``meta[<alias>]``.

    :returns: ``(validated_data, serializers)`` as parallel lists, or
        ``([], [])`` when the document carries no updates profile.
    :raises ValidationError: with ``messages == {'errors': [...]}`` when
        any update has no ``type``, an unknown type or invalid data. Error
        pointers are prefixed with ``/meta/<alias>/<index>``.
    """
    # A serializer built without data (e.g. for delete()) has no document.
    initial_data = self.initial_data or {}
    if UPDATES_PROFILE not in initial_data.get(
            'links', {}).get('profile', []):
        return [], []
    for alias, profile in iteritems(initial_data.get('aliases', {})):
        if profile == UPDATES_PROFILE:
            break
    else:
        # profile is declared but no alias points at it
        return [], []
    errors = []
    validated_data = []
    profile_serializers = []
    for i, update in enumerate(
            initial_data.get('meta', {}).get(alias, [])):
        if 'type' not in update:
            errors.append({
                'detail': '`data` object must include `type` key.',
                'source': {
                    'pointer': '/meta/{}/{}/data'.format(alias, i)
                }
            })
            continue
        type_ = update['type']
        data = {
            'data': update,
        }
        try:
            serializer_class = get_schema(type_)
        except ImproperlyConfigured:
            errors.append({
                'detail': 'Invalid type: {}.'.format(type_),
                'source': {
                    'pointer': '/meta/{}/{}/data/type'.format(alias, i),
                },
            })
            continue
        # every field except the id is optional in a nested update
        serializer = serializer_class(
            data=data,
            partial=set(
                serializer_class.opts.model._ordered_fields).difference(
                    {'id'}))
        try:
            # Reading `validated_data` alone validates without raising and
            # yields {} for bad input, so validate explicitly here to
            # surface the nested errors.
            serializer.is_valid(raise_exception=True)
        except (ValidationError, IncorrectTypeError) as err:
            errors.extend({
                'detail': error['detail'],
                'source': {
                    'pointer': '/meta/{}/{}{}'.format(
                        alias, i, error['source']['pointer'])
                },
            } for error in err.messages.get('errors', []))
            continue
        validated_data.append(serializer.validated_data)
        profile_serializers.append(serializer)
    if errors:
        err = ValidationError(u'Invalid data for updates.')
        err.messages = {
            'errors': errors
        }
        raise err
    return validated_data, profile_serializers
@pre_dump(pass_many=True)
def sort_collection(self, models, many):
    """Sort a collection by ``self.ordering`` before it is dumped.

    Each ordering key is a (possibly dotted) attribute path; a leading
    ``-`` sorts that key in descending order. Keys are applied from last
    to first so that, the sort being stable, the first key has the highest
    precedence. Single objects and empty orderings are returned unchanged.
    """
    ordering = self.ordering
    if not many or not ordering:
        return models
    for key in reversed(ordering):
        # startswith() rather than key[0] so an empty key cannot raise
        reverse = key.startswith('-')
        if reverse:
            key = key[1:]
        models = sorted(models, key=partial(deep_getattr, key=key),
                        reverse=reverse)
    return models
@post_dump(pass_many=True)
def format_json_api_response(self, data, many):
    """Build the top-level document, adding updated/deleted side effects.

    Instances recorded in ``self.updated`` are added to the included data;
    references to updated and deleted instances are attached under the
    updates/deleted profiles. When the serialized instance itself was
    deleted, the ``data`` member is dropped.
    """
    updated_set = set()
    deleted_set = set()
    updated = []
    deleted = []
    # de-duplicate by data_key, keeping first occurrences in order
    for instance in self.deleted:
        if instance.data_key in deleted_set:
            continue
        deleted_set.add(instance.data_key)
        # the primary instance is tracked in deleted_set but not listed
        if instance != self.instance:
            deleted.append(instance)
    for instance in self.updated:
        # something that ended up deleted is not reported as updated
        if (instance.data_key in updated_set or
                instance.data_key in deleted_set):
            continue
        updated_set.add(instance.data_key)
        updated.append(instance)
    # must run before the parent builds the response so that updated
    # instances are part of the included data
    self.add_includes(updated)
    response = super(JsonApiSerializer, self).format_json_api_response(
        data, many)
    if 'included' in response:
        response['included'].sort(key=itemgetter('type', 'id'))
    deleted = self.format_profile_references(deleted)
    if deleted:
        self.add_profile_to_response(DELETED_PROFILE, DELETED_PROFILE_ALIAS,
                                     deleted, response)
    updated = self.format_profile_references(updated)
    if updated:
        self.add_profile_to_response(UPDATES_PROFILE, UPDATES_PROFILE_ALIAS,
                                     updated, response)
    if (isinstance(self.instance, Model) and
            self.instance.data_key in deleted_set):
        # the primary resource no longer exists
        response.pop('data', None)
        if not response.get('meta'):
            # nothing else to report: empty document
            return {}
    return order_dict(response, TOP_LEVEL_OBJECT_ORDER)
def format_item(self, item):
    """Format one resource object with keys in a fixed order.

    Removes the helper ``-url`` attribute, drops ``attributes`` entirely
    when nothing else is left in it, and orders attributes, relationships
    and the resource object itself.
    """
    resource = super(JsonApiSerializer, self).format_item(item)
    if 'attributes' in resource:
        attrs = resource.pop('attributes')
        attrs.pop('-url', None)  # super call adds this
        if attrs:
            resource['attributes'] = order_dict(attrs, self.field_order)
    if 'relationships' in resource:
        rels = resource['relationships']
        resource['relationships'] = order_dict(rels,
                                               self.relationship_order)
    return order_dict(resource, RESOURCE_OBJECT_ORDER)
def get_top_level_links(self, data, many):
    """Return a ``self`` link for ``current_url``, or None if it is unset."""
    if not self.current_url:
        return None
    return OrderedDict([('self', self.current_url)])
def get_resource_links(self, item):
    """Return a ``self`` link from the item's ``_url``, or None without one."""
    url = item.get('_url')
    if not url:
        return None
    return OrderedDict([('self', url)])
def get_url(self, obj):
    """Format the ``Meta.url`` template with ``obj`` available as ``self``."""
    template = self.opts.url
    return template.format(self=obj)
def add_includes(self, includes):
    """Serialize ``includes`` and merge them into ``self.included_data``.

    Each instance is dumped with the serializer registered for its type,
    using this serializer's field and exclude maps; the nested serializer's
    own included data is merged in as well.
    """
    target = self.included_data
    for instance in includes:
        schema = get_schema(type_from_model_name(instance.__class__.__name__))
        serializer = schema(
            instance,
            fields_map=self.fields_map,
            exclude_map=self.exclude_map)
        resource = serializer.data['data']
        target[(resource['type'], resource['id'])] = resource
        target.update(serializer.included_data)
def format_profile_references(self, instances):
    """Return id-only resource objects for ``instances``.

    Each instance is serialized from its 'working' snapshot with only the
    ``id`` field; instances that serialize to empty data are left out.
    """
    references = []
    for instance in instances:
        schema = get_schema(type_from_model_name(instance.__class__.__name__))
        working = instance.with_snapshots(('working',))
        reference = schema(working, only=('id',)).data.get('data', {})
        if reference:
            references.append(reference)
    return references
def add_profile_to_response(self, profile, alias, data, response):
    """Register ``profile`` under ``alias`` in ``response`` with its data.

    Mutates ``response`` in place: sets ``aliases[alias]``, appends the
    profile to ``links.profile`` and stores ``data`` in ``meta[alias]``.
    """
    aliases = response.setdefault('aliases', {})
    aliases[alias] = profile
    links = response.setdefault('links', {})
    links.setdefault('profile', []).append(profile)
    meta = response.setdefault('meta', {})
    meta[alias] = data
class JsonApiPolymorphicSerializer(object):
    """Dispatch to the right serializer for models sharing a base class.

    With ``many=False`` construction returns an instance of the concrete
    serializer registered for the instance's (or payload's) type, falling
    back to ``default_model``'s serializer when that type's model is not a
    subclass of ``base``. With ``many=True`` an instance of this class is
    returned; it can only serialize a collection through ``data``.
    """

    def __new__(cls, base, default_model, instance=None, data=None, many=False,
                **kwargs):
        """Return a concrete serializer (single) or a polymorphic one (many).

        :raises ValueError: if ``many`` is true and ``data`` is given.
        """
        if not many:
            # in the single instance case return the correct serializer by type
            # or the default_model's serializer if type is not valid for base
            type_ = None
            if instance:
                type_ = type_from_model_name(instance.__class__.__name__)
            elif data:
                type_ = data.get('data', {}).get('type')
            if type_:
                serializer_class = get_schema(type_)
                if not issubclass(serializer_class.opts.model, base):
                    type_ = None
            if not type_:
                type_ = type_from_model_name(default_model.__name__)
                serializer_class = get_schema(type_)
            return serializer_class(
                instance=instance, data=data, many=many, **kwargs)
        if data:
            raise ValueError(
                u"You can only use a JsonApiPolymorphicSerializer with "
                u"many=True for serializing a ModelCollection")
        # object.__new__ accepts no extra arguments once __new__ is
        # overridden (TypeError on Python 3); __init__ still receives the
        # original call arguments.
        return super(JsonApiPolymorphicSerializer, cls).__new__(cls)

    def __init__(self, base, default_model, instance=None, data=None,
                 many=False, **kwargs):
        """Store the collection and the kwargs for the per-item serializers."""
        # this is only used for serializing a ModelCollection
        self.base = base
        self.default_model = default_model
        self.collection = instance
        self.kwargs = kwargs

    @property
    def data(self):
        """Serialize the collection, one item at a time, into one document.

        Each item uses the serializer for its own type (or the default one
        when its model is not a subclass of ``base``). The first ``links``
        seen is used for the document; included resources are de-duplicated
        by ``(type, id)`` and sorted.
        """
        links = None
        data = []
        included = []
        included_set = set()
        default_type = type_from_model_name(self.default_model.__name__)
        default_serializer = get_schema(default_type)
        for instance in self.collection:
            type_ = type_from_model_name(instance.__class__.__name__)
            serializer_class = get_schema(type_)
            if not issubclass(serializer_class.opts.model, self.base):
                serializer_class = default_serializer
            serialized = serializer_class(instance=instance, **self.kwargs).data
            if not links and 'links' in serialized:
                links = serialized['links']
            data.append(serialized['data'])
            for include in serialized.get('included', []):
                key = (include['type'], include['id'])
                if key not in included_set:
                    included.append(include)
                    included_set.add(key)
        response = {
            'data': data
        }
        if links:
            response['links'] = links
        if included:
            included.sort(key=itemgetter('type', 'id'))
            response['included'] = included
        return order_dict(response, TOP_LEVEL_OBJECT_ORDER)
================================================
FILE: portia_server/portia_api/jsonapi/utils.py
================================================
from collections import defaultdict, OrderedDict
from django.utils.text import camel_case_to_spaces
from requests.status_codes import _codes as status_codes
from six import iteritems
from six.moves import reduce
from portia_orm.utils import cached_property, cached_property_ignore_set
# Names exported by `from .utils import *`.
# NOTE(review): camel_case_to_dashes and get_status_title are defined in this
# module but not listed here — confirm whether that is intentional.
__all__ = [
    'cached_property',
    'cached_property_ignore_set',
    'dasherize',
    'deep_getattr',
    'order_dict',
    'should_include_field',
    'type_from_model_name',
    'LINKS_OBJECT_ORDER',
    'RESOURCE_OBJECT_ORDER',
    'TOP_LEVEL_OBJECT_ORDER',
]
# Key orders intended for use with order_dict(); keys not listed sort last.
# Member order of a top-level JSON API document.
TOP_LEVEL_OBJECT_ORDER = (
    'jsonapi',
    'aliases',
    'links',
    'data',
    'errors',
    'included',
    'meta',
)
# Member order of a resource object.
RESOURCE_OBJECT_ORDER = (
    'type',
    'id',
    'links',
    'attributes',
    'relationships',
    'meta',
)
# Member order of a links object.
LINKS_OBJECT_ORDER = (
    'self',
    'related',
    'profile',
)
def camel_case_to_dashes(value):
    """Turn the output of ``camel_case_to_spaces`` into a dashed string."""
    spaced = camel_case_to_spaces(value)
    return spaced.replace(' ', '-')
def dasherize(value):
    """Return ``str(value)`` with every underscore replaced by a dash."""
    text = str(value)
    return '-'.join(text.split('_'))
def type_from_model_name(value):
    """Return the resource type for a model name: dashed form plus 's'."""
    dashed = camel_case_to_dashes(value)
    return '{}s'.format(dashed)
def deep_getattr(obj, key):
    """Follow the dotted attribute path ``key`` starting at ``obj``.

    Returns None as soon as any step raises AttributeError.
    """
    try:
        current = obj
        for part in key.split('.'):
            current = getattr(current, part)
    except AttributeError:
        return None
    return current
def should_include_field(field, include, exclude):
    """Decide whether ``field`` passes the include/exclude filters.

    ``include`` wins when it is not None; otherwise ``exclude`` is applied
    when it is not None; with neither set every field passes.
    """
    if include is None:
        return True if exclude is None else field not in exclude
    return field in include
def order_dict(data, ordered_keys, key_map_cache={}):
    """Return ``data`` as an OrderedDict sorted by ``ordered_keys``.

    Keys found in ``ordered_keys`` come first, in that order (the first
    occurrence counts for duplicates). All other keys follow in their
    original iteration order, because the sort is stable.

    :param data: mapping to reorder.
    :param ordered_keys: iterable giving the preferred key order.
    :param key_map_cache: memo of key -> position maps. The mutable default
        is deliberate: it is shared across calls for hashable
        ``ordered_keys`` such as the module's ``*_ORDER`` tuples.
        Unhashable orderings (lists) are simply not cached.
    """
    can_cache = True
    try:
        key_map = key_map_cache.get(ordered_keys)
    except TypeError:
        # unhashable ordered_keys
        key_map = None
        can_cache = False
    if key_map is None:
        # A plain dict, not a defaultdict: looking up an unknown key must
        # not insert it, or the shared cache would grow with every key seen.
        key_map = {}
        for i, k in enumerate(ordered_keys):
            key_map.setdefault(k, i)
        if can_cache:
            key_map_cache[ordered_keys] = key_map
    last = float('inf')
    return OrderedDict(
        sorted(data.items(), key=lambda kv: key_map.get(kv[0], last)))
def get_status_title(status_code):
    """Return a title-cased name for ``status_code``.

    Uses the first name registered for the code in ``status_codes``.
    """
    primary_name = status_codes[status_code][0]
    return primary_name.replace('_', ' ').title()
================================================
FILE: portia_server/portia_api/resources/__init__.py
================================================
================================================
FILE: portia_server/portia_api/resources/annotations.py
================================================
from .projects import BaseProjectModelRoute
from ..jsonapi.utils import cached_property
from portia_orm.models import BaseAnnotation, Annotation
class AnnotationRoute(BaseProjectModelRoute):
    """Route for the annotations of one sample."""

    lookup_url_kwarg = 'annotation_id'
    default_model = Annotation
    polymorphic = BaseAnnotation

    @cached_property
    def sample(self):
        """The sample addressed by the ``spider_id``/``sample_id`` kwargs."""
        spider = self.project.spiders[self.kwargs.get('spider_id')]
        return spider.samples[self.kwargs.get('sample_id')]

    def perform_create(self, serializer):
        """Preload the sample's children, then create as usual."""
        self.sample.ordered_children  # preload items and annotations
        return super(AnnotationRoute, self).perform_create(serializer)

    def get_instance(self):
        """Look up the annotation named by ``annotation_id``."""
        collection = self.get_collection()
        return collection[self.kwargs.get('annotation_id')]

    def get_collection(self):
        """Return the sample's ordered children after preloading."""
        project = self.project
        # preload schemas and fields, then extractors
        for relation in ('schemas', 'extractors'):
            getattr(project, relation)
        return self.sample.ordered_children

    def get_detail_kwargs(self):
        """Serializer kwargs naming the related data to include."""
        item_includes = [
            'schema.fields',
            'annotations',
        ]
        annotation_includes = [
            'field.schema.fields',
            'extractors',
        ]
        return {
            'include_data_map': {
                'items': item_includes,
                'annotations': annotation_includes,
            },
        }
================================================
FILE: portia_server/portia_api/resources/extractors.py
================================================
from .projects import BaseProjectModelRoute
from portia_orm.models import Extractor
class ExtractorRoute(BaseProjectModelRoute):
    """Route for a project's extractors."""

    lookup_url_kwarg = 'extractor_id'
    default_model = Extractor

    def get_instance(self):
        """Look up the extractor named by ``extractor_id``."""
        extractors = self.get_collection()
        return extractors[self.kwargs.get('extractor_id')]

    def get_collection(self):
        """Return the project's extractors."""
        project = self.project
        return project.extractors
================================================
FILE: portia_server/portia_api/resources/fields.py
================================================
from portia_orm.models import Field
from .projects import BaseProjectModelRoute
from ..jsonapi.exceptions import JsonApiBadRequestError
class FieldRoute(BaseProjectModelRoute):
    """Route for the fields of one schema.

    A field cannot be deleted while annotations still reference it.
    """

    lookup_url_kwarg = 'field_id'
    default_model = Field

    def perform_create(self, serializer):
        """Preload the project's schemas, then create as usual."""
        self.project.schemas  # preload schemas
        # hand back the parent's result, as the other routes in this
        # package do
        return super(FieldRoute, self).perform_create(serializer)

    def get_instance(self):
        """Look up the field named by ``field_id``."""
        return self.get_collection()[self.kwargs.get('field_id')]

    def get_collection(self):
        """Return the fields of the schema named by ``schema_id``."""
        return self.project.schemas[self.kwargs.get('schema_id')].fields

    def destroy(self, *args, **kwargs):
        """Delete the field unless it still has annotations.

        :raises JsonApiBadRequestError: if any annotation uses the field.
        """
        annotation_count = self._annotation_count
        if annotation_count > 0:
            raise JsonApiBadRequestError(self._destroy_error(annotation_count))
        return super(FieldRoute, self).destroy(*args, **kwargs)

    def _destroy_error(self, annotation_count):
        """Build the message explaining why the field cannot be deleted."""
        field = self.get_instance()
        annotation = 'annotation' if annotation_count == 1 else 'annotations'
        return ('Unable to delete the field "{}" as it has {} {}.'
                .format(field.name, annotation_count, annotation))

    @property
    def _annotation_count(self):
        """Number of annotations attached to the field."""
        # touch every sample's annotations first, then count
        self._load_annotations()
        return len(self.get_instance().annotations)

    def _load_annotations(self):
        """Access the annotations of every sample of every spider."""
        for spider in self.project.spiders:
            for sample in spider.samples:
                sample.annotations
================================================
FILE: portia_server/portia_api/resources/items.py
================================================
from .projects import BaseProjectModelRoute
from ..jsonapi.utils import cached_property
from portia_orm.models import Item
class ItemRoute(BaseProjectModelRoute):
    """Route for the items of one sample."""

    lookup_url_kwarg = 'item_id'
    default_model = Item

    @cached_property
    def spider(self):
        """The spider addressed by the ``spider_id`` kwarg."""
        spiders = self.project.spiders
        return spiders[self.kwargs.get('spider_id')]

    def perform_create(self, serializer):
        """Preload the spider's samples, then create as usual."""
        self.spider.samples  # preload samples
        return super(ItemRoute, self).perform_create(serializer)

    def get_instance(self):
        """Look up the item named by ``item_id``."""
        items = self.get_collection()
        return items[self.kwargs.get('item_id')]

    def get_collection(self):
        """Return the sample's ordered items after preloading."""
        project = self.project
        # preload schemas and fields, then extractors
        for relation in ('schemas', 'extractors'):
            getattr(project, relation)
        sample = self.spider.samples[self.kwargs.get('sample_id')]
        return sample.ordered_items

    def get_detail_kwargs(self):
        """Serializer kwargs naming the related data to include."""
        item_includes = [
            'schema.fields',
            'annotations',
        ]
        annotation_includes = [
            'field.schema.fields',
            'extractors',
        ]
        return {
            'include_data_map': {
                'items': item_includes,
                'annotations': annotation_includes,
            },
        }
================================================
FILE: portia_server/portia_api/resources/models.py
================================================
from itertools import chain
from marshmallow_jsonapi import Schema, fields
from marshmallow import pre_dump, post_load
class SlydSchema(Schema):
    """Base schema that fills relationship ids from the schema context."""

    # relationship names; each has a matching '<name>_id' key
    _properties = ('project', 'spider', 'schema', 'item', 'sample', 'field')

    @staticmethod
    def empty_data():
        """Return a document containing only an empty ``meta`` member."""
        return dict(meta={})

    def __init__(self, *args, **kwargs):
        """Accept ``skip_relationships`` in addition to the usual options.

        When it is true, every relationship in ``_properties`` and its
        ``<name>_id`` counterpart is added to ``exclude``.
        """
        skip = kwargs.pop('skip_relationships', False)
        self._skip_relationships = skip
        if skip:
            names = tuple(self._properties)
            id_names = tuple('%s_id' % name for name in names)
            kwargs['exclude'] = tuple(
                chain(kwargs.get('exclude', []), names, id_names))
        super(SlydSchema, self).__init__(*args, **kwargs)

    @property
    def project_id(self):
        """``project_id`` from the schema context, if any."""
        context = self.context
        return context.get('project_id')

    @property
    def spider_id(self):
        """``spider_id`` from the schema context, if any."""
        context = self.context
        return context.get('spider_id')

    @property
    def sample_id(self):
        """``sample_id`` from the schema context, if any."""
        context = self.context
        return context.get('sample_id')

    @property
    def schema_id(self):
        """``schema_id`` from the schema context, if any."""
        context = self.context
        return context.get('schema_id')

    @property
    def item_id(self):
        """``item_id`` from the schema context, if any."""
        context = self.context
        return context.get('item_id')

    @property
    def field_id(self):
        """``field_id`` from the schema context, if any."""
        context = self.context
        return context.get('field_id')

    @pre_dump
    def _dump_relationship_properties(self, item):
        """Synchronise ``<name>_id`` keys between ``item`` and the context.

        For each relationship the id is copied from the item into the
        context, or from the context into the item when the item lacks it
        (or has no ``id`` of its own). A missing relationship is then
        filled with ``{'id': <id>}`` when an id is available.
        """
        if getattr(self, '_skip_relationships', False):
            return item
        for attr in self._properties:
            id_key = '%s_id' % attr
            if id_key in item and item['id'] is not None:
                self.context[id_key] = item[id_key]
            else:
                item[id_key] = getattr(self, id_key)
            if item.get(attr) is None and item[id_key]:
                item[attr] = {'id': item[id_key]}
        return item
class ProjectSchema(SlydSchema):
    """JSON API schema for resources of type ``projects``."""
    # the id is loaded from the 'name' key
    id = fields.Str(load_from='name')
    name = fields.Str()
    # to-many relationships; '<id>' is this project's own id
    spiders = fields.Relationship(
        related_url='/api/projects/{project_id}/spiders',
        related_url_kwargs={'project_id': '<id>'}, type_='spiders',
        include_resource_linkage=True, many=True
    )
    schemas = fields.Relationship(
        related_url='/api/projects/{project_id}/schemas',
        related_url_kwargs={'project_id': '<id>'}, type_='schemas',
        include_resource_linkage=True, many=True
    )
    extractors = fields.Relationship(
        related_url='/api/projects/{project_id}/extractors',
        related_url_kwargs={'project_id': '<id>'}, type_='extractors',
        include_resource_linkage=True, many=True
    )
    # self-referencing relationship that only carries a self link
    project = fields.Relationship(
        self_url='/api/projects/{project_id}',
        self_url_kwargs={'project_id': '<id>'}, type_='projects'
    )

    class Meta:
        type_ = 'projects'
class SchemaSchema(SlydSchema):
    """JSON API schema for resources of type ``schemas``."""
    id = fields.Str(dump_only=True)
    name = fields.Str()
    default = fields.Boolean()
    project = fields.Relationship(
        related_url='/api/projects/{project_id}',
        related_url_kwargs={'project_id': '<project_id>'},
        type_='projects',
        include_resource_linkage=True
    )
    # Declared last on purpose: this attribute rebinds the name `fields`
    # inside the class body, hiding the imported `fields` module from any
    # declaration that would follow it.
    fields = fields.Relationship(
        related_url='/api/projects/{project_id}/schemas/{schema_id}/fields',
        related_url_kwargs={'project_id': '<project_id>',
                            'schema_id': '<id>'},
        many=True, include_resource_linkage=True, type_='fields'
    )

    class Meta:
        type_ = 'schemas'
class FieldSchema(SlydSchema):
    """JSON API schema for resources of type ``fields``."""
    id = fields.Str()
    name = fields.Str()
    type = fields.Str()
    vary = fields.Boolean(default=False)
    required = fields.Boolean(default=False)
    project = fields.Relationship(
        related_url='/api/projects/{project_id}',
        related_url_kwargs={'project_id': '<project_id>'},
        type_='projects',
        include_resource_linkage=True
    )
    # NOTE(review): type_ is 'schema' here, while SchemaSchema.Meta and the
    # other schema relationships in this module use 'schemas' — confirm
    # which one clients expect.
    schema = fields.Relationship(
        related_url='/api/projects/{project_id}/schemas/{schema_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'schema_id': '<schema_id>'},
        type_='schema',
        include_resource_linkage=True
    )

    class Meta:
        type_ = 'fields'
class SpiderSchema(SlydSchema):
    """JSON API schema for resources of type ``spiders``.

    Login settings are exposed as flat ``login_*`` attributes but exchanged
    with the underlying data as an ``init_requests`` list.
    """
    id = fields.Str(dump_only=True, load_from='name')
    name = fields.Str()
    start_urls = fields.List(fields.Str(), default=[])
    links_to_follow = fields.Str(default='patterns')
    follow_patterns = fields.List(fields.Str(), default=[])
    exclude_patterns = fields.List(fields.Str(), default=[])
    js_enabled = fields.Boolean(default=False)
    js_enable_patterns = fields.List(fields.Str(), default=[])
    js_disable_patterns = fields.List(fields.Str(), default=[])
    respect_nofollow = fields.Boolean(default=True)
    allowed_domains = fields.List(fields.Str(), default=[])
    login_url = fields.Str()
    login_user = fields.Str()
    login_password = fields.Str()
    perform_login = fields.Boolean(default=False)
    # NOTE(review): this url uses '/spider/' while SampleSchema.spider uses
    # '/spiders/' — confirm which path is routed.
    samples = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>'},
        many=True, include_resource_linkage=True, type_='samples'
    )
    project = fields.Relationship(
        related_url='/api/projects/{project_id}',
        related_url_kwargs={'project_id': '<project_id>'},
        type_='projects',
        include_resource_linkage=True
    )

    @pre_dump
    def _dump_login_data(self, item):
        """Flatten the first ``init_requests`` entry into ``login_*`` keys.

        ``init_requests`` is always removed from ``item``; only its first
        entry is used.
        """
        init_requests = item.pop('init_requests', None)
        if init_requests:
            login_request = init_requests[0]
            item['login_url'] = login_request['loginurl']
            item['login_user'] = login_request['username']
            item['login_password'] = login_request['password']
        return item

    @post_load
    def _load_login_data(self, item):
        """Fold the ``login_*`` keys back into an ``init_requests`` entry.

        The entry is only built when all three values are present and
        truthy; the ``login_*`` keys are removed from ``item`` either way.
        """
        # local name; hides the imported `fields` module in this method only
        fields = ('login_url', 'login_user', 'login_password')
        if all(field in item and item[field] for field in fields):
            item['init_requests'] = [{
                'type': 'login',
                'loginurl': item.pop('login_url'),
                'username': item.pop('login_user'),
                'password': item.pop('login_password')
            }]
        for field in fields:
            item.pop(field, None)
        return item

    class Meta:
        type_ = 'spiders'
class SampleSchema(SlydSchema):
    """JSON API schema for resources of type ``samples``."""
    id = fields.Str(dump_only=True)
    name = fields.Str()
    url = fields.Str(required=True)
    page_id = fields.Str()
    page_type = fields.Str(default='item')
    scrapes = fields.Str()
    extractors = fields.Dict(default={})
    project = fields.Relationship(
        related_url='/api/projects/{project_id}',
        related_url_kwargs={'project_id': '<project_id>'},
        type_='projects', include_resource_linkage=True
    )
    spider = fields.Relationship(
        related_url='/api/projects/{project_id}/spiders/{spider_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>'},
        type_='spiders', include_resource_linkage=True
    )
    # the two page bodies are link-only relationships (no resource linkage)
    original_body = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
                    '{sample_id}/original_body',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<id>'},
        type_='html', include_resource_linkage=False
    )
    rendered_body = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
                    '{sample_id}/rendered_body',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<id>'},
        type_='html', include_resource_linkage=False
    )
    items = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
                    '{sample_id}/items',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<id>'},
        type_='items', many=True, include_resource_linkage=True
    )

    def dump(self, obj, many=None, update_fields=True, **kwargs):
        """Dump ``obj`` after making sure every sample has an ``items`` key.

        Mutates the input: a missing ``items`` key is set to an empty list
        on each object before delegating to the parent ``dump``.
        """
        many = self.many if many is None else bool(many)
        if many:
            for o in obj:
                o.setdefault('items', [])
        else:
            obj.setdefault('items', [])
        return super(SampleSchema, self).dump(obj, many, update_fields,
                                              **kwargs)

    class Meta:
        type_ = 'samples'
class BaseAnnotationSchema(SlydSchema):
    """Fields shared by annotation and item-annotation resources."""
    id = fields.Str()
    attribute = fields.Str(required=True)
    accept_selectors = fields.List(fields.Str(), default=[])
    reject_selectors = fields.List(fields.Str(), default=[])
    tagid = fields.Integer(required=True)
    text_content = fields.Str()
    selector = fields.Str()
    sample = fields.Relationship(
        related_url='/api/projects/{project_id}/spiders/{spider_id}/samples/'
                    '{sample_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<sample_id>'},
        type_='samples',
        include_resource_linkage=True
    )
    parent = fields.Relationship(
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<sample_id>',
                            'item_id': '<parent_id>'},
        type_='items', include_resource_linkage=True
    )

    @property
    def parent_id(self):
        """``container_id`` from the context, falling back to ``item_id``."""
        return self.context.get('container_id', self.item_id)

    @pre_dump
    def _dump_parent_id(self, item):
        """Set or clear the ``parent``/``parent_id`` keys of ``item``.

        The parent id comes from ``item['parent']``, else from the item's
        ``container_id``, else from the schema context. When the item and
        its parent share the same id (ignoring any ``#`` suffix on either)
        the parent keys are removed instead.

        Returns ``item`` explicitly, like the other pre-dump hooks in this
        module, rather than falling off the end with None.
        """
        parent_id = None
        if 'parent' in item:
            parent_id = item['parent']['id']
        if not parent_id:
            parent_id = item.get('container_id', self.parent_id) or ''
        if (item['id'].split('#')[0] == parent_id or
                parent_id.split('#')[0] == item['id']):
            # the annotation would be its own parent
            item.pop('parent', None)
            item.pop('parent_id', None)
            return item
        if parent_id:
            item['parent'] = {'id': parent_id}
        if parent_id and item.get('parent_id') is None:
            item['parent_id'] = parent_id
        return item
class AnnotationSchema(BaseAnnotationSchema):
    """JSON API schema for resources of type ``annotations``."""
    required = fields.Boolean(default=False)
    ignore = fields.Boolean(default=False)
    ignore_beneath = fields.Boolean(default=False)
    # NOTE(review): an Integer field with default=False — presumably 0 was
    # meant; confirm before changing, as it may affect serialized output.
    variant = fields.Integer(default=False)
    slice = fields.List(fields.Integer())
    pre_text = fields.Str()
    post_text = fields.Str()
    selection_mode = fields.Str()
    field = fields.Relationship(
        related_url='/api/projects/{project_id}/schemas/{schema_id}/fields/'
                    '{field_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'schema_id': '<schema_id>',
                            'field_id': '<field.id>'},
        type_='fields', include_resource_linkage=True
    )
    extractors = fields.Relationship(
        related_url='/api/projects/{project_id}/extractors',
        related_url_kwargs={'project_id': '<project_id>'},
        many=True, include_resource_linkage=True, type_='extractors'
    )

    class Meta:
        type_ = 'annotations'
class ItemAnnotationSchema(BaseAnnotationSchema):
    """JSON API schema for resources of type ``item_annotations``."""
    item_container = fields.Boolean(default=True)
    container_id = fields.Str()
    repeated = fields.Boolean()
    # dump-only: serialized but not accepted on load
    repeated_container_id = fields.Str(dump_only=True)
    repeated_accept_selectors = fields.Str(dump_only=True)
    siblings = fields.Integer()
    parent_field = fields.Str()
    schema = fields.Relationship(
        related_url='/api/projects/{project_id}/schemas/{schema_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'schema_id': '<schema_id>'},
        type_='schemas', include_resource_linkage=True
    )

    class Meta:
        type_ = 'item_annotations'
class ExtractorSchema(SlydSchema):
    """JSON API schema for resources of type ``extractors``."""
    id = fields.Str()
    type = fields.Str()
    value = fields.Str()
    project = fields.Relationship(
        related_url='/api/projects/{project_id}',
        related_url_kwargs={'project_id': '<project_id>'},
        type_='projects',
        include_resource_linkage=True
    )

    @pre_dump
    def _dump_extractor_attributes(self, item):
        """Derive ``type`` and ``value`` when ``item`` does not have them.

        ``type`` is 'type' when the item has a ``type_extractor`` key and
        'regex' otherwise; ``value`` is then read from ``type_extractor``
        or ``regular_expression`` accordingly.
        """
        if 'type' not in item:
            item['type'] = 'type' if 'type_extractor' in item else 'regex'
        if 'value' not in item:
            # raises KeyError if the source key for the chosen type is absent
            item['value'] = item['type_extractor'] if item['type'] == 'type' \
                else item['regular_expression']
        return item

    class Meta:
        type_ = 'extractors'
class HtmlSchema(SlydSchema):
    """JSON API schema for resources of type ``html``: an id and its html."""
    id = fields.Str()
    html = fields.Str()

    class Meta:
        type_ = 'html'
class RenderedBody(SlydSchema):
    """JSON API schema for resources of type ``rendered-bodys``."""
    id = fields.Str()
    html = fields.Str()

    class Meta:
        # spelling is part of the wire format; do not "correct" it
        type_ = 'rendered-bodys'
class OriginalBody(SlydSchema):
    """JSON API schema for resources of type ``original-bodys``."""
    id = fields.Str()
    html = fields.Str()

    class Meta:
        # spelling is part of the wire format; do not "correct" it
        type_ = 'original-bodys'
class ItemSchema(SlydSchema):
    """Instance of a schema. Meta item built from sample."""
    id = fields.Str()
    sample = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
                    '{sample_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<sample_id>'},
        include_resource_linkage=True, type_='samples'
    )
    schema = fields.Relationship(
        related_url='/api/projects/{project_id}/schemas/{schema_id}',
        related_url_kwargs={'project_id': '<project_id>',
                            'schema_id': '<schema_id>'},
        type_='schemas', include_resource_linkage=True
    )
    annotations = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
                    '{sample_id}/items/{item_id}/annotations',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<sample_id>',
                            'item_id': '<id>'},
        many=True, include_resource_linkage=True, type_='annotations'
    )
    item_annotation = fields.Relationship(
        related_url='/api/projects/{project_id}/spider/{spider_id}/samples/'
                    '{sample_id}/items/{item_id}/item_annotation',
        related_url_kwargs={'project_id': '<project_id>',
                            'spider_id': '<spider_id>',
                            'sample_id': '<sample_id>',
                            'item_id': '<id>'},
        include_resource_linkage=True, type_='item_annotations'
    )
    # nested items point at their containing item; no urls are generated
    parent = fields.Relationship(type_='items', include_resource_linkage=True)

    @pre_dump
    def _dump_parent_id(self, item):
        """Expose the item's ``container_id`` as its parent.

        When ``container_id`` is set, ``parent`` is set to reference it and
        ``parent_id`` is filled in if it is missing or None.

        Returns ``item`` explicitly, like the other pre-dump hooks in this
        module, rather than falling off the end with None.
        """
        parent_id = item.get('container_id') or ''
        if parent_id:
            item['parent'] = {'id': parent_id}
        if parent_id and item.get('parent_id') is None:
            item['parent_id'] = parent_id
        return item

    class Meta:
        type_ = 'items'
================================================
FILE: portia_server/portia_api/resources/projects.py
================================================
from collections import OrderedDict
from django.conf import settings
from django.utils.functional import cached_property
from dulwich.objects import Commit
from rest_framework.decorators import detail_route
from rest_framework.response import Response
from rest_framework.status import HTTP_200_OK, HTTP_201_CREATED
from six import iteritems
from scrapy.utils.misc import load_object
from portia_orm.models import Project
from storage import get_storage_class
from storage.backends import InvalidFilename
from .route import (JsonApiRoute, JsonApiModelRoute, CreateModelMixin,
ListModelMixin, RetrieveModelMixin)
from .response import FileResponse
from ..jsonapi.exceptions import (JsonApiFeatureNotAvailableError,
JsonApiBadRequestError,
JsonApiNotFoundError,
JsonApiConflictError)
from ..utils.download import ProjectArchiver, CodeProjectArchiver
from ..utils.copy import ModelCopier, MissingModelException
# Deployer is resolved once, at import time, from the dotted path stored in
# settings.PROJECT_DEPLOYER; it is instantiated per project in _deploy().
Deployer = load_object(settings.PROJECT_DEPLOYER)
class ProjectDownloadMixin(object):
    """Route mixin adding a ``download`` endpoint that returns a zip archive.

    Relies on the host route to provide ``query``, ``kwargs``, ``project``
    and ``storage``.
    """

    @detail_route(methods=['get'])
    def download(self, *args, **kwargs):
        """Archive the project (or one spider) and return it as a zip file.

        Query parameters read: ``format`` (``code`` selects
        ``CodeProjectArchiver``, anything else ``ProjectArchiver``),
        ``version``, ``branch`` and ``selector`` (falls back to ``css``).

        Raises:
            JsonApiNotFoundError: when the project id is an invalid
                filename, the checkout raises ``ValueError``, the short sha
                matches no commit, or archiving raises ``IOError``.
        """
        params = self.query
        fmt = params.get('format', 'spec')
        version = params.get('version', None)
        branch = params.get('branch', None)
        selector = params.get('selector') or 'css'
        spider_id = self.kwargs.get('spider_id', None)
        spiders = None if spider_id is None else [spider_id]

        # Resolve the project first; an invalid filename is reported as 404.
        try:
            self.project
        except InvalidFilename as e:
            raise JsonApiNotFoundError(str(e))

        wants_checkout = version or branch
        if hasattr(self.storage, 'checkout') and wants_checkout:
            try:
                # Versions shorter than 40 characters are expanded to the id
                # of the matching commit.
                if version and len(version) < 40:
                    version = self.commit_from_short_sha(version).id
                self.storage.checkout(version, branch)
            except IOError:
                # IOError during checkout is ignored; archiving continues
                # with whatever the storage currently points at.
                pass
            except ValueError as e:
                raise JsonApiNotFoundError(str(e))

        if fmt == u'code':
            archiver = CodeProjectArchiver
        else:
            archiver = ProjectArchiver
        try:
            content = archiver(self.storage).archive(
                spiders, selector=selector)
        except IOError as e:
            raise JsonApiNotFoundError(str(e))

        try:
            name = u'{}.zip'.format(self.project.name)
        except UnicodeEncodeError:
            # Fall back to the project id when the name cannot be formatted.
            name = str(self.project.id)
        return FileResponse(name, content, status=HTTP_200_OK)

    def commit_from_short_sha(self, version):
        """Return the first ``Commit`` whose object id starts with ``version``.

        Raises:
            JsonApiNotFoundError: when no commit id has that prefix.
        """
        repo = self.storage.repo._repo
        for oid in repo.object_store:
            if not oid.startswith(version):
                continue
            candidate = repo.get_object(oid)
            if isinstance(candidate, Commit):
                return candidate
        raise JsonApiNotFoundError(
            'Could not find commit for `{}`'.format(version)
        )
class BaseProjectRoute(JsonApiRoute):
    """Base route that knows how to look up the requesting user's projects."""

    @cached_property
    def projects(self):
        """Projects returned by the storage class for the requesting user."""
        return get_storage_class().get_projects(self.request.user)

    @cached_property
    def project(self):
        """``Project`` for the ``project_id`` URL kwarg.

        Raises:
            JsonApiNotFoundError: when a ``KeyError`` is raised while
                looking the project up or constructing it.
        """
        pid = self.kwargs.get('project_id')
        try:
            project_name = self.projects[pid]
            return Project(self.storage, id=pid, name=project_name)
        except KeyError:
            raise JsonApiNotFoundError()
class BaseProjectModelRoute(BaseProjectRoute, JsonApiModelRoute):
    """Project-aware route combined with the full set of model mixins."""
class ProjectRoute(ProjectDownloadMixin, BaseProjectRoute,
                   ListModelMixin, RetrieveModelMixin, CreateModelMixin):
    """Routes for listing, creating, publishing and managing projects."""

    lookup_url_kwarg = 'project_id'
    default_model = Project

    class FakeStorage(object):
        """Storage stand-in that reports no files or directories.

        Used by ``get_collection`` to build ``Project`` objects for listing.
        """

        def exists(self, *args, **kwargs):
            return False

        def listdir(self, *args, **kwargs):
            return [], []

    def create(self, request):
        """Create a new project from the provided attributes.

        Raises:
            JsonApiBadRequestError: when no name is supplied, the name is not
                a valid filename or contains a dot, or it is already taken.
        """
        try:
            name = self.data['data']['attributes']['name']
        except KeyError:
            raise JsonApiBadRequestError('No `name` provided')
        # The storage property reads `project_id` from the URL kwargs, so it
        # has to be set before `self.storage` is first accessed.
        self.kwargs['project_id'] = name
        projects = self.projects
        if not self.storage.is_valid_filename(name) or '.' in name:
            raise JsonApiBadRequestError(
                '"{}" is not a valid project name,\nProject names may only '
                'contain letters and numbers'.format(name))
        if name in projects:
            raise JsonApiBadRequestError(
                'A project with the name "{}" already exists'.format(name))

        # Bootstrap project
        storage = self.storage
        storage.commit()
        project = Project(storage, id=name, name=name)
        serializer = self.get_serializer(project, storage=storage)
        data = serializer.data
        headers = self.get_success_headers(data)
        return Response(data, status=HTTP_201_CREATED, headers=headers)

    # def update(self):
    #     """Update an existing project with the provided attributes"""

    # def destroy(self):
    #     """Delete the requested project"""

    @detail_route(methods=['get'])
    def status(self, *args, **kwargs):
        """Return the project with its pending changes under ``meta``."""
        response = self.retrieve()
        data = OrderedDict()
        data.update({
            'meta': {
                'changes': self.get_project_changes()
            }
        })
        data.update(response.data)
        return Response(data, status=HTTP_200_OK)

    @detail_route(methods=['put', 'patch', 'post'])
    def publish(self, *args, **kwargs):
        """Publish the working branch, deploy, then delete the branch.

        Raises:
            JsonApiFeatureNotAvailableError: storage is not version controlled.
            JsonApiBadRequestError: there are no changes to publish.
            JsonApiConflictError: publishing the branch did not succeed.
        """
        self._require_version_control()

        if not self.get_project_changes():
            raise JsonApiBadRequestError('You have no changes to publish')

        force = self.query.get('force', False)
        branch = self.storage.branch
        published = self.storage.repo.publish_branch(branch, force=force)
        if not published:
            raise JsonApiConflictError(
                'A conflict occurred when publishing your changes. '
                'You must resolve the conflict before the project can be '
                'published.')
        self.deploy()
        self.storage.repo.delete_branch(branch)
        response = self.retrieve()
        return Response(response.data, status=HTTP_200_OK)

    @detail_route(methods=['POST'])
    def deploy(self, *args, **kwargs):
        """Deploy the project and return whatever the deployer reports."""
        data = self._deploy()
        return Response(data, HTTP_200_OK)

    @detail_route(methods=['put', 'patch', 'post'])
    def reset(self, *args, **kwargs):
        """Point the working branch back at ``master``.

        Raises:
            JsonApiFeatureNotAvailableError: storage is not version controlled.
        """
        self._require_version_control()
        branch = self.storage.branch
        master = self.storage.repo.refs['refs/heads/master']
        self.storage.repo.refs['refs/heads/%s' % branch] = master
        return self.retrieve()

    @detail_route(methods=['post'])
    def copy(self, *args, **kwargs):
        """Copy the models listed under ``data`` from another project.

        The source project id comes from the ``from`` query parameter or
        request body key.

        Raises:
            JsonApiBadRequestError: ``from`` or the model list is missing, or
                some requested ids do not exist in the source project.
            JsonApiNotFoundError: the source project does not exist.
        """
        from_project_id = self.query.get('from') or self.data.get('from')
        if not from_project_id:
            raise JsonApiBadRequestError('`from` parameter must be provided.')
        try:
            self.projects[from_project_id]
        except KeyError:
            raise JsonApiNotFoundError(
                'No project exists with the id "{}"'.format(from_project_id))
        models = self.data.get('data', [])
        if not models:
            raise JsonApiBadRequestError('No models provided to copy.')

        try:
            copier = ModelCopier(self.project, self.storage, from_project_id)
            copier.copy(models)
        except MissingModelException as e:
            raise JsonApiBadRequestError(
                'Could not find the following ids "{}" in the project.'.format(
                    '", "'.join(e.args[0])))
        response = self.retrieve()
        return Response(response.data, status=HTTP_201_CREATED)

    @detail_route(methods=['post'])
    def rollback(self, *args, **kwargs):
        """Move ``master`` to a given branch head or commit, then deploy.

        Raises:
            JsonApiFeatureNotAvailableError: storage is not version controlled.
            JsonApiBadRequestError: neither ``branch`` nor ``version`` given.
        """
        self._require_version_control()
        version = self.query.get('version')
        branch = self.query.get('branch')
        if not (branch or version):
            raise JsonApiBadRequestError(
                'Need either `branch` or `version` arguments to rollback to')
        if branch:
            commit = self.storage.repo.refs['refs/heads/{}'.format(branch)]
        else:
            commit = self.commit_from_short_sha(version).id
        self.storage.repo.refs['refs/heads/master'] = commit
        self.storage.commit()
        self.deploy()
        return self.retrieve()

    def get_instance(self):
        return self.project

    def get_collection(self):
        """Collection of all the user's projects, backed by ``FakeStorage``."""
        storage = self.FakeStorage()
        return Project.collection(
            Project(storage, id=project_id, name=name)
            for project_id, name in iteritems(self.projects))

    def get_detail_kwargs(self):
        return {
            'include_data': [
                'spiders',
                'schemas',
            ],
            'fields_map': {
                'spiders': [
                    'project',
                ],
                'schemas': [
                    'name',
                    'default',
                    'project',
                ],
            },
            'exclude_map': {
                'projects': [
                    'extractors',
                ],
            }
        }

    def get_list_kwargs(self):
        return {
            'fields_map': {
                'projects': [
                    'name',
                ],
            }
        }

    def get_project_changes(self):
        """List changed files as dicts with ``type``, ``path``, ``old_path``.

        Raises:
            JsonApiFeatureNotAvailableError: storage is not version controlled.
        """
        storage = self.storage
        if not storage.version_control:
            raise JsonApiFeatureNotAvailableError()
        return [{'type': type_, 'path': path, 'old_path': old_path}
                for type_, path, old_path
                in storage.changed_files()]

    def _require_version_control(self):
        """Raise unless the storage is version controlled and has a ``repo``.

        The previous inline check, ``not vc and hasattr(storage, 'repo')``,
        let a storage without ``repo`` through, after which accessing
        ``storage.repo`` or ``storage.branch`` failed with AttributeError.
        """
        storage = self.storage
        if not (storage.version_control and hasattr(storage, 'repo')):
            raise JsonApiFeatureNotAvailableError()

    def _deploy(self):
        """Run the configured deployer when ``deploy_projects`` is enabled.

        Returns ``None`` when the capability is switched off.
        """
        if settings.CAPABILITIES.get('deploy_projects'):
            return Deployer(self.project).deploy()
================================================
FILE: portia_server/portia_api/resources/response.py
================================================
import json
from django.http.response import HttpResponse
from wsgiref.util import FileWrapper
from six.moves import map
from twisted.python.compat import intToBytes
class BaseApiResource(object):
    """Base class for resources rendered onto a Twisted-style request."""

    def render(self, request):
        """Produce the response body; subclasses must override this."""
        raise NotImplementedError

    def render_async(self, request):
        """Render and, unless the result is ``NOT_DONE_YET``, finish the request.

        A non-``None`` body is written with a matching ``content-length``
        header before the request is finished.
        """
        # NOT_DONE_YET was referenced without ever being imported, so this
        # method raised NameError; import it where it is used.
        from twisted.web.server import NOT_DONE_YET

        body = self.render(request)
        if body is not NOT_DONE_YET:
            if body is not None:
                request.setHeader(b'content-length', intToBytes(len(body)))
                request.write(body)
            request.finish()
class JsonApiResource(BaseApiResource):
    """Resource that renders a status code and an optional JSON API body."""

    def __init__(self, status, data=None):
        self.status = status
        self.data = data

    def render(self, request):
        """Set the response code and headers, and return the JSON body.

        Returns ``None`` when there is no data to serialize. When the data
        carries ``links.profile`` entries they are appended to the
        ``content-type`` header as a ``profile`` parameter.
        """
        request.setResponseCode(self.status)
        data = self.data
        if data is None:
            return None
        content_type = b"application/vnd.api+json"
        profiles = data.get('links', {}).get('profile', [])
        if profiles:
            # bytes has no .format() on Python 3 and bytes(text) needs an
            # encoding there, so encode each profile and concatenate.
            encoded = [profile if isinstance(profile, bytes)
                       else profile.encode('utf-8')
                       for profile in profiles]
            content_type += b'; profile="' + b' '.join(encoded) + b'"'
        request.setHeader(b'content-type', content_type)
        return json.dumps(data, indent=2)
class FileResponse(HttpResponse):
    """Zip download response that sets an attachment filename.

    Extra positional and keyword arguments are accepted but not forwarded
    to ``HttpResponse``.
    """

    def __init__(self, name, content, *args, **kwargs):
        wrapped = FileWrapper(content)
        super(FileResponse, self).__init__(
            content=wrapped, content_type='application/zip')
        disposition = 'attachment; filename="%s"' % name
        self['Content-Disposition'] = disposition
================================================
FILE: portia_server/portia_api/resources/route.py
================================================
from collections import Sequence
from operator import attrgetter
from django.db import transaction
from django.http.response import Http404
from django.utils.functional import cached_property
from marshmallow import ValidationError
from marshmallow_jsonapi.exceptions import IncorrectTypeError
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.status import (HTTP_200_OK, HTTP_201_CREATED,
HTTP_204_NO_CONTENT)
from rest_framework.viewsets import ViewSet
from portia_orm.collection import ModelCollection
from portia_orm.exceptions import ProtectedError
from portia_orm.relationships import BelongsTo, HasMany
from storage import create_project_storage
from ..jsonapi.exceptions import (JsonApiBadRequestError,
JsonApiConflictError,
JsonApiValidationError,
render_exception)
from ..jsonapi.parsers import JSONApiParser, JSONParser
from ..jsonapi.registry import get_schema
from ..jsonapi.renderers import JSONApiRenderer, JSONRenderer
from ..jsonapi.serializers import JsonApiPolymorphicSerializer
from ..jsonapi.utils import type_from_model_name
# Base ViewSet for the JSON API routes. It fixes the permission, parser and
# renderer classes, exposes request details as cached properties, and builds
# serializers from query-string options (include, fields[...], sort).
# NOTE(review): indentation is lost in this dump; confirm the nesting of the
# statements below against the original file before restructuring.
class JsonApiRoute(ViewSet):
# Model whose schema is used by get_serializer when `polymorphic` is unset.
default_model = None
# When set, get_serializer returns a JsonApiPolymorphicSerializer on this base.
polymorphic = None
permission_classes = (IsAuthenticated,)
parser_classes = (JSONApiParser, JSONParser)
renderer_classes = (JSONApiRenderer, JSONRenderer)
# Human-readable form: upper-cased HTTP method followed by the request path.
def __str__(self):
return '{} {}'.format(self.method.upper(), self.path)
def __repr__(self):
return 'Route(%s)' % str(self)
# Lower-cased HTTP method of the current request.
@cached_property
def method(self):
return self.request.method.lower()
@cached_property
def path(self):
return self.request.path
# Query parameters, or an empty dict when there are none.
@cached_property
def query(self):
return self.request.query_params or {}
# Parsed request body, or an empty dict when there is none.
@cached_property
def data(self):
return self.request.data or {}
@cached_property
def user(self):
return self.request.user
# Project storage for the `project_id` URL kwarg, authored by the current
# user; None when the URL carries no project_id.
@cached_property
def storage(self):
if 'project_id' in self.kwargs:
return create_project_storage(
self.kwargs['project_id'], author=self.user)
return None
# Every request is dispatched inside transaction.atomic.
@transaction.atomic
def dispatch(self, request, *args, **kwargs):
return super(JsonApiRoute, self).dispatch(request, *args, **kwargs)
# Post-processes the parent's error response: Http404 gets a path-specific
# detail message, and a body holding only `detail` is passed through
# render_exception.
def handle_exception(self, exc):
response = super(JsonApiRoute, self).handle_exception(exc)
if isinstance(exc, Http404):
response.data['detail'] = "Resource '%s' not found." % self.path
status_code = response.status_code
if (isinstance(response.data, dict) and len(response.data) == 1 and
'detail' in response.data):
response.data = render_exception(status_code,
response.data['detail'])
return response
# Subclasses return the single object addressed by the request.
def get_instance(self):
raise NotImplementedError
# Subclasses return the collection addressed by the request.
def get_collection(self):
raise NotImplementedError
# Applies `filter[...]` query parameters to a collection. `filter[id]` picks
# objects by key (comma-separated, repeatable); any other `filter[field]`
# keeps objects whose field value(s) intersect the requested values.
def filter_collection(self, collection):
if 'filter[id]' in self.query:
if not isinstance(collection, ModelCollection):
raise JsonApiBadRequestError(u"Cannot filter this collection.")
ids = []
for id_list in self.query.getlist('filter[id]'):
ids.extend(id_list.split(','))
collection = collection.__class__((collection[id_] for id_ in ids))
for key in self.query.keys():
if (key != 'filter[id]' and key.startswith('filter[') and
key[-1] == ']'):
# Strip the leading 'filter[' (7 characters) and the trailing ']'.
field_name = key[7:-1]
field_values = set()
for field_list in self.query.getlist(key):
field_values.update(field_list.split(','))
filtered = []
for obj in collection:
try:
field = obj._fields[field_name]
if isinstance(field, BelongsTo):
related = getattr(obj, field_name)
# A missing related object is matched by the literal string 'null'.
filter_values = {related.pk if related else 'null'}
elif isinstance(field, HasMany):
filter_values = {attrgetter('pk')(f)
for f in getattr(obj, field_name)}
else:
value = getattr(obj, field_name)
# NOTE(review): str is also a Sequence, so a string attribute becomes a set
# of its characters here -- confirm whether that is intended.
if isinstance(value, Sequence):
filter_values = set(value)
else:
filter_values = {value}
if filter_values.intersection(field_values):
filtered.append(obj)
except (AttributeError, KeyError, TypeError):
# skip objects which don't have a field
pass
collection = collection.__class__(filtered)
return collection
# Builds the serializer for this route. Keyword options are layered in this
# order, later ones winning: list/detail defaults (GET only), options parsed
# from the query string, then explicit **kwargs.
def get_serializer(self, instance=None, data=None, many=False, **kwargs):
params = {}
if self.method == 'get':
params.update({
'current_url': self.path,
})
if many:
params.update(self.get_list_kwargs())
else:
params.update(self.get_detail_kwargs())
params.update(self.get_request_kwargs())
params.update(kwargs)
if self.polymorphic:
return JsonApiPolymorphicSerializer(
base=self.polymorphic, default_model=self.default_model,
instance=instance, data=data, many=many, **params)
type_ = type_from_model_name(self.default_model.__name__)
return get_schema(type_)(
instance=instance, data=data, many=many, **params)
# Default serializer options for single-object responses.
def get_detail_kwargs(self):
return {}
# Default serializer options for list responses; same as detail by default.
def get_list_kwargs(self):
return self.get_detail_kwargs()
# Translates the `include`, `fields[type]` and `sort` query parameters into
# the `include_data`, `fields_map` and `ordering` serializer options. Values
# may be comma-separated and parameters may repeat.
def get_request_kwargs(self):
kwargs = {}
if 'include' in self.query:
include = []
for include_list in self.query.getlist('include'):
include.extend(include_list.split(','))
kwargs['include_data'] = include
fields = {}
for key in self.query.keys():
if key.startswith('fields[') and key[-1] == ']':
# Strip the leading 'fields[' (7 characters) and the trailing ']'.
field = key[7:-1]
for field_list in self.query.getlist(key):
if field in fields:
fields[field].extend(field_list.split(','))
else:
fields[field] = field_list.split(',')
kwargs['fields_map'] = fields
if 'sort' in self.query:
sort_ = []
for sort_list in self.query.getlist('sort'):
sort_.extend(sort_list.split(','))
kwargs['ordering'] = sort_
return kwargs
class CreateModelMixin(object):
    """Adds a ``create`` action that saves a new model and commits storage."""

    def create(self, *args, **kwargs):
        """Create a model from the request data and return a 201 response.

        Raises:
            JsonApiValidationError: when saving raises ``ValidationError``
                or ``IncorrectTypeError``, matching the update and destroy
                actions.
        """
        serializer = self.get_serializer(data=self.data, storage=self.storage,
                                         partial={'id'})
        try:
            self.perform_create(serializer)
        except (ValidationError, IncorrectTypeError) as err:
            raise JsonApiValidationError(err.messages)
        data = serializer.data
        self.storage.commit()
        headers = self.get_success_headers(data)
        return Response(data, status=HTTP_201_CREATED, headers=headers)

    def perform_create(self, serializer):
        """Hook for subclasses; saves the serializer by default."""
        serializer.save()

    def get_success_headers(self, data):
        """Return a ``Location`` header from ``data['data']['links']['self']``.

        Returns an empty dict when that link is not present.
        """
        try:
            return {
                'Location': data['data']['links']['self']
            }
        except (TypeError, KeyError):
            return {}
class ListModelMixin(object):
    """Adds a ``list`` action that serializes the filtered collection."""

    def list(self, *args, **kwargs):
        """Return the filtered collection as a 200 response.

        Raises:
            Http404: when fetching or filtering the collection raises
                ``TypeError``, ``IndexError`` or ``KeyError``.
        """
        try:
            everything = self.get_collection()
            matching = self.filter_collection(everything)
        except (TypeError, IndexError, KeyError):
            raise Http404
        body = self.get_serializer(matching, many=True).data
        return Response(body, status=HTTP_200_OK)
class RetrieveModelMixin(object):
    """Adds a ``retrieve`` action that serializes a single instance."""

    def retrieve(self, *args, **kwargs):
        """Return the requested instance as a 200 response.

        Raises:
            Http404: when looking the instance up raises ``TypeError``,
                ``IndexError`` or ``KeyError``.
        """
        try:
            found = self.get_instance()
        except (TypeError, IndexError, KeyError):
            raise Http404
        body = self.get_serializer(found).data
        return Response(body, status=HTTP_200_OK)
class UpdateModelMixin(object):
    """Adds ``update`` and ``partial_update`` actions for a single instance."""

    def update(self, *args, **kwargs):
        """Update the requested instance and return it as a 200 response.

        A truthy ``partial`` keyword marks every model field except ``id``
        as optional for this request.

        Raises:
            Http404: when the instance lookup raises ``TypeError``,
                ``IndexError`` or ``KeyError``.
            JsonApiValidationError: when saving raises ``ValidationError``
                or ``IncorrectTypeError``.
        """
        try:
            target = self.get_instance()
        except (TypeError, IndexError, KeyError):
            raise Http404

        is_partial = kwargs.pop('partial', False)
        optional_fields = (
            set(target.__class__._ordered_fields) - {'id'}
            if is_partial else False)

        serializer = self.get_serializer(
            target, data=self.data, partial=optional_fields)
        try:
            self.perform_update(serializer)
        except (ValidationError, IncorrectTypeError) as err:
            raise JsonApiValidationError(err.messages)
        body = serializer.data
        self.storage.commit()
        return Response(body, status=HTTP_200_OK)

    def partial_update(self, request, *args, **kwargs):
        """Run ``update`` with ``partial=True``."""
        kwargs['partial'] = True
        return self.update(request, *args, **kwargs)

    def perform_update(self, serializer):
        """Hook for subclasses; saves the serializer by default."""
        serializer.save()
class DestroyModelMixin(object):
    """Adds a ``destroy`` action that deletes a single instance."""

    def destroy(self, *args, **kwargs):
        """Delete the requested instance and commit storage.

        Returns a 200 response carrying the serializer data when there is
        any, otherwise an empty 204 response.

        Raises:
            Http404: when the instance lookup raises ``TypeError``,
                ``IndexError`` or ``KeyError``.
            JsonApiValidationError: when deleting raises ``ValidationError``
                or ``IncorrectTypeError``.
            JsonApiConflictError: when deleting raises ``ProtectedError``.
        """
        try:
            target = self.get_instance()
        except (TypeError, IndexError, KeyError):
            raise Http404

        serializer = self.get_serializer(target, data=self.data)
        try:
            self.perform_destroy(serializer)
        except (ValidationError, IncorrectTypeError) as err:
            raise JsonApiValidationError(err.messages)
        except ProtectedError:
            raise JsonApiConflictError(u"You cannot delete this resource.")

        body = serializer.data
        self.storage.commit()
        if not body:
            return Response(status=HTTP_204_NO_CONTENT)
        return Response(body, status=HTTP_200_OK)

    def perform_destroy(self, serializer):
        """Hook for subclasses; returns the result of ``serializer.delete()``."""
        return serializer.delete()
class JsonApiModelRoute(JsonApiRoute, ListModelMixin, RetrieveModelMixin,
                        CreateModelMixin, UpdateModelMixin, DestroyModelMixin):
    """Route offering the list, retrieve, create, update and destroy actions."""
================================================
FILE: portia_server/portia_api/resources/samples.py
================================================
from .projects import BaseProjectModelRoute
from .serializers import SampleSerializer
from portia_orm.models import Sample
class SampleRoute(BaseProjectModelRoute):
    """Routes for the samples belonging to one spider of a project."""

    lookup_url_kwarg = 'sample_id'
    default_model = Sample

    def perform_create(self, serializer):
        """Access the project's spiders, then create the sample."""
        self.project.spiders  # preload spiders
        super(SampleRoute, self).perform_create(serializer)

    def get_instance(self):
        """Sample addressed by the ``sample_id`` URL kwarg."""
        sample_id = self.kwargs.get('sample_id')
        return self.get_collection()[sample_id]

    def get_collection(self):
        """Samples of the spider addressed by the ``spider_id`` URL kwarg.

        Each sample is re-loaded from its storage and added back to the
        spider's collection before the collection is returned.
        """
        project = self.project
        project.schemas  # preload schemas and fields
        project.extractors  # preload extractors
        spider = project.spiders[self.kwargs.get('spider_id')]
        for stub in spider.samples:
            loaded = stub.load(stub.storage, stub)
            # Attribute access kept from the original for its side effect.
            loaded.url
            spider.samples.add(loaded)
        return spider.samples

    def get_detail_kwargs(self):
        """Detail responses include items plus their schema and annotations."""
        nested_includes = {
            'items': [
                'schema.fields',
                'annotations',
            ],
            'annotations': [
                'field.schema.fields',
                'extractors',
            ],
        }
        return {
            'include_data': [
                'items',
            ],
            'include_data_map': nested_includes,
        }

    def get_list_kwargs(self):
        """List responses also exclude ``items`` on top of the defaults."""
        default_excludes = (SampleSerializer.opts
                            .default_kwargs['exclude_map']['samples'])
        return {
            'exclude_map': {
                'samples': default_excludes + [
                    'items',
                ]
            }
        }
================================================
FILE: portia_server/portia_api/resources/schemas.py
================================================
from portia_orm.models import Schema
from .projects import BaseProjectModelRoute
from ..jsonapi.exceptions import JsonApiBadRequestError, JsonApiNotFoundError
class SchemaRoute(BaseProjectModelRoute):
    """Routes for the schemas (data formats) of a project."""

    lookup_url_kwarg = 'schema_id'
    default_model = Schema

    def get_instance(self):
        """Schema addressed by the ``schema_id`` URL kwarg."""
        schema_id = self.kwargs.get('schema_id')
        return self.get_collection()[schema_id]

    def get_collection(self):
        return self.project.schemas

    def get_list_kwargs(self):
        """List responses only carry name, default flag and project."""
        return {
            'fields_map': {
                'schemas': [
                    'name',
                    'default',
                    'project',
                ],
            }
        }

    def update(self, *args, **kwargs):
        """Update a schema, first unsetting any existing default if needed."""
        # Reset default schema if current schema will be default
        attributes = self.data.get('data', {}).get('attributes', {})
        if attributes.get('default'):
            for existing in self.get_collection():
                if existing.default:
                    existing.default = False
                    existing.save()
        return super(SchemaRoute, self).update(*args, **kwargs)

    def destroy(self, *args, **kwargs):
        """Delete a schema unless an item in some sample still uses it.

        Raises:
            JsonApiNotFoundError: the schema lookup raised ``KeyError``.
            JsonApiBadRequestError: the schema is used by a sample.
        """
        try:
            schema = self.get_instance()
        except KeyError:
            raise JsonApiNotFoundError('Unable to find the requested schema')

        in_use = any(
            self._item_uses_schema(item)
            for spider in self.project.spiders
            for sample in spider.samples
            for item in sample.items)
        if in_use:
            raise JsonApiBadRequestError(
                'Unable to delete the data format "%s" as it is used '
                "by a spider's sample." % schema.name)
        return super(SchemaRoute, self).destroy(*args, **kwargs)

    def _item_uses_schema(self, item):
        """Whether ``item`` or any nested annotation with a schema uses it."""
        schema = self.get_instance()
        if item.schema.id == schema.id:
            return True
        return any(
            hasattr(child, 'schema') and self._item_uses_schema(child)
            for child in item.annotations)
================================================
FILE: portia_server/portia_api/resources/serializers.py
================================================
from operator import attrgetter
from six.moves import map
from portia_api.jsonapi.serializers import JsonApiSerializer
from portia_orm.base import AUTO_PK
from portia_orm.exceptions import ProtectedError
from portia_orm.models import (Project, Schema, Field, Extractor, Spider,
Sample, Item, Annotation, RenderedBody,
OriginalBody)
from portia_api.utils.projects import unique_name
from portia_api.utils.annotations import choose_field_type
def clear_auto_created(instance):
    """Unset ``auto_created`` on ``instance`` and persist only that field.

    Does nothing when the flag is already falsy.
    """
    if not instance.auto_created:
        return
    instance.auto_created = False
    instance.save(only=('auto_created',))
class SpiderListSerializer(JsonApiSerializer):
    """Spider serializer linking only to the owning project.

    Referenced by ``ProjectSerializer`` for its ``spiders`` link.
    """

    class Meta:
        model = Spider
        url = '/api/projects/{self.project.id}/spiders/{self.id}'
        links = {
            'project': {
                'related': '/api/projects/{self.project.id}',
            },
        }
class ProjectSerializer(JsonApiSerializer):
    """Serializer for projects with links to spiders, schemas and extractors."""

    class Meta:
        model = Project
        url = '/api/projects/{self.id}'
        links = {
            'spiders': {
                'related': '/api/projects/{self.id}/spiders',
                # Spiders under a project use the lighter list serializer.
                'serializer': SpiderListSerializer,
            },
            'schemas': {
                'related': '/api/projects/{self.id}/schemas',
            },
            'extractors': {
                'related': '/api/projects/{self.id}/extractors',
            },
        }
class SchemaSerializer(JsonApiSerializer):
    """Serializer for schemas; includes fields by default."""

    class Meta:
        model = Schema
        url = '/api/projects/{self.project.id}/schemas/{self.id}'
        links = {
            'project': {
                'related': '/api/projects/{self.project.id}',
            },
            'fields': {
                'related': '/api/projects/{self.project.id}/schemas'
                           '/{self.id}/fields',
            },
        }
        default_kwargs = {
            'include_data': [
                'fields',
            ],
            'exclude_map': {
                'schemas': [
                    'auto-created',
                    'items',
                ]
            }
        }

    def update(self, instance, validated_data):
        """Update the schema, then clear its ``auto_created`` flag."""
        updated = super(SchemaSerializer, self).update(
            instance, validated_data)
        clear_auto_created(updated)
        return updated
class FieldSerializer(JsonApiSerializer):
    """Serializer for schema fields.

    Creating, updating or deleting a field clears the ``auto_created`` flag
    of the schema it belongs to.
    """

    class Meta:
        model = Field
        url = ('/api/projects/{self.schema.project.id}/schemas'
               '/{self.schema.id}/fields/{self.id}')
        links = {
            'schema': {
                'related': '/api/projects/{self.schema.project.id}/schemas'
                           '/{self.schema.id}',
            },
        }
        default_kwargs = {
            'exclude_map': {
                'fields': [
                    'auto-created',
                    'annotations',
                ]
            }
        }

    def create(self, validated_data):
        """Create the field and clear ``auto_created`` on its schema."""
        created = super(FieldSerializer, self).create(validated_data)
        clear_auto_created(created.schema)
        return created

    def update(self, instance, validated_data):
        """Update the field; clear ``auto_created`` on it and its schema."""
        updated = super(FieldSerializer, self).update(instance, validated_data)
        clear_auto_created(updated)
        clear_auto_created(updated.schema)
        return updated

    def delete(self):
        """Clear ``auto_created`` on the schema, then delete the field."""
        owning_schema = self.instance.schema
        clear_auto_created(owning_schema)
        super(FieldSerializer, self).delete()
class ExtractorSerializer(JsonApiSerializer):
    """Serializer for extractors; ``annotations`` is excluded by default."""

    class Meta:
        model = Extractor
        url = '/api/projects/{self.project.id}/extractors/{self.id}'
        links = {
            'project': {
                'related': '/api/projects/{self.project.id}',
            },
        }
        default_kwargs = {
            'exclude_map': {
                'extractors': [
                    'annotations',
                ]
            }
        }
class SpiderSerializer(JsonApiSerializer):
    """Serializer for spiders; ``samples`` is excluded by default."""

    class Meta:
        model = Spider
        url = '/api/projects/{self.project.id}/spiders/{self.id}'
        links = {
            'project': {
                'related': '/api/projects/{self.project.id}',
            },
            'samples': {
                'related': '/api/projects/{self.project.id}/spiders/{self.id}'
                           '/samples',
            },
        }
        default_kwargs = {
            'exclude_map': {
                'spiders': [
                    'samples',
                ]
            }
        }

    def delete(self):
        """Access the project's schemas, then delete the spider."""
        owner = self.instance.project
        owner.schemas  # preload schemas and fields
        super(SpiderSerializer, self).delete()
# Serializer for samples. Creating a sample also creates its first item and,
# when the project has no default schema, an auto-created schema for it.
# NOTE(review): indentation is lost in this dump; confirm the nesting of the
# statements in create() against the original file before restructuring.
class SampleSerializer(JsonApiSerializer):
class Meta:
model = Sample
url = ('/api/projects/{self.spider.project.id}/spiders'
'/{self.spider.id}/samples/{self.id}')
links = {
'spider': {
'related': '/api/projects/{self.spider.project.id}/spiders'
'/{self.spider.id}',
},
'items': {
'related': '/api/projects/{self.spider.project.id}/spiders'
'/{self.spider.id}/samples/{self.id}/items'
'?filter[parent]=null',
},
}
# Attributes left out of sample responses unless a route overrides this.
default_kwargs = {
'exclude_map': {
'samples': [
'page-id',
'page-type',
'original-body',
'annotated-body',
]
}
}
def create(self, validated_data):
sample = super(SampleSerializer, self).create(validated_data)
project = sample.spider.project
schemas = project.schemas
# Use the project's default schema when one exists.
schema = next((s for s in schemas if s.default), None)
if schema is None:
# No default: build an auto-created schema named after the sample, made
# unique against the existing schema names.
schema_names = map(attrgetter('name'), schemas)
schema_name = unique_name(sample.name, schema_names)
schema = Schema(self.storage, id=AUTO_PK, name=schema_name,
project=project, auto_created=True)
schema.save()
# Every new sample gets one item bound to the chosen schema.
item = Item(self.storage, id=AUTO_PK, sample=sample, schema=schema)
item.save()
return sample
def update(self, instance, validated_data):
sample = super(SampleSerializer, self).update(instance, validated_data)
# Every schema of the project is saved again after a sample update.
for schema in sample.spider.project.schemas:
schema.save()
return sample
# Serializer for items within a sample. It creates schemas and names on
# demand, remaps annotations when an item changes schema, and refuses to
# delete the last item of a sample.
# NOTE(review): indentation is lost in this dump; confirm the nesting of the
# statements in update() against the original file before restructuring.
class ItemSerializer(JsonApiSerializer):
class Meta:
model = Item
url = ('/api/projects/{self.owner_sample.spider.project.id}/spiders'
'/{self.owner_sample.spider.id}/samples/{self.owner_sample.id}'
'/items/{self.id}')
links = {
'sample': {
'related': '/api/projects/{self.sample.spider.project.id}'
'/spiders/{self.sample.spider.id}/samples'
'/{self.sample.id}',
},
'parent': {
'related': '/api/projects/{self.owner_sample.spider.project.id}'
'/spiders/{self.owner_sample.spider.id}/samples'
'/{self.owner_sample.id}/items/{self.parent.id}',
},
'schema': {
'related': '/api/projects/{self.owner_sample.spider.project.id}'
'/schemas/{self.schema.id}',
},
'annotations': {
'related': '/api/projects/{self.owner_sample.spider.project.id}'
'/spiders/{self.owner_sample.spider.id}/samples'
'/{self.owner_sample.id}/annotations'
'?filter[parent]={self.id}',
},
}
def create(self, validated_data):
item = super(ItemSerializer, self).create(validated_data)
# An item created without a schema gets a new auto-created schema named
# after its sample, made unique against the project's schema names.
if item.schema is None:
sample = item.owner_sample
project = sample.spider.project
schema_names = map(attrgetter('name'), project.schemas)
schema_name = unique_name(sample.name, schema_names,
initial_suffix=1)
schema = Schema(self.storage, id=AUTO_PK, name=schema_name,
project=project, auto_created=True)
schema.items.add(item)
schema.save()
# A nested item without a name gets a unique 'subitem' based name.
if item.parent and item.name is None:
sample = item.owner_sample
item_names = map(attrgetter('name'), sample.ordered_items)
item.name = unique_name('subitem', item_names, initial_suffix=1)
item.save(only=('name',))
return item
def update(self, instance, validated_data):
# Remember the schema before the update so a change can be detected.
current_schema = instance.schema
instance = super(ItemSerializer, self).update(instance, validated_data)
new_schema = instance.schema
if new_schema != current_schema:
# Move each annotation to the same-named field of the new schema,
# creating an auto-created field there when none exists yet.
field_map = {field.name: field for field in new_schema.fields}
for annotation in instance.annotations:
current_field = annotation.field
if current_field.name in field_map:
new_field = field_map[current_field.name]
clear_auto_created(new_field)
else:
new_field = Field(self.storage, id=AUTO_PK,
name=current_field.name,
type=current_field.type,
schema=new_schema,
auto_created=True)
field_map[new_field.name] = new_field
new_field.save()
annotation.field = new_field
annotation.save(only=('field',))
# Auto-created fields and schemas left behind are deleted; whatever delete()
# returns is appended to self.deleted.
if current_field.auto_created:
self.deleted.extend(current_field.delete())
if current_schema.auto_created:
self.deleted.extend(current_schema.delete())
clear_auto_created(new_schema)
return instance
def delete(self):
instance = self.instance
sample = instance.owner_sample
items = sample.items
# The last remaining item of a sample cannot be deleted.
if len(items) == 1 and items[0] == instance:
raise ProtectedError(
u"Cannot delete item {} because it is the only item in the "
u"sample {}".format(instance, sample))
super(ItemSerializer, self).delete()
# Serializer for annotations. It creates a field on demand for a new
# annotation and cleans up auto-created fields an annotation moves away from.
# NOTE(review): indentation is lost in this dump; confirm the nesting of the
# statements in update() against the original file before restructuring.
class AnnotationSerializer(JsonApiSerializer):
class Meta:
model = Annotation
url = ('/api/projects/{self.owner_sample.spider.project.id}/spiders'
'/{self.owner_sample.spider.id}/samples/{self.owner_sample.id}'
'/annotations/{self.id}')
links = {
'parent': {
'related': '/api/projects'
'/{self.owner_sample.spider.project.id}/spiders'
'/{self.owner_sample.spider.id}/samples'
'/{self.owner_sample.id}/items/{self.parent.id}',
},
'field': {
'related': '/api/projects'
'/{self.owner_sample.spider.project.id}/schemas'
'/{self.parent.schema.id}/fields/{self.field.id}',
},
}
def create(self, validated_data):
annotation = super(AnnotationSerializer, self).create(validated_data)
# An annotation created without a field gets a new auto-created field on
# its parent item's schema, with a unique 'field' based name and a type
# picked by choose_field_type.
if annotation.field is None:
project = annotation.owner_sample.spider.project
project.schemas # preload schemas and fields
item = annotation.parent
schema = item.schema
field_names = map(attrgetter('name'), schema.fields)
field_name = unique_name('field', field_names, initial_suffix=1)
field = Field(self.storage, id=AUTO_PK, name=field_name,
type=choose_field_type(annotation), schema=schema,
auto_created=True)
field.annotations.add(annotation)
field.save()
return annotation
def update(self, instance, validated_data):
# Remember the field before the update so a change can be detected.
current_field = instance.field
instance = super(AnnotationSerializer, self).update(
instance, validated_data)
new_field = instance.field
if new_field != current_field:
# An auto-created field the annotation moved away from is deleted;
# whatever delete() returns is appended to self.deleted.
if current_field.auto_created:
self.deleted.extend(current_field.delete())
clear_auto_created(new_field)
return instance
class RenderedBodySerializer(JsonApiSerializer):
    """Serializer for a sample's rendered body, linked back to the sample."""

    class Meta:
        model = RenderedBody
        url = ('/api/projects/{self.sample.spider.project.id}/'
               'spiders/{self.sample.spider.id}/samples/'
               '{self.sample.id}/rendered_body')
        links = {
            'sample': {
                'related': ('/api/projects/{self.sample.spider.project.id}/'
                            'spiders/{self.sample.spider.id}/samples/'
                            '{self.sample.id}'),
            },
        }
class OriginalBodySerializ
gitextract_l8nvd49y/
├── .dockerignore
├── .drone.yml
├── .editorconfig
├── .gitattributes
├── .gitignore
├── .jshintrc
├── .travis.yml
├── CHANGES
├── Dockerfile
├── LICENSE
├── README.md
├── VERSION
├── Vagrantfile
├── bin/
│ └── bump_version.py
├── docker/
│ ├── compile-assets.sh
│ ├── entry
│ ├── nginx/
│ │ ├── nginx.conf
│ │ ├── proxy_portia_server.conf
│ │ └── proxy_slyd.conf
│ ├── portia.conf
│ ├── provision.sh
│ ├── qt_install.qs
│ ├── restore-mtime.sh
│ └── run-tests.sh
├── docker-compose.yml
├── docs/
│ ├── Makefile
│ ├── conf.py
│ ├── examples.rst
│ ├── faq.rst
│ ├── getting-started.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── items.rst
│ ├── make.bat
│ ├── projects.rst
│ ├── samples.rst
│ └── spiders.rst
├── portia_server/
│ ├── db_repo/
│ │ ├── __init__.py
│ │ ├── apps.py
│ │ ├── migrations/
│ │ │ ├── 0001_initial.py
│ │ │ ├── __init__.py
│ │ │ └── slyd_to_django.sql
│ │ ├── models.py
│ │ └── repo.py
│ ├── manage.py
│ ├── portia_api/
│ │ ├── __init__.py
│ │ ├── apps.py
│ │ ├── errors.py
│ │ ├── jsonapi/
│ │ │ ├── __init__.py
│ │ │ ├── exceptions.py
│ │ │ ├── parsers.py
│ │ │ ├── registry.py
│ │ │ ├── relationships.py
│ │ │ ├── renderers.py
│ │ │ ├── response.py
│ │ │ ├── serializers.py
│ │ │ └── utils.py
│ │ ├── resources/
│ │ │ ├── __init__.py
│ │ │ ├── annotations.py
│ │ │ ├── extractors.py
│ │ │ ├── fields.py
│ │ │ ├── items.py
│ │ │ ├── models.py
│ │ │ ├── projects.py
│ │ │ ├── response.py
│ │ │ ├── route.py
│ │ │ ├── samples.py
│ │ │ ├── schemas.py
│ │ │ ├── serializers.py
│ │ │ └── spiders.py
│ │ ├── routers.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ └── test_routes.py
│ │ ├── urls.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── annotations.py
│ │ ├── copy.py
│ │ ├── deploy/
│ │ │ ├── base.py
│ │ │ ├── package.py
│ │ │ ├── scrapinghub.py
│ │ │ └── scrapyd.py
│ │ ├── download.py
│ │ ├── extract.py
│ │ ├── projects.py
│ │ └── spiders.py
│ ├── portia_orm/
│ │ ├── __init__.py
│ │ ├── apps.py
│ │ ├── base.py
│ │ ├── collection.py
│ │ ├── datastore.py
│ │ ├── decorators.py
│ │ ├── deletion.py
│ │ ├── exceptions.py
│ │ ├── fields.py
│ │ ├── middleware.py
│ │ ├── models.py
│ │ ├── registry.py
│ │ ├── relationships.py
│ │ ├── serializers.py
│ │ ├── snapshots.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── models.py
│ │ │ ├── test_basic.py
│ │ │ ├── test_collection.py
│ │ │ ├── test_model.py
│ │ │ ├── test_relationship.py
│ │ │ └── utils.py
│ │ ├── utils.py
│ │ └── validators.py
│ ├── portia_server/
│ │ ├── __init__.py
│ │ ├── backends.py
│ │ ├── models.py
│ │ ├── settings.py
│ │ ├── urls.py
│ │ ├── views.py
│ │ └── wsgi.py
│ ├── requirements.txt
│ └── storage/
│ ├── __init__.py
│ ├── apps.py
│ ├── backends.py
│ ├── jsondiff.py
│ ├── projecttemplates.py
│ └── repoman.py
├── portiaui/
│ ├── .bowerrc
│ ├── .editorconfig
│ ├── .ember-cli
│ ├── .gitignore
│ ├── .jshintrc
│ ├── .watchmanconfig
│ ├── app/
│ │ ├── adapters/
│ │ │ ├── application.js
│ │ │ └── project.js
│ │ ├── app.js
│ │ ├── components/
│ │ │ ├── .gitkeep
│ │ │ ├── add-start-url-button.js
│ │ │ ├── animation-container.js
│ │ │ ├── annotation-options.js
│ │ │ ├── browser-iframe.js
│ │ │ ├── browser-url-blocked.js
│ │ │ ├── browser-url-failing.js
│ │ │ ├── browser-view-port.js
│ │ │ ├── buffered-input.js
│ │ │ ├── colored-badge.js
│ │ │ ├── colored-span.js
│ │ │ ├── combo-box.js
│ │ │ ├── create-project-button.js
│ │ │ ├── create-spider-button.js
│ │ │ ├── data-structure-annotations.js
│ │ │ ├── data-structure-listing.js
│ │ │ ├── dropdown-delete.js
│ │ │ ├── dropdown-divider.js
│ │ │ ├── dropdown-header.js
│ │ │ ├── dropdown-item.js
│ │ │ ├── dropdown-menu.js
│ │ │ ├── dropdown-widget.js
│ │ │ ├── edit-sample-button.js
│ │ │ ├── element-overlay.js
│ │ │ ├── element-rect-overlay.js
│ │ │ ├── extracted-item-table.js
│ │ │ ├── extracted-items-group.js
│ │ │ ├── extracted-items-json-panel.js
│ │ │ ├── extracted-items-json-value.js
│ │ │ ├── extracted-items-json.js
│ │ │ ├── extracted-items-panel.js
│ │ │ ├── extracted-items-status.js
│ │ │ ├── extracted-items-tab.js
│ │ │ ├── extractor-options.js
│ │ │ ├── feed-url-options.js
│ │ │ ├── field-options.js
│ │ │ ├── fragment-options.js
│ │ │ ├── generated-url-options.js
│ │ │ ├── help-icon.js
│ │ │ ├── icon-button.js
│ │ │ ├── indentation-spacer.js
│ │ │ ├── input-with-clear.js
│ │ │ ├── inspector-panel.js
│ │ │ ├── link-crawling-options.js
│ │ │ ├── list-item-add-annotation-menu.js
│ │ │ ├── list-item-annotation-field.js
│ │ │ ├── list-item-badge.js
│ │ │ ├── list-item-combo.js
│ │ │ ├── list-item-editable.js
│ │ │ ├── list-item-field-type.js
│ │ │ ├── list-item-icon-menu.js
│ │ │ ├── list-item-icon.js
│ │ │ ├── list-item-item-schema.js
│ │ │ ├── list-item-link-crawling.js
│ │ │ ├── list-item-relation-manager.js
│ │ │ ├── list-item-selectable.js
│ │ │ ├── list-item-text.js
│ │ │ ├── notification-container.js
│ │ │ ├── notification-message.js
│ │ │ ├── page-actions-editor.js
│ │ │ ├── project-list.js
│ │ │ ├── project-listing.js
│ │ │ ├── project-structure-listing.js
│ │ │ ├── project-structure-spider-feed-url.js
│ │ │ ├── project-structure-spider-generated-url.js
│ │ │ ├── project-structure-spider-url.js
│ │ │ ├── regex-pattern-list.js
│ │ │ ├── reorder-handler.js
│ │ │ ├── save-status.js
│ │ │ ├── schema-structure-listing.js
│ │ │ ├── scrapinghub-links.js
│ │ │ ├── select-box.js
│ │ │ ├── show-links-button.js
│ │ │ ├── show-links-legend.js
│ │ │ ├── sliding-main.js
│ │ │ ├── spider-indentation.js
│ │ │ ├── spider-message.js
│ │ │ ├── spider-options.js
│ │ │ ├── spider-row.js
│ │ │ ├── spider-structure-listing.js
│ │ │ ├── start-url-options.js
│ │ │ ├── tool-group.js
│ │ │ ├── tool-panel.js
│ │ │ ├── tool-tab.js
│ │ │ ├── tooltip-container.js
│ │ │ ├── tooltip-icon.js
│ │ │ ├── tree-list-item-row.js
│ │ │ ├── tree-list-item.js
│ │ │ ├── tree-list.js
│ │ │ └── url-bar.js
│ │ ├── controllers/
│ │ │ ├── .gitkeep
│ │ │ └── projects/
│ │ │ ├── project/
│ │ │ │ ├── conflicts/
│ │ │ │ │ └── conflict.js
│ │ │ │ ├── conflicts.js
│ │ │ │ ├── schema/
│ │ │ │ │ └── field/
│ │ │ │ │ └── options.js
│ │ │ │ ├── spider/
│ │ │ │ │ ├── link-options.js
│ │ │ │ │ ├── options.js
│ │ │ │ │ └── sample/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ └── annotation/
│ │ │ │ │ │ └── options.js
│ │ │ │ │ └── data.js
│ │ │ │ └── spider.js
│ │ │ └── project.js
│ │ ├── helpers/
│ │ │ ├── .gitkeep
│ │ │ ├── array-get.js
│ │ │ ├── attribute-annotation.js
│ │ │ ├── chain-actions.js
│ │ │ ├── guid.js
│ │ │ ├── includes.js
│ │ │ ├── indexed-object.js
│ │ │ ├── is-empty-object.js
│ │ │ ├── is-object-or-array.js
│ │ │ └── is-object.js
│ │ ├── index.html
│ │ ├── initializers/
│ │ │ └── ui-state.js
│ │ ├── instance-initializers/
│ │ │ └── error-handler.js
│ │ ├── mixins/
│ │ │ ├── options-route.js
│ │ │ └── save-spider-mixin.js
│ │ ├── models/
│ │ │ ├── .gitkeep
│ │ │ ├── annotation.js
│ │ │ ├── base-annotation.js
│ │ │ ├── base.js
│ │ │ ├── extractor.js
│ │ │ ├── field.js
│ │ │ ├── item.js
│ │ │ ├── project.js
│ │ │ ├── sample.js
│ │ │ ├── schema.js
│ │ │ ├── spider.js
│ │ │ └── start-url.js
│ │ ├── resolver.js
│ │ ├── router.js
│ │ ├── routes/
│ │ │ ├── .gitkeep
│ │ │ ├── application.js
│ │ │ ├── browsers.js
│ │ │ ├── index.js
│ │ │ ├── projects/
│ │ │ │ ├── project/
│ │ │ │ │ ├── compatibility.js
│ │ │ │ │ ├── conflicts/
│ │ │ │ │ │ └── conflict.js
│ │ │ │ │ ├── conflicts.js
│ │ │ │ │ ├── schema/
│ │ │ │ │ │ ├── field/
│ │ │ │ │ │ │ └── options.js
│ │ │ │ │ │ └── field.js
│ │ │ │ │ ├── schema.js
│ │ │ │ │ ├── spider/
│ │ │ │ │ │ ├── link-options.js
│ │ │ │ │ │ ├── options.js
│ │ │ │ │ │ ├── sample/
│ │ │ │ │ │ │ ├── data/
│ │ │ │ │ │ │ │ ├── annotation/
│ │ │ │ │ │ │ │ │ └── options.js
│ │ │ │ │ │ │ │ ├── annotation.js
│ │ │ │ │ │ │ │ └── item.js
│ │ │ │ │ │ │ ├── data.js
│ │ │ │ │ │ │ └── index.js
│ │ │ │ │ │ ├── sample.js
│ │ │ │ │ │ ├── start-url/
│ │ │ │ │ │ │ └── options.js
│ │ │ │ │ │ └── start-url.js
│ │ │ │ │ └── spider.js
│ │ │ │ └── project.js
│ │ │ └── projects.js
│ │ ├── serializers/
│ │ │ └── application.js
│ │ ├── services/
│ │ │ ├── annotation-structure.js
│ │ │ ├── browser.js
│ │ │ ├── capabilities.js
│ │ │ ├── changes.js
│ │ │ ├── clock.js
│ │ │ ├── dispatcher.js
│ │ │ ├── extracted-items.js
│ │ │ ├── notification-manager.js
│ │ │ ├── overlays.js
│ │ │ ├── position-monitor.js
│ │ │ ├── saving-notification.js
│ │ │ ├── selector-matcher.js
│ │ │ ├── store.js
│ │ │ ├── ui-state.js
│ │ │ └── web-socket.js
│ │ ├── storages/
│ │ │ ├── cookies.js
│ │ │ ├── page-loads.js
│ │ │ ├── ui-state-collapsed-panels.js
│ │ │ └── ui-state-selected-tools.js
│ │ ├── styles/
│ │ │ ├── _animations.scss
│ │ │ ├── _bootstrap_overrides.scss
│ │ │ ├── _icons.scss
│ │ │ ├── _lib_config.scss
│ │ │ ├── _variables.scss
│ │ │ ├── app.scss
│ │ │ ├── components/
│ │ │ │ ├── animation-container.scss
│ │ │ │ ├── browser-iframe.scss
│ │ │ │ ├── browser-view-port.scss
│ │ │ │ ├── combo-box.scss
│ │ │ │ ├── conflicts.scss
│ │ │ │ ├── dropdown-delete.scss
│ │ │ │ ├── dropdown-menu.scss
│ │ │ │ ├── dropdown-widget.scss
│ │ │ │ ├── extracted-item-table.scss
│ │ │ │ ├── extracted-items-json-panel.scss
│ │ │ │ ├── extractor-options.scss
│ │ │ │ ├── fragment-options.scss
│ │ │ │ ├── help-icon.scss
│ │ │ │ ├── icon-button.scss
│ │ │ │ ├── indentation-spacer.scss
│ │ │ │ ├── input-with-clear.scss
│ │ │ │ ├── inspector-panel.scss
│ │ │ │ ├── list-item-badge.scss
│ │ │ │ ├── list-item-combo.scss
│ │ │ │ ├── list-item-editable.scss
│ │ │ │ ├── list-item-icon.scss
│ │ │ │ ├── list-item-selectable.scss
│ │ │ │ ├── list-item-text.scss
│ │ │ │ ├── notifications.scss
│ │ │ │ ├── page-actions.scss
│ │ │ │ ├── project-structure-spider-generation-url.scss
│ │ │ │ ├── regex-pattern-list.scss
│ │ │ │ ├── save-status.scss
│ │ │ │ ├── select-box.scss
│ │ │ │ ├── show-links-legend.scss
│ │ │ │ ├── side-bar.scss
│ │ │ │ ├── sliding-main.scss
│ │ │ │ ├── start-url-options.scss
│ │ │ │ ├── tool-group.scss
│ │ │ │ ├── tool-panel.scss
│ │ │ │ ├── tooltip-container.scss
│ │ │ │ ├── top-bar.scss
│ │ │ │ ├── tree-list.scss
│ │ │ │ └── url-bar.scss
│ │ │ ├── document.scss
│ │ │ ├── droplet.scss
│ │ │ ├── generic.scss
│ │ │ ├── layout/
│ │ │ │ ├── _clickable.scss
│ │ │ │ ├── _forms.scss
│ │ │ │ └── _full-page-content.scss
│ │ │ └── templates/
│ │ │ ├── application.scss
│ │ │ ├── browsers.scss
│ │ │ └── projects.scss
│ │ ├── templates/
│ │ │ ├── application.hbs
│ │ │ ├── branding.hbs
│ │ │ ├── browsers.hbs
│ │ │ ├── components/
│ │ │ │ ├── .gitkeep
│ │ │ │ ├── add-start-url-button.hbs
│ │ │ │ ├── animation-container.hbs
│ │ │ │ ├── annotation-options.hbs
│ │ │ │ ├── browser-iframe.hbs
│ │ │ │ ├── browser-list.hbs
│ │ │ │ ├── browser-url-blocked.hbs
│ │ │ │ ├── browser-url-failing.hbs
│ │ │ │ ├── browser-view-port.hbs
│ │ │ │ ├── buffered-input.hbs
│ │ │ │ ├── colored-badge.hbs
│ │ │ │ ├── colored-span.hbs
│ │ │ │ ├── combo-box.hbs
│ │ │ │ ├── create-project-button.hbs
│ │ │ │ ├── create-spider-button.hbs
│ │ │ │ ├── data-structure-annotations.hbs
│ │ │ │ ├── data-structure-listing.hbs
│ │ │ │ ├── dropdown-delete.hbs
│ │ │ │ ├── dropdown-divider.hbs
│ │ │ │ ├── dropdown-header.hbs
│ │ │ │ ├── dropdown-item.hbs
│ │ │ │ ├── dropdown-menu.hbs
│ │ │ │ ├── dropdown-widget.hbs
│ │ │ │ ├── edit-sample-button.hbs
│ │ │ │ ├── element-overlay.hbs
│ │ │ │ ├── element-rect-overlay.hbs
│ │ │ │ ├── extracted-item-table.hbs
│ │ │ │ ├── extracted-items-group.hbs
│ │ │ │ ├── extracted-items-json-panel.hbs
│ │ │ │ ├── extracted-items-json-value.hbs
│ │ │ │ ├── extracted-items-json.hbs
│ │ │ │ ├── extracted-items-panel.hbs
│ │ │ │ ├── extracted-items-status.hbs
│ │ │ │ ├── extracted-items-tab.hbs
│ │ │ │ ├── extractor-options.hbs
│ │ │ │ ├── feed-url-options.hbs
│ │ │ │ ├── field-options.hbs
│ │ │ │ ├── fragment-options.hbs
│ │ │ │ ├── generated-url-options.hbs
│ │ │ │ ├── help-icon.hbs
│ │ │ │ ├── icon-button.hbs
│ │ │ │ ├── input-with-clear.hbs
│ │ │ │ ├── inspector-panel.hbs
│ │ │ │ ├── json-file-compare.hbs
│ │ │ │ ├── link-crawling-options.hbs
│ │ │ │ ├── list-item-add-annotation-menu.hbs
│ │ │ │ ├── list-item-annotation-field.hbs
│ │ │ │ ├── list-item-badge.hbs
│ │ │ │ ├── list-item-combo.hbs
│ │ │ │ ├── list-item-editable.hbs
│ │ │ │ ├── list-item-field-type.hbs
│ │ │ │ ├── list-item-icon-menu.hbs
│ │ │ │ ├── list-item-icon.hbs
│ │ │ │ ├── list-item-item-schema.hbs
│ │ │ │ ├── list-item-link-crawling.hbs
│ │ │ │ ├── list-item-relation-manager.hbs
│ │ │ │ ├── list-item-selectable.hbs
│ │ │ │ ├── list-item-text.hbs
│ │ │ │ ├── notification-container.hbs
│ │ │ │ ├── notification-message.hbs
│ │ │ │ ├── page-actions-editor.hbs
│ │ │ │ ├── project-list.hbs
│ │ │ │ ├── project-listing.hbs
│ │ │ │ ├── project-structure-listing.hbs
│ │ │ │ ├── project-structure-spider-feed-url.hbs
│ │ │ │ ├── project-structure-spider-generated-url.hbs
│ │ │ │ ├── project-structure-spider-url.hbs
│ │ │ │ ├── regex-pattern-list.hbs
│ │ │ │ ├── save-status.hbs
│ │ │ │ ├── schema-structure-listing.hbs
│ │ │ │ ├── scrapinghub-links.hbs
│ │ │ │ ├── select-box.hbs
│ │ │ │ ├── show-links-button.hbs
│ │ │ │ ├── show-links-legend.hbs
│ │ │ │ ├── sliding-main.hbs
│ │ │ │ ├── spider-indentation.hbs
│ │ │ │ ├── spider-message.hbs
│ │ │ │ ├── spider-options.hbs
│ │ │ │ ├── spider-row.hbs
│ │ │ │ ├── spider-structure-listing.hbs
│ │ │ │ ├── start-url-options.hbs
│ │ │ │ ├── tool-group.hbs
│ │ │ │ ├── tool-panel.hbs
│ │ │ │ ├── tool-tab.hbs
│ │ │ │ ├── tooltip-container.hbs
│ │ │ │ ├── tooltip-icon.hbs
│ │ │ │ ├── tree-list-item-row.hbs
│ │ │ │ ├── tree-list-item.hbs
│ │ │ │ ├── tree-list.hbs
│ │ │ │ └── url-bar.hbs
│ │ │ ├── options-panels.hbs
│ │ │ ├── projects/
│ │ │ │ ├── project/
│ │ │ │ │ ├── conflicts/
│ │ │ │ │ │ ├── file-selector.hbs
│ │ │ │ │ │ ├── help.hbs
│ │ │ │ │ │ ├── resolver.hbs
│ │ │ │ │ │ └── topbar.hbs
│ │ │ │ │ ├── schema/
│ │ │ │ │ │ ├── field/
│ │ │ │ │ │ │ └── options.hbs
│ │ │ │ │ │ ├── field.hbs
│ │ │ │ │ │ └── structure.hbs
│ │ │ │ │ ├── schema.hbs
│ │ │ │ │ ├── spider/
│ │ │ │ │ │ ├── link-options.hbs
│ │ │ │ │ │ ├── options.hbs
│ │ │ │ │ │ ├── overlays.hbs
│ │ │ │ │ │ ├── sample/
│ │ │ │ │ │ │ ├── annotation/
│ │ │ │ │ │ │ │ └── selection.hbs
│ │ │ │ │ │ │ ├── data/
│ │ │ │ │ │ │ │ ├── annotation/
│ │ │ │ │ │ │ │ │ └── options.hbs
│ │ │ │ │ │ │ │ ├── annotation.hbs
│ │ │ │ │ │ │ │ ├── item.hbs
│ │ │ │ │ │ │ │ ├── overlays.hbs
│ │ │ │ │ │ │ │ ├── structure.hbs
│ │ │ │ │ │ │ │ ├── toolbar.hbs
│ │ │ │ │ │ │ │ └── tools.hbs
│ │ │ │ │ │ │ ├── data.hbs
│ │ │ │ │ │ │ ├── item.hbs
│ │ │ │ │ │ │ ├── structure.hbs
│ │ │ │ │ │ │ └── toolbar.hbs
│ │ │ │ │ │ ├── sample.hbs
│ │ │ │ │ │ ├── start-url/
│ │ │ │ │ │ │ └── options.hbs
│ │ │ │ │ │ ├── structure.hbs
│ │ │ │ │ │ ├── toolbar.hbs
│ │ │ │ │ │ └── tools.hbs
│ │ │ │ │ ├── spider.hbs
│ │ │ │ │ ├── structure.hbs
│ │ │ │ │ └── toolbar.hbs
│ │ │ │ └── project.hbs
│ │ │ ├── projects.hbs
│ │ │ └── tool-panels.hbs
│ │ ├── transforms/
│ │ │ ├── array.js
│ │ │ ├── json.js
│ │ │ └── start-url.js
│ │ ├── utils/
│ │ │ ├── attrs.js
│ │ │ ├── browser-features.js
│ │ │ ├── colors.js
│ │ │ ├── computed.js
│ │ │ ├── ensure-promise.js
│ │ │ ├── interaction-event.js
│ │ │ ├── promises.js
│ │ │ ├── selectors.js
│ │ │ ├── start-urls.js
│ │ │ ├── tree-mirror-delegate.js
│ │ │ ├── types.js
│ │ │ └── utils.js
│ │ ├── validations/
│ │ │ ├── fixed-fragment.js
│ │ │ ├── list-fragment.js
│ │ │ └── range-fragment.js
│ │ └── validators/
│ │ ├── range.js
│ │ └── whitespace.js
│ ├── bower.json
│ ├── config/
│ │ ├── deprecation-workflow.js
│ │ ├── environment-development.js
│ │ ├── environment-production.js
│ │ ├── environment-test.js
│ │ └── environment.js
│ ├── ember-cli-build.js
│ ├── package.json
│ ├── public/
│ │ ├── crossdomain.xml
│ │ ├── empty-frame.html
│ │ ├── frames-not-supported.html
│ │ └── robots.txt
│ ├── testem.js
│ ├── tests/
│ │ ├── .jshintrc
│ │ ├── helpers/
│ │ │ ├── destroy-app.js
│ │ │ ├── module-for-acceptance.js
│ │ │ ├── resolver.js
│ │ │ └── start-app.js
│ │ ├── index.html
│ │ ├── test-helper.js
│ │ └── unit/
│ │ ├── .gitkeep
│ │ ├── models/
│ │ │ └── start-url-test.js
│ │ ├── utils/
│ │ │ ├── selectors-test.js
│ │ │ └── start-urls-test.js
│ │ └── validators/
│ │ ├── range-test.js
│ │ └── whitespace-test.js
│ └── vendor/
│ ├── .gitkeep
│ ├── modernizr.js
│ ├── mutation-summary.js
│ └── tree-mirror.js
├── slybot/
│ ├── .gitignore
│ ├── CHANGES
│ ├── MANIFEST.in
│ ├── Makefile.buildbot
│ ├── README.rst
│ ├── bin/
│ │ ├── makedeb
│ │ ├── portiacrawl
│ │ └── slybot
│ ├── debian/
│ │ ├── changelog
│ │ ├── compat
│ │ ├── control
│ │ ├── copyright
│ │ ├── pyversions
│ │ └── rules
│ ├── docs/
│ │ ├── Makefile
│ │ ├── conf.py
│ │ ├── index.rst
│ │ ├── make.bat
│ │ ├── project.rst
│ │ └── spiderlets.rst
│ ├── requirements-clustering.txt
│ ├── requirements-test.txt
│ ├── requirements.txt
│ ├── scrapy.cfg
│ ├── setup.py
│ ├── slybot/
│ │ ├── __init__.py
│ │ ├── baseurl.py
│ │ ├── closespider.py
│ │ ├── clustering.py
│ │ ├── dupefilter.py
│ │ ├── exporter.py
│ │ ├── extractors.py
│ │ ├── fieldtypes/
│ │ │ ├── __init__.py
│ │ │ ├── date.py
│ │ │ ├── images.py
│ │ │ ├── number.py
│ │ │ ├── point.py
│ │ │ ├── price.py
│ │ │ ├── text.py
│ │ │ └── url.py
│ │ ├── generic_form.py
│ │ ├── item.py
│ │ ├── linkextractor/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── ecsv.py
│ │ │ ├── html.py
│ │ │ ├── pagination.py
│ │ │ ├── regex.py
│ │ │ └── xml.py
│ │ ├── meta.py
│ │ ├── pageactions.py
│ │ ├── plugins/
│ │ │ ├── __init__.py
│ │ │ ├── scrapely_annotations/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── annotations.py
│ │ │ │ ├── builder.py
│ │ │ │ ├── exceptions.py
│ │ │ │ ├── extraction/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── container_extractors.py
│ │ │ │ │ ├── extractors.py
│ │ │ │ │ ├── pageparsing.py
│ │ │ │ │ ├── region_extractors.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── migration.py
│ │ │ │ ├── processors.py
│ │ │ │ └── utils.py
│ │ │ └── selectors/
│ │ │ └── __init__.py
│ │ ├── settings.py
│ │ ├── spider.py
│ │ ├── spiderlets.py
│ │ ├── spidermanager.py
│ │ ├── splash.py
│ │ ├── starturls/
│ │ │ ├── __init__.py
│ │ │ ├── feed_generator.py
│ │ │ ├── fragment_generator.py
│ │ │ ├── generated_url.py
│ │ │ └── generator.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── data/
│ │ │ │ ├── SampleProject/
│ │ │ │ │ ├── extractors.json
│ │ │ │ │ ├── items.json
│ │ │ │ │ ├── project.json
│ │ │ │ │ └── spiders/
│ │ │ │ │ ├── allowed_domains.json
│ │ │ │ │ ├── any_allowed_domains.json
│ │ │ │ │ ├── books.toscrape.com/
│ │ │ │ │ │ ├── 3617-44af-a2f0/
│ │ │ │ │ │ │ └── original_body.html
│ │ │ │ │ │ ├── 3617-44af-a2f0.json
│ │ │ │ │ │ ├── 3652-4fa1-a912.json
│ │ │ │ │ │ ├── 4583-41b4-9edb/
│ │ │ │ │ │ │ └── original_body.html
│ │ │ │ │ │ └── 4583-41b4-9edb.json
│ │ │ │ │ ├── books.toscrape.com.json
│ │ │ │ │ ├── books.toscrape.com_1.json
│ │ │ │ │ ├── cargurus.json
│ │ │ │ │ ├── ebay.json
│ │ │ │ │ ├── ebay2.json
│ │ │ │ │ ├── ebay3.json
│ │ │ │ │ ├── ebay4.json
│ │ │ │ │ ├── example.com.json
│ │ │ │ │ ├── example2.com.json
│ │ │ │ │ ├── example3.com.json
│ │ │ │ │ ├── example4.com.json
│ │ │ │ │ ├── networkhealth.com/
│ │ │ │ │ │ ├── networkhealthtemplate/
│ │ │ │ │ │ │ ├── annotated_body.html
│ │ │ │ │ │ │ └── original_body.html
│ │ │ │ │ │ └── networkhealthtemplate.json
│ │ │ │ │ ├── networkhealth.com.json
│ │ │ │ │ ├── pinterest.com.json
│ │ │ │ │ ├── seedsofchange.com.json
│ │ │ │ │ ├── seedsofchange.json
│ │ │ │ │ ├── seedsofchange2.json
│ │ │ │ │ └── sitemaps.json
│ │ │ │ ├── atom_sample.xml
│ │ │ │ ├── ebay_advanced_search.html
│ │ │ │ ├── pinterest.html
│ │ │ │ ├── rss_sample.xml
│ │ │ │ ├── sitemap_sample.xml
│ │ │ │ ├── templates/
│ │ │ │ │ ├── 411_list.json
│ │ │ │ │ ├── autoevolution.html
│ │ │ │ │ ├── autoevolution.json
│ │ │ │ │ ├── autoevolution2.json
│ │ │ │ │ ├── cars.com.json
│ │ │ │ │ ├── cars.com_nested.json
│ │ │ │ │ ├── cs-cart.json
│ │ │ │ │ ├── daft_ie.html
│ │ │ │ │ ├── daft_list.json
│ │ │ │ │ ├── firmen.wko.at.html
│ │ │ │ │ ├── firmen.wko.at.json
│ │ │ │ │ ├── hn.html
│ │ │ │ │ ├── patchofland.html
│ │ │ │ │ ├── so_annotations.json
│ │ │ │ │ ├── stack_overflow.html
│ │ │ │ │ ├── stips.co.il.html
│ │ │ │ │ ├── stips.co.il.json
│ │ │ │ │ └── xceed.json
│ │ │ │ └── test_params.txt
│ │ │ ├── test_baseurl.py
│ │ │ ├── test_dropmeta.py
│ │ │ ├── test_dupefilter.py
│ │ │ ├── test_extraction_speed.py
│ │ │ ├── test_extractors.py
│ │ │ ├── test_fieldtypes.py
│ │ │ ├── test_fragment_generator.py
│ │ │ ├── test_generic_form.py
│ │ │ ├── test_linkextractors.py
│ │ │ ├── test_migration.py
│ │ │ ├── test_multiple_item_extraction.py
│ │ │ ├── test_page_actions.py
│ │ │ ├── test_schema_validation.py
│ │ │ ├── test_selectors.py
│ │ │ ├── test_spider.py
│ │ │ ├── test_starturls.py
│ │ │ ├── test_starturls_generator.py
│ │ │ └── utils.py
│ │ ├── utils.py
│ │ └── validation/
│ │ ├── __init__.py
│ │ ├── schema.py
│ │ └── schemas.json
│ └── tox.ini
├── slyd/
│ ├── .gitignore
│ ├── .jshintrc
│ ├── README.md
│ ├── bin/
│ │ ├── init_mysql_db
│ │ ├── sh2sly
│ │ └── slyd
│ ├── requirements.txt
│ ├── setup.py
│ ├── slyd/
│ │ ├── __init__.py
│ │ ├── authmanager.py
│ │ ├── dummyauth.py
│ │ ├── errors.py
│ │ ├── gitstorage/
│ │ │ ├── __init__.py
│ │ │ ├── jsondiff.py
│ │ │ ├── projects.py
│ │ │ └── projectspec.py
│ │ ├── html_utils.py
│ │ ├── projects.py
│ │ ├── projectspec.py
│ │ ├── resource.py
│ │ ├── server.py
│ │ ├── settings/
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ ├── specmanager.py
│ │ ├── splash/
│ │ │ ├── __init__.py
│ │ │ ├── commands.py
│ │ │ ├── cookies.py
│ │ │ ├── css_utils.py
│ │ │ ├── ferry.py
│ │ │ ├── proxy.py
│ │ │ ├── qtutils.py
│ │ │ └── utils.py
│ │ └── tap.py
│ └── twisted/
│ └── plugins/
│ └── slyd_plugin.py
└── splash_utils/
├── compile_slybot.sh
├── filters/
│ └── easylist.txt
├── perform_actions.js
├── waitAsync.js
└── z_inject_this.js
Showing preview only (207K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (2806 symbols across 307 files)
FILE: bin/bump_version.py
function next_version (line 8) | def next_version(version_file):
function bump_version_file (line 21) | def bump_version_file(filename=None):
FILE: portia_server/db_repo/apps.py
class DbRepoConfig (line 6) | class DbRepoConfig(AppConfig):
FILE: portia_server/db_repo/migrations/0001_initial.py
class Migration (line 9) | class Migration(migrations.Migration):
FILE: portia_server/db_repo/migrations/slyd_to_django.sql
type `objs_599dcce2` (line 8) | CREATE INDEX `objs_599dcce2` ON `objs` (`type`)
type `objs_f7bd60b7` (line 9) | CREATE INDEX `objs_f7bd60b7` ON `objs` (`size`)
type `refs_2063c160` (line 17) | CREATE INDEX `refs_2063c160` ON `refs` (`value`)
FILE: portia_server/db_repo/models.py
class PositiveTinyIntegerField (line 8) | class PositiveTinyIntegerField(PositiveSmallIntegerField):
method db_type (line 9) | def db_type(self, connection):
class PositiveBigIntegerField (line 16) | class PositiveBigIntegerField(BigIntegerField):
method db_type (line 17) | def db_type(self, connection):
class RealBinaryField (line 24) | class RealBinaryField(BinaryField):
method db_type (line 25) | def db_type(self, connection):
class CompressedBinaryField (line 32) | class CompressedBinaryField(BinaryField):
method get_db_prep_save (line 33) | def get_db_prep_save(self, value, connection):
method select_format (line 40) | def select_format(self, compiler, sql, params):
class Objs (line 48) | class Objs(Model):
class Meta (line 55) | class Meta(object):
class Refs (line 60) | class Refs(Model):
class Meta (line 65) | class Meta(object):
FILE: portia_server/db_repo/repo.py
class MysqlObjectStore (line 13) | class MysqlObjectStore(BaseObjectStore):
method __init__ (line 16) | def __init__(self, repo):
method _to_hexsha (line 27) | def _to_hexsha(self, sha):
method _has_sha (line 35) | def _has_sha(self, sha):
method _all_shas (line 39) | def _all_shas(self):
method contains_loose (line 44) | def contains_loose(self, sha):
method __iter__ (line 48) | def __iter__(self):
method get_raw (line 52) | def get_raw(self, name):
method add_object (line 67) | def add_object(self, obj):
method delete_objects (line 77) | def delete_objects(self, object_ids):
class MysqlRefsContainer (line 81) | class MysqlRefsContainer(RefsContainer):
method __init__ (line 86) | def __init__(self, repo):
method allkeys (line 92) | def allkeys(self):
method read_loose_ref (line 96) | def read_loose_ref(self, name):
method set_symbolic_ref (line 107) | def set_symbolic_ref(self, name, other):
method set_if_equals (line 110) | def set_if_equals(self, name, old_ref, new_ref):
method add_if_new (line 119) | def add_if_new(self, name, ref):
method remove_if_equals (line 125) | def remove_if_equals(self, name, old_ref):
method _update_ref (line 131) | def _update_ref(self, name, value):
method _remove_ref (line 136) | def _remove_ref(self, name):
class MysqlRepo (line 140) | class MysqlRepo(BaseRepo):
method __init__ (line 147) | def __init__(self, name):
method head (line 155) | def head(self):
method init_bare (line 160) | def init_bare(cls, name):
method open (line 165) | def open(cls, name):
method repo_exists (line 170) | def repo_exists(cls, name):
method list_repos (line 175) | def list_repos(cls):
method delete_repo (line 180) | def delete_repo(cls, name):
FILE: portia_server/portia_api/apps.py
class PortiaApiConfig (line 6) | class PortiaApiConfig(AppConfig):
FILE: portia_server/portia_api/errors.py
class BaseError (line 1) | class BaseError(Exception):
method __init__ (line 2) | def __init__(self, status, title, body=''):
method title (line 8) | def title(self):
method body (line 12) | def body(self):
method status (line 16) | def status(self):
method __repr__ (line 19) | def __repr__(self):
method __str__ (line 22) | def __str__(self):
class BaseHTTPError (line 26) | class BaseHTTPError(BaseError):
method __init__ (line 29) | def __init__(self, title, body=''):
class BadRequest (line 33) | class BadRequest(BaseHTTPError):
class Forbidden (line 37) | class Forbidden(BaseHTTPError):
class NotFound (line 41) | class NotFound(BaseHTTPError):
class InternalServerError (line 45) | class InternalServerError(BaseHTTPError):
FILE: portia_server/portia_api/jsonapi/exceptions.py
class JsonApiValidationError (line 13) | class JsonApiValidationError(ValidationError):
method __init__ (line 14) | def __init__(self, detail):
function render_exception (line 25) | def render_exception(status_code, detail):
class JsonApiBadRequestError (line 36) | class JsonApiBadRequestError(APIException):
class JsonApiNotFoundError (line 42) | class JsonApiNotFoundError(APIException):
class JsonApiConflictError (line 47) | class JsonApiConflictError(APIException):
class JsonApiFeatureNotAvailableError (line 52) | class JsonApiFeatureNotAvailableError(JsonApiBadRequestError):
class JsonApiGeneralException (line 56) | class JsonApiGeneralException(APIException):
method __init__ (line 57) | def __init__(self, detail=None, status_code=None):
function jsonapi_exception_handler (line 63) | def jsonapi_exception_handler(exc, context):
FILE: portia_server/portia_api/jsonapi/parsers.py
class JSONApiParser (line 6) | class JSONApiParser(JSONParser):
FILE: portia_server/portia_api/jsonapi/registry.py
function get_schema (line 11) | def get_schema(schema_type):
FILE: portia_server/portia_api/jsonapi/relationships.py
class Relationship (line 9) | class Relationship(BaseRelationship):
method __init__ (line 10) | def __init__(self, **kwargs):
method schema (line 15) | def schema(self):
method include_resource_linkage (line 23) | def include_resource_linkage(self):
method get_related_url (line 26) | def get_related_url(self, obj):
method get_self_url (line 34) | def get_self_url(self, obj):
method get_resource_linkage (line 42) | def get_resource_linkage(self, value):
method _serialize (line 48) | def _serialize(self, value, attr, obj):
class PolymorphicRelationship (line 55) | class PolymorphicRelationship(Relationship):
method __init__ (line 56) | def __init__(self, **kwargs):
method _serialize (line 59) | def _serialize(self, value, attr, obj):
method _deserialize (line 94) | def _deserialize(self, value, attr, data):
FILE: portia_server/portia_api/jsonapi/renderers.py
class JSONRenderer (line 6) | class JSONRenderer(BaseJSONRenderer):
method get_indent (line 9) | def get_indent(self, accepted_media_type, renderer_context):
class JSONApiRenderer (line 17) | class JSONApiRenderer(JSONRenderer):
method render (line 20) | def render(self, data, accepted_media_type=None, renderer_context=None):
FILE: portia_server/portia_api/jsonapi/response.py
class JSONResponse (line 5) | class JSONResponse(HttpResponse):
method __init__ (line 9) | def __init__(self, data, **kwargs):
FILE: portia_server/portia_api/jsonapi/serializers.py
class JsonApiSerializerMeta (line 37) | class JsonApiSerializerMeta(SchemaMeta):
method __new__ (line 39) | def __new__(mcs, name, bases, attrs):
class JsonApiSerializerOpts (line 111) | class JsonApiSerializerOpts(SchemaOpts):
method __init__ (line 112) | def __init__(self, meta):
class JsonApiSerializer (line 132) | class JsonApiSerializer(with_metaclass(JsonApiSerializerMeta, BaseSchema)):
method __init__ (line 135) | def __init__(self, instance=None, data=None, storage=None, only=(),
method data (line 212) | def data(self):
method errors (line 216) | def errors(self):
method validated_data (line 222) | def validated_data(self):
method validated_profile_updates_data (line 228) | def validated_profile_updates_data(self):
method deserialize_related_model (line 233) | def deserialize_related_model(self, model, id_):
method update (line 240) | def update(self, instance, validated_data):
method create (line 276) | def create(self, validated_data):
method apply_profile_updates (line 302) | def apply_profile_updates(self, validated_data, serializers):
method save (line 324) | def save(self):
method delete (line 354) | def delete(self):
method is_valid (line 366) | def is_valid(self, raise_exception=False):
method load_profile_updates (line 396) | def load_profile_updates(self):
method sort_collection (line 465) | def sort_collection(self, models, many):
method format_json_api_response (line 480) | def format_json_api_response(self, data, many):
method format_item (line 525) | def format_item(self, item):
method get_top_level_links (line 537) | def get_top_level_links(self, data, many):
method get_resource_links (line 542) | def get_resource_links(self, item):
method get_url (line 548) | def get_url(self, obj):
method add_includes (line 551) | def add_includes(self, includes):
method format_profile_references (line 564) | def format_profile_references(self, instances):
method add_profile_to_response (line 577) | def add_profile_to_response(self, profile, alias, data, response):
class JsonApiPolymorphicSerializer (line 584) | class JsonApiPolymorphicSerializer(object):
method __new__ (line 585) | def __new__(cls, base, default_model, instance=None, data=None, many=F...
method __init__ (line 616) | def __init__(self, base, default_model, instance=None, data=None,
method data (line 625) | def data(self):
FILE: portia_server/portia_api/jsonapi/utils.py
function camel_case_to_dashes (line 49) | def camel_case_to_dashes(value):
function dasherize (line 53) | def dasherize(value):
function type_from_model_name (line 57) | def type_from_model_name(value):
function deep_getattr (line 61) | def deep_getattr(obj, key):
function should_include_field (line 68) | def should_include_field(field, include, exclude):
function order_dict (line 76) | def order_dict(data, ordered_keys, key_map_cache={}):
function get_status_title (line 93) | def get_status_title(status_code):
FILE: portia_server/portia_api/resources/annotations.py
class AnnotationRoute (line 6) | class AnnotationRoute(BaseProjectModelRoute):
method sample (line 12) | def sample(self):
method perform_create (line 16) | def perform_create(self, serializer):
method get_instance (line 20) | def get_instance(self):
method get_collection (line 23) | def get_collection(self):
method get_detail_kwargs (line 29) | def get_detail_kwargs(self):
FILE: portia_server/portia_api/resources/extractors.py
class ExtractorRoute (line 5) | class ExtractorRoute(BaseProjectModelRoute):
method get_instance (line 9) | def get_instance(self):
method get_collection (line 12) | def get_collection(self):
FILE: portia_server/portia_api/resources/fields.py
class FieldRoute (line 7) | class FieldRoute(BaseProjectModelRoute):
method perform_create (line 11) | def perform_create(self, serializer):
method get_instance (line 15) | def get_instance(self):
method get_collection (line 18) | def get_collection(self):
method destroy (line 21) | def destroy(self, *args, **kwargs):
method _destroy_error (line 27) | def _destroy_error(self, annotation_count):
method _annotation_count (line 34) | def _annotation_count(self):
method _load_annotations (line 38) | def _load_annotations(self):
FILE: portia_server/portia_api/resources/items.py
class ItemRoute (line 6) | class ItemRoute(BaseProjectModelRoute):
method spider (line 11) | def spider(self):
method perform_create (line 14) | def perform_create(self, serializer):
method get_instance (line 18) | def get_instance(self):
method get_collection (line 21) | def get_collection(self):
method get_detail_kwargs (line 27) | def get_detail_kwargs(self):
FILE: portia_server/portia_api/resources/models.py
class SlydSchema (line 7) | class SlydSchema(Schema):
method empty_data (line 11) | def empty_data():
method __init__ (line 16) | def __init__(self, *args, **kwargs):
method project_id (line 26) | def project_id(self):
method spider_id (line 30) | def spider_id(self):
method sample_id (line 34) | def sample_id(self):
method schema_id (line 38) | def schema_id(self):
method item_id (line 42) | def item_id(self):
method field_id (line 46) | def field_id(self):
method _dump_relationship_properties (line 50) | def _dump_relationship_properties(self, item):
class ProjectSchema (line 64) | class ProjectSchema(SlydSchema):
class Meta (line 87) | class Meta:
class SchemaSchema (line 91) | class SchemaSchema(SlydSchema):
class Meta (line 108) | class Meta:
class FieldSchema (line 112) | class FieldSchema(SlydSchema):
class Meta (line 133) | class Meta:
class SpiderSchema (line 137) | class SpiderSchema(SlydSchema):
method _dump_login_data (line 167) | def _dump_login_data(self, item):
method _load_login_data (line 177) | def _load_login_data(self, item):
class Meta (line 190) | class Meta:
class SampleSchema (line 194) | class SampleSchema(SlydSchema):
method dump (line 238) | def dump(self, obj, many=None, update_fields=True, **kwargs):
class Meta (line 248) | class Meta:
class BaseAnnotationSchema (line 252) | class BaseAnnotationSchema(SlydSchema):
method parent_id (line 279) | def parent_id(self):
method _dump_parent_id (line 283) | def _dump_parent_id(self, item):
class AnnotationSchema (line 300) | class AnnotationSchema(BaseAnnotationSchema):
class Meta (line 324) | class Meta:
class ItemAnnotationSchema (line 328) | class ItemAnnotationSchema(BaseAnnotationSchema):
class Meta (line 343) | class Meta:
class ExtractorSchema (line 347) | class ExtractorSchema(SlydSchema):
method _dump_extractor_attributes (line 359) | def _dump_extractor_attributes(self, item):
class Meta (line 367) | class Meta:
class HtmlSchema (line 371) | class HtmlSchema(SlydSchema):
class Meta (line 375) | class Meta:
class RenderedBody (line 379) | class RenderedBody(SlydSchema):
class Meta (line 383) | class Meta:
class OriginalBody (line 387) | class OriginalBody(SlydSchema):
class Meta (line 391) | class Meta:
class ItemSchema (line 395) | class ItemSchema(SlydSchema):
method _dump_parent_id (line 433) | def _dump_parent_id(self, item):
class Meta (line 440) | class Meta:
FILE: portia_server/portia_api/resources/projects.py
class ProjectDownloadMixin (line 27) | class ProjectDownloadMixin(object):
method download (line 29) | def download(self, *args, **kwargs):
method commit_from_short_sha (line 61) | def commit_from_short_sha(self, version):
class BaseProjectRoute (line 72) | class BaseProjectRoute(JsonApiRoute):
method projects (line 74) | def projects(self):
method project (line 79) | def project(self):
class BaseProjectModelRoute (line 88) | class BaseProjectModelRoute(BaseProjectRoute, JsonApiModelRoute):
class ProjectRoute (line 92) | class ProjectRoute(ProjectDownloadMixin, BaseProjectRoute,
class FakeStorage (line 97) | class FakeStorage(object):
method exists (line 98) | def exists(self, *args, **kwargs):
method listdir (line 101) | def listdir(self, *args, **kwargs):
method create (line 104) | def create(self, request):
method status (line 138) | def status(self, *args, **kwargs):
method publish (line 150) | def publish(self, *args, **kwargs):
method deploy (line 171) | def deploy(self, *args, **kwargs):
method reset (line 176) | def reset(self, *args, **kwargs):
method copy (line 185) | def copy(self, *args, **kwargs):
method rollback (line 209) | def rollback(self, *args, **kwargs):
method get_instance (line 227) | def get_instance(self):
method get_collection (line 230) | def get_collection(self):
method get_detail_kwargs (line 236) | def get_detail_kwargs(self):
method get_list_kwargs (line 259) | def get_list_kwargs(self):
method get_project_changes (line 268) | def get_project_changes(self):
method _deploy (line 276) | def _deploy(self):
FILE: portia_server/portia_api/resources/response.py
class BaseApiResource (line 10) | class BaseApiResource(object):
method render (line 11) | def render(self, request):
method render_async (line 14) | def render_async(self, request):
class JsonApiResource (line 23) | class JsonApiResource(BaseApiResource):
method __init__ (line 24) | def __init__(self, status, data=None):
method render (line 28) | def render(self, request):
class FileResponse (line 43) | class FileResponse(HttpResponse):
method __init__ (line 44) | def __init__(self, name, content, *args, **kwargs):
FILE: portia_server/portia_api/resources/route.py
class JsonApiRoute (line 30) | class JsonApiRoute(ViewSet):
method __str__ (line 37) | def __str__(self):
method __repr__ (line 40) | def __repr__(self):
method method (line 44) | def method(self):
method path (line 48) | def path(self):
method query (line 52) | def query(self):
method data (line 56) | def data(self):
method user (line 60) | def user(self):
method storage (line 64) | def storage(self):
method dispatch (line 71) | def dispatch(self, request, *args, **kwargs):
method handle_exception (line 74) | def handle_exception(self, exc):
method get_instance (line 85) | def get_instance(self):
method get_collection (line 88) | def get_collection(self):
method filter_collection (line 91) | def filter_collection(self, collection):
method get_serializer (line 136) | def get_serializer(self, instance=None, data=None, many=False, **kwargs):
method get_detail_kwargs (line 158) | def get_detail_kwargs(self):
method get_list_kwargs (line 161) | def get_list_kwargs(self):
method get_request_kwargs (line 164) | def get_request_kwargs(self):
class CreateModelMixin (line 193) | class CreateModelMixin(object):
method create (line 194) | def create(self, *args, **kwargs):
method perform_create (line 208) | def perform_create(self, serializer):
method get_success_headers (line 211) | def get_success_headers(self, data):
class ListModelMixin (line 220) | class ListModelMixin(object):
method list (line 221) | def list(self, *args, **kwargs):
class RetrieveModelMixin (line 231) | class RetrieveModelMixin(object):
method retrieve (line 232) | def retrieve(self, *args, **kwargs):
class UpdateModelMixin (line 242) | class UpdateModelMixin(object):
method update (line 243) | def update(self, *args, **kwargs):
method partial_update (line 266) | def partial_update(self, request, *args, **kwargs):
method perform_update (line 270) | def perform_update(self, serializer):
class DestroyModelMixin (line 274) | class DestroyModelMixin(object):
method destroy (line 275) | def destroy(self, *args, **kwargs):
method perform_destroy (line 296) | def perform_destroy(self, serializer):
class JsonApiModelRoute (line 300) | class JsonApiModelRoute(JsonApiRoute, ListModelMixin, RetrieveModelMixin,
FILE: portia_server/portia_api/resources/samples.py
class SampleRoute (line 6) | class SampleRoute(BaseProjectModelRoute):
method perform_create (line 10) | def perform_create(self, serializer):
method get_instance (line 14) | def get_instance(self):
method get_collection (line 17) | def get_collection(self):
method get_detail_kwargs (line 28) | def get_detail_kwargs(self):
method get_list_kwargs (line 45) | def get_list_kwargs(self):
FILE: portia_server/portia_api/resources/schemas.py
class SchemaRoute (line 7) | class SchemaRoute(BaseProjectModelRoute):
method get_instance (line 11) | def get_instance(self):
method get_collection (line 14) | def get_collection(self):
method get_list_kwargs (line 17) | def get_list_kwargs(self):
method update (line 28) | def update(self, *args, **kwargs):
method destroy (line 37) | def destroy(self, *args, **kwargs):
method _item_uses_schema (line 52) | def _item_uses_schema(self, item):
FILE: portia_server/portia_api/resources/serializers.py
function clear_auto_created (line 15) | def clear_auto_created(instance):
class SpiderListSerializer (line 21) | class SpiderListSerializer(JsonApiSerializer):
class Meta (line 22) | class Meta:
class ProjectSerializer (line 32) | class ProjectSerializer(JsonApiSerializer):
class Meta (line 33) | class Meta:
class SchemaSerializer (line 50) | class SchemaSerializer(JsonApiSerializer):
class Meta (line 51) | class Meta:
method update (line 75) | def update(self, instance, validated_data):
class FieldSerializer (line 82) | class FieldSerializer(JsonApiSerializer):
class Meta (line 83) | class Meta:
method create (line 102) | def create(self, validated_data):
method update (line 107) | def update(self, instance, validated_data):
method delete (line 113) | def delete(self):
class ExtractorSerializer (line 118) | class ExtractorSerializer(JsonApiSerializer):
class Meta (line 119) | class Meta:
class SpiderSerializer (line 136) | class SpiderSerializer(JsonApiSerializer):
class Meta (line 137) | class Meta:
method delete (line 157) | def delete(self):
class SampleSerializer (line 163) | class SampleSerializer(JsonApiSerializer):
class Meta (line 164) | class Meta:
method create (line 190) | def create(self, validated_data):
method update (line 208) | def update(self, instance, validated_data):
class ItemSerializer (line 215) | class ItemSerializer(JsonApiSerializer):
class Meta (line 216) | class Meta:
method create (line 244) | def create(self, validated_data):
method update (line 266) | def update(self, instance, validated_data):
method delete (line 297) | def delete(self):
class AnnotationSerializer (line 308) | class AnnotationSerializer(JsonApiSerializer):
class Meta (line 309) | class Meta:
method create (line 328) | def create(self, validated_data):
method update (line 346) | def update(self, instance, validated_data):
class RenderedBodySerializer (line 361) | class RenderedBodySerializer(JsonApiSerializer):
class Meta (line 362) | class Meta:
class OriginalBodySerializer (line 376) | class OriginalBodySerializer(JsonApiSerializer):
class Meta (line 377) | class Meta:
FILE: portia_server/portia_api/resources/spiders.py
class SpiderRoute (line 18) | class SpiderRoute(ProjectDownloadMixin, BaseProjectModelRoute):
method get_instance (line 23) | def get_instance(self):
method get_collection (line 26) | def get_collection(self):
method extract (line 30) | def extract(self, *args, **kwargs):
method _build_pages (line 49) | def _build_pages(self, spider):
method rename (line 53) | def rename(self, *args, **kwargs):
method schedule (line 78) | def schedule(self, *args, **kwargs):
FILE: portia_server/portia_api/routers.py
class Router (line 9) | class Router(SimpleRouter):
method __init__ (line 10) | def __init__(self, trailing_slash=False):
method get_lookup_regex (line 13) | def get_lookup_regex(self, viewset, lookup_prefix=''):
class NestedRouter (line 17) | class NestedRouter(NestedSimpleRouter, Router):
method __init__ (line 18) | def __init__(self, parent_router, parent_prefix, trailing_slash=False,
FILE: portia_server/portia_api/tests/test_routes.py
class TestRoute (line 8) | class TestRoute(unittest.TestCase):
method test_route_representation (line 9) | def test_route_representation(self):
FILE: portia_server/portia_api/utils/annotations.py
function choose_field_type (line 26) | def choose_field_type(annotation):
FILE: portia_server/portia_api/utils/copy.py
class MissingModelException (line 8) | class MissingModelException(Exception):
class ModelCopier (line 12) | class ModelCopier(object):
method __init__ (line 15) | def __init__(self, project, storage, from_project_id):
method copy (line 32) | def copy(self, models):
method copy_spider (line 53) | def copy_spider(self, spider):
method copy_sample (line 60) | def copy_sample(self, sample, spider):
method copy_item (line 77) | def copy_item(self, item, schema, sample):
method copy_schema (line 85) | def copy_schema(self, schema):
method copy_annotation (line 100) | def copy_annotation(self, annotation, item, field):
method copy_extractors (line 110) | def copy_extractors(self, extractors):
method group (line 122) | def group(self, models):
method _copy_field (line 137) | def _copy_field(self, field, schema):
method _copy_extractor (line 143) | def _copy_extractor(self, extractor):
method _copy_body (line 155) | def _copy_body(self, body, sample):
method _unique_id (line 163) | def _unique_id(self, spider_id):
FILE: portia_server/portia_api/utils/deploy/base.py
class BaseDeploy (line 4) | class BaseDeploy(object):
method __init__ (line 5) | def __init__(self, project):
method build_archive (line 11) | def build_archive(self):
method _get_config (line 15) | def _get_config(self):
method deploy (line 18) | def deploy(self, target=None):
method schedule (line 21) | def schedule(self, spider, args=None, settings=None, target=None):
FILE: portia_server/portia_api/utils/deploy/package.py
class EggInfo (line 10) | class EggInfo(object):
method __init__ (line 11) | def __init__(self, project, archive):
method write (line 16) | def write(self):
method _write_file (line 24) | def _write_file(self, filename, contents):
method build_pkg_info (line 30) | def build_pkg_info(self):
method build_sources (line 39) | def build_sources(self):
method build_top_level (line 42) | def build_top_level(self):
method build_dependency (line 50) | def build_dependency(self):
method build_entry_points (line 53) | def build_entry_points(self):
method build_zip_safe (line 59) | def build_zip_safe(self):
FILE: portia_server/portia_api/utils/deploy/scrapinghub.py
class ScrapinghubDeploy (line 19) | class ScrapinghubDeploy(BaseDeploy):
method _get_config (line 30) | def _get_config(self):
method _default_config (line 44) | def _default_config(self):
method deploy (line 54) | def deploy(self, target='default'):
method schedule (line 82) | def schedule(self, spider, args=None, settings=None, target='default'):
FILE: portia_server/portia_api/utils/deploy/scrapyd.py
class ScrapydDeploy (line 12) | class ScrapydDeploy(BaseDeploy):
method _get_config (line 13) | def _get_config(self):
method _get_config_defaults (line 25) | def _get_config_defaults(self):
method deploy (line 45) | def deploy(self, target='default'):
method schedule (line 68) | def schedule(self, spider, args=None, settings=None, target=None):
method _schedule_data (line 81) | def _schedule_data(self, spider_id, args):
FILE: portia_server/portia_api/utils/download.py
function walk (line 35) | def walk(storage, dirname=''):
class ProjectArchiver (line 43) | class ProjectArchiver(object):
method __init__ (line 48) | def __init__(self, storage, required_files=None, *, project=None):
method archive (line 56) | def archive(self, spiders=None, **kwargs):
method _add_files (line 69) | def _add_files(self, spiders):
method _add_file (line 95) | def _add_file(self, filename, contents, tstamp):
method _add_spider (line 105) | def _add_spider(self, file_path, templates, extractors):
method _deleted_spider (line 117) | def _deleted_spider(self, file_path, spider_data, templates):
method _spider_templates (line 130) | def _spider_templates(self, spider_templates, extractors):
method add_egg_info (line 154) | def add_egg_info(self):
method _spider_name (line 157) | def _spider_name(self, file_path):
method _name (line 166) | def _name(self, file_path):
method _spider_path (line 175) | def _spider_path(self, file_path):
method _paths (line 180) | def _paths(self, spiders):
method _template_paths (line 198) | def _template_paths(self, spiders, files):
method list_files (line 210) | def list_files(self):
method read_file (line 213) | def read_file(self, filename, deserialize=False):
class CodeProjectArchiver (line 227) | class CodeProjectArchiver(ProjectArchiver):
method archive (line 228) | def archive(self, spiders=None, **kwargs):
method _process_name (line 262) | def _process_name(self):
FILE: portia_server/portia_api/utils/extract.py
class FetchError (line 14) | class FetchError(Exception):
method __init__ (line 17) | def __init__(self, errors):
method __str__ (line 23) | def __str__(self):
function get_page (line 27) | def get_page(times, url):
function _load_urls (line 45) | def _load_urls(urls):
function load_urls (line 53) | def load_urls(urls):
class Pages (line 57) | class Pages(object):
method __init__ (line 58) | def __init__(self, urls, spider):
method fetch (line 67) | def fetch(self):
method process (line 82) | def process(self, url, page):
method extract_items (line 85) | def extract_items(self):
FILE: portia_server/portia_api/utils/projects.py
function unique_name (line 1) | def unique_name(base_name, disallow=(), initial_suffix=''):
FILE: portia_server/portia_api/utils/spiders.py
function load_spider_data (line 7) | def load_spider_data(model):
function load_spider (line 31) | def load_spider(model):
FILE: portia_server/portia_orm/apps.py
class PortiaOrmConfig (line 6) | class PortiaOrmConfig(AppConfig):
FILE: portia_server/portia_orm/base.py
class ModelOpts (line 31) | class ModelOpts(object):
method __init__ (line 33) | def __init__(self, meta, model):
method initialize_boolean (line 51) | def initialize_boolean(self, key, meta):
class ModelMeta (line 58) | class ModelMeta(type):
method __new__ (line 60) | def __new__(mcs, name, bases, attrs):
class Model (line 152) | class Model(object, metaclass=ModelMeta):
class Meta (line 168) | class Meta:
method __init__ (line 171) | def __init__(self, storage=None, snapshots=None, _data_key=unspecified,
method __eq__ (line 209) | def __eq__(self, other):
method __hash__ (line 220) | def __hash__(self):
method __ne__ (line 223) | def __ne__(self, other):
method __repr__ (line 226) | def __repr__(self, *field_names):
method __setattr__ (line 248) | def __setattr__(self, key, value):
method with_snapshots (line 255) | def with_snapshots(self, snapshots=None):
method shared_data_store (line 265) | def shared_data_store(cls):
method loaded (line 270) | def loaded(cls):
method _file_model (line 274) | def _file_model(cls):
method generate_pk (line 297) | def generate_pk(cls, storage):
method pk (line 305) | def pk(self):
method data_store (line 309) | def data_store(self):
method has_data (line 316) | def has_data(self, key):
method get_data (line 323) | def get_data(self, key, default=unspecified):
method set_data (line 335) | def set_data(self, key, value):
method dump (line 338) | def dump(self, state='working'):
method dumps (line 349) | def dumps(self, state='working'):
method rollback (line 353) | def rollback(self):
method save (line 356) | def save(self, only=None):
method _stage_changes (line 370) | def _stage_changes(self, only=None):
method _commit_changes (line 391) | def _commit_changes(self, saved_paths=None, deleted_paths=None):
method _get_object_to_dump (line 432) | def _get_object_to_dump(self, model, parent_snapshots):
method _get_parent_object (line 448) | def _get_parent_object(self, parent_snapshots):
method _staged_model_references (line 452) | def _staged_model_references(self, load_relationships=False):
method delete (line 471) | def delete(self):
method _stage_delete (line 481) | def _stage_delete(self, collector):
method _commit_delete (line 496) | def _commit_delete(self, collector, saved_paths=None, deleted_paths=No...
method load (line 536) | def load(cls, storage, instance=None, **kwargs):
method storage_path (line 593) | def storage_path(cls, data, snapshots=None):
method resolve_attributes (line 608) | def resolve_attributes(self, *attributes, **kwargs):
method copy (line 630) | def copy(self, new_id=None, storage=None):
FILE: portia_server/portia_orm/collection.py
function set_related (line 20) | def set_related(model, relationship_name, related):
function clear_related (line 28) | def clear_related(model, relationship_name, related):
class OwnedList (line 36) | class OwnedList(list):
method __init__ (line 37) | def __init__(self, iterable=None, owner=None, attrname=None,
method field (line 76) | def field(self):
method with_snapshots (line 79) | def with_snapshots(self, snapshots=None):
method __setitem__ (line 87) | def __setitem__(self, index, value):
method __delitem__ (line 122) | def __delitem__(self, index):
method __getslice__ (line 127) | def __getslice__(self, i, j):
method __setslice__ (line 130) | def __setslice__(self, i, j, value):
method __delslice__ (line 133) | def __delslice__(self, i, j):
method __contains__ (line 136) | def __contains__(self, key):
method append (line 147) | def append(self, value):
method extend (line 153) | def extend(self, iterable):
method insert (line 157) | def insert(self, index, value):
method remove (line 163) | def remove(self, value):
method pop (line 169) | def pop(self, index=-1):
method index (line 175) | def index(self, value, start=None, stop=None):
method clear (line 190) | def clear(self):
method _validate (line 194) | def _validate(self, value):
method _update_owner_data (line 197) | def _update_owner_data(self):
method _populate_cache (line 202) | def _populate_cache(self):
class FieldCollection (line 207) | class FieldCollection(OwnedList):
method _validate (line 208) | def _validate(self, value):
class ModelCollection (line 216) | class ModelCollection(OwnedList):
method related_name (line 226) | def related_name(self):
method __getitem__ (line 231) | def __getitem__(self, key):
method __setitem__ (line 244) | def __setitem__(self, key, value):
method __delitem__ (line 257) | def __delitem__(self, key):
method __repr__ (line 263) | def __repr__(self):
method append (line 276) | def append(self, obj):
method add (line 283) | def add(self, obj):
method extend (line 289) | def extend(self, iterable):
method update (line 293) | def update(self, iterable):
method insert (line 297) | def insert(self, index, obj):
method remove (line 304) | def remove(self, obj):
method discard (line 308) | def discard(self, obj):
method pop (line 315) | def pop(self, key=unspecified):
method get (line 330) | def get(self, key, default=None):
method keys (line 334) | def keys(self):
method dump (line 338) | def dump(self, state='working'):
method dumps (line 354) | def dumps(self, state='working'):
method _validate (line 358) | def _validate(self, value):
method _get_index (line 361) | def _get_index(self, index, default=None):
method _key_to_index (line 367) | def _key_to_index(self, key):
method _set_related (line 384) | def _set_related(self, related):
method _clear_related (line 391) | def _clear_related(self, related):
class ListDescriptor (line 399) | class ListDescriptor(object):
method __init__ (line 400) | def __init__(self, attrname):
method __get__ (line 403) | def __get__(self, instance, instance_type=None):
method __set__ (line 413) | def __set__(self, instance, values):
method new_collection (line 423) | def new_collection(self, instance):
method replace_collection (line 427) | def replace_collection(self, collection, values):
FILE: portia_server/portia_orm/datastore.py
class DataStoreHandler (line 6) | class DataStoreHandler(local):
method data_store (line 8) | def data_store(self):
method data_store (line 17) | def data_store(self, value):
method data_store (line 21) | def data_store(self):
method loaded (line 25) | def loaded(self):
method loaded (line 34) | def loaded(self, value):
method loaded (line 38) | def loaded(self):
function data_store_context (line 43) | def data_store_context():
FILE: portia_server/portia_orm/deletion.py
class Collector (line 14) | class Collector(set):
method __init__ (line 15) | def __init__(self):
method save_instance (line 20) | def save_instance(self, instance, *fields):
method delete_instance (line 28) | def delete_instance(self, instance):
function CASCADE (line 36) | def CASCADE(collector, instance, field_name, related_instance):
function CLEAR (line 45) | def CLEAR(collector, instance, field_name, related_instance):
function PROTECT (line 55) | def PROTECT(collector, instance, field_name, related_instance):
FILE: portia_server/portia_orm/exceptions.py
class ImproperlyConfigured (line 9) | class ImproperlyConfigured(Exception):
class PathResolutionError (line 13) | class PathResolutionError(Exception):
class ProtectedError (line 17) | class ProtectedError(Exception):
FILE: portia_server/portia_orm/fields.py
class FieldDescriptor (line 32) | class FieldDescriptor(object):
method __init__ (line 34) | def __init__(self, attrname, field):
method default (line 39) | def default(self):
method __get__ (line 46) | def __get__(self, instance, instance_type=None):
method __set__ (line 62) | def __set__(self, instance, value):
class Field (line 68) | class Field(fields.Field):
method __init__ (line 69) | def __init__(self, **kwargs):
method contribute_to_class (line 78) | def contribute_to_class(self, cls, attrname):
method get_dependencies (line 81) | def get_dependencies(self, cls):
method serialize (line 86) | def serialize(self, attr, obj, accessor=None):
class ValidatedField (line 94) | class ValidatedField(fields.ValidatedField, Field):
class Validator (line 99) | class Validator(validate.Validator):
method __init__ (line 102) | def __init__(self, error=None):
method _format_error (line 105) | def _format_error(self, value):
method fail (line 108) | def fail(self, value):
method __call__ (line 111) | def __call__(self, value):
method __init__ (line 116) | def __init__(self, *args, **kwargs):
method _validated (line 120) | def _validated(self, value):
class String (line 125) | class String(fields.String, Field):
class Boolean (line 129) | class Boolean(fields.Boolean, Field):
class Integer (line 133) | class Integer(fields.Integer, Field):
class Url (line 137) | class Url(fields.Url, Field):
class Domain (line 141) | class Domain(ValidatedField, String):
class ValidDomain (line 146) | class ValidDomain(ValidatedField.Validator):
method __call__ (line 160) | def __call__(self, value):
class Regexp (line 169) | class Regexp(ValidatedField, String):
class ValidRegexp (line 174) | class ValidRegexp(ValidatedField.Validator):
method __call__ (line 177) | def __call__(self, value):
class DependantField (line 187) | class DependantField(Field):
method __init__ (line 193) | def __init__(self, when, then, **kwargs):
method get_dependencies (line 202) | def get_dependencies(self, cls):
method serialize (line 205) | def serialize(self, attr, obj, accessor=None):
method deserialize (line 209) | def deserialize(self, value, attr=None, data=None):
method _add_to_schema (line 213) | def _add_to_schema(self, field_name, schema):
method _field_for_data (line 218) | def _field_for_data(self, data):
class List (line 226) | class List(fields.List, Field):
method contribute_to_class (line 227) | def contribute_to_class(self, cls, attrname):
class Fragment (line 231) | class Fragment(ValidatedField, Field):
class ValidType (line 232) | class ValidType(ValidatedField.Validator):
method __call__ (line 235) | def __call__(self, value):
class ValidValue (line 241) | class ValidValue(ValidatedField.Validator):
method invalid_range (line 246) | def invalid_range(self, value):
method __call__ (line 252) | def __call__(self, value):
method __init__ (line 257) | def __init__(self, *args, **kwargs):
class StartUrl (line 262) | class StartUrl(Schema):
FILE: portia_server/portia_orm/middleware.py
class ORMDataStoreMiddleware (line 4) | class ORMDataStoreMiddleware(object):
method __init__ (line 5) | def __init__(self, get_response=None):
method __call__ (line 8) | def __call__(self, request):
FILE: portia_server/portia_orm/models.py
class Project (line 39) | class Project(Model):
method version (line 51) | def version(self):
class Meta (line 59) | class Meta:
function CASCADE_AUTO_OR_CLEAR (line 63) | def CASCADE_AUTO_OR_CLEAR(collector, instance, field_name, related_insta...
class Schema (line 70) | class Schema(Model):
class Meta (line 81) | class Meta:
method unwrap_envelopes (line 86) | def unwrap_envelopes(self, data, many):
method name_from_id (line 90) | def name_from_id(self, data):
method add_fake_items (line 96) | def add_fake_items(self, data):
method remove_fake_items (line 101) | def remove_fake_items(self, data):
method remove_auto_created_false (line 106) | def remove_auto_created_false(self, data):
method wrap_envelopes (line 114) | def wrap_envelopes(self, data, many):
class Field (line 118) | class Field(Model):
class Meta (line 130) | class Meta:
method __repr__ (line 134) | def __repr__(self):
method unwrap_envelopes (line 138) | def unwrap_envelopes(self, data, many):
method name_from_id (line 142) | def name_from_id(self, data):
method add_fake_annotations (line 148) | def add_fake_annotations(self, data):
method remove_fake_annotations (line 153) | def remove_fake_annotations(self, data):
method remove_auto_created_false (line 158) | def remove_auto_created_false(self, data):
method wrap_envelopes (line 164) | def wrap_envelopes(self, data, many):
class Extractor (line 168) | class Extractor(Model):
class Meta (line 180) | class Meta:
method unwrap_envelopes (line 185) | def unwrap_envelopes(self, data, many):
method to_type_and_value (line 189) | def to_type_and_value(self, data):
method from_type_and_value (line 204) | def from_type_and_value(self, data):
method wrap_envelopes (line 216) | def wrap_envelopes(self, data, many):
class Spider (line 220) | class Spider(Model):
class Meta (line 242) | class Meta:
method __repr__ (line 245) | def __repr__(self):
method load (line 249) | def load(cls, storage, instance=None, project=None, **kwargs):
method populate_id (line 263) | def populate_id(self, data):
method dump_templates (line 272) | def dump_templates(self, data):
method normalize_start_urls (line 304) | def normalize_start_urls(self, data):
method get_init_requests (line 312) | def get_init_requests(self, data):
method set_init_requests (line 324) | def set_init_requests(self, data):
method _is_perform_login (line 339) | def _is_perform_login(data):
class OrderedAnnotationsMixin (line 344) | class OrderedAnnotationsMixin(object):
method ordered_children (line 346) | def ordered_children(self):
method ordered_annotations (line 355) | def ordered_annotations(self):
method ordered_items (line 365) | def ordered_items(self):
class Sample (line 374) | class Sample(Model, OrderedAnnotationsMixin):
class Meta (line 388) | class Meta:
method __repr__ (line 391) | def __repr__(self):
method annotations (line 395) | def annotations(self):
method load (line 399) | def load(cls, storage, instance=None, spider=None, **kwargs):
method chain_load (line 410) | def chain_load(self, data):
method migrate_sample (line 419) | def migrate_sample(self, data):
method get_items (line 447) | def get_items(self, data):
method _add_schemas (line 500) | def _add_schemas(serializer, schemas):
method _populate_schema_id (line 515) | def _populate_schema_id(data, schema_id):
method _migrate_html (line 527) | def _migrate_html(self, sample):
method add_fields (line 546) | def add_fields(self, data):
method save_raw (line 591) | def save_raw(serializer, data):
method clean (line 598) | def clean(self, data):
class BaseAnnotation (line 602) | class BaseAnnotation(Model):
class Meta (line 607) | class Meta:
class Item (line 611) | class Item(BaseAnnotation, OrderedAnnotationsMixin):
class Meta (line 625) | class Meta:
method __repr__ (line 629) | def __repr__(self):
method owner_sample (line 634) | def owner_sample(self):
method storage_path (line 642) | def storage_path(cls, data, snapshots=None):
method _get_parent_object (line 655) | def _get_parent_object(self, parent_snapshots):
method wrap_schema_envelopes (line 659) | def wrap_schema_envelopes(self, data):
method remove_attributes (line 667) | def remove_attributes(self, data):
method add_field (line 675) | def add_field(self, data):
method add_attributes (line 680) | def add_attributes(self, data):
method remove_type (line 697) | def remove_type(self, data):
method unwrap_schema_envelopes (line 702) | def unwrap_schema_envelopes(self, data):
class Annotation (line 709) | class Annotation(BaseAnnotation):
class Meta (line 727) | class Meta:
method __repr__ (line 732) | def __repr__(self):
method owner_sample (line 736) | def owner_sample(self):
method storage_path (line 740) | def storage_path(cls, data, snapshots=None):
method generate_pk (line 754) | def generate_pk(cls, storage):
method get_annotation_data (line 762) | def get_annotation_data(self, data):
method set_annotation_data (line 800) | def set_annotation_data(self, data):
class OriginalBody (line 827) | class OriginalBody(Model):
method load (line 834) | def load(cls, storage, instance=None, sample=None, **kwargs):
method populate_item (line 844) | def populate_item(self, data):
method return_html (line 856) | def return_html(self, data):
method dump (line 859) | def dump(self, state='working'):
method dumps (line 870) | def dumps(self, state='working'):
class Meta (line 873) | class Meta:
class RenderedBody (line 881) | class RenderedBody(Model):
method load (line 888) | def load(cls, storage, instance=None, sample=None, **kwargs):
method populate_item (line 896) | def populate_item(self, data):
method return_html (line 908) | def return_html(self, data):
method dump (line 911) | def dump(self, state='working'):
method dumps (line 922) | def dumps(self, state='working'):
class Meta (line 925) | class Meta:
FILE: portia_server/portia_orm/registry.py
function get_model (line 14) | def get_model(model_name):
function get_polymorphic_model (line 22) | def get_polymorphic_model(data):
FILE: portia_server/portia_orm/relationships.py
class BaseRelationshipDescriptor (line 18) | class BaseRelationshipDescriptor(object):
method __init__ (line 20) | def __init__(self, model, attrname, related_name):
method __get__ (line 25) | def __get__(self, instance, instance_type=None):
method __set__ (line 28) | def __set__(self, instance, value):
method __repr__ (line 31) | def __repr__(self):
method model (line 37) | def model(self):
method _validate (line 42) | def _validate(self, value):
class BelongsToDescriptor (line 47) | class BelongsToDescriptor(BaseRelationshipDescriptor):
method __get__ (line 48) | def __get__(self, instance, instance_type=None):
method __set__ (line 53) | def __set__(self, instance, value):
class HasManyDescriptor (line 67) | class HasManyDescriptor(ListDescriptor, BaseRelationshipDescriptor):
method __init__ (line 68) | def __init__(self, *args, **kwargs):
method new_collection (line 71) | def new_collection(self, instance):
method replace_collection (line 88) | def replace_collection(self, collection, values):
class HasOneDescriptor (line 93) | class HasOneDescriptor(BelongsToDescriptor):
method __get__ (line 94) | def __get__(self, instance, instance_type=None):
class BaseRelationship (line 109) | class BaseRelationship(fields.Nested):
method __init__ (line 112) | def __init__(self, model, related_name, on_delete, ignore_in_file=False,
method model (line 129) | def model(self):
method nested (line 135) | def nested(self):
method schema (line 139) | def schema(self):
method _serialize (line 142) | def _serialize(self, nested_obj, attr, obj):
method _deserialize (line 160) | def _deserialize(self, value, attr, data):
method _get_field_for_polymorphic (line 184) | def _get_field_for_polymorphic(self, model):
method _is_collection (line 195) | def _is_collection(self, value):
method contribute_to_class (line 198) | def contribute_to_class(self, cls, attrname):
method get_dependencies (line 228) | def get_dependencies(self, cls):
method _includes_relationships (line 232) | def _includes_relationships(field, model):
class BelongsTo (line 244) | class BelongsTo(BaseRelationship):
class HasMany (line 248) | class HasMany(BaseRelationship):
method __init__ (line 251) | def __init__(self, *args, **kwargs):
class HasOne (line 256) | class HasOne(BaseRelationship):
FILE: portia_server/portia_orm/serializers.py
class FileSerializerOpts (line 13) | class FileSerializerOpts(schema.SchemaOpts):
method __init__ (line 14) | def __init__(self, meta):
class FileSerializer (line 33) | class FileSerializer(schema.Schema):
method __init__ (line 36) | def __init__(self, *args, **kwargs):
method __getattr__ (line 45) | def __getattr__(self, item):
method get_attribute (line 49) | def get_attribute(self, attr, obj, default):
method create_object (line 53) | def create_object(self, data):
method select_snapshots (line 59) | def select_snapshots(self, instance):
method order_keys (line 66) | def order_keys(self, data):
method _do_load (line 74) | def _do_load(self, data, many=None, *args, **kwargs):
method _wrap_only (line 94) | def _wrap_only(self, data):
FILE: portia_server/portia_orm/snapshots.py
class ModelSnapshots (line 8) | class ModelSnapshots(defaultdict):
class ModelSnapshotsAccessor (line 11) | class ModelSnapshotsAccessor(object):
method __init__ (line 14) | def __init__(self, instance, snapshots=None):
method __getattr__ (line 18) | def __getattr__(self, name):
method __setattr__ (line 32) | def __setattr__(self, name, value):
method __init__ (line 38) | def __init__(self):
method get (line 41) | def get(self, key, snapshots=None):
method set (line 51) | def set(self, key, value, snapshot=None):
method copy_from (line 56) | def copy_from(self, other):
method dirty_fields (line 61) | def dirty_fields(self, changed, original):
method update_snapshot (line 72) | def update_snapshot(self, destination, snapshots, fields=None):
method clear_snapshot (line 79) | def clear_snapshot(self, snapshot, fields=None):
method accessor (line 87) | def accessor(self, snapshots=None):
FILE: portia_server/portia_orm/tests/models.py
class ExampleModel (line 5) | class ExampleModel(Model):
class RequiredFieldModel (line 10) | class RequiredFieldModel(Model):
class SingleFileModel (line 15) | class SingleFileModel(Model):
class Meta (line 19) | class Meta:
class ManyFileModel (line 23) | class ManyFileModel(Model):
class Meta (line 29) | class Meta:
class ParamFileModel (line 34) | class ParamFileModel(Model):
class Meta (line 39) | class Meta:
class OneToOneModel1 (line 43) | class OneToOneModel1(Model):
class Meta (line 49) | class Meta:
class OneToOneModel2 (line 53) | class OneToOneModel2(Model):
class Meta (line 59) | class Meta:
class ChildModel (line 63) | class ChildModel(Model):
class Meta (line 69) | class Meta:
class ParentModel (line 74) | class ParentModel(Model):
class Meta (line 80) | class Meta:
class ManyToManyModel1 (line 84) | class ManyToManyModel1(Model):
class Meta (line 90) | class Meta:
class ManyToManyModel2 (line 94) | class ManyToManyModel2(Model):
class Meta (line 100) | class Meta:
class PolymorphicParentModel (line 105) | class PolymorphicParentModel(Model):
class Meta (line 112) | class Meta:
class PolymorphicChildBase (line 116) | class PolymorphicChildBase(Model):
class Meta (line 121) | class Meta:
class PolymorphicChildModel1 (line 127) | class PolymorphicChildModel1(PolymorphicChildBase):
class PolymorphicChildModel2 (line 131) | class PolymorphicChildModel2(PolymorphicChildBase):
class Meta (line 134) | class Meta:
FILE: portia_server/portia_orm/tests/test_basic.py
class BasicModelTests (line 9) | class BasicModelTests(DataStoreTestCase):
method setUp (line 10) | def setUp(self):
method test_validation (line 37) | def test_validation(self):
method test_dump (line 52) | def test_dump(self):
method test_dumps (line 60) | def test_dumps(self):
method test_required (line 71) | def test_required(self):
method test_load_single (line 83) | def test_load_single(self):
method test_load_single_on_access (line 92) | def test_load_single_on_access(self):
method test_partial_load_single (line 102) | def test_partial_load_single(self):
method test_load_many (line 115) | def test_load_many(self):
method test_load_one_from_many (line 130) | def test_load_one_from_many(self):
method test_load_param (line 143) | def test_load_param(self):
method test_load_param_skipped_if_param_missing (line 157) | def test_load_param_skipped_if_param_missing(self):
method test_save_single (line 164) | def test_save_single(self):
method test_save_single_does_not_save_if_nothing_changed (line 178) | def test_save_single_does_not_save_if_nothing_changed(self):
method test_partial_save_single (line 185) | def test_partial_save_single(self):
method test_save_param (line 202) | def test_save_param(self):
method test_save_param_raises_error_if_params_missing (line 216) | def test_save_param_raises_error_if_params_missing(self):
method test_save_selected_fields (line 224) | def test_save_selected_fields(self):
method test_copy (line 238) | def test_copy(self):
class PolymorphicModelTests (line 246) | class PolymorphicModelTests(DataStoreTestCase):
method setUp (line 247) | def setUp(self):
method test_load_many (line 267) | def test_load_many(self):
method test_load_one_from_many (line 286) | def test_load_one_from_many(self):
method test_save_many (line 301) | def test_save_many(self):
FILE: portia_server/portia_orm/tests/test_collection.py
class ModelCollectionTests (line 7) | class ModelCollectionTests(DataStoreTestCase):
method test_create_ (line 8) | def test_create_(self):
method test_create_with_model (line 14) | def test_create_with_model(self):
method test_getitem (line 21) | def test_getitem(self):
method test_setitem_index (line 38) | def test_setitem_index(self):
method test_setitem_key (line 65) | def test_setitem_key(self):
method test_setitem_object (line 92) | def test_setitem_object(self):
method test_delitem_index (line 119) | def test_delitem_index(self):
method test_delitem_key (line 130) | def test_delitem_key(self):
method test_delitem_object (line 141) | def test_delitem_object(self):
method test_append (line 152) | def test_append(self):
method test_add (line 173) | def test_add(self):
method test_extend (line 196) | def test_extend(self):
method test_update (line 216) | def test_update(self):
method test_insert (line 234) | def test_insert(self):
method test_remove (line 265) | def test_remove(self):
method test_discard (line 282) | def test_discard(self):
method test_pop (line 302) | def test_pop(self):
method test_get (line 336) | def test_get(self):
method test_clear (line 351) | def test_clear(self):
method test_validation (line 361) | def test_validation(self):
method test_dump (line 369) | def test_dump(self):
method test_dumps (line 386) | def test_dumps(self):
class PolymorphicCollectionTests (line 412) | class PolymorphicCollectionTests(DataStoreTestCase):
method test_create_ (line 413) | def test_create_(self):
method test_create_with_model (line 419) | def test_create_with_model(self):
method test_getitem (line 426) | def test_getitem(self):
method test_setitem_index (line 449) | def test_setitem_index(self):
method test_setitem_key (line 476) | def test_setitem_key(self):
method test_setitem_object (line 503) | def test_setitem_object(self):
method test_delitem_index (line 530) | def test_delitem_index(self):
method test_delitem_key (line 541) | def test_delitem_key(self):
method test_delitem_object (line 552) | def test_delitem_object(self):
method test_append (line 563) | def test_append(self):
method test_add (line 584) | def test_add(self):
method test_extend (line 607) | def test_extend(self):
method test_update (line 627) | def test_update(self):
method test_insert (line 645) | def test_insert(self):
method test_remove (line 676) | def test_remove(self):
method test_discard (line 693) | def test_discard(self):
method test_pop (line 713) | def test_pop(self):
method test_get (line 747) | def test_get(self):
method test_clear (line 762) | def test_clear(self):
method test_validation (line 772) | def test_validation(self):
method test_dump (line 780) | def test_dump(self):
method test_dumps (line 801) | def test_dumps(self):
FILE: portia_server/portia_orm/tests/test_model.py
class ProjectTestCase (line 10) | class ProjectTestCase(DataStoreTestCase):
method setUp (line 11) | def setUp(self):
method get_storage_files (line 15) | def get_storage_files(self):
class ProjectTests (line 208) | class ProjectTests(ProjectTestCase):
method test_project (line 209) | def test_project(self):
method test_load (line 215) | def test_load(self):
method test_save (line 223) | def test_save(self):
method test_delete (line 242) | def test_delete(self):
class SchemaTests (line 257) | class SchemaTests(ProjectTestCase):
method test_no_fields (line 258) | def test_no_fields(self):
method test_fields (line 270) | def test_fields(self):
method test_collection (line 297) | def test_collection(self):
method test_load_through_project (line 334) | def test_load_through_project(self):
method test_load_through_partial (line 384) | def test_load_through_partial(self):
method test_save_edit (line 426) | def test_save_edit(self):
method test_save_new (line 534) | def test_save_new(self):
method test_delete (line 651) | def test_delete(self):
class FieldTests (line 670) | class FieldTests(ProjectTestCase):
method test_minimal_field (line 671) | def test_minimal_field(self):
method test_full_field (line 684) | def test_full_field(self):
method test_field_types (line 699) | def test_field_types(self):
method test_load_through_project (line 713) | def test_load_through_project(self):
method test_load_through_partial (line 754) | def test_load_through_partial(self):
method test_save_edit (line 767) | def test_save_edit(self):
method test_save_new (line 876) | def test_save_new(self):
method test_delete (line 1002) | def test_delete(self):
class ExtractorTests (line 1050) | class ExtractorTests(ProjectTestCase):
method test_type_extractor (line 1051) | def test_type_extractor(self):
method test_regexp_extractor (line 1072) | def test_regexp_extractor(self):
method test_extractor_type (line 1098) | def test_extractor_type(self):
method test_collection (line 1110) | def test_collection(self):
method test_load_through_project (line 1127) | def test_load_through_project(self):
method test_load_through_partial (line 1146) | def test_load_through_partial(self):
method test_save_edit (line 1156) | def test_save_edit(self):
method test_save_new (line 1203) | def test_save_new(self):
method test_delete (line 1260) | def test_delete(self):
class SpiderTests (line 1280) | class SpiderTests(ProjectTestCase):
method test_minimal_spider (line 1281) | def test_minimal_spider(self):
method test_full_spider (line 1301) | def test_full_spider(self):
method test_links_to_follow (line 1356) | def test_links_to_follow(self):
method test_load_through_project (line 1371) | def test_load_through_project(self):
method test_load_through_partial (line 1406) | def test_load_through_partial(self):
method test_save_edit (line 1434) | def test_save_edit(self):
method test_save_new (line 1523) | def test_save_new(self):
method test_delete (line 1569) | def test_delete(self):
class SampleTests (line 1596) | class SampleTests(ProjectTestCase):
method test_minimal_sample (line 1597) | def test_minimal_sample(self):
method test_full_sample (line 1620) | def test_full_sample(self):
method test_load_through_project (line 1645) | def test_load_through_project(self):
method test_load_through_partial (line 1759) | def test_load_through_partial(self):
method test_save_edit (line 1868) | def test_save_edit(self):
method test_save_new (line 2142) | def test_save_new(self):
method test_delete (line 2204) | def test_delete(self):
class ItemTests (line 2269) | class ItemTests(ProjectTestCase):
method get_storage_files (line 2270) | def get_storage_files(self):
method test_minimal_item (line 2449) | def test_minimal_item(self):
method test_full_item (line 2470) | def test_full_item(self):
method test_with_annotation (line 2499) | def test_with_annotation(self):
method test_with_nested_item (line 2545) | def test_with_nested_item(self):
method test_load_through_project (line 2585) | def test_load_through_project(self):
method test_load_through_partial (line 2716) | def test_load_through_partial(self):
method test_save_edit (line 2842) | def test_save_edit(self):
method test_save_new (line 3164) | def test_save_new(self):
method test_delete (line 3539) | def test_delete(self):
class AnnotationTests (line 3615) | class AnnotationTests(ProjectTestCase):
method test_minimal_item (line 3616) | def test_minimal_item(self):
method test_full_item (line 3643) | def test_full_item(self):
method test_load_through_project (line 3685) | def test_load_through_project(self):
method test_load_through_partial (line 3758) | def test_load_through_partial(self):
method test_save_edit (line 3799) | def test_save_edit(self):
method test_save_new (line 4043) | def test_save_new(self):
method test_delete (line 4346) | def test_delete(self):
method test_invalid_project_name (line 4483) | def test_invalid_project_name(self):
method test_invalid_spider_name (line 4487) | def test_invalid_spider_name(self):
FILE: portia_server/portia_orm/tests/test_relationship.py
class OneToOneRelationshipTests (line 10) | class OneToOneRelationshipTests(DataStoreTestCase):
method setUp (line 11) | def setUp(self):
method test_no_relation (line 32) | def test_no_relation(self):
method test_set_relation (line 47) | def test_set_relation(self):
method test_set_reverse_relation (line 66) | def test_set_reverse_relation(self):
method test_create_with_relation (line 85) | def test_create_with_relation(self):
method test_create_with_reverse_relation (line 103) | def test_create_with_reverse_relation(self):
method test_change_relation (line 119) | def test_change_relation(self):
method test_change_reverse_relation (line 141) | def test_change_reverse_relation(self):
method test_load_full (line 163) | def test_load_full(self):
method test_load_partial (line 177) | def test_load_partial(self):
method test_save_field (line 190) | def test_save_field(self):
method test_save_id (line 238) | def test_save_id(self):
class OneToManyRelationshipTests (line 296) | class OneToManyRelationshipTests(DataStoreTestCase):
method setUp (line 297) | def setUp(self):
method test_no_children (line 321) | def test_no_children(self):
method test_set_children (line 330) | def test_set_children(self):
method test_add_to_children (line 348) | def test_add_to_children(self):
method test_set_parent (line 366) | def test_set_parent(self):
method test_create_with_children (line 384) | def test_create_with_children(self):
method test_create_with_parent (line 401) | def test_create_with_parent(self):
method test_change_parent (line 418) | def test_change_parent(self):
method test_change_children (line 447) | def test_change_children(self):
method test_getitem (line 475) | def test_getitem(self):
method test_get (line 490) | def test_get(self):
method test_setitem (line 503) | def test_setitem(self):
method test_delitem (line 543) | def test_delitem(self):
method test_append (line 561) | def test_append(self):
method test_add (line 576) | def test_add(self):
method test_insert (line 591) | def test_insert(self):
method test_remove (line 602) | def test_remove(self):
method test_discard (line 616) | def test_discard(self):
method test_pop (line 630) | def test_pop(self):
method test_clear (line 648) | def test_clear(self):
method test_load_full (line 661) | def test_load_full(self):
method test_load_partial (line 680) | def test_load_partial(self):
method test_save_field (line 706) | def test_save_field(self):
method test_save_id (line 761) | def test_save_id(self):
class ManyToManyRelationshipTests (line 829) | class ManyToManyRelationshipTests(DataStoreTestCase):
method setUp (line 830) | def setUp(self):
method test_no_relation (line 859) | def test_no_relation(self):
method test_set_relation (line 874) | def test_set_relation(self):
method test_set_reverse_relation (line 901) | def test_set_reverse_relation(self):
method test_create_with_relation (line 928) | def test_create_with_relation(self):
method test_create_with_reverse_relation (line 954) | def test_create_with_reverse_relation(self):
method test_change_relation (line 980) | def test_change_relation(self):
method test_change_reverse_relation (line 1059) | def test_change_reverse_relation(self):
method test_load_full (line 1138) | def test_load_full(self):
method test_load_partial (line 1159) | def test_load_partial(self):
method test_save_field (line 1174) | def test_save_field(self):
method test_save_id (line 1234) | def test_save_id(self):
class PolymorphicRelationshipTests (line 1308) | class PolymorphicRelationshipTests(DataStoreTestCase):
method setUp (line 1309) | def setUp(self):
method test_no_children (line 1344) | def test_no_children(self):
method test_set_children (line 1353) | def test_set_children(self):
method test_add_to_children (line 1378) | def test_add_to_children(self):
method test_set_parent (line 1404) | def test_set_parent(self):
method test_create_with_children (line 1430) | def test_create_with_children(self):
method test_create_with_parent (line 1455) | def test_create_with_parent(self):
method test_change_parent (line 1481) | def test_change_parent(self):
method test_change_children (line 1547) | def test_change_children(self):
method test_getitem (line 1576) | def test_getitem(self):
method test_get (line 1595) | def test_get(self):
method test_setitem (line 1612) | def test_setitem(self):
method test_delitem (line 1653) | def test_delitem(self):
method test_append (line 1671) | def test_append(self):
method test_add (line 1687) | def test_add(self):
method test_insert (line 1703) | def test_insert(self):
method test_remove (line 1715) | def test_remove(self):
method test_discard (line 1730) | def test_discard(self):
method test_pop (line 1745) | def test_pop(self):
method test_clear (line 1764) | def test_clear(self):
method test_load_full (line 1778) | def test_load_full(self):
method test_load_partial (line 1800) | def test_load_partial(self):
method test_save_field (line 1831) | def test_save_field(self):
method test_save_id (line 1882) | def test_save_id(self):
FILE: portia_server/portia_orm/tests/utils.py
class DataStoreTestCase (line 10) | class DataStoreTestCase(unittest.TestCase):
method setUp (line 11) | def setUp(self):
function mock_storage (line 17) | def mock_storage(files):
FILE: portia_server/portia_orm/utils.py
class cached_property_ignore_set (line 25) | class cached_property_ignore_set(cached_property):
method __set__ (line 26) | def __set__(self, instance, value):
class class_property (line 30) | class class_property(object):
method __init__ (line 32) | def __init__(self, fget=None):
method __get__ (line 37) | def __get__(self, instance, instance_type=None):
function short_guid (line 41) | def short_guid():
function validate_type (line 48) | def validate_type(value, model):
function unwrap_envelopes (line 55) | def unwrap_envelopes(data, many, pk_field, remove_key):
function wrap_envelopes (line 73) | def wrap_envelopes(data, many, pk_field, remove_key):
class AttributeDict (line 85) | class AttributeDict(dict):
method __getattr__ (line 86) | def __getattr__(self, name):
function strip_json (line 95) | def strip_json(fname):
class OrderedIndexedTransformDict (line 101) | class OrderedIndexedTransformDict(object):
method __init__ (line 104) | def __init__(self, transform, init_dict=None, **kwargs):
method getitem (line 114) | def getitem(self, key):
method __len__ (line 119) | def __len__(self):
method __iter__ (line 122) | def __iter__(self):
method __getitem__ (line 125) | def __getitem__(self, key):
method __setitem__ (line 131) | def __setitem__(self, key, value=None):
method __delitem__ (line 135) | def __delitem__(self, key):
method clear (line 151) | def clear(self):
method __contains__ (line 154) | def __contains__(self, key):
method get (line 157) | def get(self, key, default=None):
method pop (line 160) | def pop(self, key, default=_SENTINEL):
method items (line 168) | def items(self):
method update (line 171) | def update(self, value, **kws):
method insert (line 174) | def insert(self, index, value):
method replace (line 181) | def replace(self, key, value):
method _remove (line 189) | def _remove(self, key):
method popitem (line 198) | def popitem(self):
method copy (line 202) | def copy(self):
method __getstate__ (line 209) | def __getstate__(self):
method __setstate__ (line 212) | def __setstate__(self, state):
method __repr__ (line 215) | def __repr__(self):
FILE: portia_server/portia_server/backends.py
class LocalAuthentication (line 4) | class LocalAuthentication(object):
method authenticate (line 5) | def authenticate(self, request, **kwargs):
method get_user (line 8) | def get_user(self, user_id):
FILE: portia_server/portia_server/models.py
class LocalUser (line 12) | class LocalUser(AnonymousUser):
class _meta (line 18) | class _meta:
method __init__ (line 22) | def __init__(self, **kwargs):
method pk (line 31) | def pk(self):
method __str__ (line 34) | def __str__(self):
method __eq__ (line 37) | def __eq__(self, other):
method __hash__ (line 40) | def __hash__(self):
method is_anonymous (line 43) | def is_anonymous(self):
method is_authenticated (line 46) | def is_authenticated(self):
method save (line 49) | def save(self, *args, **kwargs):
FILE: portia_server/portia_server/views.py
function capabilities (line 5) | def capabilities(request):
FILE: portia_server/storage/__init__.py
function get_storage_class (line 12) | def get_storage_class():
function create_project_storage (line 20) | def create_project_storage(project_id, author=None, branch=None):
FILE: portia_server/storage/apps.py
class StorageConfig (line 4) | class StorageConfig(AppConfig):
FILE: portia_server/storage/backends.py
class InvalidFilename (line 35) | class InvalidFilename(Exception):
class CommittingStorage (line 39) | class CommittingStorage(object):
method init_project (line 58) | def init_project(self):
method get_projects (line 69) | def get_projects(cls, user):
method setup (line 79) | def setup(cls):
method get_available_name (line 82) | def get_available_name(self, name, max_length=None):
method commit (line 85) | def commit(self, message='Saving multiple files'):
method changed_files (line 88) | def changed_files(self):
class BasePortiaStorage (line 92) | class BasePortiaStorage(CommittingStorage, Storage):
method __init__ (line 93) | def __init__(self, name, author=None):
method is_valid_filename (line 97) | def is_valid_filename(s):
method validate_filename (line 106) | def validate_filename(cls, s):
method open_with_default (line 113) | def open_with_default(self, name, default=None):
class FsStorage (line 122) | class FsStorage(BasePortiaStorage, FileSystemStorage):
method __init__ (line 125) | def __init__(self, name, author=None, *args, **kwargs):
method isdir (line 133) | def isdir(self, name):
method isfile (line 136) | def isfile(self, name):
method move (line 139) | def move(self, old_file_name, new_file_name, allow_overwrite=False):
method rmtree (line 146) | def rmtree(self, name):
method _save (line 149) | def _save(self, name, content):
method delete (line 189) | def delete(self, name):
class GitStorage (line 198) | class GitStorage(BasePortiaStorage):
method __init__ (line 201) | def __init__(self, name, author=None):
method checkout (line 213) | def checkout(self, commit=None, branch=None, retry=True):
method setup (line 255) | def setup(cls):
method _open (line 259) | def _open(self, name, mode='rb'):
method _save (line 272) | def _save(self, name, content):
method delete (line 279) | def delete(self, name):
method exists (line 290) | def exists(self, name):
method listdir (line 296) | def listdir(self, path):
method isdir (line 320) | def isdir(self, name):
method isfile (line 327) | def isfile(self, name):
method move (line 333) | def move(self, old_name, new_name, allow_overwrite=False):
method rmtree (line 352) | def rmtree(self, name):
method path (line 361) | def path(self, path):
method commit (line 373) | def commit(self, message='Saving multiple files'):
method changed_files (line 403) | def changed_files(self):
FILE: portia_server/storage/jsondiff.py
class Conflict (line 9) | class Conflict(object):
method __init__ (line 10) | def __init__(self, mine, other, base):
method from_prepared (line 16) | def from_prepared(cls, mine, other, base):
method resolve_sub_conflict (line 27) | def resolve_sub_conflict(cls, mine, other):
method update (line 31) | def update(self, m, o, b):
method resolve_conflict (line 39) | def resolve_conflict(self):
method _asdict (line 76) | def _asdict(self):
method __eq__ (line 85) | def __eq__(self, other):
method __str__ (line 89) | def __str__(self):
method __repr__ (line 92) | def __repr__(self):
function merge_lists (line 96) | def merge_lists(base, mine, other):
class JsonDiff (line 129) | class JsonDiff(object):
method __init__ (line 135) | def __init__(self, old, new):
method op_for_field (line 143) | def op_for_field(self, field_name):
function merge_jsons (line 154) | def merge_jsons(base, mine, other):
FILE: portia_server/storage/repoman.py
class Repoman (line 26) | class Repoman(object):
method __init__ (line 49) | def __init__(self, author):
method setup (line 60) | def setup(cls, storage_backend):
method create_repo (line 64) | def create_repo(cls, repo_name, author=None):
method open_repo (line 78) | def open_repo(cls, repo_name, author=None):
method repo_exists (line 86) | def repo_exists(cls, repo_name):
method list_repos (line 91) | def list_repos(cls):
method delete_repo (line 95) | def delete_repo(cls, repo_name):
method refs (line 100) | def refs(self):
method create_branch (line 103) | def create_branch(self, branch_name, at_revision=None):
method delete_branch (line 112) | def delete_branch(self, branch_name):
method has_branch (line 120) | def has_branch(self, branch_name):
method get_branch (line 124) | def get_branch(self, branch_name):
method save_file (line 128) | def save_file(self, file_path, contents, branch_name, commit_message=N...
method save_files (line 136) | def save_files(self, files, branch_name, commit_message=None):
method blob_for_branch (line 149) | def blob_for_branch(self, file_path, branch_name):
method blob (line 156) | def blob(self, file_path, revision):
method file_contents_for_branch (line 162) | def file_contents_for_branch(self, file_path, branch_name):
method list_files_for_branch (line 169) | def list_files_for_branch(self, branch_name):
method list_files (line 177) | def list_files(self, revision):
method publish_branch (line 183) | def publish_branch(self, branch_name, force=False, message=None,
method _publish_branch (line 214) | def _publish_branch(self, branch_name, force=False, message=None):
method advance_branch (line 242) | def advance_branch(self, commit, tree=sentinel, branch='master'):
method get_published_revisions (line 256) | def get_published_revisions(self):
method get_branch_checkpoints (line 264) | def get_branch_checkpoints(self, branch_name):
method get_branch_changed_entries (line 277) | def get_branch_changed_entries(self, branch_name):
method get_branch_changed_files (line 284) | def get_branch_changed_files(self, branch_name):
method add_tag (line 289) | def add_tag(self, tag_name):
method checkout_tag (line 301) | def checkout_tag(self, tag_name, remove=False):
method _merge_branches (line 311) | def _merge_branches(self, base, mine, other, take_mine=False):
method _perform_file_operation (line 407) | def _perform_file_operation(self, branch_name, operation, *args):
method _save_file (line 414) | def _save_file(self, parent_commit, file_path, contents, commit_message):
method _save_files (line 420) | def _save_files(self, parent_commit, files, commit_message):
method _update_store (line 440) | def _update_store(self, *args):
method _advance_branch (line 444) | def _advance_branch(self, branch_name, commit):
method _get_tree (line 447) | def _get_tree(self, revision):
method _create_commit (line 451) | def _create_commit(self):
method _get_head (line 459) | def _get_head(self):
method _is_ancestor_commit (line 465) | def _is_ancestor_commit(self, descendant, ancestor):
FILE: portiaui/app/adapters/application.js
constant DELETED_EXTENSION (line 6) | const DELETED_EXTENSION = 'https://portia.scrapinghub.com/jsonapi/extens...
constant UPDATES_EXTENSION (line 7) | const UPDATES_EXTENSION = 'https://portia.scrapinghub.com/jsonapi/extens...
function filter_update_errors (line 9) | function filter_update_errors(errors, pointer) {
method selfLink (line 37) | selfLink(type, id, snapshot /*, query */) {
method relatedLink (line 44) | relatedLink(type, id, snapshot /*, query */) {
method createRecord (line 73) | createRecord(store, type, snapshot) {
method updateRecord (line 82) | updateRecord(store, type, snapshot) {
method deleteRecord (line 91) | deleteRecord(store, type, snapshot) {
method dataForRequest (line 100) | dataForRequest(params) {
method methodForRequest (line 160) | methodForRequest(params) {
method urlForRequest (line 170) | urlForRequest(params) {
method headersForRequest (line 182) | headersForRequest(params) {
method _requestFor (line 203) | _requestFor(params) {
method _makeRequest (line 219) | _makeRequest(request) {
method _getExtentionAliases (line 393) | _getExtentionAliases(response, extention) {
method ajaxOptions (line 406) | ajaxOptions(url, method, request = {}) {
FILE: portiaui/app/adapters/project.js
method shouldReloadRecord (line 7) | shouldReloadRecord() { return true; }
FILE: portiaui/app/components/add-start-url-button.js
method toggleStartUrl (line 21) | toggleStartUrl() {
method _toggleStartUrl (line 29) | _toggleStartUrl() {
FILE: portiaui/app/components/animation-container.js
method didReceiveAttrs (line 15) | didReceiveAttrs({oldAttrs, newAttrs}) {
method didInsertElement (line 51) | didInsertElement() {
method willDestroyElement (line 64) | willDestroyElement() {
method readPosition (line 73) | readPosition(rects, boundingRect, element) {
method updatePosition (line 77) | updatePosition(rects, boundingRect) {
method updateSize (line 88) | updateSize(rects, boundingRect) {
method transitionEnd (line 101) | transitionEnd($event) {
FILE: portiaui/app/components/annotation-options.js
method get (line 29) | get() {
method set (line 33) | set(key, value) {
method get (line 46) | get() {
method set (line 50) | set(key, value) {
method get (line 56) | get() {
method set (line 60) | set(key, value) {
method get (line 71) | get() {
method set (line 75) | set(key, value) {
method setAnnotationSelector (line 104) | setAnnotationSelector(annotation, selector) {
method updateSelector (line 111) | updateSelector(sample) {
method _updateSelector (line 124) | _updateSelector(sample) {
method save (line 135) | save() {
FILE: portiaui/app/components/browser-iframe.js
function hashString (line 9) | function hashString(string) {
method init (line 44) | init() {
method click (line 50) | click() {
method willInsertElement (line 63) | willInsertElement() {
method didInsertElement (line 79) | didInsertElement() {
method willDestroyElement (line 89) | willDestroyElement() {
method documentLoaded (line 107) | documentLoaded() {
method _loadUrl (line 119) | _loadUrl() {
method visit (line 164) | visit(url, baseurl) {
method msgLoadStarted (line 180) | msgLoadStarted(data) {
method failedLoad (line 191) | failedLoad(reason) {
method msgLoadFinished (line 213) | msgLoadFinished(data) {
method msgLoad (line 225) | msgLoad(data) {
method msgMetadata (line 229) | msgMetadata(data) {
method handleMetadataError (line 242) | handleMetadataError() {
method msgMutation (line 256) | msgMutation(data) {
method msgCookies (line 278) | msgCookies(data) {
method noop (line 286) | noop() {
method loadCookies (line 290) | loadCookies(){
method unbindEventHandlers (line 301) | unbindEventHandlers() {
method addFrameEventListener (line 309) | addFrameEventListener(event, fn, useCapture=false) {
method bindEventHandlers (line 315) | bindEventHandlers() {
method clickHandlerBrowse (line 341) | clickHandlerBrowse(evt) {
method postEvent (line 350) | postEvent(evt) {
method clearIframe (line 361) | clearIframe() {
method iframeSize (line 382) | iframeSize() {
FILE: portiaui/app/components/browser-url-failing.js
method reloadPage (line 9) | reloadPage() {
FILE: portiaui/app/components/browser-view-port.js
method willInsertElement (line 14) | willInsertElement() {
method willDestroyElement (line 18) | willDestroyElement() {
method updateHoveredElement (line 22) | updateHoveredElement(elements) {
method viewPortClick (line 29) | viewPortClick() {
method reconnectWebsocket (line 35) | reconnectWebsocket() {
FILE: portiaui/app/components/buffered-input.js
method didInsertElement (line 15) | didInsertElement() {
method get (line 32) | get() {
method set (line 40) | set(key, value, cachedValue) {
method setInputFocus (line 54) | setInputFocus() {
method validateName (line 68) | validateName(name) {
method startEditing (line 73) | startEditing() {
method cancelEditing (line 80) | cancelEditing() {
method endEditing (line 87) | endEditing(reason) {
FILE: portiaui/app/components/combo-box.js
method get (line 10) | get() {
method set (line 14) | set(key, value) {
method setInputFocus (line 20) | setInputFocus(ignoreAutoSelect = false) {
method getValueAttribute (line 32) | getValueAttribute(value) {
method updateViewValue (line 40) | updateViewValue() {
method restoreFocus (line 66) | restoreFocus() {
FILE: portiaui/app/components/create-project-button.js
method addProject (line 13) | addProject() {
FILE: portiaui/app/components/create-spider-button.js
method addSpider (line 15) | addSpider() {
FILE: portiaui/app/components/data-structure-annotations.js
method enterAnnotation (line 14) | enterAnnotation(annotation) {
method leaveAnnotation (line 18) | leaveAnnotation() {
method enterItem (line 22) | enterItem(item) {
method leaveItem (line 26) | leaveItem() {
method removeAnnotation (line 30) | removeAnnotation(annotation) {
method removeItem (line 34) | removeItem(item) {
method saveItem (line 38) | saveItem(item) {
FILE: portiaui/app/components/data-structure-listing.js
method addItem (line 11) | addItem(sample) {
method removeItem (line 15) | removeItem(item) {
FILE: portiaui/app/components/dropdown-delete.js
method onDelete (line 13) | onDelete() {
FILE: portiaui/app/components/dropdown-item.js
method didInsertElement (line 19) | didInsertElement() {
method willDestroyElement (line 26) | willDestroyElement() {
method performAction (line 34) | performAction(value) {
FILE: portiaui/app/components/dropdown-menu.js
function computedItem (line 3) | function computedItem(propertyName) {
method init (line 58) | init() {
method next (line 69) | next(type) {
method previous (line 89) | previous(type) {
method triggerAction (line 109) | triggerAction(type) {
method validateType (line 118) | validateType(type, fallback) {
method focusIn (line 125) | focusIn() {
method focusOut (line 131) | focusOut() {
method keyDown (line 137) | keyDown() {
method registerItem (line 141) | registerItem(item) {
method unRegisterItem (line 146) | unRegisterItem(item) {
method updateItems (line 151) | updateItems() {
method orderItemsForSearch (line 157) | orderItemsForSearch(items) {
method valuesEqual (line 161) | valuesEqual(a, b) {
method keyDown (line 166) | keyDown($event) {
FILE: portiaui/app/components/dropdown-widget.js
method init (line 35) | init() {
method didInsertElement (line 47) | didInsertElement() {
method willDestroyElement (line 61) | willDestroyElement() {
method focusIn (line 73) | focusIn() {
method focusOut (line 77) | focusOut() {
method keyDown (line 81) | keyDown() {
method updateMenuSize (line 85) | updateMenuSize() {
method updatePosition (line 92) | updatePosition(rects, boundingRect) {
method click (line 113) | click() {
method openMenu (line 121) | openMenu() {
method closeMenu (line 127) | closeMenu(closeReason) {
method toggleMenu (line 134) | toggleMenu(closeReason) {
method focusIn (line 142) | focusIn() {
method focusOut (line 146) | focusOut() {
method keyDown (line 156) | keyDown(event) {
FILE: portiaui/app/components/edit-sample-button.js
method getUrlDomain (line 45) | getUrlDomain(uri) {
method addSample (line 57) | addSample() {
FILE: portiaui/app/components/element-overlay.js
method init (line 12) | init() {
method didInsertElement (line 17) | didInsertElement() {
method willDestroyElement (line 21) | willDestroyElement() {
method notifyAddOverlay (line 25) | notifyAddOverlay() {
method notifyRemoveOverlay (line 29) | notifyRemoveOverlay() {
method didReceiveAttrs (line 33) | didReceiveAttrs({oldAttrs, newAttrs}) {
method on (line 51) | on(name, ...params) {
method readContainerSize (line 56) | readContainerSize(rects, boundingRect, element) {
method updatePosition (line 66) | updatePosition(rects) {
FILE: portiaui/app/components/element-rect-overlay.js
method didInsertElement (line 22) | didInsertElement() {
method willDestroyElement (line 27) | willDestroyElement() {
method updatePosition (line 31) | updatePosition(rects) {
FILE: portiaui/app/components/extractor-options.js
method save (line 16) | save() {
method addTypeExtractor (line 20) | addTypeExtractor(type) {
method addRegexExtractor (line 25) | addRegexExtractor(extractor) {
method addNewRegexExtractor (line 30) | addNewRegexExtractor() {
method removeExtractor (line 35) | removeExtractor(extractor) {
method saveExtractor (line 40) | saveExtractor(extractor) {
FILE: portiaui/app/components/feed-url-options.js
method didRender (line 7) | didRender() {
method saveFeedUrl (line 13) | saveFeedUrl() {
FILE: portiaui/app/components/field-options.js
method save (line 9) | save() {
FILE: portiaui/app/components/fragment-options.js
constant TOOLTIP_DEBOUNCE (line 13) | const TOOLTIP_DEBOUNCE = 1000;
constant TOOLTIP_DELAY (line 14) | const TOOLTIP_DELAY = 2000;
constant VALIDATIONS (line 16) | const VALIDATIONS = {
method get (line 35) | get() {
method set (line 38) | set(key, value) {
method limits (line 66) | limits() {
method get (line 71) | get() {
method set (line 74) | set(key, value) {
method get (line 80) | get() {
method set (line 83) | set(key, value) {
method updateFragment (line 90) | updateFragment() {
method updateLimit (line 97) | updateLimit(value, index) {
method changeFragmentType (line 107) | changeFragmentType(value) {
method focusFragment (line 113) | focusFragment() {
method saveChangeset (line 119) | saveChangeset() {
method saveFragment (line 142) | saveFragment() {
method updateValue (line 150) | updateValue() {
method changeFragmentType (line 154) | changeFragmentType() {
FILE: portiaui/app/components/generated-url-options.js
method addFragment (line 30) | addFragment() {
method removeFragment (line 35) | removeFragment(fragment) {
FILE: portiaui/app/components/icon-button.js
constant ICON_CLASSES (line 3) | const ICON_CLASSES = {
method beforeClick (line 66) | beforeClick() {}
method click (line 68) | click() {
FILE: portiaui/app/components/input-with-clear.js
method clear (line 10) | clear() {
method keyUp (line 15) | keyUp() {
FILE: portiaui/app/components/inspector-panel.js
constant IGNORED_ATTRIBUTES (line 4) | const IGNORED_ATTRIBUTES = new Set([
function hasContentAttribute (line 10) | function hasContentAttribute(element) {
function getAttributeList (line 14) | function getAttributeList(element) {
function getDefaultAttribute (line 53) | function getDefaultAttribute(element) {
method addAnnotation (line 123) | addAnnotation(attribute) {
method changeAnnotationSource (line 128) | changeAnnotationSource(attribute) {
method selectParent (line 133) | selectParent(element) {
method selectChild (line 137) | selectChild(element) {
FILE: portiaui/app/components/link-crawling-options.js
method save (line 10) | save() {
FILE: portiaui/app/components/list-item-add-annotation-menu.js
method addAnnotation (line 15) | addAnnotation() {
method addNestedItem (line 20) | addNestedItem() {
FILE: portiaui/app/components/list-item-annotation-field.js
method validateFieldName (line 14) | validateFieldName(name) {
method addField (line 31) | addField(name) {
method changeField (line 44) | changeField() {
FILE: portiaui/app/components/list-item-editable.js
method click (line 13) | click() {
method startEditing (line 20) | startEditing() {
FILE: portiaui/app/components/list-item-field-type.js
method saveField (line 13) | saveField() {
FILE: portiaui/app/components/list-item-icon-menu.js
method clickIcon (line 9) | clickIcon() {
FILE: portiaui/app/components/list-item-icon.js
method beforeClick (line 6) | beforeClick() {
FILE: portiaui/app/components/list-item-item-schema.js
method addSchema (line 12) | addSchema(name) {
method changeSchema (line 22) | changeSchema() {
FILE: portiaui/app/components/list-item-link-crawling.js
method get (line 39) | get() {
method set (line 43) | set(key, value) {
method saveSpider (line 50) | saveSpider() {
FILE: portiaui/app/components/list-item-relation-manager.js
method orderItemsForSearch (line 15) | orderItemsForSearch(items) {
method valuesEqual (line 30) | valuesEqual(a, b) {
method validateName (line 36) | validateName(name) {
method add (line 41) | add(name) {
method rename (line 51) | rename(name) {
FILE: portiaui/app/components/list-item-selectable.js
method startSelecting (line 18) | startSelecting() {
FILE: portiaui/app/components/notification-container.js
method dismissNotification (line 44) | dismissNotification(notification) {
method fadeBanner (line 48) | fadeBanner(banner) {
method fadeNotification (line 52) | fadeNotification(notification) {
FILE: portiaui/app/components/notification-message.js
method init (line 23) | init() {
method didReceiveAttrs (line 28) | didReceiveAttrs({newAttrs, oldAttrs}) {
method didInsertElement (line 34) | didInsertElement() {
method fadeIn (line 43) | fadeIn() {
method fadeOut (line 49) | fadeOut() {
method transitionEnd (line 53) | transitionEnd() {
FILE: portiaui/app/components/page-actions-editor.js
constant TYPES (line 3) | const TYPES = ['click', 'set', 'wait'];
FILE: portiaui/app/components/project-list.js
method search (line 37) | search(value) {
method clear (line 40) | clear() {
method selectProject (line 48) | selectProject(project) {
FILE: portiaui/app/components/project-listing.js
method deploy (line 32) | deploy() {
method publish (line 49) | publish() {
method discard (line 66) | discard() {
method clickProjectOptions (line 79) | clickProjectOptions() {
FILE: portiaui/app/components/project-structure-listing.js
constant LIMIT (line 6) | const LIMIT = 15;
constant FILTER_DEBOUNCE (line 7) | const FILTER_DEBOUNCE = 800;
constant TURN_PAGE_DEBOUNCE (line 8) | const TURN_PAGE_DEBOUNCE = 200;
method didReceiveAttrs (line 18) | didReceiveAttrs() {
method get (line 98) | get() {
method addSchema (line 108) | addSchema() {
method removeSchema (line 112) | removeSchema(schema) {
method setSchemaDefault (line 116) | setSchemaDefault(schema) {
method removeSchemaDefault (line 122) | removeSchemaDefault(schema) {
method saveSchema (line 127) | saveSchema(schema) {
method addSpider (line 131) | addSpider() {
method _fuzzyFilter (line 136) | _fuzzyFilter(items, term) {
method _addCurrentSpider (line 145) | _addCurrentSpider(spiders) {
method _updateFilter (line 152) | _updateFilter(spiders, term = '') {
FILE: portiaui/app/components/project-structure-spider-feed-url.js
method get (line 14) | get() {
method set (line 17) | set(key, value) {
method saveStartUrl (line 22) | saveStartUrl(url) {
FILE: portiaui/app/components/project-structure-spider-url.js
method get (line 15) | get() {
method set (line 19) | set(key, value, oldValue) {
method handleNewUrl (line 25) | handleNewUrl(oldUrl, newUrl) {
method removeStartUrl (line 36) | removeStartUrl() {
method saveStartUrl (line 42) | saveStartUrl(oldUrl, newUrl) {
FILE: portiaui/app/components/regex-pattern-list.js
method triggerChange (line 18) | triggerChange() {
method addPattern (line 25) | addPattern(pattern) {
method clearPattern (line 33) | clearPattern() {
method changePattern (line 37) | changePattern(index, value) {
method removePattern (line 47) | removePattern(index) {
method stopPropagation (line 52) | stopPropagation($event) {
FILE: portiaui/app/components/save-status.js
method init (line 13) | init() {
method get (line 19) | get() {
method set (line 38) | set(key, value) {
method get (line 51) | get() {
method set (line 66) | set(key, value) {
FILE: portiaui/app/components/schema-structure-listing.js
function validateFieldName (line 4) | function validateFieldName(name, fields) {
method addField (line 26) | addField() {
method removeField (line 30) | removeField(field) {
method validateFieldName (line 34) | validateFieldName(field, name) {
method saveField (line 45) | saveField(field) {
FILE: portiaui/app/components/select-box.js
method get (line 18) | get() {
method set (line 22) | set(key, value) {
method didInsertElement (line 27) | didInsertElement() {
method setInputFocus (line 41) | setInputFocus() {
method setViewValue (line 51) | setViewValue(value) {
method setValue (line 55) | setValue(value) {
method setValueAndClose (line 62) | setValueAndClose(value) {
method menuClicked (line 72) | menuClicked() {
method menuClosed (line 80) | menuClosed(reason) {
FILE: portiaui/app/components/show-links-button.js
method toggleShowLinks (line 11) | toggleShowLinks() {
FILE: portiaui/app/components/spider-message.js
method runSpider (line 10) | runSpider(spider) {
FILE: portiaui/app/components/spider-options.js
method save (line 10) | save() {
FILE: portiaui/app/components/spider-row.js
method init (line 16) | init() {
method notifyError (line 33) | notifyError(spider) {
method validateSpiderName (line 40) | validateSpiderName(spider, name) {
method removeSpider (line 58) | removeSpider(spider) {
method saveSpiderName (line 61) | saveSpiderName(spider) {
method closeSpiderOptions (line 72) | closeSpiderOptions() {
method copySpider (line 75) | copySpider() {
method copyToProject (line 78) | copyToProject(options, project) {
method _copyProjectSuccess (line 90) | _copyProjectSuccess(project) {
method _copyProjectError (line 97) | _copyProjectError(data) {
method _afterCopyProject (line 104) | _afterCopyProject(options) {
FILE: portiaui/app/components/spider-structure-listing.js
method init (line 18) | init() {
method getNewStartUrl (line 23) | getNewStartUrl(newUrl) {
method getNewUrl (line 32) | getNewUrl() {
method addStartUrl (line 42) | addStartUrl() {
method addGenerationUrl (line 54) | addGenerationUrl() {
method addFeedUrl (line 61) | addFeedUrl() {
method removeStartUrl (line 67) | removeStartUrl(startUrl) {
method addSample (line 72) | addSample() {
method removeSample (line 76) | removeSample(sample) {
method saveSample (line 80) | saveSample(sample) {
FILE: portiaui/app/components/start-url-options.js
constant SPIDER_DEBOUNCE (line 5) | const SPIDER_DEBOUNCE = 1000;
FILE: portiaui/app/components/tool-group.js
method init (line 14) | init() {
method close (line 24) | close() {
method selectTab (line 29) | selectTab(toolId) {
method toggleCollapsed (line 35) | toggleCollapsed() {
FILE: portiaui/app/components/tool-tab.js
method didInsertElement (line 10) | didInsertElement() {
method selectTab (line 21) | selectTab() {
FILE: portiaui/app/components/tooltip-container.js
method init (line 21) | init() {
method didInsertElement (line 26) | didInsertElement() {
method willDestroyElement (line 30) | willDestroyElement() {
method createTooltip (line 34) | createTooltip() {
method destroyTooltip (line 54) | destroyTooltip() {
FILE: portiaui/app/components/tooltip-icon.js
method onClick (line 7) | onClick() {
FILE: portiaui/app/components/tree-list-item-row.js
method mouseEnter (line 6) | mouseEnter() {
method mouseLeave (line 12) | mouseLeave() {
FILE: portiaui/app/components/url-bar.js
method submit (line 23) | submit($event) {
method back (line 30) | back() {
method forward (line 36) | forward() {
method submit (line 42) | submit(url) {
FILE: portiaui/app/controllers/projects/project.js
method setClickHandler (line 12) | setClickHandler(fn) {
method clearClickHandler (line 16) | clearClickHandler() {
method viewPortClick (line 21) | viewPortClick() {
FILE: portiaui/app/controllers/projects/project/conflicts/conflict.js
function isConflict (line 4) | function isConflict(obj) {
function sortKeys (line 13) | function sortKeys(keys){
function applyPatches (line 34) | function applyPatches(obj, values) {
function patch (line 45) | function patch(obj, path, value) {
FILE: portiaui/app/controllers/projects/project/schema/field/options.js
method closeOptions (line 5) | closeOptions() {
FILE: portiaui/app/controllers/projects/project/spider.js
function filterLinkElements (line 4) | function filterLinkElements(filterFn) {
function mapOverlayElements (line 19) | function mapOverlayElements(elementsProperty, color) {
method init (line 59) | init() {
method activate (line 65) | activate() {
method deactivate (line 69) | deactivate() {
method updateLinkElements (line 73) | updateLinkElements(elements) {
FILE: portiaui/app/controllers/projects/project/spider/link-options.js
method closeOptions (line 5) | closeOptions() {
FILE: portiaui/app/controllers/projects/project/spider/options.js
method closeOptions (line 5) | closeOptions() {
FILE: portiaui/app/controllers/projects/project/spider/sample/data.js
method get (line 40) | get(key) {
method set (line 55) | set(key, value) {
method get (line 67) | get() {
method set (line 76) | set(key, value) {
method toggleCSS (line 296) | toggleCSS() {
method toggleMagicTool (line 305) | toggleMagicTool() {
method selectElement (line 321) | selectElement() {
FILE: portiaui/app/controllers/projects/project/spider/sample/data/annotation/options.js
method closeOptions (line 5) | closeOptions() {
FILE: portiaui/app/helpers/array-get.js
method compute (line 4) | compute(params/*, hash*/) {
FILE: portiaui/app/helpers/attribute-annotation.js
method compute (line 4) | compute([annotations, attribute]) {
FILE: portiaui/app/helpers/chain-actions.js
function chainActions (line 3) | function chainActions(params/*, hash*/) {
FILE: portiaui/app/helpers/guid.js
function guid (line 3) | function guid([obj]/*, hash*/) {
FILE: portiaui/app/helpers/includes.js
function includes (line 3) | function includes([list, value]) {
FILE: portiaui/app/helpers/indexed-object.js
function indexedObject (line 3) | function indexedObject([ param ] /*, hash*/) {
FILE: portiaui/app/helpers/is-empty-object.js
function isEmptyObject (line 4) | function isEmptyObject(params) {
FILE: portiaui/app/helpers/is-object-or-array.js
function isObjectOrArray (line 5) | function isObjectOrArray(params) {
FILE: portiaui/app/helpers/is-object.js
function isObject (line 4) | function isObject([object]) {
FILE: portiaui/app/initializers/ui-state.js
function initialize (line 1) | function initialize(application) {
FILE: portiaui/app/instance-initializers/error-handler.js
function logErrorStack (line 4) | function logErrorStack(e, level) {
function initialize (line 20) | function initialize(applicationInstance) {
FILE: portiaui/app/mixins/options-route.js
method activate (line 6) | activate() {
method deactivate (line 10) | deactivate() {
FILE: portiaui/app/mixins/save-spider-mixin.js
method saveSpider (line 6) | saveSpider() {
FILE: portiaui/app/models/annotation.js
method defaultValue (line 28) | defaultValue() {
method defaultValue (line 33) | defaultValue() {
method addElement (line 51) | addElement(element) {
method removeElement (line 55) | removeElement(element) {
method moveElement (line 59) | moveElement(element, toProperty, fromProperty) {
method setSelector (line 87) | setSelector(selector) {
FILE: portiaui/app/models/base.js
function runActions (line 9) | function runActions() {
function mergeSaveOptions (line 40) | function mergeSaveOptions(dst, src) {
method save (line 84) | save(options) {
method deleteRecord (line 210) | deleteRecord() {
method reload (line 217) | reload() {
method set (line 241) | set(key) {
method setProperties (line 246) | setProperties(hash) {
method _clearPendingDelete (line 268) | _clearPendingDelete(...keys) {
FILE: portiaui/app/models/field.js
constant FIELD_TYPES (line 4) | const FIELD_TYPES = [
FILE: portiaui/app/models/project.js
function memberActionAndMarkClean (line 7) | function memberActionAndMarkClean(options) {
method markClean (line 32) | markClean() {
FILE: portiaui/app/models/sample.js
method normalizeTitle (line 24) | normalizeTitle(title) {
FILE: portiaui/app/models/spider.js
method get (line 8) | get() {
method set (line 11) | set(key, value) {
method defaultValue (line 18) | defaultValue() {
method defaultValue (line 24) | defaultValue() {
method defaultValue (line 41) | defaultValue() {
method defaultValue (line 46) | defaultValue() {
method defaultValue (line 59) | defaultValue() {
method defaultValue (line 64) | defaultValue() {
FILE: portiaui/app/models/start-url.js
method show (line 11) | show() {
method serialize (line 15) | serialize() {
method addSerialized (line 22) | addSerialized(serialized) { return serialized; }
method save (line 24) | save(spider) {
method init (line 38) | init() {
method show (line 49) | show() {
method addSerialized (line 53) | addSerialized(serialized) {
method generateList (line 58) | generateList() {
method show (line 81) | show() {
method _raw_url (line 85) | _raw_url() {
function buildStartUrl (line 109) | function buildStartUrl(startUrl) {
FILE: portiaui/app/routes/browsers.js
method model (line 19) | model() {
FILE: portiaui/app/routes/index.js
function identity (line 4) | function identity(x) { return x; }
method model (line 7) | model() {
method redirect (line 11) | redirect(model) {
FILE: portiaui/app/routes/projects.js
method model (line 4) | model() {
FILE: portiaui/app/routes/projects/project.js
method beforeModel (line 10) | beforeModel() {
method model (line 16) | model(params) {
method setupController (line 21) | setupController(controller, model) {
method deactivate (line 26) | deactivate() {
method renderTemplate (line 30) | renderTemplate() {
method projectNotFound (line 57) | projectNotFound() {
method conflict (line 69) | conflict() {
method reload (line 73) | reload() {
FILE: portiaui/app/routes/projects/project/compatibility.js
method model (line 4) | model(params) {
method redirect (line 8) | redirect({path}, {queryParams}) {
FILE: portiaui/app/routes/projects/project/conflicts.js
method model (line 4) | model() {
method renderTemplate (line 11) | renderTemplate() {
FILE: portiaui/app/routes/projects/project/conflicts/conflict.js
method model (line 4) | model(params) {
method renderTemplate (line 13) | renderTemplate() {
FILE: portiaui/app/routes/projects/project/schema.js
method model (line 4) | model(params) {
method afterModel (line 8) | afterModel(model) {
method renderTemplate (line 12) | renderTemplate() {
FILE: portiaui/app/routes/projects/project/schema/field.js
method model (line 4) | model(params) {
FILE: portiaui/app/routes/projects/project/schema/field/options.js
method model (line 5) | model() {
method renderTemplate (line 9) | renderTemplate() {
method close (line 17) | close() {
FILE: portiaui/app/routes/projects/project/spider.js
method model (line 6) | model(params) {
method afterModel (line 10) | afterModel(model) {
method redirect (line 14) | redirect(model, {queryParams}) {
method setupController (line 36) | setupController(controller) {
method resetController (line 43) | resetController(controller, isExiting) {
method renderTemplate (line 52) | renderTemplate() {
method error (line 75) | error() {
method transitionToFragments (line 80) | transitionToFragments(start_url_id) {
method closeOptions (line 84) | closeOptions() {
FILE: portiaui/app/routes/projects/project/spider/link-options.js
method model (line 5) | model() {
method renderTemplate (line 9) | renderTemplate() {
method close (line 18) | close() {
FILE: portiaui/app/routes/projects/project/spider/options.js
method model (line 5) | model() {
method renderTemplate (line 9) | renderTemplate() {
method close (line 18) | close() {
FILE: portiaui/app/routes/projects/project/spider/sample.js
method model (line 6) | model(params) {
method afterModel (line 10) | afterModel(model) {
method renderTemplate (line 16) | renderTemplate() {
method error (line 29) | error() {
FILE: portiaui/app/routes/projects/project/spider/sample/data.js
method init (line 9) | init() {
method model (line 14) | model() {
method afterModel (line 18) | afterModel(model) {
method activate (line 23) | activate() {
method deactivate (line 28) | deactivate() {
method renderTemplate (line 39) | renderTemplate() {
method renderOverlayTemplate (line 58) | renderOverlayTemplate() {
method updateDataStructure (line 66) | updateDataStructure(model) {
method viewPortClick (line 77) | viewPortClick() {
FILE: portiaui/app/routes/projects/project/spider/sample/data/annotation.js
method model (line 8) | model(params) {
method afterModel (line 12) | afterModel(model) {
method deactivate (line 21) | deactivate() {
method error (line 29) | error() {
FILE: portiaui/app/routes/projects/project/spider/sample/data/annotation/options.js
method model (line 5) | model() {
method afterModel (line 9) | afterModel() {
method renderTemplate (line 17) | renderTemplate() {
method close (line 25) | close() {
FILE: portiaui/app/routes/projects/project/spider/sample/data/item.js
method model (line 4) | model(params) {
method error (line 9) | error() {
FILE: portiaui/app/routes/projects/project/spider/sample/index.js
method redirect (line 4) | redirect(model, {queryParams}) {
FILE: portiaui/app/routes/projects/project/spider/start-url.js
method model (line 4) | model(params) {
FILE: portiaui/app/routes/projects/project/spider/start-url/options.js
method model (line 5) | model() {
method afterModel (line 14) | afterModel(model) {
method renderTemplate (line 20) | renderTemplate() {
method getSpider (line 27) | getSpider() {
method transitionToSpider (line 31) | transitionToSpider() {
method closeOptions (line 36) | closeOptions() {
FILE: portiaui/app/serializers/application.js
method normalize (line 4) | normalize(modelClass, resourceHash) {
method serialize (line 14) | serialize(snapshot, options) {
FILE: portiaui/app/services/annotation-structure.js
method init (line 12) | init() {
method destroy (line 17) | destroy() {
method addObservers (line 27) | addObservers() {
method removeObservers (line 159) | removeObservers() {
function createStructure (line 202) | function createStructure(sample) {
function updateStructureSelectors (line 237) | function updateStructureSelectors(structure, selectorMatcher) {
method addStructure (line 281) | addStructure(model, attribute, Class) {
method removeStructure (line 293) | removeStructure(model, attribute) {
method addDataStructure (line 305) | addDataStructure(sample) {
method removeDataStructure (line 309) | removeDataStructure(sample) {
FILE: portiaui/app/services/browser.js
constant NAVIGATION_MODE (line 5) | const NAVIGATION_MODE = 'navigation';
constant ANNOTATION_MODE (line 6) | const ANNOTATION_MODE = 'data-annotation';
constant INTERACTION_MODES (line 7) | const INTERACTION_MODES = new Set([ANNOTATION_MODE]);
constant DEFAULT_MODE (line 8) | const DEFAULT_MODE = NAVIGATION_MODE;
constant META_STYLE (line 11) | const META_STYLE = `<style title="portia-show-meta">
method get (line 81) | get() {
method set (line 86) | set(key, value) {
method get (line 95) | get() {
method set (line 99) | set(key, value) {
method init (line 108) | init() {
method invalidateUrl (line 126) | invalidateUrl() {
method go (line 130) | go(url) {
method back (line 148) | back() {
method forward (line 153) | forward() {
method reload (line 158) | reload() {
method checkCSS (line 162) | checkCSS() {
method disableCSS (line 169) | disableCSS() {
method enableCSS (line 188) | enableCSS() {
method setAnnotationMode (line 207) | setAnnotationMode() {
method clearAnnotationMode (line 211) | clearAnnotationMode() {
method _updateBuffers (line 218) | _updateBuffers(currentBuffer, otherBuffer) {
method _extract (line 232) | _extract() {
FILE: portiaui/app/services/changes.js
method get (line 12) | get() {
method set (line 21) | set(_, value) {
method init (line 33) | init() {
method _checkProjectChanges (line 43) | _checkProjectChanges() {
FILE: portiaui/app/services/dispatcher.js
function computedCanAddSpider (line 10) | function computedCanAddSpider() {
function computedCanAddSample (line 16) | function computedCanAddSample(spiderPropertyName) {
function computedEditableSample (line 28) | function computedEditableSample(spiderPropertyName) {
method addProject (line 47) | addProject(name, redirect = false) {
method addSchema (line 61) | addSchema(project, redirect = false) {
method addNamedSchema (line 66) | addNamedSchema(project, name, redirect = false) {
method addField (line 82) | addField(schema, type, redirect = false) {
method addNamedField (line 87) | addNamedField(schema, name, type, redirect = false) {
method addSpider (line 104) | addSpider(project, redirect = false) {
method addStartUrl (line 139) | addStartUrl(spider, url) {
method addGeneratedUrl (line 145) | addGeneratedUrl(spider, url) {
method addFeedUrl (line 158) | addFeedUrl(spider, url) {
method addSample (line 162) | addSample(spider, redirect = false) {
method addItem (line 195) | addItem(sample, redirect = false) {
method addNestedItem (line 201) | addNestedItem(parentItem, redirect = false) {
method _addItem (line 207) | _addItem(attributes, redirect = false) {
method addAnnotation (line 220) | addAnnotation(item, element, attribute, redirect = false) {
method saveAnnotationAndRelatedSelectors (line 260) | saveAnnotationAndRelatedSelectors(annotation) {
method addAnnotationTypeExtractor (line 295) | addAnnotationTypeExtractor(annotation, type) {
method addAnnotationRegexExtractor (line 320) | addAnnotationRegexExtractor(annotation, extractor) {
method addNewAnnotationRegexExtractor (line 325) | addNewAnnotationRegexExtractor(annotation) {
method addFragment (line 340) | addFragment(startUrl) {
method changeId (line 345) | changeId(model, json) {
method changeSpiderName (line 367) | changeSpiderName(spider) {
method changeAnnotationSource (line 382) | changeAnnotationSource(annotation, attribute) {
method removeSchema (line 389) | removeSchema(schema) {
method removeField (line 403) | removeField(field) {
method removeSpider (line 412) | removeSpider(spider) {
method removeStartUrl (line 421) | removeStartUrl(spider, url) {
method replaceStartUrl (line 426) | replaceStartUrl(spider, oldUrl, newUrl) {
method removeSample (line 438) | removeSample(sample) {
method removeItem (line 447) | removeItem(item) {
method removeAnnotation (line 458) | removeAnnotation(annotation) {
method removeAnnotationExtractor (line 472) | removeAnnotationExtractor(annotation, extractor) {
method removeFragment (line 477) | removeFragment(startUrl, fragment) {
method selectAnnotation (line 481) | selectAnnotation(annotation) {
method selectAnnotationElement (line 489) | selectAnnotationElement(annotation, element, redirect = false) {
method clearSelection (line 498) | clearSelection() {
method addElementToAnnotation (line 512) | addElementToAnnotation(annotation, element) {
method removeElementFromAnnotation (line 519) | removeElementFromAnnotation(annotation, element) {
method updateSampleSelectors (line 525) | updateSampleSelectors(sample) {
FILE: portiaui/app/services/extracted-items.js
constant SECOND (line 5) | const SECOND = 1000;
constant INITIAL_TIMEOUT (line 6) | const INITIAL_TIMEOUT = 2 * SECOND;
constant MAX_TIMEOUT (line 7) | const MAX_TIMEOUT = 30 * SECOND;
method init (line 25) | init() {
method activateExtraction (line 33) | activateExtraction() {
method failExtraction (line 41) | failExtraction(msg) {
method update (line 47) | update() {
method _getitems (line 51) | _getitems() {
method _setExtraction (line 63) | _setExtraction(data) {
method _setItems (line 81) | _setItems(data) {
method _startExtraction (line 92) | _startExtraction(data) {
method _updateItems (line 98) | _updateItems(data) {
method _updateExtraction (line 107) | _updateExtraction(data) {
method _finishExtraction (line 119) | _finishExtraction() {
FILE: portiaui/app/services/notification-manager.js
method add (line 7) | add(options) {
method addBanner (line 17) | addBanner(options) {
method removeBanner (line 21) | removeBanner(options) {
method showNotification (line 25) | showNotification(title, message, type) {
method showSuccessNotification (line 39) | showSuccessNotification(title, message) {
method showWarningNotification (line 43) | showWarningNotification(title, message) {
method showErrorNotification (line 47) | showErrorNotification(title, message) {
FILE: portiaui/app/services/overlays.js
method add (line 8) | add() {
method remove (line 12) | remove() {
FILE: portiaui/app/services/position-monitor.js
method registerElement (line 7) | registerElement(element, context, readCallback, writeCallback) {
method unRegisterElement (line 22) | unRegisterElement(element, context, readCallback, writeCallback) {
method updateRaf (line 36) | updateRaf() {
method update (line 44) | update(elements) {
FILE: portiaui/app/services/saving-notification.js
method start (line 12) | start() {
method end (line 17) | end() {
FILE: portiaui/app/services/selector-matcher.js
function nodesEqual (line 3) | function nodesEqual(nodesA, nodesB) {
method register (line 22) | register(selector, target, method) {
method unRegister (line 39) | unRegister(selector) {
method watch (line 50) | watch(target, method) {
method unWatch (line 54) | unWatch(target, method) {
method query (line 58) | query(selector) {
method scheduleUpdate (line 74) | scheduleUpdate(delay) {
method update (line 79) | update() {
FILE: portiaui/app/services/store.js
method didSaveRecord (line 4) | didSaveRecord(internalModel, dataArg) {
method _load (line 24) | _load(data) {
method updateRecordLinks (line 30) | updateRecordLinks(internalModel, links) {
FILE: portiaui/app/services/ui-state.js
function computedActiveRoutes (line 4) | function computedActiveRoutes(mapping) {
function computedRouteModels (line 19) | function computedRouteModels(mapping) {
FILE: portiaui/app/services/web-socket.js
constant APPLICATION_UNLOADING_CODE (line 6) | const APPLICATION_UNLOADING_CODE = 4001;
constant DEFAULT_RECONNECT_TIMEOUT (line 7) | const DEFAULT_RECONNECT_TIMEOUT = 5000;
constant DEFAULT_MAX_RECONNECT_TIMEOUT (line 8) | const DEFAULT_MAX_RECONNECT_TIMEOUT = 60000;
method _onclose (line 66) | _onclose(e) {
method _onmessage (line 88) | _onmessage({data}) {
method _onopen (line 118) | _onopen() {
FILE: portiaui/app/storages/ui-state-selected-tools.js
method init (line 4) | init() {
method initialState (line 15) | initialState() {
FILE: portiaui/app/utils/attrs.js
function attrValue (line 3) | function attrValue(attr) {
function attrChanged (line 7) | function attrChanged(oldAttrs, newAttrs, key) {
function attrChangedTo (line 11) | function attrChangedTo(oldAttrs, newAttrs, key, value) {
FILE: portiaui/app/utils/browser-features.js
function hasBrowserFeatures (line 4) | function hasBrowserFeatures() {
FILE: portiaui/app/utils/colors.js
constant COLORS (line 5) | const COLORS = [
constant NAMED_COLORS (line 88) | const NAMED_COLORS = {};
function interpolate (line 94) | function interpolate(start, end, fraction) {
function getColors (line 126) | function getColors(n) {
FILE: portiaui/app/utils/computed.js
function computedPropertiesEqual (line 3) | function computedPropertiesEqual(a, b) {
FILE: portiaui/app/utils/ensure-promise.js
function ensurePromise (line 4) | function ensurePromise(x) {
FILE: portiaui/app/utils/interaction-event.js
function getEventCategory (line 15) | function getEventCategory (evt) {
function copyProperties (line 72) | function copyProperties(update_props, obj){
FILE: portiaui/app/utils/promises.js
function ensurePromise (line 3) | function ensurePromise(valueOrPromise) {
FILE: portiaui/app/utils/selectors.js
constant IMPLICIT_TAGS (line 4) | const IMPLICIT_TAGS = new Set(['tbody']);
function elementPath (line 7) | function elementPath(element) {
function positionInParent (line 17) | function positionInParent(element) {
function pathSelector (line 21) | function pathSelector(element) {
function uniquePathSelector (line 26) | function uniquePathSelector(element) {
function smartSelector (line 38) | function smartSelector(element) {
function cssToXpath (line 47) | function cssToXpath(selector) {
method groupPaths (line 188) | groupPaths(paths) {
method createSelectors (line 201) | createSelectors(groupedPaths, parentMap) {
method createGroupSelectors (line 205) | createGroupSelectors(group, parentMap, generalize = false) {
method mergeSelectors (line 393) | mergeSelectors(selectors) {
method getGroupElementsAtIndex (line 400) | getGroupElementsAtIndex(group, index) {
method getElementClassSelectors (line 404) | getElementClassSelectors(elements) {
method getElementIndices (line 425) | getElementIndices(elements) {
method generalizationDistance (line 429) | generalizationDistance(element) {
method createGeneralizedSelectors (line 557) | createGeneralizedSelectors(groupedPaths) {
method filterRejectedSelectors (line 562) | filterRejectedSelectors(selectors) {
method init (line 581) | init() {
method destroy (line 586) | destroy() {
method addChild (line 628) | addChild(childGenerator) {
method addChildren (line 633) | addChildren(childGenerators) {
function setIntersection (line 642) | function setIntersection(a, b) {
function setDifference (line 646) | function setDifference(a, b) {
function getParents (line 650) | function getParents(element, upto) {
function getPreviousSiblings (line 666) | function getPreviousSiblings(element, upto) {
function closestParentIndex (line 679) | function closestParentIndex(element, parents) {
function findContainers (line 691) | function findContainers(extractedElements, upto) {
function findContainer (line 704) | function findContainer(extractedElements) {
function findRepeatedContainers (line 708) | function findRepeatedContainers(extracted, container) {
function parentWithSiblings (line 732) | function parentWithSiblings(groupedItems, container) {
function getItemBounds (line 774) | function getItemBounds(items, tagNumber=true) {
function groupItems (line 796) | function groupItems(extracted, upto) {
function makeItemsFromGroups (line 910) | function makeItemsFromGroups(groups) {
function createSelectorGenerators (line 923) | function createSelectorGenerators(structure, selectorMatcher) {
function accumulateSelectorGenerators (line 929) | function accumulateSelectorGenerators(structure, selectorMatcher, accumu...
FILE: portiaui/app/utils/start-urls.js
constant SAMPLE_SIZE (line 1) | const SAMPLE_SIZE = 20;
constant ALL_DIGITS (line 2) | const ALL_DIGITS = /^\d+-\d+$/;
constant ALL_LETTERS (line 3) | const ALL_LETTERS = /^[a-zA-Z]+-[a-zA-Z]+$/;
function nextLetter (line 5) | function nextLetter(letter) {
function numberRange (line 9) | function numberRange(a, b) {
function letterRange (line 17) | function letterRange(a, b) {
function _processDigitRange (line 28) | function _processDigitRange(value) {
function augmentRange (line 35) | function augmentRange(fragment_value) {
function augmentFragment (line 47) | function augmentFragment(fragment) {
function allLetters (line 64) | function allLetters(value) {
function allDigits (line 68) | function allDigits(value) {
function fragmentToString (line 72) | function fragmentToString(fragment) {
function augmentFragmentList (line 83) | function augmentFragmentList(fragmentList, fragment) {
function includesUrl (line 93) | function includesUrl(spider, url) {
function multiplicityFragment (line 97) | function multiplicityFragment(fragment) {
FILE: portiaui/app/utils/tree-mirror-delegate.js
function paintCanvasMessage (line 2) | function paintCanvasMessage(canvas) {
function addEmbedBlockedMessage (line 21) | function addEmbedBlockedMessage(node) {
function treeMirrorDelegate (line 50) | function treeMirrorDelegate(){
FILE: portiaui/app/utils/types.js
function toType (line 1) | function toType(obj) {
function isObject (line 5) | function isObject(obj) {
function isArray (line 9) | function isArray(obj) {
FILE: portiaui/app/utils/utils.js
function cleanUrl (line 6) | function cleanUrl(url) {
function s4 (line 28) | function s4() {
function guid (line 34) | function guid() {
function shortGuid (line 39) | function shortGuid(separator='-') {
function toType (line 43) | function toType(obj) {
function captureMessage (line 47) | function captureMessage(msg, params) {
function logError (line 53) | function logError(err, params) {
function renameAttr (line 60) | function renameAttr($elements, from, to) {
function flatten (line 75) | function flatten(list) {
FILE: portiaui/app/validators/range.js
function hasMixedCase (line 3) | function hasMixedCase(endpoints) {
function hasSingleLetters (line 7) | function hasSingleLetters(endpoints) {
function isRangeIncomplete (line 11) | function isRangeIncomplete(endpoints) {
function validateIncreasing (line 15) | function validateIncreasing(endpoints, isIncreasing) {
function validateRange (line 21) | function validateRange() {
FILE: portiaui/app/validators/whitespace.js
function validateWhitespace (line 1) | function validateWhitespace() {
FILE: portiaui/tests/helpers/destroy-app.js
function destroyApp (line 3) | function destroyApp(application) {
FILE: portiaui/tests/helpers/module-for-acceptance.js
method beforeEach (line 10) | beforeEach() {
method afterEach (line 18) | afterEach() {
FILE: portiaui/tests/helpers/start-app.js
function startApp (line 5) | function startApp(attrs) {
FILE: portiaui/tests/unit/models/start-url-test.js
function assertFeedsEqual (line 78) | function assertFeedsEqual(assert, url, test_url) {
FILE: portiaui/tests/unit/utils/selectors-test.js
class MockBrowser (line 14) | class MockBrowser {
method constructor (line 15) | constructor(documentRoot) {
FILE: portiaui/vendor/modernizr.js
function is (line 283) | function is(obj, type) {
function testRunner (line 294) | function testRunner() {
function cssToDOM (line 436) | function cssToDOM(name) {
function createElement (line 464) | function createElement() {
function setClasses (line 621) | function setClasses(classes) {
function addTest (line 794) | function addTest(feature, test) {
function testOver32kb (line 938) | function testOver32kb() {
function getBody (line 973) | function getBody() {
function injectElementWithStyles (line 1000) | function injectElementWithStyles(rule, callback, nodes, testnames) {
function contains (line 1331) | function contains(str, substr) {
function fnBind (line 1347) | function fnBind(fn, that) {
function testDOMProps (line 1365) | function testDOMProps(props, obj, elem) {
function domToCSS (line 1458) | function domToCSS(name) {
function nativeTestProps (line 1478) | function nativeTestProps(props, value) {
function testProps (line 1519) | function testProps(props, prefixed, value, skipValueTest) {
function testPropsAll (line 1614) | function testPropsAll(prop, prefixed, elem, value, skipValueTest) {
function testAllProps (line 1781) | function testAllProps(prop, value, skipValueTest) {
FILE: portiaui/vendor/mutation-summary.js
function __ (line 16) | function __() { this.constructor = d; }
function NodeMap (line 33) | function NodeMap() {
function NodeIdMap (line 84) | function NodeIdMap(){
function enteredOrExited (line 119) | function enteredOrExited(changeType) {
function NodeChange (line 124) | function NodeChange(node, childList, attributes, characterData, oldParen...
function ChildListChange (line 213) | function ChildListChange() {
function TreeChanges (line 225) | function TreeChanges(rootNode, mutations) {
function MutationProjection (line 321) | function MutationProjection(rootNode, mutations, selectors, calcReordere...
function recordOldPrevious (line 674) | function recordOldPrevious(node, previous) {
function isMoved (line 733) | function isMoved(node) {
function getOldPrevious (line 761) | function getOldPrevious(node) {
function getPrevious (line 779) | function getPrevious(node) {
function Summary (line 798) | function Summary(projection, query) {
function escapeQuotes (line 862) | function escapeQuotes(value) {
function Qualifier (line 867) | function Qualifier() {
function Selector (line 907) | function Selector() {
function newSelector (line 976) | function newSelector() {
function newQualifier (line 988) | function newQualifier() {
function validateAttribute (line 1353) | function validateAttribute(attribute) {
function validateElementAttributes (line 1368) | function validateElementAttributes(attribs) {
function elementFilterAttributes (line 1393) | function elementFilterAttributes(selectors) {
function MutationSummary (line 1406) | function MutationSummary(opts) {
function observeAttributes (line 1444) | function observeAttributes(attributes) {
FILE: portiaui/vendor/tree-mirror.js
function TreeMirror (line 4) | function TreeMirror(root, delegate) {
function isUrlAttribute (line 158) | function isUrlAttribute(tagName, attribute) {
function TreeMirrorClient (line 164) | function TreeMirrorClient(target, mirror, testingQueries) {
FILE: slybot/setup.py
function build_js (line 8) | def build_js():
class bdist_egg_command (line 30) | class bdist_egg_command(bdist_egg):
method run (line 31) | def run(self):
class sdist_command (line 36) | class sdist_command(sdist):
method run (line 37) | def run(self):
FILE: slybot/slybot/baseurl.py
function _is_abs_url (line 14) | def _is_abs_url(url):
function insert_base_url (line 18) | def insert_base_url(html, base):
function get_base_url (line 62) | def get_base_url(htmlpage):
FILE: slybot/slybot/closespider.py
class SlybotCloseSpider (line 18) | class SlybotCloseSpider(object):
method __init__ (line 20) | def __init__(self, crawler):
method spider_opened (line 34) | def spider_opened(self, spider):
method spider_closed (line 38) | def spider_closed(self, spider):
method item_scraped (line 42) | def item_scraped(self, item, spider):
method _check_crawled_items (line 45) | def _check_crawled_items(self, spider):
method from_crawler (line 53) | def from_crawler(cls, crawler):
FILE: slybot/slybot/clustering.py
class PersistentClusteringMiddleware (line 13) | class PersistentClusteringMiddleware(object):
method __init__ (line 14) | def __init__(self, directory, reset=False, stats=None):
method from_crawler (line 26) | def from_crawler(cls, crawler):
method spider_opened (line 37) | def spider_opened(self, spider):
method spider_closed (line 56) | def spider_closed(self, spider):
method process_spider_output (line 59) | def process_spider_output(self, response, result, spider):
FILE: slybot/slybot/dupefilter.py
class DupeFilterPipeline (line 10) | class DupeFilterPipeline(object):
method __init__ (line 11) | def __init__(self, settings):
method from_crawler (line 17) | def from_crawler(cls, crawler):
method process_item (line 20) | def process_item(self, item, spider):
FILE: slybot/slybot/exporter.py
class SlybotCSVItemExporter (line 5) | class SlybotCSVItemExporter(CsvItemExporter):
method __init__ (line 6) | def __init__(self, *args, **kwargs):
FILE: slybot/slybot/extractors.py
function create_regex_extractor (line 11) | def create_regex_extractor(pattern):
function create_type_extractor (line 37) | def create_type_extractor(_type):
class PipelineExtractor (line 57) | class PipelineExtractor:
method __init__ (line 58) | def __init__(self, *extractors):
method __call__ (line 61) | def __call__(self, value):
method __name__ (line 67) | def __name__(self):
function apply_extractors (line 71) | def apply_extractors(descriptor, template_extractors, extractors):
function add_extractors_to_descriptors (line 103) | def add_extractors_to_descriptors(descriptors, extractors):
FILE: slybot/slybot/fieldtypes/__init__.py
class FieldTypeManager (line 27) | class FieldTypeManager(object):
method available_type_names (line 37) | def available_type_names(self):
method type_processor_class (line 41) | def type_processor_class(self, name):
method type_processor_serializer (line 50) | def type_processor_serializer(self, name):
method all_processor_classes (line 54) | def all_processor_classes(self):
FILE: slybot/slybot/fieldtypes/date.py
class DateTimeFieldTypeProcessor (line 9) | class DateTimeFieldTypeProcessor(TextFieldTypeProcessor):
method extract (line 26) | def extract(self, htmlregion):
method adapt (line 29) | def adapt(self, text, htmlpage=None):
method serializer (line 36) | def serializer(cls, output):
FILE: slybot/slybot/fieldtypes/images.py
class ImagesFieldTypeProcessor (line 6) | class ImagesFieldTypeProcessor(UrlFieldTypeProcessor):
method extract (line 10) | def extract(self, text):
FILE: slybot/slybot/fieldtypes/number.py
class NumberTypeProcessor (line 6) | class NumberTypeProcessor(object):
method extract (line 26) | def extract(self, htmlregion):
method adapt (line 30) | def adapt(self, text, htmlpage=None):
FILE: slybot/slybot/fieldtypes/point.py
class GeoPointFieldTypeProcessor (line 2) | class GeoPointFieldTypeProcessor(object):
method extract (line 9) | def extract(self, value):
method adapt (line 12) | def adapt(self, value, htmlpage=None):
FILE: slybot/slybot/fieldtypes/price.py
class PriceTypeProcessor (line 6) | class PriceTypeProcessor(object):
method extract (line 11) | def extract(self, htmlregion):
method adapt (line 14) | def adapt(self, text, htmlpage=None):
FILE: slybot/slybot/fieldtypes/text.py
class _BaseTextProcessor (line 10) | class _BaseTextProcessor(object):
method extract (line 14) | def extract(self, text):
method adapt (line 18) | def adapt(self, text, htmlpage=None):
class RawFieldTypeProcessor (line 22) | class RawFieldTypeProcessor(_BaseTextProcessor):
class TextFieldTypeProcessor (line 37) | class TextFieldTypeProcessor(_BaseTextProcessor):
method extract (line 55) | def extract(self, htmlregion):
class SafeHtmlFieldTypeProcessor (line 60) | class SafeHtmlFieldTypeProcessor(_BaseTextProcessor):
method extract (line 84) | def extract(self, htmlregion):
method adapt (line 88) | def adapt(self, text, htmlpage=None):
FILE: slybot/slybot/fieldtypes/url.py
class UrlFieldTypeProcessor (line 10) | class UrlFieldTypeProcessor(object):
method extract (line 17) | def extract(self, text):
method adapt (line 22) | def adapt(self, text, htmlpage=None):
FILE: slybot/slybot/generic_form.py
class GenericForm (line 11) | class GenericForm:
method __init__ (line 13) | def __init__(self, **kwargs):
method _pick_node (line 16) | def _pick_node(self, doc, selector):
method _filter_by_regex (line 21) | def _filter_by_regex(self, lines, regex):
method _get_field_values (line 25) | def _get_field_values(self, form, field_descriptor):
method get_value (line 43) | def get_value(self, field_descriptor):
method set_values_url_field (line 50) | def set_values_url_field(self, field_descriptor, body):
method get_url_field (line 53) | def get_url_field(self, form_descriptor):
method fill_generic_form (line 60) | def fill_generic_form(self, url, body, form_descriptor):
FILE: slybot/slybot/item.py
class SlybotItem (line 12) | class SlybotItem(DictItem):
method __setitem__ (line 14) | def __setitem__(self, name, value):
method display_name (line 17) | def display_name(self):
method create_iblitem_class (line 21) | def create_iblitem_class(cls, schema):
function create_slybot_item_descriptor (line 40) | def create_slybot_item_descriptor(schema, schema_name=""):
class SlybotFieldDescriptor (line 56) | class SlybotFieldDescriptor(FieldDescriptor):
method __init__ (line 61) | def __init__(self, name, description, field_type_processor, required=F...
method processor (line 72) | def processor(self):
method __str__ (line 77) | def __str__(self):
class SlybotItemDescriptor (line 82) | class SlybotItemDescriptor(ItemDescriptor):
method __str__ (line 83) | def __str__(self):
method copy (line 86) | def copy(self):
function create_item_version (line 96) | def create_item_version(item):
FILE: slybot/slybot/linkextractor/__init__.py
function create_linkextractor_from_specs (line 24) | def create_linkextractor_from_specs(specs):
FILE: slybot/slybot/linkextractor/base.py
class BaseLinkExtractor (line 18) | class BaseLinkExtractor(object):
method __init__ (line 20) | def __init__(self, max_url_len=2083, ignore_extensions=_ignored_exts,
method _extract_links (line 31) | def _extract_links(self, source):
method links_to_follow (line 34) | def links_to_follow(self, source):
method normalize_link (line 41) | def normalize_link(self, link):
FILE: slybot/slybot/linkextractor/ecsv.py
class CsvLinkExtractor (line 21) | class CsvLinkExtractor(BaseLinkExtractor):
method __init__ (line 22) | def __init__(self, column=0, **kwargs):
method _extract_links (line 34) | def _extract_links(self, response):
FILE: slybot/slybot/linkextractor/html.py
function remove_entities (line 24) | def remove_entities(text, encoding):
class HtmlLinkExtractor (line 29) | class HtmlLinkExtractor(BaseLinkExtractor):
method _extract_links (line 39) | def _extract_links(self, response_or_htmlpage):
function iterlinks (line 49) | def iterlinks(htmlpage):
FILE: slybot/slybot/linkextractor/pagination.py
class PaginationExtractor (line 8) | class PaginationExtractor(HtmlLinkExtractor):
method __init__ (line 9) | def __init__(self, **specs):
method _extract_links (line 22) | def _extract_links(self, response_or_htmlpage, n_links=3):
FILE: slybot/slybot/linkextractor/regex.py
class RegexLinkExtractor (line 12) | class RegexLinkExtractor(BaseLinkExtractor):
method __init__ (line 13) | def __init__(self, regex=None, **kwargs):
method _extract_links (line 19) | def _extract_links(self, response):
FILE: slybot/slybot/linkextractor/xml.py
class XmlLinkExtractor (line 14) | class XmlLinkExtractor(BaseLinkExtractor):
method __init__ (line 16) | def __init__(self, xpath=None, **kwargs):
method _extract_links (line 25) | def _extract_links(self, response):
class RssLinkExtractor (line 39) | class RssLinkExtractor(XmlLinkExtractor):
method __init__ (line 41) | def __init__(self, **kwargs):
class SitemapLinkExtractor (line 45) | class SitemapLinkExtractor(XmlLinkExtractor):
method __init__ (line 47) | def __init__(self, **kwargs):
class AtomLinkExtractor (line 52) | class AtomLinkExtractor(XmlLinkExtractor):
method __init__ (line 53) | def __init__(self, **kwargs):
FILE: slybot/slybot/meta.py
class DropMetaPipeline (line 8) | class DropMetaPipeline(object):
method __init__ (line 9) | def __init__(self, settings):
method from_crawler (line 14) | def from_crawler(cls, crawler):
method process_item (line 17) | def process_item(self, item, spider):
FILE: slybot/slybot/pageactions.py
function filter_for_url (line 28) | def filter_for_url(url):
class PageActionsMiddleware (line 40) | class PageActionsMiddleware(object):
method process_request (line 41) | def process_request(self, request, spider):
FILE: slybot/slybot/plugins/scrapely_annotations/annotations.py
class Annotations (line 27) | class Annotations(object):
method setup_bot (line 32) | def setup_bot(self, settings, spider, spec, items, extractors, logger):
method _get_annotated_template (line 114) | def _get_annotated_template(self, template):
method handle_html (line 126) | def handle_html(self, response, seen=None):
method extract_items (line 142) | def extract_items(self, htmlpage, response=None):
method _do_extract_items_from (line 151) | def _do_extract_items_from(self, htmlpage, extractor, response=None):
method _process_attributes (line 212) | def _process_attributes(self, item, descriptor, htmlpage):
method build_url_filter (line 230) | def build_url_filter(self, spec):
method _cluster_page (line 253) | def _cluster_page(self, htmlpage):
method _filter_link (line 266) | def _filter_link(self, link, seen):
method _process_link_regions (line 277) | def _process_link_regions(self, htmlpage, link_regions):
method _requests_to_follow (line 289) | def _requests_to_follow(self, htmlpage):
method _request_to_follow_from_region (line 308) | def _request_to_follow_from_region(self, htmlregion):
method handle_xml (line 315) | def handle_xml(self, response, seen):
FILE: slybot/slybot/plugins/scrapely_annotations/builder.py
class Annotations (line 19) | class Annotations(object):
method __init__ (line 20) | def __init__(self, sample, **options):
method html (line 32) | def html(self):
method selector (line 44) | def selector(self):
method numbered_html (line 51) | def numbered_html(self):
method build (line 57) | def build(self):
method verify (line 63) | def verify(self, tagid_annotations):
method generate (line 87) | def generate(self, annotations):
method _generate_elem (line 124) | def _generate_elem(self, annotation, text):
method _get_generated (line 137) | def _get_generated(self, element, annotations, nodes, inserts):
method apply (line 192) | def apply(self):
method split (line 306) | def split(self):
method apply_selector (line 317) | def apply_selector(self, annotations):
method elements (line 357) | def elements(self, annotation):
function _clean_annotation_data (line 374) | def _clean_annotation_data(data):
function _get_data_id (line 412) | def _get_data_id(annotation):
function _get_text_nodes (line 418) | def _get_text_nodes(nodes, html_body):
function _get_generated_slice (line 433) | def _get_generated_slice(annotation):
function _get_inner_nodes (line 442) | def _get_inner_nodes(target, open_tags=1, insert_after=False,
function _add_element (line 464) | def _add_element(element, output, html):
function _annotation_key (line 471) | def _annotation_key(a):
function _merge_annotations_by_selector (line 475) | def _merge_annotations_by_selector(annotations):
function add_repeated_field (line 482) | def add_repeated_field(annotation, elems, page):
FILE: slybot/slybot/plugins/scrapely_annotations/exceptions.py
class MissingRequiredError (line 1) | class MissingRequiredError(Exception):
class ItemNotValidError (line 5) | class ItemNotValidError(Exception):
FILE: slybot/slybot/plugins/scrapely_annotations/extraction/container_extractors.py
class BaseContainerExtractor (line 32) | class BaseContainerExtractor(object):
method __init__ (line 42) | def __init__(self, extractors, template):
method annotation (line 53) | def annotation(self):
method apply (line 60) | def apply(cls, template, extractors):
method _build_extractors (line 73) | def _build_extractors(self, extractors, containers, container_contents,
method _get_container_data (line 100) | def _get_container_data(extractors):
method _build_extraction_tree (line 129) | def _build_extraction_tree(containers):
method _build_containerized_extractors (line 150) | def _build_containerized_extractors(cls, containers, container_annos,
method _add_new_container (line 172) | def _add_new_container(annotation, extractors, container_data, template,
method _find_annotation (line 187) | def _find_annotation(self, template, annotation_id):
method _validate_and_adapt_item (line 196) | def _validate_and_adapt_item(self, item, htmlpage, region=None,
method __str__ (line 218) | def __str__(self):
class ContainerExtractor (line 228) | class ContainerExtractor(BaseContainerExtractor, BasicTypeExtractor):
method __init__ (line 229) | def __init__(self, extractors, template, containers=None,
method extract (line 263) | def extract(self, page, start_index=0, end_index=None,
method _extract_items_from_region (line 289) | def _extract_items_from_region(self, region, page, ignored_regions,
method _merge_items (line 319) | def _merge_items(self, items):
class RepeatedContainerExtractor (line 330) | class RepeatedContainerExtractor(BaseContainerExtractor, RecordExtractor):
method __init__ (line 331) | def __init__(self, extractors, template, containers=None,
method extract (line 357) | def extract(self, page, start_index=0, end_index=None,
method _process_items (line 424) | def _process_items(self, items, page, region, surrounding_region):
method _find_prefix_suffix (line 431) | def _find_prefix_suffix(self, extractors, container_contents, containers,
method _find_siblings (line 473) | def _find_siblings(self, template, containers, container_contents):
method _find_siblings_end (line 493) | def _find_siblings_end(self, template, start_index, max_index, siblings):
method _trim_prefix (line 515) | def _trim_prefix(self, prefix, suffix, template, min_prefix_len=1,
method _find_tokens (line 553) | def _find_tokens(tokens, token_types, template, upto=None):
class RepeatedFieldsExtractor (line 573) | class RepeatedFieldsExtractor(RepeatedContainerExtractor):
method __init__ (line 574) | def __init__(self, extractors, template, containers=None,
method extract (line 585) | def extract(self, page, start_index=0, end_index=None,
method _validate_and_adapt_item (line 594) | def _validate_and_adapt_item(self, item, htmlpage=None, region=None,
method _find_siblings (line 598) | def _find_siblings(self, template, containers, container_contents):
FILE: slybot/slybot/plugins/scrapely_annotations/extraction/extractors.py
class TemplatePageMultiItemExtractor (line 17) | class TemplatePageMultiItemExtractor(TemplatePageExtractor):
method extract (line 18) | def extract(self, page, start_index=0, end_index=None):
class SlybotIBLExtractor (line 32) | class SlybotIBLExtractor(InstanceBasedLearningExtractor):
method __init__ (line 35) | def __init__(self, template_descriptor_pairs, trace=False,
method build_extraction_tree (line 64) | def build_extraction_tree(self, template, type_descriptor=None,
method extract (line 88) | def extract(self, html, pref_template_id=None):
method __str__ (line 118) | def __str__(self):
FILE: slybot/slybot/plugins/scrapely_annotations/extraction/pageparsing.py
class SlybotTemplatePageParser (line 15) | class SlybotTemplatePageParser(TemplatePageParser):
method to_template (line 16) | def to_template(self, descriptors=None):
method _read_template_annotation (line 37) | def _read_template_annotation(html_tag):
method read_jannotations (line 47) | def read_jannotations(self, html_tag):
method build_annotation (line 51) | def build_annotation(self, jannotation, is_open=True):
method handle_generated (line 63) | def handle_generated(self, annotation, ignored=False):
method handle_variant (line 76) | def handle_variant(self, annotation, is_open=True):
method handle_ignore (line 92) | def handle_ignore(self, html_tag, is_open=True):
method handle_replacement (line 109) | def handle_replacement(self, html_tag):
method _handle_unpaired_tag (line 127) | def _handle_unpaired_tag(self, html_tag):
method _handle_open_tag (line 139) | def _handle_open_tag(self, html_tag):
class SlybotTemplatePage (line 164) | class SlybotTemplatePage(TemplatePage):
method __init__ (line 167) | def __init__(self, htmlpage, token_dict, page_tokens, annotations,
method descriptor (line 178) | def descriptor(self, descriptor_name=None):
function parse_template (line 184) | def parse_template(token_dict, template_html, descriptors):
FILE: slybot/slybot/plugins/scrapely_annotations/extraction/region_extractors.py
class BaseExtractor (line 14) | class BaseExtractor(BasicTypeExtractor):
method __init__ (line 15) | def __init__(self, annotation, attribute_descriptors=None):
method _create_basic_extractor (line 34) | def _create_basic_extractor(cls, annotation, attribute_descriptors):
method __str__ (line 37) | def __str__(self):
class SlybotRecordExtractor (line 52) | class SlybotRecordExtractor(RecordExtractor):
method extract (line 53) | def extract(self, page, start_index=0, end_index=None,
method _doextract (line 69) | def _doextract(self, page, extractors, start_index, end_index,
FILE: slybot/slybot/plugins/scrapely_annotations/extraction/utils.py
function element_from_page_index (line 9) | def element_from_page_index(page, index):
function container_id (line 14) | def container_id(x):
function _int_cmp (line 18) | def _int_cmp(a, op, b):
function group_tree (line 25) | def group_tree(tree, container_annotations):
function _count_annotations (line 38) | def _count_annotations(extractor):
FILE: slybot/slybot/plugins/scrapely_annotations/migration.py
function short_guid (line 50) | def short_guid():
function gen_id (line 54) | def gen_id(disallow=None):
function gen_predictable_id (line 65) | def gen_predictable_id(id1, id2, disallow=None):
function id_to_int (line 85) | def id_to_int(id_):
function repair_ids (line 95) | def repair_ids(sample):
function port_sample (line 121) | def port_sample(sample, schemas=None, extractors=None):
function find_element (line 204) | def find_element(tagid, sel):
function css_escape (line 215) | def css_escape(s):
function find_generalized_css_selector (line 223) | def find_generalized_css_selector(elem, sel):
function handle_tables (line 228) | def handle_tables(selector):
function find_css_selector (line 248) | def find_css_selector(elem, sel, depth=0, previous_tbody=False):
function find_common_parent (line 327) | def find_common_parent(a, b):
function port_variants (line 349) | def port_variants(variant_annotations, sel, schema_id=None):
function _get_parent (line 391) | def _get_parent(annotations, sel):
function _get_parent_and_siblings (line 402) | def _get_parent_and_siblings(annotations, upto, sel):
function _get_highest (line 434) | def _get_highest(annotations, upto, sel):
function _create_container (line 445) | def _create_container(element, container_id, repeated=False, siblings=0,
function _add_annotation_data (line 474) | def _add_annotation_data(annotation, sample, extractors):
function port_generated (line 496) | def port_generated(generated_annotations, sel):
function port_standard (line 546) | def port_standard(standard_annotations, sel, sample, extractors):
function load_annotations (line 572) | def load_annotations(body):
function find_generated_annotation (line 609) | def find_generated_annotation(elem):
function guess_schema (line 664) | def guess_schema(sample, schemas):
function _guess_schema_id (line 677) | def _guess_schema_id(sample, schemas):
function add_fields (line 721) | def add_fields(schema, annotations):
function create_schema (line 727) | def create_schema(schemas, annotations):
function _create_fields (line 734) | def _create_fields(annotations, field_ids):
function _field (line 754) | def _field(field_id, num_fields):
function container_id_key (line 764) | def container_id_key(annotation):
class PartialKeyDict (line 771) | class PartialKeyDict(dict):
method __getitem__ (line 772) | def __getitem__(self, key):
method __setitem__ (line 786) | def __setitem__(self, key, value):
method _find_key (line 799) | def _find_key(self, key):
method _remove_parent (line 805) | def _remove_parent(self, key):
method _add_parent (line 810) | def _add_parent(self, key, full_key):
FILE: slybot/slybot/plugins/scrapely_annotations/processors.py
function _compose (line 22) | def _compose(f, g):
class ItemProcessor (line 34) | class ItemProcessor(object):
method __init__ (line 40) | def __init__(self, data, extractor, regions, parent_region=None,
method field (line 57) | def field(self):
method descriptor (line 62) | def descriptor(self):
method name (line 67) | def name(self):
method description (line 72) | def description(self):
method region_id (line 77) | def region_id(self):
method metadata (line 81) | def metadata(self):
method repeated (line 85) | def repeated(self):
method attribute_query (line 88) | def attribute_query(self, metadata):
method _process_fields (line 96) | def _process_fields(self, data):
method _field_class (line 117) | def _field_class(self, field):
method _add_item (line 124) | def _add_item(self, item, fields=None):
method _normalize_data (line 134) | def _normalize_data(self, data):
method process (line 167) | def process(self, selector=None, include_meta=False):
method _process_selectors (line 173) | def _process_selectors(self, selector):
method _selector_annotations (line 195) | def _selector_annotations(self):
method get_region_id (line 208) | def get_region_id(region):
method _process_css_and_xpath (line 213) | def _process_css_and_xpath(self, annotations, selector):
method _pick_elems (line 247) | def _pick_elems(self, elements, parents, containers):
method merge (line 270) | def merge(self, other):
method dump (line 297) | def dump(self, include_meta=False, validate=False):
method _dump (line 304) | def _dump(self, include_meta=False, validate=False):
method _validate (line 331) | def _validate(self, item):
method _item_with_names (line 347) | def _item_with_names(self, item, attribute=u'description'):
method __getitem__ (line 365) | def __getitem__(self, key):
method __bool__ (line 376) | def __bool__(self):
method __len__ (line 381) | def __len__(self):
method __hash__ (line 384) | def __hash__(self):
method __setitem__ (line 387) | def __setitem__(self, key, value):
method __str__ (line 391) | def __str__(self):
method __repr__ (line 394) | def __repr__(self):
class ItemField (line 399) | class ItemField(object):
method __init__ (line 402) | def __init__(self, value, meta, schema=None, modifiers=None,
method field (line 415) | def field(self):
method ignore (line 419) | def ignore(self):
method description (line 425) | def description(self):
method name (line 430) | def name(self):
method metadata (line 435) | def metadata(self):
method required (line 443) | def required(self):
method dump (line 446) | def dump(self):
method merge (line 451) | def merge(self, other):
method _load_extractors (line 457) | def _load_extractors(self, field, schema, modifiers):
method _process (line 483) | def _process(self):
method _adapt (line 498) | def _adapt(self, values):
method __hash__ (line 506) | def __hash__(self):
method __str__ (line 509) | def __str__(self):
method __repr__ (line 512) | def __repr__(self):
class ProcessedField (line 518) | class ProcessedField(ItemField):
method field (line 523) | def field(self):
method dump (line 526) | def dump(self):
class ScrapelyField (line 530) | class ScrapelyField(ItemField):
FILE: slybot/slybot/plugins/scrapely_annotations/utils.py
class cached_property (line 13) | class cached_property(object):
method __init__ (line 20) | def __init__(self, func):
method __get__ (line 24) | def __get__(self, obj, cls):
function region_id (line 31) | def region_id(region, attribute_name='data-tagid'):
function load_annotations (line 38) | def load_annotations(extractor):
FILE: slybot/slybot/plugins/selectors/__init__.py
class Selectors (line 2) | class Selectors(object):
method setup_bot (line 3) | def setup_bot(self, settings, spider, spec, items, extractors, logger):
method process_item (line 11) | def process_item(self, item, response):
FILE: slybot/slybot/spider.py
class IblSpider (line 41) | class IblSpider(SitemapSpider):
method __init__ (line 42) | def __init__(self, name, spec, item_schemas, all_extractors, settings=...
method _add_spider_args_to_spec (line 67) | def _add_spider_args_to_spec(self, spec, args):
method _create_start_urls (line 73) | def _create_start_urls(self, spec):
method _create_start_requests (line 80) | def _create_start_requests(self, spec):
method _create_init_requests (line 92) | def _create_init_requests(self, spec):
method _add_allowed_domains (line 106) | def _add_allowed_domains(self, spec):
method parse_login_page (line 111) | def parse_login_page(self, response):
method after_login (line 119) | def after_login(self, response):
method get_generic_form_start_request (line 125) | def get_generic_form_start_request(self, form_descriptor):
method parse_field_url_page (line 139) | def parse_field_url_page(self, response):
method parse_form_page (line 146) | def parse_form_page(self, response):
method after_form_page (line 159) | def after_form_page(self, response):
method _get_allowed_domains (line 163) | def _get_allowed_domains(self, spec):
method start_requests (line 170) | def start_requests(self):
method _create_start_request_from_specs (line 181) | def _create_start_request_from_specs(self, info):
method parse (line 196) | def parse(self, response):
method _configure_plugins (line 220) | def _configure_plugins(self, settings, spec, schemas, extractors):
method _plugin_hook (line 230) | def _plugin_hook(self, name, *args):
method _handle (line 237) | def _handle(self, hook, response, *extrasrgs):
method handle_xml (line 248) | def handle_xml(self, response):
method handle_html (line 251) | def handle_html(self, response):
method _configure_js (line 254) | def _configure_js(self, spec, settings):
method _build_js_url_filter (line 273) | def _build_js_url_filter(self, spec):
method _add_splash_meta (line 280) | def _add_splash_meta(self, request):
FILE: slybot/slybot/spiderlets.py
class DefaultSpiderlet (line 14) | class DefaultSpiderlet(object):
method __init__ (line 17) | def __init__(self, spider):
method process_request (line 20) | def process_request(self, request, response):
method process_item (line 23) | def process_item(self, item, response):
method process_start_request (line 26) | def process_start_request(self, request):
method parse_login_page (line 29) | def parse_login_page(self, response):
function list_spiderlets (line 33) | def list_spiderlets(spiderlets_module_path):
function _load_spiderlet (line 47) | def _load_spiderlet(spiderlets_module_path, spider):
class SpiderletsMiddleware (line 57) | class SpiderletsMiddleware(object):
method from_crawler (line 59) | def from_crawler(cls, crawler):
method __init__ (line 62) | def __init__(self, settings):
method spider_opened (line 69) | def spider_opened(self, spider):
method process_spider_output (line 72) | def process_spider_output(self, response, result, spider):
method process_start_requests (line 79) | def process_start_requests(self, start_requests, spider):
FILE: slybot/slybot/spidermanager.py
class SlybotSpiderManager (line 23) | class SlybotSpiderManager(object):
method __init__ (line 25) | def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
method from_crawler (line 43) | def from_crawler(cls, crawler):
method from_settings (line 48) | def from_settings(cls, settings):
method load (line 53) | def load(self, spider_name):
method create (line 66) | def create(self, name, **args):
method list (line 74) | def list(self):
method find_by_request (line 77) | def find_by_request(self, request):
class ZipfileSlybotSpiderManager (line 84) | class ZipfileSlybotSpiderManager(SlybotSpiderManager):
method __init__ (line 86) | def __init__(self, datadir, zipfile=None, spider_cls=None, settings=None,
method from_settings (line 94) | def from_settings(cls, settings):
FILE: slybot/slybot/splash.py
class SlybotJsMiddleware (line 24) | class SlybotJsMiddleware(SplashMiddleware):
method process_request (line 25) | def process_request(self, request, spider):
FILE: slybot/slybot/starturls/__init__.py
class StartUrlCollection (line 15) | class StartUrlCollection(object):
method __init__ (line 16) | def __init__(self, start_urls, generators=None):
method __iter__ (line 20) | def __iter__(self):
method uniq (line 25) | def uniq(self):
method allowed_domains (line 29) | def allowed_domains(self):
method normalize (line 33) | def normalize(self):
method _generate_urls (line 36) | def _generate_urls(self, start_url):
method _from_type (line 40) | def _from_type(self, start_url):
class StartUrl (line 48) | class StartUrl(object):
method __init__ (line 49) | def __init__(self, spec, generators):
method key (line 56) | def key(self):
method allowed_domains (line 62) | def allowed_domains(self):
method normalized (line 68) | def normalized(self):
method _find_fragment_domains (line 71) | def _find_fragment_domains(self):
method _join_fragments (line 87) | def _join_fragments(self, fragments):
method _has_domain (line 90) | def _has_domain(self, url):
method _has_fragments (line 96) | def _has_fragments(self):
class StringUrl (line 100) | class StringUrl(object):
method __init__ (line 101) | def __init__(self, spec):
method allowed_domains (line 108) | def allowed_domains(self):
method normalized (line 112) | def normalized(self):
FILE: slybot/slybot/starturls/feed_generator.py
class FeedGenerator (line 6) | class FeedGenerator(object):
method __init__ (line 7) | def __init__(self, callback):
method __call__ (line 10) | def __call__(self, url):
method parse_urls (line 13) | def parse_urls(self, response):
FILE: slybot/slybot/starturls/fragment_generator.py
class FragmentGenerator (line 7) | class FragmentGenerator(object):
method _process_fixed (line 8) | def _process_fixed(self, fragment):
method _process_list (line 11) | def _process_list(self, fragment):
method _process_date (line 14) | def _process_date(self, fragment):
method _process_range (line 18) | def _process_range(self, fragment):
method _process_fragment (line 28) | def _process_fragment(self, fragment):
method process_fragments (line 32) | def process_fragments(self, spec):
method __call__ (line 35) | def __call__(self, spec):
FILE: slybot/slybot/starturls/generated_url.py
class GeneratedUrl (line 4) | class GeneratedUrl(object):
method __init__ (line 5) | def __init__(self, spec):
method allowed_domains (line 12) | def allowed_domains(self):
method normalized (line 16) | def normalized(self):
method normalized_url (line 24) | def normalized_url(self):
method normalized_fragments (line 44) | def normalized_fragments(self):
method _path_fragments (line 55) | def _path_fragments(self):
method _fixed_fragments (line 59) | def _fixed_fragments(self):
method _query_fragments (line 64) | def _query_fragments(self):
method _query_params (line 73) | def _query_params(self):
function normalize_path (line 89) | def normalize_path(path):
function normalize_url_path (line 113) | def normalize_url_path(path):
function normalize_url_query_param (line 121) | def normalize_url_query_param(x, is_first=False):
function normalize_query_param (line 126) | def normalize_query_param(x, is_first=False):
function normalize_default (line 134) | def normalize_default(x):
function normalize_range (line 138) | def normalize_range(x):
function query_params_prefix (line 144) | def query_params_prefix(is_first):
FILE: slybot/slybot/starturls/generator.py
class IdentityGenerator (line 16) | class IdentityGenerator():
method __call__ (line 17) | def __call__(self, spec):
class UrlGenerator (line 21) | class UrlGenerator(object):
method __init__ (line 22) | def __init__(self, settings=None, spider_args=None):
method _process_date (line 34) | def _process_date(self, values):
method _process_default (line 38) | def _process_default(self, values):
method _process_option (line 41) | def _process_option(self, values):
method _process_range (line 44) | def _process_range(self, values):
method _process_setting (line 49) | def _process_setting(self, values):
method _process_args (line 57) | def _process_args(self, values):
method _build_section (line 65) | def _build_section(self, descriptor, params=False):
method _generate_urls (line 78) | def _generate_urls(self, template, paths, params_template, params):
method __call__ (line 96) | def __call__(self, spec):
FILE: slybot/slybot/tests/test_baseurl.py
class TestApplyAnnotations (line 9) | class TestApplyAnnotations(TestCase):
method test_insert_base_relative (line 10) | def test_insert_base_relative(self):
method test_insert_base_noreplace (line 18) | def test_insert_base_noreplace(self):
method test_insert_base_addbase (line 24) | def test_insert_base_addbase(self):
method test_insert_base_commented (line 34) | def test_insert_base_commented(self):
method test_insert_base_nohead (line 43) | def test_insert_base_nohead(self):
method test_get_base_url (line 52) | def test_get_base_url(self):
method test_get_base_url_nobase (line 59) | def test_get_base_url_nobase(self):
method test_get_base_url_empty_basehref (line 65) | def test_get_base_url_empty_basehref(self):
FILE: slybot/slybot/tests/test_dropmeta.py
class DropMetaTest (line 11) | class DropMetaTest(TestCase):
method test_dupefilter (line 13) | def test_dupefilter(self):
FILE: slybot/slybot/tests/test_dupefilter.py
class DupeFilterTest (line 14) | class DupeFilterTest(TestCase):
method test_dupefilter (line 17) | def test_dupefilter(self):
FILE: slybot/slybot/tests/test_extraction_speed.py
function _next_3 (line 17) | def _next_3(iterable):
function extract (line 47) | def extract(extractor, selector):
class FakeContainer (line 74) | class FakeContainer(BaseContainerExtractor):
method __init__ (line 75) | def __init__(self, schema, legacy=False):
class TestExtractionSpeed (line 99) | class TestExtractionSpeed(TestCase):
method test_parsel_parse_and_extract (line 100) | def test_parsel_parse_and_extract(self):
method test_slybot_parse_and_extract (line 106) | def test_slybot_parse_and_extract(self):
method test_parsel_extract (line 112) | def test_parsel_extract(self):
method test_slybot_extract (line 117) | def test_slybot_extract(self):
FILE: slybot/slybot/tests/test_extractors.py
class ExtractorTest (line 18) | class ExtractorTest(TestCase):
method test_regex_extractor (line 114) | def test_regex_extractor(self):
method test_raw_type_w_regex (line 121) | def test_raw_type_w_regex(self):
method test_negative_hit_w_regex (line 141) | def test_negative_hit_w_regex(self):
method test_text_type_w_regex (line 159) | def test_text_type_w_regex(self):
method test_type_extractor (line 177) | def test_type_extractor(self):
method test_default_type_extractor (line 198) | def test_default_type_extractor(self):
method test_text_type_w_regex_and_no_groups (line 212) | def test_text_type_w_regex_and_no_groups(self):
method test_extractor_w_empty_string_extraction (line 232) | def test_extractor_w_empty_string_extraction(self):
method test_per_annotation_extractors (line 259) | def test_per_annotation_extractors(self):
FILE: slybot/slybot/tests/test_fieldtypes.py
class FieldTypesUrlEncoding (line 6) | class FieldTypesUrlEncoding(TestCase):
method test_not_standard_chars_in_url (line 7) | def test_not_standard_chars_in_url(self):
method test_uri_stripped_of_whitespace_and_quote_characters_correctly (line 14) | def test_uri_stripped_of_whitespace_and_quote_characters_correctly(self):
method test_uri_with_illegal_html_entities (line 28) | def test_uri_with_illegal_html_entities(self):
method test_blank_image_url (line 43) | def test_blank_image_url(self):
FILE: slybot/slybot/tests/test_fragment_generator.py
class FragmentGeneratorTest (line 6) | class FragmentGeneratorTest(TestCase):
method test_generated_url_list (line 7) | def test_generated_url_list(self):
method test_generated_url_range (line 23) | def test_generated_url_range(self):
method test_mixed_fragments (line 39) | def test_mixed_fragments(self):
method test_generated_letters (line 66) | def test_generated_letters(self):
FILE: slybot/slybot/tests/test_generic_form.py
class GenericFormTest (line 11) | class GenericFormTest(TestCase):
method test_simple_search_form (line 13) | def test_simple_search_form(self):
method test_simple_search_form_2_values (line 34) | def test_simple_search_form_2_values(self):
method test_advanced_search_form (line 55) | def test_advanced_search_form(self):
method test_advanced_search_form_regex (line 80) | def test_advanced_search_form_regex(self):
method test_simple_search_form_with_named_parameter (line 107) | def test_simple_search_form_with_named_parameter(self):
method test_simple_search_form_with_file_type (line 128) | def test_simple_search_form_with_file_type(self):
FILE: slybot/slybot/tests/test_linkextractors.py
class Test_RegexLinkExtractor (line 17) | class Test_RegexLinkExtractor(TestCase):
method test_default (line 18) | def test_default(self):
method test_custom (line 28) | def test_custom(self):
method test_custom_withargs (line 38) | def test_custom_withargs(self):
class Test_XmlLinkExtractors (line 111) | class Test_XmlLinkExtractors(TestCase):
method setUp (line 112) | def setUp(self):
method test_rss (line 118) | def test_rss(self):
method test_xml (line 125) | def test_xml(self):
method test_sitemap (line 132) | def test_sitemap(self):
method test_atom (line 143) | def test_atom(self):
method test_xml_remove_namespaces (line 150) | def test_xml_remove_namespaces(self):
class TestXmlLinkExtractionFromSpider (line 158) | class TestXmlLinkExtractionFromSpider(TestCase):
method setUp (line 159) | def setUp(self):
method test_xml_content (line 163) | def test_xml_content(self):
method test_endswith_xml (line 171) | def test_endswith_xml(self):
method test_atom (line 184) | def test_atom(self):
method test_xml_response (line 193) | def test_xml_response(self):
class Test_CsvLinkExtractor (line 222) | class Test_CsvLinkExtractor(TestCase):
method test_simple (line 223) | def test_simple(self):
method test_extra_params (line 232) | def test_extra_params(self):
method test_header (line 241) | def test_header(self):
class Test_HtmlLinkExtractor (line 260) | class Test_HtmlLinkExtractor(TestCase):
method test_simple (line 261) | def test_simple(self):
class Test_PaginationExtractor (line 271) | class Test_PaginationExtractor(TestCase):
method test_simple (line 272) | def test_simple(self):
method test_start_urls (line 283) | def test_start_urls(self):
method test_trained (line 308) | def test_trained(self):
FILE: slybot/slybot/tests/test_migration.py
class MigrationTests (line 6) | class MigrationTests(unittest.TestCase):
method test_table_generalization (line 7) | def test_table_generalization(self):
FILE: slybot/slybot/tests/test_multiple_item_extraction.py
function _annotation_tag_to_dict (line 114) | def _annotation_tag_to_dict(tag):
class ContainerExtractorTest (line 121) | class ContainerExtractorTest(TestCase):
method test_get_container_info (line 123) | def test_get_container_info(self):
method test_build_extraction_tree (line 133) | def test_build_extraction_tree(self):
method test_group_tree (line 139) | def test_group_tree(self):
method test_find_annotation (line 148) | def test_find_annotation(self):
method test_validate_and_adapt_item (line 155) | def test_validate_and_adapt_item(self):
method test_find_tokens (line 182) | def test_find_tokens(self):
method test_extract (line 193) | def test_extract(self):
method test_extract_single_attribute_to_multiple_fields (line 242) | def test_extract_single_attribute_to_multiple_fields(self):
method test_extract_missing_schema (line 259) | def test_extract_missing_schema(self):
method test_extract_multiple_item_types (line 268) | def test_extract_multiple_item_types(self):
method test_extract_repeated_field (line 280) | def test_extract_repeated_field(self):
method test_item_merging_in_container (line 307) | def test_item_merging_in_container(self):
method test_extracted_items_are_scrapy_items (line 312) | def test_extracted_items_are_scrapy_items(self):
method test_required_annotation (line 318) | def test_required_annotation(self):
method test_missing_selectors (line 333) | def test_missing_selectors(self):
method test_against_false_positive (line 338) | def test_against_false_positive(self):
method test_nested_items (line 344) | def test_nested_items(self):
method test_nested_items_without_nested_structure (line 355) | def test_nested_items_without_nested_structure(self):
method test_empty_page (line 361) | def test_empty_page(self):
method test_repeated_css_extractors (line 368) | def test_repeated_css_extractors(self):
FILE: slybot/slybot/tests/test_page_actions.py
function mkreq (line 9) | def mkreq():
class PATest (line 19) | class PATest(TestCase):
method test_url_filter (line 20) | def test_url_filter(self):
method test_middleware (line 31) | def test_middleware(self):
FILE: slybot/slybot/tests/test_schema_validation.py
function spider_json (line 13) | def spider_json(start_urls):
function start_url_schema (line 21) | def start_url_schema(url_string):
class JsonSchemaTest (line 24) | class JsonSchemaTest(TestCase):
method test_regex_formatting_wrong (line 26) | def test_regex_formatting_wrong(self):
method test_regex_formatting_ok (line 36) | def test_regex_formatting_ok(self):
method test_valid_url (line 45) | def test_valid_url(self):
method test_invalid_url (line 71) | def test_invalid_url(self):
method test_valid_mixed_fragments (line 88) | def test_valid_mixed_fragments(self):
method test_valid_fragments (line 99) | def test_valid_fragments(self):
method test_test_project (line 119) | def test_test_project(self):
FILE: slybot/slybot/tests/test_selectors.py
class SpiderTest (line 8) | class SpiderTest(TestCase):
method test_spider_with_selectors (line 11) | def test_spider_with_selectors(self):
method test_spider_with_inbuilt_selectors (line 29) | def test_spider_with_inbuilt_selectors(self):
method test_spider_with_surrounded_selectors (line 43) | def test_spider_with_surrounded_selectors(self):
FILE: slybot/slybot/tests/test_spider.py
function splash_spider_manager (line 19) | def splash_spider_manager(splash_url='http://localhost:8050'):
class SpiderTest (line 26) | class SpiderTest(TestCase):
method test_list (line 29) | def test_list(self):
method test_spider_with_link_template (line 37) | def test_spider_with_link_template(self):
method test_spider_with_link_region_but_not_link_template (line 66) | def test_spider_with_link_region_but_not_link_template(self):
method test_spider_extracted_item_types (line 90) | def test_spider_extracted_item_types(self):
method test_login_requests (line 106) | def test_login_requests(self):
method test_generic_form_requests (line 144) | def test_generic_form_requests(self):
method test_generic_form_requests_with_file_field (line 235) | def test_generic_form_requests_with_file_field(self):
method test_generic_form_requests_with_spider_args (line 606) | def test_generic_form_requests_with_spider_args(self):
method test_allowed_domains (line 706) | def test_allowed_domains(self):
method test_allowed_domains_all (line 712) | def test_allowed_domains_all(self):
method test_allowed_domains_previous_behavior (line 718) | def test_allowed_domains_previous_behavior(self):
method test_links_from_rss (line 724) | def test_links_from_rss(self):
method test_links_from_atom (line 740) | def test_links_from_atom(self):
method test_links_from_sitemap (line 756) | def test_links_from_sitemap(self):
method test_empty_content_type (line 772) | def test_empty_content_type(self):
method test_variants (line 785) | def test_variants(self):
method test_start_requests (line 798) | def test_start_requests(self):
method test_start_requests_allowed_domains (line 818) | def test_start_requests_allowed_domains(self):
method test_override_start_urls (line 823) | def test_override_start_urls(self):
method test_generate_start_urls (line 829) | def test_generate_start_urls(self):
method test_links_to_follow (line 839) | def test_links_to_follow(self):
method test_js_enable_patterns (line 852) | def test_js_enable_patterns(self):
method test_js_disable_patterns (line 863) | def test_js_disable_patterns(self):
method test_js_enable_and_disable_patterns (line 874) | def test_js_enable_and_disable_patterns(self):
FILE: slybot/slybot/tests/test_starturls.py
class StartUrlCollectionTest (line 10) | class StartUrlCollectionTest(TestCase):
method setUp (line 11) | def setUp(self):
method test_mixed_start_urls_generation (line 19) | def test_mixed_start_urls_generation(self):
method test_generated_type (line 43) | def test_generated_type(self):
method test_malformed_generated_type (line 64) | def test_malformed_generated_type(self):
method test_unique_legacy_urls (line 87) | def test_unique_legacy_urls(self):
method test_unique_list_start_urls (line 103) | def test_unique_list_start_urls(self):
method test_allowed_domains_with_many_fragments (line 126) | def test_allowed_domains_with_many_fragments(self):
method test_allowed_domains_with_mixed_urls (line 146) | def test_allowed_domains_with_mixed_urls(self):
method test_empty_allowed_domains (line 180) | def test_empty_allowed_domains(self):
method test_multiple_empty_allowed_domains (line 193) | def test_multiple_empty_allowed_domains(self):
method test_normalize_string_url (line 207) | def test_normalize_string_url(self):
method test_normalize_start_url (line 218) | def test_normalize_start_url(self):
method test_normalize_generated_options (line 227) | def test_normalize_generated_options(self):
method test_normalize_generated_default (line 254) | def test_normalize_generated_default(self):
method test_normalize_generated_dates (line 282) | def test_normalize_generated_dates(self):
method test_normalized_generated_range (line 316) | def test_normalized_generated_range(self):
method test_normalized_generated_params_range (line 350) | def test_normalized_generated_params_range(self):
method test_normalized_generated_template_params (line 386) | def test_normalized_generated_template_params(self):
method test_normalized_mixed (line 423) | def test_normalized_mixed(self):
method test_feed_url (line 485) | def test_feed_url(self):
function generator_set (line 501) | def generator_set(generator, start_urls):
FILE: slybot/slybot/tests/test_starturls_generator.py
class StartUrlGenerators (line 11) | class StartUrlGenerators(TestCase):
method test_schema_format (line 142) | def test_schema_format(self):
method test_start_urls (line 154) | def test_start_urls(self):
method test_generate_start_urls_from_defaults (line 158) | def test_generate_start_urls_from_defaults(self):
method test_generate_start_urls_from_options (line 164) | def test_generate_start_urls_from_options(self):
method test_generate_start_urls_from_date (line 169) | def test_generate_start_urls_from_date(self):
method test_generate_start_urls_from_range (line 177) | def test_generate_start_urls_from_range(self):
method test_generate_start_urls_from_params_range (line 184) | def test_generate_start_urls_from_params_range(self):
method test_generate_start_urls_from_spider_arg (line 191) | def test_generate_start_urls_from_spider_arg(self):
method test_generate_start_urls_from_setting (line 199) | def test_generate_start_urls_from_setting(self):
method test_generate_start_urls_from_params (line 207) | def test_generate_start_urls_from_params(self):
method test_misconfigured_start_urls_spec_type (line 215) | def test_misconfigured_start_urls_spec_type(self):
method test_missing_arg_for_start_urls_spec (line 228) | def test_missing_arg_for_start_urls_spec(self):
FILE: slybot/slybot/tests/utils.py
function request_to_set (line 23) | def request_to_set(requests):
class UTF8Response (line 27) | class UTF8Response(TextResponse):
method __init__ (line 28) | def __init__(self, *args, **kwargs):
class UTF8HtmlResponse (line 35) | class UTF8HtmlResponse(UTF8Response, HtmlResponse):
class UTF8TextResponse (line 39) | class UTF8TextResponse(UTF8Response, TextResponse):
class UTF8XmlResponse (line 43) | class UTF8XmlResponse(UTF8Response, XmlResponse):
function make_spider (line 47) | def make_spider(start_urls=None, sample=None):
function open_spec (line 53) | def open_spec(name):
function open_sample_and_page (line 59) | def open_sample_and_page(name):
function open_page (line 66) | def open_page(name):
function open_spider_page_and_results (line 70) | def open_spider_page_and_results(name):
FILE: slybot/slybot/utils.py
function content_type (line 32) | def content_type(response):
function encode (line 45) | def encode(html, default=None):
function decode (line 51) | def decode(html, default=None):
function _encode_or_decode_string (line 57) | def _encode_or_decode_string(html, method, default):
function iter_unique_scheme_hostname (line 77) | def iter_unique_scheme_hostname(urls):
function open_project_from_dir (line 88) | def open_project_from_dir(project_dir):
function read (line 99) | def read(fp, encoding='utf-8'):
function _build_sample (line 106) | def _build_sample(sample, legacy=False):
function htmlpage_from_response (line 113) | def htmlpage_from_response(response, _add_tagids=False):
function load_plugins (line 121) | def load_plugins(settings):
function load_plugin_names (line 132) | def load_plugin_names(settings):
function include_exclude_filter (line 159) | def include_exclude_filter(include_patterns, exclude_patterns):
class IndexedDict (line 178) | class IndexedDict(OrderedDict):
method __setitem__ (line 201) | def __setitem__(self, key, value):
method __getitem__ (line 206) | def __getitem__(self, key):
function _quotify (line 217) | def _quotify(mystr):
function serialize_tag (line 245) | def serialize_tag(tag):
function _must_add_tagid (line 269) | def _must_add_tagid(element):
function _modify_tagids (line 276) | def _modify_tagids(source, add=True):
function add_tagids (line 295) | def add_tagids(source):
function remove_tagids (line 302) | def remove_tagids(source):
class Storage (line 308) | class Storage(object):
method __init__ (line 309) | def __init__(self, base_path):
method rel_path (line 312) | def rel_path(self, *args):
method _path (line 315) | def _path(self, *args):
method isdir (line 318) | def isdir(self, *args, **kwargs):
method listdir (line 321) | def listdir(self, *args, **kwargs):
method open (line 324) | def open(self, *args, **kwargs):
class SpiderLoader (line 331) | class SpiderLoader(object):
method __init__ (line 332) | def __init__(self, storage):
method __getitem__ (line 348) | def __getitem__(self, key):
method load_spider (line 355) | def load_spider(self, spider_name):
method keys (line 378) | def keys(self):
method items (line 382) | def items(self):
method values (line 387) | def values(self):
method load_external_templates (line 391) | def load_external_templates(self, spec_base, spider_name):
FILE: slybot/slybot/validation/schema.py
function load_schemas (line 20) | def load_schemas():
class SlybotJsonSchemaValidator (line 27) | class SlybotJsonSchemaValidator(Draft3Validator):
function is_valid_ipv6_address (line 33) | def is_valid_ipv6_address(address):
function get_url_re (line 40) | def get_url_re():
function get_schema_validator (line 70) | def get_schema_validator(schema):
function validate_project_schema (line 110) | def validate_project_schema(specs):
FILE: slyd/slyd/authmanager.py
class AuthManager (line 4) | class AuthManager(object):
method __init__ (line 6) | def __init__(self, settings):
method protectResource (line 13) | def protectResource(self, resource):
FILE: slyd/slyd/dummyauth.py
function protectResource (line 4) | def protectResource(resource, config):
class DummyAuthResource (line 9) | class DummyAuthResource(Resource):
method __init__ (line 12) | def __init__(self, resource):
method getChildWithDefault (line 16) | def getChildWithDefault(self, path, request):
FILE: slyd/slyd/errors.py
class BaseError (line 1) | class BaseError(Exception):
method __init__ (line 2) | def __init__(self, status, title, body=''):
method title (line 8) | def title(self):
method body (line 12) | def body(self):
method status (line 16) | def status(self):
method __repr__ (line 19) | def __repr__(self):
method __str__ (line 22) | def __str__(self):
class BaseHTTPError (line 26) | class BaseHTTPError(BaseError):
method __init__ (line 29) | def __init__(self, title, body=''):
class BadRequest (line 33) | class BadRequest(BaseHTTPError):
FILE: slyd/slyd/gitstorage/jsondiff.py
class Conflict (line 9) | class Conflict(object):
method __init__ (line 10) | def __init__(self, mine, other, base):
method from_prepared (line 16) | def from_prepared(cls, mine, other, base):
method resolve_sub_conflict (line 27) | def resolve_sub_conflict(cls, mine, other):
method update (line 31) | def update(self, m, o, b):
method resolve_conflict (line 39) | def resolve_conflict(self):
method _asdict (line 76) | def _asdict(self):
method __eq__ (line 85) | def __eq__(self, other):
method __str__ (line 89) | def __str__(self):
method __repr__ (line 92) | def __repr__(self):
function merge_lists (line 96) | def merge_lists(base, mine, other):
class JsonDiff (line 129) | class JsonDiff(object):
method __init__ (line 135) | def __init__(self, old, new):
method op_for_field (line 143) | def op_for_field(self, field_name):
function merge_jsons (line 154) | def merge_jsons(base, mine, other):
FILE: slyd/slyd/gitstorage/projects.py
function wrap_callback (line 17) | def wrap_callback(connection, callback, manager, retries=0, **parsed):
class GitProjectMixin (line 23) | class GitProjectMixin(object):
method setup (line 27) | def setup(cls, storage_backend, location):
method run (line 33) | def run(self, callback, **parsed):
method _project_name (line 39) | def _project_name(self, name):
method _open_repo (line 44) | def _open_repo(self, name=None, read_only=False):
method _get_branch_and_repo (line 53) | def _get_branch_and_repo(self, repo=None, read_only=False, name=None):
method _checkout_commit_or_head (line 66) | def _checkout_commit_or_head(self, name=None, commit_id=None,
method _get_branch (line 84) | def _get_branch(self, repo=None, read_only=False, name=None):
method _init_or_open_project (line 87) | def _init_or_open_project(self, name):
method list_spiders (line 98) | def list_spiders(self, name=None):
class GitProjectsManager (line 105) | class GitProjectsManager(GitProjectMixin, ProjectsManager):
method __init__ (line 107) | def __init__(self, *args, **kwargs):
method all_projects (line 129) | def all_projects(self):
method create_project (line 133) | def create_project(self, name):
method project_filename (line 145) | def project_filename(self, name):
method remove_project (line 148) | def remove_project(self, name):
method publish_project (line 151) | def publish_project(self, name, force):
method has_tag (line 160) | def has_tag(self, name, tag_name):
method _has_tag (line 163) | def _has_tag(self, name, tag_name):
method discard_changes (line 169) | def discard_changes(self, name):
method project_revisions (line 173) | def project_revisions(self, name):
method conflicted_files (line 177) | def conflicted_files(self, name):
method changed_files (line 183) | def changed_files(self, name):
method _changed_files (line 188) | def _changed_files(self, name):
method save_file (line 196) | def save_file(self, name, file_path, file_contents):
method _render_file (line 201) | def _render_file(self, request, request_data, body):
method _get_project_name (line 224) | def _get_project_name(self, _id):
method _gen_etag (line 231) | def _gen_etag(self, request_data):
method _schedule_info (line 237) | def _schedule_info(self, **kwargs):
FILE: slyd/slyd/gitstorage/projectspec.py
class GitProjectSpec (line 5) | class GitProjectSpec(GitProjectMixin, ProjectSpec):
method _schedule_data (line 6) | def _schedule_data(self, spider, args):
FILE: slyd/slyd/html_utils.py
function _contains_js (line 25) | def _contains_js(url):
function _replace_entity (line 36) | def _replace_entity(match):
function unescape (line 43) | def unescape(mystr):
function html4annotation (line 49) | def html4annotation(htmlpage, baseurl=None, proxy_resources=None):
function descriptify (line 59) | def descriptify(doc, base=None, proxy=None):
FILE: slyd/slyd/projects.py
function allowed_project_name (line 17) | def allowed_project_name(name):
class ProjectsManager (line 21) | class ProjectsManager(object):
method setup (line 24) | def setup(cls, location, **kwargs):
method __init__ (line 27) | def __init__(self, auth_info):
method run (line 40) | def run(self, callback, **kwargs):
method all_projects (line 43) | def all_projects(self):
method list_projects (line 46) | def list_projects(self):
method create_project (line 52) | def create_project(self, name):
method rename_project (line 67) | def rename_project(self, from_name, to_name):
method remove_project (line 73) | def remove_project(self, name):
method edit_project (line 76) | def edit_project(self, name, revision=None):
method validate_project_name (line 81) | def validate_project_name(self, name):
method copy_data (line 85) | def copy_data(self, source, destination, spiders, items):
method download_project (line 88) | def download_project(self, name, spiders=None, version=None, **kwargs):
method commit_changes (line 91) | def commit_changes(self):
method _render_file (line 95) | def _render_file(self, request, request_data, body):
method __repr__ (line 103) | def __repr__(self):
method __str__ (line 106) | def __str__(self):
class FileSystemProjectsManager (line 110) | class FileSystemProjectsManager(ProjectsManager):
method __init__ (line 114) | def __init__(self, auth_info):
method all_projects (line 119) | def all_projects(self):
method project_filename (line 127) | def project_filename(self, name):
method download_project (line 130) | def download_project(self, name, spiders=None, version=None, fmt=None,
FILE: slyd/slyd/projectspec.py
function convert_template (line 15) | def convert_template(template):
class ProjectSpec (line 21) | class ProjectSpec(object):
method setup (line 28) | def setup(cls, location, **kwargs):
method __init__ (line 31) | def __init__(self, project_name, auth_info):
method run (line 43) | def run(self, callback, **kwargs):
method list_spiders (line 46) | def list_spiders(self):
method spider_with_templates (line 52) | def spider_with_templates(self, spider):
method spider_json (line 63) | def spider_json(self, name):
method template_json (line 67) | def template_json(self, spider_name, template_name):
method rename_spider (line 76) | def rename_spider(self, from_name, to_name):
method remove_spider (line 92) | def remove_spider(self, name):
method rename_template (line 103) | def rename_template(self, spider_name, from_name, to_name):
method remove_template (line 112) | def remove_template(self, spider_name, name):
method _rfilename (line 124) | def _rfilename(self, *resources):
method _rdirname (line 127) | def _rdirname(self, *resources):
method _process_extraction_urls (line 130) | def _process_extraction_urls(self, urls):
method _process_extraction_response (line 137) | def _process_extraction_response(self, url, html):
method extract_data (line 140) | def extract_data(self, spider_name, url_info, request):
method resource (line 143) | def resource(self, *resources):
method savejson (line 146) | def savejson(self, obj, resources):
method commit_changes (line 152) | def commit_changes(self):
method __repr__ (line 156) | def __repr__(self):
method __str__ (line 159) | def __str__(self):
class FileSystemProjectSpec (line 163) | class FileSystemProjectSpec(ProjectSpec):
method __init__ (line 166) | def __init__(self, project_name, auth_info):
FILE: slyd/slyd/resource.py
class SlydJsonErrorPage (line 6) | class SlydJsonErrorPage(ErrorPage):
method render (line 7) | def render(self, request):
class SlydJsonNoResource (line 16) | class SlydJsonNoResource(NoResource, SlydJsonErrorPage):
class SlydJsonResource (line 20) | class SlydJsonResource(Resource):
method render (line 26) | def render(self, request):
method error (line 44) | def error(self, request, status, why):
method bad_request (line 47) | def bad_request(self, why):
method not_found (line 50) | def not_found(self, message=None):
method read_json (line 55) | def read_json(self, request):
class SlydJsonObjectResource (line 62) | class SlydJsonObjectResource(SlydJsonResource):
method render (line 67) | def render(self, request):
FILE: slyd/slyd/server.py
class Request (line 10) | class Request(WebRequest):
method is_ajax (line 11) | def is_ajax(self):
method processingFailed (line 15) | def processingFailed(self, reason):
class Site (line 33) | class Site(WebSite):
function debugLogFormatter (line 38) | def debugLogFormatter(timestamp, request):
FILE: slyd/slyd/specmanager.py
class SpecManager (line 4) | class SpecManager(object):
method __init__ (line 6) | def __init__(self, settings):
method project_spec (line 30) | def project_spec(self, project, auth_info):
method project_manager (line 33) | def project_manager(self, auth_info):
FILE: slyd/slyd/splash/commands.py
class Commands (line 32) | class Commands(object):
method __init__ (line 33) | def __init__(self, data, socket, storage):
method tab (line 37) | def tab(self):
method cookies (line 40) | def cookies(self):
method heartbeat (line 48) | def heartbeat(self):
method save_html (line 51) | def save_html(self, item_checker=None):
method extract_items (line 67) | def extract_items(self):
method _load_sample (line 82) | def _load_sample(self, data, project=None):
method _update_sample (line 89) | def _update_sample(self, sample=None, project=None, data=None):
method update_spider (line 121) | def update_spider(self, spider=None):
method load_page (line 124) | def load_page(self):
method interact_page (line 156) | def interact_page(self):
method resolve (line 166) | def resolve(self):
method metadata (line 179) | def metadata(self, extra={}):
method extract (line 197) | def extract(self):
method resize (line 216) | def resize(self):
method close_tab (line 223) | def close_tab(self):
method _open_tab (line 229) | def _open_tab(self):
function _process_items (line 235) | def _process_items(items):
function _restore (line 251) | def _restore(spider):
function _compare_items (line 257) | def _compare_items(a, b):
class ItemChecker (line 280) | class ItemChecker(object):
method __init__ (line 281) | def __init__(self, command, project, spider=None, sample=None):
method raw_html (line 303) | def raw_html(self):
method html (line 313) | def html(self):
method url (line 317) | def url(self):
method using_js (line 324) | def using_js(self):
method schemas (line 330) | def schemas(self):
method extractors (line 334) | def extractors(self):
method data (line 337) | def data(self):
method extract (line 344) | def extract(self):
method _load_items (line 349) | def _load_items(self, body_field='original_body', live=False):
method _check_items (line 372) | def _check_items(self):
method _check_sample (line 396) | def _check_sample(self, sample):
method _check_items_with_sample (line 414) | def _check_items_with_sample(self):
class MissingRequiredError (line 419) | class MissingRequiredError(Exception):
method __init__ (line 420) | def __init__(self, schema, fields):
FILE: slyd/slyd/splash/cookies.py
class PortiaCookieJar (line 4) | class PortiaCookieJar(SplashCookieJar):
method __init__ (line 5) | def __init__(self, web_page, socket):
method setCookiesFromUrl (line 10) | def setCookiesFromUrl(self, cookie_list, url):
method setAllCookies (line 15) | def setAllCookies(self, cookie_list):
method put_client_cookies (line 20) | def put_client_cookies(self, cookie_list):
method update_client_cookies (line 26) | def update_client_cookies(self):
FILE: slyd/slyd/splash/css_utils.py
function wrap_url (line 12) | def wrap_url(url, tabid, base=None):
function process_css (line 32) | def process_css(css_source, tabid, base_uri):
FILE: slyd/slyd/splash/ferry.py
function _is_xml (line 64) | def _is_xml(accepts):
function wrap_callback (line 74) | def wrap_callback(connection, callback, retries=0, **parsed):
function create_ferry_resource (line 81) | def create_ferry_resource(factory):
class PortiaNetworkManager (line 85) | class PortiaNetworkManager(SplashQNetworkAccessManager):
method createRequest (line 88) | def createRequest(self, operation, request, outgoingData=None):
method _ready_read (line 106) | def _ready_read(self):
class PortiaBrowserTab (line 113) | class PortiaBrowserTab(BrowserTab):
method url (line 115) | def url(self):
method evaljs (line 122) | def evaljs(self, *args, **kwargs):
class FerryWebSocketResource (line 126) | class FerryWebSocketResource(WebSocketResource):
method __init__ (line 127) | def __init__(self, factory):
method render (line 131) | def render(self, request):
class User (line 137) | class User(object):
method __init__ (line 140) | def __init__(self, auth, tab=None, spider=None, spiderspec=None):
method findById (line 151) | def findById(cls, tabid):
method name (line 155) | def name(self):
method __getattr__ (line 158) | def __getattr__(self, key):
class SpiderSpec (line 166) | class SpiderSpec(object):
method __init__ (line 167) | def __init__(self, project, name, spider, items, extractors):
method spider (line 175) | def spider(self):
method items (line 179) | def items(self):
method extractors (line 183) | def extractors(self):
method templates (line 187) | def templates(se
Condensed preview — 729 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (6,450K chars).
[
{
"path": ".dockerignore",
"chars": 108,
"preview": ".git\n.vagrant\ndocs\n*/node_modules\n*/bower_components\n*/tests\n*/tmp\n*/db.sqlite3\n*/.tox\n*/.pyc\n*/__pycache__\n"
},
{
"path": ".drone.yml",
"chars": 487,
"preview": "image: scrapinghub\n\nscript:\n - echo \"Portia is at:\"`git show -s --pretty=%d HEAD`\n - git restore-mtime\n - shopt -s ex"
},
{
"path": ".editorconfig",
"chars": 514,
"preview": "# EditorConfig helps developers define and maintain consistent\n# coding styles between different editors and IDEs\n# edit"
},
{
"path": ".gitattributes",
"chars": 253,
"preview": "*.sh eol=lf\n*.bat eol=crlf\n*.js text\n*.py text\n*.css text\n*.hbs text\n*.json text\n*.html text\n*.xml text\n*.yml text\n*.txt"
},
{
"path": ".gitignore",
"chars": 520,
"preview": "# Python compiled files\n*__pycache__/*\n*.pyc\n\n# Vagrant files\n.vagrant/\n/.idea/\n\n# Python build files\n*.egg-info\nslybot/"
},
{
"path": ".jshintrc",
"chars": 518,
"preview": "{\n \"predef\": [\n \"document\",\n \"window\",\n \"-Promise\"\n ],\n \"browser\": true,\n \"boss\": true,\n \"curly\": true,\n "
},
{
"path": ".travis.yml",
"chars": 1116,
"preview": "language: python\npython: 3.7\ndist: bionic\nservices:\n - docker\nenv:\n- WHEELHOUSE=$HOME/.cache/wheelhouse PIP_FIND_LINKS="
},
{
"path": "CHANGES",
"chars": 6985,
"preview": "2.0.8 - 2017-04-20\n\nLimit project and spider id length to avoid causing issues in windows\nOnly use auto annotations when"
},
{
"path": "Dockerfile",
"chars": 1305,
"preview": "FROM ubuntu:16.04\nWORKDIR /app/slyd\n\nENV PATH=\"/opt/qt59/5.9.1/gcc_64/bin:${PATH}\"\nENV DEBIAN_FRONTEND noninteractive\nEN"
},
{
"path": "LICENSE",
"chars": 1515,
"preview": "Copyright (c) Scrapinghub.\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without modi"
},
{
"path": "README.md",
"chars": 1155,
"preview": "Portia\n======\n\nPortia is a tool that allows you to visually scrape websites without any programming knowledge required. "
},
{
"path": "VERSION",
"chars": 6,
"preview": "2.0.8\n"
},
{
"path": "Vagrantfile",
"chars": 525,
"preview": "# vim:ft=ruby\n\nVagrant.configure(\"2\") do |config|\n\tconfig.vm.box = \"ubuntu/trusty64\"\n\tconfig.vm.host_name = \"portia\"\n\tco"
},
{
"path": "bin/bump_version.py",
"chars": 977,
"preview": "#!/usr/bin/env python3\nimport os\nfrom datetime import datetime\n_BASE_PATH = os.path.abspath(os.path.dirname(__file__))\nV"
},
{
"path": "docker/compile-assets.sh",
"chars": 50,
"preview": "#!/bin/bash\ncd portiaui\nnpm install\nnpm run build\n"
},
{
"path": "docker/entry",
"chars": 493,
"preview": "#!/bin/bash\nset -x\naction=$1\nshift\n\n_run() {\n service nginx start\n _set_env\n echo $PYTHONPATH\n /app/slyd/bin/slyd -p"
},
{
"path": "docker/nginx/nginx.conf",
"chars": 1599,
"preview": "worker_processes 1;\n\nevents { worker_connections 1024; }\n\nhttp {\n include mime.types;\n\n sendfile on;\n client_ma"
},
{
"path": "docker/nginx/proxy_portia_server.conf",
"chars": 258,
"preview": "proxy_pass http://127.0.0.1:8000;\nproxy_redirect off;\nproxy_set_header Host $http_host;\nproxy_set_header "
},
{
"path": "docker/nginx/proxy_slyd.conf",
"chars": 258,
"preview": "proxy_pass http://127.0.0.1:9002;\nproxy_redirect off;\nproxy_set_header Host $host:9002;\nproxy_set_header "
},
{
"path": "docker/portia.conf",
"chars": 306,
"preview": "description \"portia server\"\nstart on vagrant-mounted or filesystem\nstop on runlevel [!2345]\n\nscript\n export PYTHONPAT"
},
{
"path": "docker/provision.sh",
"chars": 11002,
"preview": "#!/bin/bash\nset -e\n\nif [ \"x$APP_ROOT\" = x ]\nthen\n for dir in \"$( cd \"$( dirname \"${BASH_SOURCE[0]}\" )\" && pwd )\" /app"
},
{
"path": "docker/qt_install.qs",
"chars": 1974,
"preview": "// Emacs mode hint: -*- mode: JavaScript -*-\n// https://bitbucket.org/xiannox/trusty-qt5.7-beta-x64/raw/HEAD/qt-installe"
},
{
"path": "docker/restore-mtime.sh",
"chars": 169,
"preview": "#!/bin/bash\ncommit=$(git rev-list -n 1 HEAD requirements.txt)\nmtime=$(git show --pretty=format:%ai --abbrev-commit $comm"
},
{
"path": "docker/run-tests.sh",
"chars": 273,
"preview": "#!/bin/bash\n\nexport PYTHONPATH=`pwd`/slybot:`pwd`/slyd\npip install tox\n\ncd /app/slyd\npython2.7 tests/testserver/server.p"
},
{
"path": "docker-compose.yml",
"chars": 332,
"preview": "version: '3'\nservices:\n app:\n build: .\n command: /app/docker/entry start-dev\n volumes:\n - ./data/projects"
},
{
"path": "docs/Makefile",
"chars": 6762,
"preview": "# Makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS =\nSPHINXBUILD "
},
{
"path": "docs/conf.py",
"chars": 8983,
"preview": "# -*- coding: utf-8 -*-\n#\n# Portia documentation build configuration file, created by\n# sphinx-quickstart on Tue Aug 25 "
},
{
"path": "docs/examples.rst",
"chars": 7401,
"preview": ".. _examples:\n\n========\nExamples\n========\n\nCrawling paginated listings\n===========================\n\nMost e-commerce site"
},
{
"path": "docs/faq.rst",
"chars": 983,
"preview": ".. _faq:\n\nFAQ\n===\n\nHow do I use Crawlera with Portia?\n----------------------------------\n\nPortia spiders are standard Sc"
},
{
"path": "docs/getting-started.rst",
"chars": 3317,
"preview": ".. _getting-started:\n\n===============\nGetting Started\n===============\n\n.. note:: If you don't have Portia running yet, p"
},
{
"path": "docs/index.rst",
"chars": 301,
"preview": "Welcome to Portia's documentation!\n==================================\n\nContents:\n\n.. toctree::\n :maxdepth: 2\n\n insta"
},
{
"path": "docs/installation.rst",
"chars": 4196,
"preview": ".. _installation:\n\nInstallation\n============\n\nDocker (recommended)\n--------------------\n\nIf you are on a Linux machine y"
},
{
"path": "docs/items.rst",
"chars": 3210,
"preview": ".. _items:\n\n=====\nItems\n=====\n\nAn item refers to a single item of data scraped from the target website. A common example"
},
{
"path": "docs/make.bat",
"chars": 6701,
"preview": "@ECHO OFF\r\n\r\nREM Command file for Sphinx documentation\r\n\r\nif \"%SPHINXBUILD%\" == \"\" (\r\n\tset SPHINXBUILD=sphinx-build\r\n)\r\n"
},
{
"path": "docs/projects.rst",
"chars": 2733,
"preview": ".. _projects:\n\n========\nProjects\n========\n\nA project in Portia consists of one or more :ref:`spiders <spiders>` and can "
},
{
"path": "docs/samples.rst",
"chars": 4854,
"preview": ".. _samples:\n\n=======\nSamples\n=======\n\nWhat are samples?\n=================\n\nWhen the crawler visits a page, it matches t"
},
{
"path": "docs/spiders.rst",
"chars": 3200,
"preview": ".. _spiders:\n\n=======\nSpiders\n=======\n\nSpiders are web crawlers that use :ref:`samples <samples>` to extract data from t"
},
{
"path": "portia_server/db_repo/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/db_repo/apps.py",
"chars": 129,
"preview": "from __future__ import unicode_literals\n\nfrom django.apps import AppConfig\n\n\nclass DbRepoConfig(AppConfig):\n name = '"
},
{
"path": "portia_server/db_repo/migrations/0001_initial.py",
"chars": 1881,
"preview": "# -*- coding: utf-8 -*-\n# Generated by Django 1.10 on 2016-10-04 06:54\nfrom __future__ import unicode_literals\n\nimport d"
},
{
"path": "portia_server/db_repo/migrations/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/db_repo/migrations/slyd_to_django.sql",
"chars": 900,
"preview": "ALTER TABLE `objs` DROP PRIMARY KEY,\n ADD COLUMN `id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,\n "
},
{
"path": "portia_server/db_repo/models.py",
"chars": 2333,
"preview": "from __future__ import unicode_literals\n\nfrom django.db.models import (Model, BinaryField, BigIntegerField,\n "
},
{
"path": "portia_server/db_repo/repo.py",
"chars": 5910,
"preview": "from django.db.transaction import get_autocommit\nfrom django.db.utils import IntegrityError\nfrom dulwich.errors import O"
},
{
"path": "portia_server/manage.py",
"chars": 257,
"preview": "#!/usr/bin/env python3\nimport os\nimport sys\n\nif __name__ == \"__main__\":\n os.environ.setdefault(\"DJANGO_SETTINGS_MODUL"
},
{
"path": "portia_server/portia_api/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_api/apps.py",
"chars": 135,
"preview": "from __future__ import unicode_literals\n\nfrom django.apps import AppConfig\n\n\nclass PortiaApiConfig(AppConfig):\n name "
},
{
"path": "portia_server/portia_api/errors.py",
"chars": 895,
"preview": "class BaseError(Exception):\n def __init__(self, status, title, body=''):\n self._status = status\n self._"
},
{
"path": "portia_server/portia_api/jsonapi/__init__.py",
"chars": 35,
"preview": "from .response import JSONResponse\n"
},
{
"path": "portia_server/portia_api/jsonapi/exceptions.py",
"chars": 2317,
"preview": "from collections import OrderedDict\nfrom uuid import uuid4\n\nfrom rest_framework.exceptions import APIException, Validati"
},
{
"path": "portia_server/portia_api/jsonapi/parsers.py",
"chars": 166,
"preview": "from __future__ import unicode_literals\n\nfrom rest_framework.parsers import JSONParser\n\n\nclass JSONApiParser(JSONParser)"
},
{
"path": "portia_server/portia_api/jsonapi/registry.py",
"chars": 299,
"preview": "from portia_orm.exceptions import ImproperlyConfigured\n\n\n__all__ = [\n 'schema',\n]\n\nschemas = {}\n\n\ndef get_schema(sche"
},
{
"path": "portia_server/portia_api/jsonapi/relationships.py",
"chars": 3460,
"preview": "from marshmallow_jsonapi.fields import Relationship as BaseRelationship\n\nfrom portia_api.jsonapi.registry import get_sch"
},
{
"path": "portia_server/portia_api/jsonapi/renderers.py",
"chars": 1114,
"preview": "from __future__ import unicode_literals\n\nfrom rest_framework.renderers import JSONRenderer as BaseJSONRenderer\n\n\nclass J"
},
{
"path": "portia_server/portia_api/jsonapi/response.py",
"chars": 395,
"preview": "from django.http import HttpResponse\nfrom rest_framework.renderers import JSONRenderer\n\n\nclass JSONResponse(HttpResponse"
},
{
"path": "portia_server/portia_api/jsonapi/serializers.py",
"chars": 24605,
"preview": "from collections import defaultdict, OrderedDict\nfrom functools import partial\nfrom itertools import chain\nfrom operator"
},
{
"path": "portia_server/portia_api/jsonapi/utils.py",
"chars": 2089,
"preview": "from collections import defaultdict, OrderedDict\n\nfrom django.utils.text import camel_case_to_spaces\nfrom requests.statu"
},
{
"path": "portia_server/portia_api/resources/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_api/resources/annotations.py",
"chars": 1325,
"preview": "from .projects import BaseProjectModelRoute\nfrom ..jsonapi.utils import cached_property\nfrom portia_orm.models import Ba"
},
{
"path": "portia_server/portia_api/resources/extractors.py",
"chars": 368,
"preview": "from .projects import BaseProjectModelRoute\nfrom portia_orm.models import Extractor\n\n\nclass ExtractorRoute(BaseProjectMo"
},
{
"path": "portia_server/portia_api/resources/fields.py",
"chars": 1442,
"preview": "from portia_orm.models import Field\n\nfrom .projects import BaseProjectModelRoute\nfrom ..jsonapi.exceptions import JsonAp"
},
{
"path": "portia_server/portia_api/resources/items.py",
"chars": 1183,
"preview": "from .projects import BaseProjectModelRoute\nfrom ..jsonapi.utils import cached_property\nfrom portia_orm.models import It"
},
{
"path": "portia_server/portia_api/resources/models.py",
"chars": 15357,
"preview": "from itertools import chain\n\nfrom marshmallow_jsonapi import Schema, fields\nfrom marshmallow import pre_dump, post_load\n"
},
{
"path": "portia_server/portia_api/resources/projects.py",
"chars": 10128,
"preview": "from collections import OrderedDict\n\nfrom django.conf import settings\nfrom django.utils.functional import cached_propert"
},
{
"path": "portia_server/portia_api/resources/response.py",
"chars": 1494,
"preview": "import json\n\nfrom django.http.response import HttpResponse\nfrom wsgiref.util import FileWrapper\n\nfrom six.moves import m"
},
{
"path": "portia_server/portia_api/resources/route.py",
"chars": 10404,
"preview": "from collections import Sequence\nfrom operator import attrgetter\n\nfrom django.db import transaction\nfrom django.http.res"
},
{
"path": "portia_server/portia_api/resources/samples.py",
"chars": 1628,
"preview": "from .projects import BaseProjectModelRoute\nfrom .serializers import SampleSerializer\nfrom portia_orm.models import Samp"
},
{
"path": "portia_server/portia_api/resources/schemas.py",
"chars": 2019,
"preview": "from portia_orm.models import Schema\n\nfrom .projects import BaseProjectModelRoute\nfrom ..jsonapi.exceptions import JsonA"
},
{
"path": "portia_server/portia_api/resources/serializers.py",
"chars": 13594,
"preview": "from operator import attrgetter\n\nfrom six.moves import map\n\nfrom portia_api.jsonapi.serializers import JsonApiSerializer"
},
{
"path": "portia_server/portia_api/resources/spiders.py",
"chars": 3040,
"preview": "from django.http.response import Http404\n\nfrom rest_framework.decorators import detail_route\nfrom rest_framework.respons"
},
{
"path": "portia_server/portia_api/routers.py",
"chars": 657,
"preview": "from rest_framework_nested.routers import SimpleRouter, NestedSimpleRouter\n\n__all__ = [\n 'Router',\n 'NestedRouter'"
},
{
"path": "portia_server/portia_api/tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_api/tests/test_routes.py",
"chars": 445,
"preview": "import unittest\n\nfrom rest_framework.test import APIRequestFactory\n\nfrom portia_api.resources.route import JsonApiRoute\n"
},
{
"path": "portia_server/portia_api/urls.py",
"chars": 1479,
"preview": "from django.conf.urls import url, include\n\nfrom .routers import Router, NestedRouter\nfrom .resources.annotations import "
},
{
"path": "portia_server/portia_api/utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_api/utils/annotations.py",
"chars": 617,
"preview": "\nDEFAULTS = {\n 'accept': 'url',\n 'align': 'number',\n 'code': 'url',\n 'codebase': 'url',\n 'coords': 'geopo"
},
{
"path": "portia_server/portia_api/utils/copy.py",
"chars": 6035,
"preview": "import re\nfrom collections import defaultdict\n\nfrom portia_orm.models import Project\nfrom portia_orm.utils import short_"
},
{
"path": "portia_server/portia_api/utils/deploy/base.py",
"chars": 648,
"preview": "from portia_api.utils.download import ProjectArchiver\n\n\nclass BaseDeploy(object):\n def __init__(self, project):\n "
},
{
"path": "portia_server/portia_api/utils/deploy/package.py",
"chars": 1882,
"preview": "import os\nimport textwrap\nimport zipfile\n\nfrom datetime import datetime\nfrom distutils.dist import DistributionMetadata\n"
},
{
"path": "portia_server/portia_api/utils/deploy/scrapinghub.py",
"chars": 3299,
"preview": "import json\nimport os\n\nfrom six import StringIO\nfrom urllib.parse import urljoin\n\nfrom django.conf import settings as ap"
},
{
"path": "portia_server/portia_api/utils/deploy/scrapyd.py",
"chars": 3288,
"preview": "import requests\n\nfrom configparser import ConfigParser\nfrom urllib.parse import urljoin\n\nfrom django.conf import setting"
},
{
"path": "portia_server/portia_api/utils/download.py",
"chars": 9961,
"preview": "from __future__ import absolute_import\nimport itertools\nimport json\nimport os\nimport six\nimport zipfile\n\nfrom collection"
},
{
"path": "portia_server/portia_api/utils/extract.py",
"chars": 2869,
"preview": "import logging\n\nfrom twisted.internet import defer\nfrom twisted.web.client import getPage\n\nfrom scrapy import Request\nfr"
},
{
"path": "portia_server/portia_api/utils/projects.py",
"chars": 335,
"preview": "def unique_name(base_name, disallow=(), initial_suffix=''):\n disallow = set(disallow)\n suffix = initial_suffix\n "
},
{
"path": "portia_server/portia_api/utils/spiders.py",
"chars": 1117,
"preview": "import json\n\nfrom scrapy.settings import Settings\nfrom slybot.spider import IblSpider\n\n\ndef load_spider_data(model):\n "
},
{
"path": "portia_server/portia_orm/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_orm/apps.py",
"chars": 135,
"preview": "from __future__ import unicode_literals\n\nfrom django.apps import AppConfig\n\n\nclass PortiaOrmConfig(AppConfig):\n name "
},
{
"path": "portia_server/portia_orm/base.py",
"chars": 24113,
"preview": "from collections import OrderedDict\nfrom itertools import chain\nimport errno\nimport json\nimport re\n\nfrom toposort import"
},
{
"path": "portia_server/portia_orm/collection.py",
"chars": 13889,
"preview": "import json\n\nfrom collections import Sequence\n\nfrom .exceptions import ImproperlyConfigured, ValidationError\nfrom .snaps"
},
{
"path": "portia_server/portia_orm/datastore.py",
"chars": 1323,
"preview": "from contextlib import contextmanager\nfrom threading import local\nfrom weakref import WeakKeyDictionary\n\n\nclass DataStor"
},
{
"path": "portia_server/portia_orm/decorators.py",
"chars": 265,
"preview": "from marshmallow.decorators import (validates, validates_schema,\n pre_dump, post_dump"
},
{
"path": "portia_server/portia_orm/deletion.py",
"chars": 1702,
"preview": "from collections import OrderedDict\n\nfrom .exceptions import ProtectedError\nfrom .relationships import HasMany\n\n__all__ "
},
{
"path": "portia_server/portia_orm/exceptions.py",
"chars": 260,
"preview": "from marshmallow.exceptions import ValidationError\n\n__all__ = [\n 'ImproperlyConfigured',\n 'ValidationError',\n]\n\n\nc"
},
{
"path": "portia_server/portia_orm/fields.py",
"chars": 7861,
"preview": "import re\n\nfrom marshmallow import fields, Schema, validate\nfrom marshmallow.utils import get_value, missing\nfrom six im"
},
{
"path": "portia_server/portia_orm/middleware.py",
"chars": 281,
"preview": "from .datastore import data_store_context\n\n\nclass ORMDataStoreMiddleware(object):\n def __init__(self, get_response=No"
},
{
"path": "portia_server/portia_orm/models.py",
"chars": 33735,
"preview": "from __future__ import unicode_literals\nimport json\nimport re\nimport six\n\nfrom collections import deque, OrderedDict\nfro"
},
{
"path": "portia_server/portia_orm/registry.py",
"chars": 789,
"preview": "from six import itervalues\n\nfrom .exceptions import ImproperlyConfigured\n\n\n__all__ = [\n 'get_model',\n 'get_polymor"
},
{
"path": "portia_server/portia_orm/relationships.py",
"chars": 9716,
"preview": "from collections import Mapping\n\nfrom marshmallow import fields, utils\nfrom six import string_types\n\nfrom .collection im"
},
{
"path": "portia_server/portia_orm/serializers.py",
"chars": 3838,
"preview": "from collections import OrderedDict, Sequence\n\nfrom marshmallow import schema\nfrom six import iteritems, string_types\n\nf"
},
{
"path": "portia_server/portia_orm/snapshots.py",
"chars": 2793,
"preview": "from collections import defaultdict\n\n__all__ = [\n 'ModelSnapshots',\n]\n\n\nclass ModelSnapshots(defaultdict):\n defaul"
},
{
"path": "portia_server/portia_orm/tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_orm/tests/models.py",
"chars": 3465,
"preview": "from .. import fields\nfrom ..base import Model\n\n\nclass ExampleModel(Model):\n id = fields.String(primary_key=True)\n "
},
{
"path": "portia_server/portia_orm/tests/test_basic.py",
"chars": 10330,
"preview": "from unittest import mock\n\nfrom .models import (ExampleModel, RequiredFieldModel, SingleFileModel,\n "
},
{
"path": "portia_server/portia_orm/tests/test_collection.py",
"chars": 30642,
"preview": "from .models import (ExampleModel, ChildModel, PolymorphicChildBase,\n PolymorphicChildModel1, Polymo"
},
{
"path": "portia_server/portia_orm/tests/test_model.py",
"chars": 199783,
"preview": "from unittest import mock\n\nfrom .utils import DataStoreTestCase, mock_storage\nfrom ..exceptions import ValidationError\nf"
},
{
"path": "portia_server/portia_orm/tests/test_relationship.py",
"chars": 66281,
"preview": "import json\nfrom unittest import mock\n\nfrom .models import (OneToOneModel1, OneToOneModel2, ParentModel, ChildModel,\n "
},
{
"path": "portia_server/portia_orm/tests/utils.py",
"chars": 1929,
"preview": "import errno\nimport unittest\n\nfrom unittest import mock\n\nfrom storage.backends import ContentFile\nfrom ..datastore impor"
},
{
"path": "portia_server/portia_orm/utils.py",
"chars": 6150,
"preview": "from collections import OrderedDict\nfrom itertools import islice\nfrom uuid import uuid4\n\nfrom django.utils.functional im"
},
{
"path": "portia_server/portia_orm/validators.py",
"chars": 228,
"preview": "from marshmallow.validate import (ContainsOnly, Range, Regexp, Predicate,\n NoneOf, OneO"
},
{
"path": "portia_server/portia_server/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "portia_server/portia_server/backends.py",
"chars": 274,
"preview": "from .models import LocalUser\n\n\nclass LocalAuthentication(object):\n def authenticate(self, request, **kwargs):\n "
},
{
"path": "portia_server/portia_server/models.py",
"chars": 1237,
"preview": "from __future__ import unicode_literals\n\nimport getpass\nimport socket\n\nfrom django.db.models.fields import CharField\nfro"
},
{
"path": "portia_server/portia_server/settings.py",
"chars": 4198,
"preview": "\"\"\"\nDjango settings for portia_server project.\n\nGenerated by 'django-admin startproject' using Django 1.9.7.\n\nFor more i"
},
{
"path": "portia_server/portia_server/urls.py",
"chars": 225,
"preview": "from django.conf.urls import url, include\n\nfrom . import views\nfrom portia_api import urls\n\nurlpatterns = [\n url(r'^a"
},
{
"path": "portia_server/portia_server/views.py",
"chars": 296,
"preview": "from django.conf import settings\nfrom portia_api.jsonapi import JSONResponse\n\n\ndef capabilities(request):\n capabiliti"
},
{
"path": "portia_server/portia_server/wsgi.py",
"chars": 403,
"preview": "\"\"\"\nWSGI config for portia_server project.\n\nIt exposes the WSGI callable as a module-level variable named ``application`"
},
{
"path": "portia_server/requirements.txt",
"chars": 300,
"preview": "crochet==1.9.0\ndjango>=1.11.21\ndjango-cache-machine==1.0.0\ndjangorestframework==3.7.7\ndj-database-url==0.5.0\ndrf-nested-"
},
{
"path": "portia_server/storage/__init__.py",
"chars": 583,
"preview": "from django.conf import settings\nfrom django.utils.module_loading import import_string\n\n__all__ = [\n 'get_storage_cla"
},
{
"path": "portia_server/storage/apps.py",
"chars": 89,
"preview": "from django.apps import AppConfig\n\n\nclass StorageConfig(AppConfig):\n name = 'storage'\n"
},
{
"path": "portia_server/storage/backends.py",
"chars": 13733,
"preview": "import errno\nimport json\nimport logging\nimport os\nimport os.path\nimport re\nimport shutil\nimport six\nimport sys\n\nfrom col"
},
{
"path": "portia_server/storage/jsondiff.py",
"chars": 9502,
"preview": "import difflib\n\nfrom collections import namedtuple\nfrom six.moves import zip_longest\n\n_BLANK = object()\n\n\nclass Conflict"
},
{
"path": "portia_server/storage/projecttemplates.py",
"chars": 2547,
"preview": "from slybot import __version__ as slybot_version\n\n_PROJECT_TEMPLATE = \"\"\"\\\n{}\\\n\"\"\"\n\n\n_SETTINGS_TEMPLATE = \"\"\"\\\n# Automat"
},
{
"path": "portia_server/storage/repoman.py",
"chars": 18746,
"preview": "from __future__ import absolute_import\nfrom time import time\nfrom collections import defaultdict\nfrom json import dumps,"
},
{
"path": "portiaui/.bowerrc",
"chars": 60,
"preview": "{\n \"directory\": \"bower_components\",\n \"analytics\": false\n}\n"
},
{
"path": "portiaui/.editorconfig",
"chars": 514,
"preview": "# EditorConfig helps developers define and maintain consistent\n# coding styles between different editors and IDEs\n# edit"
},
{
"path": "portiaui/.ember-cli",
"chars": 31,
"preview": "{\n \"disableAnalytics\": true\n}\n"
},
{
"path": "portiaui/.gitignore",
"chars": 247,
"preview": "# See http://help.github.com/ignore-files/ for more about ignoring files.\n\n# compiled output\n/dist\n/tmp\n\n# dependencies\n"
},
{
"path": "portiaui/.jshintrc",
"chars": 646,
"preview": "{\n \"predef\": [\n \"document\",\n \"window\",\n \"$\",\n \"cookie\",\n \"moment\",\n \"URI\",\n \"TreeMirror\",\n \"-Pr"
},
{
"path": "portiaui/.watchmanconfig",
"chars": 37,
"preview": "{\n \"ignore_dirs\": [\"tmp\", \"dist\"]\n}\n"
},
{
"path": "portiaui/app/adapters/application.js",
"chars": 17734,
"preview": "import Ember from \"ember\";\nimport DS from \"ember-data\";\nimport UrlTemplates from \"ember-data-url-templates\";\nconst { inj"
},
{
"path": "portiaui/app/adapters/project.js",
"chars": 293,
"preview": "import ApplicationAdapter from './application';\n\nexport default ApplicationAdapter.extend({\n urlTemplate: '{+host}/ap"
},
{
"path": "portiaui/app/app.js",
"chars": 479,
"preview": "import Ember from 'ember';\nimport Resolver from './resolver';\nimport loadInitializers from 'ember-load-initializers';\nim"
},
{
"path": "portiaui/app/components/.gitkeep",
"chars": 0,
"preview": ""
},
{
"path": "portiaui/app/components/add-start-url-button.js",
"chars": 1287,
"preview": "import Ember from 'ember';\nconst { computed, inject: { service } } = Ember;\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/animation-container.js",
"chars": 3723,
"preview": "import Ember from 'ember';\nimport { attrValue } from '../utils/attrs';\n\nexport default Ember.Component.extend({\n posi"
},
{
"path": "portiaui/app/components/annotation-options.js",
"chars": 4470,
"preview": "import Ember from 'ember';\nimport { getAttributeList, hasContentAttribute } from './inspector-panel';\n\nexport default Em"
},
{
"path": "portiaui/app/components/browser-iframe.js",
"chars": 13183,
"preview": "import Ember from 'ember';\nimport { storageFor } from 'ember-local-storage';\nimport { cleanUrl, shortGuid } from '../uti"
},
{
"path": "portiaui/app/components/browser-url-blocked.js",
"chars": 93,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: 'span',\n});\n"
},
{
"path": "portiaui/app/components/browser-url-failing.js",
"chars": 253,
"preview": "import Ember from 'ember';\nconst { inject: { service } } = Ember;\n\nexport default Ember.Component.extend({\n tagName: "
},
{
"path": "portiaui/app/components/browser-view-port.js",
"chars": 1148,
"preview": "import Ember from 'ember';\nimport {getAttributeList} from './inspector-panel';\n\nexport default Ember.Component.extend({\n"
},
{
"path": "portiaui/app/components/buffered-input.js",
"chars": 3168,
"preview": "import Ember from 'ember';\nimport { ensurePromise } from '../utils/promises';\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/colored-badge.js",
"chars": 313,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: '',\n\n color: null,\n value: 0,\n\n "
},
{
"path": "portiaui/app/components/colored-span.js",
"chars": 340,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\n\nexport default Ember.Component.extend({\n tagName: 'span',\n "
},
{
"path": "portiaui/app/components/combo-box.js",
"chars": 2173,
"preview": "import Ember from 'ember';\nimport SelectBox from './select-box';\n\nexport default SelectBox.extend({\n autoSelect: fals"
},
{
"path": "portiaui/app/components/create-project-button.js",
"chars": 454,
"preview": "import Ember from 'ember';\nconst { computed, inject: { service } } = Ember;\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/create-spider-button.js",
"chars": 434,
"preview": "import Ember from 'ember';\nimport {computedCanAddSpider} from '../services/dispatcher';\n\nexport default Ember.Component."
},
{
"path": "portiaui/app/components/data-structure-annotations.js",
"chars": 958,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n dispatcher: Ember.inject.service(),\n uiState:"
},
{
"path": "portiaui/app/components/data-structure-listing.js",
"chars": 381,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n dispatcher: Ember.inject.service(),\n\n tagName"
},
{
"path": "portiaui/app/components/dropdown-delete.js",
"chars": 487,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\n\nexport default Ember.Component.extend({\n tagName: 'li',\n c"
},
{
"path": "portiaui/app/components/dropdown-divider.js",
"chars": 175,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: 'li',\n classNames: ['divider'],\n "
},
{
"path": "portiaui/app/components/dropdown-header.js",
"chars": 127,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: 'li',\n classNames: ['dropdown-header"
},
{
"path": "portiaui/app/components/dropdown-item.js",
"chars": 946,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: 'li',\n classNames: ['dropdown-item']"
},
{
"path": "portiaui/app/components/dropdown-menu.js",
"chars": 5208,
"preview": "import Ember from 'ember';\n\nfunction computedItem(propertyName) {\n const cachePropertyName = `_${propertyName}ItemCac"
},
{
"path": "portiaui/app/components/dropdown-widget.js",
"chars": 4644,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n positionMonitor: Ember.inject.service(),\n\n cl"
},
{
"path": "portiaui/app/components/edit-sample-button.js",
"chars": 1877,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\nimport {\n computedCanAddSample,\n computedEditableSample\n} f"
},
{
"path": "portiaui/app/components/element-overlay.js",
"chars": 3886,
"preview": "import Ember from 'ember';\nimport {attrChanged, attrValue} from '../utils/attrs';\n\nexport default Ember.Component.extend"
},
{
"path": "portiaui/app/components/element-rect-overlay.js",
"chars": 2836,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n classNames: ['overlay'],\n\n index: null,\n p"
},
{
"path": "portiaui/app/components/extracted-item-table.js",
"chars": 136,
"preview": "import Ember from 'ember';\n\n\nexport default Ember.Component.extend({\n tagName: 'table',\n classNames: ['extracted-i"
},
{
"path": "portiaui/app/components/extracted-items-group.js",
"chars": 88,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: ''\n});\n"
},
{
"path": "portiaui/app/components/extracted-items-json-panel.js",
"chars": 133,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n extractedItems: Ember.inject.service(),\n\n tag"
},
{
"path": "portiaui/app/components/extracted-items-json-value.js",
"chars": 2047,
"preview": "import Ember from 'ember';\nimport { toType } from '../utils/types';\n\nexport default Ember.Component.extend({\n tagName"
},
{
"path": "portiaui/app/components/extracted-items-json.js",
"chars": 3698,
"preview": "import Ember from 'ember';\nimport { toType } from '../utils/types';\n\nexport default Ember.Component.extend({\n tagName"
},
{
"path": "portiaui/app/components/extracted-items-panel.js",
"chars": 386,
"preview": "import Ember from 'ember';\nconst { inject: { service }, computed } = Ember;\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/extracted-items-status.js",
"chars": 2787,
"preview": "import Ember from 'ember';\nconst { computed, inject: { service } } = Ember;\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/extracted-items-tab.js",
"chars": 281,
"preview": "import Ember from 'ember';\nconst { inject: { service }, computed } = Ember;\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/extractor-options.js",
"chars": 1384,
"preview": "import Ember from 'ember';\nimport { FIELD_TYPES } from '../models/field';\n\nexport default Ember.Component.extend({\n d"
},
{
"path": "portiaui/app/components/feed-url-options.js",
"chars": 486,
"preview": "import Ember from 'ember';\nimport { cleanUrl } from '../utils/utils';\n\nexport default Ember.Component.extend({\n feedL"
},
{
"path": "portiaui/app/components/field-options.js",
"chars": 194,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: '',\n\n field: null,\n\n actions: {\n "
},
{
"path": "portiaui/app/components/fragment-options.js",
"chars": 4271,
"preview": "import Ember from 'ember';\nconst { computed, run } = Ember;\n\nimport { task, timeout } from 'ember-concurrency';\nimport C"
},
{
"path": "portiaui/app/components/generated-url-options.js",
"chars": 1653,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\nimport { getColors } from '../utils/colors';\nimport { multiplicit"
},
{
"path": "portiaui/app/components/help-icon.js",
"chars": 214,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: '',\n\n tooltipClasses: null,\n tool"
},
{
"path": "portiaui/app/components/icon-button.js",
"chars": 2831,
"preview": "import Ember from 'ember';\n\nexport const ICON_CLASSES = {\n add: 'structure-list-add fa fa-plus-circle',\n 'add-drop"
},
{
"path": "portiaui/app/components/indentation-spacer.js",
"chars": 161,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n classNames: ['indentation-spacer'],\n classNameBin"
},
{
"path": "portiaui/app/components/input-with-clear.js",
"chars": 351,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n classNames: ['input-group', 'input-with-clear'],"
},
{
"path": "portiaui/app/components/inspector-panel.js",
"chars": 5004,
"preview": "import Ember from 'ember';\nimport { pathSelector } from '../utils/selectors';\n\nexport const IGNORED_ATTRIBUTES = new Set"
},
{
"path": "portiaui/app/components/link-crawling-options.js",
"chars": 263,
"preview": "import Ember from 'ember';\nimport SaveSpiderMixin from '../mixins/save-spider-mixin';\n\nexport default Ember.Component.ex"
},
{
"path": "portiaui/app/components/list-item-add-annotation-menu.js",
"chars": 597,
"preview": "import Ember from 'ember';\nimport config from '../config/environment';\n\n\nexport default Ember.Component.extend({\n dis"
},
{
"path": "portiaui/app/components/list-item-annotation-field.js",
"chars": 1710,
"preview": "import Ember from 'ember';\nimport {validateFieldName} from './schema-structure-listing';\n\nexport default Ember.Component"
},
{
"path": "portiaui/app/components/list-item-badge.js",
"chars": 86,
"preview": "import ColoredBadge from './colored-badge';\n\nexport default ColoredBadge.extend({\n});\n"
},
{
"path": "portiaui/app/components/list-item-combo.js",
"chars": 165,
"preview": "import ListItemSelectable from './list-item-selectable';\n\nexport default ListItemSelectable.extend({\n classNames: ['l"
},
{
"path": "portiaui/app/components/list-item-editable.js",
"chars": 437,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n classNames: ['list-item-editable'],\n classNam"
},
{
"path": "portiaui/app/components/list-item-field-type.js",
"chars": 479,
"preview": "import Ember from 'ember';\nimport { FIELD_TYPES } from '../models/field';\nimport ensurePromise from '../utils/ensure-pro"
},
{
"path": "portiaui/app/components/list-item-icon-menu.js",
"chars": 246,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: '',\n\n icon: null,\n\n actions: {\n "
},
{
"path": "portiaui/app/components/list-item-icon.js",
"chars": 211,
"preview": "import IconButton from './icon-button';\n\nexport default IconButton.extend({\n classNames: ['list-item-icon'],\n\n beforeC"
},
{
"path": "portiaui/app/components/list-item-item-schema.js",
"chars": 791,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n dispatcher: Ember.inject.service(),\n\n tagName"
},
{
"path": "portiaui/app/components/list-item-link-crawling.js",
"chars": 2114,
"preview": "import Ember from 'ember';\nimport SaveSpiderMixin from '../mixins/save-spider-mixin';\nconst { computed, inject: { servic"
},
{
"path": "portiaui/app/components/list-item-relation-manager.js",
"chars": 1833,
"preview": "import Ember from 'ember';\nimport { ensurePromise } from '../utils/promises';\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/list-item-selectable.js",
"chars": 420,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n classNames: ['list-item-selectable'],\n classN"
},
{
"path": "portiaui/app/components/list-item-text.js",
"chars": 128,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n tagName: 'span',\n classNames: ['list-item-tex"
},
{
"path": "portiaui/app/components/notification-container.js",
"chars": 1970,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n notificationManager: Ember.inject.service(),\n\n "
},
{
"path": "portiaui/app/components/notification-message.js",
"chars": 1610,
"preview": "import Ember from 'ember';\nimport { attrChangedTo } from '../utils/attrs';\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/page-actions-editor.js",
"chars": 1250,
"preview": "import Ember from 'ember';\n\nconst TYPES = ['click', 'set', 'wait'];\n\nexport default Ember.Component.extend({\n actionT"
},
{
"path": "portiaui/app/components/project-list.js",
"chars": 1576,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\n\nexport default Ember.Component.extend({\n store: Ember.inject."
},
{
"path": "portiaui/app/components/project-listing.js",
"chars": 2897,
"preview": "import Ember from 'ember';\nconst { computed, inject: { service } } = Ember;\n\nexport default Ember.Component.extend({\n "
},
{
"path": "portiaui/app/components/project-structure-listing.js",
"chars": 5188,
"preview": "import Ember from 'ember';\nconst { computed, observer } = Ember;\nimport {computedCanAddSpider} from '../services/dispatc"
},
{
"path": "portiaui/app/components/project-structure-spider-feed-url.js",
"chars": 577,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\nimport { cleanUrl } from '../utils/utils';\n\nexport default Ember."
},
{
"path": "portiaui/app/components/project-structure-spider-generated-url.js",
"chars": 318,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\n\nexport default Ember.Component.extend({\n tagName: '',\n\n fr"
},
{
"path": "portiaui/app/components/project-structure-spider-url.js",
"chars": 1431,
"preview": "import Ember from 'ember';\nconst { computed } = Ember;\nimport { cleanUrl } from '../utils/utils';\n\nexport default Ember."
},
{
"path": "portiaui/app/components/regex-pattern-list.js",
"chars": 1413,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n classNames: ['regex-pattern-list'],\n\n list: ["
},
{
"path": "portiaui/app/components/reorder-handler.js",
"chars": 720,
"preview": "import Ember from 'ember';\n\nexport default Ember.Component.extend({\n attributeBindings: ['draggable', 'style'],\n "
}
]
// ... and 529 more files (download for full content)
About this extraction
This page contains the full source code of the scrapinghub/portia GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 729 files (5.8 MB), approximately 1.6M tokens, and a symbol index with 2806 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.