Repository: googledatalab/pydatalab Branch: master Commit: 8bf007da3e43 Files: 438 Total size: 6.3 MB Directory structure: gitextract_nogfp4_v/ ├── .build-bot.json ├── .coveragerc ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── datalab/ │ ├── README │ ├── __init__.py │ ├── bigquery/ │ │ ├── __init__.py │ │ ├── _api.py │ │ ├── _csv_options.py │ │ ├── _dataset.py │ │ ├── _dialect.py │ │ ├── _federated_table.py │ │ ├── _job.py │ │ ├── _parser.py │ │ ├── _query.py │ │ ├── _query_job.py │ │ ├── _query_results_table.py │ │ ├── _query_stats.py │ │ ├── _sampling.py │ │ ├── _schema.py │ │ ├── _table.py │ │ ├── _udf.py │ │ ├── _utils.py │ │ ├── _view.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── _bigquery.py │ ├── context/ │ │ ├── __init__.py │ │ ├── _api.py │ │ ├── _context.py │ │ ├── _project.py │ │ ├── _utils.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── _projects.py │ ├── data/ │ │ ├── __init__.py │ │ ├── _csv.py │ │ ├── _sql_module.py │ │ ├── _sql_statement.py │ │ ├── _utils.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── _sql.py │ ├── kernel/ │ │ └── __init__.py │ ├── notebook/ │ │ ├── __init__.py │ │ └── static/ │ │ ├── bigquery.css │ │ ├── bigquery.ts │ │ ├── charting.css │ │ ├── charting.ts │ │ ├── element.ts │ │ ├── extern/ │ │ │ ├── d3.parcoords.css │ │ │ ├── d3.parcoords.js │ │ │ ├── lantern-browser.html │ │ │ ├── parcoords-LICENSE.txt │ │ │ ├── sylvester-LICENSE.txt │ │ │ └── sylvester.js │ │ ├── job.css │ │ ├── job.ts │ │ ├── parcoords.ts │ │ ├── style.ts │ │ └── visualization.ts │ ├── stackdriver/ │ │ ├── __init__.py │ │ ├── commands/ │ │ │ ├── __init__.py │ │ │ └── _monitoring.py │ │ └── monitoring/ │ │ ├── __init__.py │ │ ├── _group.py │ │ ├── _metric.py │ │ ├── _query.py │ │ ├── _query_metadata.py │ │ ├── _resource.py │ │ └── _utils.py │ ├── storage/ │ │ ├── __init__.py │ │ ├── _api.py │ │ ├── _bucket.py │ │ ├── _item.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── _storage.py │ └── utils/ │ ├── __init__.py │ ├── _async.py │ ├── _dataflow_job.py │ ├── _gcp_job.py │ ├── _http.py │ ├── _iterator.py │ ├── _job.py │ ├── _json_encoder.py │ ├── _lambda_job.py │ ├── _lru_cache.py │ ├── _utils.py │ └── commands/ │ ├── __init__.py │ ├── _chart.py │ ├── _chart_data.py │ ├── _commands.py │ ├── _csv.py │ ├── _extension.py │ ├── _html.py │ ├── _job.py │ ├── _modules.py │ └── _utils.py ├── docs/ │ ├── .nojekyll │ ├── Makefile │ ├── README │ ├── conf.py │ ├── datalab Commands.rst │ ├── datalab.bigquery.rst │ ├── datalab.context.rst │ ├── datalab.data.rst │ ├── datalab.stackdriver.monitoring.rst │ ├── datalab.storage.rst │ ├── gen-magic-rst.ipy │ ├── google.datalab Commands.rst │ ├── google.datalab.bigquery.rst │ ├── google.datalab.data.rst │ ├── google.datalab.ml.rst │ ├── google.datalab.rst │ ├── google.datalab.stackdriver.monitoring.rst │ ├── google.datalab.storage.rst │ ├── index.rst │ ├── make.bat │ ├── mltoolbox.classification.dnn.rst │ ├── mltoolbox.classification.linear.rst │ ├── mltoolbox.image.classification.rst │ ├── mltoolbox.regression.dnn.rst │ └── mltoolbox.regression.linear.rst ├── externs/ │ └── ts/ │ └── require/ │ └── require.d.ts ├── google/ │ ├── __init__.py │ └── datalab/ │ ├── __init__.py │ ├── _context.py │ ├── _job.py │ ├── bigquery/ │ │ ├── __init__.py │ │ ├── _api.py │ │ ├── _csv_options.py │ │ ├── _dataset.py │ │ ├── _external_data_source.py │ │ ├── _job.py │ │ ├── _parser.py │ │ ├── _query.py │ │ ├── _query_job.py │ │ ├── _query_output.py │ │ ├── _query_results_table.py │ │ ├── _query_stats.py │ │ ├── _sampling.py │ │ ├── 
_schema.py │ │ ├── _table.py │ │ ├── _udf.py │ │ ├── _utils.py │ │ ├── _view.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── _bigquery.py │ ├── commands/ │ │ ├── __init__.py │ │ └── _datalab.py │ ├── contrib/ │ │ ├── __init__.py │ │ ├── bigquery/ │ │ │ ├── __init__.py │ │ │ ├── commands/ │ │ │ │ ├── __init__.py │ │ │ │ └── _bigquery.py │ │ │ └── operators/ │ │ │ ├── __init__.py │ │ │ ├── _bq_execute_operator.py │ │ │ ├── _bq_extract_operator.py │ │ │ └── _bq_load_operator.py │ │ ├── mlworkbench/ │ │ │ ├── __init__.py │ │ │ ├── _archive.py │ │ │ ├── _local_predict.py │ │ │ ├── _prediction_explainer.py │ │ │ ├── _shell_process.py │ │ │ └── commands/ │ │ │ ├── __init__.py │ │ │ └── _ml.py │ │ └── pipeline/ │ │ ├── __init__.py │ │ ├── _pipeline.py │ │ ├── airflow/ │ │ │ ├── __init__.py │ │ │ └── _airflow.py │ │ ├── commands/ │ │ │ ├── __init__.py │ │ │ └── _pipeline.py │ │ └── composer/ │ │ ├── __init__.py │ │ ├── _api.py │ │ └── _composer.py │ ├── data/ │ │ ├── __init__.py │ │ └── _csv_file.py │ ├── kernel/ │ │ └── __init__.py │ ├── ml/ │ │ ├── __init__.py │ │ ├── _cloud_models.py │ │ ├── _cloud_training_config.py │ │ ├── _confusion_matrix.py │ │ ├── _dataset.py │ │ ├── _fasets.py │ │ ├── _feature_slice_view.py │ │ ├── _job.py │ │ ├── _metrics.py │ │ ├── _summary.py │ │ ├── _tensorboard.py │ │ └── _util.py │ ├── notebook/ │ │ ├── __init__.py │ │ └── static/ │ │ ├── bigquery.css │ │ ├── bigquery.ts │ │ ├── charting.css │ │ ├── charting.ts │ │ ├── element.ts │ │ ├── extern/ │ │ │ ├── d3.parcoords.css │ │ │ ├── d3.parcoords.js │ │ │ ├── facets-jupyter.html │ │ │ ├── lantern-browser.html │ │ │ ├── parcoords-LICENSE.txt │ │ │ ├── sylvester-LICENSE.txt │ │ │ └── sylvester.js │ │ ├── job.css │ │ ├── job.ts │ │ ├── parcoords.ts │ │ ├── style.ts │ │ └── visualization.ts │ ├── stackdriver/ │ │ ├── __init__.py │ │ ├── commands/ │ │ │ ├── __init__.py │ │ │ └── _monitoring.py │ │ └── monitoring/ │ │ ├── __init__.py │ │ ├── _group.py │ │ ├── _metric.py │ │ ├── _query.py │ │ ├── _query_metadata.py │ │ ├── _resource.py │ │ └── _utils.py │ ├── storage/ │ │ ├── __init__.py │ │ ├── _api.py │ │ ├── _bucket.py │ │ ├── _object.py │ │ └── commands/ │ │ ├── __init__.py │ │ └── _storage.py │ └── utils/ │ ├── __init__.py │ ├── _async.py │ ├── _dataflow_job.py │ ├── _gcp_job.py │ ├── _http.py │ ├── _iterator.py │ ├── _json_encoder.py │ ├── _lambda_job.py │ ├── _lru_cache.py │ ├── _utils.py │ ├── commands/ │ │ ├── __init__.py │ │ ├── _chart.py │ │ ├── _chart_data.py │ │ ├── _commands.py │ │ ├── _csv.py │ │ ├── _html.py │ │ ├── _job.py │ │ └── _utils.py │ └── facets/ │ ├── __init__.py │ ├── base_feature_statistics_generator.py │ ├── base_generic_feature_statistics_generator.py │ ├── feature_statistics_generator.py │ ├── feature_statistics_pb2.py │ └── generic_feature_statistics_generator.py ├── install-no-virtualenv.sh ├── install-virtualenv.sh ├── legacy_tests/ │ ├── _util/ │ │ ├── __init__.py │ │ ├── http_tests.py │ │ ├── lru_cache_tests.py │ │ └── util_tests.py │ ├── bigquery/ │ │ ├── __init__.py │ │ ├── api_tests.py │ │ ├── dataset_tests.py │ │ ├── federated_table_tests.py │ │ ├── jobs_tests.py │ │ ├── parser_tests.py │ │ ├── query_tests.py │ │ ├── sampling_tests.py │ │ ├── schema_tests.py │ │ ├── table_tests.py │ │ ├── udf_tests.py │ │ └── view_tests.py │ ├── data/ │ │ ├── __init__.py │ │ └── sql_tests.py │ ├── kernel/ │ │ ├── __init__.py │ │ ├── bigquery_tests.py │ │ ├── chart_data_tests.py │ │ ├── chart_tests.py │ │ ├── commands_tests.py │ │ ├── html_tests.py │ │ ├── module_tests.py │ │ ├── sql_tests.py │ │ ├── 
storage_tests.py │ │ └── utils_tests.py │ ├── main.py │ ├── stackdriver/ │ │ ├── __init__.py │ │ ├── commands/ │ │ │ ├── __init__.py │ │ │ └── monitoring_tests.py │ │ └── monitoring/ │ │ ├── __init__.py │ │ ├── group_tests.py │ │ ├── metric_tests.py │ │ ├── query_metadata_tests.py │ │ ├── query_tests.py │ │ ├── resource_tests.py │ │ └── utils_tests.py │ └── storage/ │ ├── __init__.py │ ├── api_tests.py │ ├── bucket_tests.py │ └── item_tests.py ├── release.sh ├── setup.cfg ├── setup.py ├── solutionbox/ │ ├── image_classification/ │ │ ├── mltoolbox/ │ │ │ ├── __init__.py │ │ │ └── image/ │ │ │ ├── __init__.py │ │ │ └── classification/ │ │ │ ├── __init__.py │ │ │ ├── _api.py │ │ │ ├── _cloud.py │ │ │ ├── _inceptionlib.py │ │ │ ├── _local.py │ │ │ ├── _model.py │ │ │ ├── _predictor.py │ │ │ ├── _preprocess.py │ │ │ ├── _trainer.py │ │ │ ├── _util.py │ │ │ ├── setup.py │ │ │ └── task.py │ │ └── setup.py │ ├── ml_workbench/ │ │ ├── setup.py │ │ ├── tensorflow/ │ │ │ ├── __init__.py │ │ │ ├── analyze.py │ │ │ ├── setup.py │ │ │ ├── trainer/ │ │ │ │ ├── __init__.py │ │ │ │ ├── feature_analysis.py │ │ │ │ ├── feature_transforms.py │ │ │ │ └── task.py │ │ │ └── transform.py │ │ ├── test_tensorflow/ │ │ │ ├── run_all.sh │ │ │ ├── test_analyze.py │ │ │ ├── test_cloud_workflow.py │ │ │ ├── test_feature_transforms.py │ │ │ ├── test_training.py │ │ │ └── test_transform.py │ │ ├── test_xgboost/ │ │ │ ├── run_all.sh │ │ │ ├── test_analyze.py │ │ │ └── test_transform.py │ │ └── xgboost/ │ │ ├── __init__.py │ │ ├── analyze.py │ │ ├── setup.py │ │ ├── trainer/ │ │ │ ├── __init__.py │ │ │ ├── feature_analysis.py │ │ │ ├── feature_transforms.py │ │ │ └── task.py │ │ └── transform.py │ └── structured_data/ │ ├── build.sh │ ├── mltoolbox/ │ │ ├── __init__.py │ │ ├── _structured_data/ │ │ │ ├── __init__.py │ │ │ ├── __version__.py │ │ │ ├── _package.py │ │ │ ├── master_setup.py │ │ │ ├── prediction/ │ │ │ │ ├── __init__.py │ │ │ │ └── predict.py │ │ │ ├── preprocess/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cloud_preprocess.py │ │ │ │ └── local_preprocess.py │ │ │ └── trainer/ │ │ │ ├── __init__.py │ │ │ ├── task.py │ │ │ └── util.py │ │ ├── classification/ │ │ │ ├── __init__.py │ │ │ ├── dnn/ │ │ │ │ ├── __init__.py │ │ │ │ └── _classification_dnn.py │ │ │ └── linear/ │ │ │ ├── __init__.py │ │ │ └── _classification_linear.py │ │ └── regression/ │ │ ├── __init__.py │ │ ├── dnn/ │ │ │ ├── __init__.py │ │ │ └── _regression_dnn.py │ │ └── linear/ │ │ ├── __init__.py │ │ └── _regression_linear.py │ ├── setup.py │ └── test_mltoolbox/ │ ├── __init__.py │ ├── e2e_functions.py │ ├── test_datalab_e2e.py │ ├── test_package_functions.py │ ├── test_sd_preprocess.py │ └── test_sd_trainer.py ├── tests/ │ ├── _util/ │ │ ├── __init__.py │ │ ├── commands_tests.py │ │ ├── feature_statistics_generator_test.py │ │ ├── generic_feature_statistics_generator_test.py │ │ ├── http_tests.py │ │ ├── lru_cache_tests.py │ │ └── util_tests.py │ ├── bigquery/ │ │ ├── __init__.py │ │ ├── api_tests.py │ │ ├── dataset_tests.py │ │ ├── external_data_source_tests.py │ │ ├── jobs_tests.py │ │ ├── operator_tests.py │ │ ├── parser_tests.py │ │ ├── pipeline_tests.py │ │ ├── query_tests.py │ │ ├── sampling_tests.py │ │ ├── schema_tests.py │ │ ├── table_tests.py │ │ ├── udf_tests.py │ │ └── view_tests.py │ ├── context_tests.py │ ├── integration/ │ │ └── storage_test.py │ ├── kernel/ │ │ ├── __init__.py │ │ ├── bigquery_tests.py │ │ ├── chart_data_tests.py │ │ ├── chart_tests.py │ │ ├── html_tests.py │ │ ├── pipeline_tests.py │ │ ├── storage_tests.py │ │ └── 
utils_tests.py │ ├── main.py │ ├── ml/ │ │ ├── __init__.py │ │ ├── confusion_matrix_tests.py │ │ ├── dataset_tests.py │ │ ├── facets_tests.py │ │ ├── metrics_tests.py │ │ ├── summary_tests.py │ │ └── tensorboard_tests.py │ ├── ml_workbench/ │ │ ├── __init__.py │ │ └── all_tests.py │ ├── mltoolbox_structured_data/ │ │ ├── __init__.py │ │ ├── dl_interface_tests.py │ │ ├── sd_e2e_tests.py │ │ └── traininglib_tests.py │ ├── mlworkbench_magic/ │ │ ├── __init__.py │ │ ├── archive_tests.py │ │ ├── explainer_tests.py │ │ ├── local_predict_tests.py │ │ ├── ml_tests.py │ │ └── shell_process_tests.py │ ├── pipeline/ │ │ ├── __init__.py │ │ ├── airflow_tests.py │ │ ├── composer_api_tests.py │ │ ├── composer_tests.py │ │ └── pipeline_tests.py │ ├── stackdriver/ │ │ ├── __init__.py │ │ ├── commands/ │ │ │ ├── __init__.py │ │ │ └── monitoring_tests.py │ │ └── monitoring/ │ │ ├── __init__.py │ │ ├── group_tests.py │ │ ├── metric_tests.py │ │ ├── query_metadata_tests.py │ │ ├── query_tests.py │ │ ├── resource_tests.py │ │ └── utils_tests.py │ └── storage/ │ ├── __init__.py │ ├── api_tests.py │ ├── bucket_tests.py │ └── object_tests.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .coveragerc ================================================ # .coveragerc to control coverage.py [run] [report] include = */site-packages/google/datalab/* ================================================ FILE: .gitignore ================================================ *.pyc *.pyi *.map *.egg-info *.iml .idea .DS_Store MANIFEST build .coverage dist datalab.magics.rst datalab/notebook/static/*.js google/datalab/notebook/static/*.js # Test files .tox/ .cache/ ================================================ FILE: .travis.yml ================================================ language: python dist: trusty sudo: false matrix: include: - python: 2.7 env: TOX_ENV=py27 - python: 3.5 env: TOX_ENV=py35 - python: 2.7 env: TOX_ENV=flake8 - python: 2.7 env: TOX_ENV=coveralls before_install: - npm install -g typescript@3.0.3 - tsc --module amd --noImplicitAny --outdir datalab/notebook/static datalab/notebook/static/*.ts # We use tox for actually running tests. - pip install --upgrade pip tox script: # tox reads its configuration from tox.ini. - tox -e $TOX_ENV ================================================ FILE: CONTRIBUTING.md ================================================ Want to contribute? Great! First, read this page (including the small print at the end). ### Before you contribute Before we can use your code, you must sign the [Google Individual Contributor License Agreement] (https://cla.developers.google.com/about/google-individual) (CLA), which you can do online. The CLA is necessary mainly because you own the copyright to your changes, even after your contribution becomes part of our codebase, so we need your permission to use and distribute your code. We also need to be sure of various other things—for instance that you'll tell us if you know that your code infringes on other people's patents. You don't have to sign the CLA until after you've submitted your code for review and a member has approved it, but you must do it before we can put your code into our codebase. Before you start working on a larger contribution, you should get in touch with us first through the issue tracker with your idea so that we can help out and possibly guide you. 
Coordinating up front makes it much easier to avoid frustration later on. ### Code reviews All submissions, including submissions by project members, require review. We use Github pull requests for this purpose. ### Running tests We use [`tox`](https://tox.readthedocs.io/) for running our tests. To run tests before sending out a pull request, just [install tox](https://tox.readthedocs.io/en/latest/install.html) and run ```shell $ tox ``` to run tests under all supported environments. (This will skip any environments for which no interpreter is available.) `tox -l` will provide a list of all supported environments. `tox` will run all tests referenced by `tests/main.py` and `legacy_tests/main.py`. ### The small print Contributions made by corporations are covered by a different agreement than the one above, the [Software Grant and Corporate Contributor License Agreement] (https://cla.developers.google.com/about/google-corporate). ================================================ FILE: LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Google Cloud DataLab Datalab is deprecated. [Vertex AI Workbench](https://cloud.google.com/vertex-ai/docs/workbench) provides a notebook-based environment that offers capabilities beyond Datalab. We recommend that you use Vertex AI Workbench for new projects and [migrate your Datalab notebooks to Vertex AI Workbench](https://cloud.google.com/datalab/docs/resources/troubleshooting#migrate). For more information, see [Deprecation information](https://cloud.google.com/datalab/docs/resources/deprecation). To get help migrating Datalab projects to Vertex AI Workbench see [Get help](https://cloud.google.com/datalab/docs/resources/support#get-help). ================================================ FILE: datalab/README ================================================ Everything under datalab namespace is actively maintained but no new features are being added. Please use corresponding libraries under google.datalab namespace (source code under google/datalab directory). To migrate existing code that relies on datalab namespace, since most API interfaces are the same between google.datalab and datalab, usually you just need to change the import namespace. The magic interface is different for bigquery though (%%sql --> %%bq). For more details please see https://github.com/googledatalab/pydatalab/wiki/%60datalab%60-to-%60google.datalab%60-Migration-Guide. ================================================ FILE: datalab/__init__.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. ================================================ FILE: datalab/bigquery/__init__.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
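# Migration sketch for the datalab/README note above: moving from the maintained-only `datalab`
# namespace to `google.datalab` is usually just an import change; for BigQuery the cell magic also
# changes from %%sql to %%bq. Call sites are assumed to stay otherwise the same.
import datalab.bigquery as bq          # legacy namespace (no new features)
import google.datalab.bigquery as bq   # equivalent import under the actively developed namespace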
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Google Cloud Platform library - BigQuery Functionality.""" from __future__ import absolute_import from ._csv_options import CSVOptions from ._dataset import Dataset, Datasets from ._dialect import Dialect from ._federated_table import FederatedTable from ._job import Job from ._query import Query from ._query_job import QueryJob from ._query_results_table import QueryResultsTable from ._query_stats import QueryStats from ._sampling import Sampling from ._schema import Schema from ._table import Table, TableMetadata from ._udf import UDF from ._utils import TableName, DatasetName from ._view import View __all__ = ['CSVOptions', 'Dataset', 'Datasets', 'Dialect', 'FederatedTable', 'Query', 'QueryJob', 'QueryResultsTable', 'QueryStats', 'Sampling', 'Schema', 'Table', 'TableMetadata', 'UDF', 'TableName', 'DatasetName', 'View'] def wait_any(jobs, timeout=None): """ Return when at least one of the specified jobs has completed or timeout expires. Args: jobs: a list of Jobs to wait on. timeout: a timeout in seconds to wait for. None (the default) means no timeout. Returns: Once at least one job completes, a list of all completed jobs. If the call times out then an empty list will be returned. """ return Job.wait_any(jobs, timeout) def wait_all(jobs, timeout=None): """ Return when all of the specified jobs have completed or timeout expires. Args: jobs: a single Job or list of Jobs to wait on. timeout: a timeout in seconds to wait for. None (the default) means no timeout. Returns: A list of completed Jobs. If the call timed out this will be shorter than the list of jobs supplied as a parameter. """ return Job.wait_all(jobs, timeout) ================================================ FILE: datalab/bigquery/_api.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements BigQuery HTTP API wrapper.""" from __future__ import absolute_import from __future__ import unicode_literals from past.builtins import basestring from builtins import object import datalab.utils import datalab.bigquery class Api(object): """A helper class to issue BigQuery HTTP requests.""" # TODO(nikhilko): Use named placeholders in these string templates. _ENDPOINT = 'https://www.googleapis.com/bigquery/v2' _JOBS_PATH = '/projects/%s/jobs/%s' _QUERIES_PATH = '/projects/%s/queries/%s' _DATASETS_PATH = '/projects/%s/datasets/%s' _TABLES_PATH = '/projects/%s/datasets/%s/tables/%s%s' _TABLEDATA_PATH = '/projects/%s/datasets/%s/tables/%s%s/data' _DEFAULT_TIMEOUT = 60000 def __init__(self, context): """Initializes the BigQuery helper with context information. 
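# Usage sketch for the wait_any/wait_all helpers defined in datalab/bigquery/__init__.py above.
# The Job objects are placeholders; in practice they would come from asynchronous operations
# (for example query or extract jobs) started elsewhere in the library.
import datalab.bigquery as bq

jobs = [job_a, job_b, job_c]               # placeholder bq.Job instances obtained elsewhere
done = bq.wait_any(jobs, timeout=60)       # completed jobs so far, or [] if the timeout expired
all_done = bq.wait_all(jobs, timeout=600)  # may be shorter than `jobs` if the call timed out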
Args: context: a Context object providing project_id and credentials. """ self._credentials = context.credentials self._project_id = context.project_id @property def project_id(self): """The project_id associated with this API client.""" return self._project_id def jobs_insert_load(self, source, table_name, append=False, overwrite=False, create=False, source_format='CSV', field_delimiter=',', allow_jagged_rows=False, allow_quoted_newlines=False, encoding='UTF-8', ignore_unknown_values=False, max_bad_records=0, quote='"', skip_leading_rows=0): """ Issues a request to load data from GCS to a BQ table Args: source: the URL of the source bucket(s). Can include wildcards, and can be a single string argument or a list. table_name: a tuple representing the full name of the destination table. append: if True append onto existing table contents. overwrite: if True overwrite existing table contents. create: if True, create the table if it doesn't exist source_format: the format of the data; default 'CSV'. Other options are DATASTORE_BACKUP or NEWLINE_DELIMITED_JSON. field_delimiter: The separator for fields in a CSV file. BigQuery converts the string to ISO-8859-1 encoding, and then uses the first byte of the encoded string to split the data as raw binary (default ','). allow_jagged_rows: If True, accept rows in CSV files that are missing trailing optional columns; the missing values are treated as nulls (default False). allow_quoted_newlines: If True, allow quoted data sections in CSV files that contain newline characters (default False). encoding: The character encoding of the data, either 'UTF-8' (the default) or 'ISO-8859-1'. ignore_unknown_values: If True, accept rows that contain values that do not match the schema; the unknown values are ignored (default False). max_bad_records: The maximum number of bad records that are allowed (and ignored) before returning an 'invalid' error in the Job result (default 0). quote: The value used to quote data sections in a CSV file; default '"'. If your data does not contain quoted sections, set the property value to an empty string. If your data contains quoted newline characters, you must also enable allow_quoted_newlines. skip_leading_rows: A number of rows at the top of a CSV file to skip (default 0). Returns: A parsed result object. Raises: Exception if there is an error performing the operation. 
""" url = Api._ENDPOINT + (Api._JOBS_PATH % (table_name.project_id, '')) if isinstance(source, basestring): source = [source] write_disposition = 'WRITE_EMPTY' if overwrite: write_disposition = 'WRITE_TRUNCATE' if append: write_disposition = 'WRITE_APPEND' data = { 'kind': 'bigquery#job', 'configuration': { 'load': { 'sourceUris': source, 'destinationTable': { 'projectId': table_name.project_id, 'datasetId': table_name.dataset_id, 'tableId': table_name.table_id }, 'createDisposition': 'CREATE_IF_NEEDED' if create else 'CREATE_NEVER', 'writeDisposition': write_disposition, 'sourceFormat': source_format, 'ignoreUnknownValues': ignore_unknown_values, 'maxBadRecords': max_bad_records, } } } if source_format == 'CSV': load_config = data['configuration']['load'] load_config.update({ 'fieldDelimiter': field_delimiter, 'allowJaggedRows': allow_jagged_rows, 'allowQuotedNewlines': allow_quoted_newlines, 'quote': quote, 'encoding': encoding, 'skipLeadingRows': skip_leading_rows }) return datalab.utils.Http.request(url, data=data, credentials=self._credentials) def jobs_insert_query(self, sql, code=None, imports=None, table_name=None, append=False, overwrite=False, dry_run=False, use_cache=True, batch=True, allow_large_results=False, table_definitions=None, dialect=None, billing_tier=None): """Issues a request to insert a query job. Args: sql: the SQL string representing the query to execute. code: code for Javascript UDFs, if any. imports: a list of GCS URLs containing additional Javascript UDF support code, if any. table_name: None for an anonymous table, or a name parts tuple for a long-lived table. append: if True, append to the table if it is non-empty; else the request will fail if table is non-empty unless overwrite is True. overwrite: if the table already exists, truncate it instead of appending or raising an Exception. dry_run: whether to actually execute the query or just dry run it. use_cache: whether to use past query results or ignore cache. Has no effect if destination is specified. batch: whether to run this as a batch job (lower priority) or as an interactive job (high priority, more expensive). allow_large_results: whether to allow large results (slower with some restrictions but can handle big jobs). table_definitions: a list of JSON external table definitions for any external tables referenced in the query. dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. 
""" url = Api._ENDPOINT + (Api._JOBS_PATH % (self._project_id, '')) if dialect is None: dialect = datalab.bigquery.Dialect.default().bq_dialect data = { 'kind': 'bigquery#job', 'configuration': { 'query': { 'query': sql, 'useQueryCache': use_cache, 'allowLargeResults': allow_large_results, 'useLegacySql': dialect == 'legacy' }, 'dryRun': dry_run, 'priority': 'BATCH' if batch else 'INTERACTIVE', }, } query_config = data['configuration']['query'] resources = [] if code: resources.extend([{'inlineCode': fragment} for fragment in code]) if imports: resources.extend([{'resourceUri': uri} for uri in imports]) query_config['userDefinedFunctionResources'] = resources if table_definitions: query_config['tableDefinitions'] = table_definitions if table_name: query_config['destinationTable'] = { 'projectId': table_name.project_id, 'datasetId': table_name.dataset_id, 'tableId': table_name.table_id } if append: query_config['writeDisposition'] = "WRITE_APPEND" elif overwrite: query_config['writeDisposition'] = "WRITE_TRUNCATE" if billing_tier: query_config['maximumBillingTier'] = billing_tier return datalab.utils.Http.request(url, data=data, credentials=self._credentials) def jobs_query_results(self, job_id, project_id, page_size, timeout, start_index=0): """Issues a request to the jobs/getQueryResults method. Args: job_id: the id of job from a previously executed query. project_id: the project id to use to fetch the results; use None for the default project. page_size: limit to the number of rows to fetch. timeout: duration (in milliseconds) to wait for the query to complete. start_index: the index of the row (0-based) at which to start retrieving the page of result rows. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ if timeout is None: timeout = Api._DEFAULT_TIMEOUT if project_id is None: project_id = self._project_id args = { 'maxResults': page_size, 'timeoutMs': timeout, 'startIndex': start_index } url = Api._ENDPOINT + (Api._QUERIES_PATH % (project_id, job_id)) return datalab.utils.Http.request(url, args=args, credentials=self._credentials) def jobs_get(self, job_id, project_id=None): """Issues a request to retrieve information about a job. Args: job_id: the id of the job project_id: the project id to use to fetch the results; use None for the default project. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ if project_id is None: project_id = self._project_id url = Api._ENDPOINT + (Api._JOBS_PATH % (project_id, job_id)) return datalab.utils.Http.request(url, credentials=self._credentials) def datasets_insert(self, dataset_name, friendly_name=None, description=None): """Issues a request to create a dataset. Args: dataset_name: the name of the dataset to create. friendly_name: (optional) the friendly name for the dataset description: (optional) a description for the dataset Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._DATASETS_PATH % (dataset_name.project_id, '')) data = { 'kind': 'bigquery#dataset', 'datasetReference': { 'projectId': dataset_name.project_id, 'datasetId': dataset_name.dataset_id }, } if friendly_name: data['friendlyName'] = friendly_name if description: data['description'] = description return datalab.utils.Http.request(url, data=data, credentials=self._credentials) def datasets_delete(self, dataset_name, delete_contents): """Issues a request to delete a dataset. 
Args: dataset_name: the name of the dataset to delete. delete_contents: if True, any tables in the dataset will be deleted. If False and the dataset is non-empty an exception will be raised. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._DATASETS_PATH % dataset_name) args = {} if delete_contents: args['deleteContents'] = True return datalab.utils.Http.request(url, method='DELETE', args=args, credentials=self._credentials, raw_response=True) def datasets_update(self, dataset_name, dataset_info): """Updates the Dataset info. Args: dataset_name: the name of the dataset to update as a tuple of components. dataset_info: the Dataset resource with updated fields. """ url = Api._ENDPOINT + (Api._DATASETS_PATH % dataset_name) return datalab.utils.Http.request(url, method='PUT', data=dataset_info, credentials=self._credentials) def datasets_get(self, dataset_name): """Issues a request to retrieve information about a dataset. Args: dataset_name: the name of the dataset Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._DATASETS_PATH % dataset_name) return datalab.utils.Http.request(url, credentials=self._credentials) def datasets_list(self, project_id=None, max_results=0, page_token=None): """Issues a request to list the datasets in the project. Args: project_id: the project id to use to fetch the results; use None for the default project. max_results: an optional maximum number of tables to retrieve. page_token: an optional token to continue the retrieval. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ if project_id is None: project_id = self._project_id url = Api._ENDPOINT + (Api._DATASETS_PATH % (project_id, '')) args = {} if max_results != 0: args['maxResults'] = max_results if page_token is not None: args['pageToken'] = page_token return datalab.utils.Http.request(url, args=args, credentials=self._credentials) def tables_get(self, table_name): """Issues a request to retrieve information about a table. Args: table_name: a tuple representing the full name of the table. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._TABLES_PATH % table_name) return datalab.utils.Http.request(url, credentials=self._credentials) def tables_list(self, dataset_name, max_results=0, page_token=None): """Issues a request to retrieve a list of tables. Args: dataset_name: the name of the dataset to enumerate. max_results: an optional maximum number of tables to retrieve. page_token: an optional token to continue the retrieval. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT +\ (Api._TABLES_PATH % (dataset_name.project_id, dataset_name.dataset_id, '', '')) args = {} if max_results != 0: args['maxResults'] = max_results if page_token is not None: args['pageToken'] = page_token return datalab.utils.Http.request(url, args=args, credentials=self._credentials) def tables_insert(self, table_name, schema=None, query=None, friendly_name=None, description=None): """Issues a request to create a table or view in the specified dataset with the specified id. A schema must be provided to create a Table, or a query must be provided to create a View. Args: table_name: the name of the table as a tuple of components. schema: the schema, if this is a Table creation. 
query: the query, if this is a View creation. friendly_name: an optional friendly name. description: an optional description. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + \ (Api._TABLES_PATH % (table_name.project_id, table_name.dataset_id, '', '')) data = { 'kind': 'bigquery#table', 'tableReference': { 'projectId': table_name.project_id, 'datasetId': table_name.dataset_id, 'tableId': table_name.table_id } } if schema: data['schema'] = {'fields': schema} if query: data['view'] = {'query': query} if friendly_name: data['friendlyName'] = friendly_name if description: data['description'] = description return datalab.utils.Http.request(url, data=data, credentials=self._credentials) def tabledata_insert_all(self, table_name, rows): """Issues a request to insert data into a table. Args: table_name: the name of the table as a tuple of components. rows: the data to populate the table, as a list of dictionaries. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._TABLES_PATH % table_name) + "/insertAll" data = { 'kind': 'bigquery#tableDataInsertAllRequest', 'rows': rows } return datalab.utils.Http.request(url, data=data, credentials=self._credentials) def tabledata_list(self, table_name, start_index=None, max_results=None, page_token=None): """ Retrieves the contents of a table. Args: table_name: the name of the table as a tuple of components. start_index: the index of the row at which to start retrieval. max_results: an optional maximum number of rows to retrieve. page_token: an optional token to continue the retrieval. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._TABLEDATA_PATH % table_name) args = {} if start_index: args['startIndex'] = start_index if max_results: args['maxResults'] = max_results if page_token is not None: args['pageToken'] = page_token return datalab.utils.Http.request(url, args=args, credentials=self._credentials) def table_delete(self, table_name): """Issues a request to delete a table. Args: table_name: the name of the table as a tuple of components. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._TABLES_PATH % table_name) return datalab.utils.Http.request(url, method='DELETE', credentials=self._credentials, raw_response=True) def table_extract(self, table_name, destination, format='CSV', compress=True, field_delimiter=',', print_header=True): """Exports the table to GCS. Args: table_name: the name of the table as a tuple of components. destination: the destination URI(s). Can be a single URI or a list. format: the format to use for the exported data; one of CSV, NEWLINE_DELIMITED_JSON or AVRO. Defaults to CSV. compress: whether to compress the data on export. Compression is not supported for AVRO format. Defaults to True. field_delimiter: for CSV exports, the field delimiter to use. Defaults to ','. print_header: for CSV exports, whether to include an initial header line. Defaults to True. Returns: A parsed result object. Raises: Exception if there is an error performing the operation. """ url = Api._ENDPOINT + (Api._JOBS_PATH % (table_name.project_id, '')) if isinstance(destination, basestring): destination = [destination] data = { # 'projectId': table_name.project_id, # Code sample shows this but it is not in job # reference spec.
Filed as b/19235843 'kind': 'bigquery#job', 'configuration': { 'extract': { 'sourceTable': { 'projectId': table_name.project_id, 'datasetId': table_name.dataset_id, 'tableId': table_name.table_id, }, 'compression': 'GZIP' if compress else 'NONE', 'fieldDelimiter': field_delimiter, 'printHeader': print_header, 'destinationUris': destination, 'destinationFormat': format, } } } return datalab.utils.Http.request(url, data=data, credentials=self._credentials) def table_update(self, table_name, table_info): """Updates the Table info. Args: table_name: the name of the table to update as a tuple of components. table_info: the Table resource with updated fields. """ url = Api._ENDPOINT + (Api._TABLES_PATH % table_name) return datalab.utils.Http.request(url, method='PUT', data=table_info, credentials=self._credentials) ================================================ FILE: datalab/bigquery/_csv_options.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements CSV options for External Tables and Table loads from GCS.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import object class CSVOptions(object): def __init__(self, delimiter=',', skip_leading_rows=0, encoding='utf-8', quote='"', allow_quoted_newlines=False, allow_jagged_rows=False): """ Initialize an instance of CSV options. Args: delimiter: The separator for fields in a CSV file. BigQuery converts the string to ISO-8859-1 encoding, and then uses the first byte of the encoded string to split the data as raw binary (default ','). skip_leading_rows: A number of rows at the top of a CSV file to skip (default 0). encoding: The character encoding of the data, either 'utf-8' (the default) or 'iso-8859-1'. quote: The value used to quote data sections in a CSV file; default '"'. If your data does not contain quoted sections, set the property value to an empty string. If your data contains quoted newline characters, you must also enable allow_quoted_newlines. allow_quoted_newlines: If True, allow quoted data sections in CSV files that contain newline characters (default False). allow_jagged_rows: If True, accept rows in CSV files that are missing trailing optional columns; the missing values are treated as nulls (default False). 
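# A small illustration of the CSVOptions constructor described above; the values are arbitrary.
# Encodings other than 'utf-8'/'iso-8859-1' are rejected by the validation that follows.
from datalab.bigquery import CSVOptions

options = CSVOptions(delimiter='\t',             # tab-separated source
                     skip_leading_rows=1,        # skip the header row
                     allow_quoted_newlines=True)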
""" encoding_upper = encoding.upper() if encoding_upper != 'UTF-8' and encoding_upper != 'ISO-8859-1': raise Exception("Invalid source encoding %s" % encoding) self._delimiter = delimiter self._skip_leading_rows = skip_leading_rows self._encoding = encoding self._quote = quote self._allow_quoted_newlines = allow_quoted_newlines self._allow_jagged_rows = allow_jagged_rows @property def delimiter(self): return self._delimiter @property def skip_leading_rows(self): return self._skip_leading_rows @property def encoding(self): return self._encoding @property def quote(self): return self._quote @property def allow_quoted_newlines(self): return self._allow_quoted_newlines @property def allow_jagged_rows(self): return self._allow_jagged_rows def _to_query_json(self): """ Return the options as a dictionary to be used as JSON in a query job. """ return { 'quote': self._quote, 'fieldDelimiter': self._delimiter, 'encoding': self._encoding.upper(), 'skipLeadingRows': self._skip_leading_rows, 'allowQuotedNewlines': self._allow_quoted_newlines, 'allowJaggedRows': self._allow_jagged_rows } ================================================ FILE: datalab/bigquery/_dataset.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements Dataset, and related Dataset BigQuery APIs.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import object import datalab.context import datalab.utils from . import _api from . import _table from . import _utils from . import _view class Dataset(object): """Represents a list of BigQuery tables in a dataset.""" def __init__(self, name, context=None): """Initializes an instance of a Dataset. Args: name: the name of the dataset, as a string or (project_id, dataset_id) tuple. context: an optional Context object providing project_id and credentials. If a specific project id or credentials are unspecified, the default ones configured at the global level are used. Raises: Exception if the name is invalid. """ if context is None: context = datalab.context.Context.default() self._context = context self._api = _api.Api(context) self._name_parts = _utils.parse_dataset_name(name, self._api.project_id) self._full_name = '%s:%s' % self._name_parts self._info = None try: self._info = self._get_info() except datalab.utils.RequestException: pass @property def name(self): """The DatasetName named tuple (project_id, dataset_id) for the dataset.""" return self._name_parts @property def description(self): """The description of the dataset, if any. Raises: Exception if the dataset exists but the metadata for the dataset could not be retrieved. """ self._get_info() return self._info['description'] if self._info else None @property def friendly_name(self): """The friendly name of the dataset, if any. Raises: Exception if the dataset exists but the metadata for the dataset could not be retrieved. 
""" self._get_info() return self._info['friendlyName'] if self._info else None def _get_info(self): try: if self._info is None: self._info = self._api.datasets_get(self._name_parts) return self._info except datalab.utils.RequestException as e: if e.status == 404: return None raise e except Exception as e: raise e def exists(self): """ Checks if the dataset exists. Returns: True if the dataset exists; False otherwise. Raises: Exception if the dataset exists but the metadata for the dataset could not be retrieved. """ self._get_info() return self._info is not None def delete(self, delete_contents=False): """Issues a request to delete the dataset. Args: delete_contents: if True, any tables and views in the dataset will be deleted. If False and the dataset is non-empty an exception will be raised. Returns: None on success. Raises: Exception if the delete fails (including if table was nonexistent). """ if not self.exists(): raise Exception('Cannot delete non-existent dataset %s' % self._full_name) try: self._api.datasets_delete(self._name_parts, delete_contents=delete_contents) except Exception as e: raise e self._info = None return None def create(self, friendly_name=None, description=None): """Creates the Dataset with the specified friendly name and description. Args: friendly_name: (optional) the friendly name for the dataset if it is being created. description: (optional) a description for the dataset if it is being created. Returns: The Dataset. Raises: Exception if the Dataset could not be created. """ if not self.exists(): try: response = self._api.datasets_insert(self._name_parts, friendly_name=friendly_name, description=description) except Exception as e: raise e if 'selfLink' not in response: raise Exception("Could not create dataset %s" % self._full_name) return self def update(self, friendly_name=None, description=None): """ Selectively updates Dataset information. Args: friendly_name: if not None, the new friendly name. description: if not None, the new description. Returns: """ self._get_info() if self._info: if friendly_name: self._info['friendlyName'] = friendly_name if description: self._info['description'] = description try: self._api.datasets_update(self._name_parts, self._info) except Exception as e: raise e finally: self._info = None # need a refresh def _retrieve_items(self, page_token, item_type): try: list_info = self._api.tables_list(self._name_parts, page_token=page_token) except Exception as e: raise e tables = list_info.get('tables', []) contents = [] if len(tables): try: for info in tables: if info['type'] != item_type: continue if info['type'] == 'TABLE': item = _table.Table((info['tableReference']['projectId'], info['tableReference']['datasetId'], info['tableReference']['tableId']), self._context) else: item = _view.View((info['tableReference']['projectId'], info['tableReference']['datasetId'], info['tableReference']['tableId']), self._context) contents.append(item) except KeyError: raise Exception('Unexpected item list response') page_token = list_info.get('nextPageToken', None) return contents, page_token def _retrieve_tables(self, page_token, _): return self._retrieve_items(page_token=page_token, item_type='TABLE') def _retrieve_views(self, page_token, _): return self._retrieve_items(page_token=page_token, item_type='VIEW') def tables(self): """ Returns an iterator for iterating through the Tables in the dataset. 
""" return iter(datalab.utils.Iterator(self._retrieve_tables)) def views(self): """ Returns an iterator for iterating through the Views in the dataset. """ return iter(datalab.utils.Iterator(self._retrieve_views)) def __iter__(self): """ Returns an iterator for iterating through the Tables in the dataset. """ return self.tables() def __str__(self): """Returns a string representation of the dataset using its specified name. Returns: The string representation of this object. """ return self._full_name def __repr__(self): """Returns a representation for the dataset for showing in the notebook. """ return 'Dataset %s' % self._full_name class Datasets(object): """ Iterator class for enumerating the datasets in a project. """ def __init__(self, project_id=None, context=None): """ Initialize the Datasets object. Args: project_id: the ID of the project whose datasets you want to list. If None defaults to the project in the context. context: an optional Context object providing project_id and credentials. If a specific project id or credentials are unspecified, the default ones configured at the global level are used. """ if context is None: context = datalab.context.Context.default() self._context = context self._api = _api.Api(context) self._project_id = project_id if project_id else self._api.project_id def _retrieve_datasets(self, page_token, count): try: list_info = self._api.datasets_list(self._project_id, max_results=count, page_token=page_token) except Exception as e: raise e datasets = list_info.get('datasets', []) if len(datasets): try: datasets = [Dataset((info['datasetReference']['projectId'], info['datasetReference']['datasetId']), self._context) for info in datasets] except KeyError: raise Exception('Unexpected response from server.') page_token = list_info.get('nextPageToken', None) return datasets, page_token def __iter__(self): """ Returns an iterator for iterating through the Datasets in the project. """ return iter(datalab.utils.Iterator(self._retrieve_datasets)) ================================================ FILE: datalab/bigquery/_dialect.py ================================================ # Copyright 2016 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Google Cloud Platform library - BigQuery SQL Dialect""" from __future__ import absolute_import class Dialect(object): """ Represents the default BigQuery SQL dialect """ _global_dialect = None def __init__(self, bq_dialect): self._global_dialect = bq_dialect @property def bq_dialect(self): """Retrieves the value of the bq_dialect property. Returns: The default BigQuery SQL dialect """ return self._global_dialect def set_bq_dialect(self, bq_dialect): """ Set the default BigQuery SQL dialect""" if bq_dialect in ['legacy', 'standard']: self._global_dialect = bq_dialect @staticmethod def default(): """Retrieves the default BigQuery SQL dialect, creating it if necessary. Returns: An initialized and shared instance of a Dialect object. 
""" if Dialect._global_dialect is None: Dialect._global_dialect = Dialect('legacy') return Dialect._global_dialect ================================================ FILE: datalab/bigquery/_federated_table.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements External Table functionality.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import object from . import _csv_options class FederatedTable(object): @staticmethod def from_storage(source, source_format='csv', csv_options=None, ignore_unknown_values=False, max_bad_records=0, compressed=False, schema=None): """ Create an external table for a GCS object. Args: source: the URL of the source objects(s). Can include a wildcard '*' at the end of the item name. Can be a single source or a list. source_format: the format of the data, 'csv' or 'json'; default 'csv'. csv_options: For CSV files, the options such as quote character and delimiter. ignore_unknown_values: If True, accept rows that contain values that do not match the schema; the unknown values are ignored (default False). max_bad_records: The maximum number of bad records that are allowed (and ignored) before returning an 'invalid' error in the Job result (default 0). compressed: whether the data is GZ compressed or not (default False). Note that compressed data can be used as a federated table but cannot be loaded into a BQ Table. schema: the schema of the data. This is required for this table to be used as a federated table or to be loaded using a Table object that itself has no schema (default None). """ result = FederatedTable() # Do some sanity checking and concert some params from friendly form to form used by BQ. if source_format == 'csv': result._bq_source_format = 'CSV' if csv_options is None: csv_options = _csv_options.CSVOptions() # use defaults elif source_format == 'json': if csv_options: raise Exception('CSV options are not support for JSON tables') result._bq_source_format = 'NEWLINE_DELIMITED_JSON' else: raise Exception("Invalid source format %s" % source_format) result._source = source if isinstance(source, list) else [source] result._source_format = source_format result._csv_options = csv_options result._ignore_unknown_values = ignore_unknown_values result._max_bad_records = max_bad_records result._compressed = compressed result._schema = schema return result def __init__(self): """ Create an external table reference. Do not call this directly; use factory method(s). """ # Do some sanity checking and concert some params from friendly form to form used by BQ. self._bq_source_format = None self._source = None self._source_format = None self._csv_options = None self._ignore_unknown_values = None self._max_bad_records = None self._compressed = None self._schema = None @property def schema(self): return self._schema def _to_query_json(self): """ Return the table as a dictionary to be used as JSON in a query job. 
""" json = { 'compression': 'GZIP' if self._compressed else 'NONE', 'ignoreUnknownValues': self._ignore_unknown_values, 'maxBadRecords': self._max_bad_records, 'sourceFormat': self._bq_source_format, 'sourceUris': self._source, } if self._source_format == 'csv' and self._csv_options: json['csvOptions'] = {} json['csvOptions'].update(self._csv_options._to_query_json()) if self._schema: json['schema'] = {'fields': self._schema._bq_schema} return json ================================================ FILE: datalab/bigquery/_job.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements BigQuery Job functionality.""" from __future__ import absolute_import from __future__ import unicode_literals from __future__ import division import datetime import datalab.utils from . import _api class Job(datalab.utils.GCPJob): """Represents a BigQuery Job. """ def __init__(self, job_id, context): """Initializes an instance of a Job. Args: job_id: the BigQuery job ID corresponding to this job. context: a Context object providing project_id and credentials. """ super(Job, self).__init__(job_id, context) def _create_api(self, context): return _api.Api(context) def _refresh_state(self): """ Get the state of a job. If the job is complete this does nothing; otherwise it gets a refreshed copy of the job resource. """ # TODO(gram): should we put a choke on refreshes? E.g. if the last call was less than # a second ago should we return the cached value? if self._is_complete: return try: response = self._api.jobs_get(self._job_id) except Exception as e: raise e if 'status' in response: status = response['status'] if 'state' in status and status['state'] == 'DONE': self._end_time = datetime.datetime.utcnow() self._is_complete = True self._process_job_status(status) if 'statistics' in response: statistics = response['statistics'] start_time = statistics.get('creationTime', None) end_time = statistics.get('endTime', None) if start_time and end_time and end_time >= start_time: self._start_time = datetime.datetime.fromtimestamp(float(start_time) / 1000.0) self._end_time = datetime.datetime.fromtimestamp(float(end_time) / 1000.0) def _process_job_status(self, status): if 'errorResult' in status: error_result = status['errorResult'] location = error_result.get('location', None) message = error_result.get('message', None) reason = error_result.get('reason', None) self._fatal_error = datalab.utils.JobError(location, message, reason) if 'errors' in status: self._errors = [] for error in status['errors']: location = error.get('location', None) message = error.get('message', None) reason = error.get('reason', None) self._errors.append(datalab.utils.JobError(location, message, reason)) ================================================ FILE: datalab/bigquery/_parser.py ================================================ # Copyright 2015 Google Inc. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements BigQuery related data parsing helpers.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from builtins import zip from builtins import str from builtins import object import datetime class Parser(object): """A set of helper functions to parse data in BigQuery responses.""" def __init__(self): pass @staticmethod def parse_row(schema, data): """Parses a row from query results into an equivalent object. Args: schema: the array of fields defining the schema of the data. data: the JSON row from a query result. Returns: The parsed row object. """ def parse_value(data_type, value): """Parses a value returned from a BigQuery response. Args: data_type: the type of the value as specified by the schema. value: the raw value to return (before casting to data_type). Returns: The value cast to the data_type. """ if value is not None: if value == 'null': value = None elif data_type == 'INTEGER': value = int(value) elif data_type == 'FLOAT': value = float(value) elif data_type == 'TIMESTAMP': value = datetime.datetime.utcfromtimestamp(float(value)) elif data_type == 'BOOLEAN': value = value == 'true' elif (type(value) != str): # TODO(gram): Handle nested JSON records value = str(value) return value row = {} if data is None: return row for i, (field, schema_field) in enumerate(zip(data['f'], schema)): val = field['v'] name = schema_field['name'] data_type = schema_field['type'] repeated = True if 'mode' in schema_field and schema_field['mode'] == 'REPEATED' else False if repeated and val is None: row[name] = [] elif data_type == 'RECORD': sub_schema = schema_field['fields'] if repeated: row[name] = [Parser.parse_row(sub_schema, v['v']) for v in val] else: row[name] = Parser.parse_row(sub_schema, val) elif repeated: row[name] = [parse_value(data_type, v['v']) for v in val] else: row[name] = parse_value(data_type, val) return row @staticmethod def parse_timestamp(value): """Parses a timestamp. Args: value: the number of milliseconds since epoch. """ return datetime.datetime.utcfromtimestamp(float(value) / 1000.0) ================================================ FILE: datalab/bigquery/_query.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. 
"""Implements Query BigQuery API.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import object import datalab.context import datalab.data import datalab.utils from . import _api from . import _federated_table from . import _query_job from . import _sampling from . import _udf from . import _utils class Query(object): """Represents a Query object that encapsulates a BigQuery SQL query. This object can be used to execute SQL queries and retrieve results. """ @staticmethod def sampling_query(sql, context, fields=None, count=5, sampling=None, udfs=None, data_sources=None): """Returns a sampling Query for the SQL object. Args: sql: the SQL statement (string) or Query object to sample. context: a Context object providing project_id and credentials. fields: an optional list of field names to retrieve. count: an optional count of rows to retrieve which is used if a specific sampling is not specified. sampling: an optional sampling strategy to apply to the table. udfs: array of UDFs referenced in the SQL. data_sources: dictionary of federated (external) tables referenced in the SQL. Returns: A Query object for sampling the table. """ return Query(_sampling.Sampling.sampling_query(sql, fields, count, sampling), context=context, udfs=udfs, data_sources=data_sources) def __init__(self, sql, context=None, values=None, udfs=None, data_sources=None, **kwargs): """Initializes an instance of a Query object. Note that either values or kwargs may be used, but not both. Args: sql: the BigQuery SQL query string to execute, or a SqlStatement object. The latter will have any variable references replaced before being associated with the Query (i.e. once constructed the SQL associated with a Query is static). It is possible to have variable references in a query string too provided the variables are passed as keyword arguments to this constructor. context: an optional Context object providing project_id and credentials. If a specific project id or credentials are unspecified, the default ones configured at the global level are used. values: a dictionary used to expand variables if passed a SqlStatement or a string with variable references. udfs: array of UDFs referenced in the SQL. data_sources: dictionary of federated (external) tables referenced in the SQL. kwargs: arguments to use when expanding the variables if passed a SqlStatement or a string with variable references. Raises: Exception if expansion of any variables failed. """ if context is None: context = datalab.context.Context.default() self._context = context self._api = _api.Api(context) self._data_sources = data_sources self._udfs = udfs if data_sources is None: data_sources = {} self._results = None self._code = None self._imports = [] if values is None: values = kwargs self._sql = datalab.data.SqlModule.expand(sql, values) # We need to take care not to include the same UDF code twice so we use sets. 
udfs = set(udfs if udfs else []) for value in list(values.values()): if isinstance(value, _udf.UDF): udfs.add(value) included_udfs = set([]) tokens = datalab.data.tokenize(self._sql) udf_dict = {udf.name: udf for udf in udfs} for i, token in enumerate(tokens): # Find the preceding and following non-whitespace tokens prior = i - 1 while prior >= 0 and tokens[prior].isspace(): prior -= 1 if prior < 0: continue next = i + 1 while next < len(tokens) and tokens[next].isspace(): next += 1 if next >= len(tokens): continue uprior = tokens[prior].upper() if uprior != 'FROM' and uprior != 'JOIN': continue # Check for external tables. if tokens[next] not in "[('\"": if token not in data_sources: if values and token in values: if isinstance(values[token], _federated_table.FederatedTable): data_sources[token] = values[token] # Now check for UDF calls. if uprior != 'FROM' or tokens[next] != '(': continue # We have a 'FROM token (' sequence. if token in udf_dict: udf = udf_dict[token] if token not in included_udfs: included_udfs.add(token) if self._code is None: self._code = [] self._code.append(udf.code) if udf.imports: self._imports.extend(udf.imports) fields = ', '.join([f[0] for f in udf._outputs]) tokens[i] = '(SELECT %s FROM %s' % (fields, token) # Find the closing parenthesis and add the additional one now needed. num_paren = 0 j = i + 1 while j < len(tokens): if tokens[j] == '(': num_paren += 1 elif tokens[j] == ')': num_paren -= 1 if num_paren == 0: tokens[j] = '))' break j += 1 self._external_tables = None if len(data_sources): self._external_tables = {} for name, table in list(data_sources.items()): if table.schema is None: raise Exception('Referenced external table %s has no known schema' % name) self._external_tables[name] = table._to_query_json() self._sql = ''.join(tokens) def _repr_sql_(self): """Creates a SQL representation of this object. Returns: The SQL representation to use when embedding this object into other SQL. """ return '(%s)' % self._sql def __str__(self): """Creates a string representation of this object. Returns: The string representation of this object (the unmodified SQL). """ return self._sql def __repr__(self): """Creates a friendly representation of this object. Returns: The friendly representation of this object (the unmodified SQL). """ return self._sql @property def sql(self): """ Get the SQL for the query. """ return self._sql @property def scripts(self): """ Get the code for any Javascript UDFs used in the query. """ return self._code def results(self, use_cache=True, dialect=None, billing_tier=None): """Retrieves table of results for the query. May block if the query must be executed first. Args: use_cache: whether to use cached results or not. Ignored if append is specified. dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A QueryResultsTable containing the result set. Raises: Exception if the query could not be executed or query response was malformed. 
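    Example (a minimal sketch, assuming the default Context is configured; uses the public
    Shakespeare sample table):

      q = Query('SELECT corpus, COUNT(*) AS n FROM [publicdata:samples.shakespeare] '
                'GROUP BY corpus')
      tbl = q.results()        # runs the query (or reuses cached results)
      df = tbl.to_dataframe()  # the QueryResultsTable can also be sampled or extracted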
""" if not use_cache or (self._results is None): self.execute(use_cache=use_cache, dialect=dialect, billing_tier=billing_tier) return self._results.results def extract(self, storage_uris, format='csv', csv_delimiter=',', csv_header=True, compress=False, use_cache=True, dialect=None, billing_tier=None): """Exports the query results to GCS. Args: storage_uris: the destination URI(s). Can be a single URI or a list. format: the format to use for the exported data; one of 'csv', 'json', or 'avro' (default 'csv'). csv_delimiter: for csv exports, the field delimiter to use (default ','). csv_header: for csv exports, whether to include an initial header line (default True). compress: whether to compress the data on export. Compression is not supported for AVRO format (default False). use_cache: whether to use cached results or not (default True). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A Job object for the export Job if it was completed successfully; else None. Raises: An Exception if the query or extract failed. """ return self.results(use_cache=use_cache, dialect=dialect, billing_tier=billing_tier).extract(storage_uris, format=format, csv_delimiter=csv_delimiter, csv_header=csv_header, compress=compress) @datalab.utils.async_method def extract_async(self, storage_uris, format='csv', csv_delimiter=',', csv_header=True, compress=False, use_cache=True, dialect=None, billing_tier=None): """Exports the query results to GCS. Returns a Job immediately. Note that there are two jobs that may need to be run sequentially, one to run the query, and the second to extract the resulting table. These are wrapped by a single outer Job. If the query has already been executed and you would prefer to get a Job just for the extract, you can can call extract_async on the QueryResultsTable instead; i.e.: query.results().extract_async(...) Args: storage_uris: the destination URI(s). Can be a single URI or a list. format: the format to use for the exported data; one of 'csv', 'json', or 'avro' (default 'csv'). csv_delimiter: for CSV exports, the field delimiter to use (default ','). csv_header: for CSV exports, whether to include an initial header line (default True). compress: whether to compress the data on export. Compression is not supported for AVRO format (default False). use_cache: whether to use cached results or not (default True). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A Job for the combined (execute, extract) task that will in turn return the Job object for the completed extract task when done; else None. Raises: An Exception if the query failed. 
""" return self.extract(storage_uris, format=format, csv_delimiter=csv_delimiter, csv_header=csv_header, use_cache=use_cache, compress=compress, dialect=dialect, billing_tier=billing_tier) def to_dataframe(self, start_row=0, max_rows=None, use_cache=True, dialect=None, billing_tier=None): """ Exports the query results to a Pandas dataframe. Args: start_row: the row of the table at which to start the export (default 0). max_rows: an upper limit on the number of rows to export (default None). use_cache: whether to use cached results or not (default True). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A Pandas dataframe containing the table data. """ return self.results(use_cache=use_cache, dialect=dialect, billing_tier=billing_tier) \ .to_dataframe(start_row=start_row, max_rows=max_rows) def to_file(self, path, format='csv', csv_delimiter=',', csv_header=True, use_cache=True, dialect=None, billing_tier=None): """Save the results to a local file in CSV format. Args: path: path on the local filesystem for the saved results. format: the format to use for the exported data; currently only 'csv' is supported. csv_delimiter: for CSV exports, the field delimiter to use. Defaults to ',' csv_header: for CSV exports, whether to include an initial header line. Default true. use_cache: whether to use cached results or not. dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: The path to the local file. Raises: An Exception if the operation failed. """ self.results(use_cache=use_cache, dialect=dialect, billing_tier=billing_tier) \ .to_file(path, format=format, csv_delimiter=csv_delimiter, csv_header=csv_header) return path @datalab.utils.async_method def to_file_async(self, path, format='csv', csv_delimiter=',', csv_header=True, use_cache=True, dialect=None, billing_tier=None): """Save the results to a local file in CSV format. Returns a Job immediately. Args: path: path on the local filesystem for the saved results. format: the format to use for the exported data; currently only 'csv' is supported. csv_delimiter: for CSV exports, the field delimiter to use. Defaults to ',' csv_header: for CSV exports, whether to include an initial header line. Default true. use_cache: whether to use cached results or not. dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. 
This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A Job for the save that returns the path to the local file on completion. Raises: An Exception if the operation failed. """ return self.to_file(path, format=format, csv_delimiter=csv_delimiter, csv_header=csv_header, use_cache=use_cache, dialect=dialect, billing_tier=billing_tier) def sample(self, count=5, fields=None, sampling=None, use_cache=True, dialect=None, billing_tier=None): """Retrieves a sampling of rows for the query. Args: count: an optional count of rows to retrieve which is used if a specific sampling is not specified (default 5). fields: the list of fields to sample (default None implies all). sampling: an optional sampling strategy to apply to the table. use_cache: whether to use cached results or not (default True). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A QueryResultsTable containing a sampling of the result set. Raises: Exception if the query could not be executed or query response was malformed. """ return Query.sampling_query(self._sql, self._context, count=count, fields=fields, sampling=sampling, udfs=self._udfs, data_sources=self._data_sources).results(use_cache=use_cache, dialect=dialect, billing_tier=billing_tier) def execute_dry_run(self, dialect=None, billing_tier=None): """Dry run a query, to check the validity of the query and return some useful statistics. Args: dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A dict with 'cacheHit' and 'totalBytesProcessed' fields. Raises: An exception if the query was malformed. """ try: query_result = self._api.jobs_insert_query(self._sql, self._code, self._imports, dry_run=True, table_definitions=self._external_tables, dialect=dialect, billing_tier=billing_tier) except Exception as e: raise e return query_result['statistics']['query'] def execute_async(self, table_name=None, table_mode='create', use_cache=True, priority='interactive', allow_large_results=False, dialect=None, billing_tier=None): """ Initiate the query and return a QueryJob. Args: table_name: the result table name as a string or TableName; if None (the default), then a temporary table will be used. table_mode: one of 'create', 'overwrite' or 'append'. If 'create' (the default), the request will fail if the table exists. use_cache: whether to use past query results or ignore cache. Has no effect if destination is specified (default True). priority:one of 'batch' or 'interactive' (default). 'interactive' jobs should be scheduled to run quickly but are subject to rate limits; 'batch' jobs could be delayed by as much as three hours but are not rate-limited. 
allow_large_results: whether to allow large results; i.e. compressed data over 100MB. This is slower and requires a table_name to be specified) (default False). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: A QueryJob. Raises: Exception if query could not be executed. """ batch = priority == 'low' append = table_mode == 'append' overwrite = table_mode == 'overwrite' if table_name is not None: table_name = _utils.parse_table_name(table_name, self._api.project_id) try: query_result = self._api.jobs_insert_query(self._sql, self._code, self._imports, table_name=table_name, append=append, overwrite=overwrite, use_cache=use_cache, batch=batch, allow_large_results=allow_large_results, table_definitions=self._external_tables, dialect=dialect, billing_tier=billing_tier) except Exception as e: raise e if 'jobReference' not in query_result: raise Exception('Unexpected response from server') job_id = query_result['jobReference']['jobId'] if not table_name: try: destination = query_result['configuration']['query']['destinationTable'] table_name = (destination['projectId'], destination['datasetId'], destination['tableId']) except KeyError: # The query was in error raise Exception(_utils.format_query_errors(query_result['status']['errors'])) return _query_job.QueryJob(job_id, table_name, self._sql, context=self._context) def execute(self, table_name=None, table_mode='create', use_cache=True, priority='interactive', allow_large_results=False, dialect=None, billing_tier=None): """ Initiate the query, blocking until complete and then return the results. Args: table_name: the result table name as a string or TableName; if None (the default), then a temporary table will be used. table_mode: one of 'create', 'overwrite' or 'append'. If 'create' (the default), the request will fail if the table exists. use_cache: whether to use past query results or ignore cache. Has no effect if destination is specified (default True). priority:one of 'batch' or 'interactive' (default). 'interactive' jobs should be scheduled to run quickly but are subject to rate limits; 'batch' jobs could be delayed by as much as three hours but are not rate-limited. allow_large_results: whether to allow large results; i.e. compressed data over 100MB. This is slower and requires a table_name to be specified) (default False). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. billing_tier: Limits the billing tier for this job. Queries that have resource usage beyond this tier will fail (without incurring a charge). If unspecified, this will be set to your project default. This can also be used to override your project-wide default billing tier on a per-query basis. Returns: The QueryResultsTable for the query. Raises: Exception if query could not be executed. 
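    Example (a minimal sketch; 'mydata.results' is a placeholder destination table):

      q = Query('SELECT corpus FROM [publicdata:samples.shakespeare] GROUP BY corpus')
      tbl = q.execute(table_name='mydata.results', table_mode='overwrite')
      # tbl is a QueryResultsTable; tbl.job_id and tbl.sql identify the completed job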
""" job = self.execute_async(table_name=table_name, table_mode=table_mode, use_cache=use_cache, priority=priority, allow_large_results=allow_large_results, dialect=dialect, billing_tier=billing_tier) self._results = job.wait() return self._results def to_view(self, view_name): """ Create a View from this Query. Args: view_name: the name of the View either as a string or a 3-part tuple (projectid, datasetid, name). Returns: A View for the Query. """ # Do the import here to avoid circular dependencies at top-level. from . import _view return _view.View(view_name, self._context).create(self._sql) ================================================ FILE: datalab/bigquery/_query_job.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements BigQuery query job functionality.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import str from . import _job from . import _query_results_table class QueryJob(_job.Job): """ Represents a BigQuery Query Job. """ def __init__(self, job_id, table_name, sql, context): """ Initializes a QueryJob object. Args: job_id: the ID of the query job. table_name: the name of the table where the query results will be stored. sql: the SQL statement that was executed for the query. context: the Context object providing project_id and credentials that was used when executing the query. """ super(QueryJob, self).__init__(job_id, context) self._sql = sql self._table = _query_results_table.QueryResultsTable(table_name, context, self, is_temporary=True) self._bytes_processed = None self._cache_hit = None self._total_rows = None @property def bytes_processed(self): """ The number of bytes processed, or None if the job is not complete. """ return self._bytes_processed @property def total_rows(self): """ The total number of rows in the result, or None if not complete. """ return self._total_rows @property def cache_hit(self): """ Whether the query results were obtained from the cache or not, or None if not complete. """ return self._cache_hit @property def sql(self): """ The SQL statement that was executed for the query. """ return self._sql def wait(self, timeout=None): """ Wait for the job to complete, or a timeout to happen. This is more efficient than the version in the base Job class, in that we can use a call that blocks for the poll duration rather than a sleep. That means we shouldn't block unnecessarily long and can also poll less. Args: timeout: how long to wait (in seconds) before giving up; default None which means no timeout. 
Returns: The QueryJob """ poll = 30 while not self._is_complete: try: query_result = self._api.jobs_query_results(self._job_id, project_id=self._context.project_id, page_size=0, timeout=poll * 1000) except Exception as e: raise e if query_result['jobComplete']: if 'totalBytesProcessed' in query_result: self._bytes_processed = int(query_result['totalBytesProcessed']) self._cache_hit = query_result.get('cacheHit', None) if 'totalRows' in query_result: self._total_rows = int(query_result['totalRows']) break if timeout is not None: timeout -= poll if timeout <= 0: break self._refresh_state() return self @property def results(self): """ Get the table used for the results of the query. If the query is incomplete, this blocks. Raises: Exception if we timed out waiting for results or the query failed. """ self.wait() if self.failed: raise Exception('Query failed: %s' % str(self.errors)) return self._table ================================================ FILE: datalab/bigquery/_query_results_table.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements BigQuery query job results table functionality.""" from __future__ import absolute_import from __future__ import unicode_literals from . import _table class QueryResultsTable(_table.Table): """ A subclass of Table specifically for Query results. The primary differences are the additional properties job_id and sql. """ def __init__(self, name, context, job, is_temporary=False): """Initializes an instance of a Table object. Args: name: the name of the table either as a string or a 3-part tuple (projectid, datasetid, name). context: an optional Context object providing project_id and credentials. If a specific project id or credentials are unspecified, the default ones configured at the global level are used. job: the QueryJob associated with these results. is_temporary: if True, this is a short-lived table for intermediate results (default False). """ super(QueryResultsTable, self).__init__(name, context) self._job = job self._is_temporary = is_temporary def __repr__(self): """Returns a representation for the dataset for showing in the notebook. """ if self._is_temporary: return 'QueryResultsTable %s' % self.job_id else: return super(QueryResultsTable, self).__repr__() @property def job(self): """ The QueryJob object that caused the table to be populated. """ return self._job @property def job_id(self): """ The ID of the query job that caused the table to be populated. """ return self._job.id @property def sql(self): """ The SQL statement for the query that populated the table. """ return self._job.sql @property def is_temporary(self): """ Whether this is a short-lived table or not. """ return self._is_temporary ================================================ FILE: datalab/bigquery/_query_stats.py ================================================ # Copyright 2015 Google Inc. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements representation of BigQuery query job dry run results.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import object class QueryStats(object): """A wrapper for statistics returned by a dry run query. Useful so we can get an HTML representation in a notebook. """ def __init__(self, total_bytes, is_cached): self.total_bytes = float(total_bytes) self.is_cached = is_cached def _repr_html_(self): self.total_bytes = QueryStats._size_formatter(self.total_bytes) return """
Dry run information: %s to process, results %s
""" % (self.total_bytes, "cached" if self.is_cached else "not cached") @staticmethod def _size_formatter(byte_num, suf='B'): for mag in ['', 'K', 'M', 'G', 'T']: if byte_num < 1000.0: if suf == 'B': # Don't do fractional bytes return "%5d%s%s" % (int(byte_num), mag, suf) return "%3.1f%s%s" % (byte_num, mag, suf) byte_num /= 1000.0 return "%.1f%s%s".format(byte_num, 'P', suf) ================================================ FILE: datalab/bigquery/_sampling.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Sampling for BigQuery.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from builtins import object class Sampling(object): """Provides common sampling strategies. Sampling strategies can be used for sampling tables or queries. They are implemented as functions that take in a SQL statement representing the table or query that should be sampled, and return a new SQL statement that limits the result set in some manner. """ def __init__(self): pass @staticmethod def _create_projection(fields): """Creates a projection for use in a SELECT statement. Args: fields: the list of fields to be specified. """ if (fields is None) or (len(fields) == 0): return '*' return ','.join(fields) @staticmethod def default(fields=None, count=5): """Provides a simple default sampling strategy which limits the result set by a count. Args: fields: an optional list of field names to retrieve. count: optional number of rows to limit the sampled results to. Returns: A sampling function that can be applied to get a random sampling. """ projection = Sampling._create_projection(fields) return lambda sql: 'SELECT %s FROM (%s) LIMIT %d' % (projection, sql, count) @staticmethod def sorted(field_name, ascending=True, fields=None, count=5): """Provides a sampling strategy that picks from an ordered set of rows. Args: field_name: the name of the field to sort the rows by. ascending: whether to sort in ascending direction or not. fields: an optional list of field names to retrieve. count: optional number of rows to limit the sampled results to. Returns: A sampling function that can be applied to get the initial few rows. """ direction = '' if ascending else ' DESC' projection = Sampling._create_projection(fields) return lambda sql: 'SELECT %s FROM (%s) ORDER BY %s%s LIMIT %d' % (projection, sql, field_name, direction, count) @staticmethod def sampling_query(sql, fields=None, count=5, sampling=None): """Returns a sampling query for the SQL object. Args: sql: the SQL object to sample fields: an optional list of field names to retrieve. count: an optional count of rows to retrieve which is used if a specific sampling is not specified. sampling: an optional sampling strategy to apply to the table. Returns: A SQL query string for sampling the input sql. 
""" if sampling is None: sampling = Sampling.default(count=count, fields=fields) return sampling(sql) @staticmethod def hashed(field_name, percent, fields=None, count=0): """Provides a sampling strategy based on hashing and selecting a percentage of data. Args: field_name: the name of the field to hash. percent: the percentage of the resulting hashes to select. fields: an optional list of field names to retrieve. count: optional maximum count of rows to pick. Returns: A sampling function that can be applied to get a hash-based sampling. """ def _hashed_sampling(sql): projection = Sampling._create_projection(fields) sql = 'SELECT %s FROM (%s) WHERE ABS(HASH(%s)) %% 100 < %d' % \ (projection, sql, field_name, percent) if count != 0: sql = '%s LIMIT %d' % (sql, count) return sql return _hashed_sampling @staticmethod def random(percent, fields=None, count=0): """Provides a sampling strategy that picks a semi-random set of rows. Args: percent: the percentage of the resulting hashes to select. fields: an optional list of field names to retrieve. count: maximum number of rows to limit the sampled results to (default 5). Returns: A sampling function that can be applied to get some random rows. In order for this to provide a good random sample percent should be chosen to be ~count/#rows where #rows is the number of rows in the object (query, view or table) being sampled. The rows will be returned in order; i.e. the order itself is not randomized. """ def _random_sampling(sql): projection = Sampling._create_projection(fields) sql = 'SELECT %s FROM (%s) WHERE rand() < %f' % (projection, sql, (float(percent) / 100.0)) if count != 0: sql = '%s LIMIT %d' % (sql, count) return sql return _random_sampling ================================================ FILE: datalab/bigquery/_schema.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements Table and View Schema APIs.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import str from builtins import range from past.builtins import basestring from builtins import object import datetime import pandas class Schema(list): """Represents the schema of a BigQuery table as a flattened list of objects representing fields. Each field object has name, data_type, mode and description properties. Nested fields get flattened with their full-qualified names. So a Schema that has an object A with nested field B will be represented as [(name: 'A', ...), (name: 'A.b', ...)]. """ class Field(object): """ Represents a single field in a Table schema. This has the properties: - name: the flattened, full-qualified name of the field. - data_type: the type of the field as a string ('INTEGER', 'BOOLEAN', 'FLOAT', 'STRING' or 'TIMESTAMP'). - mode: the mode of the field; 'NULLABLE' by default. - description: a description of the field, if known; empty string by default. """ # TODO(gram): consider renaming data_type member to type. 
Yes, it shadows top-level # name but that is what we are using in __str__ and __getitem__ and is what is used in BQ. # The shadowing is unlikely to cause problems. def __init__(self, name, data_type, mode='NULLABLE', description=''): self.name = name self.data_type = data_type self.mode = mode self.description = description def _repr_sql_(self): """Returns a representation of the field for embedding into a SQL statement. Returns: A formatted field name for use within SQL statements. """ return self.name def __eq__(self, other): """ Compare two schema field objects for equality (ignoring description). """ return self.name == other.name and self.data_type == other.data_type\ and self.mode == other.mode def __str__(self): """ Returns the schema field as a string form of a dictionary. """ return "{ 'name': '%s', 'type': '%s', 'mode':'%s', 'description': '%s' }" % \ (self.name, self.data_type, self.mode, self.description) def __repr__(self): """ Returns the schema field as a string form of a dictionary. """ return str(self) def __getitem__(self, item): # TODO(gram): Currently we need this for a Schema object to work with the Parser object. # Eventually if we change Parser to only work with Schema (and not also with the # schema dictionaries in query results) we can remove this. if item == 'name': return self.name if item == 'type': return self.data_type if item == 'mode': return self.mode if item == 'description': return self.description @staticmethod def _from_dataframe(dataframe, default_type='STRING'): """ Infer a BigQuery table schema from a Pandas dataframe. Note that if you don't explicitly set the types of the columns in the dataframe, they may be of a type that forces coercion to STRING, so even though the fields in the dataframe themselves may be numeric, the type in the derived schema may not be. Hence it is prudent to make sure the Pandas dataframe is typed correctly. Args: dataframe: The DataFrame. default_type : The default big query type in case the type of the column does not exist in the schema. Defaults to 'STRING'. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ type_mapping = { 'i': 'INTEGER', 'b': 'BOOLEAN', 'f': 'FLOAT', 'O': 'STRING', 'S': 'STRING', 'U': 'STRING', 'M': 'TIMESTAMP' } fields = [] for column_name, dtype in dataframe.dtypes.iteritems(): fields.append({'name': column_name, 'type': type_mapping.get(dtype.kind, default_type)}) return fields @staticmethod def from_dataframe(dataframe, default_type='STRING'): """ Infer a BigQuery table schema from a Pandas dataframe. Note that if you don't explicitly set the types of the columns in the dataframe, they may be of a type that forces coercion to STRING, so even though the fields in the dataframe themselves may be numeric, the type in the derived schema may not be. Hence it is prudent to make sure the Pandas dataframe is typed correctly. Args: dataframe: The DataFrame. default_type : The default big query type in case the type of the column does not exist in the schema. Defaults to 'STRING'. Returns: A Schema. 
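    Example (a minimal sketch):

      import pandas
      df = pandas.DataFrame({'name': ['a', 'b'], 'value': [1, 2]})
      schema = Schema.from_dataframe(df)
      # the inferred schema maps 'name' to STRING and 'value' to INTEGER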
""" return Schema(Schema._from_dataframe(dataframe, default_type=default_type)) @staticmethod def _get_field_entry(name, value): entry = {'name': name} if isinstance(value, datetime.datetime): _type = 'TIMESTAMP' elif isinstance(value, bool): _type = 'BOOLEAN' elif isinstance(value, float): _type = 'FLOAT' elif isinstance(value, int): _type = 'INTEGER' elif isinstance(value, dict) or isinstance(value, list): _type = 'RECORD' entry['fields'] = Schema._from_record(value) else: _type = 'STRING' entry['type'] = _type return entry @staticmethod def _from_dict_record(data): """ Infer a BigQuery table schema from a dictionary. If the dictionary has entries that are in turn OrderedDicts these will be turned into RECORD types. Ideally this will be an OrderedDict but it is not required. Args: data: The dict to infer a schema from. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ return [Schema._get_field_entry(name, value) for name, value in list(data.items())] @staticmethod def _from_list_record(data): """ Infer a BigQuery table schema from a list of values. Args: data: The list of values. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ return [Schema._get_field_entry('Column%d' % (i + 1), value) for i, value in enumerate(data)] @staticmethod def _from_record(data): """ Infer a BigQuery table schema from a list of fields or a dictionary. The typeof the elements is used. For a list, the field names are simply 'Column1', 'Column2', etc. Args: data: The list of fields or dictionary. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ if isinstance(data, dict): return Schema._from_dict_record(data) elif isinstance(data, list): return Schema._from_list_record(data) else: raise Exception('Cannot create a schema from record %s' % str(data)) @staticmethod def from_record(source): """ Infers a table/view schema from a single record that can contain a list of fields or a dictionary of fields. The type of the elements is used for the types in the schema. For a dict, key names are used for column names while for a list, the field names are simply named 'Column1', 'Column2', etc. Note that if using a dict you may want to use an OrderedDict to ensure column ordering is deterministic. Args: source: The list of field values or dictionary of key/values. Returns: A Schema for the data. """ # TODO(gram): may want to allow an optional second argument which is a list of field # names; could be useful for the record-containing-list case. return Schema(Schema._from_record(source)) @staticmethod def from_data(source): """Infers a table/view schema from its JSON representation, a list of records, or a Pandas dataframe. Args: source: the Pandas Dataframe, a dictionary representing a record, a list of heterogeneous data (record) or homogeneous data (list of records) from which to infer the schema, or a definition of the schema as a list of dictionaries with 'name' and 'type' entries and possibly 'mode' and 'description' entries. Only used if no data argument was provided. 'mode' can be 'NULLABLE', 'REQUIRED' or 'REPEATED'. 
For the allowed types, see: https://cloud.google.com/bigquery/preparing-data-for-bigquery#datatypes Note that there is potential ambiguity when passing a list of lists or a list of dicts between whether that should be treated as a list of records or a single record that is a list. The heuristic used is to check the length of the entries in the list; if they are equal then a list of records is assumed. To avoid this ambiguity you can instead use the Schema.from_record method which assumes a single record, in either list of values or dictionary of key-values form. Returns: A Schema for the data. """ if isinstance(source, pandas.DataFrame): bq_schema = Schema._from_dataframe(source) elif isinstance(source, list): if len(source) == 0: bq_schema = source elif all(isinstance(d, dict) for d in source): if all('name' in d and 'type' in d for d in source): # It looks like a bq_schema; use it as-is. bq_schema = source elif all(len(d) == len(source[0]) for d in source): bq_schema = Schema._from_dict_record(source[0]) else: raise Exception(('Cannot create a schema from heterogeneous list %s; perhaps you meant ' + 'to use Schema.from_record?') % str(source)) elif isinstance(source[0], list) and \ all([isinstance(l, list) and len(l) == len(source[0]) for l in source]): # A list of lists all of the same length; treat first entry as a list record. bq_schema = Schema._from_record(source[0]) else: # A heterogeneous list; treat as a record. raise Exception(('Cannot create a schema from heterogeneous list %s; perhaps you meant ' + 'to use Schema.from_record?') % str(source)) elif isinstance(source, dict): raise Exception(('Cannot create a schema from dict %s; perhaps you meant to use ' + 'Schema.from_record?') % str(source)) else: raise Exception('Cannot create a schema from %s' % str(source)) return Schema(bq_schema) def __init__(self, definition=None): """Initializes a Schema from its raw JSON representation, a Pandas Dataframe, or a list. Args: definition: a definition of the schema as a list of dictionaries with 'name' and 'type' entries and possibly 'mode' and 'description' entries. Only used if no data argument was provided. 'mode' can be 'NULLABLE', 'REQUIRED' or 'REPEATED'. For the allowed types, see: https://cloud.google.com/bigquery/preparing-data-for-bigquery#datatypes """ super(Schema, self).__init__() self._map = {} self._bq_schema = definition self._populate_fields(definition) def __getitem__(self, key): """Provides ability to lookup a schema field by position or by name. """ if isinstance(key, basestring): return self._map.get(key, None) # noinspection PyCallByClass return list.__getitem__(self, key) def _add_field(self, name, data_type, mode='NULLABLE', description=''): field = Schema.Field(name, data_type, mode, description) self.append(field) self._map[name] = field def find(self, name): """ Get the index of a field in the flattened list given its (fully-qualified) name. Args: name: the fully-qualified name of the field. Returns: The index of the field, if found; else -1. """ for i in range(0, len(self)): if self[i].name == name: return i return -1 def _populate_fields(self, data, prefix=''): for field_data in data: name = prefix + field_data['name'] data_type = field_data['type'] self._add_field(name, data_type, field_data.get('mode', None), field_data.get('description', None)) if data_type == 'RECORD': # Recurse into the nested fields, using this field's name as a prefix. 
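        # For example, a RECORD field 'address' with a nested field 'city' produces two
        # flattened entries in this Schema: 'address' (a RECORD) and 'address.city'.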
self._populate_fields(field_data.get('fields'), name + '.') def __str__(self): """ Returns a string representation of the non-flattened form of the schema. """ # TODO(gram): We should probably return the flattened form. There was a reason why this is # not but I don't remember what it was. Figure that out and fix this. return str(self._bq_schema) def __eq__(self, other): """ Compares two schema for equality. """ other_map = other._map if len(self._map) != len(other_map): return False for name in self._map.keys(): if name not in other_map: return False if not self._map[name] == other_map[name]: return False return True def __ne__(self, other): """ Compares two schema for inequality. """ return not(self.__eq__(other)) ================================================ FILE: datalab/bigquery/_table.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements Table, and related Table BigQuery APIs.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from builtins import str from past.utils import old_div from builtins import object import calendar import codecs import csv import datetime import pandas import time import traceback import uuid import sys import datalab.context import datalab.utils from . import _api from . import _csv_options from . import _job from . import _parser from . import _schema from . import _utils # import of Query is at end of module as we have a circular dependency of # Query.execute().results -> Table and Table.sample() -> Query class TableMetadata(object): """Represents metadata about a BigQuery table.""" def __init__(self, table, info): """Initializes a TableMetadata instance. Args: table: the Table object this belongs to. info: The BigQuery information about this table as a Python dictionary. """ self._table = table self._info = info @property def created_on(self): """The creation timestamp.""" timestamp = self._info.get('creationTime') return _parser.Parser.parse_timestamp(timestamp) @property def description(self): """The description of the table if it exists.""" return self._info.get('description', '') @property def expires_on(self): """The timestamp for when the table will expire, or None if unknown.""" timestamp = self._info.get('expirationTime', None) if timestamp is None: return None return _parser.Parser.parse_timestamp(timestamp) @property def friendly_name(self): """The friendly name of the table if it exists.""" return self._info.get('friendlyName', '') @property def modified_on(self): """The timestamp for when the table was last modified.""" timestamp = self._info.get('lastModifiedTime') return _parser.Parser.parse_timestamp(timestamp) @property def rows(self): """The number of rows within the table, or -1 if unknown. """ return int(self._info['numRows']) if 'numRows' in self._info else -1 @property def size(self): """The size of the table in bytes, or -1 if unknown. 
""" return int(self._info['numBytes']) if 'numBytes' in self._info else -1 def refresh(self): """ Refresh the metadata. """ self._info = self._table._load_info() class Table(object): """Represents a Table object referencing a BigQuery table. """ # Allowed characters in a BigQuery table column name _VALID_COLUMN_NAME_CHARACTERS = '_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' # When fetching table contents, the max number of rows to fetch per HTTP request _DEFAULT_PAGE_SIZE = 1024 # Milliseconds per week _MSEC_PER_WEEK = 7 * 24 * 3600 * 1000 def __init__(self, name, context=None): """Initializes an instance of a Table object. The Table need not exist yet. Args: name: the name of the table either as a string or a 3-part tuple (projectid, datasetid, name). If a string, it must have the form '| %s | ' % attr) self._segments.append('|||
Dry run information: %s to process, results %s
""" % (self.total_bytes, "cached" if self.is_cached else "not cached") @staticmethod def _size_formatter(byte_num, suf='B'): for mag in ['', 'K', 'M', 'G', 'T']: if byte_num < 1000.0: if suf == 'B': # Don't do fractional bytes return "%5d%s%s" % (int(byte_num), mag, suf) return "%3.1f%s%s" % (byte_num, mag, suf) byte_num /= 1000.0 return "%.1f%s%s".format(byte_num, 'P', suf) ================================================ FILE: google/datalab/bigquery/_sampling.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Sampling for BigQuery.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from builtins import object class Sampling(object): """Provides common sampling strategies. Sampling strategies can be used for sampling tables or queries. They are implemented as functions that take in a SQL statement representing the table or query that should be sampled, and return a new SQL statement that limits the result set in some manner. """ def __init__(self): pass @staticmethod def _create_projection(fields): """Creates a projection for use in a SELECT statement. Args: fields: the list of fields to be specified. """ if (fields is None) or (len(fields) == 0): return '*' return ','.join(fields) @staticmethod def default(fields=None, count=5): """Provides a simple default sampling strategy which limits the result set by a count. Args: fields: an optional list of field names to retrieve. count: optional number of rows to limit the sampled results to. Returns: A sampling function that can be applied to get a random sampling. """ projection = Sampling._create_projection(fields) return lambda sql: 'SELECT %s FROM (%s) LIMIT %d' % (projection, sql, count) @staticmethod def sorted(field_name, ascending=True, fields=None, count=5): """Provides a sampling strategy that picks from an ordered set of rows. Args: field_name: the name of the field to sort the rows by. ascending: whether to sort in ascending direction or not. fields: an optional list of field names to retrieve. count: optional number of rows to limit the sampled results to. Returns: A sampling function that can be applied to get the initial few rows. """ if field_name is None: raise Exception('Sort field must be specified') direction = '' if ascending else ' DESC' projection = Sampling._create_projection(fields) return lambda sql: 'SELECT %s FROM (%s) ORDER BY %s%s LIMIT %d' % (projection, sql, field_name, direction, count) @staticmethod def hashed(field_name, percent, fields=None, count=0): """Provides a sampling strategy based on hashing and selecting a percentage of data. Args: field_name: the name of the field to hash. percent: the percentage of the resulting hashes to select. fields: an optional list of field names to retrieve. count: optional maximum count of rows to pick. Returns: A sampling function that can be applied to get a hash-based sampling. 
""" if field_name is None: raise Exception('Hash field must be specified') def _hashed_sampling(sql): projection = Sampling._create_projection(fields) sql = 'SELECT %s FROM (%s) WHERE MOD(ABS(FARM_FINGERPRINT(CAST(%s AS STRING))), 100) < %d' % \ (projection, sql, field_name, percent) if count != 0: sql = '%s LIMIT %d' % (sql, count) return sql return _hashed_sampling @staticmethod def random(percent, fields=None, count=0): """Provides a sampling strategy that picks a semi-random set of rows. Args: percent: the percentage of the resulting hashes to select. fields: an optional list of field names to retrieve. count: maximum number of rows to limit the sampled results to (default 5). Returns: A sampling function that can be applied to get some random rows. In order for this to provide a good random sample percent should be chosen to be ~count/#rows where #rows is the number of rows in the object (query, view or table) being sampled. The rows will be returned in order; i.e. the order itself is not randomized. """ def _random_sampling(sql): projection = Sampling._create_projection(fields) sql = 'SELECT %s FROM (%s) WHERE rand() < %f' % (projection, sql, (float(percent) / 100.0)) if count != 0: sql = '%s LIMIT %d' % (sql, count) return sql return _random_sampling @staticmethod def _auto(method, fields, count, percent, key_field, ascending): """Construct a sampling function according to the provided sampling technique, provided all its needed fields are passed as arguments Args: method: one of the supported sampling methods: {limit,random,hashed,sorted} fields: an optional list of field names to retrieve. count: maximum number of rows to limit the sampled results to. percent: the percentage of the resulting hashes to select if using hashed sampling key_field: the name of the field to sort the rows by or use for hashing ascending: whether to sort in ascending direction or not. Returns: A sampling function using the provided arguments Raises: Exception if an unsupported mathod name is passed """ if method == 'limit': return Sampling.default(fields=fields, count=count) elif method == 'random': return Sampling.random(fields=fields, percent=percent, count=count) elif method == 'hashed': return Sampling.hashed(fields=fields, field_name=key_field, percent=percent, count=count) elif method == 'sorted': return Sampling.sorted(fields=fields, field_name=key_field, ascending=ascending, count=count) else: raise Exception('Unsupported sampling method: %s' % method) ================================================ FILE: google/datalab/bigquery/_schema.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. 
"""Implements Table and View Schema APIs.""" from __future__ import absolute_import from __future__ import unicode_literals from builtins import str from builtins import range from past.builtins import basestring from builtins import object import datetime import pandas import pprint class SchemaField(object): """ Represents a single field in a Table schema. This has the properties: - name: the flattened, full-qualified name of the field. - type: the type of the field as a string ('INTEGER', 'BOOLEAN', 'FLOAT', 'STRING' or 'TIMESTAMP'). - mode: the mode of the field; 'NULLABLE' by default. - description: a description of the field, if known; empty string by default. """ def __init__(self, name, type, mode='NULLABLE', description=''): self.name = name self.type = type self.mode = mode self.description = description def _repr_sql_(self): """Returns a representation of the field for embedding into a SQL statement. Returns: A formatted field name for use within SQL statements. """ return self.name def __eq__(self, other): """ Compare two schema field objects for equality (ignoring description). """ return self.name == other.name and self.type == other.type\ and self.mode == other.mode def __repr__(self): """ Returns the schema field as a string form of a dictionary. """ return 'BigQuery Schema Field:\n%s' % pprint.pformat(vars(self), width=1) def __getitem__(self, item): # TODO(gram): Currently we need this for a Schema object to work with the Parser object. # Eventually if we change Parser to only work with Schema (and not also with the # schema dictionaries in query results) we can remove this. if item == 'name': return self.name if item == 'type': return self.type if item == 'mode': return self.mode if item == 'description': return self.description class Schema(list): """Represents the schema of a BigQuery table as a flattened list of objects representing fields. Each field object has name, type, mode and description properties. Nested fields get flattened with their full-qualified names. So a Schema that has an object A with nested field B will be represented as [(name: 'A', ...), (name: 'A.b', ...)]. """ @staticmethod def _from_dataframe(dataframe, default_type='STRING'): """ Infer a BigQuery table schema from a Pandas dataframe. Note that if you don't explicitly set the types of the columns in the dataframe, they may be of a type that forces coercion to STRING, so even though the fields in the dataframe themselves may be numeric, the type in the derived schema may not be. Hence it is prudent to make sure the Pandas dataframe is typed correctly. Args: dataframe: The DataFrame. default_type : The default big query type in case the type of the column does not exist in the schema. Defaults to 'STRING'. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. 
""" type_mapping = { 'i': 'INTEGER', 'b': 'BOOLEAN', 'f': 'FLOAT', 'O': 'STRING', 'S': 'STRING', 'U': 'STRING', 'M': 'TIMESTAMP' } fields = [] for column_name, dtype in dataframe.dtypes.iteritems(): fields.append({'name': column_name, 'type': type_mapping.get(dtype.kind, default_type)}) return fields @staticmethod def _get_field_entry(name, value): entry = {'name': name} if isinstance(value, datetime.datetime): _type = 'TIMESTAMP' elif isinstance(value, datetime.date): _type = 'DATE' elif isinstance(value, datetime.time): _type = 'TIME' elif isinstance(value, bool): _type = 'BOOLEAN' elif isinstance(value, float): _type = 'FLOAT' elif isinstance(value, int): _type = 'INTEGER' elif isinstance(value, dict) or isinstance(value, list): _type = 'RECORD' entry['fields'] = Schema._from_record(value) else: _type = 'STRING' entry['type'] = _type return entry @staticmethod def _from_dict_record(data): """ Infer a BigQuery table schema from a dictionary. If the dictionary has entries that are in turn OrderedDicts these will be turned into RECORD types. Ideally this will be an OrderedDict but it is not required. Args: data: The dict to infer a schema from. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ return [Schema._get_field_entry(name, value) for name, value in list(data.items())] @staticmethod def _from_list_record(data): """ Infer a BigQuery table schema from a list of values. Args: data: The list of values. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ return [Schema._get_field_entry('Column%d' % (i + 1), value) for i, value in enumerate(data)] @staticmethod def _from_record(data): """ Infer a BigQuery table schema from a list of fields or a dictionary. The typeof the elements is used. For a list, the field names are simply 'Column1', 'Column2', etc. Args: data: The list of fields or dictionary. Returns: A list of dictionaries containing field 'name' and 'type' entries, suitable for use in a BigQuery Tables resource schema. """ if isinstance(data, dict): return Schema._from_dict_record(data) elif isinstance(data, list): return Schema._from_list_record(data) else: raise Exception('Cannot create a schema from record %s' % str(data)) @staticmethod def from_record(source): """ Infers a table/view schema from a single record that can contain a list of fields or a dictionary of fields. The type of the elements is used for the types in the schema. For a dict, key names are used for column names while for a list, the field names are simply named 'Column1', 'Column2', etc. Note that if using a dict you may want to use an OrderedDict to ensure column ordering is deterministic. Args: source: The list of field values or dictionary of key/values. Returns: A Schema for the data. """ # TODO(gram): may want to allow an optional second argument which is a list of field # names; could be useful for the record-containing-list case. return Schema(Schema._from_record(source)) @staticmethod def from_data(source): """Infers a table/view schema from its JSON representation, a list of records, or a Pandas dataframe. Args: source: the Pandas Dataframe, a dictionary representing a record, a list of heterogeneous data (record) or homogeneous data (list of records) from which to infer the schema, or a definition of the schema as a list of dictionaries with 'name' and 'type' entries and possibly 'mode' and 'description' entries. 
Only used if no data argument was provided. 'mode' can be 'NULLABLE', 'REQUIRED' or 'REPEATED'. For the allowed types, see: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types Note that there is potential ambiguity when passing a list of lists or a list of dicts between whether that should be treated as a list of records or a single record that is a list. The heuristic used is to check the length of the entries in the list; if they are equal then a list of records is assumed. To avoid this ambiguity you can instead use the Schema.from_record method which assumes a single record, in either list of values or dictionary of key-values form. Returns: A Schema for the data. """ if isinstance(source, pandas.DataFrame): bq_schema = Schema._from_dataframe(source) elif isinstance(source, list): if len(source) == 0: bq_schema = source elif all(isinstance(d, dict) for d in source): if all('name' in d and 'type' in d for d in source): # It looks like a bq_schema; use it as-is. bq_schema = source elif all(len(d) == len(source[0]) for d in source): bq_schema = Schema._from_dict_record(source[0]) else: raise Exception(('Cannot create a schema from heterogeneous list %s; perhaps you meant ' + 'to use Schema.from_record?') % str(source)) elif isinstance(source[0], list) and \ all([isinstance(l, list) and len(l) == len(source[0]) for l in source]): # A list of lists all of the same length; treat first entry as a list record. bq_schema = Schema._from_record(source[0]) else: # A heterogeneous list; treat as a record. raise Exception(('Cannot create a schema from heterogeneous list %s; perhaps you meant ' + 'to use Schema.from_record?') % str(source)) elif isinstance(source, dict): bq_schema = Schema._from_record(source) else: raise Exception('Cannot create a schema from %s' % str(source)) return Schema(bq_schema) def __init__(self, definition=None): """Initializes a Schema from its raw JSON representation, a Pandas Dataframe, or a list. Args: definition: a definition of the schema as a list of dictionaries with 'name' and 'type' entries and possibly 'mode' and 'description' entries. Only used if no data argument was provided. 'mode' can be 'NULLABLE', 'REQUIRED' or 'REPEATED'. For the allowed types, see: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types """ super(Schema, self).__init__() self._map = {} self._bq_schema = definition self._populate_fields(definition) def __getitem__(self, key): """Provides ability to lookup a schema field by position or by name. """ if isinstance(key, basestring): return self._map.get(key, None) # noinspection PyCallByClass return list.__getitem__(self, key) def _add_field(self, name, type, mode='NULLABLE', description=''): field = SchemaField(name, type, mode, description) self.append(field) self._map[name] = field def find(self, name): """ Get the index of a field in the flattened list given its (fully-qualified) name. Args: name: the fully-qualified name of the field. Returns: The index of the field, if found; else -1. """ for i in range(0, len(self)): if self[i].name == name: return i return -1 def _populate_fields(self, data, prefix=''): for field_data in data: name = prefix + field_data['name'] type = field_data['type'] self._add_field(name, type, field_data.get('mode', None), field_data.get('description', None)) if type == 'RECORD': # Recurse into the nested fields, using this field's name as a prefix. 
self._populate_fields(field_data.get('fields'), name + '.') def __repr__(self): """ Returns a string representation of the schema for notebooks.""" return 'BigQuery Schema - Fields:\n%s' % pprint.pformat(self._bq_schema, width=1) def __eq__(self, other): """ Compares two schema for equality. """ other_map = other._map if len(self._map) != len(other_map): return False for name in self._map.keys(): if name not in other_map: return False if not self._map[name] == other_map[name]: return False return True def __ne__(self, other): """ Compares two schema for inequality. """ return not(self.__eq__(other)) ================================================ FILE: google/datalab/bigquery/_table.py ================================================ # Copyright 2015 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. """Implements Table, and related Table BigQuery APIs.""" from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from builtins import str from past.utils import old_div from builtins import object import calendar import codecs import csv import datetime import pandas import time import traceback import uuid import sys import google.datalab import google.datalab.utils from . import _api from . import _csv_options from . import _job from . import _parser from . import _schema from . import _utils class TableMetadata(object): """Represents metadata about a BigQuery table.""" def __init__(self, table, info): """Initializes a TableMetadata instance. Args: table: the Table object this belongs to. info: The BigQuery information about this table as a Python dictionary. """ self._table = table self._info = info @property def created_on(self): """The creation timestamp.""" timestamp = self._info.get('creationTime') return _parser.Parser.parse_timestamp(timestamp) @property def description(self): """The description of the table if it exists.""" return self._info.get('description', '') @property def expires_on(self): """The timestamp for when the table will expire, or None if unknown.""" timestamp = self._info.get('expirationTime', None) if timestamp is None: return None return _parser.Parser.parse_timestamp(timestamp) @property def friendly_name(self): """The friendly name of the table if it exists.""" return self._info.get('friendlyName', '') @property def modified_on(self): """The timestamp for when the table was last modified.""" timestamp = self._info.get('lastModifiedTime') return _parser.Parser.parse_timestamp(timestamp) @property def rows(self): """The number of rows within the table, or -1 if unknown. """ return int(self._info['numRows']) if 'numRows' in self._info else -1 @property def size(self): """The size of the table in bytes, or -1 if unknown. """ return int(self._info['numBytes']) if 'numBytes' in self._info else -1 def refresh(self): """ Refresh the metadata. """ self._info = self._table._load_info() class Table(object): """Represents a Table object referencing a BigQuery table. 
""" # Allowed characters in a BigQuery table column name _VALID_COLUMN_NAME_CHARACTERS = '_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' # When fetching table contents for a range or iteration, use a small page size per request _DEFAULT_PAGE_SIZE = 1024 # When fetching the entire table, use the maximum number of rows. The BigQuery service # will always return fewer rows than this if their encoded JSON size is larger than 10MB _MAX_PAGE_SIZE = 100000 # Milliseconds per week _MSEC_PER_WEEK = 7 * 24 * 3600 * 1000 def __init__(self, name, context=None): """Initializes an instance of a Table object. The Table need not exist yet. Args: name: the name of the table either as a string or a 3-part tuple (projectid, datasetid, name). If a string, it must have the form '